diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 55d3561..62b01fd 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -24,7 +24,7 @@ jobs: pip install poetry poetry install - - name: Run detection benchmark test + - name: Run benchmark run: | - poetry run python benchmark.py --max 5 --result_path results --pdftext_only - poetry run python scripts/verify_benchmark_scores.py results/results.json + poetry run python benchmark/benchmark.py --max 5 --result_path results --pdftext_only + poetry run python benchmark/verify_benchmark_scores.py results/results.json diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml new file mode 100644 index 0000000..ba09274 --- /dev/null +++ b/.github/workflows/scripts.yml @@ -0,0 +1,28 @@ +name: Script test + +on: [push] + +env: + TORCH_DEVICE: "cpu" + PYTHONIOENCODING: "utf-8" + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install Python dependencies + run: | + pip install poetry + poetry install + - name: Run script + run: | + poetry run pdftext benchmark/adversarial_short.pdf diff --git a/benchmark/adversarial_short.pdf b/benchmark/adversarial_short.pdf new file mode 100644 index 0000000..2c9e2d1 Binary files /dev/null and b/benchmark/adversarial_short.pdf differ diff --git a/benchmark.py b/benchmark/benchmark.py similarity index 100% rename from benchmark.py rename to benchmark/benchmark.py diff --git a/scripts/verify_benchmark_scores.py b/benchmark/verify_benchmark_scores.py similarity index 100% rename from scripts/verify_benchmark_scores.py rename to benchmark/verify_benchmark_scores.py diff --git a/extract_text.py b/extract_text.py index ed8f5d5..d7fd86b 100644 --- a/extract_text.py +++ b/extract_text.py @@ -1,59 +1,4 @@ -import 
def parse_range_str(range_str: str) -> List[int]:
    """Expand a comma-separated page specification into a sorted page list.

    Each comma-separated item is either a single page number (``"10"``) or an
    inclusive range (``"2-4"``). Whitespace around items and numbers is
    ignored, and empty items (for example from a trailing comma) are skipped.

    Args:
        range_str: Page specification such as ``"1,2-4,10"``.

    Returns:
        The unique page numbers in ascending order.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair, or
            if a range's start is greater than its end.
    """
    pages = set()
    for raw_item in range_str.split(","):
        item = raw_item.strip()
        if not item:
            # Tolerate "1,2," or "1,,2" instead of failing on int("").
            continue

        start_str, dash, end_str = item.partition("-")
        try:
            start = int(start_str)
            # A bare number is treated as the one-page range start..start.
            end = int(end_str) if dash else start
        except ValueError:
            raise ValueError(f"Invalid page range item: {item!r}") from None

        if start > end:
            # range(start, end + 1) would be empty and silently drop the item.
            raise ValueError(f"Invalid page range item (start > end): {item!r}")

        pages.update(range(start, end + 1))

    # The set removes duplicates; sorted() returns them as an ordered list.
    return sorted(pages)
@click.command(help="Extract plain text or JSON from PDF.")
@click.argument("pdf_path", type=click.Path(exists=True))
@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
def extract_text_cli(
    pdf_path: Path,
    out_path: Path | None,
    **kwargs
):
    """Command-line entry point: extract text from a PDF and emit it.

    Args:
        pdf_path: Path to an existing PDF file (click checks that it exists).
        out_path: File to write the result to; when ``None`` the result is
            printed to stdout.
        **kwargs: Remaining click options: ``json``, ``sort``,
            ``keep_hyphens``, ``page_range``, ``flatten_pdf``, ``keep_chars``
            and ``workers``.

    Raises:
        click.BadParameter: If ``--page_range`` cannot be parsed or names a
            page outside the document.
    """
    pages = None
    if kwargs["page_range"] is not None:
        try:
            pages = parse_range_str(kwargs["page_range"])
        except ValueError as err:
            raise click.BadParameter(str(err), param_hint="--page_range") from err

        # Open the document only to read its length; always release the handle.
        pdf_doc = pdfium.PdfDocument(pdf_path)
        try:
            doc_len = len(pdf_doc)
        finally:
            pdf_doc.close()

        # Page 0 is accepted, so pages are zero-based and the last valid index
        # is doc_len - 1. An explicit raise (not assert) survives `python -O`.
        if not all(0 <= p < doc_len for p in pages):
            raise click.BadParameter("Invalid page number(s) provided", param_hint="--page_range")

    if kwargs["json"]:
        text = dictionary_output(
            pdf_path,
            sort=kwargs["sort"],
            page_range=pages,
            flatten_pdf=kwargs["flatten_pdf"],
            keep_chars=kwargs["keep_chars"],
            workers=kwargs["workers"],
        )
        text = json.dumps(text)
    else:
        text = plain_text_output(
            pdf_path,
            sort=kwargs["sort"],
            hyphens=kwargs["keep_hyphens"],
            page_range=pages,
            flatten_pdf=kwargs["flatten_pdf"],
            workers=kwargs["workers"],
        )

    if out_path is None:
        print(text)
    else:
        # Explicit UTF-8 so extracted non-ASCII text does not depend on the
        # platform's default encoding (e.g. cp1252 on Windows).
        with open(out_path, "w+", encoding="utf-8") as f:
            f.write(text)
[package.dependencies] @@ -682,13 +682,13 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions 
(>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] inference = ["aiohttp"] -quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.5.0)"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] diff --git a/pyproject.toml b/pyproject.toml index d727b31..0fabf17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,9 +10,6 @@ keywords = ["pdf", "text", "extraction"] packages = [ {include = "pdftext"} ] -include = [ - "extract_text.py" -] [tool.poetry.dependencies] python = "^3.10" @@ -35,4 +32,4 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -pdftext = "extract_text:main" +pdftext = "pdftext.scripts.extract_text:extract_text_cli"