Merge pull request #34 from VikParuchuri/dev

Dev
VikParuchuri · Jan 28, 2025 · 92fd696 · 92fd696
2 parents 54914cb + fd881b7
commit 92fd696
Show file tree

Hide file tree

Showing 9 changed files with 95 additions and 70 deletions.
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -24,7 +24,7 @@ jobs:
           pip install poetry
           poetry install
 
-      - name: Run detection benchmark test
+      - name: Run benchmark
         run: |
-          poetry run python benchmark.py --max 5 --result_path results --pdftext_only
-          poetry run python scripts/verify_benchmark_scores.py results/results.json
+          poetry run python benchmark/benchmark.py --max 5 --result_path results --pdftext_only
+          poetry run python benchmark/verify_benchmark_scores.py results/results.json
diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
@@ -0,0 +1,28 @@
+name: Script test
+
+on: [push]
+
+env:
+  TORCH_DEVICE: "cpu"
+  PYTHONIOENCODING: "utf-8"
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - name: Install Python dependencies
+        run: |
+          pip install poetry
+          poetry install
+      - name: Run script
+        run: |
+          poetry run pdftext benchmark/adversarial_short.pdf
diff --git a/benchmark/adversarial_short.pdf b/benchmark/adversarial_short.pdf
diff --git a/benchmark.py → benchmark/benchmark.py b/benchmark.py → benchmark/benchmark.py
diff --git a/scripts/verify_benchmark_scores.py → benchmark/verify_benchmark_scores.py b/scripts/verify_benchmark_scores.py → benchmark/verify_benchmark_scores.py
diff --git a/extract_text.py b/extract_text.py
@@ -1,59 +1,4 @@
-import json
-from pathlib import Path
-from typing import List
-
-import click
-import pypdfium2 as pdfium
-
-from pdftext.extraction import plain_text_output, dictionary_output
-
-def parse_range_str(range_str: str) -> List[int]:
-    range_lst = range_str.split(",")
-    page_lst = []
-    for i in range_lst:
-        if "-" in i:
-            start, end = i.split("-")
-            page_lst += list(range(int(start), int(end) + 1))
-        else:
-            page_lst.append(int(i))
-    page_lst = sorted(list(set(page_lst)))  # Deduplicate page numbers and sort in order
-    return page_lst
-
-@click.command(help="Extract plain text or JSON from PDF.")
-@click.argument("pdf_path", type=click.Path(exists=True))
-@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
-@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
-@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
-@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
-@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
-@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
-@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
-@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
-def main(
-        pdf_path: Path,
-        out_path: Path | None,
-        **kwargs
-):
-    pages = None
-    if kwargs["page_range"] is not None:
-        pdf_doc = pdfium.PdfDocument(pdf_path)
-        pages = parse_range_str(kwargs["page_range"])
-        doc_len = len(pdf_doc)
-        pdf_doc.close()
-        assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"
-
-    if kwargs["json"]:
-        text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
-        text = json.dumps(text)
-    else:
-        text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
-
-    if out_path is None:
-        print(text)
-    else:
-        with open(out_path, "w+") as f:
-            f.write(text)
-
+from pdftext.scripts.extract_text import extract_text_cli
 
 if __name__ == "__main__":
-    main()
+    extract_text_cli()
diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py
@@ -0,0 +1,55 @@
+import json
+from pathlib import Path
+from typing import List
+
+import click
+import pypdfium2 as pdfium
+
+from pdftext.extraction import plain_text_output, dictionary_output
+
+def parse_range_str(range_str: str) -> List[int]:
+    """Expand a comma separated page spec such as "1,2-4,10" into page numbers.
+
+    Each comma separated item is either a single integer or an inclusive
+    "start-end" range. The result is deduplicated and sorted ascending.
+    """
+    pages = set()
+    for token in range_str.split(","):
+        if "-" not in token:
+            pages.add(int(token))
+            continue
+        # Inclusive range: "2-4" contributes 2, 3 and 4.
+        first, last = token.split("-")
+        pages.update(range(int(first), int(last) + 1))
+    return sorted(pages)
+
+@click.command(help="Extract plain text or JSON from PDF.")
+@click.argument("pdf_path", type=click.Path(exists=True))
+@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
+@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
+@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
+@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
+@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
+@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
+@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
+@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
+def extract_text_cli(
+        pdf_path: Path,
+        out_path: Path | None,
+        **kwargs
+):
+    """CLI entry point: extract text from ``pdf_path`` and print or save it.
+
+    Args:
+        pdf_path: Existing PDF to read (existence is checked by click).
+        out_path: File to write the result to; stdout is used when omitted.
+        **kwargs: Remaining click options (json, sort, keep_hyphens,
+            page_range, flatten_pdf, keep_chars, workers).
+
+    Raises:
+        click.BadParameter: If --page_range names a page outside the document.
+    """
+    pages = None
+    if kwargs["page_range"] is not None:
+        # Parse before opening the PDF so a malformed spec cannot leak the handle.
+        pages = parse_range_str(kwargs["page_range"])
+        pdf_doc = pdfium.PdfDocument(pdf_path)
+        try:
+            doc_len = len(pdf_doc)
+        finally:
+            pdf_doc.close()
+        # Raise instead of assert: asserts are stripped under `python -O`.
+        # The lower bound admits 0, so pages are treated as 0-based and the
+        # last valid index is doc_len - 1 (the old check also accepted doc_len).
+        if not all(0 <= p < doc_len for p in pages):
+            raise click.BadParameter("Invalid page number(s) provided", param_hint="--page_range")
+
+    if kwargs["json"]:
+        text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
+        text = json.dumps(text)
+    else:
+        text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
+
+    if out_path is None:
+        print(text)
+    else:
+        # Explicit UTF-8 so extracted non-ASCII text does not depend on the
+        # platform default encoding (e.g. cp1252 on Windows).
+        with open(out_path, "w", encoding="utf-8") as f:
+            f.write(text)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,9 +10,6 @@ keywords = ["pdf", "text", "extraction"]
 packages = [
     {include = "pdftext"}
 ]
-include = [
-    "extract_text.py"
-]
 
 [tool.poetry.dependencies]
 python = "^3.10"
@@ -35,4 +32,4 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-pdftext = "extract_text:main"
+pdftext = "pdftext.scripts.extract_text:extract_text_cli"