Merge pull request #25 from VikParuchuri/dev

Add table extraction
VikParuchuri · Jan 10, 2025 · 4671d86 · 4671d86
2 parents 96ac8a0 + 505aae1
commit 4671d86
Show file tree

Hide file tree

Showing 15 changed files with 474 additions and 41 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,19 @@
+name: Unit tests
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - name: Install python dependencies
+        run: |
+          pip install poetry
+          poetry install
+      - name: Run tests
+        run: poetry run pytest
diff --git a/.github/workflows/tests.yml → .github/workflows/integration.yml b/.github/workflows/tests.yml → .github/workflows/integration.yml
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ pdftext PDF_PATH --out_path output.txt
 - `--out_path` path to the output txt file.  If not specified, will write to stdout.
 - `--sort` will attempt to sort in reading order if specified.
 - `--keep_hyphens` will keep hyphens in the output (they will be stripped and words joined otherwise)
-- `--pages` will specify pages (comma separated) to extract
+- `--page_range` specifies the pages to extract, as comma-separated page numbers and/or inclusive ranges, like `0,5-10,12`.
 - `--workers` specifies the number of parallel workers to use
 - `--flatten_pdf` merges form fields into the PDF
 
@@ -42,7 +42,7 @@ pdftext PDF_PATH --out_path output.txt --json
 - `--out_path` path to the output txt file.  If not specified, will write to stdout.
 - `--json` specifies json output
 - `--sort` will attempt to sort in reading order if specified.
-- `--pages` will specify pages (comma separated) to extract
+- `--page_range` specifies the pages to extract, as comma-separated page numbers and/or inclusive ranges, like `0,5-10,12`.
 - `--keep_chars` will keep individual characters in the json output
 - `--workers` specifies the number of parallel workers to use
 - `--flatten_pdf` merges form fields into the PDF
@@ -88,6 +88,22 @@ from pdftext.extraction import dictionary_output
 text = dictionary_output(PDF_PATH, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
 ```
 
+Extract text from table cells:
+
+```python
+from pdftext.extraction import table_output
+
+table_inputs = [
+  # Each dictionary entry is a single page
+  {
+    "tables": [[5,10,10,20]], # Coordinates for tables on the page
+    "img_size": [512, 512] # The size of the image the tables were detected in
+  }
+]
+text = table_output(PDF_PATH, table_inputs, page_range=[1]) # table_inputs must contain exactly one entry per extracted page
+
+```
+
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper.  pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
 
 # Benchmarks
@@ -99,8 +115,8 @@ Here are the scores, run on an M1 Macbook, without multiprocessing:
 | Library    | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
 |------------|-------------------|-----------------------------------------|
 | pymupdf    | 0.32              | --                                      |
-| pdftext    | 1.4               | 97.76                                   |
-| pdfplumber | 3.0               | 90.3                                    |
+| pdftext    | 1.36              | 97.78                                   |
+| pdfplumber | 3.16              | 90.36                                   |
 
 pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same character information).
 

diff --git a/extract_text.py b/extract_text.py
@@ -1,41 +1,57 @@
-import argparse
 import json
+from pathlib import Path
+from typing import List
+
+import click
 import pypdfium2 as pdfium
 
 from pdftext.extraction import plain_text_output, dictionary_output
 
-
-def main():
-    parser = argparse.ArgumentParser(description="Extract plain text from PDF.  Not guaranteed to be in order.")
-    parser.add_argument("pdf_path", type=str, help="Path to the PDF file")
-    parser.add_argument("--out_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
-    parser.add_argument("--json", action="store_true", help="Output json instead of plain text", default=False)
-    parser.add_argument("--sort", action="store_true", help="Attempt to sort the text by reading order", default=False)
-    parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False)
-    parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None)
-    parser.add_argument("--flatten_pdf", action="store_true", help="Flatten form fields and annotations into page contents", default=False)
-    parser.add_argument("--keep_chars", action="store_true", help="Keep character level information", default=False)
-    parser.add_argument("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
-    args = parser.parse_args()
-
+def parse_range_str(range_str: str) -> List[int]:
+    range_lst = range_str.split(",")
+    page_lst = []
+    for i in range_lst:
+        if "-" in i:
+            start, end = i.split("-")
+            page_lst += list(range(int(start), int(end) + 1))
+        else:
+            page_lst.append(int(i))
+    page_lst = sorted(list(set(page_lst)))  # Deduplicate page numbers and sort in order
+    return page_lst
+
+@click.command(help="Extract plain text or JSON from PDF.")
+@click.argument("pdf_path", type=click.Path(exists=True))
+@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout")
+@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False)
+@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False)
+@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False)
+@click.option("--page_range", type=str, help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None)
+@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False)
+@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False)
+@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None)
+def main(
+        pdf_path: Path,
+        out_path: Path | None,
+        **kwargs
+):
     pages = None
-    if args.pages is not None:
-        pdf_doc = pdfium.PdfDocument(args.pdf_path)
-        pages = [int(p) for p in args.pages.split(",")]
+    if kwargs["page_range"] is not None:
+        pdf_doc = pdfium.PdfDocument(pdf_path)
+        pages = parse_range_str(kwargs["page_range"])
         doc_len = len(pdf_doc)
         pdf_doc.close()
-        assert all(p <= doc_len for p in pages), "Invalid page number(s) provided"
+        assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"
 
-    if args.json:
-        text = dictionary_output(args.pdf_path, sort=args.sort, page_range=pages, flatten_pdf=args.flatten_pdf, keep_chars=args.keep_chars, workers=args.workers)
+    if kwargs["json"]:
+        text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
         text = json.dumps(text)
     else:
-        text = plain_text_output(args.pdf_path, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages, flatten_pdf=args.flatten_pdf, workers=args.workers)
+        text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
 
-    if args.out_path is None:
+    if out_path is None:
         print(text)
     else:
-        with open(args.out_path, "w+") as f:
+        with open(out_path, "w+") as f:
             f.write(text)
 
 

diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -9,7 +9,9 @@
 
 from pdftext.pdf.pages import get_pages
 from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks
+from pdftext.schema import Pages, TableInputs, Tables
 from pdftext.settings import settings
+from pdftext.tables import table_cell_text
 
 
 def _load_pdf(pdf, flatten_pdf):
@@ -22,7 +24,7 @@ def _load_pdf(pdf, flatten_pdf):
     return pdf
 
 
-def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True):
+def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True) -> Pages:
     return get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)
 
 
@@ -38,7 +40,7 @@ def worker_init(pdf_path, flatten_pdf):
     atexit.register(partial(worker_shutdown, pdf_doc))
 
 
-def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None):
+def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None) -> Pages:
     pdf_doc = _load_pdf(pdf_path, flatten_pdf)
     if page_range is None:
         page_range = range(len(pdf_doc))
@@ -70,7 +72,7 @@ def plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flat
 
 
 def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flatten_pdf=False, workers=None) -> List[str]:
-    pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf)
+    pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf)
     text = []
     for page in pages:
         text.append(merge_text(page, sort=sort, hyphens=hyphens).strip())
@@ -87,8 +89,16 @@ def _process_span(span, page_width, page_height, keep_chars):
             char["bbox"] = char["bbox"].bbox
 
 
-def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
-    pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
+def dictionary_output(
+        pdf_path,
+        sort=False,
+        page_range=None,
+        keep_chars=False,
+        flatten_pdf=False,
+        quote_loosebox=True,
+        workers=None
+) -> Pages:
+    pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
     for page in pages:
         page_width, page_height = page["width"], page["height"]
         for block in page["blocks"]:
@@ -111,3 +121,27 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
             page["width"], page["height"] = page["height"], page["width"]
             page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]
     return pages
+
+def table_output(
+    pdf_path: str,
+    table_inputs: TableInputs,
+    page_range=None,
+    flatten_pdf=False,
+    quote_loosebox=True,
+    workers=None,
+    pages: Pages | None = None
+) -> List[Tables]:
+    # Extract pages if they don't exist
+    if not pages:
+        pages: Pages = dictionary_output(pdf_path, page_range=page_range, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox, workers=workers, keep_chars=True)
+
+    assert len(pages) == len(table_inputs), "Number of pages and table inputs must match"
+
+    # Extract table cells per page
+    out_tables = []
+    for page, table_input in zip(pages, table_inputs):
+        tables = table_cell_text(table_input["tables"], page, table_input["img_size"])
+        assert len(tables) == len(table_input["tables"]), "Number of tables and table inputs must match"
+        out_tables.append(tables)
+    return out_tables
+
diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py
@@ -76,10 +76,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List:
     # Sort blocks into best guess reading order
     vertical_groups = {}
     for block in blocks:
-        bbox = block["bbox"]
-        # Handle both Bbox object and raw list cases
-        y_coord = bbox[1] if isinstance(bbox, (list, tuple)) else bbox.y_start
-        group_key = round(y_coord / tolerance) * tolerance
+        group_key = round(block["bbox"][1] / tolerance) * tolerance
         if group_key not in vertical_groups:
             vertical_groups[group_key] = []
         vertical_groups[group_key].append(block)
@@ -88,7 +85,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List:
     sorted_page_blocks = []
     for _, group in sorted(vertical_groups.items()):
         # Bbox supports indexing, so bbox[0] is the x start for both Bbox objects and raw lists
-        sorted_group = sorted(group, key=lambda x: x["bbox"][0] if isinstance(x["bbox"], (list, tuple)) else x["bbox"].x_start)
+        sorted_group = sorted(group, key=lambda x: x["bbox"][0])
         sorted_page_blocks.extend(sorted_group)
 
     return sorted_page_blocks

diff --git a/pdftext/schema.py b/pdftext/schema.py
@@ -7,6 +7,9 @@ class Bbox:
     def __init__(self, bbox: List[float]):
         self.bbox = bbox
 
+    def __getitem__(self, item):
+        return self.bbox[item]
+
     @property
     def height(self):
         return self.bbox[3] - self.bbox[1]
@@ -101,6 +104,18 @@ def rotate(self, page_width: float, page_height: float, rotation: int) -> Bbox:
 
         return Bbox(rotated_bbox)
 
+    def rescale(self, img_size: List[int], page: Page) -> Bbox:
+        w_scale = img_size[0] / page["width"]
+        h_scale = img_size[1] / page["height"]
+        new_bbox = [
+            self.bbox[0] * w_scale,
+            self.bbox[1] * h_scale,
+            self.bbox[2] * w_scale,
+            self.bbox[3] * h_scale
+        ]
+
+        return Bbox(new_bbox)
+
 
 class Char(TypedDict):
     bbox: Bbox
@@ -116,7 +131,7 @@ class Span(TypedDict):
     font: Dict[str, Union[Any, str]]
     font_weight: float
     font_size: float
-    chars: List[Char]
+    chars: List[Char] | None
     char_start_idx: int
     char_end_idx: int
 
@@ -137,10 +152,21 @@ class Page(TypedDict):
     width: int
     height: int
     blocks: List[Block]
+    rotation: int
+
+class TableCell(TypedDict):
+    text: str
+    bbox: Bbox
+
+class TableInput(TypedDict):
+    tables: List[List[int]]
+    img_size: List[int]
 
 
 Chars = List[Char]
 Spans = List[Span]
 Lines = List[Line]
 Blocks = List[Block]
 Pages = List[Page]
+Tables = List[List[TableCell]]
+TableInputs = List[TableInput]