Add table tests

VikParuchuri · Jan 8, 2025 · 1ae8f34 · 1ae8f34
1 parent eccb7d2
commit 1ae8f34
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ pdftext PDF_PATH --out_path output.txt --json
 - `--out_path` path to the output txt file.  If not specified, will write to stdout.
 - `--json` specifies json output
 - `--sort` will attempt to sort in reading order if specified.
-- `--pages` will specify pages (comma separated) to extract
+- `--page_range` specifies which pages to extract, as comma-separated page numbers and ranges (for example, `0,5-10,12`).
 - `--keep_chars` will keep individual characters in the json output
 - `--workers` specifies the number of parallel workers to use
 - `--flatten_pdf` merges form fields into the PDF
@@ -88,6 +88,22 @@ from pdftext.extraction import dictionary_output
 text = dictionary_output(PDF_PATH, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
 ```
 
+Extract text from table cells:
+
+```python
+from pdftext.extraction import table_output
+
+table_inputs = [
+  # One dictionary per page, describing the tables detected on that page
+  {
+    "tables": [[5,10,10,20]], # One [x1, y1, x2, y2] bounding box per table, in image coordinates
+    "img_size": [512, 512] # [width, height] of the image the tables were detected in
+  }
+]
+text = table_output(PDF_PATH, table_inputs, page_range=[1,2,3])
+
+```
+
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper.  pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
 
 # Benchmarks

diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py
@@ -76,8 +76,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List:
     # Sort blocks into best guess reading order
     vertical_groups = {}
     for block in blocks:
-        bbox = block["bbox"]
-        group_key = round(bbox[1] / tolerance) * tolerance
+        group_key = round(block["bbox"][1] / tolerance) * tolerance
         if group_key not in vertical_groups:
             vertical_groups[group_key] = []
         vertical_groups[group_key].append(block)

diff --git a/pdftext/tables.py b/pdftext/tables.py
@@ -1,26 +1,8 @@
 from typing import List
 import numpy as np
 
-from pdftext.schema import Pages, Page, Bbox, Tables
-
-
-def sort_text_lines(lines: List[dict], tolerance=1.25):
-    # Sorts in reading order.  Not 100% accurate, this should only
-    # be used as a starting point for more advanced sorting.
-    vertical_groups = {}
-    for line in lines:
-        group_key = (line["bbox"][1] / tolerance) * tolerance
-        if group_key not in vertical_groups:
-            vertical_groups[group_key] = []
-        vertical_groups[group_key].append(line)
-
-    # Sort each group horizontally and flatten the groups into a single list
-    sorted_lines = []
-    for _, group in sorted(vertical_groups.items()):
-        sorted_group = sorted(group, key=lambda x: x["bbox"][0])
-        sorted_lines.extend(sorted_group)
-
-    return sorted_lines
+from pdftext.postprocessing import sort_blocks
+from pdftext.schema import Page, Bbox, Tables
 
 
 def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_chars=100):
@@ -43,12 +25,11 @@ def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_c
     return cell_gap_thresh
 
 
-def is_same_span(char, curr_box, img_size, space_thresh, rotation):
+def is_same_span(bbox, curr_box, img_size, space_thresh, rotation):
     def normalized_diff(a, b, dimension, mult=1, use_abs=True):
         func = abs if use_abs else lambda x: x
         return func(a - b) / img_size[dimension] < space_thresh * mult
 
-    bbox = char["bbox"]
     if rotation == 90:
         return all([
             normalized_diff(bbox[0], curr_box[0], 0, use_abs=False),
@@ -90,29 +71,30 @@ def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_t
 
         for block in page["blocks"]:
             for line in block["lines"]:
-                if line["bbox"].intersection_pct(table_poly) < table_thresh:
+                line_bbox = Bbox(bbox=line["bbox"]).rescale(img_size, page)
+                if line_bbox.intersection_pct(table_poly) < table_thresh:
                     continue
                 curr_span = None
                 curr_box = None
                 for span in line["spans"]:
                     for char in span["chars"]:
-                        char["bbox"] = char["bbox"].rescale(img_size, page) # Rescale to match image dimensions
+                        bbox = Bbox(bbox=char["bbox"]).rescale(img_size, page).bbox
                         same_span = False
                         if curr_span:
-                            same_span = is_same_span(char, curr_box, img_size, space_thresh, rotation)
+                            same_span = is_same_span(bbox, curr_box, img_size, space_thresh, rotation)
 
                         if curr_span is None:
                             curr_span = char["char"]
-                            curr_box = char["bbox"]
+                            curr_box = bbox
                         elif same_span:
                             curr_span += char["char"]
-                            curr_box = [min(curr_box[0], char["bbox"][0]), min(curr_box[1], char["bbox"][1]),
-                                        max(curr_box[2], char["bbox"][2]), max(curr_box[3], char["bbox"][3])]
+                            curr_box = [min(curr_box[0], bbox[0]), min(curr_box[1], bbox[1]),
+                                        max(curr_box[2], bbox[2]), max(curr_box[3], bbox[3])]
                         else:
                             if curr_span.strip():
                                 table_text.append({"text": curr_span, "bbox": curr_box})
                             curr_span = char["char"]
-                            curr_box = char["bbox"]
+                            curr_box = bbox
                 if curr_span is not None and curr_span.strip():
                     table_text.append({"text": curr_span, "bbox": curr_box})
         # Adjust to be relative to input table
@@ -124,6 +106,6 @@ def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_t
                 item["bbox"][3] - table[1]
             ]
             item["bbox"] = Bbox(bbox=item["bbox"])
-        table_text = sort_text_lines(table_text)
+        table_text = sort_blocks(table_text)
         table_texts.append(table_text)
     return table_texts
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.4.1"
+version = "0.5.0"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"

diff --git a/tests/test_extraction.py b/tests/test_extraction.py
@@ -1,4 +1,5 @@
-from pdftext.extraction import paginated_plain_text_output, plain_text_output
+from pdftext.extraction import paginated_plain_text_output, plain_text_output, dictionary_output
+from pdftext.schema import Pages
 
 
 def test_paginated_output(pdf_path, pdf_doc):
@@ -10,5 +11,17 @@ def text_plain_text_output(pdf_path):
     text = plain_text_output(pdf_path)
     assert "Subspace" in text
 
-
 def test_page_range(pdf_path):
+    pages = [0, 1, 3]
+    text = paginated_plain_text_output(pdf_path, page_range=pages)
+    assert len(text) == len(pages)
+
+def test_json_output(pdf_path, pdf_doc):
+    pages: Pages = dictionary_output(pdf_path)
+    assert len(pages) == len(pdf_doc)
+    assert "Subspace" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["text"]
+
+def test_keep_chars(pdf_path):
+    pages: Pages = dictionary_output(pdf_path, keep_chars=True)
+    assert "Subspace" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["text"]
+    assert "bbox" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["chars"][0]
diff --git a/tests/test_tables.py b/tests/test_tables.py
@@ -0,0 +1,33 @@
+from pdftext.extraction import table_output
+
+
+def test_table_extraction(pdf_path, pdf_doc):
+    pages = [5]
+    page_size = pdf_doc[5].get_size()
+    img_size = [p * 2 for p in page_size]
+
+    # Convert a bbox given as fractions of the page (0-1) into image pixel coordinates
+    def rescale_table(bbox):
+        return [
+            bbox[0] * img_size[0],
+            bbox[1] * img_size[1],
+            bbox[2] * img_size[0],
+            bbox[3] * img_size[1]
+        ]
+
+    table_inputs = [
+        {
+            "tables": [
+                rescale_table([0.0925, 0.116, 0.871, 0.324]),
+                rescale_table([0.171, 0.365, 0.794, 0.492])
+            ],
+            "img_size": img_size
+        }
+    ]
+    tables = table_output(pdf_path, table_inputs, page_range=pages)
+    assert len(tables) == 1
+    assert len(tables[0]) == 2
+    assert len(tables[0][0]) == 127
+    assert len(tables[0][1]) == 74
+    assert tables[0][0][-1]["text"].strip() == "58.45"
+    assert tables[0][1][-1]["text"].strip() == "7.0h"