Skip to content

Commit

Permalink
Add table tests
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 8, 2025
1 parent eccb7d2 commit 1ae8f34
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 36 deletions.
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ pdftext PDF_PATH --out_path output.txt --json
- `--out_path` path to the output txt file. If not specified, will write to stdout.
- `--json` specifies json output
- `--sort` will attempt to sort in reading order if specified.
- `--pages` will specify pages (comma separated) to extract
- `--page_range` specifies the pages to extract as a comma-separated list of page numbers and ranges, e.g. `0,5-10,12`.
- `--keep_chars` will keep individual characters in the json output
- `--workers` specifies the number of parallel workers to use
- `--flatten_pdf` merges form fields into the PDF
Expand Down Expand Up @@ -88,6 +88,22 @@ from pdftext.extraction import dictionary_output
text = dictionary_output(PDF_PATH, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
```

Extract text from table cells:

```python
from pdftext.extraction import table_output

table_inputs = [
# Each dictionary entry is a single page
{
"tables": [[5,10,10,20]], # Coordinates for tables on the page
"img_size": [512, 512] # The size of the image the tables were detected in
}
]
text = table_output(PDF_PATH, table_inputs, page_range=[1,2,3])

```

If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.

# Benchmarks
Expand Down
3 changes: 1 addition & 2 deletions pdftext/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List:
# Sort blocks into best guess reading order
vertical_groups = {}
for block in blocks:
bbox = block["bbox"]
group_key = round(bbox[1] / tolerance) * tolerance
group_key = round(block["bbox"][1] / tolerance) * tolerance
if group_key not in vertical_groups:
vertical_groups[group_key] = []
vertical_groups[group_key].append(block)
Expand Down
42 changes: 12 additions & 30 deletions pdftext/tables.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,8 @@
from typing import List
import numpy as np

from pdftext.schema import Pages, Page, Bbox, Tables


def sort_text_lines(lines: List[dict], tolerance=1.25):
    """Sort text lines into an approximate reading order.

    Lines are bucketed into rows by snapping their vertical coordinate
    (``bbox[1]``) to the nearest multiple of ``tolerance``. Rows are emitted
    in ascending order of that snapped value, and the lines inside a row are
    ordered by their horizontal coordinate (``bbox[0]``).

    Not 100% accurate; this should only be used as a starting point for more
    advanced sorting.

    Args:
        lines: Dicts that each carry a ``"bbox"`` sequence; only ``bbox[0]``
            and ``bbox[1]`` are read.
        tolerance: Bucket size for the vertical coordinate. Lines whose
            ``bbox[1]`` snaps to the same multiple share a row.

    Returns:
        A new list holding the same dicts in sorted order.
    """
    vertical_groups = {}
    for line in lines:
        # round() is what makes the tolerance effective: without it,
        # (y / tolerance) * tolerance is just y again, so every distinct y
        # would end up in its own row.
        group_key = round(line["bbox"][1] / tolerance) * tolerance
        vertical_groups.setdefault(group_key, []).append(line)

    # Walk the rows by ascending key, ordering each row horizontally, and
    # flatten everything into a single list.
    sorted_lines = []
    for group_key in sorted(vertical_groups):
        row = vertical_groups[group_key]
        sorted_lines.extend(sorted(row, key=lambda x: x["bbox"][0]))

    return sorted_lines
from pdftext.postprocessing import sort_blocks
from pdftext.schema import Page, Bbox, Tables


def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_chars=100):
Expand All @@ -43,12 +25,11 @@ def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_c
return cell_gap_thresh


def is_same_span(char, curr_box, img_size, space_thresh, rotation):
def is_same_span(bbox, curr_box, img_size, space_thresh, rotation):
def normalized_diff(a, b, dimension, mult=1, use_abs=True):
func = abs if use_abs else lambda x: x
return func(a - b) / img_size[dimension] < space_thresh * mult

bbox = char["bbox"]
if rotation == 90:
return all([
normalized_diff(bbox[0], curr_box[0], 0, use_abs=False),
Expand Down Expand Up @@ -90,29 +71,30 @@ def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_t

for block in page["blocks"]:
for line in block["lines"]:
if line["bbox"].intersection_pct(table_poly) < table_thresh:
line_bbox = Bbox(bbox=line["bbox"]).rescale(img_size, page)
if line_bbox.intersection_pct(table_poly) < table_thresh:
continue
curr_span = None
curr_box = None
for span in line["spans"]:
for char in span["chars"]:
char["bbox"] = char["bbox"].rescale(img_size, page) # Rescale to match image dimensions
bbox = Bbox(bbox=char["bbox"]).rescale(img_size, page).bbox
same_span = False
if curr_span:
same_span = is_same_span(char, curr_box, img_size, space_thresh, rotation)
same_span = is_same_span(bbox, curr_box, img_size, space_thresh, rotation)

if curr_span is None:
curr_span = char["char"]
curr_box = char["bbox"]
curr_box = bbox
elif same_span:
curr_span += char["char"]
curr_box = [min(curr_box[0], char["bbox"][0]), min(curr_box[1], char["bbox"][1]),
max(curr_box[2], char["bbox"][2]), max(curr_box[3], char["bbox"][3])]
curr_box = [min(curr_box[0], bbox[0]), min(curr_box[1], bbox[1]),
max(curr_box[2], bbox[2]), max(curr_box[3], bbox[3])]
else:
if curr_span.strip():
table_text.append({"text": curr_span, "bbox": curr_box})
curr_span = char["char"]
curr_box = char["bbox"]
curr_box = bbox
if curr_span is not None and curr_span.strip():
table_text.append({"text": curr_span, "bbox": curr_box})
# Adjust to be relative to input table
Expand All @@ -124,6 +106,6 @@ def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_t
item["bbox"][3] - table[1]
]
item["bbox"] = Bbox(bbox=item["bbox"])
table_text = sort_text_lines(table_text)
table_text = sort_blocks(table_text)
table_texts.append(table_text)
return table_texts
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.4.1"
version = "0.5.0"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down
17 changes: 15 additions & 2 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pdftext.extraction import paginated_plain_text_output, plain_text_output
from pdftext.extraction import paginated_plain_text_output, plain_text_output, dictionary_output
from pdftext.schema import Pages


def test_paginated_output(pdf_path, pdf_doc):
Expand All @@ -10,5 +11,17 @@ def text_plain_text_output(pdf_path):
text = plain_text_output(pdf_path)
assert "Subspace" in text


def test_page_range(pdf_path):
    """Requesting a subset of pages yields one text entry per requested page."""
    requested = [0, 1, 3]
    extracted = paginated_plain_text_output(pdf_path, page_range=requested)
    assert len(extracted) == len(requested)

def test_json_output(pdf_path, pdf_doc):
    """Dictionary output has one entry per document page and exposes span text."""
    pages: Pages = dictionary_output(pdf_path)
    assert len(pages) == len(pdf_doc)

    # Drill down to the very first span of the first page.
    first_span = pages[0]["blocks"][0]["lines"][0]["spans"][0]
    assert "Subspace" in first_span["text"]

def test_keep_chars(pdf_path):
    """With keep_chars=True, spans keep their text and per-character bboxes."""
    pages: Pages = dictionary_output(pdf_path, keep_chars=True)

    # Drill down to the very first span of the first page.
    first_span = pages[0]["blocks"][0]["lines"][0]["spans"][0]
    assert "Subspace" in first_span["text"]
    assert "bbox" in first_span["chars"][0]
33 changes: 33 additions & 0 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from pdftext.extraction import table_output


def test_table_extraction(pdf_path, pdf_doc):
    """Extract two tables from page 5 and check cell counts and last cells.

    Table boxes are written as fractions of the image size and scaled up to
    an image that is twice the page size in each dimension.
    """
    pages = [5]
    img_size = [dim * 2 for dim in pdf_doc[5].get_size()]

    def to_image_coords(rel_bbox):
        # Scale x coordinates by the image width and y coordinates by its height.
        x0, y0, x1, y1 = rel_bbox
        return [
            x0 * img_size[0],
            y0 * img_size[1],
            x1 * img_size[0],
            y1 * img_size[1],
        ]

    relative_tables = [
        [0.0925, 0.116, 0.871, 0.324],
        [0.171, 0.365, 0.794, 0.492],
    ]
    table_inputs = [
        {
            "tables": [to_image_coords(rel_bbox) for rel_bbox in relative_tables],
            "img_size": img_size,
        }
    ]

    tables = table_output(pdf_path, table_inputs, page_range=pages)

    # One result per requested page, two tables on that page.
    assert len(tables) == 1
    assert len(tables[0]) == 2

    first_table, second_table = tables[0]
    assert len(first_table) == 127
    assert len(second_table) == 74
    assert first_table[-1]["text"].strip() == "58.45"
    assert second_table[-1]["text"].strip() == "7.0h"

0 comments on commit 1ae8f34

Please sign in to comment.