Skip to content

Commit

Permalink
Add more table tests
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 8, 2025
1 parent 1ae8f34 commit 4b95588
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ Here are the scores, run on an M1 Macbook, without multiprocessing:
| Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
|------------|-------------------|-----------------------------------------|
| pymupdf | 0.32 | -- |
| pdftext | 1.4 | 97.76 |
| pdfplumber | 3.0 | 90.3 |
| pdftext | 1.36 | 97.78 |
| pdfplumber | 3.16 | 90.36 |

pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same character information).

Expand Down
21 changes: 19 additions & 2 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
from pdftext.extraction import table_output


def test_table_extraction(pdf_path, pdf_doc):
    """Run the shared table-extraction checks with the helper's default image scale."""
    # No img_size_scaler is passed, so the helper's own default applies.
    _table_extraction(pdf_path=pdf_path, pdf_doc=pdf_doc)

def test_table_extraction_small(pdf_path, pdf_doc):
    """Run the shared table-extraction checks with a reduced image scale.

    Passes ``img_size_scaler=0.5`` so the helper derives an image size that
    is half the page size in each dimension.
    """
    half_scale = 0.5
    _table_extraction(
        pdf_path=pdf_path,
        pdf_doc=pdf_doc,
        img_size_scaler=half_scale,
    )

def _table_extraction(pdf_path, pdf_doc, img_size_scaler: float = 2.0):
pages = [5]
page_size = pdf_doc[5].get_size()
img_size = [p * 2 for p in page_size]
img_size = [p * img_size_scaler for p in page_size]

# Rescale to img size
def rescale_table(bbox):
Expand All @@ -31,3 +36,15 @@ def rescale_table(bbox):
assert len(tables[0][1]) == 74
assert tables[0][0][-1]["text"].strip() == "58.45"
assert tables[0][1][-1]["text"].strip() == "7.0h"

table1_bbox = table_inputs[0]["tables"][0]
table1_width = table1_bbox[2] - table1_bbox[0]
table1_height = table1_bbox[3] - table1_bbox[1]

# Ensure bboxes are within table bounds (they should be rescaled)
for cell in tables[0][0]:
assert 0 <= cell["bbox"][0] <= table1_width
assert 0 <= cell["bbox"][1] <= table1_height
assert 0 <= cell["bbox"][2] <= table1_width
assert 0 <= cell["bbox"][3] <= table1_height

0 comments on commit 4b95588

Please sign in to comment.