diff --git a/README.md b/README.md index ebe09e9..59e2721 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,8 @@ Here are the scores, run on an M1 Macbook, without multiprocessing: | Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) | |------------|-------------------|-----------------------------------------| | pymupdf | 0.32 | -- | -| pdftext | 1.4 | 97.76 | -| pdfplumber | 3.0 | 90.3 | +| pdftext | 1.36 | 97.78 | +| pdfplumber | 3.16 | 90.36 | pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same character information). diff --git a/tests/test_tables.py b/tests/test_tables.py index 9f16943..1f08f27 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,10 +1,15 @@ from pdftext.extraction import table_output - def test_table_extraction(pdf_path, pdf_doc): + _table_extraction(pdf_path, pdf_doc) + +def test_table_extraction_small(pdf_path, pdf_doc): + _table_extraction(pdf_path, pdf_doc, img_size_scaler=0.5) + +def _table_extraction(pdf_path, pdf_doc, img_size_scaler: float = 2.0): pages = [5] page_size = pdf_doc[5].get_size() - img_size = [p * 2 for p in page_size] + img_size = [p * img_size_scaler for p in page_size] # Rescale to img size def rescale_table(bbox): @@ -31,3 +36,15 @@ def rescale_table(bbox): assert len(tables[0][1]) == 74 assert tables[0][0][-1]["text"].strip() == "58.45" assert tables[0][1][-1]["text"].strip() == "7.0h" + + table1_bbox = table_inputs[0]["tables"][0] + table1_width = table1_bbox[2] - table1_bbox[0] + table1_height = table1_bbox[3] - table1_bbox[1] + + # Ensure bboxes are within table bounds (they should be rescaled) + for cell in tables[0][0]: + assert 0 <= cell["bbox"][0] <= table1_width + assert 0 <= cell["bbox"][1] <= table1_height + assert 0 <= cell["bbox"][2] <= table1_width + assert 0 <= cell["bbox"][3] <= table1_height +