diff --git a/README.md b/README.md index e97806e..ebe09e9 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ pdftext PDF_PATH --out_path output.txt --json - `--out_path` path to the output txt file. If not specified, will write to stdout. - `--json` specifies json output - `--sort` will attempt to sort in reading order if specified. -- `--pages` will specify pages (comma separated) to extract +- `--page_range` specifies the pages to extract as a comma-separated list of page numbers and ranges, such as `0,5-10,12`. - `--keep_chars` will keep individual characters in the json output - `--workers` specifies the number of parallel workers to use - `--flatten_pdf` merges form fields into the PDF @@ -88,6 +88,22 @@ from pdftext.extraction import dictionary_output text = dictionary_output(PDF_PATH, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above ``` +Extract text from table cells: + +```python +from pdftext.extraction import table_output + +table_inputs = [ + # Each dictionary entry is a single page + { + "tables": [[5,10,10,20]], # Coordinates for tables on the page + "img_size": [512, 512] # The size of the image the tables were detected in + } +] +text = table_output(PDF_PATH, table_inputs, page_range=[1,2,3]) + +``` + If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well. 
# Benchmarks diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py index 7c38076..eb77c28 100644 --- a/pdftext/postprocessing.py +++ b/pdftext/postprocessing.py @@ -76,8 +76,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List: # Sort blocks into best guess reading order vertical_groups = {} for block in blocks: - bbox = block["bbox"] - group_key = round(bbox[1] / tolerance) * tolerance + group_key = round(block["bbox"][1] / tolerance) * tolerance if group_key not in vertical_groups: vertical_groups[group_key] = [] vertical_groups[group_key].append(block) diff --git a/pdftext/tables.py b/pdftext/tables.py index 688989c..e819f83 100644 --- a/pdftext/tables.py +++ b/pdftext/tables.py @@ -1,26 +1,8 @@ from typing import List import numpy as np -from pdftext.schema import Pages, Page, Bbox, Tables - - -def sort_text_lines(lines: List[dict], tolerance=1.25): - # Sorts in reading order. Not 100% accurate, this should only - # be used as a starting point for more advanced sorting. 
- vertical_groups = {} - for line in lines: - group_key = (line["bbox"][1] / tolerance) * tolerance - if group_key not in vertical_groups: - vertical_groups[group_key] = [] - vertical_groups[group_key].append(line) - - # Sort each group horizontally and flatten the groups into a single list - sorted_lines = [] - for _, group in sorted(vertical_groups.items()): - sorted_group = sorted(group, key=lambda x: x["bbox"][0]) - sorted_lines.extend(sorted_group) - - return sorted_lines +from pdftext.postprocessing import sort_blocks +from pdftext.schema import Page, Bbox, Tables def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_chars=100): @@ -43,12 +25,11 @@ def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_c return cell_gap_thresh -def is_same_span(char, curr_box, img_size, space_thresh, rotation): +def is_same_span(bbox, curr_box, img_size, space_thresh, rotation): def normalized_diff(a, b, dimension, mult=1, use_abs=True): func = abs if use_abs else lambda x: x return func(a - b) / img_size[dimension] < space_thresh * mult - bbox = char["bbox"] if rotation == 90: return all([ normalized_diff(bbox[0], curr_box[0], 0, use_abs=False), @@ -90,29 +71,30 @@ def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_t for block in page["blocks"]: for line in block["lines"]: - if line["bbox"].intersection_pct(table_poly) < table_thresh: + line_bbox = Bbox(bbox=line["bbox"]).rescale(img_size, page) + if line_bbox.intersection_pct(table_poly) < table_thresh: continue curr_span = None curr_box = None for span in line["spans"]: for char in span["chars"]: - char["bbox"] = char["bbox"].rescale(img_size, page) # Rescale to match image dimensions + bbox = Bbox(bbox=char["bbox"]).rescale(img_size, page).bbox same_span = False if curr_span: - same_span = is_same_span(char, curr_box, img_size, space_thresh, rotation) + same_span = is_same_span(bbox, curr_box, img_size, space_thresh, rotation) if curr_span is 
None: curr_span = char["char"] - curr_box = char["bbox"] + curr_box = bbox elif same_span: curr_span += char["char"] - curr_box = [min(curr_box[0], char["bbox"][0]), min(curr_box[1], char["bbox"][1]), - max(curr_box[2], char["bbox"][2]), max(curr_box[3], char["bbox"][3])] + curr_box = [min(curr_box[0], bbox[0]), min(curr_box[1], bbox[1]), + max(curr_box[2], bbox[2]), max(curr_box[3], bbox[3])] else: if curr_span.strip(): table_text.append({"text": curr_span, "bbox": curr_box}) curr_span = char["char"] - curr_box = char["bbox"] + curr_box = bbox if curr_span is not None and curr_span.strip(): table_text.append({"text": curr_span, "bbox": curr_box}) # Adjust to be relative to input table @@ -124,6 +106,6 @@ def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_t item["bbox"][3] - table[1] ] item["bbox"] = Bbox(bbox=item["bbox"]) - table_text = sort_text_lines(table_text) + table_text = sort_blocks(table_text) table_texts.append(table_text) return table_texts \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 64ed8f2..949be5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.4.1" +version = "0.5.0" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0" diff --git a/tests/test_extraction.py b/tests/test_extraction.py index 43eb628..98f119e 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -1,4 +1,5 @@ -from pdftext.extraction import paginated_plain_text_output, plain_text_output +from pdftext.extraction import paginated_plain_text_output, plain_text_output, dictionary_output +from pdftext.schema import Pages def test_paginated_output(pdf_path, pdf_doc): @@ -10,5 +11,17 @@ def text_plain_text_output(pdf_path): text = plain_text_output(pdf_path) assert "Subspace" in text - def test_page_range(pdf_path): + pages = [0, 1, 3] + text = paginated_plain_text_output(pdf_path, 
page_range=pages) + assert len(text) == len(pages) + +def test_json_output(pdf_path, pdf_doc): + pages: Pages = dictionary_output(pdf_path) + assert len(pages) == len(pdf_doc) + assert "Subspace" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["text"] + +def test_keep_chars(pdf_path): + pages: Pages = dictionary_output(pdf_path, keep_chars=True) + assert "Subspace" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["text"] + assert "bbox" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["chars"][0] \ No newline at end of file diff --git a/tests/test_tables.py b/tests/test_tables.py new file mode 100644 index 0000000..9f16943 --- /dev/null +++ b/tests/test_tables.py @@ -0,0 +1,33 @@ +from pdftext.extraction import table_output + + +def test_table_extraction(pdf_path, pdf_doc): + pages = [5] + page_size = pdf_doc[5].get_size() + img_size = [p * 2 for p in page_size] + + # Rescale to img size + def rescale_table(bbox): + return [ + bbox[0] * img_size[0], + bbox[1] * img_size[1], + bbox[2] * img_size[0], + bbox[3] * img_size[1] + ] + + table_inputs = [ + { + "tables": [ + rescale_table([0.0925, 0.116, 0.871, 0.324]), + rescale_table([0.171, 0.365, 0.794, 0.492]) + ], + "img_size": img_size + } + ] + tables = table_output(pdf_path, table_inputs, page_range=pages) + assert len(tables) == 1 + assert len(tables[0]) == 2 + assert len(tables[0][0]) == 127 + assert len(tables[0][1]) == 74 + assert tables[0][0][-1]["text"].strip() == "58.45" + assert tables[0][1][-1]["text"].strip() == "7.0h"