diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f85a520 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,19 @@ +name: Unit tests + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install python dependencies + run: | + pip install poetry + poetry install + - name: Run tests + run: poetry run pytest \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/integration.yml similarity index 100% rename from .github/workflows/tests.yml rename to .github/workflows/integration.yml diff --git a/README.md b/README.md index 4f34f42..e97806e 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ pdftext PDF_PATH --out_path output.txt - `--out_path` path to the output txt file. If not specified, will write to stdout. - `--sort` will attempt to sort in reading order if specified. - `--keep_hyphens` will keep hyphens in the output (they will be stripped and words joined otherwise) -- `--pages` will specify pages (comma separated) to extract +- `--page_range` will specify pages (comma separated) to extract. Like `0,5-10,12`. - `--workers` specifies the number of parallel workers to use - `--flatten_pdf` merges form fields into the PDF diff --git a/extract_text.py b/extract_text.py index 19e2e00..ed8f5d5 100644 --- a/extract_text.py +++ b/extract_text.py @@ -1,41 +1,57 @@ -import argparse import json +from pathlib import Path +from typing import List + +import click import pypdfium2 as pdfium from pdftext.extraction import plain_text_output, dictionary_output - -def main(): - parser = argparse.ArgumentParser(description="Extract plain text from PDF. 
Not guaranteed to be in order.") - parser.add_argument("pdf_path", type=str, help="Path to the PDF file") - parser.add_argument("--out_path", type=str, help="Path to the output text file, defaults to stdout", default=None) - parser.add_argument("--json", action="store_true", help="Output json instead of plain text", default=False) - parser.add_argument("--sort", action="store_true", help="Attempt to sort the text by reading order", default=False) - parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False) - parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None) - parser.add_argument("--flatten_pdf", action="store_true", help="Flatten form fields and annotations into page contents", default=False) - parser.add_argument("--keep_chars", action="store_true", help="Keep character level information", default=False) - parser.add_argument("--workers", type=int, help="Number of workers to use for parallel processing", default=None) - args = parser.parse_args() - +def parse_range_str(range_str: str) -> List[int]: + range_lst = range_str.split(",") + page_lst = [] + for i in range_lst: + if "-" in i: + start, end = i.split("-") + page_lst += list(range(int(start), int(end) + 1)) + else: + page_lst.append(int(i)) + page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order + return page_lst + +@click.command(help="Extract plain text or JSON from PDF.") +@click.argument("pdf_path", type=click.Path(exists=True)) +@click.option("--out_path", type=click.Path(exists=False), help="Path to the output text file, defaults to stdout") +@click.option("--json", is_flag=True, help="Output json instead of plain text", default=False) +@click.option("--sort", is_flag=True, help="Attempt to sort the text by reading order", default=False) +@click.option("--keep_hyphens", is_flag=True, help="Keep hyphens in words", default=False) +@click.option("--page_range", type=str, 
help="Page numbers or ranges to extract, comma separated like 1,2-4,10", default=None) +@click.option("--flatten_pdf", is_flag=True, help="Flatten form fields and annotations into page contents", default=False) +@click.option("--keep_chars", is_flag=True, help="Keep character level information", default=False) +@click.option("--workers", type=int, help="Number of workers to use for parallel processing", default=None) +def main( + pdf_path: Path, + out_path: Path | None, + **kwargs +): pages = None - if args.pages is not None: - pdf_doc = pdfium.PdfDocument(args.pdf_path) - pages = [int(p) for p in args.pages.split(",")] + if kwargs["page_range"] is not None: + pdf_doc = pdfium.PdfDocument(pdf_path) + pages = parse_range_str(kwargs["page_range"]) doc_len = len(pdf_doc) pdf_doc.close() - assert all(p <= doc_len for p in pages), "Invalid page number(s) provided" + assert all(0 <= p < doc_len for p in pages), "Invalid page number(s) provided" - if args.json: - text = dictionary_output(args.pdf_path, sort=args.sort, page_range=pages, flatten_pdf=args.flatten_pdf, keep_chars=args.keep_chars, workers=args.workers) + if kwargs["json"]: + text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"]) text = json.dumps(text) else: - text = plain_text_output(args.pdf_path, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages, flatten_pdf=args.flatten_pdf, workers=args.workers) + text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"]) - if args.out_path is None: + if out_path is None: print(text) else: - with open(args.out_path, "w+") as f: + with open(out_path, "w+") as f: f.write(text) diff --git a/pdftext/extraction.py b/pdftext/extraction.py index 9daf8ba..b01ffc3 100644 --- a/pdftext/extraction.py +++ b/pdftext/extraction.py @@ -9,7 +9,9 @@ from 
pdftext.pdf.pages import get_pages from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks +from pdftext.schema import Pages, TableInputs, Tables from pdftext.settings import settings +from pdftext.tables import table_cell_text def _load_pdf(pdf, flatten_pdf): @@ -22,7 +24,7 @@ def _load_pdf(pdf, flatten_pdf): return pdf -def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True): +def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True) -> Pages: return get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox) @@ -38,7 +40,7 @@ def worker_init(pdf_path, flatten_pdf): atexit.register(partial(worker_shutdown, pdf_doc)) -def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None): +def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None) -> Pages: pdf_doc = _load_pdf(pdf_path, flatten_pdf) if page_range is None: page_range = range(len(pdf_doc)) @@ -70,7 +72,7 @@ def plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flat def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flatten_pdf=False, workers=None) -> List[str]: - pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf) + pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf) text = [] for page in pages: text.append(merge_text(page, sort=sort, hyphens=hyphens).strip()) @@ -87,8 +89,16 @@ def _process_span(span, page_width, page_height, keep_chars): char["bbox"] = char["bbox"].bbox -def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None): - pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox) +def dictionary_output( + pdf_path, + sort=False, + page_range=None, + keep_chars=False, + flatten_pdf=False, + quote_loosebox=True, 
+ workers=None +) -> Pages: + pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox) for page in pages: page_width, page_height = page["width"], page["height"] for block in page["blocks"]: @@ -111,3 +121,27 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f page["width"], page["height"] = page["height"], page["width"] page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]] return pages + +def table_output( + pdf_path: str, + table_inputs: TableInputs, + page_range=None, + flatten_pdf=False, + quote_loosebox=True, + workers=None, + pages: Pages | None = None +) -> List[Tables]: + # Extract pages if they don't exist + if not pages: + pages: Pages = dictionary_output(pdf_path, page_range=page_range, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox, workers=workers, keep_chars=True) + + assert len(pages) == len(table_inputs), "Number of pages and table inputs must match" + + # Extract table cells per page + out_tables = [] + for page, table_input in zip(pages, table_inputs): + tables = table_cell_text(table_input["tables"], page, table_input["img_size"]) + assert len(tables) == len(table_input["tables"]), "Number of tables and table inputs must match" + out_tables.append(tables) + return out_tables + diff --git a/pdftext/schema.py b/pdftext/schema.py index 20e5e8b..3692720 100644 --- a/pdftext/schema.py +++ b/pdftext/schema.py @@ -7,6 +7,9 @@ class Bbox: def __init__(self, bbox: List[float]): self.bbox = bbox + def __getitem__(self, item): + return self.bbox[item] + @property def height(self): return self.bbox[3] - self.bbox[1] @@ -101,6 +104,18 @@ def rotate(self, page_width: float, page_height: float, rotation: int) -> Bbox: return Bbox(rotated_bbox) + def rescale(self, img_size: List[int], page: Page) -> Bbox: + w_scale = img_size[0] / page["width"] + h_scale = img_size[1] / page["height"] + new_bbox = [ + self.bbox[0] * w_scale, + 
self.bbox[1] * h_scale, + self.bbox[2] * w_scale, + self.bbox[3] * h_scale + ] + + return Bbox(new_bbox) + class Char(TypedDict): bbox: Bbox @@ -116,7 +131,7 @@ class Span(TypedDict): font: Dict[str, Union[Any, str]] font_weight: float font_size: float - chars: List[Char] + chars: List[Char] | None char_start_idx: int char_end_idx: int @@ -137,6 +152,15 @@ class Page(TypedDict): width: int height: int blocks: List[Block] + rotation: int + +class TableCell(TypedDict): + text: str + bbox: Bbox + +class TableInput(TypedDict): + tables: List[List[int]] + img_size: List[int] Chars = List[Char] @@ -144,3 +168,5 @@ class Page(TypedDict): Lines = List[Line] Blocks = List[Block] Pages = List[Page] +Tables = List[List[TableCell]] +TableInputs = List[TableInput] diff --git a/pdftext/tables.py b/pdftext/tables.py new file mode 100644 index 0000000..688989c --- /dev/null +++ b/pdftext/tables.py @@ -0,0 +1,129 @@ +from typing import List +import numpy as np + +from pdftext.schema import Pages, Page, Bbox, Tables + + +def sort_text_lines(lines: List[dict], tolerance=1.25): + # Sorts in reading order. Not 100% accurate, this should only + # be used as a starting point for more advanced sorting. 
+ vertical_groups = {} + for line in lines: + group_key = round(line["bbox"][1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(line) + + # Sort each group horizontally and flatten the groups into a single list + sorted_lines = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x["bbox"][0]) + sorted_lines.extend(sorted_group) + + return sorted_lines + + +def get_dynamic_gap_thresh(page: Page, img_size: list, default_thresh=.01, min_chars=100): + space_dists = [] + for block in page["blocks"]: + for line in block["lines"]: + for span in line["spans"]: + for i in range(1, len(span["chars"])): + char1 = span["chars"][i - 1] + char2 = span["chars"][i] + if page["rotation"] == 90: + space_dists.append((char2["bbox"][0] - char1["bbox"][2]) / img_size[0]) + elif page["rotation"] == 180: + space_dists.append((char2["bbox"][1] - char1["bbox"][3]) / img_size[1]) + elif page["rotation"] == 270: + space_dists.append((char1["bbox"][0] - char2["bbox"][2]) / img_size[0]) + else: + space_dists.append((char1["bbox"][1] - char2["bbox"][3]) / img_size[1]) + cell_gap_thresh = np.percentile(space_dists, 80) if len(space_dists) > min_chars else default_thresh + return cell_gap_thresh + + +def is_same_span(char, curr_box, img_size, space_thresh, rotation): + def normalized_diff(a, b, dimension, mult=1, use_abs=True): + func = abs if use_abs else lambda x: x + return func(a - b) / img_size[dimension] < space_thresh * mult + + bbox = char["bbox"] + if rotation == 90: + return all([ + normalized_diff(bbox[0], curr_box[0], 0, use_abs=False), + normalized_diff(bbox[1], curr_box[3], 1), + normalized_diff(bbox[0], curr_box[0], 0, mult=5) + ]) + elif rotation == 180: + return all([ + normalized_diff(bbox[2], curr_box[0], 0, use_abs=False), + normalized_diff(bbox[1], curr_box[1], 1), + normalized_diff(bbox[2], curr_box[0], 1, mult=5) + ]) + elif rotation == 270: + 
return all([ + normalized_diff(bbox[0], curr_box[0], 0, use_abs=False), + normalized_diff(bbox[3], curr_box[1], 1), + normalized_diff(bbox[0], curr_box[0], 1, mult=5) + ]) + else: # 0 or default case + return all([ + normalized_diff(bbox[0], curr_box[2], 0, use_abs=False), + normalized_diff(bbox[1], curr_box[1], 1), + normalized_diff(bbox[0], curr_box[2], 1, mult=5) + ]) + + +def table_cell_text(tables: List[List[int]], page: Page, img_size: list, table_thresh=.8, space_thresh=.01) -> Tables: + # Note: table is a list of 4 ints representing the bounding box of the table. This is against the image dims - this can be different from the page dims. + # We rescale the characters below to account for this. + assert all(len(table) == 4 for table in tables), "Tables must be a list of 4 ints representing the bounding box of the table" + assert len(img_size) == 2, "img_size must be a list of 2 ints representing the image dimensions width, height" + + table_texts = [] + space_thresh = max(space_thresh, get_dynamic_gap_thresh(page, img_size, default_thresh=space_thresh)) + for table in tables: + table_poly = Bbox(bbox=table) + table_text = [] + rotation = page["rotation"] + + for block in page["blocks"]: + for line in block["lines"]: + if line["bbox"].intersection_pct(table_poly) < table_thresh: + continue + curr_span = None + curr_box = None + for span in line["spans"]: + for char in span["chars"]: + char["bbox"] = char["bbox"].rescale(img_size, page) # Rescale to match image dimensions + same_span = False + if curr_span: + same_span = is_same_span(char, curr_box, img_size, space_thresh, rotation) + + if curr_span is None: + curr_span = char["char"] + curr_box = char["bbox"] + elif same_span: + curr_span += char["char"] + curr_box = [min(curr_box[0], char["bbox"][0]), min(curr_box[1], char["bbox"][1]), + max(curr_box[2], char["bbox"][2]), max(curr_box[3], char["bbox"][3])] + else: + if curr_span.strip(): + table_text.append({"text": curr_span, "bbox": curr_box}) + curr_span = 
char["char"] + curr_box = char["bbox"] + if curr_span is not None and curr_span.strip(): + table_text.append({"text": curr_span, "bbox": curr_box}) + # Adjust to be relative to input table + for item in table_text: + item["bbox"] = [ + item["bbox"][0] - table[0], + item["bbox"][1] - table[1], + item["bbox"][2] - table[0], + item["bbox"][3] - table[1] + ] + item["bbox"] = Bbox(bbox=item["bbox"]) + table_text = sort_text_lines(table_text) + table_texts.append(table_text) + return table_texts \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 89cff96..8b9d267 100644 --- a/poetry.lock +++ b/poetry.lock @@ -355,6 +355,20 @@ files = [ {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] +[[package]] +name = "click" +version = "8.1.8" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -476,6 +490,20 @@ files = [ graph = ["objgraph (>=1.7.2)"] profile = ["gprof2dot (>=2022.7.29)"] +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "filelock" version = "3.16.1" @@ -683,6 +711,17 @@ files = [ 
[package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "multidict" version = "6.1.0" @@ -1105,6 +1144,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa typing = ["typing-extensions"] xmp = ["defusedxml"] +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "propcache" version = "0.2.1" @@ -1452,6 +1506,28 @@ files = [ {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, ] +[[package]] +name = "pytest" +version = "8.3.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, + {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = 
"python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1699,6 +1775,47 @@ files = [ [package.extras] widechars = ["wcwidth"] +[[package]] +name = "tomli" +version = "2.2.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"}, + {file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"}, + {file = 
"tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"}, + {file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"}, + {file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"}, + {file = 
"tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"}, + {file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"}, + {file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"}, + {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, + {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, +] + [[package]] name = "tqdm" version = "4.67.1" @@ -1990,4 +2107,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "bcda666dbc1a9479fe2cb0c91e620a14649d3321b8acd9af332e7db5441b4e9d" +content-hash = "ddd1896418384a59ea5cc93165805531aaa09c17748180768da7266c57c8307e" diff --git a/pyproject.toml b/pyproject.toml index bd81db1..64ed8f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ pypdfium2 = "=4.30.0" pydantic = "^2.7.1" pydantic-settings = "^2.2.1" +click = 
"^8.1.8" [tool.poetry.group.dev.dependencies] pymupdf = "^1.24.2" datasets = "^2.19.0" @@ -28,6 +29,7 @@ pillow = "^10.3.0" rapidfuzz = "^3.8.1" tabulate = "^0.9.0" +pytest = "^8.3.4" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..05ef295 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +testpaths=tests +pythonpath=. +filterwarnings = + ignore::UserWarning + ignore::PendingDeprecationWarning + ignore::DeprecationWarning \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4eefc39 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest +import pypdfium2 as pdfium + +@pytest.fixture(scope="session") +def pdf_path(): + return "tests/data/adversarial.pdf" + +@pytest.fixture() +def pdf_doc(pdf_path): + doc = pdfium.PdfDocument(pdf_path) + yield doc + doc.close() diff --git a/tests/data/adversarial.pdf b/tests/data/adversarial.pdf new file mode 100644 index 0000000..83cb649 Binary files /dev/null and b/tests/data/adversarial.pdf differ diff --git a/tests/test_extraction.py b/tests/test_extraction.py new file mode 100644 index 0000000..43eb628 --- /dev/null +++ b/tests/test_extraction.py @@ -0,0 +1,14 @@ +from pdftext.extraction import paginated_plain_text_output, plain_text_output + + +def test_paginated_output(pdf_path, pdf_doc): + text = paginated_plain_text_output(pdf_path) + assert len(text) == len(pdf_doc) + assert "Subspace" in text[0] + +def test_plain_text_output(pdf_path): + text = plain_text_output(pdf_path) + assert "Subspace" in text + + +def test_page_range(pdf_path):