From fae2334bf16573044abd2b40fb36b644db5b0ec8 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 25 Apr 2024 15:49:27 -0700 Subject: [PATCH] Drop validation --- .github/workflows/tests.yml | 2 +- README.md | 19 ++++++++++++------- benchmark.py | 14 ++++++++++---- pdftext/inference.py | 6 +++++- pdftext/postprocessing.py | 6 ++---- pyproject.toml | 2 +- 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b7cb985..c9239f8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,6 @@ jobs: poetry install - name: Run detection benchmark test run: | - poetry run python benchmark.py --max 5 --result_path results + poetry run python benchmark.py --max 5 --result_path results --pdftext_only poetry run python scripts/verify_benchmark_scores.py results/results.json diff --git a/README.md b/README.md index 6eab318..67d41fc 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ Text extraction like [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license. PDFText extracts plain text or structured blocks and lines. It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed. +## Community + +[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development. + # Installation You'll need python 3.9+ first. Then run `pip install pdftext`. @@ -77,17 +81,17 @@ If you want more customization, check out the `pdftext.extraction._get_pages` fu # Benchmarks -I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext. I chose pymupdf because it extracts blocks and lines. Pdfplumber extracts words and bboxes. I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual words/lines and bbox information. 
+I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext. I chose pymupdf because it extracts blocks and lines. Pdfplumber extracts words and bboxes. I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual character/line/block and bbox information. -Here are the scores: +Here are the scores, run on an M1 MacBook, without multiprocessing: | Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) | |------------|-------------------|-----------------------------------------| | pymupdf | 0.32 | -- | -| pdftext | 1.79 | 96.22 | -| pdfplumber | 3.0 | 89.88 | +| pdftext | 1.57 | 97.66 | +| pdfplumber | 3.0 | 90.3 | -pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information). +pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same character information). There are additional benchmarks for pypdfium2 and other tools [here](https://github.com/py-pdf/benchmarks). @@ -95,7 +99,7 @@ There are additional benchmarks for pypdfium2 and other tools [here](https://git I used a benchmark set of 200 pdfs extracted from [common crawl](https://huggingface.co/datasets/pixparse/pdfa-eng-wds), then processed by a team at HuggingFace. -For each library, I used a detailed extraction method, to pull out font information, as well as just the words. This ensured we were comparing similar performance numbers. +For each library, I used a detailed extraction method, to pull out font information, as well as just the words. This ensured we were comparing similar performance numbers. I formatted the text similarly when extracting - newlines after lines, and double newlines after blocks. For pdfplumber, I could only do the newlines after lines, since it doesn't recognize blocks. 
For the alignment score, I extracted the text, then used the rapidfuzz library to find the alignment percentage. I used the text extracted by pymupdf as the pseudo-ground truth. @@ -114,10 +118,11 @@ The benchmark script has a few options: - `--max` this controls the maximum number of pdfs to benchmark - `--result_path` a folder to save the results. A file called `results.json` will be created in the folder. +- `--pdftext_only` skip running pdfplumber, which can be slow. # How it works -PDFText is a very light wrapper around pypdfium2. It first uses pypdfium2 to extract characters in order, along with font and other information. Then it uses a simple decision tree algorithm to group characters into lines and blocks. It then done some simple postprocessing to clean up the text. +PDFText is a very light wrapper around pypdfium2. It first uses pypdfium2 to extract characters in order, along with font and other information. Then it uses a simple decision tree algorithm to group characters into lines and blocks. It does some simple postprocessing to clean up the text. 
# Credits diff --git a/benchmark.py b/benchmark.py index a2ba903..fc70b92 100644 --- a/benchmark.py +++ b/benchmark.py @@ -30,8 +30,8 @@ def pymupdf_inference(pdf_path): for line in block["lines"]: for span in line["spans"]: text += span["text"] - if not text.endswith("\n"): - text += "\n\n" + text = text.rstrip() + "\n" + text = text.rstrip() + "\n\n" pages.append(text) return pages @@ -41,8 +41,10 @@ def pdfplumber_inference(pdf_path): pages = [] for i in range(len(pdf.pages)): page = pdf.pages[i] - words = page.extract_words(use_text_flow=True) - text = "".join([word["text"] for word in words]) + lines = page.extract_text_lines(strip=False, return_chars=True, keep_text_flow=True) + text = "" + for line in lines: + text += line["text"].rstrip() + "\n" pages.append(text) return pages @@ -55,6 +57,7 @@ def main(): parser = argparse.ArgumentParser(description="Benchmark pdf extraction.") parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None) parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None) + parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False) args = parser.parse_args() split = "train" @@ -66,6 +69,9 @@ def main(): alignments = defaultdict(list) times_tools = ["pymupdf", "pdftext", "pdfplumber"] alignment_tools = ["pdftext", "pdfplumber"] + if args.pdftext_only: + times_tools = ["pdftext", "pymupdf"] + alignment_tools = ["pdftext"] model = get_model() for i in tqdm(range(len(dataset)), desc="Benchmarking"): row = dataset[i] diff --git a/pdftext/inference.py b/pdftext/inference.py index e4352be..14756ab 100644 --- a/pdftext/inference.py +++ b/pdftext/inference.py @@ -1,5 +1,7 @@ from itertools import chain +import sklearn + from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES @@ -152,7 +154,9 @@ def inference(text_chars, model): training_rows = [tl[1] for tl in training_list] training_idxs = [tl[0] 
for tl in training_list] - predictions = model.predict(training_rows) + # Disable nan, etc, validation for a small speedup + with sklearn.config_context(assume_finite=True): + predictions = model.predict(training_rows) for pred, page_idx in zip(predictions, training_idxs): next_prediction[page_idx] = pred page_blocks = sorted(page_blocks.items()) diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py index 9d06389..6b8d9eb 100644 --- a/pdftext/postprocessing.py +++ b/pdftext/postprocessing.py @@ -62,11 +62,9 @@ def merge_text(page: Dict, sort=False) -> str: for char in line["chars"]: line_text += char["char"] line_text = postprocess_text(line_text) - if line_text.endswith("\n"): - line_text = line_text[:-1].strip() + " " + line_text = line_text.rstrip() + "\n" block_text += line_text - if not block_text.endswith("\n"): - block_text += "\n\n" + block_text = block_text.rstrip() + "\n\n" text += block_text return text diff --git a/pyproject.toml b/pyproject.toml index 8fd2c7a..dbd5d28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.1.1" +version = "0.1.2" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"