diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b7cb985..c9239f8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,6 @@ jobs: poetry install - name: Run detection benchmark test run: | - poetry run python benchmark.py --max 5 --result_path results + poetry run python benchmark.py --max 5 --result_path results --pdftext_only poetry run python scripts/verify_benchmark_scores.py results/results.json diff --git a/README.md b/README.md index 6eab318..7a764d6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ Here are the scores: | Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) | |------------|-------------------|-----------------------------------------| | pymupdf | 0.32 | -- | -| pdftext | 1.79 | 96.22 | +| pdftext | 1.57 | 96.22 | | pdfplumber | 3.0 | 89.88 | pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information). diff --git a/benchmark.py b/benchmark.py index a2ba903..d6332f8 100644 --- a/benchmark.py +++ b/benchmark.py @@ -55,6 +55,7 @@ def main(): parser = argparse.ArgumentParser(description="Benchmark pdf extraction.") parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None) parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None) + parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False) args = parser.parse_args() split = "train" @@ -66,6 +67,9 @@ def main(): alignments = defaultdict(list) times_tools = ["pymupdf", "pdftext", "pdfplumber"] alignment_tools = ["pdftext", "pdfplumber"] + if args.pdftext_only: + times_tools = ["pdftext", "pymupdf"] + alignment_tools = ["pdftext"] model = get_model() for i in tqdm(range(len(dataset)), desc="Benchmarking"): row = dataset[i] diff --git a/pdftext/inference.py b/pdftext/inference.py index e4352be..14756ab 100644 --- 
a/pdftext/inference.py +++ b/pdftext/inference.py @@ -1,5 +1,7 @@ from itertools import chain +import sklearn + from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES @@ -152,7 +154,9 @@ def inference(text_chars, model): training_rows = [tl[1] for tl in training_list] training_idxs = [tl[0] for tl in training_list] - predictions = model.predict(training_rows) + # Disable nan, etc, validation for a small speedup + with sklearn.config_context(assume_finite=True): + predictions = model.predict(training_rows) for pred, page_idx in zip(predictions, training_idxs): next_prediction[page_idx] = pred page_blocks = sorted(page_blocks.items()) diff --git a/pyproject.toml b/pyproject.toml index 8fd2c7a..dbd5d28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.1.1" +version = "0.1.2" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"] license = "Apache-2.0"