Drop validation
VikParuchuri committed Apr 25, 2024
1 parent cc6a6e4 commit b2a4e8b
Showing 6 changed files with 14 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -20,6 +20,6 @@ jobs:
           poetry install
       - name: Run detection benchmark test
         run: |
-          poetry run python benchmark.py --max 5 --result_path results
+          poetry run python benchmark.py --max 5 --result_path results --pdftext_only
           poetry run python scripts/verify_benchmark_scores.py results/results.json
2 changes: 1 addition & 1 deletion README.md
@@ -84,7 +84,7 @@ Here are the scores:
 | Library    | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
 |------------|-------------------|-----------------------------------------|
 | pymupdf    | 0.32              | --                                      |
-| pdftext    | 1.79              | 96.22                                   |
+| pdftext    | 1.57              | 96.22                                   |
 | pdfplumber | 3.0               | 89.88                                   |
 
 pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information).
4 changes: 4 additions & 0 deletions benchmark.py
@@ -55,6 +55,7 @@ def main():
     parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
     parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
     parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
+    parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False)
     args = parser.parse_args()
 
     split = "train"
@@ -66,6 +67,9 @@
     alignments = defaultdict(list)
     times_tools = ["pymupdf", "pdftext", "pdfplumber"]
     alignment_tools = ["pdftext", "pdfplumber"]
+    if args.pdftext_only:
+        times_tools = ["pdftext", "pymupdf"]
+        alignment_tools = ["pdftext"]
     model = get_model()
     for i in tqdm(range(len(dataset)), desc="Benchmarking"):
         row = dataset[i]
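For reference, the new flag limits the benchmark to what CI needs: timing runs only for pdftext and pymupdf, and alignment is computed only for pdftext, so the slower pdfplumber comparison is skipped. The workflow change above invokes it as `poetry run python benchmark.py --max 5 --result_path results --pdftext_only`.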
6 changes: 5 additions & 1 deletion pdftext/inference.py
@@ -1,5 +1,7 @@
 from itertools import chain
 
+import sklearn
+
 from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
 
 
@@ -152,7 +154,9 @@ def inference(text_chars, model):
         training_rows = [tl[1] for tl in training_list]
         training_idxs = [tl[0] for tl in training_list]
 
-        predictions = model.predict(training_rows)
+        # Disable nan, etc, validation for a small speedup
+        with sklearn.config_context(assume_finite=True):
+            predictions = model.predict(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
     page_blocks = sorted(page_blocks.items())
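For context, `sklearn.config_context(assume_finite=True)` tells scikit-learn to skip the NaN/inf validation it normally performs on input arrays, which is where the small speedup comes from. A minimal sketch of the pattern, using a toy model and random data that are purely illustrative (not pdftext's actual model):

```python
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier

# Toy stand-in for pdftext's trained model, fit on random data.
X = np.random.rand(1000, 4)
y = np.random.randint(0, 2, size=1000)
model = DecisionTreeClassifier().fit(X, y)

# Inside this context, scikit-learn assumes inputs are finite and skips
# its usual NaN/inf checks, shaving a little overhead off each predict call.
with sklearn.config_context(assume_finite=True):
    predictions = model.predict(X)
```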
6 changes: 2 additions & 4 deletions pdftext/postprocessing.py
@@ -62,11 +62,9 @@ def merge_text(page: Dict, sort=False) -> str:
             for char in line["chars"]:
                 line_text += char["char"]
             line_text = postprocess_text(line_text)
-            if line_text.endswith("\n"):
-                line_text = line_text[:-1].strip() + " "
+            line_text = line_text.rstrip() + "\n"
 
             block_text += line_text
-        if not block_text.endswith("\n"):
-            block_text += "\n\n"
+        block_text = block_text.rstrip() + "\n\n"
         text += block_text
     return text
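A rough sketch of what the new normalization does: each line now always ends with exactly one newline and each block with a blank line, instead of the previous endswith checks. This is a simplified stand-in that drops `postprocess_text` and the `sort` handling, using a made-up `page` dict for illustration:

```python
def merge_text_sketch(page: dict) -> str:
    text = ""
    for block in page["blocks"]:
        block_text = ""
        for line in block["lines"]:
            line_text = "".join(char["char"] for char in line["chars"])
            # New behavior: trim trailing whitespace, then force a single newline.
            block_text += line_text.rstrip() + "\n"
        # New behavior: blocks are separated by exactly one blank line.
        text += block_text.rstrip() + "\n\n"
    return text


page = {"blocks": [{"lines": [{"chars": [{"char": c} for c in "Hello \n"]}]}]}
print(repr(merge_text_sketch(page)))  # 'Hello\n\n'
```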
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.1.1"
+version = "0.1.2"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"