From fae2334bf16573044abd2b40fb36b644db5b0ec8 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Thu, 25 Apr 2024 15:49:27 -0700
Subject: [PATCH] Drop validation

---
 .github/workflows/tests.yml |  2 +-
 README.md                   | 19 ++++++++++++-------
 benchmark.py                | 14 ++++++++++----
 pdftext/inference.py        |  6 +++++-
 pdftext/postprocessing.py   |  6 ++----
 pyproject.toml              |  2 +-
 6 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b7cb985..c9239f8 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -20,6 +20,6 @@ jobs:
           poetry install
       - name: Run detection benchmark test
         run: |
-          poetry run python benchmark.py --max 5 --result_path results
+          poetry run python benchmark.py --max 5 --result_path results --pdftext_only
           poetry run python scripts/verify_benchmark_scores.py results/results.json 
 
diff --git a/README.md b/README.md
index 6eab318..67d41fc 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,10 @@
 
 Text extraction like [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license.  PDFText extracts plain text or structured blocks and lines.  It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed.
 
+## Community
+
+[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
+
 # Installation
 
 You'll need python 3.9+ first.  Then run `pip install pdftext`.
@@ -77,17 +81,17 @@ If you want more customization, check out the `pdftext.extraction._get_pages` fu
 
 # Benchmarks
 
-I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext.  I chose pymupdf because it extracts blocks and lines.  Pdfplumber extracts words and bboxes.  I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual words/lines and bbox information.
+I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext.  I chose pymupdf because it extracts blocks and lines.  Pdfplumber extracts words and bboxes.  I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual character/line/block and bbox information.
 
-Here are the scores:
+Here are the scores, run on an M1 MacBook, without multiprocessing:
 
 | Library    | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
 |------------|-------------------|-----------------------------------------|
 | pymupdf    | 0.32              | --                                      |
-| pdftext    | 1.79              | 96.22                                   |
-| pdfplumber | 3.0               | 89.88                                   |
+| pdftext    | 1.57              | 97.66                                   |
+| pdfplumber | 3.0               | 90.3                                    |
 
-pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information).
+pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same character information).
 
 There are additional benchmarks for pypdfium2 and other tools [here](https://github.com/py-pdf/benchmarks).
 
@@ -95,7 +99,7 @@ There are additional benchmarks for pypdfium2 and other tools [here](https://git
 
 I used a benchmark set of 200 pdfs extracted from [common crawl](https://huggingface.co/datasets/pixparse/pdfa-eng-wds), then processed by a team at HuggingFace.
 
-For each library, I used a detailed extraction method, to pull out font information, as well as just the words.  This ensured we were comparing similar performance numbers.
+For each library, I used a detailed extraction method, to pull out font information, as well as just the words.  This ensured we were comparing similar performance numbers.  I formatted the text similarly when extracting - newlines after lines, and double newlines after blocks.  For pdfplumber, I could only do the newlines after lines, since it doesn't recognize blocks.
 
 For the alignment score, I extracted the text, then used the rapidfuzz library to find the alignment percentage.  I used the text extracted by pymupdf as the pseudo-ground truth.
 
@@ -114,10 +118,11 @@ The benchmark script has a few options:
 
 - `--max` this controls the maximum number of pdfs to benchmark
 - `--result_path` a folder to save the results.  A file called `results.json` will be created in the folder.
+- `--pdftext_only` skip running pdfplumber, which can be slow.
 
 # How it works
 
-PDFText is a very light wrapper around pypdfium2.  It first uses pypdfium2 to extract characters in order, along with font and other information.  Then it uses a simple decision tree algorithm to group characters into lines and blocks.  It then done some simple postprocessing to clean up the text.
+PDFText is a very light wrapper around pypdfium2.  It first uses pypdfium2 to extract characters in order, along with font and other information.  Then it uses a simple decision tree algorithm to group characters into lines and blocks.  It does some simple postprocessing to clean up the text.
 
 # Credits
 
diff --git a/benchmark.py b/benchmark.py
index a2ba903..fc70b92 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -30,8 +30,8 @@ def pymupdf_inference(pdf_path):
             for line in block["lines"]:
                 for span in line["spans"]:
                     text += span["text"]
-            if not text.endswith("\n"):
-                text += "\n\n"
+                text = text.rstrip() + "\n"
+            text = text.rstrip() + "\n\n"
         pages.append(text)
     return pages
 
@@ -41,8 +41,10 @@ def pdfplumber_inference(pdf_path):
         pages = []
         for i in range(len(pdf.pages)):
             page = pdf.pages[i]
-            words = page.extract_words(use_text_flow=True)
-            text = "".join([word["text"] for word in words])
+            lines = page.extract_text_lines(strip=False, return_chars=True, keep_text_flow=True)
+            text = ""
+            for line in lines:
+                text += line["text"].rstrip() + "\n"
             pages.append(text)
     return pages
 
@@ -55,6 +57,7 @@ def main():
     parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
     parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
     parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
+    parser.add_argument("--pdftext_only", action="store_true", help="Only run pdftext inference", default=False)
     args = parser.parse_args()
 
     split = "train"
@@ -66,6 +69,9 @@ def main():
     alignments = defaultdict(list)
     times_tools = ["pymupdf", "pdftext", "pdfplumber"]
     alignment_tools = ["pdftext", "pdfplumber"]
+    if args.pdftext_only:
+        times_tools = ["pdftext", "pymupdf"]
+        alignment_tools = ["pdftext"]
     model = get_model()
     for i in tqdm(range(len(dataset)), desc="Benchmarking"):
         row = dataset[i]
diff --git a/pdftext/inference.py b/pdftext/inference.py
index e4352be..14756ab 100644
--- a/pdftext/inference.py
+++ b/pdftext/inference.py
@@ -1,5 +1,7 @@
 from itertools import chain
 
+import sklearn
+
 from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
 
 
@@ -152,7 +154,9 @@ def inference(text_chars, model):
         training_rows = [tl[1] for tl in training_list]
         training_idxs = [tl[0] for tl in training_list]
 
-        predictions = model.predict(training_rows)
+        # Skip sklearn's NaN/infinity input validation for a small speedup
+        with sklearn.config_context(assume_finite=True):
+            predictions = model.predict(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
     page_blocks = sorted(page_blocks.items())
diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py
index 9d06389..6b8d9eb 100644
--- a/pdftext/postprocessing.py
+++ b/pdftext/postprocessing.py
@@ -62,11 +62,9 @@ def merge_text(page: Dict, sort=False) -> str:
             for char in line["chars"]:
                 line_text += char["char"]
             line_text = postprocess_text(line_text)
-            if line_text.endswith("\n"):
-                line_text = line_text[:-1].strip() + " "
+            line_text = line_text.rstrip() + "\n"
 
             block_text += line_text
-        if not block_text.endswith("\n"):
-            block_text += "\n\n"
+        block_text = block_text.rstrip() + "\n\n"
         text += block_text
     return text
diff --git a/pyproject.toml b/pyproject.toml
index 8fd2c7a..dbd5d28 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.1.1"
+version = "0.1.2"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"