Add alignment benchmark

VikParuchuri · Apr 25, 2024 · 7359e5e · 7359e5e
1 parent 4eef1e1
commit 7359e5e
Show file tree

Hide file tree

Showing 7 changed files with 229 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -1,16 +1,12 @@
 # PDFText
 
-Text extraction like PyMuPDF, but without the AGPL license.  PDFText extracts plain text or structured blocks and lines, similar to [PymuPDF](https://github.com/pymupdf/PyMuPDF).  It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](https://github.com/py-pdf/benchmarks), and Apache licensed.
+Text extraction like [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license.  PDFText extracts plain text or structured blocks and lines.  It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed.
 
 # Installation
 
-You'll need python 3.9+ first.  Then run:
+You'll need python 3.9+ first.  Then run `pip install pdftext`.
 
-```shell
-pip install pdftext
-```
-
-# CLI Usage
+# Usage
 
 - Inspect the settings in `pdftext/settings.py`.  You can override any settings with environment variables.
 
@@ -77,4 +73,42 @@ text = dictionary_output(PDF_PATH)
 
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper.  pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
 
+# Benchmarks
+
+I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext.
+
+Here are the scores:
+
++------------+-------------------+-----------------------------------------+
+|  Library   | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
++------------+-------------------+-----------------------------------------+
+|  pymupdf   |       0.31        |                   --                    |
+|  pdftext   |       1.55        |                  95.73                  |
+| pdfplumber |       3.39        |                  89.55                  |
++------------+-------------------+-----------------------------------------+
+
+pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information).
+
+There are additional benchmarks for pypdfium2 and other tools [here](https://github.com/py-pdf/benchmarks).
+
+## Methodology
+
+I used a benchmark set of 200 pdfs extracted from [common crawl](https://huggingface.co/datasets/pixparse/pdfa-eng-wds), then processed by a team at HuggingFace.
+
+For each library, I used a detailed extraction method, to pull out font information, as well as just the words.  This ensured we were comparing similar elements.
+
+For the alignment score, I extracted the text, flattened it by removing all non-newline whitespace, then used the rapidfuzz library to find the alignment percentage.  I used the text extracted by pymupdf as the pseudo-ground truth.
+
+# How it works
+
+PDFText is a very light wrapper around pypdfium2.  It first uses pypdfium2 to extract characters in order, along with font and other information.  Then it uses a simple decision tree algorithm to group characters into lines and blocks.  It then does some simple postprocessing to clean up the text.
+
+# Credits
+
+This is built on some amazing open source work, including:
+
+- [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
+- [scikit-learn](https://scikit-learn.org/stable/index.html)
+- [pypdf2](https://github.com/py-pdf/benchmarks) for very thorough and fair benchmarks
 
+Thank you to the [pymupdf](https://github.com/pymupdf/PyMuPDF) devs for creating such a great library - I just wish it had a simpler license!
diff --git a/benchmark.py b/benchmark.py
@@ -4,12 +4,15 @@
 from statistics import mean
 import os
 import json
+import re
 
 import fitz as pymupdf
 import datasets
 import pdfplumber
+from rapidfuzz import fuzz
+import tabulate
 
-from pdftext.extraction import dictionary_output
+from pdftext.extraction import paginated_plain_text_output
 from pdftext.settings import settings
 
 
@@ -18,7 +21,14 @@ def pymupdf_inference(pdf_path):
     pages = []
     for i in range(len(doc)):
         page = doc[i]
-        text = page.get_text("dict")
+        blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES)
+        text = ""
+        for block in blocks["blocks"]:
+            for line in block["lines"]:
+                for span in line["spans"]:
+                    text += span["text"]
+            if not text.endswith("\n"):
+                text += "\n\n"
         pages.append(text)
     return pages
 
@@ -33,6 +43,15 @@ def pdfplumber_inference(pdf_path):
     return pages
 
 
+def flatten_text(page: str):
+    # Remove all whitespace except newlines, so we can compare block parsing effectively.
+    return re.sub(r'[ \t\r\f\v]+', '', page)
+
+
+def compare_docs(doc1: str, doc2: str):
+    return fuzz.ratio(flatten_text(doc1), flatten_text(doc2))
+
+
 def main():
     parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
     parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
@@ -47,6 +66,8 @@ def main():
     mu_times = []
     pdftext_times = []
     pdfplumber_times = []
+    pdftext_alignment = []
+    pdfplumber_alignment = []
     for i in range(len(dataset)):
         row = dataset[i]
         pdf = row["pdf"]
@@ -61,21 +82,39 @@ def main():
 
 
             start = time.time()
-            pdftext_pages = dictionary_output(pdf_path)
+            pdftext_pages = paginated_plain_text_output(pdf_path)
             pdftext_times.append(time.time() - start)
 
             start = time.time()
             pdfplumber_pages = pdfplumber_inference(pdf_path)
             pdfplumber_times.append(time.time() - start)
 
-    print(f"MuPDF avg time: {mean(mu_times):.2f}")
-    print(f"pdfplumber avg time: {mean(pdfplumber_times):.2f}")
-    print(f"pdftext avg time: {mean(pdftext_times):.2f}")
+            alignments = [compare_docs(mu_page, pdftext_page) for mu_page, pdftext_page in zip(mu_pages, pdftext_pages)]
+            pdftext_alignment.append(mean(alignments))
+
+            alignments = [compare_docs(mu_page, pdfplumber_page) for mu_page, pdfplumber_page in zip(mu_pages, pdfplumber_pages)]
+            pdfplumber_alignment.append(mean(alignments))
+
+    print("Benchmark Scores")
+    headers = ["Library", "Time (s per page)", "Alignment Score (% accuracy vs pymupdf)"]
+    table = [
+        ["pymupdf", round(mean(mu_times), 2), "--"],
+        ["pdftext", round(mean(pdftext_times), 2), round(mean(pdftext_alignment), 2)],
+        ["pdfplumber", round(mean(pdfplumber_times), 2), round(mean(pdfplumber_alignment), 2)]
+    ]
+    table = tabulate.tabulate(table, tablefmt="pretty", headers=headers)
+    print(table)
 
     results = {
-        "mu_times": mu_times,
-        "pdftext_times": pdftext_times,
-        "pdfplumber_times": pdfplumber_times
+        "times": {
+            "pymupdf": mean(mu_times),
+            "pdftext": mean(pdftext_times),
+            "pdfplumber": mean(pdfplumber_times)
+        },
+        "alignments": {
+            "pdftext": pdftext_alignment,
+            "pdfplumber": pdfplumber_alignment
+        }
     }
 
     result_path = args.result_path

diff --git a/models/dt.joblib b/models/dt.joblib
diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -1,3 +1,5 @@
+from typing import List
+
 from pdftext.inference import inference
 from pdftext.model import get_model
 from pdftext.pdf.chars import get_pdfium_chars
@@ -12,11 +14,16 @@ def _get_pages(pdf_path):
     return pages
 
 
-def plain_text_output(pdf_path, sort=False):
+def plain_text_output(pdf_path, sort=False) -> str:
+    text = paginated_plain_text_output(pdf_path, sort=sort)
+    return "\n".join(text)
+
+
+def paginated_plain_text_output(pdf_path, sort=False) -> List[str]:
     pages = _get_pages(pdf_path)
-    text = ""
+    text = []
     for page in pages:
-        text += merge_text(page, sort=sort).strip() + "\n"
+        text.append(merge_text(page, sort=sort).strip())
     return text
 
 

diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -1,6 +1,6 @@
 from itertools import chain
 
-from pdftext.pdf.utils import LINE_BREAKS
+from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
 
 
 def update_current(current, new_char):
@@ -25,11 +25,21 @@ def create_training_row(char_info, prev_char, currblock, avg_x_gap, avg_y_gap):
     char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2
     x_gap = char_info["bbox"][0] - prev_char["bbox"][2]
     y_gap = char_info["bbox"][1] - prev_char["bbox"][3]
+    font_match = all(
+        [char_info["font"][key] == prev_char["font"][key] for key in ["name", "size", "weight", "flags"]] +
+        [char_info["rotation"] == prev_char["rotation"]]
+    )
+    is_space = any([
+        char in SPACES,
+        char in TABS,
+    ])
 
     training_row = {
         "is_newline": char in LINE_BREAKS,
+        "is_space": is_space,
         "x_gap": x_gap,
         "y_gap": y_gap,
+        "font_match": font_match,
         "x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0],
         "y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1],
         "x_gap_ratio": x_gap / avg_x_gap if avg_x_gap > 0 else 0,