From dc483f96198526762ddd084cc0fcb5cfbab72d3b Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 29 Apr 2024 15:53:04 -0700 Subject: [PATCH] Fix benchmark --- benchmark.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmark.py b/benchmark.py index fc70b92..d641439 100644 --- a/benchmark.py +++ b/benchmark.py @@ -13,6 +13,7 @@ from rapidfuzz import fuzz import tabulate from tqdm import tqdm +import pypdfium2 as pdfium from pdftext.extraction import paginated_plain_text_output from pdftext.model import get_model @@ -31,7 +32,7 @@ def pymupdf_inference(pdf_path): for span in line["spans"]: text += span["text"] text = text.rstrip() + "\n" - text = text.rstrip() + "\n\n" + text = text.rstrip() + "\n" pages.append(text) return pages @@ -49,6 +50,11 @@ def pdfplumber_inference(pdf_path): return pages +def pdftext_inference(pdf_path, model): + pdf = pdfium.PdfDocument(pdf_path) + return paginated_plain_text_output(pdf, model=model) + + def compare_docs(doc1: str, doc2: str): return fuzz.ratio(doc1, doc2) @@ -70,7 +76,7 @@ def main(): times_tools = ["pymupdf", "pdftext", "pdfplumber"] alignment_tools = ["pdftext", "pdfplumber"] if args.pdftext_only: - times_tools = ["pdftext", "pymupdf"] + times_tools = ["pymupdf", "pdftext"] alignment_tools = ["pdftext"] model = get_model() for i in tqdm(range(len(dataset)), desc="Benchmarking"): @@ -82,8 +88,8 @@ def main(): f.seek(0) pdf_path = f.name - pdftext_inference = partial(paginated_plain_text_output, model=model) - inference_funcs = [pymupdf_inference, pdftext_inference, pdfplumber_inference] + pdftext_inference_model = partial(pdftext_inference, model=model) + inference_funcs = [pymupdf_inference, pdftext_inference_model, pdfplumber_inference] for tool, inference_func in zip(times_tools, inference_funcs): start = time.time() pages = inference_func(pdf_path)