From 95c06c82ea45ae8685b55a1e3904af45e76ee853 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Wed, 29 Jan 2025 14:17:27 -0500
Subject: [PATCH] Update overall benchmark

---
 benchmarks/overall.py                    | 132 -----------------------
 benchmarks/overall/inference.py          |  47 ++++++++
 benchmarks/overall/overall.py            |  88 +++++++++++++++
 benchmarks/overall/scoring.py            |  30 ++++++
 benchmarks/scoring.py                    |  36 -------
 benchmarks/table/table.py                |  22 ++--
 marker/builders/llm_layout.py            |   6 +-
 marker/processors/equation.py            |   9 +-
 marker/processors/llm/__init__.py        |   6 +-
 marker/processors/llm/llm_table_merge.py |   6 +-
 poetry.lock                              | 115 +++++++++++---------
 pyproject.toml                           |   2 +-
 12 files changed, 260 insertions(+), 239 deletions(-)
 delete mode 100644 benchmarks/overall.py
 create mode 100644 benchmarks/overall/inference.py
 create mode 100644 benchmarks/overall/overall.py
 create mode 100644 benchmarks/overall/scoring.py
 delete mode 100644 benchmarks/scoring.py

diff --git a/benchmarks/overall.py b/benchmarks/overall.py
deleted file mode 100644
index f6fb9591..00000000
--- a/benchmarks/overall.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import tempfile
-import time
-from collections import defaultdict
-
-import click
-from tqdm import tqdm
-import pypdfium2 as pdfium
-
-from marker.config.parser import ConfigParser
-from marker.converters.pdf import PdfConverter
-from marker.logger import configure_logging
-from marker.models import create_model_dict
-from pdftext.extraction import plain_text_output
-import json
-import os
-import subprocess
-import shutil
-from tabulate import tabulate
-
-from marker.settings import settings
-from scoring import score_text
-
-configure_logging()
-
-
-def nougat_prediction(pdf_filename, batch_size=1):
-    out_dir = tempfile.mkdtemp()
-    subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
-    md_file = os.listdir(out_dir)[0]
-    with open(os.path.join(out_dir, md_file), "r") as f:
-        data = f.read()
-    shutil.rmtree(out_dir)
-    return data
-
-@click.command(help="Benchmark PDF to MD conversion.")
-@click.argument("in_folder", type=str)
-@click.argument("reference_folder", type=str)
-@click.argument("out_file", type=str)
-@click.option("--nougat", is_flag=True, help="Run nougat and compare")
-@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
-def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str):
-    methods = ["marker"]
-    if nougat:
-        methods.append("nougat")
-
-    model_dict = create_model_dict()
-
-    scores = defaultdict(dict)
-    benchmark_files = os.listdir(in_folder)
-    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
-    times = defaultdict(dict)
-    pages = defaultdict(int)
-
-    for idx, fname in tqdm(enumerate(benchmark_files)):
-        md_filename = fname.rsplit(".", 1)[0] + ".md"
-
-        reference_filename = os.path.join(reference_folder, md_filename)
-        with open(reference_filename, "r") as f:
-            reference = f.read()
-
-        pdf_filename = os.path.join(in_folder, fname)
-        doc = pdfium.PdfDocument(pdf_filename)
-        pages[fname] = len(doc)
-
-        config_parser = ConfigParser({"output_format": "markdown"})
-        for method in methods:
-            start = time.time()
-            if method == "marker":
-                converter = PdfConverter(
-                    config=config_parser.generate_config_dict(),
-                    artifact_dict=model_dict,
-                    processor_list=None,
-                    renderer=config_parser.get_renderer()
-                )
-                full_text = converter(pdf_filename).markdown
-            elif method == "nougat":
-                full_text = nougat_prediction(pdf_filename, batch_size=1)
-            elif method == "naive":
-                full_text = plain_text_output(doc, workers=1)
-            else:
-                raise ValueError(f"Unknown method {method}")
-
-            times[method][fname] = time.time() - start
-
-            score = score_text(full_text, reference)
-            scores[method][fname] = score
-
-            if md_out_path:
-                md_out_filename = f"{method}_{md_filename}"
-                with open(os.path.join(md_out_path, md_out_filename), "w+") as f:
-                    f.write(full_text)
-
-    total_pages = sum(pages.values())
-    with open(out_file, "w+") as f:
-        write_data = defaultdict(dict)
-        for method in methods:
-            total_time = sum(times[method].values())
-            file_stats = {
-                fname:
-                {
-                    "time": times[method][fname],
-                    "score": scores[method][fname],
-                    "pages": pages[fname]
-                }
-
-                for fname in benchmark_files
-            }
-            write_data[method] = {
-                "files": file_stats,
-                "avg_score": sum(scores[method].values()) / len(scores[method]),
-                "time_per_page": total_time / total_pages,
-                "time_per_doc": total_time / len(scores[method])
-            }
-
-        json.dump(write_data, f, indent=4)
-
-    summary_table = []
-    score_table = []
-    score_headers = benchmark_files
-    for method in methods:
-        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
-        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])
-
-    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
-    print("")
-    print("Scores by file")
-    print(tabulate(score_table, headers=["Method", *score_headers]))
-
-
-if __name__ == "__main__":
-    main()
-
diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py
new file mode 100644
index 00000000..040a53af
--- /dev/null
+++ b/benchmarks/overall/inference.py
@@ -0,0 +1,47 @@
+import io
+
+import fitz as pymupdf
+import tempfile
+from bs4 import BeautifulSoup
+
+from marker.converters.pdf import PdfConverter
+
+def open_pymupdf(pdf_bytes):
+    """Open a PDF held in memory as bytes and return the resulting PyMuPDF document."""
+    return pymupdf.open(stream=io.BytesIO(pdf_bytes))
+
+def clip_pdf_to_bbox(doc, bbox, padding=1):
+    """Redact everything outside bbox (grown by padding) on the first page, crop the page to bbox, return doc."""
+    page = doc[0]
+    height, width = page.bound().height, page.bound().width
+    outside_strips = (
+        (0, 0, bbox[0] - padding, height),  # left of the block
+        (0, 0, width, bbox[1] - padding),  # above it
+        (bbox[2] + padding, 0, width, height),  # right of it
+        (0, bbox[3] + padding, width, height),  # below it
+    )
+    for strip in outside_strips:
+        page.add_redact_annot(pymupdf.Rect(*strip))
+    page.apply_redactions()
+    page.set_cropbox(pymupdf.Rect(*bbox))
+    return doc
+
+def get_marker_block_html(marker_models: dict, gt_blocks: list, pdf_bytes: bytes):
+    """Convert each ground-truth block's clipped page region with marker; return one inner-HTML string per block."""
+    block_html = []
+    for block in gt_blocks:
+        doc2 = clip_pdf_to_bbox(open_pymupdf(pdf_bytes), block["bbox"])
+        block_converter = PdfConverter(
+            artifact_dict=marker_models,
+            config={"page_range": [0], "force_layout_block": block["block_type"], "disable_tqdm": True},
+            renderer="marker.renderers.html.HTMLRenderer"
+        )
+        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
+            doc2.save(f)
+            f.flush()  # push buffered bytes to disk before the converter reopens the file by name
+            rendered = block_converter(f.name)
+        doc2.close()  # release the per-block PyMuPDF document instead of leaking one per block
+        soup = BeautifulSoup(rendered.html, "html.parser")
+        # Keep only the markup found inside <body>
+        block_html.append(str(soup.find("body").decode_contents()))
+    return block_html
\ No newline at end of file
diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py
new file mode 100644
index 00000000..57e80286
--- /dev/null
+++ b/benchmarks/overall/overall.py
@@ -0,0 +1,88 @@
+import json
+import os
+from collections import defaultdict
+from pathlib import Path
+
+import click
+import datasets
+import tabulate
+from tqdm import tqdm
+
+from marker.logger import configure_logging
+from marker.models import create_model_dict
+from inference import get_marker_block_html
+from marker.settings import settings
+from scoring import score_blocks
+
+configure_logging()
+
+@click.command(help="Benchmark PDF to MD conversion.")
+@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
+@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against.  Possible values:", default="")
+@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
+@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
+def main(
+        dataset: str,
+        other_methods: str,
+        result_path: str,
+        max_rows: int
+):
+    """Score marker block-by-block on the benchmark dataset, print average tables, and save per-sample scores as JSON."""
+    allowed_methods = [""]
+    methods = other_methods.split(",")
+    for method in methods:
+        if method not in allowed_methods:
+            raise ValueError(f"Method {method} not allowed.  Allowed methods are {allowed_methods}")
+
+    model_dict = create_model_dict()
+    ds = datasets.load_dataset(dataset, split="train")
+    bench_scores = {}
+    averages_by_type = defaultdict(list)
+    averages_by_block_type = defaultdict(list)
+    for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"):
+        gt_blocks = json.loads(sample["gt_blocks"])
+        doc_type = sample["classification"]
+        pdf_bytes = sample["pdf"] # This is a single page PDF
+        marker_html = get_marker_block_html(model_dict, gt_blocks, pdf_bytes)
+        gt_html = [block["html"] for block in gt_blocks]
+        scores = score_blocks(gt_html, marker_html)
+        gt_weights = [len(ht) for ht in gt_html]
+        overall_score = sum(s * w for s, w in zip(scores, gt_weights)) / sum(gt_weights)
+        bench_scores[idx] = {
+            "scores": scores,
+            "weights": gt_weights,
+            "overall_score": overall_score # Weighted score, weighted by length of GT block
+        }
+
+        averages_by_type[doc_type].append(overall_score)
+
+        for score, gt_block in zip(scores, gt_blocks):
+            averages_by_block_type[gt_block["block_type"]].append(score)
+
+        if max_rows is not None and idx + 1 >= max_rows:  # idx is 0-based; stop after exactly max_rows samples
+            break
+    # Collapse each list of scores to its mean; sorting fixes the row order of the printed tables
+    for k in averages_by_type:
+        averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k])
+    averages_by_type = sorted(averages_by_type.items())
+
+    print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github"))
+
+    for k in averages_by_block_type:
+        averages_by_block_type[k] = sum(averages_by_block_type[k]) / len(averages_by_block_type[k])
+    averages_by_block_type = sorted(averages_by_block_type.items())
+
+    print(tabulate.tabulate(averages_by_block_type, headers=["Block Type", "Average Score"], tablefmt="github"))
+
+    overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
+    print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github"))
+
+    out_path = Path(result_path) / "overall.json"
+    out_path.parent.mkdir(parents=True, exist_ok=True)  # the results directory may not exist yet
+    with open(out_path, "w") as f:
+        json.dump(bench_scores, f, indent=2)
+    print(f"Results saved to {out_path}.")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/benchmarks/overall/scoring.py b/benchmarks/overall/scoring.py
new file mode 100644
index 00000000..3ae19a98
--- /dev/null
+++ b/benchmarks/overall/scoring.py
@@ -0,0 +1,30 @@
+import re
+from bs4 import BeautifulSoup
+
+from markdownify import markdownify as md
+from rapidfuzz import fuzz
+
+def standardize_html(html):
+    """Convert HTML to whitespace-normalized markdown text so two renderings can be fuzzily compared."""
+    soup = BeautifulSoup(html, "html.parser")
+    # Header levels are flattened to h1 so a difference in level alone does not lower the score
+    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
+        heading.name = "h1"
+
+    text = md(str(soup)).replace("<br>", "\n")
+    text = re.sub(r"\s+", " ", text)  # collapses every whitespace run, newlines included
+    # NOTE(review): the \s+ pass above leaves no newlines, so this substitution cannot match
+    text = re.sub(r"\n+", "\n", text)
+    # Runs of periods (e.g. table-of-contents leaders) become a single period
+    text = re.sub("\\.+", ".", text)
+    return text.strip()
+
+
+def score_blocks(gt_html, method_html):
+    """Return one rapidfuzz ratio per ground-truth/prediction pair, each side standardized first."""
+    # zip pairs the lists positionally, so surplus entries in the longer list are not scored
+    pairs = zip(gt_html, method_html)
+    return [
+        fuzz.ratio(standardize_html(gt), standardize_html(method))
+        for gt, method in pairs
+    ]
\ No newline at end of file
diff --git a/benchmarks/scoring.py b/benchmarks/scoring.py
deleted file mode 100644
index 5aa9faff..00000000
--- a/benchmarks/scoring.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from rapidfuzz import fuzz
-from statistics import mean
-
-CHUNK_MIN_CHARS = 25
-
-def chunk_text(text, chunk_len=500):
-    chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
-    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
-    return chunks
-
-
-def overlap_score(hypothesis_chunks, reference_chunks):
-    length_modifier = len(hypothesis_chunks) / len(reference_chunks)
-    search_distance = max(len(reference_chunks) // 5, 10)
-    chunk_scores = []
-    for i, hyp_chunk in enumerate(hypothesis_chunks):
-        max_score = 0
-        total_len = 0
-        i_offset = int(i * length_modifier)
-        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
-        for j in chunk_range:
-            ref_chunk = reference_chunks[j]
-            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
-            if score > max_score:
-                max_score = score
-                total_len = len(ref_chunk)
-        chunk_scores.append(max_score)
-    return chunk_scores
-
-
-def score_text(hypothesis, reference):
-    # Returns a 0-1 alignment score
-    hypothesis_chunks = chunk_text(hypothesis)
-    reference_chunks = chunk_text(reference)
-    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
-    return mean(chunk_scores)
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index 3116274d..dfeb5eb0 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -1,10 +1,10 @@
 import os
-from itertools import repeat
-from tkinter import Image
-
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Transformers uses .isin for an op, which is not supported on MPS
 
+from pathlib import Path
+from itertools import repeat
 from typing import List
+
 import numpy as np
 import base64
 import time
@@ -20,6 +20,7 @@
 import pypdfium2 as pdfium
 from marker.util import matrix_intersection_area
 from marker.renderers.json import JSONOutput, JSONBlockOutput
+from marker.settings import settings
 
 from marker.config.parser import ConfigParser
 from marker.converters.table import TableConverter
@@ -47,7 +48,7 @@ def extract_tables(children: List[JSONBlockOutput]):
 
 
 @click.command(help="Benchmark Table to HTML Conversion")
-@click.argument("out_file", type=str)
+@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
 @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
 @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
@@ -55,7 +56,7 @@ def extract_tables(children: List[JSONBlockOutput]):
 @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
 @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
 def main(
-        out_file: str,
+        result_path: str,
         dataset: str,
         max_rows: int,
         max_workers: int,
@@ -64,7 +65,7 @@ def main(
         use_gemini: bool = False
 ):
     models = create_model_dict()
-    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
+    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
     start = time.time()
 
 
@@ -93,9 +94,7 @@ def main(
             with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
                 temp_pdf_file.write(pdf_binary)
                 temp_pdf_file.seek(0)
-                tqdm.disable = True
                 marker_json = converter(temp_pdf_file.name).children
-                tqdm.disable = False
 
                 doc = pdfium.PdfDocument(temp_pdf_file.name)
                 page_image = doc[0].render(scale=92/72).to_pil()
@@ -223,8 +222,11 @@ def main(
         "gemini": gemini_results
     }
 
-    with open(out_file, "w+") as f:
+    out_path = Path(result_path) / "table.json"
+    with open(out_path, "w+") as f:
         json.dump(results, f, indent=2)
 
+    print(f"Results saved to {out_path}.")
+
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/marker/builders/llm_layout.py b/marker/builders/llm_layout.py
index b061ea48..c9aae671 100644
--- a/marker/builders/llm_layout.py
+++ b/marker/builders/llm_layout.py
@@ -50,6 +50,10 @@ class LLMLayoutBuilder(LayoutBuilder):
         int,
         "The timeout for requests to the Gemini model.",
     ] = 60
+    disable_tqdm: Annotated[
+        bool,
+        "Whether to disable the tqdm progress bar.",
+    ] = False
     topk_relabelling_prompt: Annotated[
         str,
         "The prompt to use for relabelling blocks.",
@@ -107,7 +111,7 @@ def __call__(self, document: Document, provider: PdfProvider):
             print(f"Error relabelling blocks: {e}")
 
     def relabel_blocks(self, document: Document):
-        pbar = tqdm(desc="LLM layout relabelling")
+        pbar = tqdm(desc="LLM layout relabelling", disable=self.disable_tqdm)
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             futures = []
             for page in document.pages:
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 5f5be17c..6bd79fa7 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -1,7 +1,4 @@
 from typing import Annotated, List, Optional, Tuple
-
-from texify.inference import batch_inference
-from texify.model.model import GenerateVisionEncoderDecoderModel
 from tqdm import tqdm
 
 from marker.models import TexifyPredictor
@@ -32,6 +29,10 @@ class EquationProcessor(BaseProcessor):
         int,
         "The number of tokens to buffer above max for the Texify model.",
     ] = 256
+    disable_tqdm: Annotated[
+        bool,
+        "Whether to disable the tqdm progress bar.",
+    ] = False
 
     def __init__(self, texify_model: TexifyPredictor, config=None):
         super().__init__(config)
@@ -80,7 +81,7 @@ def get_latex_batched(self, equation_data: List[dict]):
         predictions = [""] * len(equation_data)
         batch_size = self.get_batch_size()
 
-        for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations"):
+        for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations", disable=self.disable_tqdm):
             # Dynamically set max length to save inference time
             min_idx = i
             max_idx = min(min_idx + batch_size, len(equation_data))
diff --git a/marker/processors/llm/__init__.py b/marker/processors/llm/__init__.py
index c41853ac..5f36139e 100644
--- a/marker/processors/llm/__init__.py
+++ b/marker/processors/llm/__init__.py
@@ -44,6 +44,10 @@ class BaseLLMProcessor(BaseProcessor):
         bool,
         "Whether to use the LLM model.",
     ] = False
+    disable_tqdm: Annotated[
+        bool,
+        "Whether to disable the tqdm progress bar.",
+    ] = False
     block_types = None
 
     def __init__(self, config=None):
@@ -73,7 +77,7 @@ def rewrite_blocks(self, document: Document):
         if total_blocks == 0:
             return
 
-        pbar = tqdm(desc=f"{self.__class__.__name__} running")
+        pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             for future in as_completed([
                 executor.submit(self.process_rewriting, document, page, block)
diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py
index e2012998..a3f74396 100644
--- a/marker/processors/llm/llm_table_merge.py
+++ b/marker/processors/llm/llm_table_merge.py
@@ -44,6 +44,10 @@ class LLMTableMergeProcessor(BaseLLMProcessor):
         int,
         "The maximum gap between columns to merge tables"
     ] = 50
+    disable_tqdm: Annotated[
+        bool,
+        "Whether to disable the tqdm progress bar.",
+    ] = False
     table_merge_prompt: Annotated[
         str,
         "The prompt to use for rewriting text.",
@@ -137,7 +141,7 @@ def get_column_count(cells: List[TableCell]):
         return max_cols
 
     def rewrite_blocks(self, document: Document):
-        pbar = tqdm(desc=f"{self.__class__.__name__} running")
+        pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
         table_runs = []
         table_run = []
         prev_block = None
diff --git a/poetry.lock b/poetry.lock
index 652f9c68..c053780c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2729,6 +2729,18 @@ files = [
 [package.dependencies]
 nvidia-nvjitlink-cu12 = "*"
 
+[[package]]
+name = "nvidia-cusparselt-cu12"
+version = "0.6.2"
+description = "NVIDIA cuSPARSELt"
+optional = false
+python-versions = "*"
+files = [
+    {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"},
+    {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"},
+    {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"},
+]
+
 [[package]]
 name = "nvidia-nccl-cu12"
 version = "2.21.5"
@@ -3566,6 +3578,23 @@ files = [
 [package.extras]
 windows-terminal = ["colorama (>=0.4.6)"]
 
+[[package]]
+name = "pymupdf"
+version = "1.25.2"
+description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pymupdf-1.25.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:59dea22b633cc4fc13670b4c5db50d71f8cd4f420814420f33ce47ddcb61e1f6"},
+    {file = "pymupdf-1.25.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:e8b8a874497cd0deee89a6a4fb76a3a08173c8d39e88fc7cf715764ec5a243e9"},
+    {file = "pymupdf-1.25.2-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f61e5cdb25b86eb28d34aa3557b49ecf9e361d5f5cd3b1660406f8f0bf813af7"},
+    {file = "pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae8cfa7a97d78f813d286ecba32369059d88073edd1e5cf105f4cd0811f71925"},
+    {file = "pymupdf-1.25.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:295505fe1ecb7c7b57d4124d373e207ea311d8e40bc7ac3016d8ec2d60b091e9"},
+    {file = "pymupdf-1.25.2-cp39-abi3-win32.whl", hash = "sha256:b9488c8b82bb9be36fb13ee0c8d43b0ddcc50af83b61da01e6040413d9e67da6"},
+    {file = "pymupdf-1.25.2-cp39-abi3-win_amd64.whl", hash = "sha256:1b4ca6f5780d319a08dff885a5a0e3585c5d7af04dcfa063c535b88371fd91c1"},
+    {file = "pymupdf-1.25.2.tar.gz", hash = "sha256:9ea88ff1b3ccb359620f106a6fd5ba6877d959d21d78272052c3496ceede6eec"},
+]
+
 [[package]]
 name = "pyparsing"
 version = "3.2.1"
@@ -4729,27 +4758,6 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
 test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"]
 typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"]
 
-[[package]]
-name = "texify"
-version = "0.2.1"
-description = "OCR for latex images"
-optional = false
-python-versions = "<4.0,>=3.10"
-files = [
-    {file = "texify-0.2.1-py3-none-any.whl", hash = "sha256:861c90ea6167fb6c2b334d5fcf0116dd9e1585af359463dec83115891c09dcfa"},
-    {file = "texify-0.2.1.tar.gz", hash = "sha256:bab30f8445aa60e36de122fb86deb77b3f25348a885d4d5f3c67d6b6f5bb2e81"},
-]
-
-[package.dependencies]
-ftfy = ">=6.1.3,<7.0.0"
-Pillow = ">=10.1.0,<11.0.0"
-pydantic = ">=2.5.2,<3.0.0"
-pydantic-settings = ">=2.1.0,<3.0.0"
-pypdfium2 = ">=4.25.0,<5.0.0"
-python-dotenv = ">=1.0.0,<2.0.0"
-torch = ">=2.1.2,<3.0.0"
-transformers = ">=4.36.2,<5.0.0"
-
 [[package]]
 name = "threadpoolctl"
 version = "3.5.0"
@@ -4865,28 +4873,31 @@ files = [
 
 [[package]]
 name = "torch"
-version = "2.5.1"
+version = "2.6.0"
 description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
 optional = false
-python-versions = ">=3.8.0"
+python-versions = ">=3.9.0"
 files = [
-    {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"},
-    {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"},
-    {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"},
-    {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"},
-    {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"},
-    {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"},
-    {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"},
-    {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"},
-    {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"},
-    {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"},
-    {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"},
-    {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"},
-    {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"},
-    {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"},
-    {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"},
-    {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"},
-    {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"},
+    {file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"},
+    {file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"},
+    {file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"},
+    {file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"},
+    {file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"},
+    {file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"},
+    {file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"},
+    {file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"},
+    {file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"},
+    {file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"},
+    {file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"},
+    {file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"},
+    {file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"},
+    {file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"},
+    {file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"},
+    {file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"},
+    {file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"},
+    {file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"},
+    {file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"},
+    {file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"},
 ]
 
 [package.dependencies]
@@ -4903,17 +4914,18 @@ nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux
 nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 setuptools = {version = "*", markers = "python_version >= \"3.12\""}
 sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""}
-triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""}
-typing-extensions = ">=4.8.0"
+triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+typing-extensions = ">=4.10.0"
 
 [package.extras]
 opt-einsum = ["opt-einsum (>=3.3)"]
-optree = ["optree (>=0.12.0)"]
+optree = ["optree (>=0.13.0)"]
 
 [[package]]
 name = "tornado"
@@ -5042,21 +5054,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
 
 [[package]]
 name = "triton"
-version = "3.1.0"
+version = "3.2.0"
 description = "A language and compiler for custom Deep Learning operations"
 optional = false
 python-versions = "*"
 files = [
-    {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"},
-    {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"},
-    {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"},
-    {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"},
-    {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"},
+    {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"},
+    {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"},
+    {file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"},
+    {file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"},
+    {file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"},
 ]
 
-[package.dependencies]
-filelock = "*"
-
 [package.extras]
 build = ["cmake (>=3.20)", "lit"]
 tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"]
@@ -5489,4 +5498,4 @@ propcache = ">=0.2.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "d43373ff00de4feb00b0aed4fe98d2a84ecb5742d1a916cabbace5104f888d54"
+content-hash = "9d330f12a8bad0352ec550e1d6a77348b10f6bca7ecc41769813bec85d3f9e08"
diff --git a/pyproject.toml b/pyproject.toml
index 08d8c72e..0e787c60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,6 @@ python-dotenv = "^1.0.0"
 torch = "^2.5.1"
 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
-texify = "^0.2.1"
 rapidfuzz = "^3.8.1"
 surya-ocr = "~0.10.0"
 regex = "^2024.4.28"
@@ -50,6 +49,7 @@ apted = "1.0.3"
 distance = "0.1.3"
 lxml = "5.3.0"
 tabulate = "^0.9.0"
+pymupdf = "^1.25.2"
 
 [tool.poetry.scripts]
 marker = "marker.scripts.convert:convert_cli"