From 95c06c82ea45ae8685b55a1e3904af45e76ee853 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 29 Jan 2025 14:17:27 -0500 Subject: [PATCH] Update overall benchmark --- benchmarks/overall.py | 132 ----------------------- benchmarks/overall/inference.py | 47 ++++++++ benchmarks/overall/overall.py | 88 +++++++++++++++ benchmarks/overall/scoring.py | 30 ++++++ benchmarks/scoring.py | 36 ------- benchmarks/table/table.py | 22 ++-- marker/builders/llm_layout.py | 6 +- marker/processors/equation.py | 9 +- marker/processors/llm/__init__.py | 6 +- marker/processors/llm/llm_table_merge.py | 6 +- poetry.lock | 115 +++++++++++--------- pyproject.toml | 2 +- 12 files changed, 260 insertions(+), 239 deletions(-) delete mode 100644 benchmarks/overall.py create mode 100644 benchmarks/overall/inference.py create mode 100644 benchmarks/overall/overall.py create mode 100644 benchmarks/overall/scoring.py delete mode 100644 benchmarks/scoring.py diff --git a/benchmarks/overall.py b/benchmarks/overall.py deleted file mode 100644 index f6fb9591..00000000 --- a/benchmarks/overall.py +++ /dev/null @@ -1,132 +0,0 @@ -import tempfile -import time -from collections import defaultdict - -import click -from tqdm import tqdm -import pypdfium2 as pdfium - -from marker.config.parser import ConfigParser -from marker.converters.pdf import PdfConverter -from marker.logger import configure_logging -from marker.models import create_model_dict -from pdftext.extraction import plain_text_output -import json -import os -import subprocess -import shutil -from tabulate import tabulate - -from marker.settings import settings -from scoring import score_text - -configure_logging() - - -def nougat_prediction(pdf_filename, batch_size=1): - out_dir = tempfile.mkdtemp() - subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True) - md_file = os.listdir(out_dir)[0] - with open(os.path.join(out_dir, md_file), "r") as f: - data = f.read() - shutil.rmtree(out_dir) - return data - -@click.command(help="Benchmark PDF to MD conversion.") -@click.argument("in_folder", type=str) -@click.argument("reference_folder", type=str) -@click.argument("out_file", type=str) -@click.option("--nougat", is_flag=True, help="Run nougat and compare") -@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files") -def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str): - methods = ["marker"] - if nougat: - methods.append("nougat") - - model_dict = create_model_dict() - - scores = defaultdict(dict) - benchmark_files = os.listdir(in_folder) - benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")] - times = defaultdict(dict) - pages = defaultdict(int) - - for idx, fname in tqdm(enumerate(benchmark_files)): - md_filename = fname.rsplit(".", 1)[0] + ".md" - - reference_filename = os.path.join(reference_folder, md_filename) - with open(reference_filename, "r") as f: - reference = f.read() - - pdf_filename = os.path.join(in_folder, fname) - doc = pdfium.PdfDocument(pdf_filename) - pages[fname] = len(doc) - - config_parser = ConfigParser({"output_format": "markdown"}) - for method in methods: - start = time.time() - if method == "marker": - converter = PdfConverter( - config=config_parser.generate_config_dict(), - artifact_dict=model_dict, - processor_list=None, - renderer=config_parser.get_renderer() - ) - full_text = converter(pdf_filename).markdown - elif method == "nougat": - full_text = nougat_prediction(pdf_filename, batch_size=1) - elif method == "naive": - full_text = plain_text_output(doc, workers=1) - else: - raise ValueError(f"Unknown method {method}") - - times[method][fname] = time.time() - start - - score = score_text(full_text, reference) - scores[method][fname] = score - - if md_out_path: - md_out_filename = f"{method}_{md_filename}" - with open(os.path.join(md_out_path, md_out_filename), "w+") as f: - f.write(full_text) - - total_pages = sum(pages.values()) - with open(out_file, "w+") as f: - write_data = defaultdict(dict) - for method in methods: - total_time = sum(times[method].values()) - file_stats = { - fname: - { - "time": times[method][fname], - "score": scores[method][fname], - "pages": pages[fname] - } - - for fname in benchmark_files - } - write_data[method] = { - "files": file_stats, - "avg_score": sum(scores[method].values()) / len(scores[method]), - "time_per_page": total_time / total_pages, - "time_per_doc": total_time / len(scores[method]) - } - - json.dump(write_data, f, indent=4) - - summary_table = [] - score_table = [] - score_headers = benchmark_files - for method in methods: - summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]]) - score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]]) - - print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"])) - print("") - print("Scores by file") - print(tabulate(score_table, headers=["Method", *score_headers])) - - -if __name__ == "__main__": - main() - diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py new file mode 100644 index 00000000..040a53af --- /dev/null +++ b/benchmarks/overall/inference.py @@ -0,0 +1,47 @@ +import io + +import fitz as pymupdf +import tempfile +from bs4 import BeautifulSoup + +from marker.converters.pdf import PdfConverter + +def open_pymupdf(pdf_bytes): + stream = io.BytesIO(pdf_bytes) + return pymupdf.open(stream=stream) + +def clip_pdf_to_bbox(doc, bbox, padding=1): + page = doc[0] + height, width = page.bound().height, page.bound().width + remove_left = [0, 0, bbox[0] - padding, height] + remove_top = [0, 0, width, bbox[1] - padding] + remove_right = [bbox[2] + padding, 0, width, height] + remove_bottom = [0, bbox[3] + padding, width, height] + for remove in [remove_left, remove_top, remove_right, remove_bottom]: + clip_rect = pymupdf.Rect(*remove) + page.add_redact_annot(clip_rect) + page.apply_redactions() + + clip_rect = pymupdf.Rect(*bbox) + page.set_cropbox(clip_rect) + return doc + +def get_marker_block_html(marker_models: dict, gt_blocks: list, pdf_bytes: bytes): + block_html = [] + for block in gt_blocks: + bbox = block["bbox"] + doc2 = open_pymupdf(pdf_bytes) + clip_pdf_to_bbox(doc2, bbox) + block_converter = PdfConverter( + artifact_dict=marker_models, + config={"page_range": [0], "force_layout_block": block["block_type"], "disable_tqdm": True}, + renderer="marker.renderers.html.HTMLRenderer" + ) + with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: + doc2.save(f) + rendered = block_converter(f.name) + html = rendered.html + soup = BeautifulSoup(html, "html.parser") + inner_html = str(soup.find("body").decode_contents()) + block_html.append(inner_html) + return block_html \ No newline at end of file diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py new file mode 100644 index 00000000..57e80286 --- /dev/null +++ b/benchmarks/overall/overall.py @@ -0,0 +1,88 @@ +import json +import os +from collections import defaultdict +from pathlib import Path + +import click +import datasets +import tabulate +from tqdm import tqdm + +from marker.logger import configure_logging +from marker.models import create_model_dict +from inference import get_marker_block_html +from marker.settings import settings +from scoring import score_blocks + +configure_logging() + +@click.command(help="Benchmark PDF to MD conversion.") +@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark") +@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values:", default="") +@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.") +@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.") +def main( + dataset: str, + other_methods: str, + result_path: str, + max_rows: int +): + allowed_methods = [""] + methods = other_methods.split(",") + for method in methods: + if method not in allowed_methods: + raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}") + + model_dict = create_model_dict() + ds = datasets.load_dataset(dataset, split="train") + + bench_scores = {} + averages_by_type = defaultdict(list) + averages_by_block_type = defaultdict(list) + for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"): + gt_blocks = json.loads(sample["gt_blocks"]) + doc_type = sample["classification"] + pdf_bytes = sample["pdf"] # This is a single page PDF + marker_html = get_marker_block_html(model_dict, gt_blocks, pdf_bytes) + gt_html = [block["html"] for block in gt_blocks] + scores = score_blocks(gt_html, marker_html) + gt_weights = [len(ht) for ht in gt_html] + overall_score = sum([s * w for s, w in zip(scores, gt_weights)]) / sum(gt_weights) + bench_scores[idx] = { + "scores": scores, + "weights": gt_weights, + "overall_score": overall_score # Weighted score, weighted by length of GT block + } + + averages_by_type[doc_type].append(overall_score) + + for score, gt_block in zip(scores, gt_blocks): + averages_by_block_type[gt_block["block_type"]].append(score) + + if max_rows is not None and idx >= max_rows: + break + + for k in averages_by_type: + averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k]) + averages_by_type = sorted(averages_by_type.items()) + + print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github")) + + for k in averages_by_block_type: + averages_by_block_type[k] = sum(averages_by_block_type[k]) / len(averages_by_block_type[k]) + averages_by_block_type = sorted(averages_by_block_type.items()) + + print(tabulate.tabulate(averages_by_block_type, headers=["Block Type", "Average Score"], tablefmt="github")) + + overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores) + print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github")) + + out_path = Path(result_path) / "overall.json" + with open(out_path, "w") as f: + json.dump(bench_scores, f, indent=2) + + print(f"Results saved to {out_path}.") + +if __name__ == "__main__": + main() + diff --git a/benchmarks/overall/scoring.py b/benchmarks/overall/scoring.py new file mode 100644 index 00000000..3ae19a98 --- /dev/null +++ b/benchmarks/overall/scoring.py @@ -0,0 +1,30 @@ +import re +from bs4 import BeautifulSoup + +from markdownify import markdownify as md +from rapidfuzz import fuzz + +def standardize_html(html): + soup = BeautifulSoup(html, "html.parser") + + # Convert all headers to h1 so we don't penalize small differences in header levels + for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): + tag.name = "h1" + + html = str(soup) + markdown = md(html) + markdown = markdown.replace("
", "\n") + markdown = re.sub(r"\s+", " ", markdown) + markdown = re.sub(r"\n+", "\n", markdown) + markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents + return markdown.strip() + + +def score_blocks(gt_html, method_html): + scores = [] + for gt, method in zip(gt_html, method_html): + gt= standardize_html(gt) + method = standardize_html(method) + score = fuzz.ratio(gt, method) + scores.append(score) + return scores \ No newline at end of file diff --git a/benchmarks/scoring.py b/benchmarks/scoring.py deleted file mode 100644 index 5aa9faff..00000000 --- a/benchmarks/scoring.py +++ /dev/null @@ -1,36 +0,0 @@ -from rapidfuzz import fuzz -from statistics import mean - -CHUNK_MIN_CHARS = 25 - -def chunk_text(text, chunk_len=500): - chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)] - chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS] - return chunks - - -def overlap_score(hypothesis_chunks, reference_chunks): - length_modifier = len(hypothesis_chunks) / len(reference_chunks) - search_distance = max(len(reference_chunks) // 5, 10) - chunk_scores = [] - for i, hyp_chunk in enumerate(hypothesis_chunks): - max_score = 0 - total_len = 0 - i_offset = int(i * length_modifier) - chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance)) - for j in chunk_range: - ref_chunk = reference_chunks[j] - score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100 - if score > max_score: - max_score = score - total_len = len(ref_chunk) - chunk_scores.append(max_score) - return chunk_scores - - -def score_text(hypothesis, reference): - # Returns a 0-1 alignment score - hypothesis_chunks = chunk_text(hypothesis) - reference_chunks = chunk_text(reference) - chunk_scores = overlap_score(hypothesis_chunks, reference_chunks) - return mean(chunk_scores) diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py index 3116274d..dfeb5eb0 100644 --- a/benchmarks/table/table.py +++ b/benchmarks/table/table.py @@ -1,10 +1,10 @@ import os -from itertools import repeat -from tkinter import Image - -os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS +from pathlib import Path +from itertools import repeat from typing import List + import numpy as np import base64 import time @@ -20,6 +20,7 @@ import pypdfium2 as pdfium from marker.util import matrix_intersection_area from marker.renderers.json import JSONOutput, JSONBlockOutput +from marker.settings import settings from marker.config.parser import ConfigParser from marker.converters.table import TableConverter @@ -47,7 +48,7 @@ def extract_tables(children: List[JSONBlockOutput]): @click.command(help="Benchmark Table to HTML Conversion") -@click.argument("out_file", type=str) +@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.") @click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use") @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process") @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use") @@ -55,7 +56,7 @@ def extract_tables(children: List[JSONBlockOutput]): @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.") @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.") def main( - out_file: str, + result_path: str, dataset: str, max_rows: int, max_workers: int, @@ -64,7 +65,7 @@ def main( use_gemini: bool = False ): models = create_model_dict() - config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size}) + config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True}) start = time.time() @@ -93,9 +94,7 @@ def main( with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: temp_pdf_file.write(pdf_binary) temp_pdf_file.seek(0) - tqdm.disable = True marker_json = converter(temp_pdf_file.name).children - tqdm.disable = False doc = pdfium.PdfDocument(temp_pdf_file.name) page_image = doc[0].render(scale=92/72).to_pil() @@ -223,8 +222,11 @@ def main( "gemini": gemini_results } - with open(out_file, "w+") as f: + out_path = Path(result_path) / "table.json" + with open(out_path, "w+") as f: json.dump(results, f, indent=2) + print(f"Results saved to {out_path}.") + if __name__ == '__main__': main() \ No newline at end of file diff --git a/marker/builders/llm_layout.py b/marker/builders/llm_layout.py index b061ea48..c9aae671 100644 --- a/marker/builders/llm_layout.py +++ b/marker/builders/llm_layout.py @@ -50,6 +50,10 @@ class LLMLayoutBuilder(LayoutBuilder): int, "The timeout for requests to the Gemini model.", ] = 60 + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False topk_relabelling_prompt: Annotated[ str, "The prompt to use for relabelling blocks.", @@ -107,7 +111,7 @@ def __call__(self, document: Document, provider: PdfProvider): print(f"Error relabelling blocks: {e}") def relabel_blocks(self, document: Document): - pbar = tqdm(desc="LLM layout relabelling") + pbar = tqdm(desc="LLM layout relabelling", disable=self.disable_tqdm) with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: futures = [] for page in document.pages: diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 5f5be17c..6bd79fa7 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -1,7 +1,4 @@ from typing import Annotated, List, Optional, Tuple - -from texify.inference import batch_inference -from texify.model.model import GenerateVisionEncoderDecoderModel from tqdm import tqdm from marker.models import TexifyPredictor @@ -32,6 +29,10 @@ class EquationProcessor(BaseProcessor): int, "The number of tokens to buffer above max for the Texify model.", ] = 256 + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False def __init__(self, texify_model: TexifyPredictor, config=None): super().__init__(config) @@ -80,7 +81,7 @@ def get_latex_batched(self, equation_data: List[dict]): predictions = [""] * len(equation_data) batch_size = self.get_batch_size() - for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations"): + for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations", disable=self.disable_tqdm): # Dynamically set max length to save inference time min_idx = i max_idx = min(min_idx + batch_size, len(equation_data)) diff --git a/marker/processors/llm/__init__.py b/marker/processors/llm/__init__.py index c41853ac..5f36139e 100644 --- a/marker/processors/llm/__init__.py +++ b/marker/processors/llm/__init__.py @@ -44,6 +44,10 @@ class BaseLLMProcessor(BaseProcessor): bool, "Whether to use the LLM model.", ] = False + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False block_types = None def __init__(self, config=None): @@ -73,7 +77,7 @@ def rewrite_blocks(self, document: Document): if total_blocks == 0: return - pbar = tqdm(desc=f"{self.__class__.__name__} running") + pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm) with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: for future in as_completed([ executor.submit(self.process_rewriting, document, page, block) diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py index e2012998..a3f74396 100644 --- a/marker/processors/llm/llm_table_merge.py +++ b/marker/processors/llm/llm_table_merge.py @@ -44,6 +44,10 @@ class LLMTableMergeProcessor(BaseLLMProcessor): int, "The maximum gap between columns to merge tables" ] = 50 + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False table_merge_prompt: Annotated[ str, "The prompt to use for rewriting text.", @@ -137,7 +141,7 @@ def get_column_count(cells: List[TableCell]): return max_cols def rewrite_blocks(self, document: Document): - pbar = tqdm(desc=f"{self.__class__.__name__} running") + pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm) table_runs = [] table_run = [] prev_block = None diff --git a/poetry.lock b/poetry.lock index 652f9c68..c053780c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2729,6 +2729,18 @@ files = [ [package.dependencies] nvidia-nvjitlink-cu12 = "*" +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.6.2" +description = "NVIDIA cuSPARSELt" +optional = false +python-versions = "*" +files = [ + {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"}, + {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"}, + {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"}, +] + [[package]] name = "nvidia-nccl-cu12" version = "2.21.5" @@ -3566,6 +3578,23 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pymupdf" +version = "1.25.2" +description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." +optional = false +python-versions = ">=3.9" +files = [ + {file = "pymupdf-1.25.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:59dea22b633cc4fc13670b4c5db50d71f8cd4f420814420f33ce47ddcb61e1f6"}, + {file = "pymupdf-1.25.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:e8b8a874497cd0deee89a6a4fb76a3a08173c8d39e88fc7cf715764ec5a243e9"}, + {file = "pymupdf-1.25.2-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f61e5cdb25b86eb28d34aa3557b49ecf9e361d5f5cd3b1660406f8f0bf813af7"}, + {file = "pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae8cfa7a97d78f813d286ecba32369059d88073edd1e5cf105f4cd0811f71925"}, + {file = "pymupdf-1.25.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:295505fe1ecb7c7b57d4124d373e207ea311d8e40bc7ac3016d8ec2d60b091e9"}, + {file = "pymupdf-1.25.2-cp39-abi3-win32.whl", hash = "sha256:b9488c8b82bb9be36fb13ee0c8d43b0ddcc50af83b61da01e6040413d9e67da6"}, + {file = "pymupdf-1.25.2-cp39-abi3-win_amd64.whl", hash = "sha256:1b4ca6f5780d319a08dff885a5a0e3585c5d7af04dcfa063c535b88371fd91c1"}, + {file = "pymupdf-1.25.2.tar.gz", hash = "sha256:9ea88ff1b3ccb359620f106a6fd5ba6877d959d21d78272052c3496ceede6eec"}, +] + [[package]] name = "pyparsing" version = "3.2.1" @@ -4729,27 +4758,6 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] -[[package]] -name = "texify" -version = "0.2.1" -description = "OCR for latex images" -optional = false -python-versions = "<4.0,>=3.10" -files = [ - {file = "texify-0.2.1-py3-none-any.whl", hash = "sha256:861c90ea6167fb6c2b334d5fcf0116dd9e1585af359463dec83115891c09dcfa"}, - {file = "texify-0.2.1.tar.gz", hash = "sha256:bab30f8445aa60e36de122fb86deb77b3f25348a885d4d5f3c67d6b6f5bb2e81"}, -] - -[package.dependencies] -ftfy = ">=6.1.3,<7.0.0" -Pillow = ">=10.1.0,<11.0.0" -pydantic = ">=2.5.2,<3.0.0" -pydantic-settings = ">=2.1.0,<3.0.0" -pypdfium2 = ">=4.25.0,<5.0.0" -python-dotenv = ">=1.0.0,<2.0.0" -torch = ">=2.1.2,<3.0.0" -transformers = ">=4.36.2,<5.0.0" - [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4865,28 +4873,31 @@ files = [ [[package]] name = "torch" -version = "2.5.1" +version = "2.6.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.9.0" files = [ - {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"}, - {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"}, - {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"}, - {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"}, - {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"}, - {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"}, - {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"}, - {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"}, - {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"}, - {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"}, - {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"}, - {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"}, - {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"}, - {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"}, - {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"}, - {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"}, - {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"}, + {file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"}, + {file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"}, + {file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"}, + {file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"}, + {file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"}, + {file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"}, + {file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"}, + {file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"}, + {file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"}, + {file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"}, + {file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"}, + {file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"}, + {file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"}, + {file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"}, + {file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"}, + {file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"}, + {file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"}, + {file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"}, + {file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"}, + {file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"}, ] [package.dependencies] @@ -4903,17 +4914,18 @@ nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} setuptools = {version = "*", markers = "python_version >= \"3.12\""} sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""} -triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} -typing-extensions = ">=4.8.0" +triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +typing-extensions = ">=4.10.0" [package.extras] opt-einsum = ["opt-einsum (>=3.3)"] -optree = ["optree (>=0.12.0)"] +optree = ["optree (>=0.13.0)"] [[package]] name = "tornado" @@ -5042,21 +5054,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" -version = "3.1.0" +version = "3.2.0" description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" files = [ - {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"}, - {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"}, - {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"}, - {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"}, - {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"}, + {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"}, + {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"}, + {file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"}, + {file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"}, + {file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"}, ] -[package.dependencies] -filelock = "*" - [package.extras] build = ["cmake (>=3.20)", "lit"] tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] @@ -5489,4 +5498,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d43373ff00de4feb00b0aed4fe98d2a84ecb5742d1a916cabbace5104f888d54" +content-hash = "9d330f12a8bad0352ec550e1d6a77348b10f6bca7ecc41769813bec85d3f9e08" diff --git a/pyproject.toml b/pyproject.toml index 08d8c72e..0e787c60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ python-dotenv = "^1.0.0" torch = "^2.5.1" tqdm = "^4.66.1" ftfy = "^6.1.1" -texify = "^0.2.1" rapidfuzz = "^3.8.1" surya-ocr = "~0.10.0" regex = "^2024.4.28" @@ -50,6 +49,7 @@ apted = "1.0.3" distance = "0.1.3" lxml = "5.3.0" tabulate = "^0.9.0" +pymupdf = "^1.25.2" [tool.poetry.scripts] marker = "marker.scripts.convert:convert_cli"