diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index 5d49aa1c..00000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Integration test with benchmark - -on: [push] - -env: - TORCH_DEVICE: "cpu" - -jobs: - benchmark: - runs-on: [ubuntu-latest, windows-latest] - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.11 - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - name: Install python dependencies - run: | - pip install poetry - poetry install - poetry remove torch - poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu - - name: Download benchmark data - run: | - wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" - unzip -o benchmark_data.zip - - name: Run benchmark test - run: | - poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json - poetry run python benchmarks/verify_scores.py report.json --type marker - - - diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..5b76ff15 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,32 @@ +name: Integration test + +on: [push] + +env: + PYTHONIOENCODING: "utf-8" + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install apt dependencies + run: | + sudo apt-get update + sudo apt-get install -y pandoc + - name: Install python dependencies + run: | + pip install poetry + poetry install + - name: Run benchmark test + run: | + poetry run python benchmarks/overall/overall.py --max_rows 5 + poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker + - name: Run table benchmark + run: | + poetry run python benchmarks/table/table.py --max_rows 5 + poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index af4e92e8..84137df5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,10 +2,6 @@ name: CI tests on: [push] -env: - TORCH_DEVICE: "cpu" - OCR_ENGINE: "surya" - jobs: tests: runs-on: ubuntu-latest diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml index 217e4221..06230580 100644 --- a/.github/workflows/scripts.yml +++ b/.github/workflows/scripts.yml @@ -2,10 +2,6 @@ name: Test CLI scripts on: [push] -env: - TORCH_DEVICE: "cpu" - OCR_ENGINE: "surya" - jobs: tests: runs-on: ubuntu-latest diff --git a/README.md b/README.md index 0a3382a8..f1a52aa9 100644 --- a/README.md +++ b/README.md @@ -10,17 +10,25 @@ Marker converts PDFs and images to markdown, JSON, and HTML quickly and accurate - Optionally boost accuracy with an LLM - Works on GPU, CPU, or MPS -## How it works +## Performance -Marker is a pipeline of deep learning models: + -- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya)) -- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya)) -- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya)) -- Optionally use an LLM to improve quality -- Combine blocks and postprocess complete text +Marker benchmarks favorably 
compared to cloud services like Llamaparse and Mathpix, as well as other open source tools. -It only uses models where necessary, which improves speed and accuracy. +The above results are running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 122 pages/second on an H100 (.18 seconds per page across 22 processes). + +See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. + +## Hybrid Mode + +For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, format tables properly, and extract values from forms. It uses `gemini-flash-2.0`, which is cheap and fast. + +Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm: + + + +As you can see, the use_llm mode offers higher accuracy than marker or gemini alone. ## Examples @@ -30,14 +38,6 @@ It only uses models where necessary, which improves speed and accuracy. | [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) | | [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) | -## Performance - -![Benchmark overall](data/images/overall.png) - -The above results are with marker setup so it takes ~7GB of VRAM on an A10. - -See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. - # Commercial usage I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage. @@ -56,17 +56,6 @@ There's a hosted API for marker available [here](https://www.datalab.to/): [Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development. -# Limitations - -PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: - -- Marker will only convert block equations -- Tables are not always formatted 100% correctly -- Forms are not converted optimally -- Very complex layouts, with nested tables and forms, may not work - -Note: Passing the `--use_llm` flag will mostly solve these issues. - # Installation You'll need python 3.10+ and PyTorch. You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine. See [here](https://pytorch.org/get-started/locally/) for more details. @@ -82,7 +71,7 @@ pip install marker-pdf First, some configuration: - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`. -- Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag on the CLI or via configuration to ensure your PDF runs through OCR, or the `strip_existing_ocr` to keep all digital text, and only strip out any existing OCR text. +- Some PDFs, even digital ones, have bad text in them. 
Set the `force_ocr` flag to ensure your PDF runs through OCR, or the `strip_existing_ocr` flag to keep all digital text and strip out any existing OCR text. ## Interactive App @@ -219,11 +208,11 @@ rendered = converter("FILEPATH") text, _, images = text_from_rendered(rendered) ``` -This takes all the same configuration as the PdfConverter. You can specify the configuration `--force_layout_block=Table` to avoid layout detection and instead assume every page is a table. +This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes. You can also run this via the CLI with ```shell -python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter +marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json ``` # Output Formats @@ -377,36 +366,55 @@ There are some settings that you may find useful if things aren't working the wa Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information. # Benchmarks + ## Overall PDF Conversion -Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct. -**Speed** +We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM-as-a-judge scoring method. + +| Method | Avg Time | Heuristic Score | LLM Score | +|------------|----------|-----------------|-----------| +| marker | 2.83837 | 95.6709 | 4.23916 | +| llamaparse | 23.348 | 84.2442 | 3.97619 | +| mathpix | 6.36223 | 86.4281 | 4.15626 | +| docling | 3.69949 | 86.7073 | 3.70429 | -| Method | Average Score | Time per page | Time per document | -|---------|----------------|---------------|------------------| -| marker | 0.625115 | 0.234184 | 21.545 | +Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services.
We can also look at it by document type: -**Accuracy** + -| Method | thinkpython.pdf | switch_trans.pdf | thinkdsp.pdf | crowd.pdf | thinkos.pdf | multicolcnn.pdf | -|---------|----------------|-----------------|--------------|------------|-------------|----------------| -| marker | 0.720347 | 0.592002 | 0.70468 | 0.515082 | 0.701394 | 0.517184 | +| Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM | +|----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------| +| Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 | +| Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 | +| Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 | +| Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 | +| Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 | +| Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 | +| Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 | +| Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 | +| Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 | +| Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 | +| Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 | -Peak GPU memory usage during the benchmark is `6GB` for marker. Benchmarks were run on an A10. +## Throughput -**Throughput** +We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf). -Marker takes about 6GB of VRAM on average per task, so you can convert 8 documents in parallel on an A6000. +| Method | Time per page | Time per document | VRAM used | +|---------|---------------|-------------------|---------- | +| marker | 0.18 | 43.42 | 3.17GB | -![Benchmark results](data/images/per_doc.png) +The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used. ## Table Conversion + Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores: -| Avg score | Total tables | use_llm | -|-----------|--------------|---------| -| 0.822 | 54 | False | -| 0.887 | 54 | True | +| Method | Avg score | Total tables | +|------------------|-----------|--------------| +| marker | 0.816 | 99 | +| marker w/use_llm | 0.907 | 99 | +| gemini | 0.829 | 99 | The `--use_llm` flag can significantly improve table recognition performance, as you can see. @@ -426,16 +434,49 @@ poetry install Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. 
Then run the overall benchmark like this: ```shell -python benchmarks/overall.py data/pdfs data/references report.json +python benchmarks/overall.py --methods marker --scores heuristic,llm ``` +Options: + +- `--use_llm` use an llm to improve the marker results. +- `--max_rows` how many rows to process for the benchmark. +- `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated. +- `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated. + ### Table Conversion The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with: ```shell -python benchmarks/table/table.py table_report.json --max_rows 1000 +python benchmarks/table/table.py --max_rows 100 ``` +Options: + +- `--use_llm` uses an llm with marker to improve accuracy. +- `--use_gemini` also benchmarks gemini 2.0 flash. + +# How it works + +Marker is a pipeline of deep learning models: + +- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya)) +- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya)) +- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya)) +- Optionally use an LLM to improve quality +- Combine blocks and postprocess complete text + +It only uses models where necessary, which improves speed and accuracy. + +# Limitations + +PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: + +- Marker will only convert block equations +- Very complex layouts, with nested tables and forms, may not work + +Note: Passing the `--use_llm` flag will mostly solve these issues. + # Thanks This work would not have been possible without amazing open source models and datasets, including (but not limited to): @@ -445,4 +486,4 @@ This work would not have been possible without amazing open source models and da - Pypdfium2/pdfium - DocLayNet from IBM -Thank you to the authors of these models and datasets for making them available to the community! +Thank you to the authors of these models and datasets for making them available to the community! 
\ No newline at end of file diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/overall.py b/benchmarks/overall.py deleted file mode 100644 index f6fb9591..00000000 --- a/benchmarks/overall.py +++ /dev/null @@ -1,132 +0,0 @@ -import tempfile -import time -from collections import defaultdict - -import click -from tqdm import tqdm -import pypdfium2 as pdfium - -from marker.config.parser import ConfigParser -from marker.converters.pdf import PdfConverter -from marker.logger import configure_logging -from marker.models import create_model_dict -from pdftext.extraction import plain_text_output -import json -import os -import subprocess -import shutil -from tabulate import tabulate - -from marker.settings import settings -from scoring import score_text - -configure_logging() - - -def nougat_prediction(pdf_filename, batch_size=1): - out_dir = tempfile.mkdtemp() - subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True) - md_file = os.listdir(out_dir)[0] - with open(os.path.join(out_dir, md_file), "r") as f: - data = f.read() - shutil.rmtree(out_dir) - return data - -@click.command(help="Benchmark PDF to MD conversion.") -@click.argument("in_folder", type=str) -@click.argument("reference_folder", type=str) -@click.argument("out_file", type=str) -@click.option("--nougat", is_flag=True, help="Run nougat and compare") -@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files") -def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str): - methods = ["marker"] - if nougat: - methods.append("nougat") - - model_dict = create_model_dict() - - scores = defaultdict(dict) - benchmark_files = os.listdir(in_folder) - benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")] - times = defaultdict(dict) - pages = defaultdict(int) - - for idx, fname in tqdm(enumerate(benchmark_files)): - md_filename = fname.rsplit(".", 1)[0] + ".md" - - reference_filename = os.path.join(reference_folder, md_filename) - with open(reference_filename, "r") as f: - reference = f.read() - - pdf_filename = os.path.join(in_folder, fname) - doc = pdfium.PdfDocument(pdf_filename) - pages[fname] = len(doc) - - config_parser = ConfigParser({"output_format": "markdown"}) - for method in methods: - start = time.time() - if method == "marker": - converter = PdfConverter( - config=config_parser.generate_config_dict(), - artifact_dict=model_dict, - processor_list=None, - renderer=config_parser.get_renderer() - ) - full_text = converter(pdf_filename).markdown - elif method == "nougat": - full_text = nougat_prediction(pdf_filename, batch_size=1) - elif method == "naive": - full_text = plain_text_output(doc, workers=1) - else: - raise ValueError(f"Unknown method {method}") - - times[method][fname] = time.time() - start - - score = score_text(full_text, reference) - scores[method][fname] = score - - if md_out_path: - md_out_filename = f"{method}_{md_filename}" - with open(os.path.join(md_out_path, md_out_filename), "w+") as f: - f.write(full_text) - - total_pages = sum(pages.values()) - with open(out_file, "w+") as f: - write_data = defaultdict(dict) - for method in methods: - total_time = sum(times[method].values()) - file_stats = { - fname: - { - "time": times[method][fname], - "score": scores[method][fname], - "pages": pages[fname] - } - - for fname in benchmark_files - } - write_data[method] = { - 
"files": file_stats, - "avg_score": sum(scores[method].values()) / len(scores[method]), - "time_per_page": total_time / total_pages, - "time_per_doc": total_time / len(scores[method]) - } - - json.dump(write_data, f, indent=4) - - summary_table = [] - score_table = [] - score_headers = benchmark_files - for method in methods: - summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]]) - score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]]) - - print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"])) - print("") - print("Scores by file") - print(tabulate(score_table, headers=["Method", *score_headers])) - - -if __name__ == "__main__": - main() - diff --git a/benchmarks/overall/__init__.py b/benchmarks/overall/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/overall/display/__init__.py b/benchmarks/overall/display/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/overall/display/dataset.py b/benchmarks/overall/display/dataset.py new file mode 100644 index 00000000..e9fcabdd --- /dev/null +++ b/benchmarks/overall/display/dataset.py @@ -0,0 +1,48 @@ +import json +from typing import List + +import datasets +from tqdm import tqdm + +from benchmarks.overall.registry import METHOD_REGISTRY +from benchmarks.overall.schema import FullResult + + +def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset: + rows = [] + for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"): + if idx not in result["markdown"]: + continue + + if max_rows is not None and idx >= max_rows: + break + + row = { + "uuid": sample["uuid"], + "classification": sample["classification"], + "language": sample["language"], + "img": sample["img"], + } + for method in result["markdown"][idx]: + if method == "gt": + continue + + method_cls = METHOD_REGISTRY[method]() + md = result["markdown"][idx][method] + method_img = method_cls.render(result["markdown"][idx][method]) + row[f"{method}_md"] = md + row[f"{method}_img"] = method_img + + for score_type in score_types: + try: + row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"] + except KeyError: + row[f"{method}_{score_type}"] = -1.0 # Missing score + try: + row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"]) + except KeyError: + row[f"{method}_{score_type}_detail"] = "" # Missing detail + rows.append(row) + ds = datasets.Dataset.from_list(rows) + return ds + diff --git a/benchmarks/overall/display/table.py b/benchmarks/overall/display/table.py new file mode 100644 index 00000000..5d704214 --- /dev/null +++ b/benchmarks/overall/display/table.py @@ -0,0 +1,68 @@ +from pathlib import Path +from typing import Dict, List + +import tabulate + +from benchmarks.overall.schema import FullResult + +def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str): + table = tabulate.tabulate(rows, headers=headers, tablefmt="github") + with open(out_path / filename, "w", encoding="utf-8") as f: + f.write(f"# {title}\n") + f.write(table) + print(title) + print(table) + + +def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"): + document_types = 
list(result["averages_by_type"][default_method][default_score_type].keys()) + headers = ["Document Type"] + for method in methods: + for score_type in score_types: + headers.append(f"{method} {score_type}") + + document_rows = [[k] for k in document_types] + for i, doc_type in enumerate(document_types): + for method in methods: + for score_type in score_types: + avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type])) + document_rows[i].append(avg_score) + + write_table("Document Types", document_rows, headers, out_path, "document_types.md") + + headers = ["Block Type"] + block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks + block_score_types = list(result["averages_by_block_type"][default_method].keys()) + for method in methods: + for score_type in block_score_types: + headers.append(f"{method} {score_type}") + + block_rows = [[k] for k in block_types] + for i, block_type in enumerate(block_types): + for method in methods: + for score_type in block_score_types: + avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type])) + block_rows[i].append(avg_score) + + write_table("Block types", block_rows, headers, out_path, "block_types.md") + + headers = ["Method", "Avg Time"] + score_types + inference_rows = [[k] for k in methods] + all_raw_scores = [result["scores"][i] for i in result["scores"]] + for i, method in enumerate(methods): + avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method])) + inference_rows[i].append(avg_time) + for score_type in score_types: + scores_lst = [] + for ar in all_raw_scores: + try: + # Sometimes a few llm scores are missing + scores_lst.append(ar[method][score_type]["score"]) + except KeyError: + continue + avg_score = sum(scores_lst) / max(1, len(scores_lst)) + inference_rows[i].append(avg_score) + + write_table("Overall Results", inference_rows, headers, out_path, "overall.md") + + print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. 
The scores are 0-100 based on edit distance.") \ No newline at end of file diff --git a/benchmarks/overall/download/__init__.py b/benchmarks/overall/download/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/overall/download/base.py b/benchmarks/overall/download/base.py new file mode 100644 index 00000000..cc3f3557 --- /dev/null +++ b/benchmarks/overall/download/base.py @@ -0,0 +1,60 @@ +import json +from json import JSONDecodeError +from pathlib import Path + +import datasets +from tqdm import tqdm + + +class Downloader: + cache_path: Path = Path("cache") + service: str + + def __init__(self, api_key, app_id, max_rows: int = 2200): + self.cache_path.mkdir(exist_ok=True) + self.max_rows = max_rows + self.api_key = api_key + self.app_id = app_id + self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train") + + def get_html(self, pdf_bytes): + raise NotImplementedError + + def upload_ds(self): + rows = [] + for file in self.cache_path.glob("*.json"): + with open(file, "r") as f: + data = json.load(f) + rows.append(data) + + out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({ + "md": datasets.Value("string"), + "uuid": datasets.Value("string"), + "time": datasets.Value("float"), + })) + out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}") + + def generate_data(self): + max_rows = 2200 + for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"): + cache_file = self.cache_path / f"{idx}.json" + if cache_file.exists(): + continue + + pdf_bytes = sample["pdf"] # This is a single page PDF + try: + out_data = self.get_html(pdf_bytes) + except JSONDecodeError as e: + print(f"Error with sample {idx}: {e}") + continue + out_data["uuid"] = sample["uuid"] + + with cache_file.open("w") as f: + json.dump(out_data, f) + + if idx >= max_rows: + break + + def __call__(self): + self.generate_data() + self.upload_ds() diff --git a/benchmarks/overall/download/llamaparse.py b/benchmarks/overall/download/llamaparse.py new file mode 100644 index 00000000..a6b65867 --- /dev/null +++ b/benchmarks/overall/download/llamaparse.py @@ -0,0 +1,64 @@ +import io +import os +import time + +import requests + +from benchmarks.overall.download.base import Downloader + + +class LlamaParseDownloader(Downloader): + service = "llamaparse" + + def get_html(self, pdf_bytes): + rand_name = str(time.time()) + ".pdf" + start = time.time() + buff = io.BytesIO(pdf_bytes) + md = upload_and_parse_file(self.api_key, rand_name, buff) + end = time.time() + if isinstance(md, bytes): + md = md.decode("utf-8") + + return { + "md": md, + "time": end - start, + } + + +def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1): + headers = { + "Authorization": f"Bearer {api_key}", + "Accept": "application/json" + } + + # Upload file + files = { + 'file': (fname, buff, 'application/pdf') + } + response = requests.post( + 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload', + headers=headers, + files=files + ) + response.raise_for_status() + job_id = response.json()['id'] + + # Poll for completion + for _ in range(max_retries): + status_response = requests.get( + f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}', + headers=headers + ) + status_response.raise_for_status() + if status_response.json()['status'] == 'SUCCESS': + # Get results + result_response = requests.get( + f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown', + headers=headers + ) + 
result_response.raise_for_status() + return result_response.json()['markdown'] + + time.sleep(delay) + + raise TimeoutError("Job did not complete within the maximum retry attempts") \ No newline at end of file diff --git a/benchmarks/overall/download/main.py b/benchmarks/overall/download/main.py new file mode 100644 index 00000000..01a31c37 --- /dev/null +++ b/benchmarks/overall/download/main.py @@ -0,0 +1,23 @@ +import click + +from benchmarks.overall.download.llamaparse import LlamaParseDownloader +from benchmarks.overall.download.mathpix import MathpixDownloader + + +@click.command("Download data from inference services") +@click.argument("service", type=click.Choice(["mathpix", "llamaparse"])) +@click.argument("--max_rows", type=int, default=2200) +@click.argument("--api_key", type=str, default=None) +@click.argument("--app_id", type=str, default=None) +def main(service: str, max_rows: int, api_key: str, app_id: str): + registry = { + "mathpix": MathpixDownloader, + "llamaparse": LlamaParseDownloader + } + downloader = registry[service](api_key, app_id, max_rows=max_rows) + + # Generate data and upload to hub + downloader() + +if __name__ == "__main__": + main() diff --git a/benchmarks/overall/download/mathpix.py b/benchmarks/overall/download/mathpix.py new file mode 100644 index 00000000..204424ac --- /dev/null +++ b/benchmarks/overall/download/mathpix.py @@ -0,0 +1,80 @@ +import json +import time + +import requests + +from benchmarks.overall.download.base import Downloader + + +class MathpixDownloader(Downloader): + service = "mathpix" + + def get_html(self, pdf_bytes): + headers = { + "app_id": self.app_id, + "app_key": self.api_key, + } + start = time.time() + pdf_id = mathpix_request(pdf_bytes, headers) + status = mathpix_status(pdf_id, headers) + if status in ["processing", "error"]: + md = "" + else: + md = mathpix_results(pdf_id, headers) + end = time.time() + if isinstance(md, bytes): + md = md.decode("utf-8") + + return { + "md": md, + "time": end - start + } + +def mathpix_request(buffer, headers): + response = requests.post("https://api.mathpix.com/v3/pdf", + headers=headers, + data={ + "options_json": json.dumps( + { + "conversion_formats": { + "md": True, + "html": True + } + } + ) + }, + files={ + "file": buffer + } + ) + data = response.json() + pdf_id = data["pdf_id"] + return pdf_id + +def mathpix_status(pdf_id, headers): + max_iters = 120 + i = 0 + status = "processing" + status2 = "processing" + while i < max_iters: + time.sleep(1) + response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}", + headers=headers + ) + status_resp = response.json() + if "conversion_status" not in status_resp: + continue + status = status_resp["conversion_status"]["md"]["status"] + status2 = status_resp["conversion_status"]["html"]["status"] + if status == "completed" and status2 == "completed": + break + elif status == "error" or status2 == "error": + break + out_status = "completed" if status == "completed" and status2 == "completed" else "error" + return out_status + +def mathpix_results(pdf_id, headers, ext="md"): + response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}", + headers=headers + ) + return response.content diff --git a/benchmarks/overall/elo.py b/benchmarks/overall/elo.py new file mode 100644 index 00000000..9eea3b55 --- /dev/null +++ b/benchmarks/overall/elo.py @@ -0,0 +1,225 @@ +import json +import random +import time +from dataclasses import dataclass +from typing import List, Dict, Tuple, Literal +from PIL import Image + +import 
click +import datasets +from google import genai +from google.genai.errors import APIError +from pydantic import BaseModel +from tqdm import tqdm + +from marker.settings import settings + +rating_prompt = """ +You're a document analysis expert who is comparing two different markdown samples to an image to see which one represents the content of the image better. The markdown will be called version A and version B. + +Here are some notes on the image and markdown: +- Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`. +- Tables will be formatted as Github flavored markdown. +- Block equations will be in LaTeX. +- The image and markdown may be in any language. +- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text. + +The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided. + +**Instructions** +Follow this process to evaluate the markdown: +1. Carefully examine the image. +2. Carefully examine the first markdown input provided. +3. Describe how well version a represents the image. +4. Carefully examine the second markdown input provided. +5. Describe how well version B represents the image. +6. Compare version A and version B. +7. Decide which markdown representation is better, based on the criteria below. Output version_a if version a is better, and version_b if version b is better. + +Use these criteria when judging the markdown: +- Overall - the overall quality of the markdown as compared to the image. +- Text quality - the quality of the text extraction from the image. +- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image. +- Tables - how effectively the tables have been extracted and formatted. +- Forms - how effectively the forms have extracted and formatted. +- Equations - how effectively block equations have been converted to LaTeX. +- Lists - if the lists have been properly extracted and formatted. +- Images - if images are identified and placed correctly. + +Notes on scoring: +- Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text. +- Bad markdown will have major missing text segments from the markdown or completely unreadable formatting. + +Output json, like in the example below. + +**Example** +Version A +```markdown +# *Section 1* +This is some *markdown* extracted from a document. Here is a block equation: +$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$ +``` +Version B +```markdown +# Section 1 +This is some markdown extracted from a document. Here is a block equation: +$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$ +``` +Output +```json +{ + "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.", + "version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.", + "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation. 
The formatting in version b is slightly different from the image.", + "comparison": "Version A is better than version B. The text and formatting in version A matches the image better than version B.", + "winner": "version_a", +} +``` +**Input** +Version A +```markdown +{{version_a}} +``` +Version B +```markdown +{{version_b}} +``` +**Output** +""" + +class ComparerSchema(BaseModel): + image_description: str + version_a_description: str + version_b_description: str + comparison: str + winner: Literal["version_a", "version_b"] + + +class Comparer: + def __init__(self): + pass + + def __call__( + self, + img: Image.Image, + version_a: str, + version_b: str + ) -> str | None: + hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b) + try: + rating = self.llm_rater(img, hydrated_prompt) + except Exception as e: + print(f"Error: {e}") + return + return rating + + + def llm_rater(self, img: Image.Image, prompt: str): + response = self.llm_response_wrapper( + [img, prompt], + ComparerSchema + ) + assert "winner" in response, f"Response missing 'winner' key: {response}" + return response["winner"] + + def llm_response_wrapper( + self, + prompt, + response_schema, + ): + client = genai.Client( + api_key=settings.GOOGLE_API_KEY, + http_options={"timeout": 60000} + ) + try: + responses = client.models.generate_content( + model="gemini-2.0-flash", + contents=prompt, + config={ + "temperature": 0, + "response_schema": response_schema, + "response_mime_type": "application/json", + }, + ) + output = responses.candidates[0].content.parts[0].text + return json.loads(output) + except APIError as e: + print(f"Hit Gemini rate limit") + return + except Exception as e: + print(f"Error: {e}") + return + +@dataclass +class Method: + name: str + rating: float = 1500 + k_factor: float = 32 + + +class EloSystem: + def __init__(self, player_names: List[str]): + self.methods = {name: Method(name) for name in player_names} + + def expected_score(self, rating_a: float, rating_b: float) -> float: + return 1 / (1 + 10 ** ((rating_b - rating_a) / 400)) + + def update_ratings(self, winner: str, loser: str) -> Tuple[float, float]: + method_a = self.methods[winner] + method_b = self.methods[loser] + + expected_a = self.expected_score(method_a.rating, method_b.rating) + expected_b = self.expected_score(method_b.rating, method_a.rating) + + # Winner gets score of 1, loser gets 0 + method_a.rating += method_a.k_factor * (1 - expected_a) + method_b.rating += method_b.k_factor * (0 - expected_b) + + return method_a.rating, method_b.rating + + +@click.command("Calculate ELO scores for document conversion methods") +@click.argument("dataset", type=str) +@click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix") +@click.option("--row_samples", type=int, default=2, help="Number of samples per row") +@click.option("--max_rows", type=int, default=100, help="Maximum number of rows to process") +def main( + dataset: str, + methods: str, + row_samples: int, + max_rows: int +): + ds = datasets.load_dataset(dataset, split="train") + method_lst = methods.split(",") + elo = EloSystem(method_lst) + comparer = Comparer() + + for i in tqdm(range(min(len(ds), max_rows)), desc="Calculating ELO"): + row = ds[i] + # Avoid any bias in ordering + random.shuffle(method_lst) + + for j, method_a in enumerate(method_lst[:-1]): + for z, method_b in enumerate(method_lst[j:]): + if method_a == method_b: + continue + + method_a_md = row[f"{method_a}_md"] + 
method_b_md = row[f"{method_b}_md"] + winner = comparer(row["img"], method_a_md, method_b_md) + if not winner: + continue + + if winner == "version_a": + elo.update_ratings(method_a, method_b) + else: + elo.update_ratings(method_b, method_a) + if i % 10 == 0: + print(elo.methods) + + # Print out ratings + print(elo.methods) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/overall/methods/__init__.py b/benchmarks/overall/methods/__init__.py new file mode 100644 index 00000000..a5a3f53f --- /dev/null +++ b/benchmarks/overall/methods/__init__.py @@ -0,0 +1,100 @@ +import io +import random +import re +from typing import Tuple + +import markdown2 +from PIL import Image +from playwright.sync_api import sync_playwright + +from benchmarks.overall.methods.schema import BenchmarkResult +from marker.renderers.markdown import MarkdownRenderer + + +class BaseMethod: + def __init__(self, **kwargs): + for kwarg in kwargs: + if hasattr(self, kwarg): + setattr(self, kwarg, kwargs[kwarg]) + + @staticmethod + def convert_to_md(html: str): + md = MarkdownRenderer() + markdown = md.md_cls.convert(html) + return markdown + + def __call__(self, sample) -> BenchmarkResult: + raise NotImplementedError() + + def render(self, markdown: str): + return self.html_to_image(self.convert_to_html(markdown)) + + @staticmethod + def convert_to_html(md: str): + block_placeholders = [] + inline_placeholders = [] + + # Add placeholders for the math + def block_sub(match): + content = match.group(1) + placeholder = f"1BLOCKMATH{len(block_placeholders)}1" + block_placeholders.append((placeholder, f"$${content}$$")) + return placeholder + + def inline_sub(match): + content = match.group(1) + placeholder = f"1INLINEMATH{len(inline_placeholders)}1" + inline_placeholders.append((placeholder, f"${content}$")) + return placeholder + + md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL) + md = re.sub(r'\$(.*?)\$', inline_sub, md) + + html = markdown2.markdown(md, extras=['tables']) + + # Replace placeholders + for placeholder, math_str in block_placeholders: + html = html.replace(placeholder, math_str) + for placeholder, math_str in inline_placeholders: + html = html.replace(placeholder, math_str) + + return html + + def html_to_image(self, html: str) -> Image.Image: + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + html_str = f""" + + + + + + + + + + + {html} + + + + """.strip() + page.set_viewport_size({"width": 1200, "height": 800}) + page.set_content(html_str) + page.wait_for_load_state("domcontentloaded") + page.wait_for_timeout(500) # Wait for KaTeX to render + screenshot_bytes = page.screenshot(full_page=True) + browser.close() + + return Image.open(io.BytesIO(screenshot_bytes)) \ No newline at end of file diff --git a/benchmarks/overall/methods/docling.py b/benchmarks/overall/methods/docling.py new file mode 100644 index 00000000..f36ee041 --- /dev/null +++ b/benchmarks/overall/methods/docling.py @@ -0,0 +1,26 @@ +import tempfile +import time + +from benchmarks.overall.methods import BaseMethod, BenchmarkResult + + +class DoclingMethod(BaseMethod): + model_dict: dict = None + use_llm: bool = False + + def __call__(self, sample) -> BenchmarkResult: + from docling.document_converter import DocumentConverter + pdf_bytes = sample["pdf"] # This is a single page PDF + converter = DocumentConverter() + + with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: + f.write(pdf_bytes) + start = time.time() + result = 
converter.convert(f.name) + total = time.time() - start + + return { + "markdown": result.document.export_to_markdown(), + "time": total + } + diff --git a/benchmarks/overall/methods/gt.py b/benchmarks/overall/methods/gt.py new file mode 100644 index 00000000..6c2c6c32 --- /dev/null +++ b/benchmarks/overall/methods/gt.py @@ -0,0 +1,29 @@ +from typing import List +import json + +from PIL import Image + +from benchmarks.overall.methods import BaseMethod, BenchmarkResult + + +class GTMethod(BaseMethod): + def __call__(self, sample) -> BenchmarkResult: + gt_blocks = json.loads(sample["gt_blocks"]) + gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0] + gt_markdown = [self.convert_to_md(block) for block in gt_html] + return { + "markdown": gt_markdown, + "time": 0 + } + + def render(self, html: List[str]) -> Image.Image: + joined = "\n\n".join(html) + html = f""" + + + +{joined} + + +""".strip() + return self.html_to_image(html) \ No newline at end of file diff --git a/benchmarks/overall/methods/llamaparse.py b/benchmarks/overall/methods/llamaparse.py new file mode 100644 index 00000000..e2b1e43a --- /dev/null +++ b/benchmarks/overall/methods/llamaparse.py @@ -0,0 +1,22 @@ +import datasets + +from benchmarks.overall.methods import BaseMethod, BenchmarkResult + + +class LlamaParseMethod(BaseMethod): + llamaparse_ds: datasets.Dataset = None + + def __call__(self, sample) -> BenchmarkResult: + uuid = sample["uuid"] + data = None + for row in self.llamaparse_ds: + if str(row["uuid"]) == str(uuid): + data = row + break + if not data: + raise ValueError(f"Could not find data for uuid {uuid}") + + return { + "markdown": data["md"], + "time": data["time"] + } \ No newline at end of file diff --git a/benchmarks/overall/methods/marker.py b/benchmarks/overall/methods/marker.py new file mode 100644 index 00000000..afaafcfc --- /dev/null +++ b/benchmarks/overall/methods/marker.py @@ -0,0 +1,29 @@ +import tempfile +import time + +from benchmarks.overall.methods import BaseMethod, BenchmarkResult +from marker.converters.pdf import PdfConverter + + +class MarkerMethod(BaseMethod): + model_dict: dict = None + use_llm: bool = False + + def __call__(self, sample) -> BenchmarkResult: + pdf_bytes = sample["pdf"] # This is a single page PDF + block_converter = PdfConverter( + artifact_dict=self.model_dict, + config={"page_range": [0], "disable_tqdm": True, "use_llm": self.use_llm} + ) + + with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: + f.write(pdf_bytes) + start = time.time() + rendered = block_converter(f.name) + total = time.time() - start + + return { + "markdown": rendered.markdown, + "time": total + } + diff --git a/benchmarks/overall/methods/mathpix.py b/benchmarks/overall/methods/mathpix.py new file mode 100644 index 00000000..d06340f7 --- /dev/null +++ b/benchmarks/overall/methods/mathpix.py @@ -0,0 +1,22 @@ +import datasets + +from benchmarks.overall.methods import BaseMethod, BenchmarkResult + + +class MathpixMethod(BaseMethod): + mathpix_ds: datasets.Dataset = None + + def __call__(self, sample) -> BenchmarkResult: + uuid = sample["uuid"] + data = None + for row in self.mathpix_ds: + if str(row["uuid"]) == str(uuid): + data = row + break + if not data: + raise ValueError(f"Could not find data for uuid {uuid}") + + return { + "markdown": data["md"], + "time": data["time"] + } \ No newline at end of file diff --git a/benchmarks/overall/methods/schema.py b/benchmarks/overall/methods/schema.py new file mode 100644 index 00000000..d475876e --- /dev/null +++ 
b/benchmarks/overall/methods/schema.py @@ -0,0 +1,6 @@ +from typing import TypedDict, List + + +class BenchmarkResult(TypedDict): + markdown: str | List[str] + time: float | None \ No newline at end of file diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py new file mode 100644 index 00000000..481753e3 --- /dev/null +++ b/benchmarks/overall/overall.py @@ -0,0 +1,148 @@ +import json +import os +from collections import defaultdict +from pathlib import Path +from typing import List + +import click +import datasets +from tqdm import tqdm + +from benchmarks.overall.display.dataset import build_dataset +from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY +from benchmarks.overall.schema import FullResult +from marker.logger import configure_logging +from marker.models import create_model_dict +from marker.settings import settings +from benchmarks.overall.display.table import print_scores + +configure_logging() + + +def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult: + bench_scores = {} + averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + average_times = defaultdict(list) + markdown_by_method = defaultdict(dict) + for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark"): + if max_rows is not None and idx >= max_rows: + break + + doc_type = sample["classification"] + gt_cls = METHOD_REGISTRY["gt"] + gt_blocks = json.loads(sample["gt_blocks"]) + gt_md = gt_cls(**artifacts)(sample)["markdown"] + markdown_by_method[idx]["gt"] = gt_md + + out_data = defaultdict(dict) + + try: + for method in methods: + method_cls = METHOD_REGISTRY[method](**artifacts) + method_info = method_cls(sample) + method_md = method_info["markdown"] + average_times[method].append(method_info["time"]) + markdown_by_method[idx][method] = method_md + + for score_type in score_types: + score_cls = SCORE_REGISTRY[score_type]() + try: + scores = score_cls(sample, gt_md, method_md) + except Exception as e: + # Some scorers can fail, like the LLM one + print(f"Failed to score {method} with {score_type}: {e}") + continue + + out_data[method][score_type] = scores + + averages_by_type[method][score_type][doc_type].append(scores["score"]) + + if "by_block" in scores["specific_scores"]: # Not all scorers support this + for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks): + averages_by_block_type[method][score_type][gt_block["block_type"]].append(score) + except Exception as e: + print(f"Failed to process {idx}: {e}") + if idx in markdown_by_method: + del markdown_by_method[idx] + continue + + bench_scores[idx] = out_data + + return { + "scores": bench_scores, + "markdown": markdown_by_method, + "averages_by_type": averages_by_type, + "averages_by_block_type": averages_by_block_type, + "average_times": average_times, + } + +@click.command(help="Benchmark PDF to MD conversion.") +@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark") +@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None) +@click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix,llamaparse,docling", default="marker") +@click.option("--scores", type=str, help="Comma separated list of scoring functions to use. 
Possible values: heuristic,llm", default="heuristic") +@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.") +@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.") +@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.") +def main( + dataset: str, + out_dataset: str, + methods: str, + scores: str, + result_path: str, + max_rows: int, + use_llm: bool +): + out_path = Path(result_path) + out_path.mkdir(parents=True, exist_ok=True) + + methods = methods.split(",") + for method in methods: + if method not in METHOD_REGISTRY: + raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}") + + # Ensure marker is always first + all_methods = list(set(methods)) + methods = ["marker"] if "marker" in all_methods else [] + methods += [m for m in all_methods if m != "marker"] + + score_types = scores.split(",") + for score_type in score_types: + if score_type not in SCORE_REGISTRY: + raise ValueError(f"Score type {score_type} not allowed. Allowed types are {SCORE_REGISTRY.keys()}") + + benchmark_dataset = datasets.load_dataset(dataset, split="train") + artifacts = { + "model_dict": create_model_dict(), + "use_llm": use_llm, + "mathpix_ds": None, + "llamaparse_ds": None, + } + + if "mathpix" in methods: + artifacts["mathpix_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train") + + if "llamaparse" in methods: + artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train") + + print(f"Running benchmark with methods: {methods} and scores: {score_types}") + result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows) + + # Display benchmark scoring tables + print_scores(result, out_path, methods, score_types, default_method=methods[0], default_score_type=score_types[0]) + + # Write to json + with open(out_path / "result.json", "w") as f: + json.dump(result, f) + + if out_dataset: + if use_llm: + out_dataset += "_llm" + dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows) + dataset.push_to_hub(out_dataset) + + +if __name__ == "__main__": + main() + diff --git a/benchmarks/overall/registry.py b/benchmarks/overall/registry.py new file mode 100644 index 00000000..02184ad3 --- /dev/null +++ b/benchmarks/overall/registry.py @@ -0,0 +1,20 @@ +from benchmarks.overall.methods.docling import DoclingMethod +from benchmarks.overall.methods.gt import GTMethod +from benchmarks.overall.methods.llamaparse import LlamaParseMethod +from benchmarks.overall.methods.marker import MarkerMethod +from benchmarks.overall.methods.mathpix import MathpixMethod +from benchmarks.overall.scorers.heuristic import HeuristicScorer +from benchmarks.overall.scorers.llm import LLMScorer + +SCORE_REGISTRY = { + "heuristic": HeuristicScorer, + "llm": LLMScorer +} + +METHOD_REGISTRY = { + "marker": MarkerMethod, + "gt": GTMethod, + "mathpix": MathpixMethod, + "llamaparse": LlamaParseMethod, + "docling": DoclingMethod +} \ No newline at end of file diff --git a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py new file mode 100644 index 00000000..56d99e3a --- /dev/null +++ b/benchmarks/overall/schema.py @@ -0,0 +1,12 @@ +from typing import TypedDict, List, Dict + +from benchmarks.overall.scorers.schema import BlockScores + +AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]] + +class 
FullResult(TypedDict): + scores: Dict[int, Dict[str, Dict[str, BlockScores]]] + averages_by_type: AVG_TYPE + averages_by_block_type: AVG_TYPE + average_times: Dict[str, List[float]] + markdown: Dict[int, Dict[str, str]] diff --git a/benchmarks/overall/scorers/__init__.py b/benchmarks/overall/scorers/__init__.py new file mode 100644 index 00000000..492bc4e4 --- /dev/null +++ b/benchmarks/overall/scorers/__init__.py @@ -0,0 +1,11 @@ +from typing import List + +from benchmarks.overall.scorers.schema import BlockScores + + +class BaseScorer: + def __init__(self): + pass + + def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: + raise NotImplementedError() \ No newline at end of file diff --git a/benchmarks/overall/scorers/clean.py b/benchmarks/overall/scorers/clean.py new file mode 100644 index 00000000..ed3a6cc2 --- /dev/null +++ b/benchmarks/overall/scorers/clean.py @@ -0,0 +1,113 @@ +import re +import subprocess +import tempfile +from pathlib import Path + +import latex2mathml.converter + +class MarkdownCleaner: + def __init__(self): + pass + + def __call__(self, markdown): + markdown = self.normalize_markdown(markdown) # Use pandoc to normalize + + # Replace math expressions with latexml + pattern = r'(?", "\n") + markdown = re.sub(r"(.*?)", r"\1", markdown) + markdown = re.sub(r"(.*?)", r"\1", markdown) + markdown = re.sub(r"(.*?)", r"\1", markdown) # Remove span tags and keep content + + # Clean up markdown formatting + markdown = re.sub(r"\s+", " ", markdown) + markdown = re.sub(r"\n+", "\n", markdown) + markdown = re.sub("\\.+", ".", + markdown) # Replace repeated periods with a single period, like in table of contents + markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header + markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly + return markdown.strip().lower() + + @staticmethod + def normalize_markdown(md_text: str) -> str: + with tempfile.TemporaryDirectory() as tmp_dir: + dirpath = Path(tmp_dir) + input_file = dirpath / 'input.md' + input_file.write_text(md_text, encoding='utf-8') + + # Markdown to HTML + html_file = dirpath / 'temp.html' + subprocess.run( + [ + 'pandoc', + str(input_file), + '-f', 'markdown+tex_math_dollars', + '-t', 'html', + '-o', str(html_file), + '--quiet' + ], + check=True + ) + + # HTML to Markdown + output_file = dirpath / 'output.md' + subprocess.run( + [ + 'pandoc', + str(html_file), + '-f', 'html', + '-t', 'markdown+tex_math_dollars', + '-o', str(output_file), + '--quiet' + ], + check=True + ) + + # Read back the normalized Markdown + normalized_md = output_file.read_text(encoding='utf-8') + + return normalized_md + + def standardize_math(self, match): + try: + delim = "$$" if match.group(0).startswith('$$') else "$" + math_content = match.group(1) or match.group(2) + if delim == "$$": + math_content = latex2mathml.converter.convert(math_content) + else: + math_content = self.clean_latex(math_content) + return f'{delim}{math_content}{delim}' + except Exception as e: + print(f"Failed to standardize math expression: {match.group(0)} with error: {e}") + return match.group(0) + + @staticmethod + def clean_latex(latex_str): + latex_str = re.sub(r'\s+', ' ', latex_str.strip()) + for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']: + latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str) + + replacements = { + '\\times': '*', + '\\cdot': '*', + '\\div': '/', + '\\le': '<=', + '\\ge': '>=', + '\\neq': '!=', + '\\to': 
'\\rightarrow', + } + + for old, new in replacements.items(): + latex_str = latex_str.replace(old, new) + + return latex_str + + + diff --git a/benchmarks/overall/scorers/heuristic.py b/benchmarks/overall/scorers/heuristic.py new file mode 100644 index 00000000..ac1bf0e0 --- /dev/null +++ b/benchmarks/overall/scorers/heuristic.py @@ -0,0 +1,96 @@ +from typing import List + +from rapidfuzz import fuzz + +from benchmarks.overall.scorers.clean import MarkdownCleaner +from benchmarks.overall.scorers.schema import BlockScores +from benchmarks.overall.scorers import BaseScorer + + +class HeuristicScorer(BaseScorer): + def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: + # Standardize inputs + gt_markdown = [self.clean_input(block) for block in gt_markdown] + method_markdown = self.clean_input(method_markdown) + + alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown) + scores = [alignment["score"] for alignment in alignments] + + # Find order score + orders = [alignment["start"] for alignment in alignments] + correct_order = list(range(len(gt_markdown))) + actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x]) + order_score = self.kendall_tau(correct_order, actual_order) + + # Weight score by sequence length + gt_weights = [len(g) for g in gt_markdown] + weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)] + + # Weight the score by sequence length + overall_score = sum(weighted_scores) / max(1, sum(gt_weights)) + overall_score = overall_score * 0.8 + order_score * 0.2 + return { + "score": overall_score, + "specific_scores": { + "order": order_score, + "by_block": scores + }, + } + + @staticmethod + def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float: + n = len(correct_order) + concordant = 0 + discordant = 0 + + if n <= 1: + return 100 + + for i in range(n): + for j in range(i + 1, n): + correct_sign = correct_order[i] - correct_order[j] + actual_sign = actual_order[i] - actual_order[j] + + if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0): + concordant += 1 + elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0): + discordant += 1 + + total_pairs = (n * (n - 1)) // 2 + tau = (concordant - discordant) / total_pairs + tau = (tau + 1) / 2 # 0-1 scale + return tau * 100 # 0-100 scale + + @staticmethod + def find_fuzzy_alignments( + main_string: str, + substrings: List[str], + threshold: int = 70 + ) -> List[dict]: + alignments = [] + + for idx, substr in enumerate(substrings): + result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold) + + score = 0 + dest_start = 0 + dest_end = 0 + if result: + score = result.score + dest_start = result.dest_start + dest_end = result.dest_end + + alignments.append({ + "string": substr, + "start": dest_start, + "end": dest_end, + "score": score, + "idx": idx + }) + return alignments + + + @staticmethod + def clean_input(md: str): + cleaner = MarkdownCleaner() + return cleaner(md) \ No newline at end of file diff --git a/benchmarks/overall/scorers/llm.py b/benchmarks/overall/scorers/llm.py new file mode 100644 index 00000000..8ee8d138 --- /dev/null +++ b/benchmarks/overall/scorers/llm.py @@ -0,0 +1,147 @@ +import json +import tempfile +import time +from typing import List + +from PIL import Image +from google.genai.errors import APIError +from google import genai +import pypdfium2 as pdfium + +from benchmarks.overall.scorers import BaseScorer, BlockScores 
+from marker.settings import settings + +rating_prompt = """ +You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided. +You're given an image, along with the extracted markdown: +- Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`. +- Tables will be formatted as Github flavored markdown. +- Block equations will be in LaTeX. +- The image and markdown may be in any language. +- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text. + +The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided. + +**Instructions** +Follow this process to evaluate the markdown: +1. Carefully examine the image. +2. Carefully examine the markdown input provided. +3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image? +4. Assign component scores, as described below. + +These are the primary scores: +- Overall - the overall quality of the markdown as compared to the image. +- Text quality - the quality of the text extraction from the image. +- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image. + +Depending on which elements are present in the markdown, you will assign element-specific scores. +- Tables - how effectively the tables have been extracted and formatted. +- Forms - how effectively the forms have extracted and formatted. +- Equations - how effectively block equations have been converted to LaTeX. +- Section headers - if all of the section headers have been detected, and the right levels set. +- Lists - if the lists have been properly extracted and formatted. +- Images - if images are identified and placed correctly. + +Notes on scoring: +- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text. +- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues. +- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting. +- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table. + +Output json, like in the example below. + +**Example** +Input +```markdown +# Section 1 +This is some *markdown* extracted from a document. Here is a block equation: +$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$ +``` +Output +```json +{ + "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.", + "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.", + "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. 
The equations and section headers are correct.", + "overall": 5, + "text": 5, + "formatting": 5, + "section_headers": 5, + "tables": 0, + "forms": 0, + "equations": 5, + "lists": 0, + "images": 0 +} +``` +**Input** +```markdown +{{markdown}} +``` +**Output** +""" + +comparison_keys = ["comparison"] +description_keys = ["image_description", "markdown_description"] +text_keys = comparison_keys + description_keys +score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations", + "lists", "images"] + + +class LLMScorer(BaseScorer): + def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores: + pdf_bytes = sample["pdf"] + with tempfile.NamedTemporaryFile(suffix=".pdf") as f: + f.write(pdf_bytes) + f.flush() + f.seek(0) + doc = pdfium.PdfDocument(f.name) + img = doc[0].render(scale=96/72).to_pil() + doc.close() + + return self.llm_rater(img, markdown) + + + def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores: + req_keys = text_keys + score_keys + properties = {} + for key in req_keys: + content_type = "INTEGER" if key in score_keys else "STRING" + properties[key] = {"type": content_type} + + response_schema = { + "required": req_keys, + "properties": properties, + "type": "OBJECT" + } + prompt = rating_prompt.replace("{{markdown}}", markdown) + response = self.llm_response_wrapper([img, prompt], response_schema) + assert all([k in response for k in req_keys]), f"Missing keys in response: {response}" + return { + "score": response["overall"], + "specific_scores": response, + } + + def llm_response_wrapper(self, prompt, response_schema, depth=0): + client = genai.Client( + api_key=settings.GOOGLE_API_KEY, + http_options={"timeout": 60000} + ) + try: + responses = client.models.generate_content( + model="gemini-2.0-flash", + contents=prompt, + config={ + "temperature": 0, + "response_schema": response_schema, + "response_mime_type": "application/json", + }, + ) + output = responses.candidates[0].content.parts[0].text + return json.loads(output) + except APIError as e: + print(f"Hit Gemini rate limit, waiting 120 seconds") + time.sleep(120) + if depth > 2: + raise e + return self.llm_response_wrapper(prompt, response_schema, depth + 1) \ No newline at end of file diff --git a/benchmarks/overall/scorers/schema.py b/benchmarks/overall/scorers/schema.py new file mode 100644 index 00000000..74e814fc --- /dev/null +++ b/benchmarks/overall/scorers/schema.py @@ -0,0 +1,6 @@ +from typing import TypedDict, List, Optional, Dict + + +class BlockScores(TypedDict): + score: float + specific_scores: Dict[str, float | List[float]] diff --git a/benchmarks/scoring.py b/benchmarks/scoring.py deleted file mode 100644 index 5aa9faff..00000000 --- a/benchmarks/scoring.py +++ /dev/null @@ -1,36 +0,0 @@ -from rapidfuzz import fuzz -from statistics import mean - -CHUNK_MIN_CHARS = 25 - -def chunk_text(text, chunk_len=500): - chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)] - chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS] - return chunks - - -def overlap_score(hypothesis_chunks, reference_chunks): - length_modifier = len(hypothesis_chunks) / len(reference_chunks) - search_distance = max(len(reference_chunks) // 5, 10) - chunk_scores = [] - for i, hyp_chunk in enumerate(hypothesis_chunks): - max_score = 0 - total_len = 0 - i_offset = int(i * length_modifier) - chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance)) - for j in chunk_range: - ref_chunk = 
reference_chunks[j] - score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100 - if score > max_score: - max_score = score - total_len = len(ref_chunk) - chunk_scores.append(max_score) - return chunk_scores - - -def score_text(hypothesis, reference): - # Returns a 0-1 alignment score - hypothesis_chunks = chunk_text(hypothesis) - reference_chunks = chunk_text(reference) - chunk_scores = overlap_score(hypothesis_chunks, reference_chunks) - return mean(chunk_scores) diff --git a/benchmarks/table/__init__.py b/benchmarks/table/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/table/gemini.py b/benchmarks/table/gemini.py index c58f2a92..5832a90f 100644 --- a/benchmarks/table/gemini.py +++ b/benchmarks/table/gemini.py @@ -1,7 +1,10 @@ import json from PIL import Image -import google.generativeai as genai -from google.ai.generativelanguage_v1beta.types import content +from google import genai +from google.genai import types +from io import BytesIO +from pydantic import BaseModel + from marker.settings import settings prompt = """ @@ -19,30 +22,26 @@ 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table>
tag. """.strip() -genai.configure(api_key=settings.GOOGLE_API_KEY) +class TableSchema(BaseModel): + table_html: str def gemini_table_rec(image: Image.Image): - schema = content.Schema( - type=content.Type.OBJECT, - required=["table_html"], - properties={ - "table_html": content.Schema( - type=content.Type.STRING, - ) - } + client = genai.Client( + api_key=settings.GOOGLE_API_KEY, + http_options={"timeout": 60000} ) - model = genai.GenerativeModel("gemini-2.0-flash") + image_bytes = BytesIO() + image.save(image_bytes, format="PNG") - responses = model.generate_content( - [image, prompt], # According to gemini docs, it performs better if the image is the first element - stream=False, - generation_config={ + responses = client.models.generate_content( + model="gemini-2.0-flash", + contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element + config={ "temperature": 0, - "response_schema": schema, + "response_schema": TableSchema, "response_mime_type": "application/json", }, - request_options={'timeout': 60} ) output = responses.candidates[0].content.parts[0].text diff --git a/benchmarks/table/inference.py b/benchmarks/table/inference.py new file mode 100644 index 00000000..0c6432d7 --- /dev/null +++ b/benchmarks/table/inference.py @@ -0,0 +1,178 @@ +from typing import List + +import numpy as np +from bs4 import BeautifulSoup +import pypdfium2 as pdfium +from tqdm import tqdm +import base64 +import tempfile + +from benchmarks.table.gemini import gemini_table_rec +from marker.config.parser import ConfigParser +from marker.converters.table import TableConverter +from marker.models import create_model_dict +from marker.processors.llm.llm_table import LLMTableProcessor +from marker.processors.table import TableProcessor +from marker.renderers.json import JSONBlockOutput +from marker.schema.polygon import PolygonBox +from marker.util import matrix_intersection_area + + +def extract_tables(children: List[JSONBlockOutput]): + tables = [] + for child in children: + if child.block_type == 'Table': + tables.append(child) + elif child.children: + tables.extend(extract_tables(child.children)) + return tables + + +def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool): + models = create_model_dict() + config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True}) + total_unaligned = 0 + results = [] + + iterations = len(dataset) + if max_rows is not None: + iterations = min(max_rows, len(dataset)) + + for i in tqdm(range(iterations), desc='Converting Tables'): + try: + row = dataset[i] + pdf_binary = base64.b64decode(row['pdf']) + gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns + + # Only use the basic table processors + converter = TableConverter( + config=config_parser.generate_config_dict(), + artifact_dict=models, + processor_list=[ + "marker.processors.table.TableProcessor", + "marker.processors.llm.llm_table.LLMTableProcessor", + ], + renderer=config_parser.get_renderer() + ) + + with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: + temp_pdf_file.write(pdf_binary) + temp_pdf_file.seek(0) + marker_json = converter(temp_pdf_file.name).children + + doc = pdfium.PdfDocument(temp_pdf_file.name) + page_image = doc[0].render(scale=96/72).to_pil() + doc.close() + + if len(marker_json) == 0 or 
len(gt_tables) == 0: + print(f'No tables detected, skipping...') + total_unaligned += len(gt_tables) + continue + + marker_tables = extract_tables(marker_json) + marker_table_boxes = [table.bbox for table in marker_tables] + page_bbox = marker_json[0].bbox + + if len(marker_tables) != len(gt_tables): + print(f'Number of tables do not match, skipping...') + total_unaligned += len(gt_tables) + continue + + table_images = [ + page_image.crop( + PolygonBox.from_bbox(bbox) + .rescale( + (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height) + ).bbox + ) + for bbox + in marker_table_boxes + ] + + # Normalize the bboxes + for bbox in marker_table_boxes: + bbox[0] = bbox[0] / page_bbox[2] + bbox[1] = bbox[1] / page_bbox[3] + bbox[2] = bbox[2] / page_bbox[2] + bbox[3] = bbox[3] / page_bbox[3] + + gt_boxes = [table['normalized_bbox'] for table in gt_tables] + gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes] + marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes] + table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes) + + aligned_tables = [] + used_tables = set() + unaligned_tables = set() + for table_idx, alignment in enumerate(table_alignments): + try: + max_area = np.max(alignment) + aligned_idx = np.argmax(alignment) + except ValueError: + # No alignment found + unaligned_tables.add(table_idx) + continue + + if max_area <= .01: + # No alignment found + unaligned_tables.add(table_idx) + continue + + if aligned_idx in used_tables: + # Marker table already aligned with another gt table + unaligned_tables.add(table_idx) + continue + + # Gt table doesn't align well with any marker table + gt_table_pct = gt_areas[table_idx] / max_area + if not .85 < gt_table_pct < 1.15: + unaligned_tables.add(table_idx) + continue + + # Marker table doesn't align with gt table + marker_table_pct = marker_areas[aligned_idx] / max_area + if not .85 < marker_table_pct < 1.15: + unaligned_tables.add(table_idx) + continue + + gemini_html = "" + if use_gemini: + try: + gemini_html = gemini_table_rec(table_images[aligned_idx]) + except Exception as e: + print(f'Gemini failed: {e}') + + aligned_tables.append( + (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html) + ) + used_tables.add(aligned_idx) + + total_unaligned += len(unaligned_tables) + + for marker_table, gt_table, gemini_table in aligned_tables: + gt_table_html = gt_table['html'] + + # marker wraps the table in which fintabnet data doesn't + # Fintabnet doesn't use th tags, need to be replaced for fair comparison + marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser') + tbody = marker_table_soup.find('tbody') + if tbody: + tbody.unwrap() + for th_tag in marker_table_soup.find_all('th'): + th_tag.name = 'td' + for br_tag in marker_table_soup.find_all('br'): + br_tag.replace_with(marker_table_soup.new_string('')) + + marker_table_html = str(marker_table_soup) + marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines + gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines + + results.append({ + "marker_table": marker_table_html, + "gt_table": gt_table_html, + "gemini_table": gemini_table_html + }) + except pdfium.PdfiumError: + print('Broken PDF, Skipping...') + continue + return results, total_unaligned \ No newline at end of file diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py index 3116274d..4e674c28 100644 --- a/benchmarks/table/table.py +++ 
b/benchmarks/table/table.py @@ -1,32 +1,22 @@ import os -from itertools import repeat -from tkinter import Image - -os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS +from pathlib import Path +from itertools import repeat from typing import List -import numpy as np -import base64 + import time import datasets from tqdm import tqdm -import tempfile import click from tabulate import tabulate import json -from bs4 import BeautifulSoup from concurrent.futures import ProcessPoolExecutor -from pypdfium2._helpers.misc import PdfiumError -import pypdfium2 as pdfium -from marker.util import matrix_intersection_area -from marker.renderers.json import JSONOutput, JSONBlockOutput -from marker.config.parser import ConfigParser -from marker.converters.table import TableConverter -from marker.models import create_model_dict +from marker.settings import settings +from benchmarks.table.inference import inference_tables from scoring import wrap_table_html, similarity_eval_html -from gemini import gemini_table_rec def update_teds_score(result, prefix: str = "marker"): prediction, ground_truth = result[f'{prefix}_table'], result['gt_table'] @@ -36,26 +26,16 @@ def update_teds_score(result, prefix: str = "marker"): return result -def extract_tables(children: List[JSONBlockOutput]): - tables = [] - for child in children: - if child.block_type == 'Table': - tables.append(child) - elif child.children: - tables.extend(extract_tables(child.children)) - return tables - - @click.command(help="Benchmark Table to HTML Conversion") -@click.argument("out_file", type=str) -@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use") +@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.") +@click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use") @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process") @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use") @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.") @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.") @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.") def main( - out_file: str, + result_path: str, dataset: str, max_rows: int, max_workers: int, @@ -63,130 +43,13 @@ def main( table_rec_batch_size: int | None, use_gemini: bool = False ): - models = create_model_dict() - config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size}) start = time.time() dataset = datasets.load_dataset(dataset, split='train') dataset = dataset.shuffle(seed=0) - iterations = len(dataset) - if max_rows is not None: - iterations = min(max_rows, len(dataset)) - - results = [] - total_unaligned = 0 - for i in tqdm(range(iterations), desc='Converting Tables'): - try: - row = dataset[i] - pdf_binary = base64.b64decode(row['pdf']) - gt_tables = row['tables'] #Already sorted by reading order, which is what marker returns - - converter = TableConverter( - config=config_parser.generate_config_dict(), - artifact_dict=models, - processor_list=config_parser.get_processors(), - 
renderer=config_parser.get_renderer() - ) - - with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: - temp_pdf_file.write(pdf_binary) - temp_pdf_file.seek(0) - tqdm.disable = True - marker_json = converter(temp_pdf_file.name).children - tqdm.disable = False - - doc = pdfium.PdfDocument(temp_pdf_file.name) - page_image = doc[0].render(scale=92/72).to_pil() - - if len(marker_json) == 0 or len(gt_tables) == 0: - print(f'No tables detected, skipping...') - total_unaligned += len(gt_tables) - continue - - marker_tables = extract_tables(marker_json) - marker_table_boxes = [table.bbox for table in marker_tables] - page_bbox = marker_json[0].bbox - w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3] - table_images = [page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox in marker_table_boxes] - - # Normalize the bboxes - for bbox in marker_table_boxes: - bbox[0] = bbox[0] / page_bbox[2] - bbox[1] = bbox[1] / page_bbox[3] - bbox[2] = bbox[2] / page_bbox[2] - bbox[3] = bbox[3] / page_bbox[3] - - gt_boxes = [table['normalized_bbox'] for table in gt_tables] - gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes] - marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes] - table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes) - - aligned_tables = [] - used_tables = set() - unaligned_tables = set() - for table_idx, alignment in enumerate(table_alignments): - try: - max_area = np.max(alignment) - aligned_idx = np.argmax(alignment) - except ValueError: - # No alignment found - unaligned_tables.add(table_idx) - continue - - if aligned_idx in used_tables: - # Marker table already aligned with another gt table - unaligned_tables.add(table_idx) - continue - - # Gt table doesn't align well with any marker table - gt_table_pct = gt_areas[table_idx] / max_area - if not .75 < gt_table_pct < 1.25: - unaligned_tables.add(table_idx) - continue - - # Marker table doesn't align with gt table - marker_table_pct = marker_areas[aligned_idx] / max_area - if not .75 < marker_table_pct < 1.25: - unaligned_tables.add(table_idx) - continue - - gemini_html = "" - if use_gemini: - gemini_html = gemini_table_rec(table_images[aligned_idx]) - - aligned_tables.append( - (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html) - ) - used_tables.add(aligned_idx) - - total_unaligned += len(unaligned_tables) - - for marker_table, gt_table, gemini_table in aligned_tables: - gt_table_html = gt_table['html'] - - #marker wraps the table in which fintabnet data doesn't - #Fintabnet doesn't use th tags, need to be replaced for fair comparison - marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser') - tbody = marker_table_soup.find('tbody') - if tbody: - tbody.unwrap() - for th_tag in marker_table_soup.find_all('th'): - th_tag.name = 'td' - marker_table_html = str(marker_table_soup) - marker_table_html = marker_table_html.replace("
", " ") # Fintabnet uses spaces instead of newlines - marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines - gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines - - results.append({ - "marker_table": marker_table_html, - "gt_table": gt_table_html, - "gemini_table": gemini_table_html - }) - except PdfiumError: - print('Broken PDF, Skipping...') - continue + results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini) print(f"Total time: {time.time() - start}.") print(f"Could not align {total_unaligned} tables from fintabnet.") @@ -223,8 +86,12 @@ def main( "gemini": gemini_results } - with open(out_file, "w+") as f: + out_path = Path(result_path) + out_path.mkdir(parents=True, exist_ok=True) + with open(out_path / "table.json", "w+") as f: json.dump(results, f, indent=2) + print(f"Results saved to {out_path}.") + if __name__ == '__main__': main() \ No newline at end of file diff --git a/benchmarks/throughput/__init__.py b/benchmarks/throughput/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/throughput/main.py b/benchmarks/throughput/main.py new file mode 100644 index 00000000..6e07054b --- /dev/null +++ b/benchmarks/throughput/main.py @@ -0,0 +1,39 @@ +import time +import torch + +import click +import pypdfium2 as pdfium + +from marker.converters.pdf import PdfConverter +from marker.models import create_model_dict + + +@click.command(help="Benchmark PDF to MD conversion throughput.") +@click.argument("pdf_path", type=str) +def main(pdf_path): + print(f"Converting {pdf_path} to markdown...") + pdf = pdfium.PdfDocument(pdf_path) + page_count = len(pdf) + pdf.close() + model_dict = create_model_dict() + torch.cuda.reset_peak_memory_stats() + + times = [] + for i in range(10): + block_converter = PdfConverter( + artifact_dict=model_dict, + config={"disable_tqdm": True} + ) + start = time.time() + block_converter(pdf_path) + total = time.time() - start + times.append(total) + + max_gpu_vram = torch.cuda.max_memory_allocated() / 1024 ** 3 + + print(f"Converted {page_count} pages in {sum(times)/len(times):.2f} seconds.") + print(f"Max GPU VRAM: {max_gpu_vram:.2f} GB") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/verify_scores.py b/benchmarks/verify_scores.py index 913081e9..088f137e 100644 --- a/benchmarks/verify_scores.py +++ b/benchmarks/verify_scores.py @@ -6,18 +6,18 @@ def verify_scores(file_path): with open(file_path, 'r') as file: data = json.load(file) - multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"] - switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"] - - if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40: - raise ValueError("One or more scores are below the required threshold of 0.4") + raw_scores = [data["scores"][k] for k in data["scores"]] + marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores] + marker_score = sum(marker_scores) / len(marker_scores) + if marker_score < 90: + raise ValueError("Marker score below 90") def verify_table_scores(file_path): with open(file_path, 'r') as file: data = json.load(file) - avg = sum([r["score"] for r in data]) / len(data) + avg = sum([r["marker_score"] for r in data["marker"]]) / len(data) if avg < 0.7: raise ValueError("Average score is below the required threshold of 0.7") diff --git a/data/images/overall.png b/data/images/overall.png index 
0946421a..1e2a9cce 100644 Binary files a/data/images/overall.png and b/data/images/overall.png differ diff --git a/data/images/per_doc.png b/data/images/per_doc.png index ed26cfb9..91694b04 100644 Binary files a/data/images/per_doc.png and b/data/images/per_doc.png differ diff --git a/data/images/table.png b/data/images/table.png new file mode 100644 index 00000000..8c6d81cb Binary files /dev/null and b/data/images/table.png differ diff --git a/marker/builders/layout.py b/marker/builders/layout.py index ff4af17d..0eba225a 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -22,7 +22,7 @@ class LayoutBuilder(BaseBuilder): """ A builder for performing layout detection on PDF pages and merging the results into the document. """ - batch_size: Annotated[ + layout_batch_size: Annotated[ Optional[int], "The batch size to use for the layout model.", "Default is None, which will use the default batch size for the model." @@ -36,7 +36,7 @@ class LayoutBuilder(BaseBuilder): float, "The minimum coverage ratio required for the layout model to consider", "the lines from the PdfProvider valid.", - ] = .1 + ] = .25 document_ocr_threshold: Annotated[ float, "The minimum ratio of pages that must pass the layout coverage check", @@ -67,8 +67,8 @@ def __call__(self, document: Document, provider: PdfProvider): self.merge_blocks(document.pages, provider.page_lines) def get_batch_size(self): - if self.batch_size is not None: - return self.batch_size + if self.layout_batch_size is not None: + return self.layout_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": return 6 return 6 @@ -140,7 +140,11 @@ def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: Pro good_pages = [] for (document_page, ocr_error_detection_label) in zip(document_pages, ocr_error_detection_labels): provider_lines = provider_page_lines.get(document_page.page_id, []) - good_pages.append(bool(provider_lines) and self.check_layout_coverage(document_page, provider_lines) and (ocr_error_detection_label != "bad")) + good_pages.append( + bool(provider_lines) and + self.check_layout_coverage(document_page, provider_lines) and + (ocr_error_detection_label != "bad") + ) ocr_document = sum(good_pages) / len(good_pages) < self.document_ocr_threshold for idx, document_page in enumerate(document_pages): @@ -180,7 +184,7 @@ def check_layout_coverage( large_text_blocks += 1 coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1 - text_okay = coverage_ratio >= self.layout_coverage_threshold + text_okay = coverage_ratio > self.layout_coverage_threshold # Model will sometimes say there is a single block of text on the page when it is blank if not text_okay and (total_blocks == 1 and large_text_blocks == 1): diff --git a/marker/builders/llm_layout.py b/marker/builders/llm_layout.py index 6e21fd09..8dbccc1f 100644 --- a/marker/builders/llm_layout.py +++ b/marker/builders/llm_layout.py @@ -1,10 +1,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Annotated -from google.ai.generativelanguage_v1beta.types import content from surya.layout import LayoutPredictor from surya.ocr_error import OCRErrorPredictor from tqdm import tqdm +from pydantic import BaseModel from marker.builders.layout import LayoutBuilder from marker.processors.llm import GoogleModel @@ -41,7 +41,7 @@ class LLMLayoutBuilder(LayoutBuilder): max_retries: Annotated[ int, "The maximum number of retries to use for the Gemini model.", - ] = 3 + ] = 2 max_concurrency: Annotated[ int, "The maximum 
number of concurrent requests to make to the Gemini model.", @@ -50,6 +50,10 @@ class LLMLayoutBuilder(LayoutBuilder): int, "The timeout for requests to the Gemini model.", ] = 60 + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False topk_relabelling_prompt: Annotated[ str, "The prompt to use for relabelling blocks.", @@ -107,7 +111,7 @@ def __call__(self, document: Document, provider: PdfProvider): print(f"Error relabelling blocks: {e}") def relabel_blocks(self, document: Document): - pbar = tqdm(desc="LLM layout relabelling") + pbar = tqdm(desc="LLM layout relabelling", disable=self.disable_tqdm) with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: futures = [] for page in document.pages: @@ -154,21 +158,15 @@ def process_block_complex_relabeling(self, document: Document, page: PageGroup, def process_block_relabeling(self, document: Document, page: PageGroup, block: Block, prompt: str): image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["image_description", "label"], - properties={ - "image_description": content.Schema( - type=content.Type.STRING, - ), - "label": content.Schema( - type=content.Type.STRING, - ), - }, - ) - response = self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response( + prompt, + image, + block, + LayoutSchema, + max_retries=self.max_retries, + timeout=self.timeout + ) generated_label = None if response and "label" in response: generated_label = response["label"] @@ -184,3 +182,8 @@ def process_block_relabeling(self, document: Document, page: PageGroup, block: B def extract_image(self, document: Document, image_block: Block, expand: float = 0.01): return image_block.get_image(document, highres=False, expansion=(expand, expand)) + + +class LayoutSchema(BaseModel): + image_description: str + label: str diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 3741b760..01f69695 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -41,6 +41,7 @@ from marker.schema.registry import register_block_class from marker.util import strings_to_classes from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor +from marker.processors.order import OrderProcessor class PdfConverter(BaseConverter): @@ -59,6 +60,7 @@ class PdfConverter(BaseConverter): "Enable higher quality processing with LLMs.", ] = False default_processors: Tuple[BaseProcessor, ...] 
= ( + OrderProcessor, BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 5f5be17c..20ac0fb4 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -1,7 +1,4 @@ from typing import Annotated, List, Optional, Tuple - -from texify.inference import batch_inference -from texify.model.model import GenerateVisionEncoderDecoderModel from tqdm import tqdm from marker.models import TexifyPredictor @@ -32,6 +29,10 @@ class EquationProcessor(BaseProcessor): int, "The number of tokens to buffer above max for the Texify model.", ] = 256 + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False def __init__(self, texify_model: TexifyPredictor, config=None): super().__init__(config) @@ -53,11 +54,12 @@ def __call__(self, document: Document): "token_count": token_count }) + if len(equation_data) == 0: + return + predictions = self.get_latex_batched(equation_data) for prediction, equation_d in zip(predictions, equation_data): conditions = [ - self.get_total_texify_tokens(prediction) < self.model_max_length, - # Make sure we didn't get to the overall token max, indicates run-on len(prediction) > equation_d["token_count"] * .4, len(prediction.strip()) > 0 ] @@ -77,28 +79,15 @@ def get_batch_size(self): return 2 def get_latex_batched(self, equation_data: List[dict]): - predictions = [""] * len(equation_data) - batch_size = self.get_batch_size() - - for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations"): - # Dynamically set max length to save inference time - min_idx = i - max_idx = min(min_idx + batch_size, len(equation_data)) - - batch_equations = equation_data[min_idx:max_idx] - batch_images = [eq["image"] for eq in batch_equations] - - model_output = self.texify_model( - batch_images - ) - - for j, output in enumerate(model_output): - token_count = self.get_total_texify_tokens(output.text) - if token_count >= self.model_max_length - 1: - output.text = "" - - image_idx = i + j - predictions[image_idx] = output.text + inference_images = [eq["image"] for eq in equation_data] + model_output = self.texify_model(inference_images, batch_size=self.get_batch_size()) + predictions = [output.text for output in model_output] + + for i, pred in enumerate(predictions): + token_count = self.get_total_texify_tokens(pred) + # If we're at the max token length, the prediction may be repetitive or invalid + if token_count >= self.model_max_length - 1: + predictions[i] = "" return predictions def get_total_texify_tokens(self, text): diff --git a/marker/processors/llm/__init__.py b/marker/processors/llm/__init__.py index 3d61166e..21ee04f3 100644 --- a/marker/processors/llm/__init__.py +++ b/marker/processors/llm/__init__.py @@ -27,7 +27,7 @@ class BaseLLMProcessor(BaseProcessor): max_retries: Annotated[ int, "The maximum number of retries to use for the Gemini model.", - ] = 3 + ] = 1 max_concurrency: Annotated[ int, "The maximum number of concurrent requests to make to the Gemini model.", @@ -35,7 +35,7 @@ class BaseLLMProcessor(BaseProcessor): timeout: Annotated[ int, "The timeout for requests to the Gemini model.", - ] = 60 + ] = 15 image_expansion_ratio: Annotated[ float, "The ratio to expand the image by when cropping.", @@ -44,6 +44,10 @@ class BaseLLMProcessor(BaseProcessor): bool, "Whether to use the LLM model.", ] = False + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False block_types = None def 
__init__(self, config=None): @@ -73,7 +77,7 @@ def rewrite_blocks(self, document: Document): if total_blocks == 0: return - pbar = tqdm(desc=f"{self.__class__.__name__} running") + pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm) with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: for future in as_completed([ executor.submit(self.process_rewriting, document, page, block) diff --git a/marker/processors/llm/llm_complex.py b/marker/processors/llm/llm_complex.py index 52c46364..72966d62 100644 --- a/marker/processors/llm/llm_complex.py +++ b/marker/processors/llm/llm_complex.py @@ -1,9 +1,8 @@ import markdown2 +from pydantic import BaseModel from marker.processors.llm import BaseLLMProcessor -from google.ai.generativelanguage_v1beta.types import content - from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.document import Document @@ -55,18 +54,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): text = block.raw_text(document) prompt = self.complex_region_prompt.replace("{extracted_text}", text) image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["corrected_markdown"], - properties={ - "corrected_markdown": content.Schema( - type=content.Type.STRING - ) - }, - ) - response = self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response(prompt, image, block, ComplexSchema) if not response or "corrected_markdown" not in response: block.update_metadata(llm_error_count=1) @@ -85,4 +74,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): # Convert LLM markdown to html corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip() - block.html = markdown2.markdown(corrected_markdown) \ No newline at end of file + block.html = markdown2.markdown(corrected_markdown, extras=["tables"]) + +class ComplexSchema(BaseModel): + corrected_markdown: str \ No newline at end of file diff --git a/marker/processors/llm/llm_equation.py b/marker/processors/llm/llm_equation.py index 74cfc4a3..89d0318d 100644 --- a/marker/processors/llm/llm_equation.py +++ b/marker/processors/llm/llm_equation.py @@ -1,6 +1,6 @@ -from marker.processors.llm import BaseLLMProcessor +from pydantic import BaseModel -from google.ai.generativelanguage_v1beta.types import content +from marker.processors.llm import BaseLLMProcessor from marker.schema import BlockTypes from marker.schema.blocks import Equation @@ -67,18 +67,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Equation prompt = self.equation_latex_prompt.replace("{equation}", text) image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["html_equation"], - properties={ - "html_equation": content.Schema( - type=content.Type.STRING - ) - }, - ) - response = self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response(prompt, image, block, EquationSchema) if not response or "html_equation" not in response: block.update_metadata(llm_error_count=1) @@ -89,3 +79,6 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Equation block.update_metadata(llm_error_count=1) return block.html = html_equation + +class EquationSchema(BaseModel): + html_equation: str diff --git a/marker/processors/llm/llm_form.py 
b/marker/processors/llm/llm_form.py index fc66f155..a47bad3c 100644 --- a/marker/processors/llm/llm_form.py +++ b/marker/processors/llm/llm_form.py @@ -1,6 +1,6 @@ -from marker.processors.llm import BaseLLMProcessor +from pydantic import BaseModel -from google.ai.generativelanguage_v1beta.types import content +from marker.processors.llm import BaseLLMProcessor from marker.schema import BlockTypes from marker.schema.blocks import Block @@ -13,13 +13,14 @@ class LLMFormProcessor(BaseLLMProcessor): form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images. You will receive an image of a text block and an html representation of the form in the image. Your task is to correct any errors in the html representation, and format it properly. -Values and labels should appear in html tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible. +Values and labels should appear in html tables, with the labels on the left side, and values on the right. Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible. **Instructions:** 1. Carefully examine the provided form block image. 2. Analyze the html representation of the form. -3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed." -4. If the html representation contains errors, generate the corrected html representation. -5. Output only either the corrected html representation or "No corrections needed." +3. Compare the html representation to the image. +4. If the html representation is correct, or you cannot read the image properly, then write "No corrections needed." +5. If the html representation contains errors, generate the corrected html representation. +6. Output only either the corrected html representation or "No corrections needed." **Example:** Input: ```html @@ -37,12 +38,9 @@ class LLMFormProcessor(BaseLLMProcessor): ``` Output: +Comparison: The html representation has the labels in the first row and the values in the second row. It should be corrected to have the labels on the left side and the values on the right side. 
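The llm_form.py hunk above follows the same refactor applied throughout this diff: the hand-assembled `content.Schema` objects from `google.ai.generativelanguage_v1beta` are replaced by small Pydantic models that are passed straight to the new `google-genai` client as the `response_schema`. A minimal sketch of that call pattern, mirroring the client usage shown elsewhere in this diff in `marker/processors/llm/utils.py` and `benchmarks/table/gemini.py`; the standalone `rewrite_form` helper and its arguments are illustrative, not part of the patch:

```python
import json
from io import BytesIO

from PIL import Image
from google import genai
from google.genai import types
from pydantic import BaseModel


class FormSchema(BaseModel):
    # Matches the schema added at the bottom of llm_form.py in this diff
    comparison: str
    corrected_html: str


def rewrite_form(image: Image.Image, prompt: str, api_key: str) -> dict:
    # The genai client takes raw image parts rather than PIL objects,
    # so the crop is serialized to PNG bytes first
    buf = BytesIO()
    image.save(buf, format="PNG")

    client = genai.Client(api_key=api_key, http_options={"timeout": 60000})
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        # Per the Gemini docs, results are better when the image comes before the prompt
        contents=[types.Part.from_bytes(data=buf.getvalue(), mime_type="image/png"), prompt],
        config={
            "temperature": 0,
            "response_schema": FormSchema,  # Pydantic model replaces content.Schema
            "response_mime_type": "application/json",
        },
    )
    # Structured output arrives as JSON text in the first candidate
    return json.loads(response.candidates[0].content.parts[0].text)
```

Passing the Pydantic class lets the client derive the JSON schema itself, which is what allows the `content.Schema` boilerplate to be deleted from the remaining processors in this diff.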
```html - - - - @@ -73,18 +71,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): prompt = self.form_rewriting_prompt.replace("{block_html}", block_html) image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["corrected_html"], - properties={ - "corrected_html": content.Schema( - type=content.Type.STRING - ) - }, - ) - response = self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response(prompt, image, block, FormSchema) if not response or "corrected_html" not in response: block.update_metadata(llm_error_count=1) @@ -102,4 +90,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): return corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip() - block.html = corrected_html \ No newline at end of file + block.html = corrected_html + +class FormSchema(BaseModel): + comparison: str + corrected_html: str \ No newline at end of file diff --git a/marker/processors/llm/llm_handwriting.py b/marker/processors/llm/llm_handwriting.py index d3e9b9f3..760efb35 100644 --- a/marker/processors/llm/llm_handwriting.py +++ b/marker/processors/llm/llm_handwriting.py @@ -1,9 +1,8 @@ import markdown2 +from pydantic import BaseModel from marker.processors.llm import BaseLLMProcessor -from google.ai.generativelanguage_v1beta.types import content - from marker.schema import BlockTypes from marker.schema.blocks import Handwriting, Text from marker.schema.document import Document @@ -49,18 +48,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Handwrit prompt = self.handwriting_generation_prompt image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["markdown"], - properties={ - "markdown": content.Schema( - type=content.Type.STRING - ) - }, - ) - - response = self.model.generate_response(prompt, image, block, response_schema) + + response = self.model.generate_response(prompt, image, block, HandwritingSchema) if not response or "markdown" not in response: block.update_metadata(llm_error_count=1) @@ -72,4 +61,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Handwrit return markdown = markdown.strip().lstrip("```markdown").rstrip("```").strip() - block.html = markdown2.markdown(markdown) + block.html = markdown2.markdown(markdown, extras=["tables"]) + +class HandwritingSchema(BaseModel): + markdown: str diff --git a/marker/processors/llm/llm_image_description.py b/marker/processors/llm/llm_image_description.py index a08e0dc9..c125df0f 100644 --- a/marker/processors/llm/llm_image_description.py +++ b/marker/processors/llm/llm_image_description.py @@ -1,6 +1,6 @@ -from marker.processors.llm import BaseLLMProcessor +from pydantic import BaseModel -from google.ai.generativelanguage_v1beta.types import content +from marker.processors.llm import BaseLLMProcessor from marker.schema import BlockTypes from marker.schema.blocks import Block @@ -49,18 +49,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document)) image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["image_description"], - properties={ - "image_description": content.Schema( - type=content.Type.STRING - ) - }, - ) - response = 
self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response(prompt, image, block, ImageSchema) if not response or "image_description" not in response: block.update_metadata(llm_error_count=1) @@ -72,3 +62,6 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): return block.description = image_description + +class ImageSchema(BaseModel): + image_description: str diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py index e0c738a0..1ec1f8cd 100644 --- a/marker/processors/llm/llm_table.py +++ b/marker/processors/llm/llm_table.py @@ -1,8 +1,8 @@ from typing import Annotated, List, Tuple from bs4 import BeautifulSoup -from google.ai.generativelanguage_v1beta.types import content from PIL import Image +from pydantic import BaseModel from marker.processors.llm import BaseLLMProcessor from marker.schema import BlockTypes @@ -34,21 +34,21 @@ class LLMTableProcessor(BaseLLMProcessor): "The prompt to use for rewriting text.", "Default is a string containing the Gemini rewriting prompt." ] = """You are a text correction expert specializing in accurately reproducing text from images. -You will receive an image of a text block and an html representation of the table in the image. +You will receive an image and an html representation of the table in the image. Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible. Some guidelines: - Make sure to reproduce the original values as faithfully as possible. -- If you see any math in a table cell, fence it with the tag. Block math should be fenced with . +- If you see any math in a table cell, fence it with the tag. Block math should be fenced with . - Replace any images with a description, like "Image: [description]". - Only use the tags th, td, tr, br, span, i, b, math, and table. Only use the attributes display, style, colspan, and rowspan if necessary. You can use br to break up text lines in cells. +- Make sure the columns and rows match the image faithfully, and are easily readable and interpretable by a human. **Instructions:** 1. Carefully examine the provided text block image. 2. Analyze the html representation of the table. -3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed." -4. If the html representation contains errors, generate the corrected html representation. -5. Output only either the corrected html representation or "No corrections needed." +3. Write a comparison of the image and the html representation. +4. If the html representation is completely correct, or you cannot read the image properly, then write "No corrections needed." If the html representation has errors, generate the corrected html representation. Output only either the corrected html representation or "No corrections needed." **Example:** Input: ```html @@ -67,6 +67,7 @@ class LLMTableProcessor(BaseLLMProcessor): ``` Output: ```html +Comparison: The image shows a table with 2 rows and 3 columns. The text and formatting of the html table matches the image. No corrections needed. 
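Both the form and table prompts now ask the model to write a comparison before deciding whether corrections are needed, and the new `TableSchema` at the end of llm_table.py captures that ordering (the comparison field comes first, then `corrected_html`). A small, hypothetical sketch of how such a response might be consumed; the `apply_table_response` helper and its fall-back behaviour are illustrative assumptions, not code from the patch:

```python
from pydantic import BaseModel


class TableSchema(BaseModel):
    # Mirrors the schema added at the end of llm_table.py in this diff:
    # the comparison comes first so the model reasons before it corrects
    comparison: str
    corrected_html: str


def apply_table_response(response: dict | None, original_html: str) -> str:
    # Hypothetical consumer: keep the original table html unless the model
    # returned a usable correction
    if not response or "corrected_html" not in response:
        return original_html

    corrected = response["corrected_html"].strip()
    if "no corrections needed" in corrected.lower():
        return original_html
    return corrected
```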
``` **Input:** @@ -133,18 +134,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Table): def rewrite_single_chunk(self, page: PageGroup, block: Block, block_html: str, children: List[TableCell], image: Image.Image): prompt = self.table_rewriting_prompt.replace("{block_html}", block_html) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["corrected_html"], - properties={ - "corrected_html": content.Schema( - type=content.Type.STRING - ) - }, - ) - - response = self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response(prompt, image, block, TableSchema) if not response or "corrected_html" not in response: block.update_metadata(llm_error_count=1) @@ -246,3 +236,7 @@ def parse_html_table(self, html_text: str, block: Block, page: PageGroup) -> Lis cur_col += colspan return cells + +class TableSchema(BaseModel): + comparison: str + corrected_html: str diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py index e2012998..c978a906 100644 --- a/marker/processors/llm/llm_table_merge.py +++ b/marker/processors/llm/llm_table_merge.py @@ -1,7 +1,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Annotated, List, Tuple, Literal -from google.ai.generativelanguage_v1beta.types import content +from pydantic import BaseModel from tqdm import tqdm from PIL import Image @@ -39,11 +39,15 @@ class LLMTableMergeProcessor(BaseLLMProcessor): horizontal_table_distance_threshold: Annotated[ int, "The maximum distance between table edges for adjacency." - ] = 20 + ] = 10 column_gap_threshold: Annotated[ int, "The maximum gap between columns to merge tables" ] = 50 + disable_tqdm: Annotated[ + bool, + "Whether to disable the tqdm progress bar.", + ] = False table_merge_prompt: Annotated[ str, "The prompt to use for rewriting text.", @@ -114,6 +118,9 @@ class LLMTableMergeProcessor(BaseLLMProcessor): @staticmethod def get_row_count(cells: List[TableCell]): + if not cells: + return 0 + max_rows = None for col_id in set([cell.col_id for cell in cells]): col_cells = [cell for cell in cells if cell.col_id == col_id] @@ -126,6 +133,9 @@ def get_row_count(cells: List[TableCell]): @staticmethod def get_column_count(cells: List[TableCell]): + if not cells: + return 0 + max_cols = None for row_id in set([cell.row_id for cell in cells]): row_cells = [cell for cell in cells if cell.row_id == row_id] @@ -137,7 +147,7 @@ def get_column_count(cells: List[TableCell]): return max_cols def rewrite_blocks(self, document: Document): - pbar = tqdm(desc=f"{self.__class__.__name__} running") + pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm) table_runs = [] table_run = [] prev_block = None @@ -230,36 +240,11 @@ def process_rewriting(self, document: Document, blocks: List[Block]): prompt = self.table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["table1_description", "table2_description", "explanation", "merge", "direction"], - properties={ - "table1_description": content.Schema( - type=content.Type.STRING - ), - "table2_description": content.Schema( - type=content.Type.STRING - ), - "explanation": content.Schema( - type=content.Type.STRING - ), - "merge": content.Schema( - type=content.Type.STRING, - enum=["true", "false"] - ), - "direction": content.Schema( - type=content.Type.STRING, - 
enum=["bottom", "right"] - ), - }, - ) - response = self.model.generate_response( prompt, [start_image, curr_image], curr_block, - response_schema + MergeSchema, ) if not response or ("direction" not in response or "merge" not in response): @@ -331,4 +316,12 @@ def join_images(image1: Image.Image, image2: Image.Image, direction: Literal['ri new_img = Image.new('RGB', (new_width, new_height), 'white') new_img.paste(image1, (0, 0)) new_img.paste(image2, (0, h1)) - return new_img \ No newline at end of file + return new_img + + +class MergeSchema(BaseModel): + table1_description: str + table2_description: str + explanation: str + merge: Literal["true", "false"] + direction: Literal["bottom", "right"] \ No newline at end of file diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py index 8a71b54e..0e5faa35 100644 --- a/marker/processors/llm/llm_text.py +++ b/marker/processors/llm/llm_text.py @@ -1,9 +1,10 @@ import json -import textwrap +from typing import List + +from pydantic import BaseModel from marker.processors.llm import BaseLLMProcessor from bs4 import BeautifulSoup -from google.ai.generativelanguage_v1beta.types import content from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.document import Document @@ -28,10 +29,11 @@ class LLMTextProcessor(BaseLLMProcessor): * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. -5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error. +5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error. 6. Ensure that inline math is properly with inline math tags. 7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines. 8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below. +9. You absolutely cannot remove any ... tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them. **Example:** @@ -39,7 +41,7 @@ class LLMTextProcessor(BaseLLMProcessor): ``` { "extracted_lines": [ - "Adversarial training (AT) [23], which aims to minimize\n", + "Adversarial training (AT) [23], which aims to minimize\n", "the model's risk under the worst-case perturbations, is cur-\n", "rently the most effective approach for improving the robust-\n", "ness of deep neural networks. For a given neural network\n", @@ -54,7 +56,7 @@ class LLMTextProcessor(BaseLLMProcessor): ```json { "corrected_lines": [ - "Adversarial training (AT) [23], which aims to minimize\n", + "Adversarial training (AT) [23], which aims to minimize\n", "the model's risk under the worst-case perturbations, is cur-\n", "rently the most effective approach for improving the robust-\n", "ness of deep neural networks. 
For a given neural network\n", @@ -78,21 +80,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): prompt = self.text_math_rewriting_prompt.replace("{extracted_lines}", json.dumps({"extracted_lines": extracted_lines}, indent=2)) image = self.extract_image(document, block) - response_schema = content.Schema( - type=content.Type.OBJECT, - enum=[], - required=["corrected_lines"], - properties={ - "corrected_lines": content.Schema( - type=content.Type.ARRAY, - items=content.Schema( - type=content.Type.STRING, - ), - ) - }, - ) - response = self.model.generate_response(prompt, image, block, response_schema) + response = self.model.generate_response(prompt, image, block, LLMTextSchema) if not response or "corrected_lines" not in response: block.update_metadata(llm_error_count=1) return @@ -120,34 +109,44 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): minimum_position=0, maximum_position=0, formats=[span['type']], + url=span.get('url'), page_id=text_line.page_id, text_extraction_method="gemini", ) ) text_line.structure.append(span_block.id) - def text_to_spans(self, text): + @staticmethod + def text_to_spans(text): soup = BeautifulSoup(text, 'html.parser') tag_types = { 'b': 'bold', 'i': 'italic', - 'math': 'math' + 'math': 'math', } spans = [] for element in soup.descendants: if not len(list(element.parents)) == 1: continue + + url = element.attrs.get('href') if hasattr(element, 'attrs') else None + if element.name in tag_types: spans.append({ 'type': tag_types[element.name], - 'content': element.get_text() + 'content': element.get_text(), + 'url': url }) elif element.string: spans.append({ 'type': 'plain', - 'content': element.string + 'content': element.string, + 'url': url }) return spans + +class LLMTextSchema(BaseModel): + corrected_lines: List[str] \ No newline at end of file diff --git a/marker/processors/llm/utils.py b/marker/processors/llm/utils.py index da7be67f..a36bdb4d 100644 --- a/marker/processors/llm/utils.py +++ b/marker/processors/llm/utils.py @@ -1,13 +1,16 @@ import json import time +from io import BytesIO from typing import List import PIL -import google.generativeai as genai -from google.ai.generativelanguage_v1beta.types import content -from google.api_core.exceptions import ResourceExhausted +from google import genai +from google.genai import types +from google.genai.errors import APIError +from pydantic import BaseModel from marker.schema.blocks import Block +from marker.settings import settings class GoogleModel: @@ -17,45 +20,59 @@ def __init__(self, api_key: str, model_name: str): self.api_key = api_key self.model_name = model_name - self.model = self.configure_google_model() - def configure_google_model(self): - genai.configure(api_key=self.api_key) - return genai.GenerativeModel(self.model_name) + def get_google_client(self, timeout: int = 60): + return genai.Client( + api_key=settings.GOOGLE_API_KEY, + http_options={"timeout": timeout * 1000} # Convert to milliseconds + ) + + def img_to_bytes(self, img: PIL.Image.Image): + image_bytes = BytesIO() + img.save(image_bytes, format="PNG") + return image_bytes.getvalue() def generate_response( self, prompt: str, image: PIL.Image.Image | List[PIL.Image.Image], block: Block, - response_schema: content.Schema, - max_retries: int = 3, - timeout: int = 60 + response_schema: type[BaseModel], + max_retries: int = 1, + timeout: int = 15 ): if not isinstance(image, list): image = [image] + + client = self.get_google_client(timeout=timeout) + image_parts = 
[types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/png") for img in image] + tries = 0 while tries < max_retries: try: - responses = self.model.generate_content( - image + [prompt], # According to gemini docs, it performs better if the image is the first element - stream=False, - generation_config={ + responses = client.models.generate_content( + model="gemini-2.0-flash", + contents=image_parts + [prompt], # According to gemini docs, it performs better if the image is the first element + config={ "temperature": 0, "response_schema": response_schema, "response_mime_type": "application/json", - }, - request_options={'timeout': timeout} + } ) output = responses.candidates[0].content.parts[0].text total_tokens = responses.usage_metadata.total_token_count block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1) return json.loads(output) - except ResourceExhausted as e: - tries += 1 - wait_time = tries * 3 - print(f"ResourceExhausted: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{max_retries})") - time.sleep(wait_time) + except APIError as e: + if e.code == 429: + # Rate limit exceeded + tries += 1 + wait_time = tries * 3 + print(f"APIError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{max_retries})") + time.sleep(wait_time) + else: + print(e) + break except Exception as e: print(e) break diff --git a/marker/processors/order.py b/marker/processors/order.py index b28e57c3..146eaf30 100644 --- a/marker/processors/order.py +++ b/marker/processors/order.py @@ -1,4 +1,5 @@ from statistics import mean +from collections import defaultdict from marker.processors import BaseProcessor from marker.schema import BlockTypes @@ -13,41 +14,53 @@ class OrderProcessor(BaseProcessor): def __call__(self, document: Document): for page in document.pages: + # Skip OCRed pages if page.text_extraction_method != "pdftext": continue + # Skip pages without layout slicing if not page.layout_sliced: continue - block_idxs = {} + block_idxs = defaultdict(int) for block_id in page.structure: block = document.get_block(block_id) spans = block.contained_blocks(document, (BlockTypes.Span, )) if len(spans) == 0: continue + # Avg span position in original PDF block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 for block_id in page.structure: - if block_id in block_idxs and block_idxs[block_id] > 0: + # Already assigned block id via span position + if block_idxs[block_id] > 0: continue + block = document.get_block(block_id) prev_block = document.get_prev_block(block) next_block = document.get_next_block(block) + block_idx_add = 0 + if prev_block: + block_idx_add = 1 + while prev_block and prev_block.id not in block_idxs: prev_block = document.get_prev_block(prev_block) + block_idx_add += 1 if not prev_block: + block_idx_add = -1 while next_block and next_block.id not in block_idxs: next_block = document.get_next_block(next_block) + block_idx_add -= 1 if not next_block and not prev_block: - block_idxs[block_id] = 0 + pass elif prev_block: - block_idxs[block_id] = block_idxs[prev_block.id] + 1 + block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add else: - block_idxs[block_id] = block_idxs[next_block.id] - 1 + block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) diff --git a/marker/processors/table.py b/marker/processors/table.py index 75b723c0..8bd2831e 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -29,7 +29,7 @@ class 
TableProcessor(BaseProcessor): bool, "Whether to detect boxes for the table recognition model.", ] = False - detector_batch_size: Annotated[ + detection_batch_size: Annotated[ int, "The batch size to use for the table detection model.", "Default is None, which will use the default batch size for the model." @@ -101,6 +101,7 @@ def __call__(self, document: Document): ) self.assign_text_to_cells(tables, table_data) self.split_combined_rows(tables) # Split up rows that were combined + self.combine_dollar_column(tables) # Combine columns that are just dollar signs # Assign table cells to the table table_idx = 0 @@ -168,6 +169,64 @@ def normalize_spaces(text): text = text.replace(space, ' ') return text + + def combine_dollar_column(self, tables: List[TableResult]): + for table in tables: + if len(table.cells) == 0: + # Skip empty tables + continue + unique_cols = sorted(list(set([c.col_id for c in table.cells]))) + max_col = max(unique_cols) + dollar_cols = [] + for col in unique_cols: + # Cells in this col + col_cells = [c for c in table.cells if c.col_id == col] + col_text = ["\n".join(self.finalize_cell_text(c)).strip() for c in col_cells] + all_dollars = all([ct in ["", "$"] for ct in col_text]) + colspans = [c.colspan for c in col_cells] + span_into_col = [c for c in table.cells if c.col_id != col and c.col_id + c.colspan > col > c.col_id] + + # This is a column that is entirely dollar signs + if all([ + all_dollars, + len(col_cells) > 1, + len(span_into_col) == 0, + all([c == 1 for c in colspans]), + col < max_col + ]): + next_col_cells = [c for c in table.cells if c.col_id == col + 1] + next_col_rows = [c.row_id for c in next_col_cells] + col_rows = [c.row_id for c in col_cells] + if len(next_col_cells) == len(col_cells) and next_col_rows == col_rows: + dollar_cols.append(col) + + + if len(dollar_cols) == 0: + continue + + dollar_cols = sorted(dollar_cols) + col_offset = 0 + for col in unique_cols: + col_cells = [c for c in table.cells if c.col_id == col] + if col_offset == 0 and col not in dollar_cols: + continue + + if col in dollar_cols: + col_offset += 1 + for cell in col_cells: + text_lines = cell.text_lines if cell.text_lines else [] + next_row_col = [c for c in table.cells if c.row_id == cell.row_id and c.col_id == col + 1] + + # Add dollar to start of the next column + next_text_lines = next_row_col[0].text_lines if next_row_col[0].text_lines else [] + next_row_col[0].text_lines = deepcopy(text_lines) + deepcopy(next_text_lines) + table.cells = [c for c in table.cells if c.cell_id != cell.cell_id] # Remove original cell + next_row_col[0].col_id -= col_offset + else: + for cell in col_cells: + cell.col_id -= col_offset + + def split_combined_rows(self, tables: List[TableResult]): for table in tables: if len(table.cells) == 0: @@ -318,7 +377,7 @@ def assign_ocr_lines(self, ocr_blocks: list): [None] * len(det_images), self.detection_model, recognition_batch_size=self.get_recognition_batch_size(), - detection_batch_size=self.get_detector_batch_size() + detection_batch_size=self.get_detection_batch_size() ) for block, ocr_res in zip(ocr_blocks, ocr_results): @@ -333,9 +392,9 @@ def assign_ocr_lines(self, ocr_blocks: list): block["table_text_lines"] = table_cells - def get_detector_batch_size(self): - if self.detector_batch_size is not None: - return self.detector_batch_size + def get_detection_batch_size(self): + if self.detection_batch_size is not None: + return self.detection_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": return 4 return 4 diff --git 
a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 9a48fa40..3d10af9f 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -99,7 +99,7 @@ def convert_table(self, el, text, convert_as_inline): for r in range(int(cell.get('rowspan', 1)) - 1): rowspan_cols[i + r] += colspan # Add the colspan to the next rows, so they get the correct number of columns colspans.append(row_cols) - total_cols = max(colspans) + total_cols = max(colspans) if colspans else 0 grid = [[None for _ in range(total_cols)] for _ in range(total_rows)] @@ -128,7 +128,7 @@ def convert_table(self, el, text, convert_as_inline): grid[row_idx + r][col_idx + c] = '' # Empty cell due to rowspan/colspan except IndexError: # Sometimes the colspan/rowspan predictions can overflow - print(f"Overflow in columns: {col_idx + c} >= {total_cols}") + print(f"Overflow in columns: {col_idx + c} >= {total_cols} or rows: {row_idx + r} >= {total_rows}") continue col_idx += colspan @@ -198,10 +198,9 @@ class MarkdownRenderer(HTMLRenderer): inline_math_delimiters: Annotated[Tuple[str], "The delimiters to use for inline math."] = ("$", "$") block_math_delimiters: Annotated[Tuple[str], "The delimiters to use for block math."] = ("$$", "$$") - def __call__(self, document: Document) -> MarkdownOutput: - document_output = document.render() - full_html, images = self.extract_html(document, document_output) - md_cls = Markdownify( + @property + def md_cls(self): + return Markdownify( self.paginate_output, self.page_separator, heading_style="ATX", @@ -215,7 +214,12 @@ def __call__(self, document: Document) -> MarkdownOutput: inline_math_delimiters=self.inline_math_delimiters, block_math_delimiters=self.block_math_delimiters ) - markdown = md_cls.convert(full_html) + + + def __call__(self, document: Document) -> MarkdownOutput: + document_output = document.render() + full_html, images = self.extract_html(document, document_output) + markdown = self.md_cls.convert(full_html) markdown = cleanup_text(markdown) return MarkdownOutput( markdown=markdown, diff --git a/marker/schema/polygon.py b/marker/schema/polygon.py index 2174bc6c..25e9ed31 100644 --- a/marker/schema/polygon.py +++ b/marker/schema/polygon.py @@ -126,6 +126,9 @@ def center_distance(self, other: PolygonBox, x_weight: float = 1, y_weight: floa else: return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight + def tl_distance(self, other: PolygonBox): + return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5 + def rescale(self, old_size, new_size): # Point is in x, y format page_width, page_height = old_size diff --git a/marker/schema/text/line.py b/marker/schema/text/line.py index 30525a38..6285ee88 100644 --- a/marker/schema/text/line.py +++ b/marker/schema/text/line.py @@ -42,10 +42,18 @@ def formatted_text(self, document): for block in self.contained_blocks(document, (BlockTypes.Span,)): block_text = html.escape(block.text) + if block.has_superscript: + block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text) + + if block.url: + block_text = f"<a href='{block.url}'>{block_text}</a>" + if block.italic: text += f"<i>{block_text}</i>" elif block.bold: text += f"<b>{block_text}</b>" + elif block.math: + text += f"<math>{block_text}</math>" else: text += block_text diff --git a/marker/scripts/streamlit_app.py b/marker/scripts/streamlit_app.py index 5b38e052..185ed7fa 100644 --- a/marker/scripts/streamlit_app.py +++ b/marker/scripts/streamlit_app.py @@ -115,7 +115,10 @@ def pillow_image_to_base64_string(img: Image) -> str: 
return base64.b64encode(buffered.getvalue()).decode("utf-8") -def block_display(image: Image, blocks: dict = {}, dpi=96): +def block_display(image: Image, blocks: dict | None = None, dpi=96): + if blocks is None: + blocks = {} + image_data_url = ( 'data:image/jpeg;base64,' + pillow_image_to_base64_string(image) ) diff --git a/marker/scripts/streamlit_app_blocks_viz.html b/marker/scripts/streamlit_app_blocks_viz.html index b31ee0a8..e00908d1 100644 --- a/marker/scripts/streamlit_app_blocks_viz.html +++ b/marker/scripts/streamlit_app_blocks_viz.html @@ -114,7 +114,7 @@
@@ -147,17 +147,17 @@

const BLOCK_TYPES = $block_types_json; const blocksById = {}; const blockInfoDialog = document.querySelector("dialog#block-info-dialog"); - + function blockTypeColor(blockType) { return COLORS[BLOCK_TYPES[blockType] % COLORS.length]; } - + function traverseAndGenerateSVG(block) { let svg = ""; - + if (block.polygon) { const color = blockTypeColor(block.block_type); - + // dollar signs are escaped because this files gets read into a template string svg += ` }" fill=$${color} stroke=$${color}> `; - + blocksById[block.id] = block; } - + if (Array.isArray(block.children) && block.children.length > 0) { block.children.forEach((child) => { svg += traverseAndGenerateSVG(child); }); } - + return svg; } - + if (Object.keys(BLOCKS).length == 0) { // bail out if no blocks return; } - + const [vbWidth, vbHeight] = BLOCKS.children[0].polygon[2]; document .querySelector("svg") .setAttribute("viewBox", `0 0 $${vbWidth} $${vbHeight}`); - + const blocksOverlay = document.querySelector(".blocks-overlay"); blocksOverlay.innerHTML = traverseAndGenerateSVG(BLOCKS.children[0]); - + tippy("rect.block", { content: (block) => block.getAttribute("data-type"), placement: "top-start", arrow: false, offset: [0, 5], }); - + blocksOverlay.addEventListener("click", (event) => { if (event.target.tagName !== "rect") return; - + const blockId = event.target.id; const block = blocksById[blockId]; - + blockInfoDialog.querySelector("h1").innerHTML = ` $${blockId} ($${block.block_type}) `; blockInfoDialog.querySelector(".text-content").textContent = block.html; - + blockInfoDialog.dataset.blockJSON = JSON.stringify(block, null, 2); - + if (block.images) { const imagesDiv = blockInfoDialog.querySelector(".images"); imagesDiv.innerHTML = ""; diff --git a/marker/settings.py b/marker/settings.py index 2d416b90..2b1eda90 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -46,14 +46,6 @@ def MODEL_DTYPE(self) -> torch.dtype: else: return torch.float32 - # Texify model - TEXIFY_MODEL_NAME: str = "vikp/texify" - - @computed_field - @property - def TEXIFY_DTYPE(self) -> torch.dtype: - return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16 - class Config: env_file = find_dotenv("local.env") extra = "ignore" diff --git a/marker/util.py b/marker/util.py index 3dbde5f8..3586c0bb 100644 --- a/marker/util.py +++ b/marker/util.py @@ -80,3 +80,22 @@ def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float] height = np.maximum(0, max_y - min_y) return width * height # Shape: (N, M) + + +def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray: + if len(boxes2) == 0: + return np.zeros((len(boxes1), 0)) + if len(boxes1) == 0: + return np.zeros((0, len(boxes2))) + + boxes1 = np.array(boxes1) # Shape: (N, 4) + boxes2 = np.array(boxes2) # Shape: (M, 4) + + boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2 # Shape: (M, 2) + boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2 # Shape: (M, 2) + + boxes1_centers = boxes1_centers[:, np.newaxis, :] # Shape: (N, 1, 2) + boxes2_centers = boxes2_centers[np.newaxis, :, :] # Shape: (1, M, 2) + + distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2) # Shape: (N, M) + return distances diff --git a/poetry.lock b/poetry.lock index 94865761..0234f789 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,98 +2,103 @@ [[package]] name = "aiohappyeyeballs" -version = "2.4.4" +version = "2.4.6" description = "Happy Eyeballs for asyncio" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file 
= "aiohappyeyeballs-2.4.4-py3-none-any.whl", hash = "sha256:a980909d50efcd44795c4afeca523296716d50cd756ddca6af8c65b996e27de8"}, - {file = "aiohappyeyeballs-2.4.4.tar.gz", hash = "sha256:5fdd7d87889c63183afc18ce9271f9b0a7d32c2303e394468dd45d514a757745"}, + {file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"}, + {file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"}, ] [[package]] name = "aiohttp" -version = "3.11.11" +version = "3.11.12" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.9" files = [ - {file = "aiohttp-3.11.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a60804bff28662cbcf340a4d61598891f12eea3a66af48ecfdc975ceec21e3c8"}, - {file = "aiohttp-3.11.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b4fa1cb5f270fb3eab079536b764ad740bb749ce69a94d4ec30ceee1b5940d5"}, - {file = "aiohttp-3.11.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:731468f555656767cda219ab42e033355fe48c85fbe3ba83a349631541715ba2"}, - {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb23d8bb86282b342481cad4370ea0853a39e4a32a0042bb52ca6bdde132df43"}, - {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f047569d655f81cb70ea5be942ee5d4421b6219c3f05d131f64088c73bb0917f"}, - {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd7659baae9ccf94ae5fe8bfaa2c7bc2e94d24611528395ce88d009107e00c6d"}, - {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af01e42ad87ae24932138f154105e88da13ce7d202a6de93fafdafb2883a00ef"}, - {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5854be2f3e5a729800bac57a8d76af464e160f19676ab6aea74bde18ad19d438"}, - {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6526e5fb4e14f4bbf30411216780c9967c20c5a55f2f51d3abd6de68320cc2f3"}, - {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:85992ee30a31835fc482468637b3e5bd085fa8fe9392ba0bdcbdc1ef5e9e3c55"}, - {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:88a12ad8ccf325a8a5ed80e6d7c3bdc247d66175afedbe104ee2aaca72960d8e"}, - {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0a6d3fbf2232e3a08c41eca81ae4f1dff3d8f1a30bae415ebe0af2d2458b8a33"}, - {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84a585799c58b795573c7fa9b84c455adf3e1d72f19a2bf498b54a95ae0d194c"}, - {file = "aiohttp-3.11.11-cp310-cp310-win32.whl", hash = "sha256:bfde76a8f430cf5c5584553adf9926534352251d379dcb266ad2b93c54a29745"}, - {file = "aiohttp-3.11.11-cp310-cp310-win_amd64.whl", hash = "sha256:0fd82b8e9c383af11d2b26f27a478640b6b83d669440c0a71481f7c865a51da9"}, - {file = "aiohttp-3.11.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ba74ec819177af1ef7f59063c6d35a214a8fde6f987f7661f4f0eecc468a8f76"}, - {file = "aiohttp-3.11.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4af57160800b7a815f3fe0eba9b46bf28aafc195555f1824555fa2cfab6c1538"}, - {file = "aiohttp-3.11.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffa336210cf9cd8ed117011085817d00abe4c08f99968deef0013ea283547204"}, - {file = 
"aiohttp-3.11.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b8fe282183e4a3c7a1b72f5ade1094ed1c6345a8f153506d114af5bf8accd9"}, - {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3af41686ccec6a0f2bdc66686dc0f403c41ac2089f80e2214a0f82d001052c03"}, - {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70d1f9dde0e5dd9e292a6d4d00058737052b01f3532f69c0c65818dac26dc287"}, - {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:249cc6912405917344192b9f9ea5cd5b139d49e0d2f5c7f70bdfaf6b4dbf3a2e"}, - {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0eb98d90b6690827dcc84c246811feeb4e1eea683c0eac6caed7549be9c84665"}, - {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec82bf1fda6cecce7f7b915f9196601a1bd1a3079796b76d16ae4cce6d0ef89b"}, - {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9fd46ce0845cfe28f108888b3ab17abff84ff695e01e73657eec3f96d72eef34"}, - {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bd176afcf8f5d2aed50c3647d4925d0db0579d96f75a31e77cbaf67d8a87742d"}, - {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ec2aa89305006fba9ffb98970db6c8221541be7bee4c1d027421d6f6df7d1ce2"}, - {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:92cde43018a2e17d48bb09c79e4d4cb0e236de5063ce897a5e40ac7cb4878773"}, - {file = "aiohttp-3.11.11-cp311-cp311-win32.whl", hash = "sha256:aba807f9569455cba566882c8938f1a549f205ee43c27b126e5450dc9f83cc62"}, - {file = "aiohttp-3.11.11-cp311-cp311-win_amd64.whl", hash = "sha256:ae545f31489548c87b0cced5755cfe5a5308d00407000e72c4fa30b19c3220ac"}, - {file = "aiohttp-3.11.11-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e595c591a48bbc295ebf47cb91aebf9bd32f3ff76749ecf282ea7f9f6bb73886"}, - {file = "aiohttp-3.11.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ea1b59dc06396b0b424740a10a0a63974c725b1c64736ff788a3689d36c02d2"}, - {file = "aiohttp-3.11.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8811f3f098a78ffa16e0ea36dffd577eb031aea797cbdba81be039a4169e242c"}, - {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7227b87a355ce1f4bf83bfae4399b1f5bb42e0259cb9405824bd03d2f4336a"}, - {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d40f9da8cabbf295d3a9dae1295c69975b86d941bc20f0a087f0477fa0a66231"}, - {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ffb3dc385f6bb1568aa974fe65da84723210e5d9707e360e9ecb51f59406cd2e"}, - {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8f5f7515f3552d899c61202d99dcb17d6e3b0de777900405611cd747cecd1b8"}, - {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3499c7ffbfd9c6a3d8d6a2b01c26639da7e43d47c7b4f788016226b1e711caa8"}, - {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8e2bf8029dbf0810c7bfbc3e594b51c4cc9101fbffb583a3923aea184724203c"}, - {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6212a60e5c482ef90f2d788835387070a88d52cf6241d3916733c9176d39eab"}, - {file = 
"aiohttp-3.11.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d119fafe7b634dbfa25a8c597718e69a930e4847f0b88e172744be24515140da"}, - {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:6fba278063559acc730abf49845d0e9a9e1ba74f85f0ee6efd5803f08b285853"}, - {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92fc484e34b733704ad77210c7957679c5c3877bd1e6b6d74b185e9320cc716e"}, - {file = "aiohttp-3.11.11-cp312-cp312-win32.whl", hash = "sha256:9f5b3c1ed63c8fa937a920b6c1bec78b74ee09593b3f5b979ab2ae5ef60d7600"}, - {file = "aiohttp-3.11.11-cp312-cp312-win_amd64.whl", hash = "sha256:1e69966ea6ef0c14ee53ef7a3d68b564cc408121ea56c0caa2dc918c1b2f553d"}, - {file = "aiohttp-3.11.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:541d823548ab69d13d23730a06f97460f4238ad2e5ed966aaf850d7c369782d9"}, - {file = "aiohttp-3.11.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:929f3ed33743a49ab127c58c3e0a827de0664bfcda566108989a14068f820194"}, - {file = "aiohttp-3.11.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0882c2820fd0132240edbb4a51eb8ceb6eef8181db9ad5291ab3332e0d71df5f"}, - {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b63de12e44935d5aca7ed7ed98a255a11e5cb47f83a9fded7a5e41c40277d104"}, - {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa54f8ef31d23c506910c21163f22b124facb573bff73930735cf9fe38bf7dff"}, - {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a344d5dc18074e3872777b62f5f7d584ae4344cd6006c17ba12103759d407af3"}, - {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7fb429ab1aafa1f48578eb315ca45bd46e9c37de11fe45c7f5f4138091e2f1"}, - {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c341c7d868750e31961d6d8e60ff040fb9d3d3a46d77fd85e1ab8e76c3e9a5c4"}, - {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed9ee95614a71e87f1a70bc81603f6c6760128b140bc4030abe6abaa988f1c3d"}, - {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:de8d38f1c2810fa2a4f1d995a2e9c70bb8737b18da04ac2afbf3971f65781d87"}, - {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a9b7371665d4f00deb8f32208c7c5e652059b0fda41cf6dbcac6114a041f1cc2"}, - {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:620598717fce1b3bd14dd09947ea53e1ad510317c85dda2c9c65b622edc96b12"}, - {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bf8d9bfee991d8acc72d060d53860f356e07a50f0e0d09a8dfedea1c554dd0d5"}, - {file = "aiohttp-3.11.11-cp313-cp313-win32.whl", hash = "sha256:9d73ee3725b7a737ad86c2eac5c57a4a97793d9f442599bea5ec67ac9f4bdc3d"}, - {file = "aiohttp-3.11.11-cp313-cp313-win_amd64.whl", hash = "sha256:c7a06301c2fb096bdb0bd25fe2011531c1453b9f2c163c8031600ec73af1cc99"}, - {file = "aiohttp-3.11.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3e23419d832d969f659c208557de4a123e30a10d26e1e14b73431d3c13444c2e"}, - {file = "aiohttp-3.11.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:21fef42317cf02e05d3b09c028712e1d73a9606f02467fd803f7c1f39cc59add"}, - {file = "aiohttp-3.11.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1f21bb8d0235fc10c09ce1d11ffbd40fc50d3f08a89e4cf3a0c503dc2562247a"}, - {file = 
"aiohttp-3.11.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1642eceeaa5ab6c9b6dfeaaa626ae314d808188ab23ae196a34c9d97efb68350"}, - {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2170816e34e10f2fd120f603e951630f8a112e1be3b60963a1f159f5699059a6"}, - {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8be8508d110d93061197fd2d6a74f7401f73b6d12f8822bbcd6d74f2b55d71b1"}, - {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4eed954b161e6b9b65f6be446ed448ed3921763cc432053ceb606f89d793927e"}, - {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6c9af134da4bc9b3bd3e6a70072509f295d10ee60c697826225b60b9959acdd"}, - {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:44167fc6a763d534a6908bdb2592269b4bf30a03239bcb1654781adf5e49caf1"}, - {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:479b8c6ebd12aedfe64563b85920525d05d394b85f166b7873c8bde6da612f9c"}, - {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:10b4ff0ad793d98605958089fabfa350e8e62bd5d40aa65cdc69d6785859f94e"}, - {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:b540bd67cfb54e6f0865ceccd9979687210d7ed1a1cc8c01f8e67e2f1e883d28"}, - {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1dac54e8ce2ed83b1f6b1a54005c87dfed139cf3f777fdc8afc76e7841101226"}, - {file = "aiohttp-3.11.11-cp39-cp39-win32.whl", hash = "sha256:568c1236b2fde93b7720f95a890741854c1200fba4a3471ff48b2934d2d93fd3"}, - {file = "aiohttp-3.11.11-cp39-cp39-win_amd64.whl", hash = "sha256:943a8b052e54dfd6439fd7989f67fc6a7f2138d0a2cf0a7de5f18aa4fe7eb3b1"}, - {file = "aiohttp-3.11.11.tar.gz", hash = "sha256:bb49c7f1e6ebf3821a42d81d494f538107610c3a705987f53068546b0e90303e"}, + {file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f"}, + {file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854"}, + {file = "aiohttp-3.11.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:584096938a001378484aa4ee54e05dc79c7b9dd933e271c744a97b3b6f644957"}, + {file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392432a2dde22b86f70dd4a0e9671a349446c93965f261dbaecfaf28813e5c42"}, + {file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:88d385b8e7f3a870146bf5ea31786ef7463e99eb59e31db56e2315535d811f55"}, + {file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b10a47e5390c4b30a0d58ee12581003be52eedd506862ab7f97da7a66805befb"}, + {file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5263dcede17b6b0c41ef0c3ccce847d82a7da98709e75cf7efde3e9e3b5cae"}, + {file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c5c7b8aa5443304c55c262c5693b108c35a3b61ef961f1e782dd52a2f559c7"}, + {file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d1c031a7572f62f66f1257db37ddab4cb98bfaf9b9434a3b4840bf3560f5e788"}, + {file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_armv7l.whl", hash = 
"sha256:7e44eba534381dd2687be50cbd5f2daded21575242ecfdaf86bbeecbc38dae8e"}, + {file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:145a73850926018ec1681e734cedcf2716d6a8697d90da11284043b745c286d5"}, + {file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2c311e2f63e42c1bf86361d11e2c4a59f25d9e7aabdbdf53dc38b885c5435cdb"}, + {file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ea756b5a7bac046d202a9a3889b9a92219f885481d78cd318db85b15cc0b7bcf"}, + {file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:526c900397f3bbc2db9cb360ce9c35134c908961cdd0ac25b1ae6ffcaa2507ff"}, + {file = "aiohttp-3.11.12-cp310-cp310-win32.whl", hash = "sha256:b8d3bb96c147b39c02d3db086899679f31958c5d81c494ef0fc9ef5bb1359b3d"}, + {file = "aiohttp-3.11.12-cp310-cp310-win_amd64.whl", hash = "sha256:7fe3d65279bfbee8de0fb4f8c17fc4e893eed2dba21b2f680e930cc2b09075c5"}, + {file = "aiohttp-3.11.12-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87a2e00bf17da098d90d4145375f1d985a81605267e7f9377ff94e55c5d769eb"}, + {file = "aiohttp-3.11.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b34508f1cd928ce915ed09682d11307ba4b37d0708d1f28e5774c07a7674cac9"}, + {file = "aiohttp-3.11.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:936d8a4f0f7081327014742cd51d320296b56aa6d324461a13724ab05f4b2933"}, + {file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1378f72def7dfb5dbd73d86c19eda0ea7b0a6873910cc37d57e80f10d64e1"}, + {file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9d45dbb3aaec05cf01525ee1a7ac72de46a8c425cb75c003acd29f76b1ffe94"}, + {file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:930ffa1925393381e1e0a9b82137fa7b34c92a019b521cf9f41263976666a0d6"}, + {file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8340def6737118f5429a5df4e88f440746b791f8f1c4ce4ad8a595f42c980bd5"}, + {file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4016e383f91f2814e48ed61e6bda7d24c4d7f2402c75dd28f7e1027ae44ea204"}, + {file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c0600bcc1adfaaac321422d615939ef300df81e165f6522ad096b73439c0f58"}, + {file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:0450ada317a65383b7cce9576096150fdb97396dcfe559109b403c7242faffef"}, + {file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:850ff6155371fd802a280f8d369d4e15d69434651b844bde566ce97ee2277420"}, + {file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8fd12d0f989c6099e7b0f30dc6e0d1e05499f3337461f0b2b0dadea6c64b89df"}, + {file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:76719dd521c20a58a6c256d058547b3a9595d1d885b830013366e27011ffe804"}, + {file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fe431f2ed646a3b56142fc81d238abcbaff08548d6912acb0b19a0cadc146b"}, + {file = "aiohttp-3.11.12-cp311-cp311-win32.whl", hash = "sha256:e10c440d142fa8b32cfdb194caf60ceeceb3e49807072e0dc3a8887ea80e8c16"}, + {file = "aiohttp-3.11.12-cp311-cp311-win_amd64.whl", hash = "sha256:246067ba0cf5560cf42e775069c5d80a8989d14a7ded21af529a4e10e3e0f0e6"}, + {file = "aiohttp-3.11.12-cp312-cp312-macosx_10_13_universal2.whl", hash = 
"sha256:e392804a38353900c3fd8b7cacbea5132888f7129f8e241915e90b85f00e3250"}, + {file = "aiohttp-3.11.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8fa1510b96c08aaad49303ab11f8803787c99222288f310a62f493faf883ede1"}, + {file = "aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dc065a4285307607df3f3686363e7f8bdd0d8ab35f12226362a847731516e42c"}, + {file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddb31f8474695cd61fc9455c644fc1606c164b93bff2490390d90464b4655df"}, + {file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dec0000d2d8621d8015c293e24589d46fa218637d820894cb7356c77eca3259"}, + {file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3552fe98e90fdf5918c04769f338a87fa4f00f3b28830ea9b78b1bdc6140e0d"}, + {file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfe7f984f28a8ae94ff3a7953cd9678550dbd2a1f9bda5dd9c5ae627744c78e"}, + {file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a481a574af914b6e84624412666cbfbe531a05667ca197804ecc19c97b8ab1b0"}, + {file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1987770fb4887560363b0e1a9b75aa303e447433c41284d3af2840a2f226d6e0"}, + {file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a4ac6a0f0f6402854adca4e3259a623f5c82ec3f0c049374133bcb243132baf9"}, + {file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c96a43822f1f9f69cc5c3706af33239489a6294be486a0447fb71380070d4d5f"}, + {file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a5e69046f83c0d3cb8f0d5bd9b8838271b1bc898e01562a04398e160953e8eb9"}, + {file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:68d54234c8d76d8ef74744f9f9fc6324f1508129e23da8883771cdbb5818cbef"}, + {file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9fd9dcf9c91affe71654ef77426f5cf8489305e1c66ed4816f5a21874b094b9"}, + {file = "aiohttp-3.11.12-cp312-cp312-win32.whl", hash = "sha256:0ed49efcd0dc1611378beadbd97beb5d9ca8fe48579fc04a6ed0844072261b6a"}, + {file = "aiohttp-3.11.12-cp312-cp312-win_amd64.whl", hash = "sha256:54775858c7f2f214476773ce785a19ee81d1294a6bedc5cc17225355aab74802"}, + {file = "aiohttp-3.11.12-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:413ad794dccb19453e2b97c2375f2ca3cdf34dc50d18cc2693bd5aed7d16f4b9"}, + {file = "aiohttp-3.11.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a93d28ed4b4b39e6f46fd240896c29b686b75e39cc6992692e3922ff6982b4c"}, + {file = "aiohttp-3.11.12-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d589264dbba3b16e8951b6f145d1e6b883094075283dafcab4cdd564a9e353a0"}, + {file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5148ca8955affdfeb864aca158ecae11030e952b25b3ae15d4e2b5ba299bad2"}, + {file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:525410e0790aab036492eeea913858989c4cb070ff373ec3bc322d700bdf47c1"}, + {file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bd8695be2c80b665ae3f05cb584093a1e59c35ecb7d794d1edd96e8cc9201d7"}, + {file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0203433121484b32646a5f5ea93ae86f3d9559d7243f07e8c0eab5ff8e3f70e"}, 
+ {file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40cd36749a1035c34ba8d8aaf221b91ca3d111532e5ccb5fa8c3703ab1b967ed"}, + {file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7442662afebbf7b4c6d28cb7aab9e9ce3a5df055fc4116cc7228192ad6cb484"}, + {file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8a2fb742ef378284a50766e985804bd6adb5adb5aa781100b09befdbfa757b65"}, + {file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2cee3b117a8d13ab98b38d5b6bdcd040cfb4181068d05ce0c474ec9db5f3c5bb"}, + {file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f6a19bcab7fbd8f8649d6595624856635159a6527861b9cdc3447af288a00c00"}, + {file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e4cecdb52aaa9994fbed6b81d4568427b6002f0a91c322697a4bfcc2b2363f5a"}, + {file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:30f546358dfa0953db92ba620101fefc81574f87b2346556b90b5f3ef16e55ce"}, + {file = "aiohttp-3.11.12-cp313-cp313-win32.whl", hash = "sha256:ce1bb21fc7d753b5f8a5d5a4bae99566386b15e716ebdb410154c16c91494d7f"}, + {file = "aiohttp-3.11.12-cp313-cp313-win_amd64.whl", hash = "sha256:f7914ab70d2ee8ab91c13e5402122edbc77821c66d2758abb53aabe87f013287"}, + {file = "aiohttp-3.11.12-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c3623053b85b4296cd3925eeb725e386644fd5bc67250b3bb08b0f144803e7b"}, + {file = "aiohttp-3.11.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67453e603cea8e85ed566b2700efa1f6916aefbc0c9fcb2e86aaffc08ec38e78"}, + {file = "aiohttp-3.11.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6130459189e61baac5a88c10019b21e1f0c6d00ebc770e9ce269475650ff7f73"}, + {file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9060addfa4ff753b09392efe41e6af06ea5dd257829199747b9f15bfad819460"}, + {file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34245498eeb9ae54c687a07ad7f160053911b5745e186afe2d0c0f2898a1ab8a"}, + {file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8dc0fba9a74b471c45ca1a3cb6e6913ebfae416678d90529d188886278e7f3f6"}, + {file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a478aa11b328983c4444dacb947d4513cb371cd323f3845e53caeda6be5589d5"}, + {file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c160a04283c8c6f55b5bf6d4cad59bb9c5b9c9cd08903841b25f1f7109ef1259"}, + {file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:edb69b9589324bdc40961cdf0657815df674f1743a8d5ad9ab56a99e4833cfdd"}, + {file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:4ee84c2a22a809c4f868153b178fe59e71423e1f3d6a8cd416134bb231fbf6d3"}, + {file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:bf4480a5438f80e0f1539e15a7eb8b5f97a26fe087e9828e2c0ec2be119a9f72"}, + {file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:e6b2732ef3bafc759f653a98881b5b9cdef0716d98f013d376ee8dfd7285abf1"}, + {file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f752e80606b132140883bb262a457c475d219d7163d996dc9072434ffb0784c4"}, + {file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:ab3247d58b393bda5b1c8f31c9edece7162fc13265334217785518dd770792b8"}, + {file = "aiohttp-3.11.12-cp39-cp39-win32.whl", hash = "sha256:0d5176f310a7fe6f65608213cc74f4228e4f4ce9fd10bcb2bb6da8fc66991462"}, + {file = "aiohttp-3.11.12-cp39-cp39-win_amd64.whl", hash = "sha256:74bd573dde27e58c760d9ca8615c41a57e719bff315c9adb6f2a4281a28e8798"}, + {file = "aiohttp-3.11.12.tar.gz", hash = "sha256:7603ca26d75b1b86160ce1bbe2787a0b706e592af5b2504e12caa88a217767b0"}, ] [package.dependencies] @@ -339,31 +344,32 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "babel" -version = "2.16.0" +version = "2.17.0" description = "Internationalization utilities" optional = false python-versions = ">=3.8" files = [ - {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, - {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, + {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, + {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, ] [package.extras] -dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] +dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"] [[package]] name = "beautifulsoup4" -version = "4.12.3" +version = "4.13.3" description = "Screen-scraping library" optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.7.0" files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] [package.dependencies] soupsieve = ">1.2" +typing-extensions = ">=4.0.0" [package.extras] cchardet = ["cchardet"] @@ -1060,79 +1066,6 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (>=7.1.2,<7.2)", "sphinx-autodoc-typehints", "sphinx_rtd_theme"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] -[[package]] -name = "google-ai-generativelanguage" -version = "0.6.15" -description = "Google Ai Generativelanguage API client library" -optional = false -python-versions = ">=3.7" -files = [ - {file = "google_ai_generativelanguage-0.6.15-py3-none-any.whl", hash = "sha256:5a03ef86377aa184ffef3662ca28f19eeee158733e45d7947982eb953c6ebb6c"}, - {file = "google_ai_generativelanguage-0.6.15.tar.gz", hash = "sha256:8f6d9dc4c12b065fe2d0289026171acea5183ebf2d0b11cefe12f3821e159ec3"}, -] - -[package.dependencies] -google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} -google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" -proto-plus = [ - {version = ">=1.22.3,<2.0.0dev", markers = "python_version < \"3.13\""}, - {version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""}, -] -protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || 
>4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" - -[[package]] -name = "google-api-core" -version = "2.24.1" -description = "Google API client core library" -optional = false -python-versions = ">=3.7" -files = [ - {file = "google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1"}, - {file = "google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a"}, -] - -[package.dependencies] -google-auth = ">=2.14.1,<3.0.dev0" -googleapis-common-protos = ">=1.56.2,<2.0.dev0" -grpcio = [ - {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, - {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, -] -grpcio-status = [ - {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, - {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, -] -proto-plus = [ - {version = ">=1.22.3,<2.0.0dev", markers = "python_version < \"3.13\""}, - {version = ">=1.25.0,<2.0.0dev", markers = "python_version >= \"3.13\""}, -] -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" -requests = ">=2.18.0,<3.0.0.dev0" - -[package.extras] -async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"] -grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] -grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] - -[[package]] -name = "google-api-python-client" -version = "2.160.0" -description = "Google API Client Library for Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "google_api_python_client-2.160.0-py2.py3-none-any.whl", hash = "sha256:63d61fb3e4cf3fb31a70a87f45567c22f6dfe87bbfa27252317e3e2c42900db4"}, - {file = "google_api_python_client-2.160.0.tar.gz", hash = "sha256:a8ccafaecfa42d15d5b5c3134ced8de08380019717fc9fb1ed510ca58eca3b7e"}, -] - -[package.dependencies] -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0.dev0" -google-auth = ">=1.32.0,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0.dev0" -google-auth-httplib2 = ">=0.2.0,<1.0.0" -httplib2 = ">=0.19.0,<1.dev0" -uritemplate = ">=3.0.1,<5" - [[package]] name = "google-auth" version = "2.38.0" @@ -1158,142 +1091,106 @@ reauth = ["pyu2f (>=0.1.5)"] requests = ["requests (>=2.20.0,<3.0.0.dev0)"] [[package]] -name = "google-auth-httplib2" -version = "0.2.0" -description = "Google Authentication Library: httplib2 transport" -optional = false -python-versions = "*" -files = [ - {file = "google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05"}, - {file = "google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d"}, -] - -[package.dependencies] -google-auth = "*" -httplib2 = ">=0.19.0" - -[[package]] -name = "google-generativeai" -version = "0.8.4" -description = "Google Generative AI High level API client library and tools." 
+name = "google-genai" +version = "1.1.0" +description = "GenAI Python SDK" optional = false python-versions = ">=3.9" files = [ - {file = "google_generativeai-0.8.4-py3-none-any.whl", hash = "sha256:e987b33ea6decde1e69191ddcaec6ef974458864d243de7191db50c21a7c5b82"}, + {file = "google_genai-1.1.0-py3-none-any.whl", hash = "sha256:c48ac44612ad6aadc0bf96b12fa4314756baa16382c890fff793bcb53e9a9cc8"}, ] [package.dependencies] -google-ai-generativelanguage = "0.6.15" -google-api-core = "*" -google-api-python-client = "*" -google-auth = ">=2.15.0" -protobuf = "*" -pydantic = "*" -tqdm = "*" -typing-extensions = "*" - -[package.extras] -dev = ["Pillow", "absl-py", "black", "ipython", "nose2", "pandas", "pytype", "pyyaml"] +google-auth = ">=2.14.1,<3.0.0dev" +pydantic = ">=2.0.0,<3.0.0dev" +requests = ">=2.28.1,<3.0.0dev" +websockets = ">=13.0,<15.0dev" [[package]] -name = "googleapis-common-protos" -version = "1.66.0" -description = "Common protobufs used in Google APIs" +name = "greenlet" +version = "3.1.1" +description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"}, - {file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"}, -] - -[package.dependencies] -protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" - -[package.extras] -grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] - -[[package]] -name = "grpcio" -version = "1.70.0" -description = "HTTP/2-based RPC framework" -optional = false -python-versions = ">=3.8" -files = [ - {file = "grpcio-1.70.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:95469d1977429f45fe7df441f586521361e235982a0b39e33841549143ae2851"}, - {file = "grpcio-1.70.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:ed9718f17fbdb472e33b869c77a16d0b55e166b100ec57b016dc7de9c8d236bf"}, - {file = "grpcio-1.70.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:374d014f29f9dfdb40510b041792e0e2828a1389281eb590df066e1cc2b404e5"}, - {file = "grpcio-1.70.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2af68a6f5c8f78d56c145161544ad0febbd7479524a59c16b3e25053f39c87f"}, - {file = "grpcio-1.70.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce7df14b2dcd1102a2ec32f621cc9fab6695effef516efbc6b063ad749867295"}, - {file = "grpcio-1.70.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c78b339869f4dbf89881e0b6fbf376313e4f845a42840a7bdf42ee6caed4b11f"}, - {file = "grpcio-1.70.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:58ad9ba575b39edef71f4798fdb5c7b6d02ad36d47949cd381d4392a5c9cbcd3"}, - {file = "grpcio-1.70.0-cp310-cp310-win32.whl", hash = "sha256:2b0d02e4b25a5c1f9b6c7745d4fa06efc9fd6a611af0fb38d3ba956786b95199"}, - {file = "grpcio-1.70.0-cp310-cp310-win_amd64.whl", hash = "sha256:0de706c0a5bb9d841e353f6343a9defc9fc35ec61d6eb6111802f3aa9fef29e1"}, - {file = "grpcio-1.70.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:17325b0be0c068f35770f944124e8839ea3185d6d54862800fc28cc2ffad205a"}, - {file = "grpcio-1.70.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:dbe41ad140df911e796d4463168e33ef80a24f5d21ef4d1e310553fcd2c4a386"}, - {file = "grpcio-1.70.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = 
"sha256:5ea67c72101d687d44d9c56068328da39c9ccba634cabb336075fae2eab0d04b"}, - {file = "grpcio-1.70.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb5277db254ab7586769e490b7b22f4ddab3876c490da0a1a9d7c695ccf0bf77"}, - {file = "grpcio-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7831a0fc1beeeb7759f737f5acd9fdcda520e955049512d68fda03d91186eea"}, - {file = "grpcio-1.70.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:27cc75e22c5dba1fbaf5a66c778e36ca9b8ce850bf58a9db887754593080d839"}, - {file = "grpcio-1.70.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d63764963412e22f0491d0d32833d71087288f4e24cbcddbae82476bfa1d81fd"}, - {file = "grpcio-1.70.0-cp311-cp311-win32.whl", hash = "sha256:bb491125103c800ec209d84c9b51f1c60ea456038e4734688004f377cfacc113"}, - {file = "grpcio-1.70.0-cp311-cp311-win_amd64.whl", hash = "sha256:d24035d49e026353eb042bf7b058fb831db3e06d52bee75c5f2f3ab453e71aca"}, - {file = "grpcio-1.70.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:ef4c14508299b1406c32bdbb9fb7b47612ab979b04cf2b27686ea31882387cff"}, - {file = "grpcio-1.70.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:aa47688a65643afd8b166928a1da6247d3f46a2784d301e48ca1cc394d2ffb40"}, - {file = "grpcio-1.70.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:880bfb43b1bb8905701b926274eafce5c70a105bc6b99e25f62e98ad59cb278e"}, - {file = "grpcio-1.70.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e654c4b17d07eab259d392e12b149c3a134ec52b11ecdc6a515b39aceeec898"}, - {file = "grpcio-1.70.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2394e3381071045a706ee2eeb6e08962dd87e8999b90ac15c55f56fa5a8c9597"}, - {file = "grpcio-1.70.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b3c76701428d2df01964bc6479422f20e62fcbc0a37d82ebd58050b86926ef8c"}, - {file = "grpcio-1.70.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ac073fe1c4cd856ebcf49e9ed6240f4f84d7a4e6ee95baa5d66ea05d3dd0df7f"}, - {file = "grpcio-1.70.0-cp312-cp312-win32.whl", hash = "sha256:cd24d2d9d380fbbee7a5ac86afe9787813f285e684b0271599f95a51bce33528"}, - {file = "grpcio-1.70.0-cp312-cp312-win_amd64.whl", hash = "sha256:0495c86a55a04a874c7627fd33e5beaee771917d92c0e6d9d797628ac40e7655"}, - {file = "grpcio-1.70.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:aa573896aeb7d7ce10b1fa425ba263e8dddd83d71530d1322fd3a16f31257b4a"}, - {file = "grpcio-1.70.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:d405b005018fd516c9ac529f4b4122342f60ec1cee181788249372524e6db429"}, - {file = "grpcio-1.70.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f32090238b720eb585248654db8e3afc87b48d26ac423c8dde8334a232ff53c9"}, - {file = "grpcio-1.70.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa089a734f24ee5f6880c83d043e4f46bf812fcea5181dcb3a572db1e79e01c"}, - {file = "grpcio-1.70.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f19375f0300b96c0117aca118d400e76fede6db6e91f3c34b7b035822e06c35f"}, - {file = "grpcio-1.70.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:7c73c42102e4a5ec76608d9b60227d917cea46dff4d11d372f64cbeb56d259d0"}, - {file = "grpcio-1.70.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0a5c78d5198a1f0aa60006cd6eb1c912b4a1520b6a3968e677dbcba215fabb40"}, - {file = "grpcio-1.70.0-cp313-cp313-win32.whl", hash = "sha256:fe9dbd916df3b60e865258a8c72ac98f3ac9e2a9542dcb72b7a34d236242a5ce"}, - {file = "grpcio-1.70.0-cp313-cp313-win_amd64.whl", 
hash = "sha256:4119fed8abb7ff6c32e3d2255301e59c316c22d31ab812b3fbcbaf3d0d87cc68"}, - {file = "grpcio-1.70.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:8058667a755f97407fca257c844018b80004ae8035565ebc2812cc550110718d"}, - {file = "grpcio-1.70.0-cp38-cp38-macosx_10_14_universal2.whl", hash = "sha256:879a61bf52ff8ccacbedf534665bb5478ec8e86ad483e76fe4f729aaef867cab"}, - {file = "grpcio-1.70.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:0ba0a173f4feacf90ee618fbc1a27956bfd21260cd31ced9bc707ef551ff7dc7"}, - {file = "grpcio-1.70.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:558c386ecb0148f4f99b1a65160f9d4b790ed3163e8610d11db47838d452512d"}, - {file = "grpcio-1.70.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:412faabcc787bbc826f51be261ae5fa996b21263de5368a55dc2cf824dc5090e"}, - {file = "grpcio-1.70.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3b0f01f6ed9994d7a0b27eeddea43ceac1b7e6f3f9d86aeec0f0064b8cf50fdb"}, - {file = "grpcio-1.70.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7385b1cb064734005204bc8994eed7dcb801ed6c2eda283f613ad8c6c75cf873"}, - {file = "grpcio-1.70.0-cp38-cp38-win32.whl", hash = "sha256:07269ff4940f6fb6710951116a04cd70284da86d0a4368fd5a3b552744511f5a"}, - {file = "grpcio-1.70.0-cp38-cp38-win_amd64.whl", hash = "sha256:aba19419aef9b254e15011b230a180e26e0f6864c90406fdbc255f01d83bc83c"}, - {file = "grpcio-1.70.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:4f1937f47c77392ccd555728f564a49128b6a197a05a5cd527b796d36f3387d0"}, - {file = "grpcio-1.70.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:0cd430b9215a15c10b0e7d78f51e8a39d6cf2ea819fd635a7214fae600b1da27"}, - {file = "grpcio-1.70.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:e27585831aa6b57b9250abaf147003e126cd3a6c6ca0c531a01996f31709bed1"}, - {file = "grpcio-1.70.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1af8e15b0f0fe0eac75195992a63df17579553b0c4af9f8362cc7cc99ccddf4"}, - {file = "grpcio-1.70.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbce24409beaee911c574a3d75d12ffb8c3e3dd1b813321b1d7a96bbcac46bf4"}, - {file = "grpcio-1.70.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ff4a8112a79464919bb21c18e956c54add43ec9a4850e3949da54f61c241a4a6"}, - {file = "grpcio-1.70.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5413549fdf0b14046c545e19cfc4eb1e37e9e1ebba0ca390a8d4e9963cab44d2"}, - {file = "grpcio-1.70.0-cp39-cp39-win32.whl", hash = "sha256:b745d2c41b27650095e81dea7091668c040457483c9bdb5d0d9de8f8eb25e59f"}, - {file = "grpcio-1.70.0-cp39-cp39-win_amd64.whl", hash = "sha256:a31d7e3b529c94e930a117b2175b2efd179d96eb3c7a21ccb0289a8ab05b645c"}, - {file = "grpcio-1.70.0.tar.gz", hash = "sha256:8d1584a68d5922330025881e63a6c1b54cc8117291d382e4fa69339b6d914c56"}, + {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120"}, + {file = 
"greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617"}, + {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7"}, + {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6"}, + {file = "greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80"}, + {file = "greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a"}, + {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511"}, + {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395"}, + {file = "greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39"}, + {file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9"}, + {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0"}, + {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942"}, + {file = 
"greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01"}, + {file = "greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e"}, + {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1"}, + {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c"}, + {file = "greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"}, + {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"}, + {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47da355d8687fd65240c364c90a31569a133b7b60de111c255ef5b606f2ae291"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98884ecf2ffb7d7fe6bd517e8eb99d31ff7855a840fa6d0d63cd07c037f6a981"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1d4aeb8891338e60d1ab6127af1fe45def5259def8094b9c7e34690c8858803"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db32b5348615a04b82240cc67983cb315309e88d444a288934ee6ceaebcad6cc"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:dcc62f31eae24de7f8dce72134c8651c58000d3b1868e01392baea7c32c247de"}, + {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1d3755bcb2e02de341c55b4fca7a745a24a9e7212ac953f6b3a48d117d7257aa"}, + {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b8da394b34370874b4572676f36acabac172602abf054cbc4ac910219f3340af"}, + {file = "greenlet-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:a0dfc6c143b519113354e780a50381508139b07d2177cb6ad6a08278ec655798"}, + {file = "greenlet-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54558ea205654b50c438029505def3834e80f0869a70fb15b871c29b4575ddef"}, + {file = "greenlet-3.1.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:346bed03fe47414091be4ad44786d1bd8bef0c3fcad6ed3dee074a032ab408a9"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc59d69fc48664bc693842bd57acfdd490acafda1ab52c7836e3fc75c90a111"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21e10da6ec19b457b82636209cbe2331ff4306b54d06fa04b7c138ba18c8a81"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37b9de5a96111fc15418819ab4c4432e4f3c2ede61e660b1e33971eba26ef9ba"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef9ea3f137e5711f0dbe5f9263e8c009b7069d8a1acea822bd5e9dae0ae49c8"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85f3ff71e2e60bd4b4932a043fbbe0f499e263c628390b285cb599154a3b03b1"}, + {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95ffcf719966dd7c453f908e208e14cde192e09fde6c7186c8f1896ef778d8cd"}, + {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:03a088b9de532cbfe2ba2034b2b85e82df37874681e8c470d6fb2f8c04d7e4b7"}, + {file = "greenlet-3.1.1-cp38-cp38-win32.whl", hash = "sha256:8b8b36671f10ba80e159378df9c4f15c14098c4fd73a36b9ad715f057272fbef"}, + {file = "greenlet-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7017b2be767b9d43cc31416aba48aab0d2309ee31b4dbf10a1d38fb7972bdf9d"}, + {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"}, + {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"}, + {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"}, + {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = 
"sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"}, + {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"}, + {file = "greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.70.0)"] - -[[package]] -name = "grpcio-status" -version = "1.70.0" -description = "Status proto mapping for gRPC" -optional = false -python-versions = ">=3.8" -files = [ - {file = "grpcio_status-1.70.0-py3-none-any.whl", hash = "sha256:fc5a2ae2b9b1c1969cc49f3262676e6854aa2398ec69cb5bd6c47cd501904a85"}, - {file = "grpcio_status-1.70.0.tar.gz", hash = "sha256:0e7b42816512433b18b9d764285ff029bde059e9d41f8fe10a60631bd8348101"}, -] - -[package.dependencies] -googleapis-common-protos = ">=1.5.5" -grpcio = ">=1.70.0" -protobuf = ">=5.26.1,<6.0dev" +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil"] [[package]] name = "h11" @@ -1327,20 +1224,6 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] trio = ["trio (>=0.22.0,<1.0)"] -[[package]] -name = "httplib2" -version = "0.22.0" -description = "A comprehensive HTTP client library." -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"}, - {file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"}, -] - -[package.dependencies] -pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} - [[package]] name = "httpx" version = "0.28.1" @@ -1459,13 +1342,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio [[package]] name = "ipython" -version = "8.31.0" +version = "8.32.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6"}, - {file = "ipython-8.31.0.tar.gz", hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b"}, + {file = "ipython-8.32.0-py3-none-any.whl", hash = "sha256:cae85b0c61eff1fc48b0a8002de5958b6528fa9c8defb1894da63f42613708aa"}, + {file = "ipython-8.32.0.tar.gz", hash = "sha256:be2c91895b0b9ea7ba49d33b23e2040c352b33eb6a519cca7ce6e0c743444251"}, ] [package.dependencies] @@ -1732,17 +1615,18 @@ test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout" [[package]] name = "jupyter-events" -version = "0.11.0" +version = "0.12.0" description = "Jupyter Event System library" optional = false python-versions = ">=3.9" files = [ - {file = "jupyter_events-0.11.0-py3-none-any.whl", hash = "sha256:36399b41ce1ca45fe8b8271067d6a140ffa54cec4028e95491c93b78a855cacf"}, - {file = "jupyter_events-0.11.0.tar.gz", hash = "sha256:c0bc56a37aac29c1fbc3bcfbddb8c8c49533f9cf11f1c4e6adadba936574ab90"}, + {file = "jupyter_events-0.12.0-py3-none-any.whl", hash = "sha256:6464b2fa5ad10451c3d35fabc75eab39556ae1e2853ad0c0cc31b656731a97fb"}, + {file = "jupyter_events-0.12.0.tar.gz", hash = "sha256:fc3fce98865f6784c9cd0a56a20644fc6098f21c8c33834a8d9fe383c17e554b"}, ] [package.dependencies] jsonschema = {version = ">=4.18.0", extras = ["format-nongpl"]} +packaging = "*" python-json-logger = ">=2.0.4" pyyaml = ">=5.3" referencing = 
"*" @@ -1905,6 +1789,17 @@ files = [ {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"}, ] +[[package]] +name = "latex2mathml" +version = "3.77.0" +description = "Pure Python library for LaTeX to MathML conversion" +optional = false +python-versions = ">=3.8.1,<4.0.0" +files = [ + {file = "latex2mathml-3.77.0-py3-none-any.whl", hash = "sha256:5531e18a2a9eae7c24e257118b6a444cbba253cd27ff3e81f1bd6c41e88e786e"}, + {file = "latex2mathml-3.77.0.tar.gz", hash = "sha256:e2f501d1878f2e489c3f6f12786bef74c62f712d2770f7f3c837eb20a55d0a1e"}, +] + [[package]] name = "lxml" version = "5.3.0" @@ -2371,13 +2266,13 @@ dill = ">=0.3.8" [[package]] name = "narwhals" -version = "1.24.1" +version = "1.26.0" description = "Extremely lightweight compatibility layer between dataframe libraries" optional = false python-versions = ">=3.8" files = [ - {file = "narwhals-1.24.1-py3-none-any.whl", hash = "sha256:d8983fe14851c95d60576ddca37c094bd4ed24ab9ea98396844fb20ad9aaf184"}, - {file = "narwhals-1.24.1.tar.gz", hash = "sha256:b09b8253d945f23cdb683a84685abf3afb9f96114d89e9f35dc876e143f65007"}, + {file = "narwhals-1.26.0-py3-none-any.whl", hash = "sha256:4af8bbdea9e45638bb9a981568a8dfa880e40eb7dcf740d19fd32aea79223c6f"}, + {file = "narwhals-1.26.0.tar.gz", hash = "sha256:b9d7605bf1d97a9d87783a69748c39150964e2a1ab0e5a6fef3e59e56772639e"}, ] [package.extras] @@ -3077,6 +2972,26 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.11.2)"] +[[package]] +name = "playwright" +version = "1.50.0" +description = "A high-level API to automate web browsers" +optional = false +python-versions = ">=3.9" +files = [ + {file = "playwright-1.50.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:f36d754a6c5bd9bf7f14e8f57a2aea6fd08f39ca4c8476481b9c83e299531148"}, + {file = "playwright-1.50.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:40f274384591dfd27f2b014596250b2250c843ed1f7f4ef5d2960ecb91b4961e"}, + {file = "playwright-1.50.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:9922ef9bcd316995f01e220acffd2d37a463b4ad10fd73e388add03841dfa230"}, + {file = "playwright-1.50.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:8fc628c492d12b13d1f347137b2ac6c04f98197ff0985ef0403a9a9ee0d39131"}, + {file = "playwright-1.50.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcff35f72db2689a79007aee78f1b0621a22e6e3d6c1f58aaa9ac805bf4497c"}, + {file = "playwright-1.50.0-py3-none-win32.whl", hash = "sha256:3b906f4d351260016a8c5cc1e003bb341651ae682f62213b50168ed581c7558a"}, + {file = "playwright-1.50.0-py3-none-win_amd64.whl", hash = "sha256:1859423da82de631704d5e3d88602d755462b0906824c1debe140979397d2e8d"}, +] + +[package.dependencies] +greenlet = ">=3.1.1,<4.0.0" +pyee = ">=12,<13" + [[package]] name = "pluggy" version = "1.5.0" @@ -3211,23 +3126,6 @@ files = [ {file = "propcache-0.2.1.tar.gz", hash = "sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64"}, ] -[[package]] -name = "proto-plus" -version = "1.26.0" -description = "Beautiful, Pythonic protocol buffers" -optional = false -python-versions = ">=3.7" -files = [ - {file = "proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7"}, - {file = "proto_plus-1.26.0.tar.gz", hash = 
"sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22"}, -] - -[package.dependencies] -protobuf = ">=3.19.0,<6.0.0dev" - -[package.extras] -testing = ["google-api-core (>=1.31.5)"] - [[package]] name = "protobuf" version = "5.29.3" @@ -3565,32 +3463,35 @@ carto = ["pydeck-carto"] jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"] [[package]] -name = "pygments" -version = "2.19.1" -description = "Pygments is a syntax highlighting package written in Python." +name = "pyee" +version = "12.1.1" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" optional = false python-versions = ">=3.8" files = [ - {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, - {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, + {file = "pyee-12.1.1-py3-none-any.whl", hash = "sha256:18a19c650556bb6b32b406d7f017c8f513aceed1ef7ca618fb65de7bd2d347ef"}, + {file = "pyee-12.1.1.tar.gz", hash = "sha256:bbc33c09e2ff827f74191e3e5bbc6be7da02f627b7ec30d86f5ce1a6fb2424a3"}, ] +[package.dependencies] +typing-extensions = "*" + [package.extras] -windows-terminal = ["colorama (>=0.4.6)"] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"] [[package]] -name = "pyparsing" -version = "3.2.1" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" +name = "pygments" +version = "2.19.1" +description = "Pygments is a syntax highlighting package written in Python." optional = false -python-versions = ">=3.9" +python-versions = ">=3.8" files = [ - {file = "pyparsing-3.2.1-py3-none-any.whl", hash = "sha256:506ff4f4386c4cec0590ec19e6302d3aedb992fdc02c761e90416f158dacf8e1"}, - {file = "pyparsing-3.2.1.tar.gz", hash = "sha256:61980854fd66de3a90028d679a954d5f2623e83144b5afe5ee86f43d762e5f0a"}, + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, ] [package.extras] -diagrams = ["jinja2", "railroad-diagrams"] +windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pypdfium2" @@ -3746,17 +3647,18 @@ files = [ [[package]] name = "pywinpty" -version = "2.0.14" +version = "2.0.15" description = "Pseudo terminal support for Windows from Python." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "pywinpty-2.0.14-cp310-none-win_amd64.whl", hash = "sha256:0b149c2918c7974f575ba79f5a4aad58bd859a52fa9eb1296cc22aa412aa411f"}, - {file = "pywinpty-2.0.14-cp311-none-win_amd64.whl", hash = "sha256:cf2a43ac7065b3e0dc8510f8c1f13a75fb8fde805efa3b8cff7599a1ef497bc7"}, - {file = "pywinpty-2.0.14-cp312-none-win_amd64.whl", hash = "sha256:55dad362ef3e9408ade68fd173e4f9032b3ce08f68cfe7eacb2c263ea1179737"}, - {file = "pywinpty-2.0.14-cp313-none-win_amd64.whl", hash = "sha256:074fb988a56ec79ca90ed03a896d40707131897cefb8f76f926e3834227f2819"}, - {file = "pywinpty-2.0.14-cp39-none-win_amd64.whl", hash = "sha256:5725fd56f73c0531ec218663bd8c8ff5acc43c78962fab28564871b5fce053fd"}, - {file = "pywinpty-2.0.14.tar.gz", hash = "sha256:18bd9529e4a5daf2d9719aa17788ba6013e594ae94c5a0c27e83df3278b0660e"}, + {file = "pywinpty-2.0.15-cp310-cp310-win_amd64.whl", hash = "sha256:8e7f5de756a615a38b96cd86fa3cd65f901ce54ce147a3179c45907fa11b4c4e"}, + {file = "pywinpty-2.0.15-cp311-cp311-win_amd64.whl", hash = "sha256:9a6bcec2df2707aaa9d08b86071970ee32c5026e10bcc3cc5f6f391d85baf7ca"}, + {file = "pywinpty-2.0.15-cp312-cp312-win_amd64.whl", hash = "sha256:83a8f20b430bbc5d8957249f875341a60219a4e971580f2ba694fbfb54a45ebc"}, + {file = "pywinpty-2.0.15-cp313-cp313-win_amd64.whl", hash = "sha256:ab5920877dd632c124b4ed17bc6dd6ef3b9f86cd492b963ffdb1a67b85b0f408"}, + {file = "pywinpty-2.0.15-cp313-cp313t-win_amd64.whl", hash = "sha256:a4560ad8c01e537708d2790dbe7da7d986791de805d89dd0d3697ca59e9e4901"}, + {file = "pywinpty-2.0.15-cp39-cp39-win_amd64.whl", hash = "sha256:d261cd88fcd358cfb48a7ca0700db3e1c088c9c10403c9ebc0d8a8b57aa6a117"}, + {file = "pywinpty-2.0.15.tar.gz", hash = "sha256:312cf39153a8736c617d45ce8b6ad6cd2107de121df91c455b10ce6bba7a39b2"}, ] [[package]] @@ -4618,13 +4520,13 @@ full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart [[package]] name = "streamlit" -version = "1.41.1" +version = "1.42.0" description = "A faster way to build and share data apps" optional = false python-versions = "!=3.9.7,>=3.9" files = [ - {file = "streamlit-1.41.1-py2.py3-none-any.whl", hash = "sha256:0def00822480071d642e6df36cd63c089f991da3a69fd9eb4ab8f65ce27de4e0"}, - {file = "streamlit-1.41.1.tar.gz", hash = "sha256:6626d32b098ba1458b71eebdd634c62af2dd876380e59c4b6a1e828a39d62d69"}, + {file = "streamlit-1.42.0-py2.py3-none-any.whl", hash = "sha256:edf333fd3525b7c64b19e1156b483a1a93cbdb09a3a06f26478388d68f971090"}, + {file = "streamlit-1.42.0.tar.gz", hash = "sha256:8c48494ccfad33e7d0bc5873151800b203cb71203bfd42bc7418940710ca4970"}, ] [package.dependencies] @@ -4645,21 +4547,21 @@ rich = ">=10.14.0,<14" tenacity = ">=8.1.0,<10" toml = ">=0.10.1,<2" tornado = ">=6.0.3,<7" -typing-extensions = ">=4.3.0,<5" +typing-extensions = ">=4.4.0,<5" watchdog = {version = ">=2.1.5,<7", markers = "platform_system != \"Darwin\""} [package.extras] -snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[modin] (>=1.17.0)"] +snowflake = ["snowflake-connector-python (>=3.3.0)", "snowflake-snowpark-python[modin] (>=1.17.0)"] [[package]] name = "surya-ocr" -version = "0.10.2" +version = "0.11.0" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "surya_ocr-0.10.2-py3-none-any.whl", hash = "sha256:fbb590ae92b2a785e75ca25a53dd2ff59b1f56ec017a22f6127c9c7c62a1b910"}, - {file = "surya_ocr-0.10.2.tar.gz", hash = 
"sha256:ddbaf5d2f2cc0a08992446f889f782aa81e9e1cfa3fd957c124273365d411057"}, + {file = "surya_ocr-0.11.0-py3-none-any.whl", hash = "sha256:2314a04d6aa2f362eefb14145b9d1b2c5b6568fb287ff8205cc0d580b9a304a3"}, + {file = "surya_ocr-0.11.0.tar.gz", hash = "sha256:c13475981929ad1a50e0151085815bbff183f9f328d2efba9b77c119e9ca754a"}, ] [package.dependencies] @@ -4741,27 +4643,6 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] -[[package]] -name = "texify" -version = "0.2.1" -description = "OCR for latex images" -optional = false -python-versions = "<4.0,>=3.10" -files = [ - {file = "texify-0.2.1-py3-none-any.whl", hash = "sha256:861c90ea6167fb6c2b334d5fcf0116dd9e1585af359463dec83115891c09dcfa"}, - {file = "texify-0.2.1.tar.gz", hash = "sha256:bab30f8445aa60e36de122fb86deb77b3f25348a885d4d5f3c67d6b6f5bb2e81"}, -] - -[package.dependencies] -ftfy = ">=6.1.3,<7.0.0" -Pillow = ">=10.1.0,<11.0.0" -pydantic = ">=2.5.2,<3.0.0" -pydantic-settings = ">=2.1.0,<3.0.0" -pypdfium2 = ">=4.25.0,<5.0.0" -python-dotenv = ">=1.0.0,<2.0.0" -torch = ">=2.1.2,<3.0.0" -transformers = ">=4.36.2,<5.0.0" - [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4989,13 +4870,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "transformers" -version = "4.48.2" +version = "4.48.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.9.0" files = [ - {file = "transformers-4.48.2-py3-none-any.whl", hash = "sha256:493bc5b0268b116eff305edf6656367fc89cf570e7a9d5891369e04751db698a"}, - {file = "transformers-4.48.2.tar.gz", hash = "sha256:dcfb73473e61f22fb3366fe2471ed2e42779ecdd49527a1bdf1937574855d516"}, + {file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"}, + {file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"}, ] [package.dependencies] @@ -5122,17 +5003,6 @@ files = [ [package.extras] dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake8-commas", "flake8-comprehensions", "flake8-continuation", "flake8-datetimez", "flake8-docstrings", "flake8-import-order", "flake8-literal", "flake8-modern-annotations", "flake8-noqa", "flake8-pyproject", "flake8-requirements", "flake8-typechecking-import", "flake8-use-fstring", "mypy", "pep8-naming", "types-PyYAML"] -[[package]] -name = "uritemplate" -version = "4.1.1" -description = "Implementation of RFC 6570 URI Templates" -optional = false -python-versions = ">=3.6" -files = [ - {file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"}, - {file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"}, -] - [[package]] name = "urllib3" version = "2.3.0" @@ -5260,6 +5130,84 @@ docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "14.2" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885"}, + {file = 
"websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397"}, + {file = "websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d"}, + {file = "websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d"}, + {file = "websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2"}, + {file = "websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166"}, + {file = "websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f"}, + {file = "websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d"}, + {file = "websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a"}, + {file = "websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b"}, + {file = "websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c"}, + {file = "websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash 
= "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967"}, + {file = "websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe"}, + {file = "websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205"}, + {file = "websockets-14.2-cp312-cp312-win_amd64.whl", hash = "sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce"}, + {file = "websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e"}, + {file = "websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad"}, + {file = "websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307"}, + {file = "websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc"}, + {file = "websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f"}, + {file = "websockets-14.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7cd5706caec1686c5d233bc76243ff64b1c0dc445339bd538f30547e787c11fe"}, + {file = "websockets-14.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:ec607328ce95a2f12b595f7ae4c5d71bf502212bddcea528290b35c286932b12"}, + {file = "websockets-14.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da85651270c6bfb630136423037dd4975199e5d4114cae6d3066641adcc9d1c7"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ecadc7ce90accf39903815697917643f5b7cfb73c96702318a096c00aa71f5"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1979bee04af6a78608024bad6dfcc0cc930ce819f9e10342a29a05b5320355d0"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dddacad58e2614a24938a50b85969d56f88e620e3f897b7d80ac0d8a5800258"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:89a71173caaf75fa71a09a5f614f450ba3ec84ad9fca47cb2422a860676716f0"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6af6a4b26eea4fc06c6818a6b962a952441e0e39548b44773502761ded8cc1d4"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:80c8efa38957f20bba0117b48737993643204645e9ec45512579132508477cfc"}, + {file = "websockets-14.2-cp39-cp39-win32.whl", hash = "sha256:2e20c5f517e2163d76e2729104abc42639c41cf91f7b1839295be43302713661"}, + {file = "websockets-14.2-cp39-cp39-win_amd64.whl", hash = "sha256:b4c8cef610e8d7c70dea92e62b6814a8cd24fbd01d7103cc89308d2bfe1659ef"}, + {file = "websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29"}, + {file = "websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a"}, + {file = "websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3"}, + {file = "websockets-14.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bbe03eb853e17fd5b15448328b4ec7fb2407d45fb0245036d06a3af251f8e48f"}, + {file = "websockets-14.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3c4aa3428b904d5404a0ed85f3644d37e2cb25996b7f096d77caeb0e96a3b42"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577a4cebf1ceaf0b65ffc42c54856214165fb8ceeba3935852fc33f6b0c55e7f"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad1c1d02357b7665e700eca43a31d52814ad9ad9b89b58118bdabc365454b574"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f390024a47d904613577df83ba700bd189eedc09c57af0a904e5c39624621270"}, + {file = "websockets-14.2-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:3c1426c021c38cf92b453cdf371228d3430acd775edee6bac5a4d577efc72365"}, + {file = "websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b"}, + {file = "websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5"}, +] + [[package]] name = "widgetsnbextension" version = "4.0.13" @@ -5502,4 +5450,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9730ffc5216b8e9eb6d4c59573f4d382a160480f1a3f7fcae290d52d4e6f8a28" +content-hash = "d98a730ed15cb2a34a91a60062f5d6faa7eec256b2c42e79d868e5f0c9874c94" diff --git a/pyproject.toml b/pyproject.toml index fcc8de3b..9d4cedf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.3.5" +version = "1.4.0" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" @@ -25,17 +25,16 @@ python-dotenv = "^1.0.0" torch = "^2.5.1" tqdm = "^4.66.1" ftfy = "^6.1.1" -texify = "^0.2.1" rapidfuzz = "^3.8.1" -surya-ocr = "~0.10.2" +surya-ocr = "~0.11.0" regex = "^2024.4.28" pdftext = "~0.5.1" markdownify = "^0.13.1" click = "^8.1.7" -google-generativeai = "^0.8.3" markdown2 = "^2.5.2" filetype = "^1.2.0" scikit-learn = "^1.6.1" +google-genai = "^1.0.0" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" @@ -50,6 +49,8 @@ apted = "1.0.3" distance = "0.1.3" lxml = "5.3.0" tabulate = "^0.9.0" +latex2mathml = "^3.77.0" +playwright = "^1.49.1" [tool.poetry.scripts] marker = "marker.scripts.convert:convert_cli" diff --git a/tests/processors/test_llm_processors.py b/tests/processors/test_llm_processors.py index f8d0bc38..61197a98 100644 --- a/tests/processors/test_llm_processors.py +++ b/tests/processors/test_llm_processors.py @@ -97,25 +97,6 @@ def test_llm_table_processor(pdf_document, detection_model, table_rec_model, rec assert "Value 1 $x$" in markdown -@pytest.mark.filename("adversarial.pdf") -@pytest.mark.config({"page_range": [0]}) -def test_llm_text_processor(pdf_document, mocker): - inline_math_block = pdf_document.contained_blocks((BlockTypes.TextInlineMath,))[0] - text_lines = inline_math_block.contained_blocks(pdf_document, (BlockTypes.Line,)) - corrected_lines = ["Text"] * len(text_lines) - - mock_cls = Mock() - mock_cls.return_value.generate_response.return_value = {"corrected_lines": corrected_lines} - mocker.patch("marker.processors.llm.GoogleModel", mock_cls) - - processor = LLMTextProcessor({"use_llm": True, "google_api_key": "test"}) - processor(pdf_document) - - contained_spans = text_lines[0].contained_blocks(pdf_document, (BlockTypes.Span,)) - assert contained_spans[0].text == "Text\n" # Newline inserted at end of line - assert contained_spans[0].formats == ["italic"] - - @pytest.mark.filename("A17_FlightPlan.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_caption_processor_disabled(pdf_document): @@ -168,4 +149,22 @@ def test_llm_complex_region_processor(pdf_document, mocker): renderer = MarkdownRenderer() rendered_md = renderer(pdf_document).markdown - assert md in rendered_md \ No newline at end of file + assert md in rendered_md + +@pytest.mark.filename("adversarial.pdf") +@pytest.mark.config({"page_range": [0]}) +def test_llm_text_processor(pdf_document, mocker): + inline_math_block = pdf_document.contained_blocks((BlockTypes.TextInlineMath,))[0] + text_lines = inline_math_block.contained_blocks(pdf_document, (BlockTypes.Line,)) + corrected_lines = ["Text"] * 
len(text_lines) + + mock_cls = Mock() + mock_cls.return_value.generate_response.return_value = {"corrected_lines": corrected_lines} + mocker.patch("marker.processors.llm.GoogleModel", mock_cls) + + processor = LLMTextProcessor({"use_llm": True, "google_api_key": "test"}) + processor(pdf_document) + + contained_spans = text_lines[0].contained_blocks(pdf_document, (BlockTypes.Span,)) + assert contained_spans[0].text == "Text\n" # Newline inserted at end of line + assert contained_spans[0].formats == ["italic"] \ No newline at end of file