diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index 5d49aa1c..00000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Integration test with benchmark
-
-on: [push]
-
-env:
- TORCH_DEVICE: "cpu"
-
-jobs:
- benchmark:
- runs-on: [ubuntu-latest, windows-latest]
- steps:
- - uses: actions/checkout@v3
- - name: Set up Python 3.11
- uses: actions/setup-python@v4
- with:
- python-version: 3.11
- - name: Install python dependencies
- run: |
- pip install poetry
- poetry install
- poetry remove torch
- poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
- - name: Download benchmark data
- run: |
- wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
- unzip -o benchmark_data.zip
- - name: Run benchmark test
- run: |
- poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
- poetry run python benchmarks/verify_scores.py report.json --type marker
-
-
-
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 00000000..5b76ff15
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,32 @@
+name: Integration test
+
+on: [push]
+
+env:
+ PYTHONIOENCODING: "utf-8"
+
+jobs:
+ benchmark:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python 3.11
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.11
+ - name: Install apt dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y pandoc
+ - name: Install python dependencies
+ run: |
+ pip install poetry
+ poetry install
+ - name: Run benchmark test
+ run: |
+ poetry run python benchmarks/overall/overall.py --max_rows 5
+ poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker
+ - name: Run table benchmark
+ run: |
+ poetry run python benchmarks/table/table.py --max_rows 5
+ poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index af4e92e8..84137df5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,10 +2,6 @@ name: CI tests
on: [push]
-env:
- TORCH_DEVICE: "cpu"
- OCR_ENGINE: "surya"
-
jobs:
tests:
runs-on: ubuntu-latest
diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
index 217e4221..06230580 100644
--- a/.github/workflows/scripts.yml
+++ b/.github/workflows/scripts.yml
@@ -2,10 +2,6 @@ name: Test CLI scripts
on: [push]
-env:
- TORCH_DEVICE: "cpu"
- OCR_ENGINE: "surya"
-
jobs:
tests:
runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index 0a3382a8..f1a52aa9 100644
--- a/README.md
+++ b/README.md
@@ -10,17 +10,25 @@ Marker converts PDFs and images to markdown, JSON, and HTML quickly and accurate
- Optionally boost accuracy with an LLM
- Works on GPU, CPU, or MPS
-## How it works
+## Performance
-Marker is a pipeline of deep learning models:
+
-- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
-- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
-- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya))
-- Optionally use an LLM to improve quality
-- Combine blocks and postprocess complete text
+Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools.
-It only uses models where necessary, which improves speed and accuracy.
+The above results are from running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 122 pages/second on an H100 (.18 seconds per page across 22 processes).
+
+See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
+
+## Hybrid Mode
+
+For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, format tables properly, and extract values from forms. It uses `gemini-2.0-flash`, which is cheap and fast.
+
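+When using marker as a library, a minimal sketch of enabling hybrid mode looks like this (assuming a Gemini API key is already configured via marker's settings):
+
+```python
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+
+# use_llm turns on the LLM pass alongside the normal pipeline
+converter = PdfConverter(
+    artifact_dict=create_model_dict(),
+    config={"use_llm": True},
+)
+rendered = converter("FILEPATH")
+```
+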
+Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:
+
+
+
+As you can see, the use_llm mode offers higher accuracy than marker or gemini alone.
## Examples
@@ -30,14 +38,6 @@ It only uses models where necessary, which improves speed and accuracy.
| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) |
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) |
-## Performance
-
-
-
-The above results are with marker setup so it takes ~7GB of VRAM on an A10.
-
-See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
-
# Commercial usage
I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.
@@ -56,17 +56,6 @@ There's a hosted API for marker available [here](https://www.datalab.to/):
[Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development.
-# Limitations
-
-PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
-
-- Marker will only convert block equations
-- Tables are not always formatted 100% correctly
-- Forms are not converted optimally
-- Very complex layouts, with nested tables and forms, may not work
-
-Note: Passing the `--use_llm` flag will mostly solve these issues.
-
# Installation
You'll need python 3.10+ and PyTorch. You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine. See [here](https://pytorch.org/get-started/locally/) for more details.
@@ -82,7 +71,7 @@ pip install marker-pdf
First, some configuration:
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
-- Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag on the CLI or via configuration to ensure your PDF runs through OCR, or the `strip_existing_ocr` to keep all digital text, and only strip out any existing OCR text.
+- Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag to ensure your PDF runs through OCR, or the `strip_existing_ocr` flag to keep all digital text and strip out any existing OCR text.
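+
+As a rough sketch, the same options can be passed programmatically through `ConfigParser` (assuming the flag names map directly to config keys):
+
+```python
+from marker.config.parser import ConfigParser
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+
+config_parser = ConfigParser({"force_ocr": True})  # or {"strip_existing_ocr": True}
+converter = PdfConverter(
+    config=config_parser.generate_config_dict(),
+    artifact_dict=create_model_dict(),
+    renderer=config_parser.get_renderer(),
+)
+rendered = converter("FILEPATH")
+```
+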
## Interactive App
@@ -219,11 +208,11 @@ rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```
-This takes all the same configuration as the PdfConverter. You can specify the configuration `--force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
+This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes.
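+
+For example, a minimal sketch of the config dict for this (key names are assumed to mirror the CLI options):
+
+```python
+# Passed as the `config` argument to the TableConverter shown above
+config = {
+    "force_layout_block": "Table",  # skip layout detection and treat every page as a table
+    "output_format": "json",        # JSON output also includes cell bounding boxes
+}
+```
+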
You can also run this via the CLI with
```shell
-python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter
+marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json
```
# Output Formats
@@ -377,36 +366,55 @@ There are some settings that you may find useful if things aren't working the wa
Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
# Benchmarks
+
## Overall PDF Conversion
-Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
-**Speed**
+We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from Common Crawl. We scored each method using a heuristic that aligns text with ground truth text segments, and an LLM-as-a-judge scoring method.
+
+| Method | Avg Time | Heuristic Score | LLM Score |
+|------------|----------|-----------------|-----------|
+| marker | 2.83837 | 95.6709 | 4.23916 |
+| llamaparse | 23.348 | 84.2442 | 3.97619 |
+| mathpix | 6.36223 | 86.4281 | 4.15626 |
+| docling | 3.69949 | 86.7073 | 3.70429 |
-| Method | Average Score | Time per page | Time per document |
-|---------|----------------|---------------|------------------|
-| marker | 0.625115 | 0.234184 | 21.545 |
+Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services. We can also break the results down by document type:
-**Accuracy**
+
-| Method | thinkpython.pdf | switch_trans.pdf | thinkdsp.pdf | crowd.pdf | thinkos.pdf | multicolcnn.pdf |
-|---------|----------------|-----------------|--------------|------------|-------------|----------------|
-| marker | 0.720347 | 0.592002 | 0.70468 | 0.515082 | 0.701394 | 0.517184 |
+| Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM |
+|----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------|
+| Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 |
+| Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 |
+| Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 |
+| Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 |
+| Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 |
+| Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 |
+| Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 |
+| Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 |
+| Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 |
+| Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 |
+| Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 |
-Peak GPU memory usage during the benchmark is `6GB` for marker. Benchmarks were run on an A10.
+## Throughput
-**Throughput**
+We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf).
-Marker takes about 6GB of VRAM on average per task, so you can convert 8 documents in parallel on an A6000.
+| Method | Time per page | Time per document | VRAM used |
+|---------|---------------|-------------------|---------- |
+| marker | 0.18 | 43.42 | 3.17GB |
-
+The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used.
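+
+As a back-of-the-envelope check: 22 processes × (1 page / 0.18 s per page) ≈ 122 pages per second, assuming VRAM is the only constraint.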
## Table Conversion
+
Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores:
-| Avg score | Total tables | use_llm |
-|-----------|--------------|---------|
-| 0.822 | 54 | False |
-| 0.887 | 54 | True |
+| Method | Avg score | Total tables |
+|------------------|-----------|--------------|
+| marker | 0.816 | 99 |
+| marker w/use_llm | 0.907 | 99 |
+| gemini | 0.829 | 99 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
@@ -426,16 +434,49 @@ poetry install
Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
```shell
-python benchmarks/overall.py data/pdfs data/references report.json
+python benchmarks/overall/overall.py --methods marker --scores heuristic,llm
```
+Options:
+
+- `--use_llm` uses an LLM to improve the marker results.
+- `--max_rows` sets how many rows to process for the benchmark.
+- `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated.
+- `--scores` sets which scoring functions to use, can be `llm`, `heuristic`. Comma separated.
+
### Table Conversion
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
```shell
-python benchmarks/table/table.py table_report.json --max_rows 1000
+python benchmarks/table/table.py --max_rows 100
```
+Options:
+
+- `--use_llm` uses an LLM with marker to improve accuracy.
+- `--use_gemini` also benchmarks gemini 2.0 flash.
+
+# How it works
+
+Marker is a pipeline of deep learning models:
+
+- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
+- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
+- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya))
+- Optionally use an LLM to improve quality
+- Combine blocks and postprocess complete text
+
+It only uses models where necessary, which improves speed and accuracy.
+
+# Limitations
+
+PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
+
+- Marker will only convert block equations
+- Very complex layouts, with nested tables and forms, may not work
+
+Note: Passing the `--use_llm` flag will mostly solve these issues.
+
# Thanks
This work would not have been possible without amazing open source models and datasets, including (but not limited to):
@@ -445,4 +486,4 @@ This work would not have been possible without amazing open source models and da
- Pypdfium2/pdfium
- DocLayNet from IBM
-Thank you to the authors of these models and datasets for making them available to the community!
+Thank you to the authors of these models and datasets for making them available to the community!
\ No newline at end of file
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/overall.py b/benchmarks/overall.py
deleted file mode 100644
index f6fb9591..00000000
--- a/benchmarks/overall.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import tempfile
-import time
-from collections import defaultdict
-
-import click
-from tqdm import tqdm
-import pypdfium2 as pdfium
-
-from marker.config.parser import ConfigParser
-from marker.converters.pdf import PdfConverter
-from marker.logger import configure_logging
-from marker.models import create_model_dict
-from pdftext.extraction import plain_text_output
-import json
-import os
-import subprocess
-import shutil
-from tabulate import tabulate
-
-from marker.settings import settings
-from scoring import score_text
-
-configure_logging()
-
-
-def nougat_prediction(pdf_filename, batch_size=1):
- out_dir = tempfile.mkdtemp()
- subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
- md_file = os.listdir(out_dir)[0]
- with open(os.path.join(out_dir, md_file), "r") as f:
- data = f.read()
- shutil.rmtree(out_dir)
- return data
-
-@click.command(help="Benchmark PDF to MD conversion.")
-@click.argument("in_folder", type=str)
-@click.argument("reference_folder", type=str)
-@click.argument("out_file", type=str)
-@click.option("--nougat", is_flag=True, help="Run nougat and compare")
-@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
-def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str):
- methods = ["marker"]
- if nougat:
- methods.append("nougat")
-
- model_dict = create_model_dict()
-
- scores = defaultdict(dict)
- benchmark_files = os.listdir(in_folder)
- benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
- times = defaultdict(dict)
- pages = defaultdict(int)
-
- for idx, fname in tqdm(enumerate(benchmark_files)):
- md_filename = fname.rsplit(".", 1)[0] + ".md"
-
- reference_filename = os.path.join(reference_folder, md_filename)
- with open(reference_filename, "r") as f:
- reference = f.read()
-
- pdf_filename = os.path.join(in_folder, fname)
- doc = pdfium.PdfDocument(pdf_filename)
- pages[fname] = len(doc)
-
- config_parser = ConfigParser({"output_format": "markdown"})
- for method in methods:
- start = time.time()
- if method == "marker":
- converter = PdfConverter(
- config=config_parser.generate_config_dict(),
- artifact_dict=model_dict,
- processor_list=None,
- renderer=config_parser.get_renderer()
- )
- full_text = converter(pdf_filename).markdown
- elif method == "nougat":
- full_text = nougat_prediction(pdf_filename, batch_size=1)
- elif method == "naive":
- full_text = plain_text_output(doc, workers=1)
- else:
- raise ValueError(f"Unknown method {method}")
-
- times[method][fname] = time.time() - start
-
- score = score_text(full_text, reference)
- scores[method][fname] = score
-
- if md_out_path:
- md_out_filename = f"{method}_{md_filename}"
- with open(os.path.join(md_out_path, md_out_filename), "w+") as f:
- f.write(full_text)
-
- total_pages = sum(pages.values())
- with open(out_file, "w+") as f:
- write_data = defaultdict(dict)
- for method in methods:
- total_time = sum(times[method].values())
- file_stats = {
- fname:
- {
- "time": times[method][fname],
- "score": scores[method][fname],
- "pages": pages[fname]
- }
-
- for fname in benchmark_files
- }
- write_data[method] = {
- "files": file_stats,
- "avg_score": sum(scores[method].values()) / len(scores[method]),
- "time_per_page": total_time / total_pages,
- "time_per_doc": total_time / len(scores[method])
- }
-
- json.dump(write_data, f, indent=4)
-
- summary_table = []
- score_table = []
- score_headers = benchmark_files
- for method in methods:
- summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
- score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])
-
- print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
- print("")
- print("Scores by file")
- print(tabulate(score_table, headers=["Method", *score_headers]))
-
-
-if __name__ == "__main__":
- main()
-
diff --git a/benchmarks/overall/__init__.py b/benchmarks/overall/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/overall/display/__init__.py b/benchmarks/overall/display/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/overall/display/dataset.py b/benchmarks/overall/display/dataset.py
new file mode 100644
index 00000000..e9fcabdd
--- /dev/null
+++ b/benchmarks/overall/display/dataset.py
@@ -0,0 +1,48 @@
+import json
+from typing import List
+
+import datasets
+from tqdm import tqdm
+
+from benchmarks.overall.registry import METHOD_REGISTRY
+from benchmarks.overall.schema import FullResult
+
+
+def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
+ rows = []
+ for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
+ if idx not in result["markdown"]:
+ continue
+
+ if max_rows is not None and idx >= max_rows:
+ break
+
+ row = {
+ "uuid": sample["uuid"],
+ "classification": sample["classification"],
+ "language": sample["language"],
+ "img": sample["img"],
+ }
+ for method in result["markdown"][idx]:
+ if method == "gt":
+ continue
+
+ method_cls = METHOD_REGISTRY[method]()
+ md = result["markdown"][idx][method]
+ method_img = method_cls.render(result["markdown"][idx][method])
+ row[f"{method}_md"] = md
+ row[f"{method}_img"] = method_img
+
+ for score_type in score_types:
+ try:
+ row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
+ except KeyError:
+ row[f"{method}_{score_type}"] = -1.0 # Missing score
+ try:
+ row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
+ except KeyError:
+ row[f"{method}_{score_type}_detail"] = "" # Missing detail
+ rows.append(row)
+ ds = datasets.Dataset.from_list(rows)
+ return ds
+
diff --git a/benchmarks/overall/display/table.py b/benchmarks/overall/display/table.py
new file mode 100644
index 00000000..5d704214
--- /dev/null
+++ b/benchmarks/overall/display/table.py
@@ -0,0 +1,68 @@
+from pathlib import Path
+from typing import Dict, List
+
+import tabulate
+
+from benchmarks.overall.schema import FullResult
+
+def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
+ table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
+ with open(out_path / filename, "w", encoding="utf-8") as f:
+ f.write(f"# {title}\n")
+ f.write(table)
+ print(title)
+ print(table)
+
+
+def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
+ document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
+ headers = ["Document Type"]
+ for method in methods:
+ for score_type in score_types:
+ headers.append(f"{method} {score_type}")
+
+ document_rows = [[k] for k in document_types]
+ for i, doc_type in enumerate(document_types):
+ for method in methods:
+ for score_type in score_types:
+ avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
+ document_rows[i].append(avg_score)
+
+ write_table("Document Types", document_rows, headers, out_path, "document_types.md")
+
+ headers = ["Block Type"]
+ block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
+ block_score_types = list(result["averages_by_block_type"][default_method].keys())
+ for method in methods:
+ for score_type in block_score_types:
+ headers.append(f"{method} {score_type}")
+
+ block_rows = [[k] for k in block_types]
+ for i, block_type in enumerate(block_types):
+ for method in methods:
+ for score_type in block_score_types:
+ avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
+ block_rows[i].append(avg_score)
+
+ write_table("Block types", block_rows, headers, out_path, "block_types.md")
+
+ headers = ["Method", "Avg Time"] + score_types
+ inference_rows = [[k] for k in methods]
+ all_raw_scores = [result["scores"][i] for i in result["scores"]]
+ for i, method in enumerate(methods):
+ avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
+ inference_rows[i].append(avg_time)
+ for score_type in score_types:
+ scores_lst = []
+ for ar in all_raw_scores:
+ try:
+ # Sometimes a few llm scores are missing
+ scores_lst.append(ar[method][score_type]["score"])
+ except KeyError:
+ continue
+ avg_score = sum(scores_lst) / max(1, len(scores_lst))
+ inference_rows[i].append(avg_score)
+
+ write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
+
+ print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
\ No newline at end of file
diff --git a/benchmarks/overall/download/__init__.py b/benchmarks/overall/download/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/overall/download/base.py b/benchmarks/overall/download/base.py
new file mode 100644
index 00000000..cc3f3557
--- /dev/null
+++ b/benchmarks/overall/download/base.py
@@ -0,0 +1,60 @@
+import json
+from json import JSONDecodeError
+from pathlib import Path
+
+import datasets
+from tqdm import tqdm
+
+
+class Downloader:
+ cache_path: Path = Path("cache")
+ service: str
+
+ def __init__(self, api_key, app_id, max_rows: int = 2200):
+ self.cache_path.mkdir(exist_ok=True)
+ self.max_rows = max_rows
+ self.api_key = api_key
+ self.app_id = app_id
+ self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
+
+ def get_html(self, pdf_bytes):
+ raise NotImplementedError
+
+ def upload_ds(self):
+ rows = []
+ for file in self.cache_path.glob("*.json"):
+ with open(file, "r") as f:
+ data = json.load(f)
+ rows.append(data)
+
+ out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({
+ "md": datasets.Value("string"),
+ "uuid": datasets.Value("string"),
+ "time": datasets.Value("float"),
+ }))
+ out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}")
+
+ def generate_data(self):
+ max_rows = self.max_rows  # respect the limit passed to the constructor
+ for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
+ cache_file = self.cache_path / f"{idx}.json"
+ if cache_file.exists():
+ continue
+
+ pdf_bytes = sample["pdf"] # This is a single page PDF
+ try:
+ out_data = self.get_html(pdf_bytes)
+ except JSONDecodeError as e:
+ print(f"Error with sample {idx}: {e}")
+ continue
+ out_data["uuid"] = sample["uuid"]
+
+ with cache_file.open("w") as f:
+ json.dump(out_data, f)
+
+ if idx >= max_rows:
+ break
+
+ def __call__(self):
+ self.generate_data()
+ self.upload_ds()
diff --git a/benchmarks/overall/download/llamaparse.py b/benchmarks/overall/download/llamaparse.py
new file mode 100644
index 00000000..a6b65867
--- /dev/null
+++ b/benchmarks/overall/download/llamaparse.py
@@ -0,0 +1,64 @@
+import io
+import os
+import time
+
+import requests
+
+from benchmarks.overall.download.base import Downloader
+
+
+class LlamaParseDownloader(Downloader):
+ service = "llamaparse"
+
+ def get_html(self, pdf_bytes):
+ rand_name = str(time.time()) + ".pdf"
+ start = time.time()
+ buff = io.BytesIO(pdf_bytes)
+ md = upload_and_parse_file(self.api_key, rand_name, buff)
+ end = time.time()
+ if isinstance(md, bytes):
+ md = md.decode("utf-8")
+
+ return {
+ "md": md,
+ "time": end - start,
+ }
+
+
+def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1):
+ headers = {
+ "Authorization": f"Bearer {api_key}",
+ "Accept": "application/json"
+ }
+
+ # Upload file
+ files = {
+ 'file': (fname, buff, 'application/pdf')
+ }
+ response = requests.post(
+ 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload',
+ headers=headers,
+ files=files
+ )
+ response.raise_for_status()
+ job_id = response.json()['id']
+
+ # Poll for completion
+ for _ in range(max_retries):
+ status_response = requests.get(
+ f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}',
+ headers=headers
+ )
+ status_response.raise_for_status()
+ if status_response.json()['status'] == 'SUCCESS':
+ # Get results
+ result_response = requests.get(
+ f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown',
+ headers=headers
+ )
+ result_response.raise_for_status()
+ return result_response.json()['markdown']
+
+ time.sleep(delay)
+
+ raise TimeoutError("Job did not complete within the maximum retry attempts")
\ No newline at end of file
diff --git a/benchmarks/overall/download/main.py b/benchmarks/overall/download/main.py
new file mode 100644
index 00000000..01a31c37
--- /dev/null
+++ b/benchmarks/overall/download/main.py
@@ -0,0 +1,23 @@
+import click
+
+from benchmarks.overall.download.llamaparse import LlamaParseDownloader
+from benchmarks.overall.download.mathpix import MathpixDownloader
+
+
+@click.command(help="Download data from inference services")
+@click.argument("service", type=click.Choice(["mathpix", "llamaparse"]))
+@click.argument("--max_rows", type=int, default=2200)
+@click.argument("--api_key", type=str, default=None)
+@click.argument("--app_id", type=str, default=None)
+def main(service: str, max_rows: int, api_key: str, app_id: str):
+ registry = {
+ "mathpix": MathpixDownloader,
+ "llamaparse": LlamaParseDownloader
+ }
+ downloader = registry[service](api_key, app_id, max_rows=max_rows)
+
+ # Generate data and upload to hub
+ downloader()
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/overall/download/mathpix.py b/benchmarks/overall/download/mathpix.py
new file mode 100644
index 00000000..204424ac
--- /dev/null
+++ b/benchmarks/overall/download/mathpix.py
@@ -0,0 +1,80 @@
+import json
+import time
+
+import requests
+
+from benchmarks.overall.download.base import Downloader
+
+
+class MathpixDownloader(Downloader):
+ service = "mathpix"
+
+ def get_html(self, pdf_bytes):
+ headers = {
+ "app_id": self.app_id,
+ "app_key": self.api_key,
+ }
+ start = time.time()
+ pdf_id = mathpix_request(pdf_bytes, headers)
+ status = mathpix_status(pdf_id, headers)
+ if status in ["processing", "error"]:
+ md = ""
+ else:
+ md = mathpix_results(pdf_id, headers)
+ end = time.time()
+ if isinstance(md, bytes):
+ md = md.decode("utf-8")
+
+ return {
+ "md": md,
+ "time": end - start
+ }
+
+def mathpix_request(buffer, headers):
+ response = requests.post("https://api.mathpix.com/v3/pdf",
+ headers=headers,
+ data={
+ "options_json": json.dumps(
+ {
+ "conversion_formats": {
+ "md": True,
+ "html": True
+ }
+ }
+ )
+ },
+ files={
+ "file": buffer
+ }
+ )
+ data = response.json()
+ pdf_id = data["pdf_id"]
+ return pdf_id
+
+def mathpix_status(pdf_id, headers):
+ max_iters = 120
+ status = "processing"
+ status2 = "processing"
+ for _ in range(max_iters):
+ time.sleep(1)
+ response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}",
+ headers=headers
+ )
+ status_resp = response.json()
+ if "conversion_status" not in status_resp:
+ continue
+ status = status_resp["conversion_status"]["md"]["status"]
+ status2 = status_resp["conversion_status"]["html"]["status"]
+ if status == "completed" and status2 == "completed":
+ break
+ elif status == "error" or status2 == "error":
+ break
+ out_status = "completed" if status == "completed" and status2 == "completed" else "error"
+ return out_status
+
+def mathpix_results(pdf_id, headers, ext="md"):
+ response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}",
+ headers=headers
+ )
+ return response.content
diff --git a/benchmarks/overall/elo.py b/benchmarks/overall/elo.py
new file mode 100644
index 00000000..9eea3b55
--- /dev/null
+++ b/benchmarks/overall/elo.py
@@ -0,0 +1,225 @@
+import json
+import random
+import time
+from dataclasses import dataclass
+from typing import List, Dict, Tuple, Literal
+from PIL import Image
+
+import click
+import datasets
+from google import genai
+from google.genai.errors import APIError
+from pydantic import BaseModel
+from tqdm import tqdm
+
+from marker.settings import settings
+
+rating_prompt = """
+You're a document analysis expert who is comparing two different markdown samples to an image to see which one represents the content of the image better. The markdown will be called version A and version B.
+
+Here are some notes on the image and markdown:
+- Some parts of the page may have been recognized as images and linked from the markdown, like ``.
+- Tables will be formatted as Github flavored markdown.
+- Block equations will be in LaTeX.
+- The image and markdown may be in any language.
+- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
+
+The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
+
+**Instructions**
+Follow this process to evaluate the markdown:
+1. Carefully examine the image.
+2. Carefully examine the first markdown input provided.
+3. Describe how well version a represents the image.
+4. Carefully examine the second markdown input provided.
+5. Describe how well version B represents the image.
+6. Compare version A and version B.
+7. Decide which markdown representation is better, based on the criteria below. Output version_a if version a is better, and version_b if version b is better.
+
+Use these criteria when judging the markdown:
+- Overall - the overall quality of the markdown as compared to the image.
+- Text quality - the quality of the text extraction from the image.
+- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
+- Tables - how effectively the tables have been extracted and formatted.
+- Forms - how effectively the forms have been extracted and formatted.
+- Equations - how effectively block equations have been converted to LaTeX.
+- Lists - if the lists have been properly extracted and formatted.
+- Images - if images are identified and placed correctly.
+
+Notes on scoring:
+- Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
+- Bad markdown will have major missing text segments from the markdown or completely unreadable formatting.
+
+Output json, like in the example below.
+
+**Example**
+Version A
+```markdown
+# *Section 1*
+This is some *markdown* extracted from a document. Here is a block equation:
+$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
+```
+Version B
+```markdown
+# Section 1
+This is some markdown extracted from a document. Here is a block equation:
+$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
+```
+Output
+```json
+{
+ "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
+ "version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
+ "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation. The formatting in version b is slightly different from the image.",
+ "comparison": "Version A is better than version B. The text and formatting in version A matches the image better than version B.",
+ "winner": "version_a",
+}
+```
+**Input**
+Version A
+```markdown
+{{version_a}}
+```
+Version B
+```markdown
+{{version_b}}
+```
+**Output**
+"""
+
+class ComparerSchema(BaseModel):
+ image_description: str
+ version_a_description: str
+ version_b_description: str
+ comparison: str
+ winner: Literal["version_a", "version_b"]
+
+
+class Comparer:
+ def __init__(self):
+ pass
+
+ def __call__(
+ self,
+ img: Image.Image,
+ version_a: str,
+ version_b: str
+ ) -> str | None:
+ hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b)
+ try:
+ rating = self.llm_rater(img, hydrated_prompt)
+ except Exception as e:
+ print(f"Error: {e}")
+ return
+ return rating
+
+
+ def llm_rater(self, img: Image.Image, prompt: str):
+ response = self.llm_response_wrapper(
+ [img, prompt],
+ ComparerSchema
+ )
+ assert "winner" in response, f"Response missing 'winner' key: {response}"
+ return response["winner"]
+
+ def llm_response_wrapper(
+ self,
+ prompt,
+ response_schema,
+ ):
+ client = genai.Client(
+ api_key=settings.GOOGLE_API_KEY,
+ http_options={"timeout": 60000}
+ )
+ try:
+ responses = client.models.generate_content(
+ model="gemini-2.0-flash",
+ contents=prompt,
+ config={
+ "temperature": 0,
+ "response_schema": response_schema,
+ "response_mime_type": "application/json",
+ },
+ )
+ output = responses.candidates[0].content.parts[0].text
+ return json.loads(output)
+ except APIError as e:
+ print(f"Hit Gemini rate limit")
+ return
+ except Exception as e:
+ print(f"Error: {e}")
+ return
+
+@dataclass
+class Method:
+ name: str
+ rating: float = 1500
+ k_factor: float = 32
+
+
+class EloSystem:
+ def __init__(self, player_names: List[str]):
+ self.methods = {name: Method(name) for name in player_names}
+
+ def expected_score(self, rating_a: float, rating_b: float) -> float:
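+ # Standard Elo expectation: E_A = 1 / (1 + 10^((R_B - R_A) / 400))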
+ return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
+
+ def update_ratings(self, winner: str, loser: str) -> Tuple[float, float]:
+ method_a = self.methods[winner]
+ method_b = self.methods[loser]
+
+ expected_a = self.expected_score(method_a.rating, method_b.rating)
+ expected_b = self.expected_score(method_b.rating, method_a.rating)
+
+ # Winner gets score of 1, loser gets 0
+ method_a.rating += method_a.k_factor * (1 - expected_a)
+ method_b.rating += method_b.k_factor * (0 - expected_b)
+
+ return method_a.rating, method_b.rating
+
+
+@click.command("Calculate ELO scores for document conversion methods")
+@click.argument("dataset", type=str)
+@click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix")
+@click.option("--row_samples", type=int, default=2, help="Number of samples per row")
+@click.option("--max_rows", type=int, default=100, help="Maximum number of rows to process")
+def main(
+ dataset: str,
+ methods: str,
+ row_samples: int,
+ max_rows: int
+):
+ ds = datasets.load_dataset(dataset, split="train")
+ method_lst = methods.split(",")
+ elo = EloSystem(method_lst)
+ comparer = Comparer()
+
+ for i in tqdm(range(min(len(ds), max_rows)), desc="Calculating ELO"):
+ row = ds[i]
+ # Avoid any bias in ordering
+ random.shuffle(method_lst)
+
+ for j, method_a in enumerate(method_lst[:-1]):
+ for z, method_b in enumerate(method_lst[j:]):
+ if method_a == method_b:
+ continue
+
+ method_a_md = row[f"{method_a}_md"]
+ method_b_md = row[f"{method_b}_md"]
+ winner = comparer(row["img"], method_a_md, method_b_md)
+ if not winner:
+ continue
+
+ if winner == "version_a":
+ elo.update_ratings(method_a, method_b)
+ else:
+ elo.update_ratings(method_b, method_a)
+ if i % 10 == 0:
+ print(elo.methods)
+
+ # Print out ratings
+ print(elo.methods)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/benchmarks/overall/methods/__init__.py b/benchmarks/overall/methods/__init__.py
new file mode 100644
index 00000000..a5a3f53f
--- /dev/null
+++ b/benchmarks/overall/methods/__init__.py
@@ -0,0 +1,100 @@
+import io
+import random
+import re
+from typing import Tuple
+
+import markdown2
+from PIL import Image
+from playwright.sync_api import sync_playwright
+
+from benchmarks.overall.methods.schema import BenchmarkResult
+from marker.renderers.markdown import MarkdownRenderer
+
+
+class BaseMethod:
+ def __init__(self, **kwargs):
+ for kwarg in kwargs:
+ if hasattr(self, kwarg):
+ setattr(self, kwarg, kwargs[kwarg])
+
+ @staticmethod
+ def convert_to_md(html: str):
+ md = MarkdownRenderer()
+ markdown = md.md_cls.convert(html)
+ return markdown
+
+ def __call__(self, sample) -> BenchmarkResult:
+ raise NotImplementedError()
+
+ def render(self, markdown: str):
+ return self.html_to_image(self.convert_to_html(markdown))
+
+ @staticmethod
+ def convert_to_html(md: str):
+ block_placeholders = []
+ inline_placeholders = []
+
+ # Add placeholders for the math
+ def block_sub(match):
+ content = match.group(1)
+ placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
+ block_placeholders.append((placeholder, f"$${content}$$"))
+ return placeholder
+
+ def inline_sub(match):
+ content = match.group(1)
+ placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
+ inline_placeholders.append((placeholder, f"${content}$"))
+ return placeholder
+
+ md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
+ md = re.sub(r'\$(.*?)\$', inline_sub, md)
+
+ html = markdown2.markdown(md, extras=['tables'])
+
+ # Replace placeholders
+ for placeholder, math_str in block_placeholders:
+ html = html.replace(placeholder, math_str)
+ for placeholder, math_str in inline_placeholders:
+ html = html.replace(placeholder, math_str)
+
+ return html
+
+ def html_to_image(self, html: str) -> Image.Image:
+ with sync_playwright() as p:
+ browser = p.chromium.launch()
+ page = browser.new_page()
+ html_str = f"""
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8">
+ <!-- KaTeX stylesheet and auto-render script so $...$ / $$...$$ math renders before the screenshot -->
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex/dist/katex.min.css">
+ <script src="https://cdn.jsdelivr.net/npm/katex/dist/katex.min.js"></script>
+ <script src="https://cdn.jsdelivr.net/npm/katex/dist/contrib/auto-render.min.js" onload="renderMathInElement(document.body);"></script>
+ </head>
+ <body>
+ {html}
+ </body>
+ </html>
+ """.strip()
+ page.set_viewport_size({"width": 1200, "height": 800})
+ page.set_content(html_str)
+ page.wait_for_load_state("domcontentloaded")
+ page.wait_for_timeout(500) # Wait for KaTeX to render
+ screenshot_bytes = page.screenshot(full_page=True)
+ browser.close()
+
+ return Image.open(io.BytesIO(screenshot_bytes))
\ No newline at end of file
diff --git a/benchmarks/overall/methods/docling.py b/benchmarks/overall/methods/docling.py
new file mode 100644
index 00000000..f36ee041
--- /dev/null
+++ b/benchmarks/overall/methods/docling.py
@@ -0,0 +1,26 @@
+import tempfile
+import time
+
+from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+
+
+class DoclingMethod(BaseMethod):
+ model_dict: dict = None
+ use_llm: bool = False
+
+ def __call__(self, sample) -> BenchmarkResult:
+ from docling.document_converter import DocumentConverter
+ pdf_bytes = sample["pdf"] # This is a single page PDF
+ converter = DocumentConverter()
+
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
+ f.write(pdf_bytes)
+ f.flush()  # make sure the full PDF is on disk before conversion
+ start = time.time()
+ result = converter.convert(f.name)
+ total = time.time() - start
+
+ return {
+ "markdown": result.document.export_to_markdown(),
+ "time": total
+ }
+
diff --git a/benchmarks/overall/methods/gt.py b/benchmarks/overall/methods/gt.py
new file mode 100644
index 00000000..6c2c6c32
--- /dev/null
+++ b/benchmarks/overall/methods/gt.py
@@ -0,0 +1,29 @@
+from typing import List
+import json
+
+from PIL import Image
+
+from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+
+
+class GTMethod(BaseMethod):
+ def __call__(self, sample) -> BenchmarkResult:
+ gt_blocks = json.loads(sample["gt_blocks"])
+ gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
+ gt_markdown = [self.convert_to_md(block) for block in gt_html]
+ return {
+ "markdown": gt_markdown,
+ "time": 0
+ }
+
+ def render(self, html: List[str]) -> Image.Image:
+ joined = "\n\n".join(html)
+ html = f"""
+ <html>
+ <body>
+ {joined}
+ </body>
+ </html>
+""".strip()
+ return self.html_to_image(html)
\ No newline at end of file
diff --git a/benchmarks/overall/methods/llamaparse.py b/benchmarks/overall/methods/llamaparse.py
new file mode 100644
index 00000000..e2b1e43a
--- /dev/null
+++ b/benchmarks/overall/methods/llamaparse.py
@@ -0,0 +1,22 @@
+import datasets
+
+from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+
+
+class LlamaParseMethod(BaseMethod):
+ llamaparse_ds: datasets.Dataset = None
+
+ def __call__(self, sample) -> BenchmarkResult:
+ uuid = sample["uuid"]
+ data = None
+ for row in self.llamaparse_ds:
+ if str(row["uuid"]) == str(uuid):
+ data = row
+ break
+ if not data:
+ raise ValueError(f"Could not find data for uuid {uuid}")
+
+ return {
+ "markdown": data["md"],
+ "time": data["time"]
+ }
\ No newline at end of file
diff --git a/benchmarks/overall/methods/marker.py b/benchmarks/overall/methods/marker.py
new file mode 100644
index 00000000..afaafcfc
--- /dev/null
+++ b/benchmarks/overall/methods/marker.py
@@ -0,0 +1,29 @@
+import tempfile
+import time
+
+from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+from marker.converters.pdf import PdfConverter
+
+
+class MarkerMethod(BaseMethod):
+ model_dict: dict = None
+ use_llm: bool = False
+
+ def __call__(self, sample) -> BenchmarkResult:
+ pdf_bytes = sample["pdf"] # This is a single page PDF
+ block_converter = PdfConverter(
+ artifact_dict=self.model_dict,
+ config={"page_range": [0], "disable_tqdm": True, "use_llm": self.use_llm}
+ )
+
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
+ f.write(pdf_bytes)
+ f.flush()  # make sure the full PDF is on disk before conversion
+ start = time.time()
+ rendered = block_converter(f.name)
+ total = time.time() - start
+
+ return {
+ "markdown": rendered.markdown,
+ "time": total
+ }
+
diff --git a/benchmarks/overall/methods/mathpix.py b/benchmarks/overall/methods/mathpix.py
new file mode 100644
index 00000000..d06340f7
--- /dev/null
+++ b/benchmarks/overall/methods/mathpix.py
@@ -0,0 +1,22 @@
+import datasets
+
+from benchmarks.overall.methods import BaseMethod, BenchmarkResult
+
+
+class MathpixMethod(BaseMethod):
+ mathpix_ds: datasets.Dataset = None
+
+ def __call__(self, sample) -> BenchmarkResult:
+ uuid = sample["uuid"]
+ data = None
+ for row in self.mathpix_ds:
+ if str(row["uuid"]) == str(uuid):
+ data = row
+ break
+ if not data:
+ raise ValueError(f"Could not find data for uuid {uuid}")
+
+ return {
+ "markdown": data["md"],
+ "time": data["time"]
+ }
\ No newline at end of file
diff --git a/benchmarks/overall/methods/schema.py b/benchmarks/overall/methods/schema.py
new file mode 100644
index 00000000..d475876e
--- /dev/null
+++ b/benchmarks/overall/methods/schema.py
@@ -0,0 +1,6 @@
+from typing import TypedDict, List
+
+
+class BenchmarkResult(TypedDict):
+ markdown: str | List[str]
+ time: float | None
\ No newline at end of file
diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py
new file mode 100644
index 00000000..481753e3
--- /dev/null
+++ b/benchmarks/overall/overall.py
@@ -0,0 +1,148 @@
+import json
+import os
+from collections import defaultdict
+from pathlib import Path
+from typing import List
+
+import click
+import datasets
+from tqdm import tqdm
+
+from benchmarks.overall.display.dataset import build_dataset
+from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY
+from benchmarks.overall.schema import FullResult
+from marker.logger import configure_logging
+from marker.models import create_model_dict
+from marker.settings import settings
+from benchmarks.overall.display.table import print_scores
+
+configure_logging()
+
+
+def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult:
+ bench_scores = {}
+ averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+ averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+ average_times = defaultdict(list)
+ markdown_by_method = defaultdict(dict)
+ for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark"):
+ if max_rows is not None and idx >= max_rows:
+ break
+
+ doc_type = sample["classification"]
+ gt_cls = METHOD_REGISTRY["gt"]
+ gt_blocks = json.loads(sample["gt_blocks"])
+ gt_md = gt_cls(**artifacts)(sample)["markdown"]
+ markdown_by_method[idx]["gt"] = gt_md
+
+ out_data = defaultdict(dict)
+
+ try:
+ for method in methods:
+ method_cls = METHOD_REGISTRY[method](**artifacts)
+ method_info = method_cls(sample)
+ method_md = method_info["markdown"]
+ average_times[method].append(method_info["time"])
+ markdown_by_method[idx][method] = method_md
+
+ for score_type in score_types:
+ score_cls = SCORE_REGISTRY[score_type]()
+ try:
+ scores = score_cls(sample, gt_md, method_md)
+ except Exception as e:
+ # Some scorers can fail, like the LLM one
+ print(f"Failed to score {method} with {score_type}: {e}")
+ continue
+
+ out_data[method][score_type] = scores
+
+ averages_by_type[method][score_type][doc_type].append(scores["score"])
+
+ if "by_block" in scores["specific_scores"]: # Not all scorers support this
+ for score, gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks):
+ averages_by_block_type[method][score_type][gt_block["block_type"]].append(score)
+ except Exception as e:
+ print(f"Failed to process {idx}: {e}")
+ if idx in markdown_by_method:
+ del markdown_by_method[idx]
+ continue
+
+ bench_scores[idx] = out_data
+
+ return {
+ "scores": bench_scores,
+ "markdown": markdown_by_method,
+ "averages_by_type": averages_by_type,
+ "averages_by_block_type": averages_by_block_type,
+ "average_times": average_times,
+ }
+
+@click.command(help="Benchmark PDF to MD conversion.")
+@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
+@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
+@click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix,llamaparse,docling", default="marker")
+@click.option("--scores", type=str, help="Comma separated list of scoring functions to use. Possible values: heuristic,llm", default="heuristic")
+@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
+@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
+@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
+def main(
+ dataset: str,
+ out_dataset: str,
+ methods: str,
+ scores: str,
+ result_path: str,
+ max_rows: int,
+ use_llm: bool
+):
+ out_path = Path(result_path)
+ out_path.mkdir(parents=True, exist_ok=True)
+
+ methods = methods.split(",")
+ for method in methods:
+ if method not in METHOD_REGISTRY:
+ raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}")
+
+ # Ensure marker is always first
+ all_methods = list(set(methods))
+ methods = ["marker"] if "marker" in all_methods else []
+ methods += [m for m in all_methods if m != "marker"]
+
+ score_types = scores.split(",")
+ for score_type in score_types:
+ if score_type not in SCORE_REGISTRY:
+ raise ValueError(f"Score type {score_type} not allowed. Allowed types are {SCORE_REGISTRY.keys()}")
+
+ benchmark_dataset = datasets.load_dataset(dataset, split="train")
+ artifacts = {
+ "model_dict": create_model_dict(),
+ "use_llm": use_llm,
+ "mathpix_ds": None,
+ "llamaparse_ds": None,
+ }
+
+ if "mathpix" in methods:
+ artifacts["mathpix_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
+
+ if "llamaparse" in methods:
+ artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train")
+
+ print(f"Running benchmark with methods: {methods} and scores: {score_types}")
+ result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows)
+
+ # Display benchmark scoring tables
+ print_scores(result, out_path, methods, score_types, default_method=methods[0], default_score_type=score_types[0])
+
+ # Write to json
+ with open(out_path / "result.json", "w") as f:
+ json.dump(result, f)
+
+ if out_dataset:
+ if use_llm:
+ out_dataset += "_llm"
+ dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows)
+ dataset.push_to_hub(out_dataset)
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/benchmarks/overall/registry.py b/benchmarks/overall/registry.py
new file mode 100644
index 00000000..02184ad3
--- /dev/null
+++ b/benchmarks/overall/registry.py
@@ -0,0 +1,20 @@
+from benchmarks.overall.methods.docling import DoclingMethod
+from benchmarks.overall.methods.gt import GTMethod
+from benchmarks.overall.methods.llamaparse import LlamaParseMethod
+from benchmarks.overall.methods.marker import MarkerMethod
+from benchmarks.overall.methods.mathpix import MathpixMethod
+from benchmarks.overall.scorers.heuristic import HeuristicScorer
+from benchmarks.overall.scorers.llm import LLMScorer
+
+SCORE_REGISTRY = {
+ "heuristic": HeuristicScorer,
+ "llm": LLMScorer
+}
+
+METHOD_REGISTRY = {
+ "marker": MarkerMethod,
+ "gt": GTMethod,
+ "mathpix": MathpixMethod,
+ "llamaparse": LlamaParseMethod,
+ "docling": DoclingMethod
+}
\ No newline at end of file
diff --git a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py
new file mode 100644
index 00000000..56d99e3a
--- /dev/null
+++ b/benchmarks/overall/schema.py
@@ -0,0 +1,12 @@
+from typing import TypedDict, List, Dict
+
+from benchmarks.overall.scorers.schema import BlockScores
+
+AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]
+
+class FullResult(TypedDict):
+ scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
+ averages_by_type: AVG_TYPE
+ averages_by_block_type: AVG_TYPE
+ average_times: Dict[str, List[float]]
+ markdown: Dict[int, Dict[str, str]]
diff --git a/benchmarks/overall/scorers/__init__.py b/benchmarks/overall/scorers/__init__.py
new file mode 100644
index 00000000..492bc4e4
--- /dev/null
+++ b/benchmarks/overall/scorers/__init__.py
@@ -0,0 +1,11 @@
+from typing import List
+
+from benchmarks.overall.scorers.schema import BlockScores
+
+
+class BaseScorer:
+ def __init__(self):
+ pass
+
+ def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
+ raise NotImplementedError()
\ No newline at end of file
diff --git a/benchmarks/overall/scorers/clean.py b/benchmarks/overall/scorers/clean.py
new file mode 100644
index 00000000..ed3a6cc2
--- /dev/null
+++ b/benchmarks/overall/scorers/clean.py
@@ -0,0 +1,113 @@
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+
+import latex2mathml.converter
+
+class MarkdownCleaner:
+ def __init__(self):
+ pass
+
+ def __call__(self, markdown):
+ markdown = self.normalize_markdown(markdown) # Use pandoc to normalize
+
+ # Replace math expressions with latexml
+ pattern = r"\$\$(.*?)\$\$|\$(.*?)\$"
+ markdown = re.sub(pattern, self.standardize_math, markdown, flags=re.DOTALL)
+
+ # Clean up HTML tags
+ markdown = markdown.replace("<br>", "\n")
+ markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
+ markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
+ markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
+
+ # Clean up markdown formatting
+ markdown = re.sub(r"\s+", " ", markdown)
+ markdown = re.sub(r"\n+", "\n", markdown)
+ markdown = re.sub(r"\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents
+ markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
+ markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly
+ return markdown.strip().lower()
+
+ @staticmethod
+ def normalize_markdown(md_text: str) -> str:
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ dirpath = Path(tmp_dir)
+ input_file = dirpath / 'input.md'
+ input_file.write_text(md_text, encoding='utf-8')
+
+ # Markdown to HTML
+ html_file = dirpath / 'temp.html'
+ subprocess.run(
+ [
+ 'pandoc',
+ str(input_file),
+ '-f', 'markdown+tex_math_dollars',
+ '-t', 'html',
+ '-o', str(html_file),
+ '--quiet'
+ ],
+ check=True
+ )
+
+ # HTML to Markdown
+ output_file = dirpath / 'output.md'
+ subprocess.run(
+ [
+ 'pandoc',
+ str(html_file),
+ '-f', 'html',
+ '-t', 'markdown+tex_math_dollars',
+ '-o', str(output_file),
+ '--quiet'
+ ],
+ check=True
+ )
+
+ # Read back the normalized Markdown
+ normalized_md = output_file.read_text(encoding='utf-8')
+
+ return normalized_md
+
+ def standardize_math(self, match):
+ try:
+ delim = "$$" if match.group(0).startswith('$$') else "$"
+ math_content = match.group(1) or match.group(2)
+ if delim == "$$":
+ math_content = latex2mathml.converter.convert(math_content)
+ else:
+ math_content = self.clean_latex(math_content)
+ return f'{delim}{math_content}{delim}'
+ except Exception as e:
+ print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
+ return match.group(0)
+
+ @staticmethod
+ def clean_latex(latex_str):
+ latex_str = re.sub(r'\s+', ' ', latex_str.strip())
+ for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
+ latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)
+
+ replacements = {
+ '\\times': '*',
+ '\\cdot': '*',
+ '\\div': '/',
+ '\\le': '<=',
+ '\\ge': '>=',
+ '\\neq': '!=',
+ '\\to': '\\rightarrow',
+ }
+
+ for old, new in replacements.items():
+ latex_str = latex_str.replace(old, new)
+
+ return latex_str
+
+
+
diff --git a/benchmarks/overall/scorers/heuristic.py b/benchmarks/overall/scorers/heuristic.py
new file mode 100644
index 00000000..ac1bf0e0
--- /dev/null
+++ b/benchmarks/overall/scorers/heuristic.py
@@ -0,0 +1,96 @@
+from typing import List
+
+from rapidfuzz import fuzz
+
+from benchmarks.overall.scorers.clean import MarkdownCleaner
+from benchmarks.overall.scorers.schema import BlockScores
+from benchmarks.overall.scorers import BaseScorer
+
+
+class HeuristicScorer(BaseScorer):
+ def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
+ # Standardize inputs
+ gt_markdown = [self.clean_input(block) for block in gt_markdown]
+ method_markdown = self.clean_input(method_markdown)
+
+ alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown)
+ scores = [alignment["score"] for alignment in alignments]
+
+ # Find order score
+ orders = [alignment["start"] for alignment in alignments]
+ correct_order = list(range(len(gt_markdown)))
+ actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
+ order_score = self.kendall_tau(correct_order, actual_order)
+
+ # Weight score by sequence length
+ gt_weights = [len(g) for g in gt_markdown]
+ weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
+
+ # Normalize by total ground truth length, then mix in the order score
+ overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
+ overall_score = overall_score * 0.8 + order_score * 0.2
+ return {
+ "score": overall_score,
+ "specific_scores": {
+ "order": order_score,
+ "by_block": scores
+ },
+ }
+
+ @staticmethod
+ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
+ n = len(correct_order)
+ concordant = 0
+ discordant = 0
+
+ if n <= 1:
+ return 100
+
+ for i in range(n):
+ for j in range(i + 1, n):
+ correct_sign = correct_order[i] - correct_order[j]
+ actual_sign = actual_order[i] - actual_order[j]
+
+ if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
+ concordant += 1
+ elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
+ discordant += 1
+
+ total_pairs = (n * (n - 1)) // 2
+ tau = (concordant - discordant) / total_pairs
+ tau = (tau + 1) / 2 # 0-1 scale
+ return tau * 100 # 0-100 scale
+
+ @staticmethod
+ def find_fuzzy_alignments(
+ main_string: str,
+ substrings: List[str],
+ threshold: int = 70
+ ) -> List[dict]:
+ alignments = []
+
+ for idx, substr in enumerate(substrings):
+ result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
+
+ score = 0
+ dest_start = 0
+ dest_end = 0
+ if result:
+ score = result.score
+ dest_start = result.dest_start
+ dest_end = result.dest_end
+
+ alignments.append({
+ "string": substr,
+ "start": dest_start,
+ "end": dest_end,
+ "score": score,
+ "idx": idx
+ })
+ return alignments
+
+
+ @staticmethod
+ def clean_input(md: str):
+ cleaner = MarkdownCleaner()
+ return cleaner(md)
\ No newline at end of file
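The rescaled Kendall tau above maps a perfect reading order to 100 and a fully reversed one to 0. A small sanity check, assuming the repository root is on PYTHONPATH:

```python
from benchmarks.overall.scorers.heuristic import HeuristicScorer

# Identical orderings: every pair is concordant, tau = 1, rescaled to 100.
print(HeuristicScorer.kendall_tau([0, 1, 2, 3], [0, 1, 2, 3]))  # 100.0
# Fully reversed: every pair is discordant, tau = -1, rescaled to 0.
print(HeuristicScorer.kendall_tau([0, 1, 2, 3], [3, 2, 1, 0]))  # 0.0
```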
diff --git a/benchmarks/overall/scorers/llm.py b/benchmarks/overall/scorers/llm.py
new file mode 100644
index 00000000..8ee8d138
--- /dev/null
+++ b/benchmarks/overall/scorers/llm.py
@@ -0,0 +1,147 @@
+import json
+import tempfile
+import time
+from typing import List
+
+from PIL import Image
+from google.genai.errors import APIError
+from google import genai
+import pypdfium2 as pdfium
+
+from benchmarks.overall.scorers import BaseScorer, BlockScores
+from marker.settings import settings
+
+rating_prompt = """
+You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
+You're given an image, along with the extracted markdown:
+- Some parts of the page may have been recognized as images and linked from the markdown, like `![](_page_0_Picture_0.jpeg)`.
+- Tables will be formatted as Github flavored markdown.
+- Block equations will be in LaTeX.
+- The image and markdown may be in any language.
+- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
+
+The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
+
+**Instructions**
+Follow this process to evaluate the markdown:
+1. Carefully examine the image.
+2. Carefully examine the markdown input provided.
+3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
+4. Assign component scores, as described below.
+
+These are the primary scores:
+- Overall - the overall quality of the markdown as compared to the image.
+- Text quality - the quality of the text extraction from the image.
+- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
+
+Depending on which elements are present in the markdown, you will assign element-specific scores.
+- Tables - how effectively the tables have been extracted and formatted.
+- Forms - how effectively the forms have been extracted and formatted.
+- Equations - how effectively block equations have been converted to LaTeX.
+- Section headers - if all of the section headers have been detected, and the right levels set.
+- Lists - if the lists have been properly extracted and formatted.
+- Images - if images are identified and placed correctly.
+
+Notes on scoring:
+- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
+- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
+- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
+- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.
+
+Output json, like in the example below.
+
+**Example**
+Input
+```markdown
+# Section 1
+This is some *markdown* extracted from a document. Here is a block equation:
+$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
+```
+Output
+```json
+{
+ "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
+ "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
+ "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
+ "overall": 5,
+ "text": 5,
+ "formatting": 5,
+ "section_headers": 5,
+ "tables": 0,
+ "forms": 0,
+ "equations": 5,
+ "lists": 0,
+ "images": 0
+}
+```
+**Input**
+```markdown
+{{markdown}}
+```
+**Output**
+"""
+
+comparison_keys = ["comparison"]
+description_keys = ["image_description", "markdown_description"]
+text_keys = comparison_keys + description_keys
+score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
+ "lists", "images"]
+
+
+class LLMScorer(BaseScorer):
+ def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
+ pdf_bytes = sample["pdf"]
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
+ f.write(pdf_bytes)
+ f.flush()
+ f.seek(0)
+ doc = pdfium.PdfDocument(f.name)
+ img = doc[0].render(scale=96/72).to_pil()
+ doc.close()
+
+ return self.llm_rater(img, markdown)
+
+
+ def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
+ req_keys = text_keys + score_keys
+ properties = {}
+ for key in req_keys:
+ content_type = "INTEGER" if key in score_keys else "STRING"
+ properties[key] = {"type": content_type}
+
+ response_schema = {
+ "required": req_keys,
+ "properties": properties,
+ "type": "OBJECT"
+ }
+ prompt = rating_prompt.replace("{{markdown}}", markdown)
+ response = self.llm_response_wrapper([img, prompt], response_schema)
+ assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
+ return {
+ "score": response["overall"],
+ "specific_scores": response,
+ }
+
+ def llm_response_wrapper(self, prompt, response_schema, depth=0):
+ client = genai.Client(
+ api_key=settings.GOOGLE_API_KEY,
+ http_options={"timeout": 60000}
+ )
+ try:
+ responses = client.models.generate_content(
+ model="gemini-2.0-flash",
+ contents=prompt,
+ config={
+ "temperature": 0,
+ "response_schema": response_schema,
+ "response_mime_type": "application/json",
+ },
+ )
+ output = responses.candidates[0].content.parts[0].text
+ return json.loads(output)
+ except APIError as e:
+ print(f"Hit Gemini rate limit, waiting 120 seconds")
+ time.sleep(120)
+ if depth > 2:
+ raise e
+ return self.llm_response_wrapper(prompt, response_schema, depth + 1)
\ No newline at end of file
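For reference, a hypothetical (invented) Gemini response that satisfies the schema above, and how `llm_rater` folds it into a `BlockScores` dict:

```python
# Invented example response; all req_keys are present, so the assertion passes.
response = {
    "image_description": "A page with a title, two paragraphs and one table.",
    "markdown_description": "A heading, two paragraphs and a markdown table.",
    "comparison": "Text matches the image; one table cell is missing.",
    "overall": 4, "text": 5, "formatting": 4, "section_headers": 5,
    "tables": 3, "forms": 0, "equations": 0, "lists": 0, "images": 0,
}
block_scores = {"score": response["overall"], "specific_scores": response}
```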
diff --git a/benchmarks/overall/scorers/schema.py b/benchmarks/overall/scorers/schema.py
new file mode 100644
index 00000000..74e814fc
--- /dev/null
+++ b/benchmarks/overall/scorers/schema.py
@@ -0,0 +1,6 @@
+from typing import TypedDict, List, Optional, Dict
+
+
+class BlockScores(TypedDict):
+ score: float
+ specific_scores: Dict[str, float | List[float]]
diff --git a/benchmarks/scoring.py b/benchmarks/scoring.py
deleted file mode 100644
index 5aa9faff..00000000
--- a/benchmarks/scoring.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from rapidfuzz import fuzz
-from statistics import mean
-
-CHUNK_MIN_CHARS = 25
-
-def chunk_text(text, chunk_len=500):
- chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
- chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
- return chunks
-
-
-def overlap_score(hypothesis_chunks, reference_chunks):
- length_modifier = len(hypothesis_chunks) / len(reference_chunks)
- search_distance = max(len(reference_chunks) // 5, 10)
- chunk_scores = []
- for i, hyp_chunk in enumerate(hypothesis_chunks):
- max_score = 0
- total_len = 0
- i_offset = int(i * length_modifier)
- chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
- for j in chunk_range:
- ref_chunk = reference_chunks[j]
- score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
- if score > max_score:
- max_score = score
- total_len = len(ref_chunk)
- chunk_scores.append(max_score)
- return chunk_scores
-
-
-def score_text(hypothesis, reference):
- # Returns a 0-1 alignment score
- hypothesis_chunks = chunk_text(hypothesis)
- reference_chunks = chunk_text(reference)
- chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
- return mean(chunk_scores)
diff --git a/benchmarks/table/__init__.py b/benchmarks/table/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/table/gemini.py b/benchmarks/table/gemini.py
index c58f2a92..5832a90f 100644
--- a/benchmarks/table/gemini.py
+++ b/benchmarks/table/gemini.py
@@ -1,7 +1,10 @@
import json
from PIL import Image
-import google.generativeai as genai
-from google.ai.generativelanguage_v1beta.types import content
+from google import genai
+from google.genai import types
+from io import BytesIO
+from pydantic import BaseModel
+
from marker.settings import settings
prompt = """
@@ -19,30 +22,26 @@
 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
""".strip()
-genai.configure(api_key=settings.GOOGLE_API_KEY)
+class TableSchema(BaseModel):
+ table_html: str
def gemini_table_rec(image: Image.Image):
- schema = content.Schema(
- type=content.Type.OBJECT,
- required=["table_html"],
- properties={
- "table_html": content.Schema(
- type=content.Type.STRING,
- )
- }
+ client = genai.Client(
+ api_key=settings.GOOGLE_API_KEY,
+ http_options={"timeout": 60000}
)
- model = genai.GenerativeModel("gemini-2.0-flash")
+ image_bytes = BytesIO()
+ image.save(image_bytes, format="PNG")
- responses = model.generate_content(
- [image, prompt], # According to gemini docs, it performs better if the image is the first element
- stream=False,
- generation_config={
+ responses = client.models.generate_content(
+ model="gemini-2.0-flash",
+ contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element
+ config={
"temperature": 0,
- "response_schema": schema,
+ "response_schema": TableSchema,
"response_mime_type": "application/json",
},
- request_options={'timeout': 60}
)
output = responses.candidates[0].content.parts[0].text
diff --git a/benchmarks/table/inference.py b/benchmarks/table/inference.py
new file mode 100644
index 00000000..0c6432d7
--- /dev/null
+++ b/benchmarks/table/inference.py
@@ -0,0 +1,178 @@
+from typing import List
+
+import numpy as np
+from bs4 import BeautifulSoup
+import pypdfium2 as pdfium
+from tqdm import tqdm
+import base64
+import tempfile
+
+from benchmarks.table.gemini import gemini_table_rec
+from marker.config.parser import ConfigParser
+from marker.converters.table import TableConverter
+from marker.models import create_model_dict
+from marker.processors.llm.llm_table import LLMTableProcessor
+from marker.processors.table import TableProcessor
+from marker.renderers.json import JSONBlockOutput
+from marker.schema.polygon import PolygonBox
+from marker.util import matrix_intersection_area
+
+
+def extract_tables(children: List[JSONBlockOutput]):
+ tables = []
+ for child in children:
+ if child.block_type == 'Table':
+ tables.append(child)
+ elif child.children:
+ tables.extend(extract_tables(child.children))
+ return tables
+
+
+def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
+ models = create_model_dict()
+ config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
+ total_unaligned = 0
+ results = []
+
+ iterations = len(dataset)
+ if max_rows is not None:
+ iterations = min(max_rows, len(dataset))
+
+ for i in tqdm(range(iterations), desc='Converting Tables'):
+ try:
+ row = dataset[i]
+ pdf_binary = base64.b64decode(row['pdf'])
+ gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns
+
+ # Only use the basic table processors
+ converter = TableConverter(
+ config=config_parser.generate_config_dict(),
+ artifact_dict=models,
+ processor_list=[
+ "marker.processors.table.TableProcessor",
+ "marker.processors.llm.llm_table.LLMTableProcessor",
+ ],
+ renderer=config_parser.get_renderer()
+ )
+
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
+ temp_pdf_file.write(pdf_binary)
+ temp_pdf_file.seek(0)
+ marker_json = converter(temp_pdf_file.name).children
+
+ doc = pdfium.PdfDocument(temp_pdf_file.name)
+ page_image = doc[0].render(scale=96/72).to_pil()
+ doc.close()
+
+ if len(marker_json) == 0 or len(gt_tables) == 0:
+ print(f'No tables detected, skipping...')
+ total_unaligned += len(gt_tables)
+ continue
+
+ marker_tables = extract_tables(marker_json)
+ marker_table_boxes = [table.bbox for table in marker_tables]
+ page_bbox = marker_json[0].bbox
+
+ if len(marker_tables) != len(gt_tables):
+ print(f'Number of tables do not match, skipping...')
+ total_unaligned += len(gt_tables)
+ continue
+
+ table_images = [
+ page_image.crop(
+ PolygonBox.from_bbox(bbox)
+ .rescale(
+ (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height)
+ ).bbox
+ )
+ for bbox
+ in marker_table_boxes
+ ]
+
+ # Normalize the bboxes
+ for bbox in marker_table_boxes:
+ bbox[0] = bbox[0] / page_bbox[2]
+ bbox[1] = bbox[1] / page_bbox[3]
+ bbox[2] = bbox[2] / page_bbox[2]
+ bbox[3] = bbox[3] / page_bbox[3]
+
+ gt_boxes = [table['normalized_bbox'] for table in gt_tables]
+ gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
+ marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
+ table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)
+
+ aligned_tables = []
+ used_tables = set()
+ unaligned_tables = set()
+ for table_idx, alignment in enumerate(table_alignments):
+ try:
+ max_area = np.max(alignment)
+ aligned_idx = np.argmax(alignment)
+ except ValueError:
+ # No alignment found
+ unaligned_tables.add(table_idx)
+ continue
+
+ if max_area <= .01:
+ # No alignment found
+ unaligned_tables.add(table_idx)
+ continue
+
+ if aligned_idx in used_tables:
+ # Marker table already aligned with another gt table
+ unaligned_tables.add(table_idx)
+ continue
+
+ # Gt table doesn't align well with any marker table
+ gt_table_pct = gt_areas[table_idx] / max_area
+ if not .85 < gt_table_pct < 1.15:
+ unaligned_tables.add(table_idx)
+ continue
+
+ # Marker table doesn't align with gt table
+ marker_table_pct = marker_areas[aligned_idx] / max_area
+ if not .85 < marker_table_pct < 1.15:
+ unaligned_tables.add(table_idx)
+ continue
+
+ gemini_html = ""
+ if use_gemini:
+ try:
+ gemini_html = gemini_table_rec(table_images[aligned_idx])
+ except Exception as e:
+ print(f'Gemini failed: {e}')
+
+ aligned_tables.append(
+ (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
+ )
+ used_tables.add(aligned_idx)
+
+ total_unaligned += len(unaligned_tables)
+
+ for marker_table, gt_table, gemini_table in aligned_tables:
+ gt_table_html = gt_table['html']
+
+ # marker wraps the table in <tbody> which fintabnet data doesn't
+ # Fintabnet doesn't use th tags, need to be replaced for fair comparison
+ marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
+ tbody = marker_table_soup.find('tbody')
+ if tbody:
+ tbody.unwrap()
+ for th_tag in marker_table_soup.find_all('th'):
+ th_tag.name = 'td'
+ for br_tag in marker_table_soup.find_all('br'):
+ br_tag.replace_with(marker_table_soup.new_string(''))
+
+ marker_table_html = str(marker_table_soup)
+ marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
+ gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines
+
+ results.append({
+ "marker_table": marker_table_html,
+ "gt_table": gt_table_html,
+ "gemini_table": gemini_table_html
+ })
+ except pdfium.PdfiumError:
+ print('Broken PDF, Skipping...')
+ continue
+ return results, total_unaligned
\ No newline at end of file
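The .85-1.15 alignment criterion above is easier to follow with concrete numbers. A self-contained sketch (boxes invented; in the real code `matrix_intersection_area` supplies the pairwise intersection areas):

```python
# Toy illustration of the table alignment criterion (values invented).
# Boxes are [x0, y0, x1, y1], already normalized to the page size.
gt_box = [0.10, 0.10, 0.60, 0.40]
marker_box = [0.12, 0.11, 0.61, 0.41]

def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])

def intersection_area(a, b):
    width = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    height = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    return width * height

max_area = intersection_area(gt_box, marker_box)
gt_table_pct = area(gt_box) / max_area          # ~1.08
marker_table_pct = area(marker_box) / max_area  # ~1.06
# Both ratios must fall within (.85, 1.15) for the pair to count as aligned.
print(.85 < gt_table_pct < 1.15 and .85 < marker_table_pct < 1.15)  # True
```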
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index 3116274d..4e674c28 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -1,32 +1,22 @@
import os
-from itertools import repeat
-from tkinter import Image
-
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
+from pathlib import Path
+from itertools import repeat
from typing import List
-import numpy as np
-import base64
+
import time
import datasets
from tqdm import tqdm
-import tempfile
import click
from tabulate import tabulate
import json
-from bs4 import BeautifulSoup
from concurrent.futures import ProcessPoolExecutor
-from pypdfium2._helpers.misc import PdfiumError
-import pypdfium2 as pdfium
-from marker.util import matrix_intersection_area
-from marker.renderers.json import JSONOutput, JSONBlockOutput
-from marker.config.parser import ConfigParser
-from marker.converters.table import TableConverter
-from marker.models import create_model_dict
+from marker.settings import settings
+from benchmarks.table.inference import inference_tables
from scoring import wrap_table_html, similarity_eval_html
-from gemini import gemini_table_rec
def update_teds_score(result, prefix: str = "marker"):
prediction, ground_truth = result[f'{prefix}_table'], result['gt_table']
@@ -36,26 +26,16 @@ def update_teds_score(result, prefix: str = "marker"):
return result
-def extract_tables(children: List[JSONBlockOutput]):
- tables = []
- for child in children:
- if child.block_type == 'Table':
- tables.append(child)
- elif child.children:
- tables.extend(extract_tables(child.children))
- return tables
-
-
@click.command(help="Benchmark Table to HTML Conversion")
-@click.argument("out_file", type=str)
-@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
+@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
+@click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use")
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
@click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
def main(
- out_file: str,
+ result_path: str,
dataset: str,
max_rows: int,
max_workers: int,
@@ -63,130 +43,13 @@ def main(
table_rec_batch_size: int | None,
use_gemini: bool = False
):
- models = create_model_dict()
- config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
start = time.time()
dataset = datasets.load_dataset(dataset, split='train')
dataset = dataset.shuffle(seed=0)
- iterations = len(dataset)
- if max_rows is not None:
- iterations = min(max_rows, len(dataset))
-
- results = []
- total_unaligned = 0
- for i in tqdm(range(iterations), desc='Converting Tables'):
- try:
- row = dataset[i]
- pdf_binary = base64.b64decode(row['pdf'])
- gt_tables = row['tables'] #Already sorted by reading order, which is what marker returns
-
- converter = TableConverter(
- config=config_parser.generate_config_dict(),
- artifact_dict=models,
- processor_list=config_parser.get_processors(),
- renderer=config_parser.get_renderer()
- )
-
- with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
- temp_pdf_file.write(pdf_binary)
- temp_pdf_file.seek(0)
- tqdm.disable = True
- marker_json = converter(temp_pdf_file.name).children
- tqdm.disable = False
-
- doc = pdfium.PdfDocument(temp_pdf_file.name)
- page_image = doc[0].render(scale=92/72).to_pil()
-
- if len(marker_json) == 0 or len(gt_tables) == 0:
- print(f'No tables detected, skipping...')
- total_unaligned += len(gt_tables)
- continue
-
- marker_tables = extract_tables(marker_json)
- marker_table_boxes = [table.bbox for table in marker_tables]
- page_bbox = marker_json[0].bbox
- w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
- table_images = [page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox in marker_table_boxes]
-
- # Normalize the bboxes
- for bbox in marker_table_boxes:
- bbox[0] = bbox[0] / page_bbox[2]
- bbox[1] = bbox[1] / page_bbox[3]
- bbox[2] = bbox[2] / page_bbox[2]
- bbox[3] = bbox[3] / page_bbox[3]
-
- gt_boxes = [table['normalized_bbox'] for table in gt_tables]
- gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
- marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
- table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)
-
- aligned_tables = []
- used_tables = set()
- unaligned_tables = set()
- for table_idx, alignment in enumerate(table_alignments):
- try:
- max_area = np.max(alignment)
- aligned_idx = np.argmax(alignment)
- except ValueError:
- # No alignment found
- unaligned_tables.add(table_idx)
- continue
-
- if aligned_idx in used_tables:
- # Marker table already aligned with another gt table
- unaligned_tables.add(table_idx)
- continue
-
- # Gt table doesn't align well with any marker table
- gt_table_pct = gt_areas[table_idx] / max_area
- if not .75 < gt_table_pct < 1.25:
- unaligned_tables.add(table_idx)
- continue
-
- # Marker table doesn't align with gt table
- marker_table_pct = marker_areas[aligned_idx] / max_area
- if not .75 < marker_table_pct < 1.25:
- unaligned_tables.add(table_idx)
- continue
-
- gemini_html = ""
- if use_gemini:
- gemini_html = gemini_table_rec(table_images[aligned_idx])
-
- aligned_tables.append(
- (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
- )
- used_tables.add(aligned_idx)
-
- total_unaligned += len(unaligned_tables)
-
- for marker_table, gt_table, gemini_table in aligned_tables:
- gt_table_html = gt_table['html']
-
- #marker wraps the table in <tbody> which fintabnet data doesn't
- #Fintabnet doesn't use th tags, need to be replaced for fair comparison
- marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
- tbody = marker_table_soup.find('tbody')
- if tbody:
- tbody.unwrap()
- for th_tag in marker_table_soup.find_all('th'):
- th_tag.name = 'td'
- marker_table_html = str(marker_table_soup)
- marker_table_html = marker_table_html.replace("<br>", " ") # Fintabnet uses spaces instead of newlines
- marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
- gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines
-
- results.append({
- "marker_table": marker_table_html,
- "gt_table": gt_table_html,
- "gemini_table": gemini_table_html
- })
- except PdfiumError:
- print('Broken PDF, Skipping...')
- continue
+ results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini)
print(f"Total time: {time.time() - start}.")
print(f"Could not align {total_unaligned} tables from fintabnet.")
@@ -223,8 +86,12 @@ def main(
"gemini": gemini_results
}
- with open(out_file, "w+") as f:
+ out_path = Path(result_path)
+ out_path.mkdir(parents=True, exist_ok=True)
+ with open(out_path / "table.json", "w+") as f:
json.dump(results, f, indent=2)
+ print(f"Results saved to {out_path}.")
+
if __name__ == '__main__':
main()
\ No newline at end of file
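Once the benchmark has run, results land in `<result_path>/table.json`. A hedged sketch of reading them back (the path assumes the default OUTPUT_DIR of `conversion_results`, and the `marker_score` key is inferred from `verify_scores.py` below):

```python
import json
from pathlib import Path

# Assumed default location: settings.OUTPUT_DIR/benchmark/table/table.json
result_file = Path("conversion_results") / "benchmark" / "table" / "table.json"
data = json.loads(result_file.read_text())

marker_scores = [r["marker_score"] for r in data["marker"]]
print(f"Average marker TEDS score: {sum(marker_scores) / len(marker_scores):.3f}")
```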
diff --git a/benchmarks/throughput/__init__.py b/benchmarks/throughput/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/throughput/main.py b/benchmarks/throughput/main.py
new file mode 100644
index 00000000..6e07054b
--- /dev/null
+++ b/benchmarks/throughput/main.py
@@ -0,0 +1,39 @@
+import time
+import torch
+
+import click
+import pypdfium2 as pdfium
+
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+
+
+@click.command(help="Benchmark PDF to MD conversion throughput.")
+@click.argument("pdf_path", type=str)
+def main(pdf_path):
+ print(f"Converting {pdf_path} to markdown...")
+ pdf = pdfium.PdfDocument(pdf_path)
+ page_count = len(pdf)
+ pdf.close()
+ model_dict = create_model_dict()
+ torch.cuda.reset_peak_memory_stats()
+
+ times = []
+ for i in range(10):
+ block_converter = PdfConverter(
+ artifact_dict=model_dict,
+ config={"disable_tqdm": True}
+ )
+ start = time.time()
+ block_converter(pdf_path)
+ total = time.time() - start
+ times.append(total)
+
+ max_gpu_vram = torch.cuda.max_memory_allocated() / 1024 ** 3
+
+ print(f"Converted {page_count} pages in {sum(times)/len(times):.2f} seconds.")
+ print(f"Max GPU VRAM: {max_gpu_vram:.2f} GB")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/benchmarks/verify_scores.py b/benchmarks/verify_scores.py
index 913081e9..088f137e 100644
--- a/benchmarks/verify_scores.py
+++ b/benchmarks/verify_scores.py
@@ -6,18 +6,18 @@ def verify_scores(file_path):
with open(file_path, 'r') as file:
data = json.load(file)
- multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
- switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
-
- if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40:
- raise ValueError("One or more scores are below the required threshold of 0.4")
+ raw_scores = [data["scores"][k] for k in data["scores"]]
+ marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores]
+ marker_score = sum(marker_scores) / len(marker_scores)
+ if marker_score < 90:
+ raise ValueError("Marker score below 90")
def verify_table_scores(file_path):
with open(file_path, 'r') as file:
data = json.load(file)
- avg = sum([r["score"] for r in data]) / len(data)
+ avg = sum([r["marker_score"] for r in data["marker"]]) / len(data["marker"])
if avg < 0.7:
raise ValueError("Average score is below the required threshold of 0.7")
diff --git a/data/images/overall.png b/data/images/overall.png
index 0946421a..1e2a9cce 100644
Binary files a/data/images/overall.png and b/data/images/overall.png differ
diff --git a/data/images/per_doc.png b/data/images/per_doc.png
index ed26cfb9..91694b04 100644
Binary files a/data/images/per_doc.png and b/data/images/per_doc.png differ
diff --git a/data/images/table.png b/data/images/table.png
new file mode 100644
index 00000000..8c6d81cb
Binary files /dev/null and b/data/images/table.png differ
diff --git a/marker/builders/layout.py b/marker/builders/layout.py
index ff4af17d..0eba225a 100644
--- a/marker/builders/layout.py
+++ b/marker/builders/layout.py
@@ -22,7 +22,7 @@ class LayoutBuilder(BaseBuilder):
"""
A builder for performing layout detection on PDF pages and merging the results into the document.
"""
- batch_size: Annotated[
+ layout_batch_size: Annotated[
Optional[int],
"The batch size to use for the layout model.",
"Default is None, which will use the default batch size for the model."
@@ -36,7 +36,7 @@ class LayoutBuilder(BaseBuilder):
float,
"The minimum coverage ratio required for the layout model to consider",
"the lines from the PdfProvider valid.",
- ] = .1
+ ] = .25
document_ocr_threshold: Annotated[
float,
"The minimum ratio of pages that must pass the layout coverage check",
@@ -67,8 +67,8 @@ def __call__(self, document: Document, provider: PdfProvider):
self.merge_blocks(document.pages, provider.page_lines)
def get_batch_size(self):
- if self.batch_size is not None:
- return self.batch_size
+ if self.layout_batch_size is not None:
+ return self.layout_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
return 6
return 6
@@ -140,7 +140,11 @@ def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: Pro
good_pages = []
for (document_page, ocr_error_detection_label) in zip(document_pages, ocr_error_detection_labels):
provider_lines = provider_page_lines.get(document_page.page_id, [])
- good_pages.append(bool(provider_lines) and self.check_layout_coverage(document_page, provider_lines) and (ocr_error_detection_label != "bad"))
+ good_pages.append(
+ bool(provider_lines) and
+ self.check_layout_coverage(document_page, provider_lines) and
+ (ocr_error_detection_label != "bad")
+ )
ocr_document = sum(good_pages) / len(good_pages) < self.document_ocr_threshold
for idx, document_page in enumerate(document_pages):
@@ -180,7 +184,7 @@ def check_layout_coverage(
large_text_blocks += 1
coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
- text_okay = coverage_ratio >= self.layout_coverage_threshold
+ text_okay = coverage_ratio > self.layout_coverage_threshold
# Model will sometimes say there is a single block of text on the page when it is blank
if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
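To see the effect of the stricter coverage default together with the switch from `>=` to `>`, a toy calculation with invented counts:

```python
layout_coverage_threshold = 0.25  # new default; previously 0.1 with a >= comparison
covered_blocks, total_blocks = 2, 10

coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
text_okay = coverage_ratio > layout_coverage_threshold
print(text_okay)  # False: 0.2 fails the new check, but passed the old 0.1 threshold
```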
diff --git a/marker/builders/llm_layout.py b/marker/builders/llm_layout.py
index 6e21fd09..8dbccc1f 100644
--- a/marker/builders/llm_layout.py
+++ b/marker/builders/llm_layout.py
@@ -1,10 +1,10 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Annotated
-from google.ai.generativelanguage_v1beta.types import content
from surya.layout import LayoutPredictor
from surya.ocr_error import OCRErrorPredictor
from tqdm import tqdm
+from pydantic import BaseModel
from marker.builders.layout import LayoutBuilder
from marker.processors.llm import GoogleModel
@@ -41,7 +41,7 @@ class LLMLayoutBuilder(LayoutBuilder):
max_retries: Annotated[
int,
"The maximum number of retries to use for the Gemini model.",
- ] = 3
+ ] = 2
max_concurrency: Annotated[
int,
"The maximum number of concurrent requests to make to the Gemini model.",
@@ -50,6 +50,10 @@ class LLMLayoutBuilder(LayoutBuilder):
int,
"The timeout for requests to the Gemini model.",
] = 60
+ disable_tqdm: Annotated[
+ bool,
+ "Whether to disable the tqdm progress bar.",
+ ] = False
topk_relabelling_prompt: Annotated[
str,
"The prompt to use for relabelling blocks.",
@@ -107,7 +111,7 @@ def __call__(self, document: Document, provider: PdfProvider):
print(f"Error relabelling blocks: {e}")
def relabel_blocks(self, document: Document):
- pbar = tqdm(desc="LLM layout relabelling")
+ pbar = tqdm(desc="LLM layout relabelling", disable=self.disable_tqdm)
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
futures = []
for page in document.pages:
@@ -154,21 +158,15 @@ def process_block_complex_relabeling(self, document: Document, page: PageGroup,
def process_block_relabeling(self, document: Document, page: PageGroup, block: Block, prompt: str):
image = self.extract_image(document, block)
- response_schema = content.Schema(
- type=content.Type.OBJECT,
- enum=[],
- required=["image_description", "label"],
- properties={
- "image_description": content.Schema(
- type=content.Type.STRING,
- ),
- "label": content.Schema(
- type=content.Type.STRING,
- ),
- },
- )
- response = self.model.generate_response(prompt, image, block, response_schema)
+ response = self.model.generate_response(
+ prompt,
+ image,
+ block,
+ LayoutSchema,
+ max_retries=self.max_retries,
+ timeout=self.timeout
+ )
generated_label = None
if response and "label" in response:
generated_label = response["label"]
@@ -184,3 +182,8 @@ def process_block_relabeling(self, document: Document, page: PageGroup, block: B
def extract_image(self, document: Document, image_block: Block, expand: float = 0.01):
return image_block.get_image(document, highres=False, expansion=(expand, expand))
+
+
+class LayoutSchema(BaseModel):
+ image_description: str
+ label: str
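The hand-built `content.Schema` objects are replaced by pydantic models throughout. A minimal, self-contained sketch of the pattern (reusing the `LayoutSchema` fields above) and how a JSON payload from the model validates against it:

```python
import json
from pydantic import BaseModel


class LayoutSchema(BaseModel):
    image_description: str
    label: str


payload = json.loads('{"image_description": "A bar chart with a caption", "label": "Figure"}')
parsed = LayoutSchema(**payload)
print(parsed.label)  # Figure
```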
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
index 3741b760..01f69695 100644
--- a/marker/converters/pdf.py
+++ b/marker/converters/pdf.py
@@ -41,6 +41,7 @@
from marker.schema.registry import register_block_class
from marker.util import strings_to_classes
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
+from marker.processors.order import OrderProcessor
class PdfConverter(BaseConverter):
@@ -59,6 +60,7 @@ class PdfConverter(BaseConverter):
"Enable higher quality processing with LLMs.",
] = False
default_processors: Tuple[BaseProcessor, ...] = (
+ OrderProcessor,
BlockquoteProcessor,
CodeProcessor,
DocumentTOCProcessor,
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 5f5be17c..20ac0fb4 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -1,7 +1,4 @@
from typing import Annotated, List, Optional, Tuple
-
-from texify.inference import batch_inference
-from texify.model.model import GenerateVisionEncoderDecoderModel
from tqdm import tqdm
from marker.models import TexifyPredictor
@@ -32,6 +29,10 @@ class EquationProcessor(BaseProcessor):
int,
"The number of tokens to buffer above max for the Texify model.",
] = 256
+ disable_tqdm: Annotated[
+ bool,
+ "Whether to disable the tqdm progress bar.",
+ ] = False
def __init__(self, texify_model: TexifyPredictor, config=None):
super().__init__(config)
@@ -53,11 +54,12 @@ def __call__(self, document: Document):
"token_count": token_count
})
+ if len(equation_data) == 0:
+ return
+
predictions = self.get_latex_batched(equation_data)
for prediction, equation_d in zip(predictions, equation_data):
conditions = [
- self.get_total_texify_tokens(prediction) < self.model_max_length,
- # Make sure we didn't get to the overall token max, indicates run-on
len(prediction) > equation_d["token_count"] * .4,
len(prediction.strip()) > 0
]
@@ -77,28 +79,15 @@ def get_batch_size(self):
return 2
def get_latex_batched(self, equation_data: List[dict]):
- predictions = [""] * len(equation_data)
- batch_size = self.get_batch_size()
-
- for i in tqdm(range(0, len(equation_data), batch_size), desc="Recognizing equations"):
- # Dynamically set max length to save inference time
- min_idx = i
- max_idx = min(min_idx + batch_size, len(equation_data))
-
- batch_equations = equation_data[min_idx:max_idx]
- batch_images = [eq["image"] for eq in batch_equations]
-
- model_output = self.texify_model(
- batch_images
- )
-
- for j, output in enumerate(model_output):
- token_count = self.get_total_texify_tokens(output.text)
- if token_count >= self.model_max_length - 1:
- output.text = ""
-
- image_idx = i + j
- predictions[image_idx] = output.text
+ inference_images = [eq["image"] for eq in equation_data]
+ model_output = self.texify_model(inference_images, batch_size=self.get_batch_size())
+ predictions = [output.text for output in model_output]
+
+ for i, pred in enumerate(predictions):
+ token_count = self.get_total_texify_tokens(pred)
+ # If we're at the max token length, the prediction may be repetitive or invalid
+ if token_count >= self.model_max_length - 1:
+ predictions[i] = ""
return predictions
def get_total_texify_tokens(self, text):
diff --git a/marker/processors/llm/__init__.py b/marker/processors/llm/__init__.py
index 3d61166e..21ee04f3 100644
--- a/marker/processors/llm/__init__.py
+++ b/marker/processors/llm/__init__.py
@@ -27,7 +27,7 @@ class BaseLLMProcessor(BaseProcessor):
max_retries: Annotated[
int,
"The maximum number of retries to use for the Gemini model.",
- ] = 3
+ ] = 1
max_concurrency: Annotated[
int,
"The maximum number of concurrent requests to make to the Gemini model.",
@@ -35,7 +35,7 @@ class BaseLLMProcessor(BaseProcessor):
timeout: Annotated[
int,
"The timeout for requests to the Gemini model.",
- ] = 60
+ ] = 15
image_expansion_ratio: Annotated[
float,
"The ratio to expand the image by when cropping.",
@@ -44,6 +44,10 @@ class BaseLLMProcessor(BaseProcessor):
bool,
"Whether to use the LLM model.",
] = False
+ disable_tqdm: Annotated[
+ bool,
+ "Whether to disable the tqdm progress bar.",
+ ] = False
block_types = None
def __init__(self, config=None):
@@ -73,7 +77,7 @@ def rewrite_blocks(self, document: Document):
if total_blocks == 0:
return
- pbar = tqdm(desc=f"{self.__class__.__name__} running")
+ pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
for future in as_completed([
executor.submit(self.process_rewriting, document, page, block)
diff --git a/marker/processors/llm/llm_complex.py b/marker/processors/llm/llm_complex.py
index 52c46364..72966d62 100644
--- a/marker/processors/llm/llm_complex.py
+++ b/marker/processors/llm/llm_complex.py
@@ -1,9 +1,8 @@
import markdown2
+from pydantic import BaseModel
from marker.processors.llm import BaseLLMProcessor
-from google.ai.generativelanguage_v1beta.types import content
-
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
@@ -55,18 +54,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
text = block.raw_text(document)
prompt = self.complex_region_prompt.replace("{extracted_text}", text)
image = self.extract_image(document, block)
- response_schema = content.Schema(
- type=content.Type.OBJECT,
- enum=[],
- required=["corrected_markdown"],
- properties={
- "corrected_markdown": content.Schema(
- type=content.Type.STRING
- )
- },
- )
- response = self.model.generate_response(prompt, image, block, response_schema)
+ response = self.model.generate_response(prompt, image, block, ComplexSchema)
if not response or "corrected_markdown" not in response:
block.update_metadata(llm_error_count=1)
@@ -85,4 +74,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
# Convert LLM markdown to html
corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip()
- block.html = markdown2.markdown(corrected_markdown)
\ No newline at end of file
+ block.html = markdown2.markdown(corrected_markdown, extras=["tables"])
+
+class ComplexSchema(BaseModel):
+ corrected_markdown: str
\ No newline at end of file
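The `extras=["tables"]` argument matters because plain `markdown2` leaves pipe tables as text. A quick check:

```python
import markdown2

md = "| a | b |\n|---|---|\n| 1 | 2 |"
print(markdown2.markdown(md))                     # pipes are left as plain paragraph text
print(markdown2.markdown(md, extras=["tables"]))  # emits a proper <table> element
```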
diff --git a/marker/processors/llm/llm_equation.py b/marker/processors/llm/llm_equation.py
index 74cfc4a3..89d0318d 100644
--- a/marker/processors/llm/llm_equation.py
+++ b/marker/processors/llm/llm_equation.py
@@ -1,6 +1,6 @@
-from marker.processors.llm import BaseLLMProcessor
+from pydantic import BaseModel
-from google.ai.generativelanguage_v1beta.types import content
+from marker.processors.llm import BaseLLMProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Equation
@@ -67,18 +67,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Equation
prompt = self.equation_latex_prompt.replace("{equation}", text)
image = self.extract_image(document, block)
- response_schema = content.Schema(
- type=content.Type.OBJECT,
- enum=[],
- required=["html_equation"],
- properties={
- "html_equation": content.Schema(
- type=content.Type.STRING
- )
- },
- )
- response = self.model.generate_response(prompt, image, block, response_schema)
+ response = self.model.generate_response(prompt, image, block, EquationSchema)
if not response or "html_equation" not in response:
block.update_metadata(llm_error_count=1)
@@ -89,3 +79,6 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Equation
block.update_metadata(llm_error_count=1)
return
block.html = html_equation
+
+class EquationSchema(BaseModel):
+ html_equation: str
diff --git a/marker/processors/llm/llm_form.py b/marker/processors/llm/llm_form.py
index fc66f155..a47bad3c 100644
--- a/marker/processors/llm/llm_form.py
+++ b/marker/processors/llm/llm_form.py
@@ -1,6 +1,6 @@
-from marker.processors.llm import BaseLLMProcessor
+from pydantic import BaseModel
-from google.ai.generativelanguage_v1beta.types import content
+from marker.processors.llm import BaseLLMProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
@@ -13,13 +13,14 @@ class LLMFormProcessor(BaseLLMProcessor):
form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and an html representation of the form in the image.
Your task is to correct any errors in the html representation, and format it properly.
-Values and labels should appear in html tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible.
+Values and labels should appear in html tables, with the labels on the left side, and values on the right. Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible.
**Instructions:**
1. Carefully examine the provided form block image.
2. Analyze the html representation of the form.
-3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
-4. If the html representation contains errors, generate the corrected html representation.
-5. Output only either the corrected html representation or "No corrections needed."
+3. Compare the html representation to the image.
+4. If the html representation is correct, or you cannot read the image properly, then write "No corrections needed."
+5. If the html representation contains errors, generate the corrected html representation.
+6. Output only either the corrected html representation or "No corrections needed."
**Example:**
Input:
```html
@@ -37,12 +38,9 @@ class LLMFormProcessor(BaseLLMProcessor):
```
Output:
+Comparison: The html representation has the labels in the first row and the values in the second row. It should be corrected to have the labels on the left side and the values on the right side.
```html
-
-
Labels
-
Values
-
Label 1
Value 1
@@ -73,18 +71,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
image = self.extract_image(document, block)
- response_schema = content.Schema(
- type=content.Type.OBJECT,
- enum=[],
- required=["corrected_html"],
- properties={
- "corrected_html": content.Schema(
- type=content.Type.STRING
- )
- },
- )
- response = self.model.generate_response(prompt, image, block, response_schema)
+ response = self.model.generate_response(prompt, image, block, FormSchema)
if not response or "corrected_html" not in response:
block.update_metadata(llm_error_count=1)
@@ -102,4 +90,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
return
corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip()
- block.html = corrected_html
\ No newline at end of file
+ block.html = corrected_html
+
+class FormSchema(BaseModel):
+ comparison: str
+ corrected_html: str
\ No newline at end of file
diff --git a/marker/processors/llm/llm_handwriting.py b/marker/processors/llm/llm_handwriting.py
index d3e9b9f3..760efb35 100644
--- a/marker/processors/llm/llm_handwriting.py
+++ b/marker/processors/llm/llm_handwriting.py
@@ -1,9 +1,8 @@
import markdown2
+from pydantic import BaseModel
from marker.processors.llm import BaseLLMProcessor
-from google.ai.generativelanguage_v1beta.types import content
-
from marker.schema import BlockTypes
from marker.schema.blocks import Handwriting, Text
from marker.schema.document import Document
@@ -49,18 +48,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Handwrit
prompt = self.handwriting_generation_prompt
image = self.extract_image(document, block)
- response_schema = content.Schema(
- type=content.Type.OBJECT,
- enum=[],
- required=["markdown"],
- properties={
- "markdown": content.Schema(
- type=content.Type.STRING
- )
- },
- )
-
- response = self.model.generate_response(prompt, image, block, response_schema)
+
+ response = self.model.generate_response(prompt, image, block, HandwritingSchema)
if not response or "markdown" not in response:
block.update_metadata(llm_error_count=1)
@@ -72,4 +61,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Handwrit
return
markdown = markdown.strip().lstrip("```markdown").rstrip("```").strip()
- block.html = markdown2.markdown(markdown)
+ block.html = markdown2.markdown(markdown, extras=["tables"])
+
+class HandwritingSchema(BaseModel):
+ markdown: str
diff --git a/marker/processors/llm/llm_image_description.py b/marker/processors/llm/llm_image_description.py
index a08e0dc9..c125df0f 100644
--- a/marker/processors/llm/llm_image_description.py
+++ b/marker/processors/llm/llm_image_description.py
@@ -1,6 +1,6 @@
-from marker.processors.llm import BaseLLMProcessor
+from pydantic import BaseModel
-from google.ai.generativelanguage_v1beta.types import content
+from marker.processors.llm import BaseLLMProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
@@ -49,18 +49,8 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document))
image = self.extract_image(document, block)
- response_schema = content.Schema(
- type=content.Type.OBJECT,
- enum=[],
- required=["image_description"],
- properties={
- "image_description": content.Schema(
- type=content.Type.STRING
- )
- },
- )
- response = self.model.generate_response(prompt, image, block, response_schema)
+ response = self.model.generate_response(prompt, image, block, ImageSchema)
if not response or "image_description" not in response:
block.update_metadata(llm_error_count=1)
@@ -72,3 +62,6 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
return
block.description = image_description
+
+class ImageSchema(BaseModel):
+ image_description: str
diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py
index e0c738a0..1ec1f8cd 100644
--- a/marker/processors/llm/llm_table.py
+++ b/marker/processors/llm/llm_table.py
@@ -1,8 +1,8 @@
from typing import Annotated, List, Tuple
from bs4 import BeautifulSoup
-from google.ai.generativelanguage_v1beta.types import content
from PIL import Image
+from pydantic import BaseModel
from marker.processors.llm import BaseLLMProcessor
from marker.schema import BlockTypes
@@ -34,21 +34,21 @@ class LLMTableProcessor(BaseLLMProcessor):
"The prompt to use for rewriting text.",
"Default is a string containing the Gemini rewriting prompt."
] = """You are a text correction expert specializing in accurately reproducing text from images.
-You will receive an image of a text block and an html representation of the table in the image.
+You will receive an image and an html representation of the table in the image.
Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
Some guidelines:
- Make sure to reproduce the original values as faithfully as possible.
-- If you see any math in a table cell, fence it with the