diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index 5d49aa1c..00000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Integration test with benchmark
-
-on: [push]
-
-env:
- TORCH_DEVICE: "cpu"
-
-jobs:
- benchmark:
- runs-on: [ubuntu-latest, windows-latest]
- steps:
- - uses: actions/checkout@v3
- - name: Set up Python 3.11
- uses: actions/setup-python@v4
- with:
- python-version: 3.11
- - name: Install python dependencies
- run: |
- pip install poetry
- poetry install
- poetry remove torch
- poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
- - name: Download benchmark data
- run: |
- wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
- unzip -o benchmark_data.zip
- - name: Run benchmark test
- run: |
- poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
- poetry run python benchmarks/verify_scores.py report.json --type marker
-
-
-
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 00000000..ae6a1c84
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,28 @@
+name: Integration test
+
+on: [push]
+
+env:
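+  # Force UTF-8 for Python stdio so benchmark output with non-ASCII text does not fail on the CI runner.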
+ PYTHONIOENCODING: "utf-8"
+
+jobs:
+ benchmark:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python 3.11
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.11
+ - name: Install python dependencies
+ run: |
+ pip install poetry
+ poetry install
+ - name: Run benchmark test
+ run: |
+ poetry run python benchmarks/overall/overall.py --max_rows 5
+ poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker
+ - name: Run table benchmark
+ run: |
+ poetry run python benchmarks/table/table.py --max_rows 5
+ poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index af4e92e8..84137df5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,10 +2,6 @@ name: CI tests
on: [push]
-env:
- TORCH_DEVICE: "cpu"
- OCR_ENGINE: "surya"
-
jobs:
tests:
runs-on: ubuntu-latest
diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
index 217e4221..06230580 100644
--- a/.github/workflows/scripts.yml
+++ b/.github/workflows/scripts.yml
@@ -2,10 +2,6 @@ name: Test CLI scripts
on: [push]
-env:
- TORCH_DEVICE: "cpu"
- OCR_ENGINE: "surya"
-
jobs:
tests:
runs-on: ubuntu-latest
diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py
index f312429b..1b504cff 100644
--- a/benchmarks/overall/inference.py
+++ b/benchmarks/overall/inference.py
@@ -1,15 +1,16 @@
-import json
import tempfile
+import time
+
from bs4 import BeautifulSoup
from benchmarks.overall.scoring import score_blocks
from benchmarks.overall.schema import BlockScores
from marker.converters.pdf import PdfConverter
-def get_marker_html(marker_models: dict, pdf_bytes: bytes):
+def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
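+    # use_llm toggles marker's optional LLM-assisted processing (surfaced as the --use_llm flag in overall.py).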
block_converter = PdfConverter(
artifact_dict=marker_models,
- config={"page_range": [0], "disable_tqdm": True},
+ config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm},
renderer="marker.renderers.html.HTMLRenderer"
)
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
@@ -21,16 +22,17 @@ def get_marker_html(marker_models: dict, pdf_bytes: bytes):
return inner_html
-def marker_html_func(model_dict, sample, **kwargs) -> BlockScores:
- gt_blocks = json.loads(sample["gt_blocks"])
+def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores:
pdf_bytes = sample["pdf"] # This is a single page PDF
- marker_html = get_marker_html(model_dict, pdf_bytes)
- gt_html = [block["html"] for block in gt_blocks]
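+    # Time only the marker conversion itself; block scoring below is excluded from the reported time.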
+ start = time.time()
+ marker_html = get_marker_html(model_dict, pdf_bytes, use_llm)
+ total = time.time() - start
scores = score_blocks(gt_html, marker_html)
+ scores["time"] = total
return scores
-def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
+def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores:
uuid = sample["uuid"]
data = None
for row in mathpix_ds:
@@ -41,7 +43,6 @@ def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
raise ValueError(f"Could not find data for uuid {uuid}")
mathpix_md = data["md"]
- gt_blocks = json.loads(sample["gt_blocks"])
- gt_html = [block["html"] for block in gt_blocks]
scores = score_blocks(gt_html, mathpix_md, convert=False)
+ scores["time"] = data["time"]
return scores
diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py
index bdb1fc7c..9cf6fb01 100644
--- a/benchmarks/overall/overall.py
+++ b/benchmarks/overall/overall.py
@@ -9,7 +9,7 @@
import tabulate
from tqdm import tqdm
-from benchmarks.overall.inference import marker_html_func, mathpix_html_func
+from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
from marker.models import create_model_dict
@@ -18,7 +18,7 @@
configure_logging()
-def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult:
+def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult:
bench_scores = {}
averages_by_type = defaultdict(list)
averages_by_block_type = defaultdict(list)
@@ -29,7 +29,8 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func,
gt_blocks = json.loads(sample["gt_blocks"])
doc_type = sample["classification"]
try:
- scores = html_func(model_dict, sample, **kwargs)
+ gt_html = [block["html"] for block in gt_blocks]
+ scores = score_func(model_dict, sample, gt_html, **kwargs)
except ValueError as e:
print(f"Error with sample {idx}: {e}")
continue
@@ -40,10 +41,13 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func,
bench_scores[idx] = scores
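+    # Average per-sample time across scored samples (marker conversion time, or the time reported by mathpix).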
+ avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores)
return {
"raw_scores": bench_scores,
"averages_by_type": averages_by_type,
- "averages_by_block_type": averages_by_block_type
+ "averages_by_block_type": averages_by_block_type,
+ "average_time": avg_time,
+ "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
}
def print_scores(scores: FullResult, method: str):
@@ -73,11 +77,13 @@ def print_scores(scores: FullResult, method: str):
@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
+@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
def main(
dataset: str,
other_methods: str,
result_path: str,
- max_rows: int
+ max_rows: int,
+ use_llm: bool
):
allowed_methods = ["mathpix", ""]
methods = other_methods.split(",")
@@ -88,14 +94,14 @@ def main(
model_dict = create_model_dict()
ds = datasets.load_dataset(dataset, split="train")
- marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows)
+ marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm)
all_scores = {
"marker": marker_scores
}
if "mathpix" in methods:
mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
- mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds)
+ mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
all_scores["mathpix"] = mathpix_scores
for k,v in all_scores.items():
@@ -103,8 +109,8 @@ def main(
out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
- with open(out_path / "overall.json", "w") as f:
- json.dump(all_scores, f, indent=2)
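+    # Write UTF-8 and keep non-ASCII characters unescaped in the JSON output.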
+ with open(out_path / "overall.json", "w", encoding="utf-8") as f:
+ json.dump(all_scores, f, indent=2, ensure_ascii=False)
print(f"Results saved to {out_path}.")
diff --git a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py
index 98ffc1b8..8af5bf28 100644
--- a/benchmarks/overall/schema.py
+++ b/benchmarks/overall/schema.py
@@ -1,4 +1,4 @@
-from typing import TypedDict, List, Dict
+from typing import TypedDict, List, Dict, Optional
class BlockScores(TypedDict):
@@ -7,9 +7,12 @@ class BlockScores(TypedDict):
gt: List[str]
method: str
overall_score: float
+    time: Optional[float]  # Per-sample inference time in seconds; None when not measured
class FullResult(TypedDict):
raw_scores: Dict[int, BlockScores]
averages_by_type: Dict[str, List[float]]
averages_by_block_type: Dict[str, List[float]]
+ average_time: float
+ average_score: float
diff --git a/benchmarks/overall/scoring.py b/benchmarks/overall/scoring.py
index 1ba78bc9..713e5fef 100644
--- a/benchmarks/overall/scoring.py
+++ b/benchmarks/overall/scoring.py
@@ -12,6 +12,9 @@ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
concordant = 0
discordant = 0
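+    # With fewer than two blocks there are no pairs to compare, so the ordering is trivially perfect.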
+ if n <= 1:
+ return 100
+
for i in range(n):
for j in range(i + 1, n):
correct_sign = correct_order[i] - correct_order[j]
@@ -61,18 +64,27 @@ def convert_to_md(html):
return markdown
def standardize_markdown(markdown):
+ # Replace math expressions
     pattern = r'(?<!\\)\$(?:\$)?(.*?)(?<!\\)\$(?:\$)?'
     markdown = re.sub(pattern, standardize_math, markdown, flags=re.DOTALL)
     markdown = markdown.replace("<br>", "\n")
     markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
     markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
+    markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
+ # Clean up markdown
markdown = re.sub(r"\s+", " ", markdown)
markdown = re.sub(r"\n+", "\n", markdown)
markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents
markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters
+    markdown = markdown.encode().decode('unicode-escape') # Interpret literal unicode escape sequences left in the text
return markdown.strip().lower()
@@ -116,10 +128,14 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
alignments = find_fuzzy_alignments(method_html, gt)
scores = [alignment["score"] for alignment in alignments]
+
+ # Find order score
orders = [alignment["start"] for alignment in alignments]
- correct_order = range(len(gt))
+ correct_order = list(range(len(gt)))
actual_order = sorted(range(len(gt)), key=lambda x: orders[x])
order_score = kendall_tau(correct_order, actual_order)
+
+ # Weight score by sequence length
gt_weights = [len(g) for g in gt]
weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
@@ -131,5 +147,6 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
"order_score": order_score,
"gt": gt,
"method": method_html,
- "overall_score": overall_score
+ "overall_score": overall_score,
+ "time": None
}
\ No newline at end of file
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index 448e32fe..75b4c613 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -49,7 +49,7 @@ def extract_tables(children: List[JSONBlockOutput]):
@click.command(help="Benchmark Table to HTML Conversion")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
-@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
+@click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use")
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
@@ -222,9 +222,9 @@ def main(
"gemini": gemini_results
}
- out_path = Path(result_path) / "table.json"
+ out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
- with open(out_path, "w+") as f:
+ with open(out_path / "table.json", "w+") as f:
json.dump(results, f, indent=2)
print(f"Results saved to {out_path}.")
diff --git a/benchmarks/verify_scores.py b/benchmarks/verify_scores.py
index 913081e9..defff1c7 100644
--- a/benchmarks/verify_scores.py
+++ b/benchmarks/verify_scores.py
@@ -6,11 +6,9 @@ def verify_scores(file_path):
with open(file_path, 'r') as file:
data = json.load(file)
- multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
- switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
-
- if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40:
- raise ValueError("One or more scores are below the required threshold of 0.4")
+ marker_score = data["marker"]["average_score"]
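+    # average_score is on a 0-100 scale; fail the check if the benchmark average drops below 90.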
+ if marker_score < 90:
+        raise ValueError(f"Marker average score {marker_score} is below the required threshold of 90")
def verify_table_scores(file_path):