Refactor benchmarks
VikParuchuri committed Jan 30, 2025
1 parent 70c0b0e commit bbf4161
Showing 6 changed files with 199 additions and 153 deletions.
Empty file added benchmarks/overall/__init__.py
Empty file.
80 changes: 54 additions & 26 deletions benchmarks/overall/overall.py
@@ -1,13 +1,14 @@
import json
import os
import traceback
from collections import defaultdict
from pathlib import Path
from typing import Dict

import click
import datasets
import tabulate
from tqdm import tqdm
import pypdfium2 as pdfium

from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
from benchmarks.overall.schema import FullResult
@@ -28,12 +29,17 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f

gt_blocks = json.loads(sample["gt_blocks"])
doc_type = sample["classification"]

try:
gt_html = [block["html"] for block in gt_blocks]
scores = score_func(model_dict, sample, gt_html, **kwargs)
except ValueError as e:
print(f"Error with sample {idx}: {e}")
continue
except pdfium.PdfiumError as e:
print(f"Error opening pdf: {e}")
continue

averages_by_type[doc_type].append(scores["overall_score"])

for score, gt_block in zip(scores["scores"], gt_blocks):
@@ -50,27 +56,48 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
}

def print_scores(scores: FullResult, method: str):
averages_by_type = scores["averages_by_type"]
averages_by_block_type = scores["averages_by_block_type"]
bench_scores = scores["raw_scores"]

for k in averages_by_type:
averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k])
averages_by_type = sorted(averages_by_type.items())

print(f"Scores for method {method}:")
print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github"))

for k in averages_by_block_type:
averages_by_block_type[k] = sum(averages_by_block_type[k]) / len(averages_by_block_type[k])
averages_by_block_type = sorted(averages_by_block_type.items())

print(tabulate.tabulate(averages_by_block_type, headers=["Block Type", "Average Score"], tablefmt="github"))

overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github"))
print()
def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
inference_types = [default_method] + [k for k in scores.keys() if k != default_method]

document_types = list(scores[default_method]["averages_by_type"].keys())
document_rows = [[k] for k in document_types]
for k in inference_types:
for i, doc_type in enumerate(document_types):
avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type]))
document_rows[i].append(avg)

print("Document types")
document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github")
print(document_type_table)
with open(out_path / "document_types.md", "w", encoding="utf-8") as f:
f.write(document_type_table)

block_types = list(scores[default_method]["averages_by_block_type"].keys())
block_rows = [[k] for k in block_types]
for k in inference_types:
for i, block_type in enumerate(block_types):
avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, len(scores[k]["averages_by_block_type"][block_type]))
block_rows[i].append(avg)

print("Block types")
block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github")
print(block_type_table)
with open(out_path / "block_types.md", "w", encoding="utf-8") as f:
f.write(block_type_table)

headers = ["Method", "Avg Score", "Avg Time"]
inference_rows = [[k] for k in inference_types]
for i, k in enumerate(inference_types):
inference_rows[i].append(scores[k]["average_score"])
inference_rows[i].append(scores[k]["average_time"])

print("Overall")
overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github")
print(overall_table)
with open(out_path / "overall.md", "w", encoding="utf-8") as f:
f.write(overall_table)

print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")

@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@@ -85,6 +112,9 @@ def main(
max_rows: int,
use_llm: bool
):
out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)

allowed_methods = ["mathpix", ""]
methods = other_methods.split(",")
for method in methods:
@@ -104,11 +134,9 @@ def main(
mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
all_scores["mathpix"] = mathpix_scores

for k,v in all_scores.items():
print_scores(v, k)
# Display formatted score tables
print_scores(all_scores, out_path)

out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
with open(out_path / "overall.json", "w", encoding="utf-8") as f:
json.dump(all_scores, f, indent=2, ensure_ascii=False)

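The per-block scoring itself comes from benchmarks.overall.inference (imported at the top of the file) and is not shown in this diff. As a rough, illustrative stand-in for the 0-100 edit-distance metric described in the closing print statement of print_scores, a minimal scorer could look like the sketch below; it uses difflib's similarity ratio as an approximation, and the repository's actual marker_scoring_func may differ.

from difflib import SequenceMatcher

def edit_distance_score(gt_markdown: str, pred_markdown: str) -> float:
    # Illustrative only: normalized similarity between a ground-truth markdown
    # block and the predicted markdown, scaled to 0-100. The real scoring
    # function lives in benchmarks/overall/inference.py and is not part of this diff.
    ratio = SequenceMatcher(None, gt_markdown, pred_markdown).ratio()  # 0.0-1.0
    return ratio * 100

print(edit_distance_score("# Heading\n\nSome text", "# Heading\n\nSome text."))  # close to 100 for near-identical blocks

In get_method_scores, per-block scores like these are grouped by document type and block type, and print_scores then tabulates the averages per method.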
Empty file added benchmarks/table/__init__.py
Empty file.
139 changes: 139 additions & 0 deletions benchmarks/table/inference.py
@@ -0,0 +1,139 @@
import datasets
import numpy as np
from bs4 import BeautifulSoup
import pypdfium2 as pdfium
from tqdm import tqdm
import base64
import tempfile

from benchmarks.table.gemini import gemini_table_rec
from marker.config.parser import ConfigParser
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.util import matrix_intersection_area


def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
models = create_model_dict()
config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
total_unaligned = 0
results = []

dataset = datasets.load_dataset(dataset, split='train')
dataset = dataset.shuffle(seed=0)

iterations = len(dataset)
if max_rows is not None:
iterations = min(max_rows, len(dataset))

for i in tqdm(range(iterations), desc='Converting Tables'):
try:
row = dataset[i]
pdf_binary = base64.b64decode(row['pdf'])
gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns

converter = TableConverter(
config=config_parser.generate_config_dict(),
artifact_dict=models,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer()
)

with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
temp_pdf_file.write(pdf_binary)
temp_pdf_file.seek(0)
marker_json = converter(temp_pdf_file.name).children

doc = pdfium.PdfDocument(temp_pdf_file.name)
page_image = doc[0].render(scale=92 / 72).to_pil()

if len(marker_json) == 0 or len(gt_tables) == 0:
print('No tables detected, skipping...')
total_unaligned += len(gt_tables)
continue

marker_tables = extract_tables(marker_json)
marker_table_boxes = [table.bbox for table in marker_tables]
page_bbox = marker_json[0].bbox
w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
table_images = [
page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox
in marker_table_boxes]

# Normalize the bboxes
for bbox in marker_table_boxes:
bbox[0] = bbox[0] / page_bbox[2]
bbox[1] = bbox[1] / page_bbox[3]
bbox[2] = bbox[2] / page_bbox[2]
bbox[3] = bbox[3] / page_bbox[3]

gt_boxes = [table['normalized_bbox'] for table in gt_tables]
gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

aligned_tables = []
used_tables = set()
unaligned_tables = set()
for table_idx, alignment in enumerate(table_alignments):
try:
max_area = np.max(alignment)
aligned_idx = np.argmax(alignment)
except ValueError:
# No alignment found
unaligned_tables.add(table_idx)
continue

if aligned_idx in used_tables:
# Marker table already aligned with another gt table
unaligned_tables.add(table_idx)
continue

# Gt table doesn't align well with any marker table
gt_table_pct = gt_areas[table_idx] / max_area
if not .75 < gt_table_pct < 1.25:
unaligned_tables.add(table_idx)
continue

# Marker table doesn't align with gt table
marker_table_pct = marker_areas[aligned_idx] / max_area
if not .75 < marker_table_pct < 1.25:
unaligned_tables.add(table_idx)
continue

gemini_html = ""
if use_gemini:
gemini_html = gemini_table_rec(table_images[aligned_idx])

aligned_tables.append(
(marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
)
used_tables.add(aligned_idx)

total_unaligned += len(unaligned_tables)

for marker_table, gt_table, gemini_table in aligned_tables:
gt_table_html = gt_table['html']

# marker wraps the table in <tbody> which fintabnet data doesn't
# Fintabnet doesn't use th tags, need to be replaced for fair comparison
marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
tbody = marker_table_soup.find('tbody')
if tbody:
tbody.unwrap()
for th_tag in marker_table_soup.find_all('th'):
th_tag.name = 'td'
marker_table_html = str(marker_table_soup)
marker_table_html = marker_table_html.replace("<br>", " ") # Fintabnet uses spaces instead of newlines
marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines

results.append({
"marker_table": marker_table_html,
"gt_table": gt_table_html,
"gemini_table": gemini_table_html
})
except pdfium.PdfiumError:
print('Broken PDF, Skipping...')
continue
return results, total_unaligned
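The alignment step above leans on marker.util.matrix_intersection_area, whose implementation is outside this diff; from the way its result is indexed, it is assumed to return a (num_gt x num_marker) matrix of intersection areas over normalized [x0, y0, x1, y1] boxes. A minimal sketch of that idea, useful for following the 0.75-1.25 area-ratio checks above:

import numpy as np

def intersection_area_matrix(gt_boxes, pred_boxes):
    # Pairwise intersection areas between two lists of normalized bboxes.
    # Row i / column j holds the overlap between gt_boxes[i] and pred_boxes[j].
    gt = np.asarray(gt_boxes, dtype=float)      # shape (G, 4)
    pred = np.asarray(pred_boxes, dtype=float)  # shape (P, 4)
    x0 = np.maximum(gt[:, None, 0], pred[None, :, 0])
    y0 = np.maximum(gt[:, None, 1], pred[None, :, 1])
    x1 = np.minimum(gt[:, None, 2], pred[None, :, 2])
    y1 = np.minimum(gt[:, None, 3], pred[None, :, 3])
    return np.clip(x1 - x0, 0, None) * np.clip(y1 - y0, 0, None)

# A ground-truth table and a marker table count as aligned when each box's own
# area is within 75%-125% of their shared intersection, mirroring the
# gt_table_pct and marker_table_pct checks above.
gt = [[0.10, 0.10, 0.50, 0.40]]
pred = [[0.12, 0.10, 0.50, 0.42]]
print(intersection_area_matrix(gt, pred))  # ~[[0.114]]

In the loop above, np.argmax over each row then selects the marker table with the largest overlap for each ground-truth table.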