diff --git a/benchmarks/overall/__init__.py b/benchmarks/overall/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py index 9cf6fb01..e1245094 100644 --- a/benchmarks/overall/overall.py +++ b/benchmarks/overall/overall.py @@ -1,13 +1,14 @@ import json import os -import traceback from collections import defaultdict from pathlib import Path +from typing import Dict import click import datasets import tabulate from tqdm import tqdm +import pypdfium2 as pdfium from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func from benchmarks.overall.schema import FullResult @@ -28,12 +29,17 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f gt_blocks = json.loads(sample["gt_blocks"]) doc_type = sample["classification"] + try: gt_html = [block["html"] for block in gt_blocks] scores = score_func(model_dict, sample, gt_html, **kwargs) except ValueError as e: print(f"Error with sample {idx}: {e}") continue + except pdfium.PdfiumError as e: + print(f"Error opening pdf: {e}") + continue + averages_by_type[doc_type].append(scores["overall_score"]) for score, gt_block in zip(scores["scores"], gt_blocks): @@ -50,27 +56,48 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores) } -def print_scores(scores: FullResult, method: str): - averages_by_type = scores["averages_by_type"] - averages_by_block_type = scores["averages_by_block_type"] - bench_scores = scores["raw_scores"] - - for k in averages_by_type: - averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k]) - averages_by_type = sorted(averages_by_type.items()) - - print(f"Scores for method {method}:") - print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github")) - - for k in averages_by_block_type: - averages_by_block_type[k] = sum(averages_by_block_type[k]) / len(averages_by_block_type[k]) - averages_by_block_type = sorted(averages_by_block_type.items()) - - print(tabulate.tabulate(averages_by_block_type, headers=["Block Type", "Average Score"], tablefmt="github")) - - overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores) - print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github")) - print() +def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"): + inference_types = [default_method] + [k for k in scores.keys() if k != default_method] + + document_types = list(scores[default_method]["averages_by_type"].keys()) + document_rows = [[k] for k in document_types] + for k in inference_types: + for i, doc_type in enumerate(document_types): + avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type])) + document_rows[i].append(avg) + + print("Document types") + document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github") + print(document_type_table) + with open(out_path / "document_types.md", "w", encoding="utf-8") as f: + f.write(document_type_table) + + block_types = list(scores[default_method]["averages_by_block_type"].keys()) + block_rows = [[k] for k in block_types] + for k in inference_types: + for i, block_type in enumerate(block_types): + avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, 
len(scores[k]["averages_by_block_type"][block_type])) + block_rows[i].append(avg) + + print("Block types") + block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github") + print(block_type_table) + with open(out_path / "block_types.md", "w", encoding="utf-8") as f: + f.write(block_type_table) + + headers = ["Method", "Avg Score", "Avg Time"] + inference_rows = [[k] for k in inference_types] + for i, k in enumerate(inference_types): + inference_rows[i].append(scores[k]["average_score"]) + inference_rows[i].append(scores[k]["average_time"]) + + print("Overall") + overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github") + print(overall_table) + with open(out_path / "overall.md", "w", encoding="utf-8") as f: + f.write(overall_table) + + print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.") @click.command(help="Benchmark PDF to MD conversion.") @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark") @@ -85,6 +112,9 @@ def main( max_rows: int, use_llm: bool ): + out_path = Path(result_path) + out_path.mkdir(parents=True, exist_ok=True) + allowed_methods = ["mathpix", ""] methods = other_methods.split(",") for method in methods: @@ -104,11 +134,9 @@ def main( mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds) all_scores["mathpix"] = mathpix_scores - for k,v in all_scores.items(): - print_scores(v, k) + # Display formatted score tables + print_scores(all_scores, out_path) - out_path = Path(result_path) - out_path.mkdir(parents=True, exist_ok=True) with open(out_path / "overall.json", "w", encoding="utf-8") as f: json.dump(all_scores, f, indent=2, ensure_ascii=False) diff --git a/benchmarks/table/__init__.py b/benchmarks/table/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/table/inference.py b/benchmarks/table/inference.py new file mode 100644 index 00000000..7e228c93 --- /dev/null +++ b/benchmarks/table/inference.py @@ -0,0 +1,139 @@ +import datasets +import numpy as np +from bs4 import BeautifulSoup +import pypdfium2 as pdfium +from tqdm import tqdm +import base64 +import tempfile + +from benchmarks.table.gemini import gemini_table_rec +from marker.config.parser import ConfigParser +from marker.converters.table import TableConverter +from marker.models import create_model_dict +from marker.util import matrix_intersection_area + + +def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool): + models = create_model_dict() + config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True}) + total_unaligned = 0 + results = [] + + dataset = datasets.load_dataset(dataset, split='train') + dataset = dataset.shuffle(seed=0) + + iterations = len(dataset) + if max_rows is not None: + iterations = min(max_rows, len(dataset)) + + for i in tqdm(range(iterations), desc='Converting Tables'): + try: + row = dataset[i] + pdf_binary = base64.b64decode(row['pdf']) + gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns + + converter = TableConverter( + config=config_parser.generate_config_dict(), + artifact_dict=models, + processor_list=config_parser.get_processors(), + renderer=config_parser.get_renderer() + ) + 
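+            # The decoded PDF bytes are round-tripped through a temp file below so the
+            # converter (and pdfium) can be pointed at a file path; the first page is also
+            # rendered at scale 92/72 (~92 DPI) so aligned tables can be cropped out as images.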
+ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: + temp_pdf_file.write(pdf_binary) + temp_pdf_file.seek(0) + marker_json = converter(temp_pdf_file.name).children + + doc = pdfium.PdfDocument(temp_pdf_file.name) + page_image = doc[0].render(scale=92 / 72).to_pil() + + if len(marker_json) == 0 or len(gt_tables) == 0: + print(f'No tables detected, skipping...') + total_unaligned += len(gt_tables) + continue + + marker_tables = extract_tables(marker_json) + marker_table_boxes = [table.bbox for table in marker_tables] + page_bbox = marker_json[0].bbox + w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3] + table_images = [ + page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox + in marker_table_boxes] + + # Normalize the bboxes + for bbox in marker_table_boxes: + bbox[0] = bbox[0] / page_bbox[2] + bbox[1] = bbox[1] / page_bbox[3] + bbox[2] = bbox[2] / page_bbox[2] + bbox[3] = bbox[3] / page_bbox[3] + + gt_boxes = [table['normalized_bbox'] for table in gt_tables] + gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes] + marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes] + table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes) + + aligned_tables = [] + used_tables = set() + unaligned_tables = set() + for table_idx, alignment in enumerate(table_alignments): + try: + max_area = np.max(alignment) + aligned_idx = np.argmax(alignment) + except ValueError: + # No alignment found + unaligned_tables.add(table_idx) + continue + + if aligned_idx in used_tables: + # Marker table already aligned with another gt table + unaligned_tables.add(table_idx) + continue + + # Gt table doesn't align well with any marker table + gt_table_pct = gt_areas[table_idx] / max_area + if not .75 < gt_table_pct < 1.25: + unaligned_tables.add(table_idx) + continue + + # Marker table doesn't align with gt table + marker_table_pct = marker_areas[aligned_idx] / max_area + if not .75 < marker_table_pct < 1.25: + unaligned_tables.add(table_idx) + continue + + gemini_html = "" + if use_gemini: + gemini_html = gemini_table_rec(table_images[aligned_idx]) + + aligned_tables.append( + (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html) + ) + used_tables.add(aligned_idx) + + total_unaligned += len(unaligned_tables) + + for marker_table, gt_table, gemini_table in aligned_tables: + gt_table_html = gt_table['html'] + + # marker wraps the table in
+                # <tbody> which fintabnet data doesn't
+                # Fintabnet doesn't use th tags, so they need to be replaced for a fair comparison
+                marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
+                tbody = marker_table_soup.find('tbody')
+                if tbody:
+                    tbody.unwrap()
+                for th_tag in marker_table_soup.find_all('th'):
+                    th_tag.name = 'td'
+                marker_table_html = str(marker_table_soup)
+                marker_table_html = marker_table_html.replace("