Refactor benchmarks
VikParuchuri committed Jan 30, 2025
1 parent 70c0b0e commit bbf4161
Showing 6 changed files with 199 additions and 153 deletions.
Empty file added benchmarks/overall/__init__.py
Empty file.
80 changes: 54 additions & 26 deletions benchmarks/overall/overall.py
@@ -1,13 +1,14 @@
import json
import os
import traceback
from collections import defaultdict
from pathlib import Path
from typing import Dict

import click
import datasets
import tabulate
from tqdm import tqdm
import pypdfium2 as pdfium

from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
from benchmarks.overall.schema import FullResult
@@ -28,12 +29,17 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f

gt_blocks = json.loads(sample["gt_blocks"])
doc_type = sample["classification"]

try:
gt_html = [block["html"] for block in gt_blocks]
scores = score_func(model_dict, sample, gt_html, **kwargs)
except ValueError as e:
print(f"Error with sample {idx}: {e}")
continue
except pdfium.PdfiumError as e:
print(f"Error opening pdf: {e}")
continue

averages_by_type[doc_type].append(scores["overall_score"])

for score, gt_block in zip(scores["scores"], gt_blocks):
@@ -50,27 +56,48 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
}

def print_scores(scores: FullResult, method: str):
averages_by_type = scores["averages_by_type"]
averages_by_block_type = scores["averages_by_block_type"]
bench_scores = scores["raw_scores"]

for k in averages_by_type:
averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k])
averages_by_type = sorted(averages_by_type.items())

print(f"Scores for method {method}:")
print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github"))

for k in averages_by_block_type:
averages_by_block_type[k] = sum(averages_by_block_type[k]) / len(averages_by_block_type[k])
averages_by_block_type = sorted(averages_by_block_type.items())

print(tabulate.tabulate(averages_by_block_type, headers=["Block Type", "Average Score"], tablefmt="github"))

overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github"))
print()
def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
inference_types = [default_method] + [k for k in scores.keys() if k != default_method]

document_types = list(scores[default_method]["averages_by_type"].keys())
document_rows = [[k] for k in document_types]
for k in inference_types:
for i, doc_type in enumerate(document_types):
avg = sum(scores[k]["averages_by_type"][doc_type]) / max(1, len(scores[k]["averages_by_type"][doc_type]))
document_rows[i].append(avg)

print("Document types")
document_type_table = tabulate.tabulate(document_rows, headers=["Document Type"] + inference_types, tablefmt="github")
print(document_type_table)
with open(out_path / "document_types.md", "w", encoding="utf-8") as f:
f.write(document_type_table)

block_types = list(scores[default_method]["averages_by_block_type"].keys())
block_rows = [[k] for k in block_types]
for k in inference_types:
for i, block_type in enumerate(block_types):
avg = sum(scores[k]["averages_by_block_type"][block_type]) / max(1, len(scores[k]["averages_by_block_type"][block_type]))
block_rows[i].append(avg)

print("Block types")
block_type_table = tabulate.tabulate(block_rows, headers=["Block Type"] + inference_types, tablefmt="github")
print(block_type_table)
with open(out_path / "block_types.md", "w", encoding="utf-8") as f:
f.write(block_type_table)

headers = ["Method", "Avg Score", "Avg Time"]
inference_rows = [[k] for k in inference_types]
for i, k in enumerate(inference_types):
inference_rows[i].append(scores[k]["average_score"])
inference_rows[i].append(scores[k]["average_time"])

print("Overall")
overall_table = tabulate.tabulate(inference_rows, headers=headers, tablefmt="github")
print(overall_table)
with open(out_path / "overall.md", "w", encoding="utf-8") as f:
f.write(overall_table)

print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")

@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@@ -85,6 +112,9 @@ def main(
max_rows: int,
use_llm: bool
):
out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)

allowed_methods = ["mathpix", ""]
methods = other_methods.split(",")
for method in methods:
@@ -104,11 +134,9 @@ def main(
mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
all_scores["mathpix"] = mathpix_scores

for k,v in all_scores.items():
print_scores(v, k)
# Display formatted score tables
print_scores(all_scores, out_path)

out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
with open(out_path / "overall.json", "w", encoding="utf-8") as f:
json.dump(all_scores, f, indent=2, ensure_ascii=False)

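The per-block scoring itself comes from benchmarks.overall.inference (imported at the top of the file) and is not shown in this diff. As a rough, illustrative stand-in for the 0-100 edit-distance metric described in the closing print statement of print_scores, a minimal scorer could look like the sketch below; it uses difflib's similarity ratio as an approximation, and the repository's actual marker_scoring_func may differ.

from difflib import SequenceMatcher

def edit_distance_score(gt_markdown: str, pred_markdown: str) -> float:
    # Illustrative only: normalized similarity between a ground-truth markdown
    # block and the predicted markdown, scaled to 0-100. The real scoring
    # function lives in benchmarks/overall/inference.py and is not part of this diff.
    ratio = SequenceMatcher(None, gt_markdown, pred_markdown).ratio()  # 0.0-1.0
    return ratio * 100

print(edit_distance_score("# Heading\n\nSome text", "# Heading\n\nSome text."))  # close to 100 for near-identical blocks

In get_method_scores, per-block scores like these are grouped by document type and block type, and print_scores then tabulates the averages per method.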
Empty file added benchmarks/table/__init__.py
Empty file.
139 changes: 139 additions & 0 deletions benchmarks/table/inference.py
@@ -0,0 +1,139 @@
import datasets
import numpy as np
from bs4 import BeautifulSoup
import pypdfium2 as pdfium
from tqdm import tqdm
import base64
import tempfile

from benchmarks.table.gemini import gemini_table_rec
from marker.config.parser import ConfigParser
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.util import matrix_intersection_area


def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
models = create_model_dict()
config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
total_unaligned = 0
results = []

dataset = datasets.load_dataset(dataset, split='train')
dataset = dataset.shuffle(seed=0)

iterations = len(dataset)
if max_rows is not None:
iterations = min(max_rows, len(dataset))

for i in tqdm(range(iterations), desc='Converting Tables'):
try:
row = dataset[i]
pdf_binary = base64.b64decode(row['pdf'])
gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns

converter = TableConverter(
config=config_parser.generate_config_dict(),
artifact_dict=models,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer()
)

with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
temp_pdf_file.write(pdf_binary)
temp_pdf_file.seek(0)
marker_json = converter(temp_pdf_file.name).children

doc = pdfium.PdfDocument(temp_pdf_file.name)
page_image = doc[0].render(scale=92 / 72).to_pil()

if len(marker_json) == 0 or len(gt_tables) == 0:
print('No tables detected, skipping...')
total_unaligned += len(gt_tables)
continue

marker_tables = extract_tables(marker_json)
marker_table_boxes = [table.bbox for table in marker_tables]
page_bbox = marker_json[0].bbox
w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
table_images = [
page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox
in marker_table_boxes]

# Normalize the bboxes
for bbox in marker_table_boxes:
bbox[0] = bbox[0] / page_bbox[2]
bbox[1] = bbox[1] / page_bbox[3]
bbox[2] = bbox[2] / page_bbox[2]
bbox[3] = bbox[3] / page_bbox[3]

gt_boxes = [table['normalized_bbox'] for table in gt_tables]
gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

aligned_tables = []
used_tables = set()
unaligned_tables = set()
for table_idx, alignment in enumerate(table_alignments):
try:
max_area = np.max(alignment)
aligned_idx = np.argmax(alignment)
except ValueError:
# No alignment found
unaligned_tables.add(table_idx)
continue

if aligned_idx in used_tables:
# Marker table already aligned with another gt table
unaligned_tables.add(table_idx)
continue

# Gt table doesn't align well with any marker table
gt_table_pct = gt_areas[table_idx] / max_area
if not .75 < gt_table_pct < 1.25:
unaligned_tables.add(table_idx)
continue

# Marker table doesn't align with gt table
marker_table_pct = marker_areas[aligned_idx] / max_area
if not .75 < marker_table_pct < 1.25:
unaligned_tables.add(table_idx)
continue

gemini_html = ""
if use_gemini:
gemini_html = gemini_table_rec(table_images[aligned_idx])

aligned_tables.append(
(marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
)
used_tables.add(aligned_idx)

total_unaligned += len(unaligned_tables)

for marker_table, gt_table, gemini_table in aligned_tables:
gt_table_html = gt_table['html']

# marker wraps the table in <tbody> which fintabnet data doesn't
# Fintabnet doesn't use th tags, need to be replaced for fair comparison
marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
tbody = marker_table_soup.find('tbody')
if tbody:
tbody.unwrap()
for th_tag in marker_table_soup.find_all('th'):
th_tag.name = 'td'
marker_table_html = str(marker_table_soup)
marker_table_html = marker_table_html.replace("<br>", " ") # Fintabnet uses spaces instead of newlines
marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines

results.append({
"marker_table": marker_table_html,
"gt_table": gt_table_html,
"gemini_table": gemini_table_html
})
except pdfium.PdfiumError:
print('Broken PDF, Skipping...')
continue
return results, total_unaligned
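The alignment step above leans on marker.util.matrix_intersection_area, whose implementation is outside this diff; from the way its result is indexed, it is assumed to return a (num_gt x num_marker) matrix of intersection areas over normalized [x0, y0, x1, y1] boxes. A minimal sketch of that idea, useful for following the 0.75-1.25 area-ratio checks above:

import numpy as np

def intersection_area_matrix(gt_boxes, pred_boxes):
    # Pairwise intersection areas between two lists of normalized bboxes.
    # Row i / column j holds the overlap between gt_boxes[i] and pred_boxes[j].
    gt = np.asarray(gt_boxes, dtype=float)      # shape (G, 4)
    pred = np.asarray(pred_boxes, dtype=float)  # shape (P, 4)
    x0 = np.maximum(gt[:, None, 0], pred[None, :, 0])
    y0 = np.maximum(gt[:, None, 1], pred[None, :, 1])
    x1 = np.minimum(gt[:, None, 2], pred[None, :, 2])
    y1 = np.minimum(gt[:, None, 3], pred[None, :, 3])
    return np.clip(x1 - x0, 0, None) * np.clip(y1 - y0, 0, None)

# A ground-truth table and a marker table count as aligned when each box's own
# area is within 75%-125% of their shared intersection, mirroring the
# gt_table_pct and marker_table_pct checks above.
gt = [[0.10, 0.10, 0.50, 0.40]]
pred = [[0.12, 0.10, 0.50, 0.42]]
print(intersection_area_matrix(gt, pred))  # ~[[0.114]]

In the loop above, np.argmax over each row then selects the marker table with the largest overlap for each ground-truth table.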