diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index 5d49aa1c..00000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Integration test with benchmark
-
-on: [push]
-
-env:
-  TORCH_DEVICE: "cpu"
-
-jobs:
-  benchmark:
-    runs-on: [ubuntu-latest, windows-latest]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python 3.11
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.11
-      - name: Install python dependencies
-        run: |
-          pip install poetry
-          poetry install
-          poetry remove torch
-          poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
-      - name: Download benchmark data
-        run: |
-          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
-          unzip -o benchmark_data.zip
-      - name: Run benchmark test
-        run: |
-          poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
-          poetry run python benchmarks/verify_scores.py report.json --type marker
-
-
-
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 00000000..14986669
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,31 @@
+name: Integration test
+
+on: [push]
+
+env:
+  PYTHONIOENCODING: "utf-8"
+
+jobs:
+  benchmark:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ ubuntu-latest, windows-latest ]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - name: Install python dependencies
+        run: |
+          pip install poetry
+          poetry install
+      - name: Run benchmark test
+        run: |
+          poetry run python benchmarks/overall/overall.py --max_rows 5
+          poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker
+      - name: Run table benchmark
+        run: |
+          poetry run python benchmarks/table/table.py --max_rows 5
+          poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index af4e92e8..84137df5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,10 +2,6 @@ name: CI tests
 
 on: [push]
 
-env:
-  TORCH_DEVICE: "cpu"
-  OCR_ENGINE: "surya"
-
 jobs:
   tests:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
index 217e4221..06230580 100644
--- a/.github/workflows/scripts.yml
+++ b/.github/workflows/scripts.yml
@@ -2,10 +2,6 @@ name: Test CLI scripts
 
 on: [push]
 
-env:
-  TORCH_DEVICE: "cpu"
-  OCR_ENGINE: "surya"
-
 jobs:
   tests:
     runs-on: ubuntu-latest
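# --- Review note (not part of the patch) -------------------------------------
# The new benchmarks.yml exports PYTHONIOENCODING=utf-8 in place of the old
# TORCH_DEVICE/OCR_ENGINE variables. On the Windows runner added via the matrix,
# stdio defaults to a legacy code page, so printing benchmark output with
# non-ASCII characters can raise UnicodeEncodeError. A minimal sketch of the
# effect (standard library only, nothing assumed beyond the env var):
import sys

# Prints "utf-8" when PYTHONIOENCODING=utf-8 is exported, otherwise a platform
# default such as "cp1252" on Windows runners.
print(sys.stdout.encoding)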
diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py
index f312429b..1b504cff 100644
--- a/benchmarks/overall/inference.py
+++ b/benchmarks/overall/inference.py
@@ -1,15 +1,16 @@
-import json
 import tempfile
+import time
+
 from bs4 import BeautifulSoup
 from benchmarks.overall.scoring import score_blocks
 from benchmarks.overall.schema import BlockScores
 from marker.converters.pdf import PdfConverter
 
 
-def get_marker_html(marker_models: dict, pdf_bytes: bytes):
+def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
     block_converter = PdfConverter(
         artifact_dict=marker_models,
-        config={"page_range": [0], "disable_tqdm": True},
+        config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm},
         renderer="marker.renderers.html.HTMLRenderer"
     )
     with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
@@ -21,16 +22,17 @@ def get_marker_html(marker_models: dict, pdf_bytes: bytes):
     return inner_html
 
 
-def marker_html_func(model_dict, sample, **kwargs) -> BlockScores:
-    gt_blocks = json.loads(sample["gt_blocks"])
+def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores:
     pdf_bytes = sample["pdf"] # This is a single page PDF
-    marker_html = get_marker_html(model_dict, pdf_bytes)
-    gt_html = [block["html"] for block in gt_blocks]
+    start = time.time()
+    marker_html = get_marker_html(model_dict, pdf_bytes, use_llm)
+    total = time.time() - start
     scores = score_blocks(gt_html, marker_html)
+    scores["time"] = total
     return scores
 
 
-def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
+def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores:
    uuid = sample["uuid"]
    data = None
    for row in mathpix_ds:
@@ -41,7 +43,6 @@ def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
        raise ValueError(f"Could not find data for uuid {uuid}")
 
    mathpix_md = data["md"]
-    gt_blocks = json.loads(sample["gt_blocks"])
-    gt_html = [block["html"] for block in gt_blocks]
    scores = score_blocks(gt_html, mathpix_md, convert=False)
+    scores["time"] = data["time"]
    return scores
diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py
index bdb1fc7c..9cf6fb01 100644
--- a/benchmarks/overall/overall.py
+++ b/benchmarks/overall/overall.py
@@ -9,7 +9,7 @@ import tabulate
 from tqdm import tqdm
 
-from benchmarks.overall.inference import marker_html_func, mathpix_html_func
+from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
 from benchmarks.overall.schema import FullResult
 from marker.logger import configure_logging
 from marker.models import create_model_dict
@@ -18,7 +18,7 @@ configure_logging()
 
 
-def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult:
+def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult:
     bench_scores = {}
     averages_by_type = defaultdict(list)
     averages_by_block_type = defaultdict(list)
@@ -29,7 +29,8 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func,
         gt_blocks = json.loads(sample["gt_blocks"])
         doc_type = sample["classification"]
         try:
-            scores = html_func(model_dict, sample, **kwargs)
+            gt_html = [block["html"] for block in gt_blocks]
+            scores = score_func(model_dict, sample, gt_html, **kwargs)
         except ValueError as e:
             print(f"Error with sample {idx}: {e}")
             continue
@@ -40,10 +41,13 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func,
 
         bench_scores[idx] = scores
 
+    avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores)
     return {
         "raw_scores": bench_scores,
         "averages_by_type": averages_by_type,
-        "averages_by_block_type": averages_by_block_type
+        "averages_by_block_type": averages_by_block_type,
+        "average_time": avg_time,
+        "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
     }
 
 def print_scores(scores: FullResult, method: str):
@@ -73,11 +77,13 @@ def print_scores(scores: FullResult, method: str):
 @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
 @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
+@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
 def main(
         dataset: str,
         other_methods: str,
         result_path: str,
-        max_rows: int
+        max_rows: int,
+        use_llm: bool
 ):
     allowed_methods = ["mathpix", ""]
     methods = other_methods.split(",")
@@ -88,14 +94,14 @@ def main(
     model_dict = create_model_dict()
     ds = datasets.load_dataset(dataset, split="train")
 
-    marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows)
+    marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm)
     all_scores = {
         "marker": marker_scores
     }
 
     if "mathpix" in methods:
         mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
-        mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds)
+        mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
         all_scores["mathpix"] = mathpix_scores
 
     for k,v in all_scores.items():
@@ -103,8 +109,8 @@ def main(
 
     out_path = Path(result_path)
     out_path.mkdir(parents=True, exist_ok=True)
-    with open(out_path / "overall.json", "w") as f:
-        json.dump(all_scores, f, indent=2)
+    with open(out_path / "overall.json", "w", encoding="utf-8") as f:
+        json.dump(all_scores, f, indent=2, ensure_ascii=False)
 
     print(f"Results saved to {out_path}.")
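# --- Review note (not part of the patch) -------------------------------------
# get_method_scores() now extracts gt_html once and hands it to a pluggable
# score_func(model_dict, sample, gt_html, **kwargs) that returns a BlockScores
# dict including a "time" entry. A hypothetical extra method could hook in as
# sketched below; dummy_scoring_func and its fixed output values are
# illustrative only, not part of the benchmark code.
import time
from typing import List

def dummy_scoring_func(model_dict, sample, gt_html: List[str], **kwargs) -> dict:
    start = time.time()
    method_html = "<p>stand-in for real converter output</p>"  # a real method would convert sample["pdf"] here
    return {
        "scores": [0.0 for _ in gt_html],
        "order_score": 100,
        "gt": gt_html,
        "method": method_html,
        "overall_score": 0.0,
        "time": time.time() - start,
    }

# Usage, assuming ds and model_dict as in overall.py:
#   get_method_scores(ds, model_dict, max_rows=5, score_func=dummy_scoring_func)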
Possible values: mathpix", default="") @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.") @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.") +@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.") def main( dataset: str, other_methods: str, result_path: str, - max_rows: int + max_rows: int, + use_llm: bool ): allowed_methods = ["mathpix", ""] methods = other_methods.split(",") @@ -88,14 +94,14 @@ def main( model_dict = create_model_dict() ds = datasets.load_dataset(dataset, split="train") - marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows) + marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm) all_scores = { "marker": marker_scores } if "mathpix" in methods: mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train") - mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds) + mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds) all_scores["mathpix"] = mathpix_scores for k,v in all_scores.items(): @@ -103,8 +109,8 @@ def main( out_path = Path(result_path) out_path.mkdir(parents=True, exist_ok=True) - with open(out_path / "overall.json", "w") as f: - json.dump(all_scores, f, indent=2) + with open(out_path / "overall.json", "w", encoding="utf-8") as f: + json.dump(all_scores, f, indent=2, ensure_ascii=False) print(f"Results saved to {out_path}.") diff --git a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py index 98ffc1b8..8af5bf28 100644 --- a/benchmarks/overall/schema.py +++ b/benchmarks/overall/schema.py @@ -1,4 +1,4 @@ -from typing import TypedDict, List, Dict +from typing import TypedDict, List, Dict, Optional class BlockScores(TypedDict): @@ -7,9 +7,12 @@ class BlockScores(TypedDict): gt: List[str] method: str overall_score: float + time: Optional[float] class FullResult(TypedDict): raw_scores: Dict[int, BlockScores] averages_by_type: Dict[str, List[float]] averages_by_block_type: Dict[str, List[float]] + average_time: float + average_score: float diff --git a/benchmarks/overall/scoring.py b/benchmarks/overall/scoring.py index 1ba78bc9..713e5fef 100644 --- a/benchmarks/overall/scoring.py +++ b/benchmarks/overall/scoring.py @@ -12,6 +12,9 @@ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float: concordant = 0 discordant = 0 + if n <= 1: + return 100 + for i in range(n): for j in range(i + 1, n): correct_sign = correct_order[i] - correct_order[j] @@ -61,18 +64,27 @@ def convert_to_md(html): return markdown def standardize_markdown(markdown): + # Replace math expressions pattern = r'(?", "\n") markdown = re.sub(r"(.*?)", r"\1", markdown) markdown = re.sub(r"(.*?)", r"\1", markdown) + markdown = re.sub(r"(.*?)", r"\1", markdown) # Remove span tags and keep content + # Clean up markdown markdown = re.sub(r"\s+", " ", markdown) markdown = re.sub(r"\n+", "\n", markdown) markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters + markdown = markdown.encode().decode('unicode-escape') # Decode unicode characters properly return markdown.strip().lower() @@ -116,10 
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index 448e32fe..75b4c613 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -49,7 +49,7 @@ def extract_tables(children: List[JSONBlockOutput]):
 
 @click.command(help="Benchmark Table to HTML Conversion")
 @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
-@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
+@click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use")
 @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
 @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
 @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
@@ -222,9 +222,9 @@ def main(
         "gemini": gemini_results
     }
 
-    out_path = Path(result_path) / "table.json"
+    out_path = Path(result_path)
     out_path.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w+") as f:
+    with open(out_path / "table.json", "w+") as f:
         json.dump(results, f, indent=2)
 
     print(f"Results saved to {out_path}.")
diff --git a/benchmarks/verify_scores.py b/benchmarks/verify_scores.py
index 913081e9..defff1c7 100644
--- a/benchmarks/verify_scores.py
+++ b/benchmarks/verify_scores.py
@@ -6,11 +6,9 @@ def verify_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
-    multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
-    switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
-
-    if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40:
-        raise ValueError("One or more scores are below the required threshold of 0.4")
+    marker_score = data["marker"]["average_score"]
+    if marker_score < 90:
+        raise ValueError("Marker score below 90")
 
 
 def verify_table_scores(file_path):
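# --- Review note (not part of the patch) -------------------------------------
# verify_scores() now gates CI on the aggregate data["marker"]["average_score"]
# written by benchmarks/overall/overall.py rather than on per-file scores. What
# the new check amounts to, using the default CI output path from benchmarks.yml
# (illustrative sketch, not the script itself):
import json

with open("conversion_results/benchmark/overall/overall.json", encoding="utf-8") as f:
    data = json.load(f)

marker_score = data["marker"]["average_score"]
print(f"marker average_score: {marker_score:.2f}")
if marker_score < 90:
    raise ValueError("Marker score below 90")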