From 277f2db312ee62dc801b58d3d80e362e7b839450 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 3 Feb 2025 17:14:52 -0500 Subject: [PATCH] Add order processor --- benchmarks/overall/inference.py | 10 +++++----- benchmarks/overall/overall.py | 2 +- benchmarks/overall/render.py | 14 +++++++++++--- benchmarks/overall/schema.py | 1 + marker/converters/pdf.py | 2 ++ marker/processors/order.py | 23 ++++++++++++++++++----- marker/schema/polygon.py | 3 +++ marker/util.py | 19 +++++++++++++++++++ 8 files changed, 60 insertions(+), 14 deletions(-) diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py index 717cb3b4..03c2257c 100644 --- a/benchmarks/overall/inference.py +++ b/benchmarks/overall/inference.py @@ -23,9 +23,9 @@ def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs pdf_bytes = sample["pdf"] # This is a single page PDF start = time.time() marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm) - marker_md = clean_input(marker_md) + marker_md_clean = clean_input(marker_md) total = time.time() - start - scores = score_blocks(gt_markdown, marker_md) + scores = score_blocks(gt_markdown, marker_md_clean) scores["time"] = total scores["markdown"] = marker_md return scores @@ -41,8 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwa if not data: raise ValueError(f"Could not find data for uuid {uuid}") - mathpix_md = clean_input(data["md"]) - scores = score_blocks(gt_markdown, mathpix_md) + mathpix_md_clean = clean_input(data["md"]) + scores = score_blocks(gt_markdown, mathpix_md_clean) scores["time"] = data["time"] - scores["markdown"] = mathpix_md + scores["markdown"] = data["md"] return scores diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py index e4c1fd14..291cfb3b 100644 --- a/benchmarks/overall/overall.py +++ b/benchmarks/overall/overall.py @@ -56,7 +56,7 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f 
"averages_by_type": averages_by_type, "averages_by_block_type": averages_by_block_type, "average_time": avg_time, - "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores) + "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores), } def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"): diff --git a/benchmarks/overall/render.py b/benchmarks/overall/render.py index b49e32af..ff252266 100644 --- a/benchmarks/overall/render.py +++ b/benchmarks/overall/render.py @@ -12,6 +12,7 @@ import markdown2 from playwright.sync_api import sync_playwright +from benchmarks.overall.clean import convert_to_md, clean_input from benchmarks.overall.schema import FullResult def convert_to_html(md: str): @@ -90,7 +91,13 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da ds_rows = defaultdict(dict) for idx in full_idxs: - row = ds[idx] # img, gt_blocks, classification, language, uuid + row = ds[idx] + ds_rows[idx].update({ + "img": row["img"], + "classification": row["classification"], + "language": row["language"], + "uuid": row["uuid"] + }) for method in all_scores: method_row = all_scores[method]["raw_scores"][idx] ds_rows[idx].update({ @@ -99,10 +106,11 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da f"{method}_image": markdown_to_image(method_row["markdown"]), f"{method}_time": method_row["time"] }) - gt_md = "\n\n".join([clean_input(convert_to_md(block)) for block in json.loads(row["gt_blocks"])]) + gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0] + gt_md = "\n\n".join([convert_to_md(block) for block in gt_html]) ds_rows[idx].update({ "gt_markdown": gt_md, - "gt_image": markdown_to_image(gt_md) + "gt_markdown_image": markdown_to_image(gt_md) }) out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs]) return out_dataset diff --git 
a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py index d2d725f3..668f83f1 100644 --- a/benchmarks/overall/schema.py +++ b/benchmarks/overall/schema.py @@ -15,3 +15,4 @@ class FullResult(TypedDict): averages_by_block_type: Dict[str, List[float]] average_time: float average_score: float + gt_markdown: List[str] diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 3741b760..01f69695 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -41,6 +41,7 @@ from marker.schema.registry import register_block_class from marker.util import strings_to_classes from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor +from marker.processors.order import OrderProcessor class PdfConverter(BaseConverter): @@ -59,6 +60,7 @@ class PdfConverter(BaseConverter): "Enable higher quality processing with LLMs.", ] = False default_processors: Tuple[BaseProcessor, ...] = ( + OrderProcessor, BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, diff --git a/marker/processors/order.py b/marker/processors/order.py index b28e57c3..146eaf30 100644 --- a/marker/processors/order.py +++ b/marker/processors/order.py @@ -1,4 +1,5 @@ from statistics import mean +from collections import defaultdict from marker.processors import BaseProcessor from marker.schema import BlockTypes @@ -13,41 +14,53 @@ class OrderProcessor(BaseProcessor): def __call__(self, document: Document): for page in document.pages: + # Skip OCRed pages if page.text_extraction_method != "pdftext": continue + # Skip pages without layout slicing if not page.layout_sliced: continue - block_idxs = {} + block_idxs = defaultdict(int) for block_id in page.structure: block = document.get_block(block_id) spans = block.contained_blocks(document, (BlockTypes.Span, )) if len(spans) == 0: continue + # Avg span position in original PDF block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 for block_id in page.structure: - if block_id in block_idxs and 
block_idxs[block_id] > 0: + # Already assigned block id via span position + if block_idxs[block_id] > 0: continue + block = document.get_block(block_id) prev_block = document.get_prev_block(block) next_block = document.get_next_block(block) + block_idx_add = 0 + if prev_block: + block_idx_add = 1 + while prev_block and prev_block.id not in block_idxs: prev_block = document.get_prev_block(prev_block) + block_idx_add += 1 if not prev_block: + block_idx_add = -1 while next_block and next_block.id not in block_idxs: next_block = document.get_next_block(next_block) + block_idx_add -= 1 if not next_block and not prev_block: - block_idxs[block_id] = 0 + pass elif prev_block: - block_idxs[block_id] = block_idxs[prev_block.id] + 1 + block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add else: - block_idxs[block_id] = block_idxs[next_block.id] - 1 + block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) diff --git a/marker/schema/polygon.py b/marker/schema/polygon.py index 2174bc6c..25e9ed31 100644 --- a/marker/schema/polygon.py +++ b/marker/schema/polygon.py @@ -126,6 +126,9 @@ def center_distance(self, other: PolygonBox, x_weight: float = 1, y_weight: floa else: return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight + def tl_distance(self, other: PolygonBox): + return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5 + def rescale(self, old_size, new_size): # Point is in x, y format page_width, page_height = old_size diff --git a/marker/util.py b/marker/util.py index 3dbde5f8..3586c0bb 100644 --- a/marker/util.py +++ b/marker/util.py @@ -80,3 +80,22 @@ def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float] height = np.maximum(0, max_y - min_y) return width * height # Shape: (N, M) + + +def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray: + 
if len(boxes2) == 0: + return np.zeros((len(boxes1), 0)) + if len(boxes1) == 0: + return np.zeros((0, len(boxes2))) + + boxes1 = np.array(boxes1) # Shape: (N, 4) + boxes2 = np.array(boxes2) # Shape: (M, 4) + + boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2 # Shape: (N, 2) + boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2 # Shape: (M, 2) + + boxes1_centers = boxes1_centers[:, np.newaxis, :] # Shape: (N, 1, 2) + boxes2_centers = boxes2_centers[np.newaxis, :, :] # Shape: (1, M, 2) + + distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2) # Shape: (N, M) + return distances