From 9f043f1d65a13bccc5b694ab4d04929f6aacb5fe Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Fri, 3 May 2024 10:37:44 -0700
Subject: [PATCH] Fix headings

---
 README.md                     |   2 +-
 marker/equations/equations.py | 111 +++++++++++++++++++++++++---------
 marker/models.py              |   2 +-
 marker/schema/bbox.py         |   4 +-
 4 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index 5ca03ec3..396433de 100644
--- a/README.md
+++ b/README.md
@@ -199,7 +199,7 @@ Omit `--nougat` to exclude nougat from the benchmark. I don't recommend running
 
 # Commercial usage
 
-All models were trained from scratch, so they're okay for commercial usage. The weights for the models are licensed cc-by-nc-sa-4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period.
+All models were trained from scratch, so they're okay for commercial usage. The weights for the models are licensed cc-by-nc-sa-4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised.
 
 If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at marker@vikas.sh for dual licensing.
 
diff --git a/marker/equations/equations.py b/marker/equations/equations.py
index a66b3d3e..8bdb0709 100644
--- a/marker/equations/equations.py
+++ b/marker/equations/equations.py
@@ -1,66 +1,120 @@
+from collections import defaultdict
 from copy import deepcopy
 from typing import List
 
 from marker.debug.data import dump_equation_debug_data
 from marker.equations.images import get_equation_image
 from marker.equations.inference import get_total_texify_tokens, get_latex_batched
+from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
-from marker.schema.block import Line, Span, Block
+from marker.schema.block import Line, Span, Block, bbox_from_lines
 from marker.settings import settings
 
 
 def find_equation_blocks(page, processor):
+    """Pull the lines covered by "Formula" layout regions out of the page blocks.
+
+    A line belongs to a region when line.intersection_pct(region) exceeds .8. For
+    every region whose joined line text is under settings.TEXIFY_MODEL_MAX tokens,
+    those lines are removed from their blocks and an (insert_point, token_count,
+    text, bbox) tuple is returned; insert_point is an index into page.blocks.
+    """
     equation_blocks = []
-    for i in range(len(page.blocks)):
-        block = page.blocks[i]
-        block_text = block.prelim_text
-        # Check if the block is an equation
-        if not block.block_type in ["Formula"]:
+    equation_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Formula"]]
+    equation_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in equation_regions]
+
+    lines_to_remove = defaultdict(list)
+    insert_points = {}
+    equation_lines = defaultdict(list)
+    for region_idx, region in enumerate(equation_regions):
+        for block_idx, block in enumerate(page.blocks):
+            for line_idx, line in enumerate(block.lines):
+                if line.intersection_pct(region) > .8:
+                    # We will remove this line from the block
+                    lines_to_remove[region_idx].append((block_idx, line_idx))
+                    equation_lines[region_idx].append(line)
+
+                    if region_idx not in insert_points:
+                        # Insert before the block if line is at the beginning of the block, otherwise after the block
+                        # Must stay a plain int: insert_latex_block uses it as a list.insert index
+                        if line_idx <= len(block.lines) // 2:
+                            insert_points[region_idx] = block_idx
+                        else:
+                            insert_points[region_idx] = block_idx + 1
+
+    block_lines_to_remove = defaultdict(set)
+    for region_idx, equation_region in enumerate(equation_regions):
+        if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
             continue
+        equation_block = equation_lines[region_idx]
+        equation_insert = insert_points[region_idx]
+        block_text = " ".join([line.prelim_text for line in equation_block])
+        equation_bbox = bbox_from_lines(equation_block)
 
         total_tokens = get_total_texify_tokens(block_text, processor)
-        selected_blocks = (i, total_tokens)
+        selected_blocks = (equation_insert, total_tokens, block_text, equation_bbox)
         if total_tokens < settings.TEXIFY_MODEL_MAX:
+            for item in lines_to_remove[region_idx]:
+                block_lines_to_remove[item[0]].add(item[1])
             equation_blocks.append(selected_blocks)
 
+    # Remove the lines from the blocks
+    for block_idx, bad_lines in block_lines_to_remove.items():
+        block = page.blocks[block_idx]
+        block.lines = [line for idx, line in enumerate(block.lines) if idx not in bad_lines]
+
     return equation_blocks
 
 
-def replace_blocks_with_latex(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
+def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnum, processor):
+    """Insert one "Formula" block per entry of page_equation_blocks into the page.
+
+    The block holds the matching prediction when it passes the checks below, and
+    the original line text otherwise. Returns (success_count, fail_count,
+    converted_spans); converted_spans are copies of the spans that received LaTeX.
+    """
     converted_spans = []
     idx = 0
     success_count = 0
     fail_count = 0
-    for block_number, (block_idx, token_count) in enumerate(page_equation_blocks):
-        block = page_blocks.blocks[block_idx]
-        orig_block_text = block.prelim_text
+    for block_number, (insert_point, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
         latex_text = predictions[block_number]
         conditions = [
-            len(latex_text) > 0,
-            get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX,  # Make sure we didn't run to the overall token max
-            len(latex_text) > len(orig_block_text) * .8,
+            get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX,  # Make sure we didn't get to the overall token max, indicates run-on
+            len(latex_text) > len(block_text) * .7,
             len(latex_text.strip()) > 0
         ]
 
-        if not all(conditions):
-            fail_count += 1
-        else:
-            success_count += 1
-            block_line = Line(
+        new_block = Block(
+            lines=[Line(
                 spans=[
                     Span(
-                        text=latex_text,
-                        bbox=block.bbox,
+                        text=block_text.replace("\n", " "),
+                        bbox=equation_bbox,
                         span_id=f"{pnum}_{idx}_fixeq",
                         font="Latex",
                         font_weight=0,
                         font_size=0
                     )
                 ],
-                bbox=block.bbox
-            )
-            block.lines = [block_line]
-            converted_spans.append(deepcopy(block_line.spans[0]))
+                bbox=equation_bbox
+            )],
+            bbox=equation_bbox,
+            block_type="Formula",
+            pnum=pnum
+        )
+
+        if not all(conditions):
+            fail_count += 1
+        else:
+            success_count += 1
+            new_block.lines[0].spans[0].text = latex_text
+            converted_spans.append(deepcopy(new_block.lines[0].spans[0]))
+
+        # Insert points index the block list as it was before any insertion, so
+        # shift past the equation blocks already inserted at or before this point
+        shift = sum(1 for prior in page_equation_blocks[:block_number] if prior[0] <= insert_point)
+        page_blocks.blocks.insert(insert_point + shift, new_block)
 
     return success_count, fail_count, converted_spans
 
@@ -80,9 +134,8 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
     token_counts = []
     for page_idx, page_equation_blocks in enumerate(equation_blocks):
         page_obj = doc[page_idx]
-        for equation_idx, (block_idx, token_count) in enumerate(page_equation_blocks):
-            bbox = pages[page_idx].blocks[block_idx].bbox
-            png_image = get_equation_image(page_obj, pages[page_idx], bbox)
+        for equation_idx, (insert_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
+            png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
             images.append(png_image)
             token_counts.append(token_count)
 
@@ -96,7 +149,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
     for page_idx, page_equation_blocks in enumerate(equation_blocks):
         page_equation_count = len(page_equation_blocks)
         page_predictions = predictions[page_start:page_start + page_equation_count]
-        success_count, fail_count, converted_span = replace_blocks_with_latex(
+        success_count, fail_count, converted_span = insert_latex_block(
             pages[page_idx],
             page_equation_blocks,
             page_predictions,
diff --git a/marker/models.py b/marker/models.py
index fe100e26..72cb2b62 100644
--- a/marker/models.py
+++ b/marker/models.py
@@ -32,7 +32,7 @@ def setup_texify_model():
 
 def setup_layout_model():
     model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
-    processor = segformer.load_processor()
+    processor = segformer.load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
     model.processor = processor
     return model
 
diff --git a/marker/schema/bbox.py b/marker/schema/bbox.py
index 6919feba..0469374a 100644
--- a/marker/schema/bbox.py
+++ b/marker/schema/bbox.py
@@ -100,5 +100,5 @@ def rescale_bbox(orig_dim, new_dim, bbox):
     width_scaler = detected_width / page_width
     height_scaler = detected_height / page_height
 
-    bbox = [bbox[0] / width_scaler, bbox[1] / height_scaler, bbox[2] / width_scaler, bbox[3] / height_scaler]
-    return bbox
+    new_bbox = [bbox[0] / width_scaler, bbox[1] / height_scaler, bbox[2] / width_scaler, bbox[3] / height_scaler]
+    return new_bbox