From 360126e39a9f91d781b0362fc7454b1fa7d43301 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 26 Nov 2024 14:17:17 -0500 Subject: [PATCH] Fix various bugs --- README.md | 10 ++---- marker/builders/layout.py | 2 ++ marker/converters/pdf.py | 7 ++-- marker/processors/debug.py | 6 ++-- marker/processors/footnote.py | 12 +++---- marker/processors/order.py | 53 ++++++++++++++++++++++++++++++ marker/processors/sectionheader.py | 14 ++++---- marker/schema/blocks/base.py | 6 ++++ marker/schema/blocks/footnote.py | 17 ++++++++++ marker/schema/blocks/picture.py | 2 +- marker/schema/groups/page.py | 4 +-- 11 files changed, 100 insertions(+), 33 deletions(-) create mode 100644 marker/processors/order.py diff --git a/README.md b/README.md index 883c9cdc..18d3c5b1 100644 --- a/README.md +++ b/README.md @@ -323,25 +323,19 @@ Pass the `debug` option to activate debug mode. This will save images of each p Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct. -Benchmarks show that marker is 4x faster than nougat, and more accurate outside arXiv (nougat was trained on arXiv data). - **Speed** | Method | Average Score | Time per page | Time per document | |--------|---------------|---------------|-------------------| -| marker | 0.613721 | 0.631991 | 58.1432 | -| nougat | 0.406603 | 2.59702 | 238.926 | +| marker | 0.618355 | 0.250211 | 23.0194 | **Accuracy** -First 3 are non-arXiv books, last 3 are arXiv papers. - | Method | multicolcnn.pdf | switch_trans.pdf | thinkpython.pdf | thinkos.pdf | thinkdsp.pdf | crowd.pdf | |--------|-----------------|------------------|-----------------|-------------|--------------|-----------| | marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 | -| nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 | -Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker. Benchmarks were run on an A6000 Ada. +Peak GPU memory usage during the benchmark is `4.1GB` for marker. Benchmarks were run on an A10. **Throughput** diff --git a/marker/builders/layout.py b/marker/builders/layout.py index dd5795ad..d2eb2b5f 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -67,6 +67,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou for page, layout_result in zip(pages, layout_results): layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size provider_page_size = page.polygon.size + page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model for bbox in sorted(layout_result.bboxes, key=lambda x: x.position): block_cls = get_block_class(BlockTypes[bbox.label]) layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon)) @@ -114,3 +115,4 @@ def check_layout_coverage( if not text_okay and (total_blocks == 1 and large_text_blocks == 1): text_okay = True return text_okay + diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 83c6ce48..12edb5c6 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -1,13 +1,12 @@ import os - -from marker.processors.footnote import FootnoteProcessor -from marker.processors.line_numbers import LineNumbersProcessor - os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning from marker.processors.code import CodeProcessor from marker.processors.document_toc import DocumentTOCProcessor from marker.providers.pdf import PdfProvider +from marker.processors.footnote import FootnoteProcessor +from marker.processors.line_numbers import LineNumbersProcessor +from marker.processors.order import OrderProcessor from marker.util import strings_to_classes diff --git a/marker/processors/debug.py b/marker/processors/debug.py index a440bac7..05f85b58 100644 --- a/marker/processors/debug.py +++ b/marker/processors/debug.py @@ -117,7 +117,8 @@ def draw_layout_debug_images(self, document: Document, pdf_mode=False): def render_layout_boxes(self, page, png_image): layout_bboxes = [] layout_labels = [] - for child in page.children: + for block_id in page.structure: + child = page.get_block(block_id) if child.block_type in [BlockTypes.Line, BlockTypes.Span]: continue @@ -134,7 +135,8 @@ def render_layout_boxes(self, page, png_image): labels=order_labels, color="green", draw_bbox=False, - label_offset=5 + label_offset=5, + label_font_size=24 ) return png_image diff --git a/marker/processors/footnote.py b/marker/processors/footnote.py index 8817ce86..6baa945e 100644 --- a/marker/processors/footnote.py +++ b/marker/processors/footnote.py @@ -26,8 +26,8 @@ class FootnoteProcessor(BaseProcessor): Default is .99 """ block_types = (BlockTypes.Footnote,) - page_bottom_threshold = .75 - line_height_scaler = .99 + page_bottom_threshold = .725 + line_height_scaler = .97 def __call__(self, document: Document): @@ -44,8 +44,7 @@ def compute_block_stats(self, document: Document): line_heights = [] for page in document.pages: for footnote in page.contained_blocks(document, self.block_types): - contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,)) - line_heights.extend([line.polygon.height for line in contained_lines]) + line_heights.append(footnote.line_height(document)) return line_heights @@ -54,11 +53,8 @@ def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_fo block_stats = [] for block in text_blocks: - contained_lines = block.contained_blocks(document, (BlockTypes.Line,)) - line_heights = [line.polygon.height for line in contained_lines] - block_stats.append({ - "line_height": mean(line_heights) if len(line_heights) > 0 else 999, + "line_height": block.line_height(document), "in_bottom": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold }) diff --git a/marker/processors/order.py b/marker/processors/order.py new file mode 100644 index 00000000..b28e57c3 --- /dev/null +++ b/marker/processors/order.py @@ -0,0 +1,53 @@ +from statistics import mean + +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document + + +class OrderProcessor(BaseProcessor): + """ + A processor for sorting the blocks in order if needed. This can help when the layout image was sliced. + """ + block_types = tuple() + + def __call__(self, document: Document): + for page in document.pages: + if page.text_extraction_method != "pdftext": + continue + + if not page.layout_sliced: + continue + + block_idxs = {} + for block_id in page.structure: + block = document.get_block(block_id) + spans = block.contained_blocks(document, (BlockTypes.Span, )) + if len(spans) == 0: + continue + + block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 + + for block_id in page.structure: + if block_id in block_idxs and block_idxs[block_id] > 0: + continue + block = document.get_block(block_id) + prev_block = document.get_prev_block(block) + next_block = document.get_next_block(block) + + while prev_block and prev_block.id not in block_idxs: + prev_block = document.get_prev_block(prev_block) + + if not prev_block: + while next_block and next_block.id not in block_idxs: + next_block = document.get_next_block(next_block) + + if not next_block and not prev_block: + block_idxs[block_id] = 0 + elif prev_block: + block_idxs[block_id] = block_idxs[prev_block.id] + 1 + else: + block_idxs[block_id] = block_idxs[next_block.id] - 1 + + page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) + diff --git a/marker/processors/sectionheader.py b/marker/processors/sectionheader.py index fde6726a..b040832d 100644 --- a/marker/processors/sectionheader.py +++ b/marker/processors/sectionheader.py @@ -44,11 +44,12 @@ def __call__(self, document: Document): line_heights: Dict[int, List[float]] = {} for page in document.pages: for block in page.contained_blocks(document, self.block_types): - line_heights[block.block_id] = [] if block.structure is not None: - line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line] + line_heights[block.id] = block.line_height(document) + else: + line_heights[block.id] = 0 - flat_line_heights = [h for heights in line_heights.values() for h in heights] + flat_line_heights = list(line_heights.values()) heading_ranges = self.bucket_headings(flat_line_heights) for page in document.pages: @@ -56,11 +57,10 @@ def __call__(self, document: Document): if block.block_type not in self.block_types: continue - block_heights = line_heights[block.block_id] - if len(block_heights) > 0: - avg_height = sum(block_heights) / len(block_heights) + block_height = line_heights[block.id] + if block_height > 0: for idx, (min_height, max_height) in enumerate(heading_ranges): - if avg_height >= min_height * self.height_tolerance: + if block_height >= min_height * self.height_tolerance: block.heading_level = idx + 1 break diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py index 3a70a3f1..ffc000dc 100644 --- a/marker/schema/blocks/base.py +++ b/marker/schema/blocks/base.py @@ -179,3 +179,9 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect children=child_content, section_hierarchy=section_hierarchy ) + + def line_height(self, document: Document): + lines = self.contained_blocks(document, (BlockTypes.Line,)) + if len(lines) == 0: + return 0 + return self.polygon.height / len(lines) diff --git a/marker/schema/blocks/footnote.py b/marker/schema/blocks/footnote.py index d203123d..c476aa7d 100644 --- a/marker/schema/blocks/footnote.py +++ b/marker/schema/blocks/footnote.py @@ -1,11 +1,28 @@ +import re + from marker.schema import BlockTypes from marker.schema.blocks import Block +def superscript(child_blocks): + # Superscript leading symbol or digit sequence + first_block = None + while len(child_blocks) > 0: + first_block = child_blocks[0] + child_blocks = first_block.children + + if first_block is not None and first_block.id.block_type == BlockTypes.Line: + digit_start = r"^([0-9\W]+)(.*)" + first_block.html = re.sub(digit_start, r"\1\2", first_block.html.lstrip()) + + class Footnote(Block): block_type: BlockTypes = BlockTypes.Footnote def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") + + # Add superscripts to start + superscript(child_blocks) return f"

{template}

" diff --git a/marker/schema/blocks/picture.py b/marker/schema/blocks/picture.py index fa929ba1..0765c0ef 100644 --- a/marker/schema/blocks/picture.py +++ b/marker/schema/blocks/picture.py @@ -6,4 +6,4 @@ class Picture(Block): block_type: BlockTypes = BlockTypes.Picture def assemble_html(self, child_blocks, parent_structure): - return f"

Image {self.block_id}

" + return f"

Image {self.id}

" diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index b1ae6cd9..96094178 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -9,15 +9,13 @@ from marker.schema.groups.base import Group from marker.schema.polygon import PolygonBox -if TYPE_CHECKING: - from marker.schema.document import Document - class PageGroup(Group): block_type: BlockTypes = BlockTypes.Page lowres_image: Image.Image | None = None highres_image: Image.Image | None = None children: List[Block] | None = None + layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong) def incr_block_id(self): if self.block_id is None: