Fix various bugs

VikParuchuri · Nov 26, 2024 · 360126e · 360126e
1 parent f26834c
commit 360126e
Show file tree

Hide file tree

Showing 11 changed files with 100 additions and 33 deletions.
diff --git a/README.md b/README.md
@@ -323,25 +323,19 @@ Pass the `debug` option to activate debug mode.  This will save images of each p
 
 Benchmarking PDF extraction quality is hard.  I've created a test set by finding books and scientific papers that have a pdf version and a latex source.  I convert the latex to text, and compare the reference to the output of text extraction methods.  It's noisy, but at least directionally correct.
 
-Benchmarks show that marker is 4x faster than nougat, and more accurate outside arXiv (nougat was trained on arXiv data).
-
 **Speed**
 
 | Method | Average Score | Time per page | Time per document |
 |--------|---------------|---------------|-------------------|
-| marker | 0.613721      | 0.631991      | 58.1432           |
-| nougat | 0.406603      | 2.59702       | 238.926           |
+| marker | 0.618355      | 0.250211      | 23.0194           |
 
 **Accuracy**
 
-First 3 are non-arXiv books, last 3 are arXiv papers.
-
 | Method | multicolcnn.pdf | switch_trans.pdf | thinkpython.pdf | thinkos.pdf | thinkdsp.pdf | crowd.pdf |
 |--------|-----------------|------------------|-----------------|-------------|--------------|-----------|
 | marker | 0.536176        | 0.516833         | 0.70515         | 0.710657    | 0.690042     | 0.523467  |
-| nougat | 0.44009         | 0.588973         | 0.322706        | 0.401342    | 0.160842     | 0.525663  |
 
-Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker.  Benchmarks were run on an A6000 Ada.
+Peak GPU memory usage during the benchmark is `4.1GB` for marker.  Benchmarks were run on an A10.
 
 **Throughput**
 

diff --git a/marker/builders/layout.py b/marker/builders/layout.py
@@ -67,6 +67,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
         for page, layout_result in zip(pages, layout_results):
             layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
             provider_page_size = page.polygon.size
+            page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model
             for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
                 block_cls = get_block_class(BlockTypes[bbox.label])
                 layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon))
@@ -114,3 +115,4 @@ def check_layout_coverage(
         if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
             text_okay = True
         return text_okay
+
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -1,13 +1,12 @@
 import os
-
-from marker.processors.footnote import FootnoteProcessor
-from marker.processors.line_numbers import LineNumbersProcessor
-
 os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
 
 from marker.processors.code import CodeProcessor
 from marker.processors.document_toc import DocumentTOCProcessor
 from marker.providers.pdf import PdfProvider
+from marker.processors.footnote import FootnoteProcessor
+from marker.processors.line_numbers import LineNumbersProcessor
+from marker.processors.order import OrderProcessor
 
 from marker.util import strings_to_classes
 

diff --git a/marker/processors/debug.py b/marker/processors/debug.py
@@ -117,7 +117,8 @@ def draw_layout_debug_images(self, document: Document, pdf_mode=False):
     def render_layout_boxes(self, page, png_image):
         layout_bboxes = []
         layout_labels = []
-        for child in page.children:
+        for block_id in page.structure:
+            child = page.get_block(block_id)
             if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
                 continue
 
@@ -134,7 +135,8 @@ def render_layout_boxes(self, page, png_image):
             labels=order_labels,
             color="green",
             draw_bbox=False,
-            label_offset=5
+            label_offset=5,
+            label_font_size=24
         )
         return png_image
 

diff --git a/marker/processors/footnote.py b/marker/processors/footnote.py
@@ -26,8 +26,8 @@ class FootnoteProcessor(BaseProcessor):
             Default is .99
     """
     block_types = (BlockTypes.Footnote,)
-    page_bottom_threshold = .75
-    line_height_scaler = .99
+    page_bottom_threshold = .725
+    line_height_scaler = .97
 
 
     def __call__(self, document: Document):
@@ -44,8 +44,7 @@ def compute_block_stats(self, document: Document):
         line_heights = []
         for page in document.pages:
             for footnote in page.contained_blocks(document, self.block_types):
-                contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,))
-                line_heights.extend([line.polygon.height for line in contained_lines])
+                line_heights.append(footnote.line_height(document))
         return line_heights
 
 
@@ -54,11 +53,8 @@ def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_fo
         block_stats = []
 
         for block in text_blocks:
-            contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
-            line_heights = [line.polygon.height for line in contained_lines]
-
             block_stats.append({
-                "line_height": mean(line_heights) if len(line_heights) > 0 else 999,
+                "line_height": block.line_height(document),
                 "in_bottom": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold
             })
 

diff --git a/marker/processors/order.py b/marker/processors/order.py
@@ -0,0 +1,53 @@
+from statistics import mean
+
+from marker.processors import BaseProcessor
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+class OrderProcessor(BaseProcessor):
+    """
+    A processor for sorting the blocks in order if needed.  This can help when the layout image was sliced.
+
+    Only pages extracted with "pdftext" whose `layout_sliced` flag is set are reordered.
+    Every entry of `page.structure` receives a numeric sort key:
+      - a block that contains spans uses the midpoint between the first span's
+        `minimum_position` and the last span's `maximum_position`;
+      - a block without spans, or whose midpoint is not positive, takes the key of
+        the nearest keyed previous block plus one, otherwise the key of the nearest
+        keyed next block minus one, otherwise 0.
+    """
+    block_types = tuple()
+
+    def __call__(self, document: Document):
+        """Re-sort `page.structure` by the computed keys on each eligible page of `document`."""
+        for page in document.pages:
+            if page.text_extraction_method != "pdftext":
+                continue
+
+            if not page.layout_sliced:
+                continue
+
+            # Nothing to reorder on a page without blocks.  This also avoids iterating
+            # an unset structure (assumes structure can be None on empty pages -- TODO confirm).
+            if not page.structure:
+                continue
+
+            # First pass: key every block that contains at least one span.
+            block_idxs = {}
+            for block_id in page.structure:
+                block = document.get_block(block_id)
+                spans = block.contained_blocks(document, (BlockTypes.Span, ))
+                if not spans:
+                    continue
+
+                block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
+
+            # Second pass: derive a key from neighbours for every block that has no
+            # key yet, or whose key is not positive.
+            for block_id in page.structure:
+                if block_idxs.get(block_id, 0) > 0:
+                    continue
+                block = document.get_block(block_id)
+                prev_block = document.get_prev_block(block)
+                next_block = document.get_next_block(block)
+
+                # Walk backwards to the closest block that already has a key.
+                while prev_block and prev_block.id not in block_idxs:
+                    prev_block = document.get_prev_block(prev_block)
+
+                # Only look forwards when nothing usable was found behind us.
+                if not prev_block:
+                    while next_block and next_block.id not in block_idxs:
+                        next_block = document.get_next_block(next_block)
+
+                if not next_block and not prev_block:
+                    block_idxs[block_id] = 0
+                elif prev_block:
+                    block_idxs[block_id] = block_idxs[prev_block.id] + 1
+                else:
+                    block_idxs[block_id] = block_idxs[next_block.id] - 1
+
+            # sorted() is stable, so blocks with equal keys keep their current relative order.
+            page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
+
diff --git a/marker/processors/sectionheader.py b/marker/processors/sectionheader.py
@@ -44,23 +44,23 @@ def __call__(self, document: Document):
         line_heights: Dict[int, List[float]] = {}
         for page in document.pages:
             for block in page.contained_blocks(document, self.block_types):
-                line_heights[block.block_id] = []
                 if block.structure is not None:
-                    line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]
+                    line_heights[block.id] = block.line_height(document)
+                else:
+                    line_heights[block.id] = 0
 
-        flat_line_heights = [h for heights in line_heights.values() for h in heights]
+        flat_line_heights = list(line_heights.values())
         heading_ranges = self.bucket_headings(flat_line_heights)
 
         for page in document.pages:
             for block in page.children:
                 if block.block_type not in self.block_types:
                     continue
 
-                block_heights = line_heights[block.block_id]
-                if len(block_heights) > 0:
-                    avg_height = sum(block_heights) / len(block_heights)
+                block_height = line_heights[block.id]
+                if block_height > 0:
                     for idx, (min_height, max_height) in enumerate(heading_ranges):
-                        if avg_height >= min_height * self.height_tolerance:
+                        if block_height >= min_height * self.height_tolerance:
                             block.heading_level = idx + 1
                             break
 

diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py
@@ -179,3 +179,9 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect
             children=child_content,
             section_hierarchy=section_hierarchy
         )
+
+    def line_height(self, document: Document):
+        """
+        Estimate the height of a single line inside this block.
+
+        The height of the block's polygon is divided evenly by the number of
+        Line blocks it contains.  Returns 0 when the block contains no lines.
+        """
+        line_count = len(self.contained_blocks(document, (BlockTypes.Line,)))
+        return self.polygon.height / line_count if line_count else 0
diff --git a/marker/schema/blocks/footnote.py b/marker/schema/blocks/footnote.py
@@ -1,11 +1,28 @@
+import re
+
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
 
 
+def superscript(child_blocks):
+    """
+    Wrap the leading digit/symbol run of a footnote's first line in <sup> tags.
+
+    Follows the first child at every nesting level down to the deepest one.  If
+    that block is a Line, its `html` attribute is left-stripped and rewritten in
+    place.  Nothing is returned.
+    """
+    # Superscript leading symbol or digit sequence
+    first_block = None
+    # Truthiness rather than len() so that a leaf reporting `children` as None
+    # ends the walk instead of raising TypeError.
+    while child_blocks:
+        first_block = child_blocks[0]
+        child_blocks = first_block.children
+
+    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
+        # NOTE(review): \W also matches whitespace and "<", so html that begins with
+        # a tag or a quote/parenthesis gets that character superscripted too -- confirm intended.
+        digit_start = r"^([0-9\W]+)(.*)"
+        first_block.html = re.sub(digit_start, r"<sup>\1</sup>\2", first_block.html.lstrip())
+
+
 class Footnote(Block):
     block_type: BlockTypes = BlockTypes.Footnote
 
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
+
+        # Add superscripts to start.  This rewrites the html of the first leaf child
+        # in place (only when that child is a Line); `template` itself is not changed here.
+        superscript(child_blocks)
         return f"<p>{template}</p>"
diff --git a/marker/schema/blocks/picture.py b/marker/schema/blocks/picture.py
@@ -6,4 +6,4 @@ class Picture(Block):
     block_type: BlockTypes = BlockTypes.Picture
 
     def assemble_html(self, child_blocks, parent_structure):
-        return f"<p>Image {self.block_id}</p>"
+        # Emit a text placeholder that names this block by its `id`; child_blocks and
+        # parent_structure are not used.
+        return f"<p>Image {self.id}</p>"
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -9,15 +9,13 @@
 from marker.schema.groups.base import Group
 from marker.schema.polygon import PolygonBox
 
-if TYPE_CHECKING:
-    from marker.schema.document import Document
-
 
 class PageGroup(Group):
     block_type: BlockTypes = BlockTypes.Page
     lowres_image: Image.Image | None = None
     highres_image: Image.Image | None = None
     children: List[Block] | None = None
+    layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)
 
     def incr_block_id(self):
         if self.block_id is None: