Merge pull request #506 from VikParuchuri/dev

Fix span id issue
VikParuchuri · Jan 27, 2025 · 228a7ba
2 parents 6d58e82 + 4ca60f4
commit 228a7ba
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 13 deletions.
diff --git a/marker/processors/table.py b/marker/processors/table.py
@@ -135,7 +135,9 @@ def __call__(self, document: Document):
             for block in page.contained_blocks(document, self.block_types):
                 intersections = matrix_intersection_area([c.polygon.bbox for c in child_contained_blocks], [block.polygon.bbox])
                 for child, intersection in zip(child_contained_blocks, intersections):
-                    if intersection > 0.95 and child.id in page.structure:
+                    # Adjust this to percentage of the child block that is enclosed by the table
+                    intersection_pct = intersection / max(child.polygon.area, 1)
+                    if intersection_pct > 0.95 and child.id in page.structure:
                         page.structure.remove(child.id)
 
     def finalize_cell_text(self, cell: SuryaTableCell):
@@ -284,7 +286,11 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
             table_idx = 0
             for block in extract_blocks:
                 if block["page_id"] == pnum:
-                    block["table_text_lines"] = page_tables[table_idx]
+                    table_text = page_tables[table_idx]
+                    if len(table_text) == 0:
+                        block["ocr_block"] = True # Re-OCR the block if pdftext didn't find any text
+                    else:
+                        block["table_text_lines"] = page_tables[table_idx]
                     table_idx += 1
             assert table_idx == len(page_tables), "Number of tables and table inputs must match"
 

diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
@@ -168,11 +168,13 @@ def convert_table(self, el, text, convert_as_inline):
     def convert_a(self, el, text, convert_as_inline):
         text = self.escape(text)
         text = re.sub(r"([\[\]])", r"\\\1", text)
-        return super().convert_a(el, self.escape(text), convert_as_inline)
+        return super().convert_a(el, text, convert_as_inline)
 
     def convert_span(self, el, text, convert_as_inline):
-        return f'<span id="{el["id"]}"/>'
-
+        if el.get("id"):
+            return f'<span id="{el["id"]}">{text}</span>'
+        else:
+            return text
 
 class MarkdownOutput(BaseModel):
     markdown: str

diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -120,11 +120,10 @@ def compute_line_block_intersections(self, provider_outputs: List[ProviderOutput
                 continue
 
             max_intersection = intersection_line.argmax()
-            if intersection_matrix[line_idx, max_intersection] > 0:
-                max_intersections[line_idx] = (
-                    intersection_matrix[line_idx, max_intersection],
-                    blocks[max_intersection].id
-                )
+            max_intersections[line_idx] = (
+                intersection_matrix[line_idx, max_intersection],
+                blocks[max_intersection].id
+            )
         return max_intersections
 
     def replace_block(self, block: Block, new_block: Block):

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.3.1"
+version = "1.3.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"

diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py
@@ -29,7 +29,7 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf
     markdown = markdown_output.markdown
 
     assert '[II.](#page-1-0)' in markdown
-    assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown
+    assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown
 
-    for ref in set([f'<span id="page-{m[0]}-{m[1]}"/>' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
+    for ref in set([f'<span id="page-{m[0]}-{m[1]}">' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
         assert ref in markdown, f"Reference {ref} not found in markdown"
diff --git a/tests/processors/test_table_processor.py b/tests/processors/test_table_processor.py
@@ -40,3 +40,26 @@ def test_avoid_double_ocr(pdf_document, detection_model, recognition_model, tabl
     table_output = renderer(pdf_document)
     assert "Participants" in table_output.markdown
 
+
+@pytest.mark.filename("multicol-blocks.pdf")
+@pytest.mark.config({"page_range": [3]})
+def test_overlap_blocks(pdf_document, detection_model, recognition_model, table_rec_model):
+    page = pdf_document.pages[0]
+    assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(pdf_document)
+
+    processor = TableProcessor(detection_model, recognition_model, table_rec_model)
+    processor(pdf_document)
+
+    assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(pdf_document)
+
+
+@pytest.mark.filename("pres.pdf")
+@pytest.mark.config({"page_range": [4]})
+def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_model):
+    processor = TableProcessor(detection_model, recognition_model, table_rec_model)
+    processor(pdf_document)
+
+    renderer = MarkdownRenderer()
+    table_output = renderer(pdf_document)
+    assert "1.2E-38" in table_output.markdown
+