Skip to content

Commit

Permalink
Merge pull request #506 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Fix span id issue
  • Loading branch information
VikParuchuri authored Jan 27, 2025
2 parents 6d58e82 + 4ca60f4 commit 228a7ba
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 13 deletions.
10 changes: 8 additions & 2 deletions marker/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def __call__(self, document: Document):
for block in page.contained_blocks(document, self.block_types):
intersections = matrix_intersection_area([c.polygon.bbox for c in child_contained_blocks], [block.polygon.bbox])
for child, intersection in zip(child_contained_blocks, intersections):
if intersection > 0.95 and child.id in page.structure:
# Adjust this to percentage of the child block that is enclosed by the table
intersection_pct = intersection / max(child.polygon.area, 1)
if intersection_pct > 0.95 and child.id in page.structure:
page.structure.remove(child.id)

def finalize_cell_text(self, cell: SuryaTableCell):
Expand Down Expand Up @@ -284,7 +286,11 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
table_idx = 0
for block in extract_blocks:
if block["page_id"] == pnum:
block["table_text_lines"] = page_tables[table_idx]
table_text = page_tables[table_idx]
if len(table_text) == 0:
block["ocr_block"] = True # Re-OCR the block if pdftext didn't find any text
else:
block["table_text_lines"] = page_tables[table_idx]
table_idx += 1
assert table_idx == len(page_tables), "Number of tables and table inputs must match"

Expand Down
8 changes: 5 additions & 3 deletions marker/renderers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,13 @@ def convert_table(self, el, text, convert_as_inline):
def convert_a(self, el, text, convert_as_inline):
text = self.escape(text)
text = re.sub(r"([\[\]])", r"\\\1", text)
return super().convert_a(el, self.escape(text), convert_as_inline)
return super().convert_a(el, text, convert_as_inline)

def convert_span(self, el, text, convert_as_inline):
return f'<span id="{el["id"]}"/>'

if el.get("id"):
return f'<span id="{el["id"]}">{text}</span>'
else:
return text

class MarkdownOutput(BaseModel):
markdown: str
Expand Down
9 changes: 4 additions & 5 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,10 @@ def compute_line_block_intersections(self, provider_outputs: List[ProviderOutput
continue

max_intersection = intersection_line.argmax()
if intersection_matrix[line_idx, max_intersection] > 0:
max_intersections[line_idx] = (
intersection_matrix[line_idx, max_intersection],
blocks[max_intersection].id
)
max_intersections[line_idx] = (
intersection_matrix[line_idx, max_intersection],
blocks[max_intersection].id
)
return max_intersections

def replace_block(self, block: Block, new_block: Block):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "1.3.1"
version = "1.3.2"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <github@vikas.sh>"]
readme = "README.md"
Expand Down
4 changes: 2 additions & 2 deletions tests/builders/test_pdf_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf
markdown = markdown_output.markdown

assert '[II.](#page-1-0)' in markdown
assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown
assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown

for ref in set([f'<span id="page-{m[0]}-{m[1]}"/>' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
for ref in set([f'<span id="page-{m[0]}-{m[1]}">' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
assert ref in markdown, f"Reference {ref} not found in markdown"
23 changes: 23 additions & 0 deletions tests/processors/test_table_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,26 @@ def test_avoid_double_ocr(pdf_document, detection_model, recognition_model, tabl
table_output = renderer(pdf_document)
assert "Participants" in table_output.markdown


@pytest.mark.filename("multicol-blocks.pdf")
@pytest.mark.config({"page_range": [3]})
def test_overlap_blocks(pdf_document, detection_model, recognition_model, table_rec_model):
    """Check that a known snippet of page text is still present after TableProcessor runs.

    The same snippet is looked up in the first page's raw text both before
    and after the processor is applied to the document.
    """
    expected_snippet = "Cascading, and the Auxiliary Problem Principle"
    first_page = pdf_document.pages[0]

    # Precondition: the snippet is in the raw text before table processing.
    assert expected_snippet in first_page.raw_text(pdf_document)

    table_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
    table_processor(pdf_document)

    # The snippet must still be in the raw text afterwards.
    assert expected_snippet in first_page.raw_text(pdf_document)


@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [4]})
def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_model):
    """Run table processing, render the document to markdown, and look for a known value."""
    # Build the processor and apply it to the document in one step.
    TableProcessor(detection_model, recognition_model, table_rec_model)(pdf_document)

    rendered = MarkdownRenderer()(pdf_document)
    assert "1.2E-38" in rendered.markdown

0 comments on commit 228a7ba

Please sign in to comment.