Skip to content

Commit

Permalink
Merge pull request #373 from VikParuchuri/dev-mose/marker-v2
Browse files Browse the repository at this point in the history
Add Line merging across Pages and Columns
  • Loading branch information
VikParuchuri authored Nov 20, 2024
2 parents d9352be + 1e157c1 commit c6693c5
Show file tree
Hide file tree
Showing 13 changed files with 172 additions and 13 deletions.
2 changes: 2 additions & 0 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
setup_recognition_model, setup_table_rec_model, setup_texify_model
from marker.v2.processors.equation import EquationProcessor
from marker.v2.processors.sectionheader import SectionHeaderProcessor
from marker.v2.processors.text import TextProcessor
from marker.v2.processors.table import TableProcessor
from marker.v2.renderers.markdown import MarkdownRenderer
from marker.v2.schema import BlockTypes
Expand Down Expand Up @@ -67,6 +68,7 @@ def __call__(self, filepath: str):
EquationProcessor(self.texify_model, self.config),
TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config),
SectionHeaderProcessor(self.config),
TextProcessor(self.config),
CodeProcessor(self.config),
DocumentTOCProcessor(self.config),
DebugProcessor(self.config),
Expand Down
4 changes: 3 additions & 1 deletion marker/v2/processors/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ def __call__(self, document: Document):
if block.block_type not in self.block_types:
continue

line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]
line_heights[block.block_id] = []
if block.structure is not None:
line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]

flat_line_heights = [h for heights in line_heights.values() for h in heights]
heading_ranges = self.bucket_headings(flat_line_heights)
Expand Down
93 changes: 93 additions & 0 deletions marker/v2/processors/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import math
from typing import List

import regex

from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
from marker.v2.schema.text.line import Line


class TextProcessor(BaseProcessor):
    """
    Mark text blocks whose paragraph appears to continue in the next column or page.

    For every text block with at least two lines that is followed by a column break,
    or that is the last block in its page's structure, the processor inspects the
    block that follows and sets `block.has_continuation = True` when:

    * the last line of the block is full width or ends with a hyphen, and
    * the following block does not start with an indented line, and
    * the following block is across a column break, or is in the top-left
      quadrant of the next page.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    column_gap_ratio = 0.02  # column gaps are at least 2% of the page width

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """
        Walk every page of `document` and flag text blocks that have a continuation.

        The document is modified in place (`has_continuation` is set on matching
        blocks); nothing is returned.
        """
        # Compiled once per call instead of once per block. Inside a character class
        # `|` is a literal pipe, not alternation, so it is deliberately left out:
        # a line is "hyphenated" when it ends with a lowercase letter or digit
        # followed by a hyphen-like character and at most one whitespace character.
        hyphenated_ending = regex.compile(r'.*[\p{Ll}\d][-—¬]\s?$', regex.DOTALL)

        for page in document.pages:
            column_gap = page.polygon.width * self.column_gap_ratio
            for block in page.children:
                if block.block_type not in self.block_types:
                    continue
                if block.structure is None or len(block.structure) < 2:
                    continue  # skip blocks without lines and single-line blocks

                next_block = page.get_next_block(block)
                # No next block in the page means the paragraph can only continue
                # on the following page.
                page_break = next_block is None
                # A column break: the next block starts at or above this block's
                # top edge and further right than this block's right edge plus
                # the minimum column gap.
                column_break = not page_break and (
                    math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start)
                    and next_block.polygon.x_start > (block.polygon.x_end + column_gap)
                )
                if not (column_break or page_break):
                    continue

                next_block_in_first_quadrant = False
                new_block_lines = []

                if column_break:
                    if next_block.block_type not in self.block_types:
                        continue
                    if next_block.structure is None:  # text block without lines
                        continue

                    new_block_lines = [page.get_block(block_id) for block_id in next_block.structure]
                else:  # page break
                    next_page = document.get_next_page(page)
                    if next_page is None:
                        continue  # last page, nothing to merge with

                    # Only the next page is inspected; a page without structure
                    # is treated as having no blocks.
                    for next_page_block_id in next_page.structure or []:
                        if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]:
                            continue  # skip headers and footers
                        if next_page_block_id.block_type not in self.block_types:
                            break  # first real block is not text, stop looking

                        next_page_block = next_page.get_block(next_page_block_id)
                        if next_page_block.structure is None:
                            break  # text block without lines

                        new_block_lines = [next_page.get_block(block_id) for block_id in next_page_block.structure]
                        next_block_in_first_quadrant = (
                            next_page_block.polygon.x_start < next_page.polygon.width // 2
                            and next_page_block.polygon.y_start < next_page.polygon.height // 2
                        )
                        break
                    else:
                        continue  # no candidate block on the next page

                if not new_block_lines:
                    # Without lines to inspect, the following block is treated as
                    # starting indented, so there is never a continuation.
                    continue

                # The following block starts indented when its first line begins to
                # the right of the (rounded up) leftmost line start in that block.
                min_x = math.ceil(min(line.polygon.x_start for line in new_block_lines))
                next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x
                if next_block_starts_indented:
                    continue

                lines: List[Line] = [page.get_block(block_id) for block_id in block.structure]
                max_x = math.floor(max(line.polygon.x_end for line in lines))
                last_line_is_full_width = lines[-1].polygon.x_end >= max_x
                last_line_is_hyphenated = hyphenated_ending.match(lines[-1].raw_text(document).strip())

                if (last_line_is_full_width or last_line_is_hyphenated) and \
                        ((next_block_in_first_quadrant and page_break) or column_break):
                    block.has_continuation = True
17 changes: 15 additions & 2 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from markdownify import markdownify, MarkdownConverter
import regex
from markdownify import MarkdownConverter
from pydantic import BaseModel

from marker.v2.renderers.html import HTMLRenderer
Expand All @@ -20,6 +21,17 @@ def convert_div(self, el, text, convert_as_inline):
else:
return text

def convert_p(self, el, text, *args):
    """
    Convert a `<p>` element, joining paragraphs flagged with `has-continuation`.

    A paragraph without the `has-continuation` class is rendered as `text`
    followed by a blank line (or as an empty string when `text` is empty).
    A paragraph with the class is rendered without the trailing blank line:

    * a trailing hyphen-like character (after a lowercase letter or digit) is
      removed, together with at most one whitespace character after it;
    * otherwise, when the text ends with a character that is neither a word
      character nor whitespace (e.g. "However,"), one space is appended;
    * otherwise the text is returned unchanged.
    """
    hyphens = r'-—¬'
    has_continuation = el.has_attr('class') and 'has-continuation' in el['class']
    if not has_continuation:
        return f"{text}\n\n" if text else ""  # default convert_p behavior

    # `|` is a literal pipe inside a character class, so it is not included here;
    # the class is meant to match a lowercase letter or a digit only.
    if regex.compile(rf'.*[\p{{Ll}}\d][{hyphens}]\s?$', regex.DOTALL).match(text):  # hyphenation across pages
        return regex.split(rf"[{hyphens}]\s?$", text)[0]
    if regex.search(r'[^\w\s]$', text):  # ends with punctuation, so add a separating space
        return f"{text} "
    # NOTE(review): text ending in a word character is returned with no separator,
    # so it joins directly onto whatever follows — confirm this is intended.
    return text


class MarkdownOutput(BaseModel):
markdown: str
Expand All @@ -39,7 +51,8 @@ def __call__(self, document: Document) -> MarkdownOutput:
heading_style="ATX",
bullets="-",
escape_misc=False,
escape_underscores=False
escape_underscores=False,
escape_asterisks=False
)
markdown = md_cls.convert(full_html)
return MarkdownOutput(
Expand Down
8 changes: 4 additions & 4 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Literal, Optional, Dict
from typing import TYPE_CHECKING, List, Literal, Optional, Dict

from pydantic import BaseModel, ConfigDict, field_validator

Expand Down Expand Up @@ -109,7 +109,7 @@ def raw_text(self, document: Document) -> str:
text += "\n"
return text

def assemble_html(self, child_blocks: List[BlockOutput], parent_structure=None):
def assemble_html(self, child_blocks: List[BlockOutput], parent_structure: Optional[List[str]] = None):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
Expand All @@ -125,7 +125,7 @@ def assign_section_hierarchy(self, section_hierarchy):

return section_hierarchy

def render(self, document: Document, parent_structure, section_hierarchy=None):
def render(self, document: Document, parent_structure: Optional[List[str]], section_hierarchy=None):
child_content = []
if section_hierarchy is None:
section_hierarchy = {}
Expand All @@ -135,7 +135,7 @@ def render(self, document: Document, parent_structure, section_hierarchy=None):
for block_id in self.structure:
block = document.get_block(block_id)
rendered = block.render(document, self.structure, section_hierarchy)
section_hierarchy = rendered.section_hierarchy # Update the section hierarchy from the peer blocks
section_hierarchy = rendered.section_hierarchy # Update the section hierarchy from the peer blocks
child_content.append(rendered)

return BlockOutput(
Expand Down
7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@

class InlineMath(Block):
    """Text block containing inline math, rendered as a single `<p>` element."""
    block_type: BlockTypes = BlockTypes.TextInlineMath
    # When True, the rendered <p> carries the 'has-continuation' class.
    has_continuation: bool = False

    def assemble_html(self, child_blocks, parent_structure):
        """
        Build the `<p>` wrapper around the children's content references.

        Newlines in the assembled template are replaced by spaces, and the
        `has-continuation` class is added when `self.has_continuation` is set.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        template = template.replace("\n", " ")

        # A single return: an unconditional early `return f"<p>{template}</p>"`
        # here would make the has_continuation handling unreachable.
        class_attr = " class='has-continuation'" if self.has_continuation else ""
        return f"<p{class_attr}>{template}</p>"
7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@

class Text(Block):
    """Plain text block, rendered as a single `<p>` element."""
    block_type: BlockTypes = BlockTypes.Text
    # When True, the rendered <p> carries the 'has-continuation' class.
    has_continuation: bool = False

    def assemble_html(self, child_blocks, parent_structure):
        """
        Build the `<p>` wrapper around the children's content references.

        Newlines in the assembled template are replaced by spaces, and the
        `has-continuation` class is added when `self.has_continuation` is set.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        template = template.replace("\n", " ")

        # A single return: an unconditional early `return f"<p>{template}</p>"`
        # here would make the has_continuation handling unreachable.
        class_attr = " class='has-continuation'" if self.has_continuation else ""
        return f"<p{class_attr}>{template}</p>"
16 changes: 16 additions & 0 deletions marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ def get_page(self, page_id):
return page
return None

def get_next_block(self, block: Block):
    """
    Return the block that follows `block`, crossing into the next page if needed.

    The next block on the same page is returned when there is one. Otherwise the
    first block of the next page's structure is returned. Returns None when the
    block's page cannot be found, when there is no next page, or when the next
    page has no blocks in its structure.
    """
    page = self.get_page(block.page_id)
    if page is None:
        return None  # get_page found no page with this id

    next_block = page.get_next_block(block)
    if next_block is not None:
        return next_block

    next_page = self.get_next_page(page)
    # An empty (or missing) structure has no first block to return.
    if next_page is None or not next_page.structure:
        return None
    return next_page.get_block(next_page.structure[0])

def get_next_page(self, page: PageGroup):
    """
    Return the page that comes right after `page` in `self.pages`.

    Returns None when `page` is the last page. `list.index` raises ValueError
    when `page` is not in `self.pages`.
    """
    following_idx = self.pages.index(page) + 1
    if following_idx >= len(self.pages):
        return None
    return self.pages[following_idx]

def assemble_html(self, child_blocks: List[Block]):
template = ""
for c in child_blocks:
Expand Down
6 changes: 6 additions & 0 deletions marker/v2/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ def add_child(self, block: Block):
else:
self.children.append(block)

def get_next_block(self, block: Block):
    """
    Return the block that follows `block` in this page's structure.

    The position is looked up by `block.id` in `self.structure`. Returns None
    when `block` is the last entry; `list.index` raises ValueError when the id
    is not in the structure.
    """
    following_idx = self.structure.index(block.id) + 1
    if following_idx >= len(self.structure):
        return None
    return self.get_block(self.structure[following_idx])

def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Block:
self.incr_block_id()
block = block_cls(
Expand Down
16 changes: 16 additions & 0 deletions marker/v2/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,22 @@ def center(self):
def size(self):
return [self.width, self.height]

@property
def x_start(self):
    """x coordinate where the box starts: the first entry of `self.bbox`."""
    box = self.bbox
    return box[0]

@property
def y_start(self):
    """y coordinate where the box starts: the second entry of `self.bbox`."""
    box = self.bbox
    return box[1]

@property
def x_end(self):
    """x coordinate where the box ends: the third entry of `self.bbox`."""
    box = self.bbox
    return box[2]

@property
def y_end(self):
    """y coordinate where the box ends: the fourth entry of `self.bbox`."""
    box = self.bbox
    return box[3]

@computed_field
@property
def bbox(self) -> List[float]:
Expand Down
1 change: 1 addition & 0 deletions tests/builders/test_garbled_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from marker.v2.schema import BlockTypes


@pytest.mark.skip(reason="This is failing because we need better garbled text detection")
@pytest.mark.filename("water_damage.pdf")
def test_garbled_pdf(pdf_document):
assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
Expand Down
6 changes: 3 additions & 3 deletions tests/builders/test_overriding.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def test_overriding(pdf_document: Document):


def get_lines(pdf: str, config=None):
    """
    Register any block-class overrides from `config`, then return the lines of page 0.

    `config` may be None or may omit "override_map"; in both cases no overrides
    are registered. `config` is passed to `setup_pdf_provider` unchanged.
    """
    # Registration is done inside this function rather than by the caller.
    # NOTE(review): presumably so it also runs in worker processes that call
    # this function — confirm against the multiprocessing test.
    override_map = (config or {}).get("override_map", {})
    for block_type, block_cls in override_map.items():
        register_block_class(block_type, block_cls)

    provider: PdfProvider = setup_pdf_provider(pdf, config)
    return provider.get_page_lines(0)

Expand All @@ -39,9 +42,6 @@ def test_overriding_mp():
"override_map": {BlockTypes.Line: NewLine}
}

for block_type, block_cls in config["override_map"].items():
register_block_class(block_type, block_cls)

pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"]

with mp.Pool(processes=2) as pool:
Expand Down
2 changes: 1 addition & 1 deletion tests/processors/test_document_toc_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ def test_table_processor(pdf_document, detection_model, recognition_model, table
processor(pdf_document)

assert len(pdf_document.table_of_contents) == 3
assert pdf_document.table_of_contents[0]["text"] == "Subspace Adversarial Training"
assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training"

0 comments on commit c6693c5

Please sign in to comment.