Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Line merging across Pages and Columns #373

Merged
merged 21 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
4fbae5f
initialize registry overrides in the worker [skip ci]
iammosespaulr Nov 19, 2024
71648d1
initial text processor [skip ci]
iammosespaulr Nov 19, 2024
a9e11a6
add line joining logic across pages and columns
iammosespaulr Nov 19, 2024
28de11a
better heuristics and check next blocks across page boundaries [skip ci]
iammosespaulr Nov 19, 2024
88fbb8e
Merge remote-tracking branch 'origin/v2' into dev-mose/marker-v2
iammosespaulr Nov 19, 2024
b909e01
parameterize threshold and fix tests [skip ci]
iammosespaulr Nov 19, 2024
e9f8352
fix structure checking [skip ci]
iammosespaulr Nov 19, 2024
0157e2f
fixes and cleanup
iammosespaulr Nov 19, 2024
59aac27
fix section header processor and line count threshold [skip ci]
iammosespaulr Nov 19, 2024
5cffaaf
update continuation heuristic
iammosespaulr Nov 20, 2024
99c5f86
Merge remote-tracking branch 'origin/v2' into dev-mose/marker-v2
iammosespaulr Nov 20, 2024
bb44846
add some tolerance by rounding down to the nearest int for indent che…
iammosespaulr Nov 20, 2024
dd4db58
fix extra space in <p> tags [skip ci]
iammosespaulr Nov 20, 2024
86c5234
clean up logic and add heuristic to check if the next text block is i…
iammosespaulr Nov 20, 2024
1dd3440
clean up
iammosespaulr Nov 20, 2024
42dac94
more cleanup [skip ci]
iammosespaulr Nov 20, 2024
e0c6ff3
fix thinko
iammosespaulr Nov 20, 2024
c27664a
fix ceil
iammosespaulr Nov 20, 2024
175747c
merge new_block indentation check logic
iammosespaulr Nov 20, 2024
9503e9e
fix column break logic [skip ci]
iammosespaulr Nov 20, 2024
1e157c1
column gap tolerances and gap ratio of 2%
iammosespaulr Nov 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
setup_recognition_model, setup_table_rec_model, setup_texify_model
from marker.v2.processors.equation import EquationProcessor
from marker.v2.processors.sectionheader import SectionHeaderProcessor
from marker.v2.processors.text import TextProcessor
from marker.v2.processors.table import TableProcessor
from marker.v2.renderers.markdown import MarkdownRenderer
from marker.v2.schema import BlockTypes
Expand Down Expand Up @@ -67,6 +68,7 @@ def __call__(self, filepath: str):
EquationProcessor(self.texify_model, self.config),
TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config),
SectionHeaderProcessor(self.config),
TextProcessor(self.config),
CodeProcessor(self.config),
DocumentTOCProcessor(self.config),
DebugProcessor(self.config),
Expand Down
4 changes: 3 additions & 1 deletion marker/v2/processors/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ def __call__(self, document: Document):
if block.block_type not in self.block_types:
continue

line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]
line_heights[block.block_id] = []
if block.structure is not None:
line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]

flat_line_heights = [h for heights in line_heights.values() for h in heights]
heading_ranges = self.bucket_headings(flat_line_heights)
Expand Down
93 changes: 93 additions & 0 deletions marker/v2/processors/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import math
from typing import List

import regex

from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
from marker.v2.schema.text.line import Line


class TextProcessor(BaseProcessor):
    """
    Flag text blocks whose paragraph carries on in the next column or page.

    For every text block with at least two lines that is followed by a column
    break, or that is the last entry in its page's structure, the processor
    inspects the text block that follows and sets ``has_continuation = True``
    on the current block when the two look like one paragraph split by layout.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    column_gap_ratio = 0.02  # column gaps are at least 2% of the page width

    # A lower-case letter or digit followed by a hyphen-like character at the
    # very end of the line. Compiled once here rather than once per block.
    # NOTE: the character class deliberately has no `|` -- inside `[...]` a
    # pipe is a literal character, not an alternation.
    _hyphenated_line_pattern = regex.compile(r'.*[\p{Ll}\d][-—¬]\s?$', regex.DOTALL)

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """
        Walk every page of `document` and mark continued text blocks in place.

        Nothing is returned; the only effect is setting ``has_continuation``
        on qualifying blocks.
        """
        for page in document.pages:
            column_gap = page.polygon.width * self.column_gap_ratio
            for block in page.children:
                if block.block_type not in self.block_types:
                    continue
                if block.structure is None or len(block.structure) < 2:
                    continue  # Skip blocks without lines and single-line blocks

                column_break, page_break = False, False
                next_block = page.get_next_block(block)
                if next_block is not None:  # we check for a column break
                    # The next block starts at or above this block's top (floored
                    # for a little tolerance) and lies to the right of it by more
                    # than the column gap.
                    column_break = (
                        math.floor(next_block.polygon.y_start) <= math.floor(block.polygon.y_start) and
                        next_block.polygon.x_start > (block.polygon.x_end + column_gap)
                    )
                else:  # It's a page break since we don't have a next block in the page
                    page_break = True

                if not (column_break or page_break):
                    continue

                # Pessimistic defaults: unless a following text block is found
                # and inspected, the final condition below cannot succeed.
                next_block_starts_indented = True
                next_block_in_first_quadrant = False
                new_block_lines = []

                if column_break:
                    if next_block.block_type not in self.block_types:
                        continue
                    if next_block.structure is None:  # This is odd though, why do we have text blocks with no structure?
                        continue

                    new_block_lines = [page.get_block(block_id) for block_id in next_block.structure]
                else:  # page break
                    next_page = document.get_next_page(page)
                    if next_page is None:
                        continue  # we're on the last page, so we don't worry about merging

                    # Go through the next page only
                    for next_page_block_id in next_page.structure:
                        if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]:
                            continue  # skip headers and footers
                        if next_page_block_id.block_type not in self.block_types:
                            break  # we found a non-text block, so we can stop looking

                        # we have our text_block
                        next_page_block = next_page.get_block(next_page_block_id)
                        if next_page_block.structure is None:
                            break  # This is odd though, why do we have text blocks with no structure?

                        new_block_lines = [next_page.get_block(block_id) for block_id in next_page_block.structure]

                        # Top-left quarter of the next page.
                        next_block_in_first_quadrant = (
                            next_page_block.polygon.x_start < next_page.polygon.width // 2 and
                            next_page_block.polygon.y_start < next_page.polygon.height // 2
                        )
                        break
                    else:
                        continue  # the loop never hit a `break`, so there is nothing to merge with

                # we check for next_block indentation
                if new_block_lines:
                    # Rounding the left-most x up gives the first line some slack
                    # before it counts as indented.
                    min_x = math.ceil(min(line.polygon.x_start for line in new_block_lines))
                    next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x

                lines: List[Line] = [page.get_block(block_id) for block_id in block.structure]
                # Rounding the right-most x down gives the last line the same slack.
                max_x = math.floor(max(line.polygon.x_end for line in lines))
                last_line_is_full_width = lines[-1].polygon.x_end >= max_x

                last_line_is_hyphenated = self._hyphenated_line_pattern.match(
                    lines[-1].raw_text(document).strip()
                )

                if (last_line_is_full_width or last_line_is_hyphenated) and \
                        not next_block_starts_indented and \
                        ((next_block_in_first_quadrant and page_break) or column_break):
                    block.has_continuation = True
17 changes: 15 additions & 2 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from markdownify import markdownify, MarkdownConverter
import regex
from markdownify import MarkdownConverter
from pydantic import BaseModel

from marker.v2.renderers.html import HTMLRenderer
Expand All @@ -20,6 +21,17 @@ def convert_div(self, el, text, convert_as_inline):
else:
return text

def convert_p(self, el, text, *args):
    """
    Convert a ``<p>`` element, joining paragraphs marked as continued.

    `el` is the paragraph element (it must support ``has_attr`` and item
    access for ``class``); `text` is its already-converted content.

    Without the ``has-continuation`` class the text is returned followed by
    a blank line (or an empty string for empty text). With the class, no
    paragraph break is emitted so the next paragraph is appended directly:
      * a trailing hyphen-like character after a lower-case letter or digit
        is dropped, together with one optional trailing whitespace character;
      * text ending in any other non-word, non-space character gets a single
        trailing space, e.g. "However,";
      * anything else is returned unchanged.
    """
    hyphens = r'-—¬'
    if not (el.has_attr('class') and 'has-continuation' in el['class']):
        return f"{text}\n\n" if text else ""  # default convert_p behavior

    # handle hyphenation across pages; the character class has no `|`, which
    # would be matched as a literal pipe inside `[...]`
    if regex.search(rf'[\p{{Ll}}\d][{hyphens}]\s?$', text):
        return regex.split(rf"[{hyphens}]\s?$", text)[0]
    if regex.search(r'[^\w\s]$', text):  # ends with punctuation, so add a space after text
        return f"{text} "
    return text


class MarkdownOutput(BaseModel):
markdown: str
Expand All @@ -39,7 +51,8 @@ def __call__(self, document: Document) -> MarkdownOutput:
heading_style="ATX",
bullets="-",
escape_misc=False,
escape_underscores=False
escape_underscores=False,
escape_asterisks=False
)
markdown = md_cls.convert(full_html)
return MarkdownOutput(
Expand Down
8 changes: 4 additions & 4 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Literal, Optional, Dict
from typing import TYPE_CHECKING, List, Literal, Optional, Dict

from pydantic import BaseModel, ConfigDict, field_validator

Expand Down Expand Up @@ -109,7 +109,7 @@ def raw_text(self, document: Document) -> str:
text += "\n"
return text

def assemble_html(self, child_blocks: List[BlockOutput], parent_structure=None):
def assemble_html(self, child_blocks: List[BlockOutput], parent_structure: Optional[List[str]] = None):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
Expand All @@ -125,7 +125,7 @@ def assign_section_hierarchy(self, section_hierarchy):

return section_hierarchy

def render(self, document: Document, parent_structure, section_hierarchy=None):
def render(self, document: Document, parent_structure: Optional[List[str]], section_hierarchy=None):
child_content = []
if section_hierarchy is None:
section_hierarchy = {}
Expand All @@ -135,7 +135,7 @@ def render(self, document: Document, parent_structure, section_hierarchy=None):
for block_id in self.structure:
block = document.get_block(block_id)
rendered = block.render(document, self.structure, section_hierarchy)
section_hierarchy = rendered.section_hierarchy # Update the section hierarchy from the peer blocks
section_hierarchy = rendered.section_hierarchy # Update the section hierarchy from the peer blocks
child_content.append(rendered)

return BlockOutput(
Expand Down
7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@

class InlineMath(Block):
    """Text block with inline math, rendered as a single HTML paragraph."""
    block_type: BlockTypes = BlockTypes.TextInlineMath
    # When True the rendered <p> carries class='has-continuation'.
    has_continuation: bool = False

    def assemble_html(self, child_blocks, parent_structure=None):
        """
        Wrap the assembled child content in a ``<p>`` element.

        Newlines in the child template are replaced by spaces. The paragraph
        gets ``class='has-continuation'`` when `has_continuation` is set.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        template = template.replace("\n", " ")

        # No unconditional early return here: it would make the class
        # attribute below unreachable.
        class_attr = " class='has-continuation'" if self.has_continuation else ""
        return f"<p{class_attr}>{template}</p>"
7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@

class Text(Block):
    """Plain text block, rendered as a single HTML paragraph."""
    block_type: BlockTypes = BlockTypes.Text
    # When True the rendered <p> carries class='has-continuation'.
    has_continuation: bool = False

    def assemble_html(self, child_blocks, parent_structure=None):
        """
        Wrap the assembled child content in a ``<p>`` element.

        Newlines in the child template are replaced by spaces. The paragraph
        gets ``class='has-continuation'`` when `has_continuation` is set.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        template = template.replace("\n", " ")

        # No unconditional early return here: it would make the class
        # attribute below unreachable.
        class_attr = " class='has-continuation'" if self.has_continuation else ""
        return f"<p{class_attr}>{template}</p>"
16 changes: 16 additions & 0 deletions marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ def get_page(self, page_id):
return page
return None

def get_next_block(self, block: Block):
    """
    Return the block that follows `block` in the document.

    The block's own page is asked first. When it has no following block,
    the first structure entry of the next page is returned. Returns None
    when there is no next page or the next page has no structure entries.
    """
    page = self.get_page(block.page_id)
    next_block = page.get_next_block(block)
    if next_block:
        return next_block
    next_page = self.get_next_page(page)
    # An empty or missing structure on the next page would otherwise raise
    # on the `[0]` lookup below.
    if not next_page or not next_page.structure:
        return None
    return next_page.get_block(next_page.structure[0])

def get_next_page(self, page: PageGroup):
    """
    Return the page after `page` in ``self.pages``, or None for the last page.

    Raises ValueError (from ``list.index``) when `page` is not in ``self.pages``.
    """
    following_idx = self.pages.index(page) + 1
    return self.pages[following_idx] if following_idx < len(self.pages) else None

def assemble_html(self, child_blocks: List[Block]):
template = ""
for c in child_blocks:
Expand Down
6 changes: 6 additions & 0 deletions marker/v2/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ def add_child(self, block: Block):
else:
self.children.append(block)

def get_next_block(self, block: Block):
    """
    Return the block listed after `block` in this page's structure.

    Returns None when `block` is the last structure entry. Raises ValueError
    (from ``list.index``) when ``block.id`` is not in the structure.
    """
    following_idx = self.structure.index(block.id) + 1
    if following_idx >= len(self.structure):
        return None
    return self.get_block(self.structure[following_idx])

def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Block:
self.incr_block_id()
block = block_cls(
Expand Down
16 changes: 16 additions & 0 deletions marker/v2/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,22 @@ def center(self):
def size(self):
return [self.width, self.height]

@property
def x_start(self):
    """First entry of ``bbox`` (presumably the left edge; confirm against the ``bbox`` definition)."""
    box = self.bbox
    return box[0]

@property
def y_start(self):
    """Second entry of ``bbox`` (presumably the top edge; confirm against the ``bbox`` definition)."""
    box = self.bbox
    return box[1]

@property
def x_end(self):
    """Third entry of ``bbox`` (presumably the right edge; confirm against the ``bbox`` definition)."""
    box = self.bbox
    return box[2]

@property
def y_end(self):
    """Fourth entry of ``bbox`` (presumably the bottom edge; confirm against the ``bbox`` definition)."""
    box = self.bbox
    return box[3]

@computed_field
@property
def bbox(self) -> List[float]:
Expand Down
1 change: 1 addition & 0 deletions tests/builders/test_garbled_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from marker.v2.schema import BlockTypes


@pytest.mark.skip(reason="This is failing because we need better garbled text detection")
@pytest.mark.filename("water_damage.pdf")
def test_garbled_pdf(pdf_document):
assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
Expand Down
6 changes: 3 additions & 3 deletions tests/builders/test_overriding.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def test_overriding(pdf_document: Document):


def get_lines(pdf: str, config=None):
    """
    Register any block-class overrides from `config`, then return the lines
    of page 0 of `pdf`.

    The overrides are registered inside this function rather than by the
    caller. Presumably this is so that each multiprocessing worker running
    it sets up its own registry; confirm against the pool call site.
    """
    # `config` defaults to None and may lack "override_map"; both cases mean
    # there is nothing to register.
    override_map = (config or {}).get("override_map", {})
    for block_type, block_cls in override_map.items():
        register_block_class(block_type, block_cls)

    provider: PdfProvider = setup_pdf_provider(pdf, config)
    return provider.get_page_lines(0)

Expand All @@ -39,9 +42,6 @@ def test_overriding_mp():
"override_map": {BlockTypes.Line: NewLine}
}

for block_type, block_cls in config["override_map"].items():
register_block_class(block_type, block_cls)

pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"]

with mp.Pool(processes=2) as pool:
Expand Down
2 changes: 1 addition & 1 deletion tests/processors/test_document_toc_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ def test_table_processor(pdf_document, detection_model, recognition_model, table
processor(pdf_document)

assert len(pdf_document.table_of_contents) == 3
assert pdf_document.table_of_contents[0]["text"] == "Subspace Adversarial Training"
assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training"