Skip to content

Commit

Permalink
update continuation heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 20, 2024
1 parent 59aac27 commit 5cffaaf
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
36 changes: 30 additions & 6 deletions marker/v2/processors/text.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from typing import List

import regex

from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
from marker.v2.schema.text.line import Line
from typing import List


class TextProcessor(BaseProcessor):
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
min_continuation_width_ratio = 0.98

def __init__(self, config):
super().__init__(config)
Expand All @@ -22,12 +24,34 @@ def __call__(self, document: Document):
if not len(block.structure) >= 2: # Skip single lines
continue

next_block = document.get_next_block(block)
if next_block and next_block.block_type not in self.block_types:
column_or_page_break = False
next_block = page.get_next_block(block)
if next_block is not None: # we check for a column break
column_or_page_break = (
next_block.polygon.y_start < block.polygon.y_start and
next_block.polygon.x_start > block.polygon.x_start
)
else: # It's a page break since we don't have a next block in the page
column_or_page_break = True

if not column_or_page_break:
continue

next_block_starts_indented = True
next_block_doc = document.get_next_block(block)
if next_block_doc:
if next_block_doc.block_type not in self.block_types:
continue
new_page = document.get_page(next_block_doc.page_id) # the next block can come from the next page
new_block_lines = [new_page.get_block(block_id) for block_id in next_block_doc.structure]
min_x = min([l.polygon.x_start for l in new_block_lines])
next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x

lines: List[Line] = [page.get_block(block_id) for block_id in block.structure]
avg_width = sum([l.polygon.width for l in lines]) / len(lines)
max_x = max([l.polygon.x_end for l in lines])

last_line_is_full_width = lines[-1].polygon.x_end >= max_x
last_line_is_hyphentated = regex.compile(r'.*[\p{Ll}|\d][-—¬]\s?$', regex.DOTALL).match(lines[-1].raw_text(document).strip())

if lines[-1].polygon.width >= avg_width * self.min_continuation_width_ratio:
if (last_line_is_full_width or last_line_is_hyphentated) and not next_block_starts_indented:
block.has_continuation = True
16 changes: 16 additions & 0 deletions marker/v2/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,22 @@ def center(self):
def size(self):
return [self.width, self.height]

@property
def x_start(self):
    """Return ``bbox[0]``, the starting x coordinate of the bounding box."""
    bounds = self.bbox
    return bounds[0]

@property
def y_start(self):
    """Return ``bbox[1]``, the starting y coordinate of the bounding box."""
    bounds = self.bbox
    return bounds[1]

@property
def x_end(self):
    """Return ``bbox[2]``, the ending x coordinate of the bounding box."""
    bounds = self.bbox
    return bounds[2]

@property
def y_end(self):
    """Return ``bbox[3]``, the ending y coordinate of the bounding box."""
    bounds = self.bbox
    return bounds[3]

@computed_field
@property
def bbox(self) -> List[float]:
Expand Down

0 comments on commit 5cffaaf

Please sign in to comment.