Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Inline Math Support #517

Open
wants to merge 26 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f695b49
Initial inline math support
tarun-menta Jan 30, 2025
c32eb88
Run OCR on any missing lines from good pages
tarun-menta Jan 30, 2025
8c9a4fb
Cleanup debug statements
tarun-menta Jan 30, 2025
c7c7dcf
Move line splitting for math boxes from `surya` into `marker`
tarun-menta Jan 30, 2025
6eb7e83
Update hashing
tarun-menta Jan 31, 2025
834c097
Refactor fully
tarun-menta Jan 31, 2025
b075c9d
Cleanup
tarun-menta Jan 31, 2025
fd351dd
Refactor splitting logic and add support for character level splitting
tarun-menta Jan 31, 2025
cdd30ab
Final refactor - Remove char processing logic
tarun-menta Feb 1, 2025
1ed58bf
Update test config with new model
tarun-menta Feb 1, 2025
34381e6
Cleanup
tarun-menta Feb 1, 2025
ce867b8
Char level splicing of inline math into provider lines
tarun-menta Feb 2, 2025
af2c09e
Merge remote-tracking branch 'origin/texify' into inline-math
tarun-menta Feb 2, 2025
1871067
Update equation processor for inline
tarun-menta Feb 3, 2025
96c95a2
Update OCR and Line Builders
tarun-menta Feb 3, 2025
ddd0616
Better thresholds
tarun-menta Feb 4, 2025
9cbaae7
Fix tests; Include new builder in args
tarun-menta Feb 4, 2025
92e75f9
Fix OCR bug - Don't sort lines in `RecognitionPredictor`
tarun-menta Feb 4, 2025
a2457e4
Fix line merging
tarun-menta Feb 4, 2025
861fa91
Update `text_extraction_method` for blocks in `OCRBuilder`
tarun-menta Feb 4, 2025
ffdad8c
Fix OCR+Provider line merging for rotated PDFs
tarun-menta Feb 4, 2025
43ddda5
Set page extraction method in `LineBuilder`
tarun-menta Feb 4, 2025
f3d5e5b
Add flag for inline math instead of running every time
tarun-menta Feb 5, 2025
82668f3
Update test
tarun-menta Feb 5, 2025
bbdae4b
Fix tests
tarun-menta Feb 5, 2025
f0e4745
Update to new inline detection API
tarun-menta Feb 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 2 additions & 74 deletions marker/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,6 @@ class LayoutBuilder(BaseBuilder):
"The batch size to use for the layout model.",
"Default is None, which will use the default batch size for the model."
] = None
layout_coverage_min_lines: Annotated[
int,
"The minimum number of PdfProvider lines that must be covered by the layout model",
"to consider the lines from the PdfProvider valid.",
] = 1
layout_coverage_threshold: Annotated[
float,
"The minimum coverage ratio required for the layout model to consider",
"the lines from the PdfProvider valid.",
] = .1
document_ocr_threshold: Annotated[
float,
"The minimum ratio of pages that must pass the layout coverage check",
"to avoid OCR.",
] = .8
excluded_for_coverage: Annotated[
Tuple[BlockTypes],
"A list of block types to exclude from the layout coverage check.",
] = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
force_layout_block: Annotated[
str,
"Skip layout and force every page to be treated as a specific block type.",
Expand All @@ -64,7 +45,7 @@ def __call__(self, document: Document, provider: PdfProvider):
else:
layout_results = self.surya_layout(document.pages)
self.add_blocks_to_pages(document.pages, layout_results)
self.merge_blocks(document.pages, provider.page_lines)
# self.merge_blocks(document.pages, provider.page_lines)

def get_batch_size(self):
if self.batch_size is not None:
Expand Down Expand Up @@ -132,57 +113,4 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou

# Ensure page has non-empty children
if page.children is None:
page.children = []

def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines):
    """
    Choose a text extraction method for every page and merge provider lines where usable.

    A page counts as "good" when the provider returned at least one line for it,
    those lines pass ``check_layout_coverage``, and the OCR error detection label
    for the page is not ``"bad"``.

    A page is marked ``"surya"`` (and left without merged lines) only when it is
    not good AND the share of good pages in the whole document is below
    ``document_ocr_threshold``. Every other page, including a bad page inside a
    mostly-good document, gets the provider lines merged in and is marked
    ``"pdftext"``.

    Args:
        document_pages: Pages whose layout blocks have already been added.
        provider_page_lines: Mapping from page id to the provider's lines for that page.
    """
    ocr_error_detection_labels = self.surya_ocr_error_detection(document_pages, provider_page_lines).labels

    good_pages = []
    for document_page, ocr_error_detection_label in zip(document_pages, ocr_error_detection_labels):
        provider_lines = provider_page_lines.get(document_page.page_id, [])
        good_pages.append(
            bool(provider_lines)
            and self.check_layout_coverage(document_page, provider_lines)
            and ocr_error_detection_label != "bad"
        )

    # With no evaluated pages there is no ratio to compute; treat the document as
    # fully good (same convention as the empty case in check_layout_coverage)
    # instead of raising ZeroDivisionError.
    good_ratio = sum(good_pages) / len(good_pages) if good_pages else 1
    ocr_document = good_ratio < self.document_ocr_threshold

    for idx, document_page in enumerate(document_pages):
        provider_lines = provider_page_lines.get(document_page.page_id, [])
        needs_ocr = not good_pages[idx]
        if needs_ocr and ocr_document:
            # Leave the page without provider lines and mark it for surya.
            document_page.text_extraction_method = "surya"
            continue
        document_page.merge_blocks(provider_lines, text_extraction_method="pdftext")
        document_page.text_extraction_method = "pdftext"

def check_layout_coverage(
    self,
    document_page: PageGroup,
    provider_lines: List[ProviderOutput],
):
    """
    Report whether the provider lines cover enough of the page's layout blocks.

    Blocks whose type is listed in ``excluded_for_coverage`` are ignored. A
    remaining block is "covered" when at least ``layout_coverage_min_lines``
    provider lines have a positive intersection area with it. The page passes
    when the covered fraction reaches ``layout_coverage_threshold``; a page with
    no remaining blocks has a fraction of 1.

    Returns:
        True if coverage is sufficient, or if the only block is a Text block
        overlapping more than 80% with the page polygon; otherwise False.
    """
    candidates = [
        blk
        for blk in [document_page.get_block(ref) for ref in document_page.structure]
        if blk.block_type not in self.excluded_for_coverage
    ]

    overlaps = matrix_intersection_area(
        [blk.polygon.bbox for blk in candidates],
        [entry.line.polygon.bbox for entry in provider_lines],
    )

    block_count = len(candidates)
    covered = 0
    page_sized_text = 0
    for row_idx, blk in enumerate(candidates):
        hits = np.count_nonzero(overlaps[row_idx] > 0)
        if hits >= self.layout_coverage_min_lines:
            covered += 1

        if blk.polygon.intersection_pct(document_page.polygon) > 0.8 and blk.block_type == BlockTypes.Text:
            page_sized_text += 1

    if block_count > 0:
        ratio = covered / block_count
    else:
        ratio = 1

    if ratio >= self.layout_coverage_threshold:
        return True

    # Model will sometimes say there is a single block of text on the page when it is blank
    return block_count == 1 and page_sized_text == 1
page.children = []
Loading
Loading