Skip to content

Commit

Permalink
Fix for bbox sizing
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 27, 2024
1 parent 12950fd commit 58dfbe9
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 62 deletions.
54 changes: 0 additions & 54 deletions marker/processors/footnote.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,65 +24,11 @@ class FootnoteProcessor(BaseProcessor):
Default is .99
"""
block_types = (BlockTypes.Footnote,)
page_bottom_threshold = .75
line_height_scaler = .94


def __call__(self, document: Document):
    """
    Relabel footnote-like text blocks and push footnotes to the page bottom.

    The average footnote line height is compared with the average text line
    height. Text blocks are only relabelled as footnotes when the two
    averages are NOT close to each other (closeness is controlled by
    ``line_height_scaler``). ``push_footnotes_to_bottom`` always runs for
    every page.
    """
    # 999 is a placeholder height used when no blocks of a kind were found,
    # so that mean() always receives at least one value.
    footnote_heights = self.compute_footnote_block_stats(document) or [999]
    text_heights = self.compute_text_block_stats(document) or [999]

    avg_footnote_height = mean(footnote_heights)
    avg_text_height = mean(text_heights)

    lower_bound = avg_footnote_height * self.line_height_scaler
    upper_bound = avg_footnote_height * (1 + 1 - self.line_height_scaler)
    heights_are_similar = lower_bound < avg_text_height < upper_bound

    for page in document.pages:
        if not heights_are_similar:
            self.relabel_texts_to_footnotes(page, document, avg_footnote_height)

        self.push_footnotes_to_bottom(page, document)

def compute_footnote_block_stats(self, document: Document):
    """
    Collect the line height of every footnote block in the document.

    Pages are visited in order; on each page every block matching
    ``self.block_types`` contributes one ``line_height(document)`` value.

    Returns:
        A flat list of line heights (empty when no such block exists).
    """
    return [
        footnote.line_height(document)
        for page in document.pages
        for footnote in page.contained_blocks(document, self.block_types)
    ]

def compute_text_block_stats(self, document: Document):
    """
    Collect the line height of every ``BlockTypes.Text`` block in the document.

    Returns:
        A flat list with one ``line_height(document)`` value per text block,
        in page order (empty when the document has no text blocks).
    """
    text_types = (BlockTypes.Text,)
    return [
        text_block.line_height(document)
        for page in document.pages
        for text_block in page.contained_blocks(document, text_types)
    ]


def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_footnote_height: int):
    """
    Turn text blocks on ``page`` that look like footnotes into Footnote blocks.

    A text block is replaced when both conditions hold:
      * its line height lies strictly between
        ``avg_footnote_height * line_height_scaler`` and
        ``avg_footnote_height * (1 + (1 - line_height_scaler))``;
      * its bottom edge (``polygon.y_end``) is below
        ``page_bottom_threshold`` of the page height.
    """
    text_blocks = page.contained_blocks(document, (BlockTypes.Text,))

    # Measure every block first; replacements only start once all
    # measurements have been taken.
    measurements = [
        (
            block.line_height(document),
            block.polygon.y_end > page.polygon.height * self.page_bottom_threshold,
        )
        for block in text_blocks
    ]
    if not measurements:
        return

    tolerance = 1 - self.line_height_scaler
    for candidate, (line_height, in_bottom) in zip(text_blocks, measurements):
        min_height = avg_footnote_height * self.line_height_scaler
        max_height = avg_footnote_height * (1 + tolerance)
        height_matches = min_height < line_height < max_height
        if height_matches and in_bottom:
            page.replace_block(candidate, Footnote.from_block(candidate))


def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
footnote_blocks = page.contained_blocks(document, self.block_types)
Expand Down
20 changes: 12 additions & 8 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import atexit
import functools
import re
from concurrent.futures.thread import ThreadPoolExecutor
from itertools import repeat
from typing import List, Set, Dict
from typing import List, Set

import pypdfium2 as pdfium
from pdftext.extraction import dictionary_output
Expand Down Expand Up @@ -39,9 +36,13 @@ def __init__(self, filepath: str, config=None):

assert max(self.page_range) < len(self.doc) and min(self.page_range) >= 0, f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."

if not self.force_ocr:
if self.force_ocr:
# Manually assign page bboxes, since we can't get them from pdftext
self.page_bboxes = {i: self.doc[i].get_bbox() for i in self.page_range}
else:
self.page_lines = self.pdftext_extraction()


atexit.register(self.cleanup_pdf_doc)

def __len__(self) -> int:
Expand Down Expand Up @@ -111,6 +112,8 @@ def pdftext_extraction(self) -> ProviderPageLines:
workers=self.pdftext_workers,
flatten_pdf=self.flatten_pdf
)
self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}

SpanClass: Span = get_block_class(BlockTypes.Span)
LineClass: Line = get_block_class(BlockTypes.Line)
for page in page_char_blocks:
Expand Down Expand Up @@ -195,9 +198,10 @@ def get_image(self, idx: int, dpi: int) -> Image.Image:
image = image.convert("RGB")
return image

def get_page_bbox(self, idx: int) -> PolygonBox:
    """Return the bounding box of page ``idx``, taken from ``self.doc``, as a PolygonBox."""
    raw_bbox = self.doc[idx].get_bbox()
    return PolygonBox.from_bbox(raw_bbox)
def get_page_bbox(self, idx: int) -> PolygonBox | None:
    """
    Return the stored bounding box of page ``idx`` as a PolygonBox.

    The box is looked up in ``self.page_bboxes``. ``None`` is returned when
    no entry exists for ``idx`` or the stored entry is empty/falsy.
    """
    bbox = self.page_bboxes.get(idx)
    if not bbox:
        return None
    return PolygonBox.from_bbox(bbox)

def get_page_lines(self, idx: int) -> List[ProviderOutput]:
    """Return the entry stored in ``self.page_lines`` for page ``idx``."""
    lines_for_page = self.page_lines[idx]
    return lines_for_page
12 changes: 12 additions & 0 deletions tests/builders/test_ocr_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,15 @@ def test_ocr_pipeline(pdf_document):
first_span = first_page.get_block(first_text_block.structure[0])
assert first_span.block_type == BlockTypes.Span
assert first_span.text.strip() == 'Subspace Adversarial Training'

# Ensure we match all text lines up properly
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text,))
assert len(text_lines) == 75

# Ensure the bbox sizes match up
max_line_position = max([line.polygon.y_end for line in text_lines])
max_block_position = max([block.polygon.y_end for block in text_blocks if block.source == "layout"])
assert max_line_position <= (max_block_position * 1.02)

19 changes: 19 additions & 0 deletions tests/builders/test_rotated_bboxes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pytest

from marker.schema import BlockTypes


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("adversarial_rot.pdf")
def test_rotated_bboxes(pdf_document):
    """Check that text lines of the rotated PDF stay within the layout text blocks horizontally."""
    page = pdf_document.pages[0]

    # Every text line on the page should have been picked up.
    lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
    blocks = page.contained_blocks(pdf_document, (BlockTypes.Text,))
    assert len(lines) == 97

    # No line may extend further right than the right-most text block
    # whose source is "layout".
    rightmost_line = max(line.polygon.x_end for line in lines)
    rightmost_layout_block = max(
        block.polygon.x_end for block in blocks if block.source == "layout"
    )
    assert rightmost_line <= rightmost_layout_block

0 comments on commit 58dfbe9

Please sign in to comment.