Skip to content

Commit

Permalink
Fix various bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 26, 2024
1 parent f26834c commit 360126e
Show file tree
Hide file tree
Showing 11 changed files with 100 additions and 33 deletions.
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -323,25 +323,19 @@ Pass the `debug` option to activate debug mode. This will save images of each p

Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have both a PDF version and a LaTeX source. I convert the LaTeX to text, and compare that reference to the output of each text extraction method. It's noisy, but at least directionally correct.

Benchmarks show that marker is 4x faster than nougat, and more accurate outside arXiv (nougat was trained on arXiv data).

**Speed**

| Method | Average Score | Time per page | Time per document |
|--------|---------------|---------------|-------------------|
| marker | 0.613721 | 0.631991 | 58.1432 |
| nougat | 0.406603 | 2.59702 | 238.926 |
| marker | 0.618355 | 0.250211 | 23.0194 |

**Accuracy**

First 3 are non-arXiv books, last 3 are arXiv papers.

| Method | multicolcnn.pdf | switch_trans.pdf | thinkpython.pdf | thinkos.pdf | thinkdsp.pdf | crowd.pdf |
|--------|-----------------|------------------|-----------------|-------------|--------------|-----------|
| marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 |
| nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 |

Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker. Benchmarks were run on an A6000 Ada.
Peak GPU memory usage during the benchmark is `4.1GB` for marker. Benchmarks were run on an A10.

**Throughput**

Expand Down
2 changes: 2 additions & 0 deletions marker/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
for page, layout_result in zip(pages, layout_results):
layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
provider_page_size = page.polygon.size
page.layout_sliced = layout_result.sliced # This indicates if the page was sliced by the layout model
for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
block_cls = get_block_class(BlockTypes[bbox.label])
layout_block = page.add_block(block_cls, PolygonBox(polygon=bbox.polygon))
Expand Down Expand Up @@ -114,3 +115,4 @@ def check_layout_coverage(
if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
text_okay = True
return text_okay

7 changes: 3 additions & 4 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import os

from marker.processors.footnote import FootnoteProcessor
from marker.processors.line_numbers import LineNumbersProcessor

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

from marker.processors.code import CodeProcessor
from marker.processors.document_toc import DocumentTOCProcessor
from marker.providers.pdf import PdfProvider
from marker.processors.footnote import FootnoteProcessor
from marker.processors.line_numbers import LineNumbersProcessor
from marker.processors.order import OrderProcessor

from marker.util import strings_to_classes

Expand Down
6 changes: 4 additions & 2 deletions marker/processors/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ def draw_layout_debug_images(self, document: Document, pdf_mode=False):
def render_layout_boxes(self, page, png_image):
layout_bboxes = []
layout_labels = []
for child in page.children:
for block_id in page.structure:
child = page.get_block(block_id)
if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
continue

Expand All @@ -134,7 +135,8 @@ def render_layout_boxes(self, page, png_image):
labels=order_labels,
color="green",
draw_bbox=False,
label_offset=5
label_offset=5,
label_font_size=24
)
return png_image

Expand Down
12 changes: 4 additions & 8 deletions marker/processors/footnote.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class FootnoteProcessor(BaseProcessor):
Default is .97
"""
block_types = (BlockTypes.Footnote,)
page_bottom_threshold = .75
line_height_scaler = .99
page_bottom_threshold = .725
line_height_scaler = .97


def __call__(self, document: Document):
Expand All @@ -44,8 +44,7 @@ def compute_block_stats(self, document: Document):
line_heights = []
for page in document.pages:
for footnote in page.contained_blocks(document, self.block_types):
contained_lines = footnote.contained_blocks(document, (BlockTypes.Line,))
line_heights.extend([line.polygon.height for line in contained_lines])
line_heights.append(footnote.line_height(document))
return line_heights


Expand All @@ -54,11 +53,8 @@ def relabel_texts_to_footnotes(self, page: PageGroup, document: Document, avg_fo
block_stats = []

for block in text_blocks:
contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
line_heights = [line.polygon.height for line in contained_lines]

block_stats.append({
"line_height": mean(line_heights) if len(line_heights) > 0 else 999,
"line_height": block.line_height(document),
"in_bottom": block.polygon.y_end > page.polygon.height * self.page_bottom_threshold
})

Expand Down
53 changes: 53 additions & 0 deletions marker/processors/order.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from statistics import mean

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class OrderProcessor(BaseProcessor):
    """
    Re-sorts the top-level blocks of a page when the layout image was sliced.

    A page is only touched when its text extraction method is "pdftext" and its
    `layout_sliced` flag is set; every other page keeps its existing structure.
    """
    block_types = ()

    def __call__(self, document: Document):
        """
        Reorder `page.structure` in place for each eligible page of `document`.

        Every block id in the page structure receives a numeric sort key:
        blocks that contain spans use the midpoint between the first span's
        `minimum_position` and the last span's `maximum_position`; the
        remaining blocks borrow a key from the nearest keyed neighbour.
        """
        for page in document.pages:
            # Only pdftext pages whose layout was sliced need re-sorting.
            if page.text_extraction_method != "pdftext" or not page.layout_sliced:
                continue

            sort_keys = {}

            # Pass 1: key every block that has spans by the midpoint of its span positions.
            for block_id in page.structure:
                spans = document.get_block(block_id).contained_blocks(document, (BlockTypes.Span, ))
                if len(spans) > 0:
                    sort_keys[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2

            # Pass 2: blocks with no key (or a key that is not positive) are placed
            # relative to a neighbour that already has one.
            for block_id in page.structure:
                if sort_keys.get(block_id, 0) > 0:
                    continue

                block = document.get_block(block_id)
                before = document.get_prev_block(block)
                after = document.get_next_block(block)

                # Walk backwards to the closest preceding block that has a key.
                while before and before.id not in sort_keys:
                    before = document.get_prev_block(before)

                # Only look forwards when nothing usable was found behind us.
                if not before:
                    while after and after.id not in sort_keys:
                        after = document.get_next_block(after)

                if before:
                    sort_keys[block_id] = sort_keys[before.id] + 1
                elif after:
                    sort_keys[block_id] = sort_keys[after.id] - 1
                else:
                    # No keyed neighbour in either direction.
                    sort_keys[block_id] = 0

            page.structure = sorted(page.structure, key=lambda structure_id: sort_keys[structure_id])

14 changes: 7 additions & 7 deletions marker/processors/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,23 @@ def __call__(self, document: Document):
line_heights: Dict[int, List[float]] = {}
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
line_heights[block.block_id] = []
if block.structure is not None:
line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]
line_heights[block.id] = block.line_height(document)
else:
line_heights[block.id] = 0

flat_line_heights = [h for heights in line_heights.values() for h in heights]
flat_line_heights = list(line_heights.values())
heading_ranges = self.bucket_headings(flat_line_heights)

for page in document.pages:
for block in page.children:
if block.block_type not in self.block_types:
continue

block_heights = line_heights[block.block_id]
if len(block_heights) > 0:
avg_height = sum(block_heights) / len(block_heights)
block_height = line_heights[block.id]
if block_height > 0:
for idx, (min_height, max_height) in enumerate(heading_ranges):
if avg_height >= min_height * self.height_tolerance:
if block_height >= min_height * self.height_tolerance:
block.heading_level = idx + 1
break

Expand Down
6 changes: 6 additions & 0 deletions marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,9 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect
children=child_content,
section_hierarchy=section_hierarchy
)

def line_height(self, document: Document):
    """
    Return the average height per contained line of this block.

    The value is the block polygon's height divided by the number of Line
    blocks contained in it, or 0 when the block contains no lines.
    """
    line_count = len(self.contained_blocks(document, (BlockTypes.Line,)))
    return self.polygon.height / line_count if line_count else 0
17 changes: 17 additions & 0 deletions marker/schema/blocks/footnote.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,28 @@
import re

from marker.schema import BlockTypes
from marker.schema.blocks import Block


def superscript(child_blocks):
    """
    Wrap the leading digit/symbol run of the first nested line in ``<sup>`` tags.

    Follows the first child at every level (``child_blocks[0]``, then its
    ``children``, and so on) until a block without children is reached. If
    that block is a Line, its ``html`` is left-stripped and rewritten in place
    so that a leading run of digits / non-word characters is superscripted.
    The left-strip is applied even when the pattern does not match.

    Args:
        child_blocks: Child blocks to descend into. May be empty or None, in
            which case nothing is modified.

    Returns:
        None. The matching block's ``html`` attribute is mutated in place.
    """
    # Superscript leading symbol or digit sequence
    first_block = None
    # Truthiness instead of len(): a leaf whose `children` is None must end
    # the descent rather than raise TypeError.
    while child_blocks:
        first_block = child_blocks[0]
        child_blocks = first_block.children

    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
        # NOTE(review): \W also matches whitespace and markup characters such
        # as "<", so html that starts with a tag would be affected - confirm
        # Line html never starts with markup.
        digit_start = r"^([0-9\W]+)(.*)"
        first_block.html = re.sub(digit_start, r"<sup>\1</sup>\2", first_block.html.lstrip())


# Block type for footnotes; renders as a single paragraph with a superscripted marker.
class Footnote(Block):
    block_type: BlockTypes = BlockTypes.Footnote

    def assemble_html(self, child_blocks, parent_structure):
        """
        Build the footnote's HTML as one ``<p>`` element.

        The template produced by the parent class has its newlines replaced
        with spaces, and the first nested line of `child_blocks` gets its
        leading digits/symbols superscripted via `superscript`.
        """
        flattened = super().assemble_html(child_blocks, parent_structure).replace("\n", " ")

        # Superscript the marker at the start of the footnote text.
        superscript(child_blocks)
        return f"<p>{flattened}</p>"
2 changes: 1 addition & 1 deletion marker/schema/blocks/picture.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ class Picture(Block):
block_type: BlockTypes = BlockTypes.Picture

def assemble_html(self, child_blocks, parent_structure):
return f"<p>Image {self.block_id}</p>"
return f"<p>Image {self.id}</p>"
4 changes: 1 addition & 3 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
from marker.schema.groups.base import Group
from marker.schema.polygon import PolygonBox

if TYPE_CHECKING:
from marker.schema.document import Document


class PageGroup(Group):
block_type: BlockTypes = BlockTypes.Page
lowres_image: Image.Image | None = None
highres_image: Image.Image | None = None
children: List[Block] | None = None
layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)

def incr_block_id(self):
if self.block_id is None:
Expand Down

0 comments on commit 360126e

Please sign in to comment.