Skip to content

Commit

Permalink
Patch section header issue
Browse files: browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 14, 2025
1 parent 130855a commit 0c92614
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
11 changes: 7 additions & 4 deletions marker/processors/sectionheader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -36,9 +36,12 @@ class SectionHeaderProcessor(BaseProcessor):
] = 0.99

def __call__(self, document: Document):
line_heights: Dict[int, List[float]] = {}
line_heights: Dict[int, float] = {}
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue
if block.structure is not None:
line_heights[block.id] = block.line_height(document)
else:
Expand All @@ -49,11 +52,11 @@ def __call__(self, document: Document):
heading_ranges = self.bucket_headings(flat_line_heights)

for page in document.pages:
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue

block_height = line_heights[block.id]
block_height = line_heights.get(block.id, 0)
if block_height > 0:
for idx, (min_height, max_height) in enumerate(heading_ranges):
if block_height >= min_height * self.height_tolerance:
Expand Down
2 changes: 1 addition & 1 deletion marker/schema/blocks/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -220,7 +220,7 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect
section_hierarchy=section_hierarchy
)

def line_height(self, document: Document):
def line_height(self, document: Document) -> float:
lines = self.contained_blocks(document, (BlockTypes.Line,))
if len(lines) == 0:
return 0
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "1.2.3"
version = "1.2.4"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <github@vikas.sh>"]
readme = "README.md"
Expand Down

0 comments on commit 0c92614

Please sign in to comment.