Skip to content

Commit

Permalink
Merge in dev
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 15, 2025
2 parents 8317db4 + 0c92614 commit 81fd95e
Show file tree
Hide file tree
Showing 8 changed files with 34 additions and 12 deletions.
14 changes: 7 additions & 7 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import os

from marker.processors import BaseProcessor
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.providers.registry import provider_from_filepath

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

import inspect
from collections import defaultdict
from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
from functools import cache

from marker.processors import BaseProcessor
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.providers.registry import provider_from_filepath
from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.llm_layout import LLMLayoutBuilder
Expand Down Expand Up @@ -122,12 +121,13 @@ def resolve_dependencies(self, cls):

return cls(**resolved_kwargs)

@cache
def build_document(self, filepath: str):
provider_cls = provider_from_filepath(filepath)
pdf_provider = provider_cls(filepath, self.config)
layout_builder = self.resolve_dependencies(self.layout_builder_class)
ocr_builder = self.resolve_dependencies(OcrBuilder)
document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
with provider_cls(filepath, self.config) as provider:
document = DocumentBuilder(self.config)(provider, layout_builder, ocr_builder)
StructureBuilder(self.config)(document)

for processor_cls in self.processor_list:
Expand Down
3 changes: 2 additions & 1 deletion marker/converters/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def build_document(self, filepath: str):
ocr_builder = self.resolve_dependencies(OcrBuilder)
document_builder = DocumentBuilder(self.config)
document_builder.disable_ocr = True
document = document_builder(pdf_provider, layout_builder, ocr_builder)
with provider_cls(filepath, self.config) as provider:
document = document_builder(provider, layout_builder, ocr_builder)

for page in document.pages:
page.structure = [p for p in page.structure if p.block_type in self.converter_block_types]
Expand Down
12 changes: 9 additions & 3 deletions marker/processors/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ class SectionHeaderProcessor(BaseProcessor):
] = 0.99

def __call__(self, document: Document):
line_heights: Dict[int, List[float]] = {}
line_heights: Dict[int, float] = {}
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue
if block.structure is not None:
line_heights[block.id] = block.line_height(document)
else:
Expand All @@ -49,7 +52,10 @@ def __call__(self, document: Document):
heading_ranges = self.bucket_headings(flat_line_heights)

for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue
block_height = line_heights.get(block.id, 0)
if block_height > 0:
for idx, (min_height, max_height) in enumerate(heading_ranges):
Expand Down
6 changes: 6 additions & 0 deletions marker/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,9 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:

def get_page_lines(self, idx: int) -> List[Line]:
    """Return the lines of page ``idx``.

    This version is a placeholder: it ignores ``idx`` and returns ``None``.
    Presumably concrete providers override it — confirm against subclasses.
    """
    return None

def __enter__(self):
    """Enter a ``with`` block; the provider itself is bound to the ``as`` target."""
    return self

def __exit__(self, exc_type, exc_value, traceback):
    """Leave a ``with`` block; this version always raises.

    The three arguments are the standard context-manager exit triple
    describing an exception raised inside the ``with`` body (all ``None``
    when the body finished normally). They are not inspected here.

    Raises:
        NotImplementedError: always. The message names the concrete class so
            that a provider missing its own ``__exit__`` is easy to identify.
    """
    # NOTE(review): if the ``with`` body is already failing, the exception
    # raised here becomes the one that propagates; the body's exception is
    # kept only as ``__context__``.
    raise NotImplementedError(
        f"{type(self).__name__} must implement __exit__"
    )
3 changes: 3 additions & 0 deletions marker/providers/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ def __init__(self, filepath: str, config=None):
def __len__(self):
    """Return the value stored in ``self.image_count``."""
    count = self.image_count
    return count

def __exit__(self, exc_type, exc_value, traceback):
    """Leave a ``with`` block without performing any action.

    Returns ``None``, so an exception raised inside the ``with`` body is
    not suppressed.
    """
    return None

def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
    """Return the entries of ``self.images`` at positions ``idxs``.

    The result follows the order of ``idxs``; repeated indices yield
    repeated entries. ``dpi`` is accepted but not used by this method.
    """
    selected = []
    for image_idx in idxs:
        selected.append(self.images[image_idx])
    return selected

Expand Down
3 changes: 3 additions & 0 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ def __init__(self, filepath: str, config=None):

atexit.register(self.cleanup_pdf_doc)

def __exit__(self, exc_type, exc_value, traceback):
    """Run ``self.cleanup_pdf_doc()`` when the ``with`` block ends.

    Returns ``None``, so an exception raised inside the ``with`` body is
    not suppressed.
    """
    # NOTE(review): ``cleanup_pdf_doc`` is also registered with ``atexit``,
    # so it may run again at interpreter exit — presumably it tolerates
    # repeated calls; confirm.
    self.cleanup_pdf_doc()
    return None

def __len__(self) -> int:
    """Return ``len(self.doc)`` (presumably the page count — confirm)."""
    document = self.doc
    return len(document)

Expand Down
3 changes: 3 additions & 0 deletions marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def replace_whitespace(match):
break
html = new_merged

# Replace consecutive whitespace
html = re.sub(r'\s+', ' ', html)

return html

def generate_page_stats(self, document: Document, document_output):
Expand Down
2 changes: 1 addition & 1 deletion marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def render(self, document: Document, parent_structure: Optional[List[str]] = Non
section_hierarchy=section_hierarchy
)

def line_height(self, document: Document):
def line_height(self, document: Document) -> float:
lines = self.contained_blocks(document, (BlockTypes.Line,))
if len(lines) == 0:
return 0
Expand Down

0 comments on commit 81fd95e

Please sign in to comment.