Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into vik_dev
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 15, 2025
2 parents 04bb7ad + d82aa47 commit b0edbe2
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 34 deletions.
1 change: 0 additions & 1 deletion marker/config/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def parse_args(self, ctx, args):
["--" + class_name_attr, class_name_attr],
type=attr_type,
help=" ".join(metadata),
default=default,
is_flag=is_flag,
)
)
Expand Down
12 changes: 12 additions & 0 deletions marker/processors/footnote.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
Expand All @@ -13,6 +15,7 @@ class FootnoteProcessor(BaseProcessor):
def __call__(self, document: Document):
    """Run both footnote passes over every page of ``document``.

    For each page, ``push_footnotes_to_bottom`` is called first and
    ``assign_superscripts`` second, each receiving the page and the
    document.
    """
    pages = document.pages
    for current_page in pages:
        self.push_footnotes_to_bottom(current_page, document)
        self.assign_superscripts(current_page, document)

def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
footnote_blocks = page.contained_blocks(document, self.block_types)
Expand All @@ -24,3 +27,12 @@ def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
# Move to bottom if it is
page.structure.remove(block.id)
page.add_structure(block)

def assign_superscripts(self, page: PageGroup, document: Document):
    """Flag the leading span of each footnote whose text starts with a marker.

    For every block of ``self.block_types`` contained in ``page``, the first
    span that carries any non-whitespace text is examined. If that text
    begins with digits and/or non-word characters (for example ``1`` or
    ``*``), the span's ``has_superscript`` flag is set to ``True``.

    Only that first textual span is considered: a footnote marker sits at
    the very start of the footnote, so a later span that merely happens to
    begin with a digit must not be flagged.

    Args:
        page: Page whose footnote blocks are processed.
        document: Document used to resolve contained blocks.
    """
    footnote_blocks = page.contained_blocks(document, self.block_types)

    for block in footnote_blocks:
        for span in block.contained_blocks(document, (BlockTypes.Span,)):
            text = span.text
            if not text or not text.strip():
                # ``\W`` also matches whitespace, so a blank span would
                # otherwise be flagged instead of the real marker span.
                continue
            if re.match(r"^[0-9\W]+", text):
                span.has_superscript = True
            # Stop after the first textual span, marker or not.
            break
18 changes: 14 additions & 4 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ class PdfProvider(BaseProvider):
bool,
"Whether to strip existing OCR text from the PDF.",
] = False
disable_links: Annotated[
bool,
"Whether to disable links.",
] = False

def __init__(self, filepath: str, config=None):
super().__init__(filepath, config)
Expand Down Expand Up @@ -168,12 +172,14 @@ def pdftext_extraction(self) -> ProviderPageLines:
keep_chars=False,
workers=self.pdftext_workers,
flatten_pdf=self.flatten_pdf,
quote_loosebox=False
quote_loosebox=False,
disable_links=self.disable_links
)
self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}

SpanClass: Span = get_block_class(BlockTypes.Span)
LineClass: Line = get_block_class(BlockTypes.Line)

for page in page_char_blocks:
page_id = page["page"]
lines: List[ProviderOutput] = []
Expand Down Expand Up @@ -202,7 +208,9 @@ def pdftext_extraction(self) -> ProviderPageLines:
maximum_position=span["char_end_idx"],
formats=list(font_formats),
page_id=page_id,
text_extraction_method="pdftext"
text_extraction_method="pdftext",
url=span.get("url"),
anchors=span.get("anchors"),
)
)
polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
Expand All @@ -214,6 +222,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
)
if self.check_line_spans(lines):
page_lines[page_id] = lines

return page_lines

def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool:
Expand Down Expand Up @@ -255,7 +264,7 @@ def check_page(self, page_id: int) -> bool:
font_map = {}
for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
font = pdfium_c.FPDFTextObj_GetFont(text_obj)
font_name = self.get_fontname(font)
font_name = self._get_fontname(font)

# we also skip pages without embedded fonts and fonts without names
non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
Expand Down Expand Up @@ -317,7 +326,8 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:
def get_page_lines(self, idx: int) -> List[ProviderOutput]:
    """Return the entry stored in ``self.page_lines`` under ``idx``.

    The lookup is a plain subscript, so whatever error ``self.page_lines``
    raises for an unknown ``idx`` propagates to the caller.
    """
    lines_for_page = self.page_lines[idx]
    return lines_for_page

def get_fontname(self, font) -> str:
@staticmethod
def _get_fontname(font) -> str:
font_name = ""
buffer_size = 256

Expand Down
8 changes: 8 additions & 0 deletions marker/renderers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,14 @@ def convert_table(self, el, text, convert_as_inline):
table_md = '\n'.join(markdown_lines)
return "\n\n" + table_md + "\n\n"

def convert_a(self, el, text, convert_as_inline):
    """Convert an ``<a>`` element, escaping its link text exactly once.

    The text is passed through ``self.escape`` and literal square brackets
    are backslash-escaped so they cannot end the ``[text](url)`` label
    early. The parent class's ``convert_a`` then builds the link.

    Args:
        el: The anchor element being converted.
        text: Already-converted text content of the element.
        convert_as_inline: Forwarded unchanged to the parent converter.

    Returns:
        Whatever the parent ``convert_a`` returns for the escaped text.
    """
    escaped = self.escape(text)
    escaped = re.sub(r"([\[\]])", r"\\\1", escaped)
    # Do not call self.escape again here: a second pass would re-escape the
    # characters introduced by the first one (e.g. ``\*`` -> ``\\*``).
    return super().convert_a(el, escaped, convert_as_inline)

def convert_span(self, el, text, convert_as_inline):
    """Render a ``<span>`` as an inline HTML anchor when it carries an ``id``.

    A span with an ``id`` is emitted as a self-closing ``<span id="..."/>``
    tag followed by any text the span contained, so the id survives in the
    output without losing content. A span without an ``id`` contributes
    only its text instead of raising ``KeyError``.

    Args:
        el: The span element; must support ``.get("id")``.
        text: Already-converted text content of the element.
        convert_as_inline: Unused; accepted for converter-signature parity.

    Returns:
        The anchor tag plus text, or just the text when there is no id.
    """
    anchor_id = el.get("id")
    if not anchor_id:
        return text
    return f'<span id="{anchor_id}"/>{text}'


class MarkdownOutput(BaseModel):
markdown: str
Expand Down
16 changes: 0 additions & 16 deletions marker/schema/blocks/footnote.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,12 @@
import re

from marker.schema import BlockTypes
from marker.schema.blocks import Block


def superscript(child_blocks):
    """Wrap the leading digit/symbol run of the first leaf line in ``<sup>``.

    Follows the first child at every level of ``child_blocks`` until a block
    with no children is reached. If that block's type is ``Line``, its
    ``html`` is left-stripped and a leading run of digits and/or non-word
    characters is wrapped in ``<sup>...</sup>``. The block is modified in
    place and nothing is returned.

    Args:
        child_blocks: Sequence of child block outputs; may be empty or None.
    """
    first_block = None
    # Truthiness ends the descent on an empty list and on ``None`` alike,
    # so a leaf whose ``children`` is unset does not raise ``TypeError``.
    while child_blocks:
        first_block = child_blocks[0]
        child_blocks = first_block.children

    if first_block is None or first_block.id.block_type != BlockTypes.Line:
        return

    digit_start = r"^([0-9\W]+)(.*)"
    first_block.html = re.sub(digit_start, r"<sup>\1</sup>\2", first_block.html.lstrip())


class Footnote(Block):
    """Block for a footnote; renders as a single ``<p>`` paragraph."""

    block_type: BlockTypes = BlockTypes.Footnote

    def assemble_html(self, document, child_blocks, parent_structure):
        """Assemble the footnote HTML on one line inside ``<p>`` tags.

        The parent implementation builds the inner HTML, newlines in it are
        replaced by spaces, and ``superscript`` is then run on
        ``child_blocks`` before the wrapped result is returned.
        """
        inner_html = super().assemble_html(document, child_blocks, parent_structure)
        single_line = inner_html.replace("\n", " ")

        # Superscript the leading marker of the first child line.
        superscript(child_blocks)
        return f"<p>{single_line}</p>"
13 changes: 4 additions & 9 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ class PageGroup(Group):
lowres_image: Image.Image | None = None
highres_image: Image.Image | None = None
children: List[Union[Any, Block]] | None = None
layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)
layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)
excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
maximum_assignment_distance: float = 20 # pixels
maximum_assignment_distance: float = 20 # pixels

def incr_block_id(self):
if self.block_id is None:
Expand All @@ -41,7 +41,7 @@ def get_image(self, *args, highres: bool = False, **kwargs):
def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
if ignored_block_types is None:
ignored_block_types = []

structure_idx = 0
if block is not None:
structure_idx = self.structure.index(block.id) + 1
Expand Down Expand Up @@ -122,7 +122,6 @@ def replace_block(self, block: Block, new_block: Block):
for child in self.children:
child.replace_block(block, new_block)


def identify_missing_blocks(
self,
provider_line_idxs: List[int],
Expand All @@ -137,7 +136,7 @@ def identify_missing_blocks(

# if the unassociated line is a new line with minimal area, we can skip it
if provider_outputs[line_idx].line.polygon.area <= 1 and \
provider_outputs[line_idx].raw_text == "\n":
provider_outputs[line_idx].raw_text == "\n":
continue

if new_block is None:
Expand Down Expand Up @@ -184,7 +183,6 @@ def create_missing_blocks(
else:
self.structure.append(block.id)


def add_initial_blocks(
self,
block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
Expand All @@ -205,7 +203,6 @@ def add_initial_blocks(
self.add_full_block(span)
line.add_structure(span)


def merge_blocks(
self,
provider_outputs: List[ProviderOutput],
Expand Down Expand Up @@ -257,5 +254,3 @@ def aggregate_block_metadata(self) -> BlockMetadata:
if block.metadata is not None:
self.metadata = self.metadata.merge(block.metadata)
return self.metadata


20 changes: 16 additions & 4 deletions marker/schema/text/span.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import html
import re
from typing import List, Literal
from typing import List, Literal, Optional

from marker.schema import BlockTypes
from marker.schema.blocks import Block
Expand All @@ -22,6 +22,9 @@ class Span(Block):
minimum_position: int
maximum_position: int
formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
has_superscript: bool = False
url: Optional[str] = None
anchors: Optional[List[str]] = None

@property
def bold(self):
Expand Down Expand Up @@ -58,10 +61,19 @@ def assemble_html(self, document, child_blocks, parent_structure):
text = html.escape(text)
text = cleanup_text(text)

if self.has_superscript:
text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)

if self.url:
text = f"<a href='{self.url}'>{text}</a>"

if self.italic:
return f"<i>{text}</i>"
text = f"<i>{text}</i>"
elif self.bold:
return f"<b>{text}</b>"
text = f"<b>{text}</b>"
elif self.math:
return f"<math display='inline'>{text}</math>"
text = f"<math display='inline'>{text}</math>"

if self.anchors:
text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text
return text
32 changes: 32 additions & 0 deletions tests/builders/test_pdf_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest

from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownOutput
from marker.schema import BlockTypes
from marker.schema.document import Document


@pytest.mark.filename("arxiv_test.pdf")
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [1]})
def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
    """Check that link URLs and anchors reach the spans and the markdown."""
    page = pdf_document.pages[0]

    # The first span mentioning "II." must link to the section anchor.
    linking_span = next(
        (
            candidate
            for candidate in page.contained_blocks(pdf_document, (BlockTypes.Span,))
            if "II." in candidate.text
        ),
        None,
    )
    if linking_span is None:
        raise ValueError("Could not find II. in the first page")
    assert linking_span.url == "#page-1-0"

    # The first section header is the link target and carries the anchor.
    header_block = page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
    assert header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'

    header_span = header_block.contained_blocks(pdf_document, (BlockTypes.Span,))[0]
    assert header_span.anchors == ['page-1-0']

    # Both the link and the anchor must appear in the rendered markdown.
    rendered: MarkdownOutput = pdf_converter(temp_pdf.name)
    markdown_text = rendered.markdown

    assert '[II.](#page-1-0)' in markdown_text
    assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown_text

0 comments on commit b0edbe2

Please sign in to comment.