Skip to content

Commit

Permalink
improved links!
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Jan 17, 2025
1 parent d82aa47 commit 505ec2a
Show file tree
Hide file tree
Showing 22 changed files with 122 additions and 62 deletions.
3 changes: 2 additions & 1 deletion marker/builders/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def build_document(self, provider: PdfProvider):
page_id=p,
lowres_image=lowres_images[i],
highres_image=highres_images[i],
polygon=provider.get_page_bbox(p)
polygon=provider.get_page_bbox(p),
refs=provider.get_page_refs(p)
) for i, p in enumerate(provider.page_range)
]
DocumentClass: Document = get_block_class(BlockTypes.Document)
Expand Down
4 changes: 3 additions & 1 deletion marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import inspect
from collections import defaultdict
from typing import Annotated, Any, Dict, List, Optional, Type
from functools import cache
from typing import Annotated, Any, Dict, List, Optional, Type

from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
Expand All @@ -28,6 +28,7 @@
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.llm.llm_text import LLMTextProcessor
from marker.processors.page_header import PageHeaderProcessor
from marker.processors.reference import ReferenceProcessor
from marker.processors.sectionheader import SectionHeaderProcessor
from marker.processors.table import TableProcessor
from marker.processors.text import TextProcessor
Expand Down Expand Up @@ -82,6 +83,7 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: Optional[List[
LLMTextProcessor,
LLMComplexRegionProcessor,
LLMImageDescriptionProcessor,
ReferenceProcessor,
DebugProcessor,
]

Expand Down
55 changes: 55 additions & 0 deletions marker/processors/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import numpy as np

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Reference
from marker.schema.document import Document
from marker.schema.groups.list import ListGroup
from marker.schema.groups.table import TableGroup
from marker.schema.registry import get_block_class
from marker.schema.groups.picture import PictureGroup
from marker.schema.groups.figure import FigureGroup


class ReferenceProcessor(BaseProcessor):
    """
    A processor for adding references to the document.

    For every reference recorded on a page, the output block whose top-left
    corner (``polygon.bbox[:2]``) is closest to the reference's ``coord`` is
    located, and a new Reference block is inserted at the front of that
    block's structure.
    """

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """
        Attach a Reference block to the nearest output block for each page ref.

        Args:
            document: The document whose pages are updated in place.

        Pages with no refs, or with no candidate blocks, are left untouched.
        """
        ReferenceClass: Reference = get_block_class(BlockTypes.Reference)

        for page in document.pages:
            # `refs` is optional on the page model and may be None or empty;
            # bail out before doing any per-block work in that case.
            refs = page.refs
            if not refs:
                continue

            # Candidate targets: top-level blocks, with list/figure/table
            # groups expanded one level into their children.
            blocks = []
            for block_id in page.structure or []:
                block = page.get_block(block_id)
                if isinstance(block, (ListGroup, FigureGroup, TableGroup)):
                    blocks.extend(page.get_block(b) for b in block.structure or [])
                else:
                    blocks.append(block)
            blocks = [b for b in blocks if not b.ignore_for_output]
            if not blocks:
                continue

            ref_starts = np.array([ref.coord for ref in refs])
            block_starts = np.array([block.polygon.bbox[:2] for block in blocks])

            # distances[i, j] is the Euclidean distance between block i's
            # top-left corner and ref j's coordinate.
            distances = np.linalg.norm(
                block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :],
                axis=2,
            )
            # One argmin over the block axis yields the nearest block per ref.
            nearest_block_idxs = np.argmin(distances, axis=0)

            for ref, block_idx in zip(refs, nearest_block_idxs):
                block = blocks[block_idx]

                # The reference shares the polygon of the block it anchors to.
                ref_block = page.add_full_block(ReferenceClass(
                    ref=ref.ref,
                    polygon=block.polygon,
                    page_id=page.page_id
                ))
                if block.structure is None:
                    block.structure = []
                # Insert first so the anchor precedes the block's own content.
                block.structure.insert(0, ref_block.id)
9 changes: 7 additions & 2 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import atexit
import ctypes
import re
from typing import Annotated, List, Optional, Set
from typing import Annotated, Dict, List, Optional, Set

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from ftfy import fix_text
from pdftext.extraction import dictionary_output
from pdftext.schema import Reference
from PIL import Image

from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
Expand Down Expand Up @@ -74,6 +75,7 @@ def __init__(self, filepath: str, config=None):

self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
self.page_refs: Dict[int, List[Reference]] = {i: [] for i in range(len(self.doc))}

if self.page_range is None:
self.page_range = range(len(self.doc))
Expand Down Expand Up @@ -199,7 +201,6 @@ def pdftext_extraction(self) -> ProviderPageLines:
page_id=page_id,
text_extraction_method="pdftext",
url=span.get("url"),
anchors=span.get("anchors"),
)
)
polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
Expand All @@ -211,6 +212,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
)
if self.check_line_spans(lines):
page_lines[page_id] = lines
self.page_refs[page_id] = page["refs"]

return page_lines

Expand Down Expand Up @@ -311,6 +313,9 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:
def get_page_lines(self, idx: int) -> List[ProviderOutput]:
return self.page_lines[idx]

def get_page_refs(self, idx: int):
return self.page_refs[idx]

@staticmethod
def _get_fontname(font) -> str:
font_name = ""
Expand Down
1 change: 0 additions & 1 deletion marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


class BaseRenderer:
remove_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to ignore while rendering."] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
extract_images: Annotated[bool, "Extract images from the document."] = True

Expand Down
6 changes: 2 additions & 4 deletions marker/renderers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,12 @@ def extract_html(self, document, document_output, level=0):
ref_block_id: BlockId = item.id
break

if ref_block_id.block_type in self.remove_blocks:
ref.replace_with('')
elif ref_block_id.block_type in self.image_blocks:
if ref_block_id.block_type in self.image_blocks:
if self.extract_images:
image = self.extract_image(document, ref_block_id)
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
images[image_name] = image
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
ref.replace_with(BeautifulSoup(f"<p>{content}<img src='{image_name}'></p>", 'html.parser'))
else:
# This will be the image description if using llm mode, or empty if not
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
Expand Down
1 change: 1 addition & 0 deletions marker/schema/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class BlockTypes(str, Enum):
TableOfContents = auto()
Document = auto()
ComplexRegion = auto()
Reference = auto()

def __str__(self):
return self.name
1 change: 1 addition & 0 deletions marker/schema/blocks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@
from marker.schema.blocks.text import Text
from marker.schema.blocks.toc import TableOfContents
from marker.schema.blocks.complexregion import ComplexRegion
from marker.schema.blocks.reference import Reference
6 changes: 6 additions & 0 deletions marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup


class BlockMetadata(BaseModel):
llm_request_count: int = 0
llm_error_count: int = 0
Expand Down Expand Up @@ -76,6 +77,7 @@ class Block(BaseModel):
text_extraction_method: Optional[Literal['pdftext', 'surya', 'gemini']] = None
structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
ignore_for_output: bool = False # Whether this block should be ignored in output
replace_output_newlines: bool = False # Whether to replace newlines with spaces in output
source: Literal['layout', 'heuristics', 'processor'] = 'layout'
top_k: Optional[Dict[BlockTypes, float]] = None
metadata: BlockMetadata | None = None
Expand Down Expand Up @@ -168,6 +170,10 @@ def assemble_html(self, child_blocks: List[BlockOutput], parent_structure: Optio
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"

if self.replace_output_newlines:
template = "<p>" + template.replace("\n", " ") + "</p>"

return template

def assign_section_hierarchy(self, section_hierarchy):
Expand Down
6 changes: 1 addition & 5 deletions marker/schema/blocks/caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,4 @@

class Caption(Block):
block_type: BlockTypes = BlockTypes.Caption

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
replace_output_newlines: bool = True
14 changes: 8 additions & 6 deletions marker/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ class Equation(Block):

def assemble_html(self, child_blocks, parent_structure=None):
if self.latex:
html_out = f"<p block-type='{self.block_type}'>"
child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
html_out = super().assemble_html(child_ref_blocks, parent_structure)
html_out += f"<p block-type='{self.block_type}'>"

try:
latex = self.parse_latex(html.escape(self.latex))
Expand Down Expand Up @@ -43,9 +45,9 @@ def parse_latex(text: str):
("$$", "block"),
("$", "inline")
]
text = text.replace("\n", "<br>") # we can't handle \n's inside <p> properly if we don't do this

text = text.replace("\n", "<br>") # we can't handle \n's inside <p> properly if we don't do this

i = 0
stack = []
result = []
Expand All @@ -72,7 +74,7 @@ def parse_latex(text: str):
else: # No delimiter match
buffer += text[i]
i += 1

if buffer:
result.append({"class": "text", "content": buffer})
return result
return result
7 changes: 4 additions & 3 deletions marker/schema/blocks/figure.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ class Figure(Block):
description: str | None = None

def assemble_html(self, child_blocks, parent_structure):
child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
html = super().assemble_html(child_ref_blocks, parent_structure)
if self.description:
return f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
else:
return ""
html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
return html
7 changes: 1 addition & 6 deletions marker/schema/blocks/footnote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,4 @@

class Footnote(Block):
block_type: BlockTypes = BlockTypes.Footnote

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")

return f"<p>{template}</p>"
replace_output_newlines: bool = True
6 changes: 1 addition & 5 deletions marker/schema/blocks/handwriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,4 @@

class Handwriting(Block):
block_type: BlockTypes = BlockTypes.Handwriting

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
replace_output_newlines: bool = True
10 changes: 2 additions & 8 deletions marker/schema/blocks/pagefooter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,5 @@

class PageFooter(Block):
block_type: str = BlockTypes.PageFooter

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
return ""

template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
replace_output_newlines: bool = True
ignore_for_output: bool = True
10 changes: 2 additions & 8 deletions marker/schema/blocks/pageheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,5 @@

class PageHeader(Block):
block_type: BlockTypes = BlockTypes.PageHeader

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
return ""

template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
replace_output_newlines: bool = True
ignore_for_output: bool = True
11 changes: 11 additions & 0 deletions marker/schema/blocks/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Reference(Block):
    # Anchor block: renders as a <span> whose id attribute is `ref`.
    block_type: BlockTypes = BlockTypes.Reference
    ref: str  # identifier emitted as the span's id

    def assemble_html(self, child_blocks, parent_structure=None):
        """
        Wrap the base block's assembled HTML in a span carrying this ref's id.

        Args:
            child_blocks: Child block outputs, forwarded to the base class.
            parent_structure: Forwarded unchanged to the base class.

        Returns:
            The base template enclosed in ``<span id='...'>``.
        """
        inner_html = super().assemble_html(child_blocks, parent_structure)
        wrapped = f"<span id='{self.ref}'>{inner_html}</span>"
        return wrapped
8 changes: 4 additions & 4 deletions marker/schema/blocks/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class Table(Block):
cells: List[SpanTableCell] | None = None

def assemble_html(self, child_blocks, parent_structure=None):
child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
template = super().assemble_html(child_ref_blocks, parent_structure)
if self.cells:
return str(html_format(self.cells))
else:
template = super().assemble_html(child_blocks, parent_structure)
return f"<p>{template}</p>"
return template + str(html_format(self.cells))
return f"<p>{template}</p>"
2 changes: 2 additions & 0 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from PIL import Image

from pdftext.schema import Reference
from marker.providers import ProviderOutput
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockId, Text
Expand All @@ -22,6 +23,7 @@ class PageGroup(Group):
layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)
excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
maximum_assignment_distance: float = 20 # pixels
refs: List[Reference] | None = None

def incr_block_id(self):
if self.block_id is None:
Expand Down
3 changes: 2 additions & 1 deletion marker/schema/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
Footnote, Form, Handwriting, InlineMath, \
ListItem, PageFooter, PageHeader, Picture, \
SectionHeader, Table, TableOfContents, \
Text
Text, Reference
from marker.schema.blocks.complexregion import ComplexRegion
from marker.schema.document import Document
from marker.schema.groups import FigureGroup, ListGroup, PageGroup, \
Expand Down Expand Up @@ -51,6 +51,7 @@ def get_block_class(block_type: BlockTypes) -> Type[Block]:
register_block_class(BlockTypes.Text, Text)
register_block_class(BlockTypes.TableOfContents, TableOfContents)
register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
register_block_class(BlockTypes.Reference, Reference)
register_block_class(BlockTypes.Document, Document)

assert len(BLOCK_REGISTRY) == len(BlockTypes)
Expand Down
3 changes: 0 additions & 3 deletions marker/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ class Span(Block):
formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
has_superscript: bool = False
url: Optional[str] = None
anchors: Optional[List[str]] = None

@property
def bold(self):
Expand Down Expand Up @@ -74,6 +73,4 @@ def assemble_html(self, child_blocks, parent_structure):
elif self.math:
text = f"<math display='inline'>{text}</math>"

if self.anchors:
text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text
return text
Loading

0 comments on commit 505ec2a

Please sign in to comment.