Merge remote-tracking branch 'origin/dev' into vik_dev

VikParuchuri · Jan 15, 2025 · b0edbe2 · b0edbe2
2 parents 04bb7ad + d82aa47
commit b0edbe2
Show file tree

Hide file tree

Showing 8 changed files with 86 additions and 34 deletions.
diff --git a/marker/config/printer.py b/marker/config/printer.py
@@ -71,7 +71,6 @@ def parse_args(self, ctx, args):
                                 ["--" + class_name_attr, class_name_attr],
                                 type=attr_type,
                                 help=" ".join(metadata),
-                                default=default,
                                 is_flag=is_flag,
                             )
                         )

diff --git a/marker/processors/footnote.py b/marker/processors/footnote.py
@@ -1,3 +1,5 @@
+import re
+
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
@@ -13,6 +15,7 @@ class FootnoteProcessor(BaseProcessor):
     def __call__(self, document: Document):
         for page in document.pages:
             self.push_footnotes_to_bottom(page, document)
+            self.assign_superscripts(page, document)
 
     def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
         footnote_blocks = page.contained_blocks(document, self.block_types)
@@ -24,3 +27,12 @@ def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
                 # Move to bottom if it is
                 page.structure.remove(block.id)
                 page.add_structure(block)
+
+    def assign_superscripts(self, page: PageGroup, document: Document):
+        # Flag the leading span of every footnote on the page whose text opens with
+        # digits or non-word characters; spans after the first are never examined.
+        for footnote in page.contained_blocks(document, self.block_types):
+            spans = footnote.contained_blocks(document, (BlockTypes.Span,))
+            leading = next(iter(spans), None)
+            if leading is not None and re.match(r"^[0-9\W]+", leading.text):
+                leading.has_superscript = True
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
@@ -65,6 +65,10 @@ class PdfProvider(BaseProvider):
         bool,
         "Whether to strip existing OCR text from the PDF.",
     ] = False
+    disable_links: Annotated[
+        bool,
+        "Whether to disable links.",
+    ] = False
 
     def __init__(self, filepath: str, config=None):
         super().__init__(filepath, config)
@@ -168,12 +172,14 @@ def pdftext_extraction(self) -> ProviderPageLines:
             keep_chars=False,
             workers=self.pdftext_workers,
             flatten_pdf=self.flatten_pdf,
-            quote_loosebox=False
+            quote_loosebox=False,
+            disable_links=self.disable_links
         )
         self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}
 
         SpanClass: Span = get_block_class(BlockTypes.Span)
         LineClass: Line = get_block_class(BlockTypes.Line)
+
         for page in page_char_blocks:
             page_id = page["page"]
             lines: List[ProviderOutput] = []
@@ -202,7 +208,9 @@ def pdftext_extraction(self) -> ProviderPageLines:
                                 maximum_position=span["char_end_idx"],
                                 formats=list(font_formats),
                                 page_id=page_id,
-                                text_extraction_method="pdftext"
+                                text_extraction_method="pdftext",
+                                url=span.get("url"),
+                                anchors=span.get("anchors"),
                             )
                         )
                     polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
@@ -214,6 +222,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
                     )
             if self.check_line_spans(lines):
                 page_lines[page_id] = lines
+
         return page_lines
 
     def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool:
@@ -255,7 +264,7 @@ def check_page(self, page_id: int) -> bool:
             font_map = {}
             for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
                 font = pdfium_c.FPDFTextObj_GetFont(text_obj)
-                font_name = self.get_fontname(font)
+                font_name = self._get_fontname(font)
 
                 # we also skip pages without embedded fonts and fonts without names
                 non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
@@ -317,7 +326,8 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:
     def get_page_lines(self, idx: int) -> List[ProviderOutput]:
         return self.page_lines[idx]
 
-    def get_fontname(self, font) -> str:
+    @staticmethod
+    def _get_fontname(font) -> str:
         font_name = ""
         buffer_size = 256
 

diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
@@ -137,6 +137,14 @@ def convert_table(self, el, text, convert_as_inline):
         table_md = '\n'.join(markdown_lines)
         return "\n\n" + table_md + "\n\n"
 
+    def convert_a(self, el, text, convert_as_inline):
+        # Escape exactly once: running escape() again over escaped text can double-escape it.
+        text = re.sub(r"([\[\]])", r"\\\1", self.escape(text))  # a bare [ or ] would break the link label
+        return super().convert_a(el, text, convert_as_inline)
+
+    def convert_span(self, el, text, convert_as_inline):  # raw-HTML anchor for spans with an id; text is kept
+        return (f'<span id="{el["id"]}"/>' if el.has_attr("id") else "") + text
+
 
 class MarkdownOutput(BaseModel):
     markdown: str

diff --git a/marker/schema/blocks/footnote.py b/marker/schema/blocks/footnote.py
@@ -1,28 +1,12 @@
-import re
-
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
 
 
-def superscript(child_blocks):
-    # Superscript leading symbol or digit sequence
-    first_block = None
-    while len(child_blocks) > 0:
-        first_block = child_blocks[0]
-        child_blocks = first_block.children
-
-    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
-        digit_start = r"^([0-9\W]+)(.*)"
-        first_block.html = re.sub(digit_start, r"<sup>\1</sup>\2", first_block.html.lstrip())
-
-
 class Footnote(Block):
     block_type: BlockTypes = BlockTypes.Footnote
 
     def assemble_html(self, document, child_blocks, parent_structure):
         template = super().assemble_html(document, child_blocks, parent_structure)
         template = template.replace("\n", " ")
 
-        # Add superscripts to start
-        superscript(child_blocks)
         return f"<p>{template}</p>"
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -19,9 +19,9 @@ class PageGroup(Group):
     lowres_image: Image.Image | None = None
     highres_image: Image.Image | None = None
     children: List[Union[Any, Block]] | None = None
-    layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)
+    layout_sliced: bool = False  # Whether the layout model had to slice the image (order may be wrong)
     excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
-    maximum_assignment_distance: float = 20 # pixels
+    maximum_assignment_distance: float = 20  # pixels
 
     def incr_block_id(self):
         if self.block_id is None:
@@ -41,7 +41,7 @@ def get_image(self, *args, highres: bool = False, **kwargs):
     def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
         if ignored_block_types is None:
             ignored_block_types = []
-        
+
         structure_idx = 0
         if block is not None:
             structure_idx = self.structure.index(block.id) + 1
@@ -122,7 +122,6 @@ def replace_block(self, block: Block, new_block: Block):
         for child in self.children:
             child.replace_block(block, new_block)
 
-
     def identify_missing_blocks(
             self,
             provider_line_idxs: List[int],
@@ -137,7 +136,7 @@ def identify_missing_blocks(
 
             # if the unassociated line is a new line with minimal area, we can skip it
             if provider_outputs[line_idx].line.polygon.area <= 1 and \
-                provider_outputs[line_idx].raw_text == "\n":
+                    provider_outputs[line_idx].raw_text == "\n":
                 continue
 
             if new_block is None:
@@ -184,7 +183,6 @@ def create_missing_blocks(
             else:
                 self.structure.append(block.id)
 
-
     def add_initial_blocks(
             self,
             block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
@@ -205,7 +203,6 @@ def add_initial_blocks(
                     self.add_full_block(span)
                     line.add_structure(span)
 
-
     def merge_blocks(
         self,
         provider_outputs: List[ProviderOutput],
@@ -257,5 +254,3 @@ def aggregate_block_metadata(self) -> BlockMetadata:
             if block.metadata is not None:
                 self.metadata = self.metadata.merge(block.metadata)
         return self.metadata
-
-
diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
@@ -1,6 +1,6 @@
 import html
 import re
-from typing import List, Literal
+from typing import List, Literal, Optional
 
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
@@ -22,6 +22,9 @@ class Span(Block):
     minimum_position: int
     maximum_position: int
     formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
+    has_superscript: bool = False
+    url: Optional[str] = None
+    anchors: Optional[List[str]] = None
 
     @property
     def bold(self):
@@ -58,10 +61,19 @@ def assemble_html(self, document, child_blocks, parent_structure):
         text = html.escape(text)
         text = cleanup_text(text)
 
+        if self.has_superscript:
+            text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)
+
+        if self.url:
+            text = f"<a href='{self.url}'>{text}</a>"
+
         if self.italic:
-            return f"<i>{text}</i>"
+            text = f"<i>{text}</i>"
         elif self.bold:
-            return f"<b>{text}</b>"
+            text = f"<b>{text}</b>"
         elif self.math:
-            return f"<math display='inline'>{text}</math>"
+            text = f"<math display='inline'>{text}</math>"
+
+        if self.anchors:
+            text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text
         return text
diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py
@@ -0,0 +1,32 @@
+import pytest
+
+from marker.converters.pdf import PdfConverter
+from marker.renderers.markdown import MarkdownOutput
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+@pytest.mark.filename("arxiv_test.pdf")
+@pytest.mark.output_format("markdown")
+@pytest.mark.config({"page_range": [1]})
+def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
+    """Check that link URLs and anchor ids on spans carry through to the rendered markdown."""
+    page = pdf_document.pages[0]
+
+    # The first span mentioning "II." must link to the "page-1-0" anchor.
+    page_spans = page.contained_blocks(pdf_document, (BlockTypes.Span,))
+    linking_span = next((span for span in page_spans if "II." in span.text), None)
+    if linking_span is None:
+        raise ValueError("Could not find II. in the first page")
+    assert linking_span.url == "#page-1-0"
+
+    # The first section header carries that anchor on its first span.
+    header = page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
+    assert header.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
+    assert header.contained_blocks(pdf_document, (BlockTypes.Span,))[0].anchors == ['page-1-0']
+
+    # Both the link and the inline anchor must appear in the converter output.
+    markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
+    markdown = markdown_output.markdown
+    assert '[II.](#page-1-0)' in markdown
+    assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown