From 4f5a4a924c721fc1a77c14b9390ee23e17917f55 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Thu, 9 Jan 2025 10:49:48 +0000
Subject: [PATCH 01/14] initial implementation of PDF link merging into spans

---
 marker/providers/pdf.py      | 104 +++++++++++++++++++++++++++++++++++
 marker/schema/groups/page.py |  15 ++---
 marker/schema/text/span.py   |   9 ++-
 3 files changed, 115 insertions(+), 13 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 09b9603d..32ee513e 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -1,5 +1,6 @@
 import atexit
 import ctypes
+import math
 import re
 from typing import Annotated, List, Optional, Set
 
@@ -7,6 +8,7 @@
 import pypdfium2.raw as pdfium_c
 from ftfy import fix_text
 from pdftext.extraction import dictionary_output
+from pdftext.schema import Bbox
 from PIL import Image
 
 from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
@@ -16,6 +18,7 @@
 from marker.schema.registry import get_block_class
 from marker.schema.text.line import Line
 from marker.schema.text.span import Span
+from marker.util import matrix_intersection_area
 
 
 class PdfProvider(BaseProvider):
@@ -196,9 +199,36 @@ def pdftext_extraction(self) -> ProviderPageLines:
                         )
                     )
             if self.check_line_spans(lines):
+                self.merge_links(lines, page_id)
                 page_lines[page_id] = lines
         return page_lines
 
+    def merge_links(self, lines, page_id):
+        links = self.get_links(page_id)
+
+        spans = [span for line in lines for span in line.spans]
+        span_bboxes = [span.polygon.bbox for span in spans]
+        link_bboxes = [link['bbox'] for link in links]
+        intersection_matrix = matrix_intersection_area(span_bboxes, link_bboxes)
+        max_intersections = {}
+
+        for span_idx, span in enumerate(spans):
+            intersection_span = intersection_matrix[span_idx]
+            if intersection_span.sum() == 0:
+                continue
+
+            max_intersection = intersection_span.argmax()
+            if intersection_matrix[span_idx, max_intersection] > 0:
+                max_intersections[span_idx] = (
+                    intersection_matrix[span_idx, max_intersection],
+                    links[max_intersection]
+                )
+
+        for span_idx, span in enumerate(spans):
+            if span_idx in max_intersections:
+                link = max_intersections[span_idx][1]
+                span.url = link['url']
+
     def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool:
         page_spans = [span for line in page_lines for span in line.spans]
         if len(page_spans) == 0:
@@ -313,3 +343,77 @@ def get_fontname(self, font) -> str:
             pass
 
         return font_name
+
+    def get_links(self, page_idx):
+        urls = []
+        page = self.doc[page_idx]
+        page_bbox: List[float] = page.get_bbox()
+        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
+        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
+        page_rotation = 0
+        try:
+            page_rotation = page.get_rotation()
+        except:
+            pass
+
+        annot_count = pdfium_c.FPDFPage_GetAnnotCount(page)
+        for i in range(annot_count):
+            url = {
+                'bbox': [],
+                'url': '',
+                'page': page_idx,
+            }
+            annot = pdfium_c.FPDFPage_GetAnnot(page, i)
+            if pdfium_c.FPDFAnnot_GetSubtype(annot) == pdfium_c.FPDF_ANNOT_LINK:
+                fs_rect = pdfium_c.FS_RECTF()
+                success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect))
+                if not success:
+                    continue
+
+                cx_start, cy_start, cx_end, cy_end = [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom]
+
+                cx_start -= page_bbox[0]
+                cx_end -= page_bbox[0]
+                cy_start -= page_bbox[1]
+                cy_end -= page_bbox[1]
+
+                ty_start = page_height - cy_start
+                ty_end = page_height - cy_end
+
+                bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
+                url['bbox'] = Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
+
+                link_obj = pdfium_c.FPDFAnnot_GetLink(annot)
+
+                action = pdfium_c.FPDFLink_GetAction(link_obj)
+                a_type = pdfium_c.FPDFAction_GetType(action)
+
+                if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
+                    continue
+
+                elif a_type == pdfium_c.PDFACTION_GOTO:
+                    # Goto a page
+                    dest = pdfium_c.FPDFAction_GetDest(self.doc, action)
+                    if dest:
+                        tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
+                        url['url'] = f"#page-{tgt_page}"
+
+                # elif a_type == pdfium_c.PDFACTION_LAUNCH:
+                #     # Typically opens a file/app
+                #     path_len = pdfium_c.FPDFAction_GetFilePath(action, None, 0)
+                #     if path_len > 0:
+                #         buf = ctypes.create_string_buffer(path_len)
+                #         pdfium_c.FPDFAction_GetFilePath(action, buf, path_len)
+                #         filepath = buf.raw[:path_len].decode('utf-8', errors='replace').rstrip('\x00')
+
+                elif a_type == pdfium_c.PDFACTION_URI:
+                    # External link
+                    needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0)
+                    if needed_len > 0:
+                        buf = ctypes.create_string_buffer(needed_len)
+                        pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len)
+                        uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
+                        url["url"] = uri
+
+                urls.append(url)
+        return urls
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index 3089f4ce..2c5c30c3 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -19,9 +19,9 @@ class PageGroup(Group):
     lowres_image: Image.Image | None = None
     highres_image: Image.Image | None = None
     children: List[Union[Any, Block]] | None = None
-    layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong)
+    layout_sliced: bool = False  # Whether the layout model had to slice the image (order may be wrong)
     excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
-    maximum_assignment_distance: float = 20 # pixels
+    maximum_assignment_distance: float = 20  # pixels
 
     def incr_block_id(self):
         if self.block_id is None:
@@ -38,7 +38,7 @@ def add_child(self, block: Block):
     def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None):
         if ignored_block_types is None:
             ignored_block_types = []
-        
+
         structure_idx = 0
         if block is not None:
             structure_idx = self.structure.index(block.id) + 1
@@ -78,7 +78,7 @@ def get_block(self, block_id: BlockId) -> Block | None:
         return block
 
     def assemble_html(self, child_blocks, parent_structure=None):
-        template = ""
+        template = f"<span id='page-{self.page_id}'></span>"
         for c in child_blocks:
             template += f"<content-ref src='{c.id}'></content-ref>"
         return template
@@ -119,7 +119,6 @@ def replace_block(self, block: Block, new_block: Block):
         for child in self.children:
             child.replace_block(block, new_block)
 
-
     def identify_missing_blocks(
             self,
             provider_line_idxs: List[int],
@@ -134,7 +133,7 @@ def identify_missing_blocks(
 
             # if the unassociated line is a new line with minimal area, we can skip it
             if provider_outputs[line_idx].line.polygon.area <= 1 and \
-                provider_outputs[line_idx].raw_text == "\n":
+                    provider_outputs[line_idx].raw_text == "\n":
                 continue
 
             if new_block is None:
@@ -181,7 +180,6 @@ def create_missing_blocks(
             else:
                 self.structure.append(block.id)
 
-
     def add_initial_blocks(
             self,
             block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
@@ -202,7 +200,6 @@ def add_initial_blocks(
                     self.add_full_block(span)
                     line.add_structure(span)
 
-
     def merge_blocks(
         self,
         provider_outputs: List[ProviderOutput],
@@ -254,5 +251,3 @@ def aggregate_block_metadata(self) -> BlockMetadata:
             if block.metadata is not None:
                 self.metadata = self.metadata.merge(block.metadata)
         return self.metadata
-
-
diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
index d066ccbe..1131df09 100644
--- a/marker/schema/text/span.py
+++ b/marker/schema/text/span.py
@@ -22,6 +22,7 @@ class Span(Block):
     minimum_position: int
     maximum_position: int
     formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
+    url: str = ''
 
     @property
     def bold(self):
@@ -59,9 +60,11 @@ def assemble_html(self, child_blocks, parent_structure):
         text = cleanup_text(text)
 
         if self.italic:
-            return f"<i>{text}</i>"
+            text = f"<i>{text}</i>"
         elif self.bold:
-            return f"<b>{text}</b>"
+            text = f"<b>{text}</b>"
         elif self.math:
-            return f"<math display='inline'>{text}</math>"
+            text = f"<math display='inline'>{text}</math>"
+        elif self.url:
+            text = f"<a href='{self.url}'>{text}</a>"
         return text

From 6f0166e51d0e300e66c6722aa17b1a996e732b89 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Fri, 10 Jan 2025 09:27:15 +0000
Subject: [PATCH 02/14] add internal ref anchors, split spans at link boundaries, and fix markdown link conversion

---
 marker/providers/pdf.py      | 251 +++++++++++++++++++++++++++--------
 marker/renderers/markdown.py |   8 ++
 marker/schema/groups/page.py |   2 +-
 marker/schema/text/span.py   |   9 +-
 4 files changed, 212 insertions(+), 58 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 32ee513e..47701574 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -2,8 +2,9 @@
 import ctypes
 import math
 import re
-from typing import Annotated, List, Optional, Set
+from typing import Annotated, List, Optional, Set, Tuple
 
+import numpy as np
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from ftfy import fix_text
@@ -67,12 +68,17 @@ class PdfProvider(BaseProvider):
         bool,
         "Whether to strip existing OCR text from the PDF.",
     ] = False
+    disable_links: Annotated[
+        bool,
+        "Whether to disable links.",
+    ] = False
 
     def __init__(self, filepath: str, config=None):
         super().__init__(filepath, config)
 
         self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
         self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
+        self.refs = {}
 
         if self.page_range is None:
             self.page_range = range(len(self.doc))
@@ -151,7 +157,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
         page_char_blocks = dictionary_output(
             self.filepath,
             page_range=self.page_range,
-            keep_chars=False,
+            keep_chars=True,
             workers=self.pdftext_workers,
             flatten_pdf=self.flatten_pdf,
             quote_loosebox=False
@@ -160,6 +166,14 @@ def pdftext_extraction(self) -> ProviderPageLines:
 
         SpanClass: Span = get_block_class(BlockTypes.Span)
         LineClass: Line = get_block_class(BlockTypes.Line)
+        for page in page_char_blocks:
+            if not self.disable_links:
+                self.merge_links(page)
+
+        for page in page_char_blocks:
+            if not self.disable_links:
+                self.merge_refs(page)
+
         for page in page_char_blocks:
             page_id = page["page"]
             lines: List[ProviderOutput] = []
@@ -188,7 +202,9 @@ def pdftext_extraction(self) -> ProviderPageLines:
                                 maximum_position=span["char_end_idx"],
                                 formats=list(font_formats),
                                 page_id=page_id,
-                                text_extraction_method="pdftext"
+                                text_extraction_method="pdftext",
+                                url=span.get("url"),
+                                anchor=span.get("anchor"),
                             )
                         )
                     polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
@@ -199,15 +215,17 @@ def pdftext_extraction(self) -> ProviderPageLines:
                         )
                     )
             if self.check_line_spans(lines):
-                self.merge_links(lines, page_id)
                 page_lines[page_id] = lines
+
         return page_lines
 
-    def merge_links(self, lines, page_id):
+    def merge_links(self, page):
+        page_id = page["page"]
+
         links = self.get_links(page_id)
 
-        spans = [span for line in lines for span in line.spans]
-        span_bboxes = [span.polygon.bbox for span in spans]
+        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']]
+        span_bboxes = [span['bbox'] for span in spans]
         link_bboxes = [link['bbox'] for link in links]
         intersection_matrix = matrix_intersection_area(span_bboxes, link_bboxes)
         max_intersections = {}
@@ -224,10 +242,98 @@ def merge_links(self, lines, page_id):
                     links[max_intersection]
                 )
 
+        span_replace_map = {}
         for span_idx, span in enumerate(spans):
             if span_idx in max_intersections:
                 link = max_intersections[span_idx][1]
-                span.url = link['url']
+                if link['dest_page'] is not None:
+                    dest_page = link['dest_page']
+                    link['url'] = f"#page-{dest_page}"
+                    self.refs.setdefault(dest_page, [])
+                    if link['dest_bbox']:
+                        dest_box = "-".join(map(str, link['dest_bbox']))                        
+                    else:
+                        dest_box = "0.0-0.0-1.0-1.0"
+                    if dest_box not in self.refs[dest_page]:
+                        self.refs[dest_page].append(dest_box)
+                    link['url'] += f"-{self.refs[dest_page].index(dest_box)}"
+                span_replace_map[span_idx] = self.break_spans(span, link)
+            span_idx += 1
+
+        span_idx = 0
+        for block in page["blocks"]:
+            for line in block["lines"]:
+                spans = []
+                for span in line["spans"]:
+                    if not span["text"]:
+                        continue
+                    if span_idx in span_replace_map:
+                        spans.extend(span_replace_map[span_idx])
+                    else:
+                        spans.append(span)
+                    span_idx += 1
+                line['spans'] = spans
+
+    def merge_refs(self, page):
+        page_id = page["page"]
+
+        refs = self.refs.get(page_id, [])
+        if not refs:
+            return
+
+        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']]
+
+        span_starts = np.array([span['bbox'][:2] for span in spans])
+        ref_bboxes = np.array([list(map(float, ref.split("-"))) for ref in refs])
+        ref_starts = np.array([bbox[:2] for bbox in ref_bboxes])
+
+        distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
+
+        assigned_refs = set()
+        for ref_idx, ref_center in enumerate(ref_starts):
+            if ref_idx in assigned_refs:
+                continue
+
+            span_indices = np.argsort(distances[:, ref_idx])
+            for span_idx in span_indices:
+                if spans[span_idx].get('anchor') is None:
+                    spans[span_idx]['anchor'] = f"page-{page_id}-{ref_idx}"
+                    assigned_refs.add(ref_idx)
+                    break
+
+    def break_spans(self, orig_span, link):
+        spans = []
+        span = None
+        link_bbox = Bbox(link['bbox'])
+
+        for char in orig_span['chars']:
+            char_bbox = Bbox(char['bbox'])
+            char_in_link = bool(link_bbox.intersection_pct(char_bbox) > 0)
+
+            if not span or (char_in_link != span['char_in_link']):
+                span = {
+                    "bbox": char_bbox,
+                    "text": char["char"],
+                    "rotation": char["rotation"],
+                    "font": char["font"],
+                    "char_start_idx": char["char_idx"],
+                    "char_end_idx": char["char_idx"],
+                    "chars": [char],
+                    "url": link['url'] if char_in_link else '',
+                    "char_in_link": char_in_link
+                }
+                spans.append(span)
+            else:
+                span['text'] += char['char']
+                span['char_end_idx'] = char['char_idx']
+                span['bbox'] = span['bbox'].merge(char_bbox)
+                span['chars'].append(char)
+
+        for span in spans:
+            span['bbox'] = span['bbox'].bbox
+            del span['char_in_link']
+
+        return spans
 
     def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool:
         page_spans = [span for line in page_lines for span in line.spans]
@@ -344,6 +450,44 @@ def get_fontname(self, font) -> str:
 
         return font_name
 
+    def get_dest_position(self, dest) -> Optional[Tuple[float, float]]:
+        has_x = ctypes.c_int()
+        has_y = ctypes.c_int()
+        has_zoom = ctypes.c_int()
+        x_coord = ctypes.c_float()
+        y_coord = ctypes.c_float()
+        zoom_level = ctypes.c_float()
+        success = pdfium_c.FPDFDest_GetLocationInPage(
+            dest,
+            ctypes.byref(has_x),
+            ctypes.byref(has_y),
+            ctypes.byref(has_zoom),
+            ctypes.byref(x_coord),
+            ctypes.byref(y_coord),
+            ctypes.byref(zoom_level)
+        )
+        if success:
+            if has_x.value and has_y.value:
+                return x_coord.value, y_coord.value
+        else:
+            return None
+
+    def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rotation) -> List[float]:
+        cx_start, cy_start, cx_end, cy_end = rect
+        cx_start -= page_bbox[0]
+        cx_end -= page_bbox[0]
+        cy_start -= page_bbox[1]
+        cy_end -= page_bbox[1]
+
+        ty_start = page_height - cy_start
+        ty_end = page_height - cy_end
+
+        bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
+        return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
+
+    def xy_to_scaled_bbox(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]:
+        return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation)
+
     def get_links(self, page_idx):
         urls = []
         page = self.doc[page_idx]
@@ -358,10 +502,12 @@ def get_links(self, page_idx):
 
         annot_count = pdfium_c.FPDFPage_GetAnnotCount(page)
         for i in range(annot_count):
-            url = {
-                'bbox': [],
-                'url': '',
+            link = {
+                'bbox': None,
                 'page': page_idx,
+                'dest_page': None,
+                'dest_bbox': None,
+                'url': None,
             }
             annot = pdfium_c.FPDFPage_GetAnnot(page, i)
             if pdfium_c.FPDFAnnot_GetSubtype(annot) == pdfium_c.FPDF_ANNOT_LINK:
@@ -369,51 +515,46 @@ def get_links(self, page_idx):
                 success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect))
                 if not success:
                     continue
-
-                cx_start, cy_start, cx_end, cy_end = [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom]
-
-                cx_start -= page_bbox[0]
-                cx_end -= page_bbox[0]
-                cy_start -= page_bbox[1]
-                cy_end -= page_bbox[1]
-
-                ty_start = page_height - cy_start
-                ty_end = page_height - cy_end
-
-                bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
-                url['bbox'] = Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
+                link['bbox'] = self.rect_to_scaled_bbox(
+                    [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom],
+                    page_bbox, page_height, page_width, page_rotation
+                )
 
                 link_obj = pdfium_c.FPDFAnnot_GetLink(annot)
 
-                action = pdfium_c.FPDFLink_GetAction(link_obj)
-                a_type = pdfium_c.FPDFAction_GetType(action)
-
-                if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
-                    continue
-
-                elif a_type == pdfium_c.PDFACTION_GOTO:
-                    # Goto a page
-                    dest = pdfium_c.FPDFAction_GetDest(self.doc, action)
-                    if dest:
-                        tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
-                        url['url'] = f"#page-{tgt_page}"
-
-                # elif a_type == pdfium_c.PDFACTION_LAUNCH:
-                #     # Typically opens a file/app
-                #     path_len = pdfium_c.FPDFAction_GetFilePath(action, None, 0)
-                #     if path_len > 0:
-                #         buf = ctypes.create_string_buffer(path_len)
-                #         pdfium_c.FPDFAction_GetFilePath(action, buf, path_len)
-                #         filepath = buf.raw[:path_len].decode('utf-8', errors='replace').rstrip('\x00')
-
-                elif a_type == pdfium_c.PDFACTION_URI:
-                    # External link
-                    needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0)
-                    if needed_len > 0:
-                        buf = ctypes.create_string_buffer(needed_len)
-                        pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len)
-                        uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
-                        url["url"] = uri
-
-                urls.append(url)
+                dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj)
+                if dest:
+                    tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
+                    link['dest_page'] = tgt_page
+                    dest_position = self.get_dest_position(dest)
+                    if dest_position:
+                        link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation)
+
+                else:
+                    action = pdfium_c.FPDFLink_GetAction(link_obj)
+                    a_type = pdfium_c.FPDFAction_GetType(action)
+
+                    if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
+                        continue
+
+                    elif a_type == pdfium_c.PDFACTION_GOTO:
+                        # Goto a page
+                        dest = pdfium_c.FPDFAction_GetDest(self.doc, action)
+                        if dest:
+                            tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
+                            link['dest_page'] = tgt_page
+                            dest_position = self.get_dest_position(dest)
+                            if dest_position:
+                                link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation)
+
+                    elif a_type == pdfium_c.PDFACTION_URI:
+                        # External link
+                        needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0)
+                        if needed_len > 0:
+                            buf = ctypes.create_string_buffer(needed_len)
+                            pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len)
+                            uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
+                            link["url"] = uri
+
+                urls.append(link)
         return urls
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index 46bb62ba..dd386a33 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -61,6 +61,14 @@ def convert_th(self, el, text, convert_as_inline):
         text = text.replace("|", " ").replace("\n", " ")
         return super().convert_th(el, text, convert_as_inline)
 
+    def convert_a(self, el, text, convert_as_inline):
+        text = self.escape(text)
+        text = re.sub(r"([\[\]])", r"\\\1", text)
+        return super().convert_a(el, self.escape(text), convert_as_inline)
+
+    def convert_span(self, el, text, convert_as_inline):
+        return str(el)
+
 
 class MarkdownOutput(BaseModel):
     markdown: str
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index 2c5c30c3..00282899 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -78,7 +78,7 @@ def get_block(self, block_id: BlockId) -> Block | None:
         return block
 
     def assemble_html(self, child_blocks, parent_structure=None):
-        template = f"<span id='page-{self.page_id}'></span>"
+        template = ""
         for c in child_blocks:
             template += f"<content-ref src='{c.id}'></content-ref>"
         return template
diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
index 1131df09..66015e6d 100644
--- a/marker/schema/text/span.py
+++ b/marker/schema/text/span.py
@@ -1,6 +1,6 @@
 import html
 import re
-from typing import List, Literal
+from typing import List, Literal, Optional
 
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
@@ -22,7 +22,8 @@ class Span(Block):
     minimum_position: int
     maximum_position: int
     formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
-    url: str = ''
+    url: Optional[str] = None
+    anchor: Optional[str] = None
 
     @property
     def bold(self):
@@ -65,6 +66,10 @@ def assemble_html(self, child_blocks, parent_structure):
             text = f"<b>{text}</b>"
         elif self.math:
             text = f"<math display='inline'>{text}</math>"
+        elif self.url and self.anchor:
+            text = f"<span id='{self.anchor}'><a href='{self.url}'>{text}</a></span>"
         elif self.url:
             text = f"<a href='{self.url}'>{text}</a>"
+        elif self.anchor:
+            text = f"<span id='{self.anchor}'>{text}</span>"
         return text

From 602cad14e9845d16ee4f313feeef882ececf8d92 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Fri, 10 Jan 2025 16:03:36 +0000
Subject: [PATCH 03/14] support multiple anchors per span and put the anchors
 at the beginning

---
 marker/providers/pdf.py      | 38 ++++++++++++++++++------------------
 marker/renderers/markdown.py |  2 +-
 marker/schema/text/span.py   |  9 ++++-----
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 47701574..5b73f9d0 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -204,7 +204,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
                                 page_id=page_id,
                                 text_extraction_method="pdftext",
                                 url=span.get("url"),
-                                anchor=span.get("anchor"),
+                                anchors=span.get("anchors"),
                             )
                         )
                     polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
@@ -248,15 +248,15 @@ def merge_links(self, page):
                 link = max_intersections[span_idx][1]
                 if link['dest_page'] is not None:
                     dest_page = link['dest_page']
-                    link['url'] = f"#page-{dest_page}"
                     self.refs.setdefault(dest_page, [])
-                    if link['dest_bbox']:
-                        dest_box = "-".join(map(str, link['dest_bbox']))                        
+                    link['url'] = f"#page-{dest_page}"
+                    if link['dest_pos']:
+                        dest_pos = link['dest_pos']
                     else:
-                        dest_box = "0.0-0.0-1.0-1.0"
-                    if dest_box not in self.refs[dest_page]:
-                        self.refs[dest_page].append(dest_box)
-                    link['url'] += f"-{self.refs[dest_page].index(dest_box)}"
+                        dest_pos = [0.0, 0.0]
+                    if dest_pos not in self.refs[dest_page]:
+                        self.refs[dest_page].append(dest_pos)
+                    link['url'] += f"-{self.refs[dest_page].index(dest_pos)}"
                 span_replace_map[span_idx] = self.break_spans(span, link)
             span_idx += 1
 
@@ -284,8 +284,8 @@ def merge_refs(self, page):
         spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']]
 
         span_starts = np.array([span['bbox'][:2] for span in spans])
-        ref_bboxes = np.array([list(map(float, ref.split("-"))) for ref in refs])
-        ref_starts = np.array([bbox[:2] for bbox in ref_bboxes])
+        ref_pos = np.array([ref for ref in refs])
+        ref_starts = np.array([pos for pos in ref_pos])
 
         distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
 
@@ -296,10 +296,10 @@ def merge_refs(self, page):
 
             span_indices = np.argsort(distances[:, ref_idx])
             for span_idx in span_indices:
-                if spans[span_idx].get('anchor') is None:
-                    spans[span_idx]['anchor'] = f"page-{page_id}-{ref_idx}"
-                    assigned_refs.add(ref_idx)
-                    break
+                spans[span_idx].setdefault('anchors', [])
+                spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")
+                assigned_refs.add(ref_idx)
+                break
 
     def break_spans(self, orig_span, link):
         spans = []
@@ -485,8 +485,8 @@ def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rot
         bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
         return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
 
-    def xy_to_scaled_bbox(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]:
-        return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation)
+    def xy_to_scaled_pos(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]:
+        return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation)[:2]
 
     def get_links(self, page_idx):
         urls = []
@@ -506,7 +506,7 @@ def get_links(self, page_idx):
                 'bbox': None,
                 'page': page_idx,
                 'dest_page': None,
-                'dest_bbox': None,
+                'dest_pos': None,
                 'url': None,
             }
             annot = pdfium_c.FPDFPage_GetAnnot(page, i)
@@ -528,7 +528,7 @@ def get_links(self, page_idx):
                     link['dest_page'] = tgt_page
                     dest_position = self.get_dest_position(dest)
                     if dest_position:
-                        link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation)
+                        link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation)
 
                 else:
                     action = pdfium_c.FPDFLink_GetAction(link_obj)
@@ -545,7 +545,7 @@ def get_links(self, page_idx):
                             link['dest_page'] = tgt_page
                             dest_position = self.get_dest_position(dest)
                             if dest_position:
-                                link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation)
+                                link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation)
 
                     elif a_type == pdfium_c.PDFACTION_URI:
                         # External link
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index dd386a33..65d0a27d 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -67,7 +67,7 @@ def convert_a(self, el, text, convert_as_inline):
         return super().convert_a(el, self.escape(text), convert_as_inline)
 
     def convert_span(self, el, text, convert_as_inline):
-        return str(el)
+        return f'<span id="{el["id"]}"/>'
 
 
 class MarkdownOutput(BaseModel):
diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
index 66015e6d..86fe24bc 100644
--- a/marker/schema/text/span.py
+++ b/marker/schema/text/span.py
@@ -23,7 +23,7 @@ class Span(Block):
     maximum_position: int
     formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
     url: Optional[str] = None
-    anchor: Optional[str] = None
+    anchors: Optional[List[str]] = None
 
     @property
     def bold(self):
@@ -66,10 +66,9 @@ def assemble_html(self, child_blocks, parent_structure):
             text = f"<b>{text}</b>"
         elif self.math:
             text = f"<math display='inline'>{text}</math>"
-        elif self.url and self.anchor:
-            text = f"<span id='{self.anchor}'><a href='{self.url}'>{text}</a></span>"
         elif self.url:
             text = f"<a href='{self.url}'>{text}</a>"
-        elif self.anchor:
-            text = f"<span id='{self.anchor}'>{text}</span>"
+
+        if self.anchors:
+            text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text
         return text

From 2e48bd8813f2b9e1276ed8efc4bb18efbb36cbb3 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Fri, 10 Jan 2025 19:20:10 +0000
Subject: [PATCH 04/14] fix bugs and remove defaults

---
 marker/config/printer.py | 1 -
 marker/providers/pdf.py  | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/marker/config/printer.py b/marker/config/printer.py
index e9b37b42..1f4991f8 100644
--- a/marker/config/printer.py
+++ b/marker/config/printer.py
@@ -44,7 +44,6 @@ def parse_args(self, ctx, args):
                                 options,
                                 type=attr_type,
                                 help=" ".join(metadata),
-                                default=default,
                                 is_flag=is_flag,
                             )
                         )
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 5b73f9d0..ba8dbdb6 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -282,6 +282,8 @@ def merge_refs(self, page):
             return
 
         spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']]
+        if not spans:
+            return
 
         span_starts = np.array([span['bbox'][:2] for span in spans])
         ref_pos = np.array([ref for ref in refs])

From 0a0a7f2bcd184eaa786aa5d55989ade878c07a5a Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Mon, 13 Jan 2025 07:51:35 +0000
Subject: [PATCH 05/14] bugfix bbox

---
 marker/providers/pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index ba8dbdb6..5b1b340f 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -484,7 +484,7 @@ def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rot
         ty_start = page_height - cy_start
         ty_end = page_height - cy_end
 
-        bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
+        bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
         return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
 
     def xy_to_scaled_pos(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]:

From baf7a3ac055a915bf7d2054ba9ce3669f4144960 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Mon, 13 Jan 2025 11:11:18 +0000
Subject: [PATCH 06/14] fix multiple links associated with a single span

---
 marker/providers/pdf.py | 80 ++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 41 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 5b1b340f..64a4aaf9 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -224,51 +224,42 @@ def merge_links(self, page):
 
         links = self.get_links(page_id)
 
-        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']]
+        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
         span_bboxes = [span['bbox'] for span in spans]
         link_bboxes = [link['bbox'] for link in links]
-        intersection_matrix = matrix_intersection_area(span_bboxes, link_bboxes)
-        max_intersections = {}
+        intersection_matrix = matrix_intersection_area(link_bboxes, span_bboxes)
 
-        for span_idx, span in enumerate(spans):
-            intersection_span = intersection_matrix[span_idx]
-            if intersection_span.sum() == 0:
+        span_link_map = {}
+        for link_idx, link in enumerate(links):
+            intersection_link = intersection_matrix[link_idx]
+            if intersection_link.sum() == 0:
                 continue
 
-            max_intersection = intersection_span.argmax()
-            if intersection_matrix[span_idx, max_intersection] > 0:
-                max_intersections[span_idx] = (
-                    intersection_matrix[span_idx, max_intersection],
-                    links[max_intersection]
-                )
+            max_intersection = intersection_link.argmax()
+            span = spans[max_intersection]
 
-        span_replace_map = {}
-        for span_idx, span in enumerate(spans):
-            if span_idx in max_intersections:
-                link = max_intersections[span_idx][1]
-                if link['dest_page'] is not None:
-                    dest_page = link['dest_page']
-                    self.refs.setdefault(dest_page, [])
-                    link['url'] = f"#page-{dest_page}"
-                    if link['dest_pos']:
-                        dest_pos = link['dest_pos']
-                    else:
-                        dest_pos = [0.0, 0.0]
-                    if dest_pos not in self.refs[dest_page]:
-                        self.refs[dest_page].append(dest_pos)
-                    link['url'] += f"-{self.refs[dest_page].index(dest_pos)}"
-                span_replace_map[span_idx] = self.break_spans(span, link)
-            span_idx += 1
+            if link['dest_page'] is not None:
+                dest_page = link['dest_page']
+                self.refs.setdefault(dest_page, [])
+                link['url'] = f"#page-{dest_page}"
+                if link['dest_pos']:
+                    dest_pos = link['dest_pos']
+                else:
+                    dest_pos = [0.0, 0.0]
+                if dest_pos not in self.refs[dest_page]:
+                    self.refs[dest_page].append(dest_pos)
+                link['url'] += f"-{self.refs[dest_page].index(dest_pos)}"
+
+            span_link_map.setdefault(max_intersection, [])
+            span_link_map[max_intersection].append(link)
 
         span_idx = 0
         for block in page["blocks"]:
             for line in block["lines"]:
                 spans = []
                 for span in line["spans"]:
-                    if not span["text"]:
-                        continue
-                    if span_idx in span_replace_map:
-                        spans.extend(span_replace_map[span_idx])
+                    if span_idx in span_link_map:
+                        spans.extend(self.break_spans(span, span_link_map[span_idx]))
                     else:
                         spans.append(span)
                     span_idx += 1
@@ -303,16 +294,25 @@ def merge_refs(self, page):
                 assigned_refs.add(ref_idx)
                 break
 
-    def break_spans(self, orig_span, link):
+    def break_spans(self, orig_span, links):
         spans = []
         span = None
-        link_bbox = Bbox(link['bbox'])
+        link_bboxes = [Bbox(link['bbox']) for link in links]
 
         for char in orig_span['chars']:
             char_bbox = Bbox(char['bbox'])
-            char_in_link = bool(link_bbox.intersection_pct(char_bbox) > 0)
-
-            if not span or (char_in_link != span['char_in_link']):
+            intersections = []
+            for i, link_bbox in enumerate(link_bboxes):
+                area = link_bbox.intersection_area(char_bbox)
+                if area > 0:
+                    intersections.append((area, links[i]))
+
+            current_url = ''
+            if intersections:
+                intersections.sort(key=lambda x: x[0], reverse=True)
+                current_url = intersections[0][1]['url']
+
+            if not span or current_url != span['url']:
                 span = {
                     "bbox": char_bbox,
                     "text": char["char"],
@@ -321,8 +321,7 @@ def break_spans(self, orig_span, link):
                     "char_start_idx": char["char_idx"],
                     "char_end_idx": char["char_idx"],
                     "chars": [char],
-                    "url": link['url'] if char_in_link else '',
-                    "char_in_link": char_in_link
+                    "url": current_url
                 }
                 spans.append(span)
             else:
@@ -333,7 +332,6 @@ def break_spans(self, orig_span, link):
 
         for span in spans:
             span['bbox'] = span['bbox'].bbox
-            del span['char_in_link']
 
         return spans
 

From 646769b34a70ce7c17a88b7177bfb46e6c79611c Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Mon, 13 Jan 2025 17:35:14 +0000
Subject: [PATCH 07/14] fix footnotes [skip ci]

---
 marker/processors/footnote.py    | 12 ++++++++++++
 marker/schema/blocks/footnote.py | 16 ----------------
 marker/schema/text/span.py       |  9 +++++++--
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/marker/processors/footnote.py b/marker/processors/footnote.py
index fdf31721..41f350a2 100644
--- a/marker/processors/footnote.py
+++ b/marker/processors/footnote.py
@@ -1,3 +1,5 @@
+import re
+
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
@@ -13,6 +15,7 @@ class FootnoteProcessor(BaseProcessor):
     def __call__(self, document: Document):
         for page in document.pages:
             self.push_footnotes_to_bottom(page, document)
+            self.assign_superscripts(page, document)
 
     def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
         footnote_blocks = page.contained_blocks(document, self.block_types)
@@ -24,3 +27,12 @@ def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
                 # Move to bottom if it is
                 page.structure.remove(block.id)
                 page.add_structure(block)
+
+    def assign_superscripts(self, page: PageGroup, document: Document):
+        footnote_blocks = page.contained_blocks(document, self.block_types)
+
+        for block in footnote_blocks:
+            for span in block.contained_blocks(document, (BlockTypes.Span,)):
+                if re.match(r"^[0-9\W]+", span.text):
+                    span.has_superscript = True
+                break
diff --git a/marker/schema/blocks/footnote.py b/marker/schema/blocks/footnote.py
index c476aa7d..71d3580b 100644
--- a/marker/schema/blocks/footnote.py
+++ b/marker/schema/blocks/footnote.py
@@ -1,21 +1,7 @@
-import re
-
 from marker.schema import BlockTypes
 from marker.schema.blocks import Block
 
 
-def superscript(child_blocks):
-    # Superscript leading symbol or digit sequence
-    first_block = None
-    while len(child_blocks) > 0:
-        first_block = child_blocks[0]
-        child_blocks = first_block.children
-
-    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
-        digit_start = r"^([0-9\W]+)(.*)"
-        first_block.html = re.sub(digit_start, r"<sup>\1</sup>\2", first_block.html.lstrip())
-
-
 class Footnote(Block):
     block_type: BlockTypes = BlockTypes.Footnote
 
@@ -23,6 +9,4 @@ def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
 
-        # Add superscripts to start
-        superscript(child_blocks)
         return f"<p>{template}</p>"
diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py
index 86fe24bc..1b6e18f2 100644
--- a/marker/schema/text/span.py
+++ b/marker/schema/text/span.py
@@ -22,6 +22,7 @@ class Span(Block):
     minimum_position: int
     maximum_position: int
     formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
+    has_superscript: bool = False
     url: Optional[str] = None
     anchors: Optional[List[str]] = None
 
@@ -60,14 +61,18 @@ def assemble_html(self, child_blocks, parent_structure):
         text = html.escape(text)
         text = cleanup_text(text)
 
+        if self.has_superscript:
+            text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)
+
+        if self.url:
+            text = f"<a href='{self.url}'>{text}</a>"
+
         if self.italic:
             text = f"<i>{text}</i>"
         elif self.bold:
             text = f"<b>{text}</b>"
         elif self.math:
             text = f"<math display='inline'>{text}</math>"
-        elif self.url:
-            text = f"<a href='{self.url}'>{text}</a>"
 
         if self.anchors:
             text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text

From cb2c26f5ac4f0ebf49085275570e3008eb3852d6 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Mon, 13 Jan 2025 18:45:53 +0000
Subject: [PATCH 08/14] more cleanup [skip ci]

---
 marker/providers/pdf.py | 49 ++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 64a4aaf9..cc31c100 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -238,17 +238,24 @@ def merge_links(self, page):
             max_intersection = intersection_link.argmax()
             span = spans[max_intersection]
 
-            if link['dest_page'] is not None:
-                dest_page = link['dest_page']
-                self.refs.setdefault(dest_page, [])
-                link['url'] = f"#page-{dest_page}"
-                if link['dest_pos']:
-                    dest_pos = link['dest_pos']
-                else:
-                    dest_pos = [0.0, 0.0]
-                if dest_pos not in self.refs[dest_page]:
-                    self.refs[dest_page].append(dest_pos)
-                link['url'] += f"-{self.refs[dest_page].index(dest_pos)}"
+            if link['dest_page'] is None:
+                continue
+
+            dest_page = link['dest_page']
+            self.refs.setdefault(dest_page, [])
+            link['url'] = f"#page-{dest_page}"
+            if link['dest_pos']:
+                dest_pos = link['dest_pos']
+            else:
+                # Don't link to self if there is no dest_pos
+                if dest_page == page_id:
+                    continue
+                dest_pos = [0.0, 0.0]
+
+            if dest_pos not in self.refs[dest_page]:
+                self.refs[dest_page].append(dest_pos)
+
+            link['url'] += f"-{self.refs[dest_page].index(dest_pos)}"
 
             span_link_map.setdefault(max_intersection, [])
             span_link_map[max_intersection].append(link)
@@ -272,27 +279,19 @@ def merge_refs(self, page):
         if not refs:
             return
 
-        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']]
+        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
         if not spans:
             return
 
         span_starts = np.array([span['bbox'][:2] for span in spans])
-        ref_pos = np.array([ref for ref in refs])
-        ref_starts = np.array([pos for pos in ref_pos])
+        ref_starts = np.array(refs)
 
         distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
 
-        assigned_refs = set()
-        for ref_idx, ref_center in enumerate(ref_starts):
-            if ref_idx in assigned_refs:
-                continue
-
-            span_indices = np.argsort(distances[:, ref_idx])
-            for span_idx in span_indices:
-                spans[span_idx].setdefault('anchors', [])
-                spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")
-                assigned_refs.add(ref_idx)
-                break
+        for ref_idx in range(len(ref_starts)):
+            span_idx = np.argmin(distances[:, ref_idx])
+            spans[span_idx].setdefault('anchors', [])
+            spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")
 
     def break_spans(self, orig_span, links):
         spans = []

From 18eea949cbe8a774e93492e96fa094bda20dd9dd Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Tue, 14 Jan 2025 09:14:00 +0000
Subject: [PATCH 09/14] cleanup [skip ci]

---
 marker/providers/pdf.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index cc31c100..dba6785e 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -166,12 +166,12 @@ def pdftext_extraction(self) -> ProviderPageLines:
 
         SpanClass: Span = get_block_class(BlockTypes.Span)
         LineClass: Line = get_block_class(BlockTypes.Line)
-        for page in page_char_blocks:
-            if not self.disable_links:
+
+        if not self.disable_links:
+            for page in page_char_blocks:
                 self.merge_links(page)
 
-        for page in page_char_blocks:
-            if not self.disable_links:
+            for page in page_char_blocks:
                 self.merge_refs(page)
 
         for page in page_char_blocks:

From 152727bb5e18c3df83aa0e334c97a84cd2057198 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Tue, 14 Jan 2025 10:14:52 +0000
Subject: [PATCH 10/14] more minor cleanup [skip ci]

---
 marker/providers/pdf.py | 131 +++++++++++++++++++++++-----------------
 1 file changed, 75 insertions(+), 56 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index dba6785e..d9d344e4 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -220,6 +220,10 @@ def pdftext_extraction(self) -> ProviderPageLines:
         return page_lines
 
     def merge_links(self, page):
+        """
+        Merges links with spans. Some spans can also have multiple links associated with them.
+        We break up the spans and reconstruct them taking the links into account.
+        """
         page_id = page["page"]
 
         links = self.get_links(page_id)
@@ -266,13 +270,17 @@ def merge_links(self, page):
                 spans = []
                 for span in line["spans"]:
                     if span_idx in span_link_map:
-                        spans.extend(self.break_spans(span, span_link_map[span_idx]))
+                        spans.extend(self._reconstruct_spans(span, span_link_map[span_idx]))
                     else:
                         spans.append(span)
                     span_idx += 1
                 line['spans'] = spans
 
     def merge_refs(self, page):
+        """
+        We associate each reference to the nearest span.
+        """
+
         page_id = page["page"]
 
         refs = self.refs.get(page_id, [])
@@ -293,7 +301,10 @@ def merge_refs(self, page):
             spans[span_idx].setdefault('anchors', [])
             spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")
 
-    def break_spans(self, orig_span, links):
+    def _reconstruct_spans(self, orig_span: dict, links: List[dict]):
+        """
+        Reconstructs the spans by breaking them up into smaller spans based on the links.
+        """
         spans = []
         span = None
         link_bboxes = [Bbox(link['bbox']) for link in links]
@@ -369,7 +380,7 @@ def check_page(self, page_id: int) -> bool:
             font_map = {}
             for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
                 font = pdfium_c.FPDFTextObj_GetFont(text_obj)
-                font_name = self.get_fontname(font)
+                font_name = self._get_fontname(font)
 
                 # we also skip pages without embedded fonts and fonts without names
                 non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
@@ -431,7 +442,8 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:
     def get_page_lines(self, idx: int) -> List[ProviderOutput]:
         return self.page_lines[idx]
 
-    def get_fontname(self, font) -> str:
+    @staticmethod
+    def _get_fontname(font) -> str:
         font_name = ""
         buffer_size = 256
 
@@ -449,7 +461,8 @@ def get_fontname(self, font) -> str:
 
         return font_name
 
-    def get_dest_position(self, dest) -> Optional[Tuple[float, float]]:
+    @staticmethod
+    def _get_dest_position(dest) -> Optional[Tuple[float, float]]:
         has_x = ctypes.c_int()
         has_y = ctypes.c_int()
         has_zoom = ctypes.c_int()
@@ -471,7 +484,11 @@ def get_dest_position(self, dest) -> Optional[Tuple[float, float]]:
         else:
             return None
 
-    def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rotation) -> List[float]:
+    @staticmethod
+    def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
+        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
+        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
+
         cx_start, cy_start, cx_end, cy_end = rect
         cx_start -= page_bbox[0]
         cx_end -= page_bbox[0]
@@ -484,15 +501,14 @@ def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rot
         bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
         return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
 
-    def xy_to_scaled_pos(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]:
-        return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation)[:2]
+    @staticmethod
+    def _xy_to_scaled_pos(x, y, page_bbox, page_rotation, expand_by=1) -> List[float]:
+        return PdfProvider._rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_rotation)[:2]
 
     def get_links(self, page_idx):
         urls = []
         page = self.doc[page_idx]
         page_bbox: List[float] = page.get_bbox()
-        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
-        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
         page_rotation = 0
         try:
             page_rotation = page.get_rotation()
@@ -509,51 +525,54 @@ def get_links(self, page_idx):
                 'url': None,
             }
             annot = pdfium_c.FPDFPage_GetAnnot(page, i)
-            if pdfium_c.FPDFAnnot_GetSubtype(annot) == pdfium_c.FPDF_ANNOT_LINK:
-                fs_rect = pdfium_c.FS_RECTF()
-                success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect))
-                if not success:
+            if pdfium_c.FPDFAnnot_GetSubtype(annot) != pdfium_c.FPDF_ANNOT_LINK:
+                continue
+
+            fs_rect = pdfium_c.FS_RECTF()
+            success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect))
+            if not success:
+                continue
+
+            link['bbox'] = self._rect_to_scaled_bbox(
+                [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom],
+                page_bbox, page_rotation
+            )
+
+            link_obj = pdfium_c.FPDFAnnot_GetLink(annot)
+
+            dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj)
+            if dest:
+                tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
+                link['dest_page'] = tgt_page
+                dest_position = self._get_dest_position(dest)
+                if dest_position:
+                    link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation)
+
+            else:
+                action = pdfium_c.FPDFLink_GetAction(link_obj)
+                a_type = pdfium_c.FPDFAction_GetType(action)
+
+                if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
                     continue
-                link['bbox'] = self.rect_to_scaled_bbox(
-                    [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom],
-                    page_bbox, page_height, page_width, page_rotation
-                )
-
-                link_obj = pdfium_c.FPDFAnnot_GetLink(annot)
-
-                dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj)
-                if dest:
-                    tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
-                    link['dest_page'] = tgt_page
-                    dest_position = self.get_dest_position(dest)
-                    if dest_position:
-                        link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation)
-
-                else:
-                    action = pdfium_c.FPDFLink_GetAction(link_obj)
-                    a_type = pdfium_c.FPDFAction_GetType(action)
-
-                    if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
-                        continue
-
-                    elif a_type == pdfium_c.PDFACTION_GOTO:
-                        # Goto a page
-                        dest = pdfium_c.FPDFAction_GetDest(self.doc, action)
-                        if dest:
-                            tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
-                            link['dest_page'] = tgt_page
-                            dest_position = self.get_dest_position(dest)
-                            if dest_position:
-                                link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation)
-
-                    elif a_type == pdfium_c.PDFACTION_URI:
-                        # External link
-                        needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0)
-                        if needed_len > 0:
-                            buf = ctypes.create_string_buffer(needed_len)
-                            pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len)
-                            uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
-                            link["url"] = uri
-
-                urls.append(link)
+
+                elif a_type == pdfium_c.PDFACTION_GOTO:
+                    # Goto a page
+                    dest = pdfium_c.FPDFAction_GetDest(self.doc, action)
+                    if dest:
+                        tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
+                        link['dest_page'] = tgt_page
+                        dest_position = self._get_dest_position(dest)
+                        if dest_position:
+                            link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation)
+
+                elif a_type == pdfium_c.PDFACTION_URI:
+                    # External link
+                    needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0)
+                    if needed_len > 0:
+                        buf = ctypes.create_string_buffer(needed_len)
+                        pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len)
+                        uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
+                        link["url"] = uri
+
+            urls.append(link)
         return urls

From 5ac8b0db8e76009623ff9d2935c84a8831f86f5e Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Tue, 14 Jan 2025 12:31:53 +0000
Subject: [PATCH 11/14] add test for pdf link and reference

---
 tests/builders/test_pdf_links.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 tests/builders/test_pdf_links.py

diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py
new file mode 100644
index 00000000..00bff7ee
--- /dev/null
+++ b/tests/builders/test_pdf_links.py
@@ -0,0 +1,32 @@
+import pytest
+
+from marker.converters.pdf import PdfConverter
+from marker.renderers.markdown import MarkdownOutput
+from marker.schema import BlockTypes
+from marker.schema.document import Document
+
+
+@pytest.mark.filename("arxiv_test.pdf")
+@pytest.mark.output_format("markdown")
+@pytest.mark.config({"page_range": [1]})
+def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
+    first_page = pdf_document.pages[0]
+
+    for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
+        if section_header_span.text == " II.":
+            assert section_header_span.url == "#page-1-0"
+            break
+    else:
+        raise ValueError("Could not find II. in the first page")
+
+    section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
+    assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
+
+    section_header_span = section_header_block.contained_blocks(pdf_document, (BlockTypes.Span,))[0]
+    assert section_header_span.anchors == ['page-1-0']
+
+    markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
+    markdown = markdown_output.markdown
+
+    assert '[II.](#page-1-0)' in markdown
+    assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown

From 2049068e8e2cc5a8a03f3d6b30773ee1bef764d7 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Tue, 14 Jan 2025 12:39:32 +0000
Subject: [PATCH 12/14] more cleanup

---
 marker/providers/pdf.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index d9d344e4..ab441c5f 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -78,7 +78,6 @@ def __init__(self, filepath: str, config=None):
 
         self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
         self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
-        self.refs = {}
 
         if self.page_range is None:
             self.page_range = range(len(self.doc))
@@ -168,11 +167,15 @@ def pdftext_extraction(self) -> ProviderPageLines:
         LineClass: Line = get_block_class(BlockTypes.Line)
 
         if not self.disable_links:
+            refs = {}
+
+            # we first go through the entire document and merge links and collect refs
             for page in page_char_blocks:
-                self.merge_links(page)
+                self.merge_links(page, refs)
 
+            # we can now merge the collected refs for each page
             for page in page_char_blocks:
-                self.merge_refs(page)
+                self.merge_refs(page, refs)
 
         for page in page_char_blocks:
             page_id = page["page"]
@@ -219,7 +222,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
 
         return page_lines
 
-    def merge_links(self, page):
+    def merge_links(self, page, refs):
         """
         Merges links with spans. Some spans can also have multiple links associated with them.
         We break up the spans and reconstruct them taking the links into account.
@@ -246,7 +249,7 @@ def merge_links(self, page):
                 continue
 
             dest_page = link['dest_page']
-            self.refs.setdefault(dest_page, [])
+            refs.setdefault(dest_page, [])
             link['url'] = f"#page-{dest_page}"
             if link['dest_pos']:
                 dest_pos = link['dest_pos']
@@ -254,12 +257,13 @@ def merge_links(self, page):
                 # Don't link to self if there is no dest_pos
                 if dest_page == page_id:
                     continue
+                # if we don't have a dest pos, we just link to the top of the page
                 dest_pos = [0.0, 0.0]
 
-            if dest_pos not in self.refs[dest_page]:
-                self.refs[dest_page].append(dest_pos)
+            if dest_pos not in refs[dest_page]:
+                refs[dest_page].append(dest_pos)
 
-            link['url'] += f"-{self.refs[dest_page].index(dest_pos)}"
+            link['url'] += f"-{refs[dest_page].index(dest_pos)}"
 
             span_link_map.setdefault(max_intersection, [])
             span_link_map[max_intersection].append(link)
@@ -276,15 +280,15 @@ def merge_links(self, page):
                     span_idx += 1
                 line['spans'] = spans
 
-    def merge_refs(self, page):
+    def merge_refs(self, page, refs):
         """
         We associate each reference to the nearest span.
         """
 
         page_id = page["page"]
 
-        refs = self.refs.get(page_id, [])
-        if not refs:
+        page_refs = refs.get(page_id, [])
+        if not page_refs:
             return
 
         spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
@@ -292,7 +296,7 @@ def merge_refs(self, page):
             return
 
         span_starts = np.array([span['bbox'][:2] for span in spans])
-        ref_starts = np.array(refs)
+        ref_starts = np.array(page_refs)
 
         distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
 

From 0dc0e948ef6236a4b43e7cda2db812691560eb72 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Tue, 14 Jan 2025 17:06:31 +0000
Subject: [PATCH 13/14] move link code to pdftext

---
 marker/providers/pdf.py          | 263 +------------------------------
 tests/builders/test_pdf_links.py |   2 +-
 2 files changed, 4 insertions(+), 261 deletions(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index ab441c5f..8232e742 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -1,15 +1,12 @@
 import atexit
 import ctypes
-import math
 import re
-from typing import Annotated, List, Optional, Set, Tuple
+from typing import Annotated, List, Optional, Set
 
-import numpy as np
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from ftfy import fix_text
 from pdftext.extraction import dictionary_output
-from pdftext.schema import Bbox
 from PIL import Image
 
 from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
@@ -19,7 +16,6 @@
 from marker.schema.registry import get_block_class
 from marker.schema.text.line import Line
 from marker.schema.text.span import Span
-from marker.util import matrix_intersection_area
 
 
 class PdfProvider(BaseProvider):
@@ -159,24 +155,14 @@ def pdftext_extraction(self) -> ProviderPageLines:
             keep_chars=True,
             workers=self.pdftext_workers,
             flatten_pdf=self.flatten_pdf,
-            quote_loosebox=False
+            quote_loosebox=False,
+            disable_links=self.disable_links
         )
         self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}
 
         SpanClass: Span = get_block_class(BlockTypes.Span)
         LineClass: Line = get_block_class(BlockTypes.Line)
 
-        if not self.disable_links:
-            refs = {}
-
-            # we first go through the entire document and merge links and collect refs
-            for page in page_char_blocks:
-                self.merge_links(page, refs)
-
-            # we can now merge the collected refs for each page
-            for page in page_char_blocks:
-                self.merge_refs(page, refs)
-
         for page in page_char_blocks:
             page_id = page["page"]
             lines: List[ProviderOutput] = []
@@ -222,133 +208,6 @@ def pdftext_extraction(self) -> ProviderPageLines:
 
         return page_lines
 
-    def merge_links(self, page, refs):
-        """
-        Merges links with spans. Some spans can also have multiple links associated with them.
-        We break up the spans and reconstruct them taking the links into account.
-        """
-        page_id = page["page"]
-
-        links = self.get_links(page_id)
-
-        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
-        span_bboxes = [span['bbox'] for span in spans]
-        link_bboxes = [link['bbox'] for link in links]
-        intersection_matrix = matrix_intersection_area(link_bboxes, span_bboxes)
-
-        span_link_map = {}
-        for link_idx, link in enumerate(links):
-            intersection_link = intersection_matrix[link_idx]
-            if intersection_link.sum() == 0:
-                continue
-
-            max_intersection = intersection_link.argmax()
-            span = spans[max_intersection]
-
-            if link['dest_page'] is None:
-                continue
-
-            dest_page = link['dest_page']
-            refs.setdefault(dest_page, [])
-            link['url'] = f"#page-{dest_page}"
-            if link['dest_pos']:
-                dest_pos = link['dest_pos']
-            else:
-                # Don't link to self if there is no dest_pos
-                if dest_page == page_id:
-                    continue
-                # if we don't have a dest pos, we just link to the top of the page
-                dest_pos = [0.0, 0.0]
-
-            if dest_pos not in refs[dest_page]:
-                refs[dest_page].append(dest_pos)
-
-            link['url'] += f"-{refs[dest_page].index(dest_pos)}"
-
-            span_link_map.setdefault(max_intersection, [])
-            span_link_map[max_intersection].append(link)
-
-        span_idx = 0
-        for block in page["blocks"]:
-            for line in block["lines"]:
-                spans = []
-                for span in line["spans"]:
-                    if span_idx in span_link_map:
-                        spans.extend(self._reconstruct_spans(span, span_link_map[span_idx]))
-                    else:
-                        spans.append(span)
-                    span_idx += 1
-                line['spans'] = spans
-
-    def merge_refs(self, page, refs):
-        """
-        We associate each reference to the nearest span.
-        """
-
-        page_id = page["page"]
-
-        page_refs = refs.get(page_id, [])
-        if not page_refs:
-            return
-
-        spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
-        if not spans:
-            return
-
-        span_starts = np.array([span['bbox'][:2] for span in spans])
-        ref_starts = np.array(page_refs)
-
-        distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
-
-        for ref_idx in range(len(ref_starts)):
-            span_idx = np.argmin(distances[:, ref_idx])
-            spans[span_idx].setdefault('anchors', [])
-            spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")
-
-    def _reconstruct_spans(self, orig_span: dict, links: List[dict]):
-        """
-        Reconstructs the spans by breaking them up into smaller spans based on the links.
-        """
-        spans = []
-        span = None
-        link_bboxes = [Bbox(link['bbox']) for link in links]
-
-        for char in orig_span['chars']:
-            char_bbox = Bbox(char['bbox'])
-            intersections = []
-            for i, link_bbox in enumerate(link_bboxes):
-                area = link_bbox.intersection_area(char_bbox)
-                if area > 0:
-                    intersections.append((area, links[i]))
-
-            current_url = ''
-            if intersections:
-                intersections.sort(key=lambda x: x[0], reverse=True)
-                current_url = intersections[0][1]['url']
-
-            if not span or current_url != span['url']:
-                span = {
-                    "bbox": char_bbox,
-                    "text": char["char"],
-                    "rotation": char["rotation"],
-                    "font": char["font"],
-                    "char_start_idx": char["char_idx"],
-                    "char_end_idx": char["char_idx"],
-                    "chars": [char],
-                    "url": current_url
-                }
-                spans.append(span)
-            else:
-                span['text'] += char['char']
-                span['char_end_idx'] = char['char_idx']
-                span['bbox'] = span['bbox'].merge(char_bbox)
-                span['chars'].append(char)
-
-        for span in spans:
-            span['bbox'] = span['bbox'].bbox
-
-        return spans
-
     def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool:
         page_spans = [span for line in page_lines for span in line.spans]
         if len(page_spans) == 0:
@@ -464,119 +323,3 @@ def _get_fontname(font) -> str:
             pass
 
         return font_name
-
-    @staticmethod
-    def _get_dest_position(dest) -> Optional[Tuple[float, float]]:
-        has_x = ctypes.c_int()
-        has_y = ctypes.c_int()
-        has_zoom = ctypes.c_int()
-        x_coord = ctypes.c_float()
-        y_coord = ctypes.c_float()
-        zoom_level = ctypes.c_float()
-        success = pdfium_c.FPDFDest_GetLocationInPage(
-            dest,
-            ctypes.byref(has_x),
-            ctypes.byref(has_y),
-            ctypes.byref(has_zoom),
-            ctypes.byref(x_coord),
-            ctypes.byref(y_coord),
-            ctypes.byref(zoom_level)
-        )
-        if success:
-            if has_x.value and has_y.value:
-                return x_coord.value, y_coord.value
-        else:
-            return None
-
-    @staticmethod
-    def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
-        page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
-        page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
-
-        cx_start, cy_start, cx_end, cy_end = rect
-        cx_start -= page_bbox[0]
-        cx_end -= page_bbox[0]
-        cy_start -= page_bbox[1]
-        cy_end -= page_bbox[1]
-
-        ty_start = page_height - cy_start
-        ty_end = page_height - cy_end
-
-        bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
-        return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
-
-    @staticmethod
-    def _xy_to_scaled_pos(x, y, page_bbox, page_rotation, expand_by=1) -> List[float]:
-        return PdfProvider._rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_rotation)[:2]
-
-    def get_links(self, page_idx):
-        urls = []
-        page = self.doc[page_idx]
-        page_bbox: List[float] = page.get_bbox()
-        page_rotation = 0
-        try:
-            page_rotation = page.get_rotation()
-        except:
-            pass
-
-        annot_count = pdfium_c.FPDFPage_GetAnnotCount(page)
-        for i in range(annot_count):
-            link = {
-                'bbox': None,
-                'page': page_idx,
-                'dest_page': None,
-                'dest_pos': None,
-                'url': None,
-            }
-            annot = pdfium_c.FPDFPage_GetAnnot(page, i)
-            if pdfium_c.FPDFAnnot_GetSubtype(annot) != pdfium_c.FPDF_ANNOT_LINK:
-                continue
-
-            fs_rect = pdfium_c.FS_RECTF()
-            success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect))
-            if not success:
-                continue
-
-            link['bbox'] = self._rect_to_scaled_bbox(
-                [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom],
-                page_bbox, page_rotation
-            )
-
-            link_obj = pdfium_c.FPDFAnnot_GetLink(annot)
-
-            dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj)
-            if dest:
-                tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
-                link['dest_page'] = tgt_page
-                dest_position = self._get_dest_position(dest)
-                if dest_position:
-                    link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation)
-
-            else:
-                action = pdfium_c.FPDFLink_GetAction(link_obj)
-                a_type = pdfium_c.FPDFAction_GetType(action)
-
-                if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
-                    continue
-
-                elif a_type == pdfium_c.PDFACTION_GOTO:
-                    # Goto a page
-                    dest = pdfium_c.FPDFAction_GetDest(self.doc, action)
-                    if dest:
-                        tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest)
-                        link['dest_page'] = tgt_page
-                        dest_position = self._get_dest_position(dest)
-                        if dest_position:
-                            link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation)
-
-                elif a_type == pdfium_c.PDFACTION_URI:
-                    # External link
-                    needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0)
-                    if needed_len > 0:
-                        buf = ctypes.create_string_buffer(needed_len)
-                        pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len)
-                        uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
-                        link["url"] = uri
-
-            urls.append(link)
-        return urls
diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py
index 00bff7ee..72a97070 100644
--- a/tests/builders/test_pdf_links.py
+++ b/tests/builders/test_pdf_links.py
@@ -13,7 +13,7 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf
     first_page = pdf_document.pages[0]
 
     for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
-        if section_header_span.text == " II.":
+        if "II." in section_header_span.text:
             assert section_header_span.url == "#page-1-0"
             break
     else:

From 54ab2ccb25d9f83c151b83021ddd03569dfaff4a Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Tue, 14 Jan 2025 17:26:16 +0000
Subject: [PATCH 14/14] keep_chars=False [skip ci]

---
 marker/providers/pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 8232e742..f568cb42 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -152,7 +152,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
         page_char_blocks = dictionary_output(
             self.filepath,
             page_range=self.page_range,
-            keep_chars=True,
+            keep_chars=False,
             workers=self.pdftext_workers,
             flatten_pdf=self.flatten_pdf,
             quote_loosebox=False,