From 4f5a4a924c721fc1a77c14b9390ee23e17917f55 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Thu, 9 Jan 2025 10:49:48 +0000 Subject: [PATCH 01/14] initial pdf link merging impl --- marker/providers/pdf.py | 104 +++++++++++++++++++++++++++++++++++ marker/schema/groups/page.py | 15 ++--- marker/schema/text/span.py | 9 ++- 3 files changed, 115 insertions(+), 13 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 09b9603d..32ee513e 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -1,5 +1,6 @@ import atexit import ctypes +import math import re from typing import Annotated, List, Optional, Set @@ -7,6 +8,7 @@ import pypdfium2.raw as pdfium_c from ftfy import fix_text from pdftext.extraction import dictionary_output +from pdftext.schema import Bbox from PIL import Image from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines @@ -16,6 +18,7 @@ from marker.schema.registry import get_block_class from marker.schema.text.line import Line from marker.schema.text.span import Span +from marker.util import matrix_intersection_area class PdfProvider(BaseProvider): @@ -196,9 +199,36 @@ def pdftext_extraction(self) -> ProviderPageLines: ) ) if self.check_line_spans(lines): + self.merge_links(lines, page_id) page_lines[page_id] = lines return page_lines + def merge_links(self, lines, page_id): + links = self.get_links(page_id) + + spans = [span for line in lines for span in line.spans] + span_bboxes = [span.polygon.bbox for span in spans] + link_bboxes = [link['bbox'] for link in links] + intersection_matrix = matrix_intersection_area(span_bboxes, link_bboxes) + max_intersections = {} + + for span_idx, span in enumerate(spans): + intersection_span = intersection_matrix[span_idx] + if intersection_span.sum() == 0: + continue + + max_intersection = intersection_span.argmax() + if intersection_matrix[span_idx, max_intersection] > 0: + max_intersections[span_idx] = ( + intersection_matrix[span_idx, max_intersection], + links[max_intersection] + ) + + for span_idx, span in enumerate(spans): + if span_idx in max_intersections: + link = max_intersections[span_idx][1] + span.url = link['url'] + def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool: page_spans = [span for line in page_lines for span in line.spans] if len(page_spans) == 0: @@ -313,3 +343,77 @@ def get_fontname(self, font) -> str: pass return font_name + + def get_links(self, page_idx): + urls = [] + page = self.doc[page_idx] + page_bbox: List[float] = page.get_bbox() + page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) + page_height = math.ceil(abs(page_bbox[1] - page_bbox[3])) + page_rotation = 0 + try: + page_rotation = page.get_rotation() + except: + pass + + annot_count = pdfium_c.FPDFPage_GetAnnotCount(page) + for i in range(annot_count): + url = { + 'bbox': [], + 'url': '', + 'page': page_idx, + } + annot = pdfium_c.FPDFPage_GetAnnot(page, i) + if pdfium_c.FPDFAnnot_GetSubtype(annot) == pdfium_c.FPDF_ANNOT_LINK: + fs_rect = pdfium_c.FS_RECTF() + success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect)) + if not success: + continue + + cx_start, cy_start, cx_end, cy_end = [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom] + + cx_start -= page_bbox[0] + cx_end -= page_bbox[0] + cy_start -= page_bbox[1] + cy_end -= page_bbox[1] + + ty_start = page_height - cy_start + ty_end = page_height - cy_end + + bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)] + url['bbox'] = Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox + + link_obj = pdfium_c.FPDFAnnot_GetLink(annot) + + action = pdfium_c.FPDFLink_GetAction(link_obj) + a_type = pdfium_c.FPDFAction_GetType(action) + + if a_type == pdfium_c.PDFACTION_UNSUPPORTED: + continue + + elif a_type == pdfium_c.PDFACTION_GOTO: + # Goto a page + dest = pdfium_c.FPDFAction_GetDest(self.doc, action) + if dest: + tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) + url['url'] = f"#page-{tgt_page}" + + # elif a_type == pdfium_c.PDFACTION_LAUNCH: + # # Typically opens a file/app + # path_len = pdfium_c.FPDFAction_GetFilePath(action, None, 0) + # if path_len > 0: + # buf = ctypes.create_string_buffer(path_len) + # pdfium_c.FPDFAction_GetFilePath(action, buf, path_len) + # filepath = buf.raw[:path_len].decode('utf-8', errors='replace').rstrip('\x00') + + elif a_type == pdfium_c.PDFACTION_URI: + # External link + needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0) + if needed_len > 0: + buf = ctypes.create_string_buffer(needed_len) + pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len) + uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00') + url["url"] = uri + + urls.append(url) + return urls diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 3089f4ce..2c5c30c3 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -19,9 +19,9 @@ class PageGroup(Group): lowres_image: Image.Image | None = None highres_image: Image.Image | None = None children: List[Union[Any, Block]] | None = None - layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong) + layout_sliced: bool = False # Whether the layout model had to slice the image (order may be wrong) excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,) - maximum_assignment_distance: float = 20 # pixels + maximum_assignment_distance: float = 20 # pixels def incr_block_id(self): if self.block_id is None: @@ -38,7 +38,7 @@ def add_child(self, block: Block): def get_next_block(self, block: Optional[Block] = None, ignored_block_types: Optional[List[BlockTypes]] = None): if ignored_block_types is None: ignored_block_types = [] - + structure_idx = 0 if block is not None: structure_idx = self.structure.index(block.id) + 1 @@ -78,7 +78,7 @@ def get_block(self, block_id: BlockId) -> Block | None: return block def assemble_html(self, child_blocks, parent_structure=None): - template = "" + template = f"" for c in child_blocks: template += f"" return template @@ -119,7 +119,6 @@ def replace_block(self, block: Block, new_block: Block): for child in self.children: child.replace_block(block, new_block) - def identify_missing_blocks( self, provider_line_idxs: List[int], @@ -134,7 +133,7 @@ def identify_missing_blocks( # if the unassociated line is a new line with minimal area, we can skip it if provider_outputs[line_idx].line.polygon.area <= 1 and \ - provider_outputs[line_idx].raw_text == "\n": + provider_outputs[line_idx].raw_text == "\n": continue if new_block is None: @@ -181,7 +180,6 @@ def create_missing_blocks( else: self.structure.append(block.id) - def add_initial_blocks( self, block_lines: Dict[BlockId, LINE_MAPPING_TYPE], @@ -202,7 +200,6 @@ def add_initial_blocks( self.add_full_block(span) line.add_structure(span) - def merge_blocks( self, provider_outputs: List[ProviderOutput], @@ -254,5 +251,3 @@ def aggregate_block_metadata(self) -> BlockMetadata: if block.metadata is not None: self.metadata = self.metadata.merge(block.metadata) return self.metadata - - diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index d066ccbe..1131df09 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -22,6 +22,7 @@ class Span(Block): minimum_position: int maximum_position: int formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']] + url: str = '' @property def bold(self): @@ -59,9 +60,11 @@ def assemble_html(self, child_blocks, parent_structure): text = cleanup_text(text) if self.italic: - return f"{text}" + text = f"{text}" elif self.bold: - return f"{text}" + text = f"{text}" elif self.math: - return f"{text}" + text = f"{text}" + elif self.url: + text = f"{text}" return text From 6f0166e51d0e300e66c6722aa17b1a996e732b89 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 10 Jan 2025 09:27:15 +0000 Subject: [PATCH 02/14] add support for refs and fix markdown conversion etc --- marker/providers/pdf.py | 251 +++++++++++++++++++++++++++-------- marker/renderers/markdown.py | 8 ++ marker/schema/groups/page.py | 2 +- marker/schema/text/span.py | 9 +- 4 files changed, 212 insertions(+), 58 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 32ee513e..47701574 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -2,8 +2,9 @@ import ctypes import math import re -from typing import Annotated, List, Optional, Set +from typing import Annotated, List, Optional, Set, Tuple +import numpy as np import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from ftfy import fix_text @@ -67,12 +68,17 @@ class PdfProvider(BaseProvider): bool, "Whether to strip existing OCR text from the PDF.", ] = False + disable_links: Annotated[ + bool, + "Whether to disable links.", + ] = False def __init__(self, filepath: str, config=None): super().__init__(filepath, config) self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath) self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))} + self.refs = {} if self.page_range is None: self.page_range = range(len(self.doc)) @@ -151,7 +157,7 @@ def pdftext_extraction(self) -> ProviderPageLines: page_char_blocks = dictionary_output( self.filepath, page_range=self.page_range, - keep_chars=False, + keep_chars=True, workers=self.pdftext_workers, flatten_pdf=self.flatten_pdf, quote_loosebox=False @@ -160,6 +166,14 @@ def pdftext_extraction(self) -> ProviderPageLines: SpanClass: Span = get_block_class(BlockTypes.Span) LineClass: Line = get_block_class(BlockTypes.Line) + for page in page_char_blocks: + if not self.disable_links: + self.merge_links(page) + + for page in page_char_blocks: + if not self.disable_links: + self.merge_refs(page) + for page in page_char_blocks: page_id = page["page"] lines: List[ProviderOutput] = [] @@ -188,7 +202,9 @@ def pdftext_extraction(self) -> ProviderPageLines: maximum_position=span["char_end_idx"], formats=list(font_formats), page_id=page_id, - text_extraction_method="pdftext" + text_extraction_method="pdftext", + url=span.get("url"), + anchor=span.get("anchor"), ) ) polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True) @@ -199,15 +215,17 @@ def pdftext_extraction(self) -> ProviderPageLines: ) ) if self.check_line_spans(lines): - self.merge_links(lines, page_id) page_lines[page_id] = lines + return page_lines - def merge_links(self, lines, page_id): + def merge_links(self, page): + page_id = page["page"] + links = self.get_links(page_id) - spans = [span for line in lines for span in line.spans] - span_bboxes = [span.polygon.bbox for span in spans] + spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']] + span_bboxes = [span['bbox'] for span in spans] link_bboxes = [link['bbox'] for link in links] intersection_matrix = matrix_intersection_area(span_bboxes, link_bboxes) max_intersections = {} @@ -224,10 +242,98 @@ def merge_links(self, lines, page_id): links[max_intersection] ) + span_replace_map = {} for span_idx, span in enumerate(spans): if span_idx in max_intersections: link = max_intersections[span_idx][1] - span.url = link['url'] + if link['dest_page'] is not None: + dest_page = link['dest_page'] + link['url'] = f"#page-{dest_page}" + self.refs.setdefault(dest_page, []) + if link['dest_bbox']: + dest_box = "-".join(map(str, link['dest_bbox'])) + else: + dest_box = "0.0-0.0-1.0-1.0" + if dest_box not in self.refs[dest_page]: + self.refs[dest_page].append(dest_box) + link['url'] += f"-{self.refs[dest_page].index(dest_box)}" + span_replace_map[span_idx] = self.break_spans(span, link) + span_idx += 1 + + span_idx = 0 + for block in page["blocks"]: + for line in block["lines"]: + spans = [] + for span in line["spans"]: + if not span["text"]: + continue + if span_idx in span_replace_map: + spans.extend(span_replace_map[span_idx]) + else: + spans.append(span) + span_idx += 1 + line['spans'] = spans + + def merge_refs(self, page): + page_id = page["page"] + + refs = self.refs.get(page_id, []) + if not refs: + return + + spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']] + + span_starts = np.array([span['bbox'][:2] for span in spans]) + ref_bboxes = np.array([list(map(float, ref.split("-"))) for ref in refs]) + ref_starts = np.array([bbox[:2] for bbox in ref_bboxes]) + + distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) + + assigned_refs = set() + for ref_idx, ref_center in enumerate(ref_starts): + if ref_idx in assigned_refs: + continue + + span_indices = np.argsort(distances[:, ref_idx]) + for span_idx in span_indices: + if spans[span_idx].get('anchor') is None: + spans[span_idx]['anchor'] = f"page-{page_id}-{ref_idx}" + assigned_refs.add(ref_idx) + break + + def break_spans(self, orig_span, link): + spans = [] + span = None + link_bbox = Bbox(link['bbox']) + + for char in orig_span['chars']: + char_bbox = Bbox(char['bbox']) + char_in_link = bool(link_bbox.intersection_pct(char_bbox) > 0) + + if not span or (char_in_link != span['char_in_link']): + span = { + "bbox": char_bbox, + "text": char["char"], + "rotation": char["rotation"], + "font": char["font"], + "char_start_idx": char["char_idx"], + "char_end_idx": char["char_idx"], + "chars": [char], + "url": link['url'] if char_in_link else '', + "char_in_link": char_in_link + } + spans.append(span) + else: + span['text'] += char['char'] + span['char_end_idx'] = char['char_idx'] + span['bbox'] = span['bbox'].merge(char_bbox) + span['chars'].append(char) + + for span in spans: + span['bbox'] = span['bbox'].bbox + del span['char_in_link'] + + return spans def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool: page_spans = [span for line in page_lines for span in line.spans] @@ -344,6 +450,44 @@ def get_fontname(self, font) -> str: return font_name + def get_dest_position(self, dest) -> Optional[Tuple[float, float]]: + has_x = ctypes.c_int() + has_y = ctypes.c_int() + has_zoom = ctypes.c_int() + x_coord = ctypes.c_float() + y_coord = ctypes.c_float() + zoom_level = ctypes.c_float() + success = pdfium_c.FPDFDest_GetLocationInPage( + dest, + ctypes.byref(has_x), + ctypes.byref(has_y), + ctypes.byref(has_zoom), + ctypes.byref(x_coord), + ctypes.byref(y_coord), + ctypes.byref(zoom_level) + ) + if success: + if has_x.value and has_y.value: + return x_coord.value, y_coord.value + else: + return None + + def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rotation) -> List[float]: + cx_start, cy_start, cx_end, cy_end = rect + cx_start -= page_bbox[0] + cx_end -= page_bbox[0] + cy_start -= page_bbox[1] + cy_end -= page_bbox[1] + + ty_start = page_height - cy_start + ty_end = page_height - cy_end + + bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)] + return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox + + def xy_to_scaled_bbox(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]: + return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation) + def get_links(self, page_idx): urls = [] page = self.doc[page_idx] @@ -358,10 +502,12 @@ def get_links(self, page_idx): annot_count = pdfium_c.FPDFPage_GetAnnotCount(page) for i in range(annot_count): - url = { - 'bbox': [], - 'url': '', + link = { + 'bbox': None, 'page': page_idx, + 'dest_page': None, + 'dest_bbox': None, + 'url': None, } annot = pdfium_c.FPDFPage_GetAnnot(page, i) if pdfium_c.FPDFAnnot_GetSubtype(annot) == pdfium_c.FPDF_ANNOT_LINK: @@ -369,51 +515,46 @@ def get_links(self, page_idx): success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect)) if not success: continue - - cx_start, cy_start, cx_end, cy_end = [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom] - - cx_start -= page_bbox[0] - cx_end -= page_bbox[0] - cy_start -= page_bbox[1] - cy_end -= page_bbox[1] - - ty_start = page_height - cy_start - ty_end = page_height - cy_end - - bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)] - url['bbox'] = Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox + link['bbox'] = self.rect_to_scaled_bbox( + [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom], + page_bbox, page_height, page_width, page_rotation + ) link_obj = pdfium_c.FPDFAnnot_GetLink(annot) - action = pdfium_c.FPDFLink_GetAction(link_obj) - a_type = pdfium_c.FPDFAction_GetType(action) - - if a_type == pdfium_c.PDFACTION_UNSUPPORTED: - continue - - elif a_type == pdfium_c.PDFACTION_GOTO: - # Goto a page - dest = pdfium_c.FPDFAction_GetDest(self.doc, action) - if dest: - tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) - url['url'] = f"#page-{tgt_page}" - - # elif a_type == pdfium_c.PDFACTION_LAUNCH: - # # Typically opens a file/app - # path_len = pdfium_c.FPDFAction_GetFilePath(action, None, 0) - # if path_len > 0: - # buf = ctypes.create_string_buffer(path_len) - # pdfium_c.FPDFAction_GetFilePath(action, buf, path_len) - # filepath = buf.raw[:path_len].decode('utf-8', errors='replace').rstrip('\x00') - - elif a_type == pdfium_c.PDFACTION_URI: - # External link - needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0) - if needed_len > 0: - buf = ctypes.create_string_buffer(needed_len) - pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len) - uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00') - url["url"] = uri - - urls.append(url) + dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj) + if dest: + tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) + link['dest_page'] = tgt_page + dest_position = self.get_dest_position(dest) + if dest_position: + link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation) + + else: + action = pdfium_c.FPDFLink_GetAction(link_obj) + a_type = pdfium_c.FPDFAction_GetType(action) + + if a_type == pdfium_c.PDFACTION_UNSUPPORTED: + continue + + elif a_type == pdfium_c.PDFACTION_GOTO: + # Goto a page + dest = pdfium_c.FPDFAction_GetDest(self.doc, action) + if dest: + tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) + link['dest_page'] = tgt_page + dest_position = self.get_dest_position(dest) + if dest_position: + link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation) + + elif a_type == pdfium_c.PDFACTION_URI: + # External link + needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0) + if needed_len > 0: + buf = ctypes.create_string_buffer(needed_len) + pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len) + uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00') + link["url"] = uri + + urls.append(link) return urls diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 46bb62ba..dd386a33 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -61,6 +61,14 @@ def convert_th(self, el, text, convert_as_inline): text = text.replace("|", " ").replace("\n", " ") return super().convert_th(el, text, convert_as_inline) + def convert_a(self, el, text, convert_as_inline): + text = self.escape(text) + text = re.sub(r"([\[\]])", r"\\\1", text) + return super().convert_a(el, self.escape(text), convert_as_inline) + + def convert_span(self, el, text, convert_as_inline): + return str(el) + class MarkdownOutput(BaseModel): markdown: str diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 2c5c30c3..00282899 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -78,7 +78,7 @@ def get_block(self, block_id: BlockId) -> Block | None: return block def assemble_html(self, child_blocks, parent_structure=None): - template = f"" + template = "" for c in child_blocks: template += f"" return template diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index 1131df09..66015e6d 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -1,6 +1,6 @@ import html import re -from typing import List, Literal +from typing import List, Literal, Optional from marker.schema import BlockTypes from marker.schema.blocks import Block @@ -22,7 +22,8 @@ class Span(Block): minimum_position: int maximum_position: int formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']] - url: str = '' + url: Optional[str] = None + anchor: Optional[str] = None @property def bold(self): @@ -65,6 +66,10 @@ def assemble_html(self, child_blocks, parent_structure): text = f"{text}" elif self.math: text = f"{text}" + elif self.url and self.anchor: + text = f"{text}" elif self.url: text = f"{text}" + elif self.anchor: + text = f"{text}" return text From 602cad14e9845d16ee4f313feeef882ececf8d92 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 10 Jan 2025 16:03:36 +0000 Subject: [PATCH 03/14] support multiple anchors per span and put the anchors at the beginning --- marker/providers/pdf.py | 38 ++++++++++++++++++------------------ marker/renderers/markdown.py | 2 +- marker/schema/text/span.py | 9 ++++----- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 47701574..5b73f9d0 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -204,7 +204,7 @@ def pdftext_extraction(self) -> ProviderPageLines: page_id=page_id, text_extraction_method="pdftext", url=span.get("url"), - anchor=span.get("anchor"), + anchors=span.get("anchors"), ) ) polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True) @@ -248,15 +248,15 @@ def merge_links(self, page): link = max_intersections[span_idx][1] if link['dest_page'] is not None: dest_page = link['dest_page'] - link['url'] = f"#page-{dest_page}" self.refs.setdefault(dest_page, []) - if link['dest_bbox']: - dest_box = "-".join(map(str, link['dest_bbox'])) + link['url'] = f"#page-{dest_page}" + if link['dest_pos']: + dest_pos = link['dest_pos'] else: - dest_box = "0.0-0.0-1.0-1.0" - if dest_box not in self.refs[dest_page]: - self.refs[dest_page].append(dest_box) - link['url'] += f"-{self.refs[dest_page].index(dest_box)}" + dest_pos = [0.0, 0.0] + if dest_pos not in self.refs[dest_page]: + self.refs[dest_page].append(dest_pos) + link['url'] += f"-{self.refs[dest_page].index(dest_pos)}" span_replace_map[span_idx] = self.break_spans(span, link) span_idx += 1 @@ -284,8 +284,8 @@ def merge_refs(self, page): spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']] span_starts = np.array([span['bbox'][:2] for span in spans]) - ref_bboxes = np.array([list(map(float, ref.split("-"))) for ref in refs]) - ref_starts = np.array([bbox[:2] for bbox in ref_bboxes]) + ref_pos = np.array([ref for ref in refs]) + ref_starts = np.array([pos for pos in ref_pos]) distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) @@ -296,10 +296,10 @@ def merge_refs(self, page): span_indices = np.argsort(distances[:, ref_idx]) for span_idx in span_indices: - if spans[span_idx].get('anchor') is None: - spans[span_idx]['anchor'] = f"page-{page_id}-{ref_idx}" - assigned_refs.add(ref_idx) - break + spans[span_idx].setdefault('anchors', []) + spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}") + assigned_refs.add(ref_idx) + break def break_spans(self, orig_span, link): spans = [] @@ -485,8 +485,8 @@ def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rot bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)] return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox - def xy_to_scaled_bbox(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]: - return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation) + def xy_to_scaled_pos(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]: + return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation)[:2] def get_links(self, page_idx): urls = [] @@ -506,7 +506,7 @@ def get_links(self, page_idx): 'bbox': None, 'page': page_idx, 'dest_page': None, - 'dest_bbox': None, + 'dest_pos': None, 'url': None, } annot = pdfium_c.FPDFPage_GetAnnot(page, i) @@ -528,7 +528,7 @@ def get_links(self, page_idx): link['dest_page'] = tgt_page dest_position = self.get_dest_position(dest) if dest_position: - link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation) + link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation) else: action = pdfium_c.FPDFLink_GetAction(link_obj) @@ -545,7 +545,7 @@ def get_links(self, page_idx): link['dest_page'] = tgt_page dest_position = self.get_dest_position(dest) if dest_position: - link['dest_bbox'] = self.xy_to_scaled_bbox(*dest_position, page_bbox, page_height, page_width, page_rotation) + link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation) elif a_type == pdfium_c.PDFACTION_URI: # External link diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index dd386a33..65d0a27d 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -67,7 +67,7 @@ def convert_a(self, el, text, convert_as_inline): return super().convert_a(el, self.escape(text), convert_as_inline) def convert_span(self, el, text, convert_as_inline): - return str(el) + return f'' class MarkdownOutput(BaseModel): diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index 66015e6d..86fe24bc 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -23,7 +23,7 @@ class Span(Block): maximum_position: int formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']] url: Optional[str] = None - anchor: Optional[str] = None + anchors: Optional[List[str]] = None @property def bold(self): @@ -66,10 +66,9 @@ def assemble_html(self, child_blocks, parent_structure): text = f"{text}" elif self.math: text = f"{text}" - elif self.url and self.anchor: - text = f"{text}" elif self.url: text = f"{text}" - elif self.anchor: - text = f"{text}" + + if self.anchors: + text = "".join(f"" for anchor in self.anchors) + text return text From 2e48bd8813f2b9e1276ed8efc4bb18efbb36cbb3 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 10 Jan 2025 19:20:10 +0000 Subject: [PATCH 04/14] fix bugs and remove defaults --- marker/config/printer.py | 1 - marker/providers/pdf.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/marker/config/printer.py b/marker/config/printer.py index e9b37b42..1f4991f8 100644 --- a/marker/config/printer.py +++ b/marker/config/printer.py @@ -44,7 +44,6 @@ def parse_args(self, ctx, args): options, type=attr_type, help=" ".join(metadata), - default=default, is_flag=is_flag, ) ) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 5b73f9d0..ba8dbdb6 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -282,6 +282,8 @@ def merge_refs(self, page): return spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']] + if not spans: + return span_starts = np.array([span['bbox'][:2] for span in spans]) ref_pos = np.array([ref for ref in refs]) From 0a0a7f2bcd184eaa786aa5d55989ade878c07a5a Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Mon, 13 Jan 2025 07:51:35 +0000 Subject: [PATCH 05/14] bugfix bbox --- marker/providers/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index ba8dbdb6..5b1b340f 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -484,7 +484,7 @@ def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rot ty_start = page_height - cy_start ty_end = page_height - cy_end - bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)] + bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)] return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox def xy_to_scaled_pos(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]: From baf7a3ac055a915bf7d2054ba9ce3669f4144960 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Mon, 13 Jan 2025 11:11:18 +0000 Subject: [PATCH 06/14] fix multiple links associated with a single span --- marker/providers/pdf.py | 80 ++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 5b1b340f..64a4aaf9 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -224,51 +224,42 @@ def merge_links(self, page): links = self.get_links(page_id) - spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']] + spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']] span_bboxes = [span['bbox'] for span in spans] link_bboxes = [link['bbox'] for link in links] - intersection_matrix = matrix_intersection_area(span_bboxes, link_bboxes) - max_intersections = {} + intersection_matrix = matrix_intersection_area(link_bboxes, span_bboxes) - for span_idx, span in enumerate(spans): - intersection_span = intersection_matrix[span_idx] - if intersection_span.sum() == 0: + span_link_map = {} + for link_idx, link in enumerate(links): + intersection_link = intersection_matrix[link_idx] + if intersection_link.sum() == 0: continue - max_intersection = intersection_span.argmax() - if intersection_matrix[span_idx, max_intersection] > 0: - max_intersections[span_idx] = ( - intersection_matrix[span_idx, max_intersection], - links[max_intersection] - ) + max_intersection = intersection_link.argmax() + span = spans[max_intersection] - span_replace_map = {} - for span_idx, span in enumerate(spans): - if span_idx in max_intersections: - link = max_intersections[span_idx][1] - if link['dest_page'] is not None: - dest_page = link['dest_page'] - self.refs.setdefault(dest_page, []) - link['url'] = f"#page-{dest_page}" - if link['dest_pos']: - dest_pos = link['dest_pos'] - else: - dest_pos = [0.0, 0.0] - if dest_pos not in self.refs[dest_page]: - self.refs[dest_page].append(dest_pos) - link['url'] += f"-{self.refs[dest_page].index(dest_pos)}" - span_replace_map[span_idx] = self.break_spans(span, link) - span_idx += 1 + if link['dest_page'] is not None: + dest_page = link['dest_page'] + self.refs.setdefault(dest_page, []) + link['url'] = f"#page-{dest_page}" + if link['dest_pos']: + dest_pos = link['dest_pos'] + else: + dest_pos = [0.0, 0.0] + if dest_pos not in self.refs[dest_page]: + self.refs[dest_page].append(dest_pos) + link['url'] += f"-{self.refs[dest_page].index(dest_pos)}" + + span_link_map.setdefault(max_intersection, []) + span_link_map[max_intersection].append(link) span_idx = 0 for block in page["blocks"]: for line in block["lines"]: spans = [] for span in line["spans"]: - if not span["text"]: - continue - if span_idx in span_replace_map: - spans.extend(span_replace_map[span_idx]) + if span_idx in span_link_map: + spans.extend(self.break_spans(span, span_link_map[span_idx])) else: spans.append(span) span_idx += 1 @@ -303,16 +294,25 @@ def merge_refs(self, page): assigned_refs.add(ref_idx) break - def break_spans(self, orig_span, link): + def break_spans(self, orig_span, links): spans = [] span = None - link_bbox = Bbox(link['bbox']) + link_bboxes = [Bbox(link['bbox']) for link in links] for char in orig_span['chars']: char_bbox = Bbox(char['bbox']) - char_in_link = bool(link_bbox.intersection_pct(char_bbox) > 0) - - if not span or (char_in_link != span['char_in_link']): + intersections = [] + for i, link_bbox in enumerate(link_bboxes): + area = link_bbox.intersection_area(char_bbox) + if area > 0: + intersections.append((area, links[i])) + + current_url = '' + if intersections: + intersections.sort(key=lambda x: x[0], reverse=True) + current_url = intersections[0][1]['url'] + + if not span or current_url != span['url']: span = { "bbox": char_bbox, "text": char["char"], @@ -321,8 +321,7 @@ def break_spans(self, orig_span, link): "char_start_idx": char["char_idx"], "char_end_idx": char["char_idx"], "chars": [char], - "url": link['url'] if char_in_link else '', - "char_in_link": char_in_link + "url": current_url } spans.append(span) else: @@ -333,7 +332,6 @@ def break_spans(self, orig_span, link): for span in spans: span['bbox'] = span['bbox'].bbox - del span['char_in_link'] return spans From 646769b34a70ce7c17a88b7177bfb46e6c79611c Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Mon, 13 Jan 2025 17:35:14 +0000 Subject: [PATCH 07/14] fix footnotes [skip ci] --- marker/processors/footnote.py | 12 ++++++++++++ marker/schema/blocks/footnote.py | 16 ---------------- marker/schema/text/span.py | 9 +++++++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/marker/processors/footnote.py b/marker/processors/footnote.py index fdf31721..41f350a2 100644 --- a/marker/processors/footnote.py +++ b/marker/processors/footnote.py @@ -1,3 +1,5 @@ +import re + from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document @@ -13,6 +15,7 @@ class FootnoteProcessor(BaseProcessor): def __call__(self, document: Document): for page in document.pages: self.push_footnotes_to_bottom(page, document) + self.assign_superscripts(page, document) def push_footnotes_to_bottom(self, page: PageGroup, document: Document): footnote_blocks = page.contained_blocks(document, self.block_types) @@ -24,3 +27,12 @@ def push_footnotes_to_bottom(self, page: PageGroup, document: Document): # Move to bottom if it is page.structure.remove(block.id) page.add_structure(block) + + def assign_superscripts(self, page: PageGroup, document: Document): + footnote_blocks = page.contained_blocks(document, self.block_types) + + for block in footnote_blocks: + for span in block.contained_blocks(document, (BlockTypes.Span,)): + if re.match(r"^[0-9\W]+", span.text): + span.has_superscript = True + break diff --git a/marker/schema/blocks/footnote.py b/marker/schema/blocks/footnote.py index c476aa7d..71d3580b 100644 --- a/marker/schema/blocks/footnote.py +++ b/marker/schema/blocks/footnote.py @@ -1,21 +1,7 @@ -import re - from marker.schema import BlockTypes from marker.schema.blocks import Block -def superscript(child_blocks): - # Superscript leading symbol or digit sequence - first_block = None - while len(child_blocks) > 0: - first_block = child_blocks[0] - child_blocks = first_block.children - - if first_block is not None and first_block.id.block_type == BlockTypes.Line: - digit_start = r"^([0-9\W]+)(.*)" - first_block.html = re.sub(digit_start, r"\1\2", first_block.html.lstrip()) - - class Footnote(Block): block_type: BlockTypes = BlockTypes.Footnote @@ -23,6 +9,4 @@ def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) template = template.replace("\n", " ") - # Add superscripts to start - superscript(child_blocks) return f"

{template}

" diff --git a/marker/schema/text/span.py b/marker/schema/text/span.py index 86fe24bc..1b6e18f2 100644 --- a/marker/schema/text/span.py +++ b/marker/schema/text/span.py @@ -22,6 +22,7 @@ class Span(Block): minimum_position: int maximum_position: int formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']] + has_superscript: bool = False url: Optional[str] = None anchors: Optional[List[str]] = None @@ -60,14 +61,18 @@ def assemble_html(self, child_blocks, parent_structure): text = html.escape(text) text = cleanup_text(text) + if self.has_superscript: + text = re.sub(r"^([0-9\W]+)(.*)", r"\1\2", text) + + if self.url: + text = f"{text}" + if self.italic: text = f"{text}" elif self.bold: text = f"{text}" elif self.math: text = f"{text}" - elif self.url: - text = f"{text}" if self.anchors: text = "".join(f"" for anchor in self.anchors) + text From cb2c26f5ac4f0ebf49085275570e3008eb3852d6 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Mon, 13 Jan 2025 18:45:53 +0000 Subject: [PATCH 08/14] more cleanup [skip ci] --- marker/providers/pdf.py | 49 ++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 64a4aaf9..cc31c100 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -238,17 +238,24 @@ def merge_links(self, page): max_intersection = intersection_link.argmax() span = spans[max_intersection] - if link['dest_page'] is not None: - dest_page = link['dest_page'] - self.refs.setdefault(dest_page, []) - link['url'] = f"#page-{dest_page}" - if link['dest_pos']: - dest_pos = link['dest_pos'] - else: - dest_pos = [0.0, 0.0] - if dest_pos not in self.refs[dest_page]: - self.refs[dest_page].append(dest_pos) - link['url'] += f"-{self.refs[dest_page].index(dest_pos)}" + if link['dest_page'] is None: + continue + + dest_page = link['dest_page'] + self.refs.setdefault(dest_page, []) + link['url'] = f"#page-{dest_page}" + if link['dest_pos']: + dest_pos = link['dest_pos'] + else: + # Don't link to self if there is no dest_pos + if dest_page == page_id: + continue + dest_pos = [0.0, 0.0] + + if dest_pos not in self.refs[dest_page]: + self.refs[dest_page].append(dest_pos) + + link['url'] += f"-{self.refs[dest_page].index(dest_pos)}" span_link_map.setdefault(max_intersection, []) span_link_map[max_intersection].append(link) @@ -272,27 +279,19 @@ def merge_refs(self, page): if not refs: return - spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans'] if span['text']] + spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']] if not spans: return span_starts = np.array([span['bbox'][:2] for span in spans]) - ref_pos = np.array([ref for ref in refs]) - ref_starts = np.array([pos for pos in ref_pos]) + ref_starts = np.array(refs) distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) - assigned_refs = set() - for ref_idx, ref_center in enumerate(ref_starts): - if ref_idx in assigned_refs: - continue - - span_indices = np.argsort(distances[:, ref_idx]) - for span_idx in span_indices: - spans[span_idx].setdefault('anchors', []) - spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}") - assigned_refs.add(ref_idx) - break + for ref_idx in range(len(ref_starts)): + span_idx = np.argmin(distances[:, ref_idx]) + spans[span_idx].setdefault('anchors', []) + spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}") def break_spans(self, orig_span, links): spans = [] From 18eea949cbe8a774e93492e96fa094bda20dd9dd Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 14 Jan 2025 09:14:00 +0000 Subject: [PATCH 09/14] cleanup [skip ci] --- marker/providers/pdf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index cc31c100..dba6785e 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -166,12 +166,12 @@ def pdftext_extraction(self) -> ProviderPageLines: SpanClass: Span = get_block_class(BlockTypes.Span) LineClass: Line = get_block_class(BlockTypes.Line) - for page in page_char_blocks: - if not self.disable_links: + + if not self.disable_links: + for page in page_char_blocks: self.merge_links(page) - for page in page_char_blocks: - if not self.disable_links: + for page in page_char_blocks: self.merge_refs(page) for page in page_char_blocks: From 152727bb5e18c3df83aa0e334c97a84cd2057198 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 14 Jan 2025 10:14:52 +0000 Subject: [PATCH 10/14] more minor cleanup [skip ci] --- marker/providers/pdf.py | 131 +++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index dba6785e..d9d344e4 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -220,6 +220,10 @@ def pdftext_extraction(self) -> ProviderPageLines: return page_lines def merge_links(self, page): + """ + Merges links with spans. Some spans can also have multiple links associated with them. + We break up the spans and reconstruct them taking the links into account. + """ page_id = page["page"] links = self.get_links(page_id) @@ -266,13 +270,17 @@ def merge_links(self, page): spans = [] for span in line["spans"]: if span_idx in span_link_map: - spans.extend(self.break_spans(span, span_link_map[span_idx])) + spans.extend(self._reconstruct_spans(span, span_link_map[span_idx])) else: spans.append(span) span_idx += 1 line['spans'] = spans def merge_refs(self, page): + """ + We associate each reference to the nearest span. + """ + page_id = page["page"] refs = self.refs.get(page_id, []) @@ -293,7 +301,10 @@ def merge_refs(self, page): spans[span_idx].setdefault('anchors', []) spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}") - def break_spans(self, orig_span, links): + def _reconstruct_spans(self, orig_span: dict, links: List[dict]): + """ + Reconstructs the spans by breaking them up into smaller spans based on the links. + """ spans = [] span = None link_bboxes = [Bbox(link['bbox']) for link in links] @@ -369,7 +380,7 @@ def check_page(self, page_id: int) -> bool: font_map = {} for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs): font = pdfium_c.FPDFTextObj_GetFont(text_obj) - font_name = self.get_fontname(font) + font_name = self._get_fontname(font) # we also skip pages without embedded fonts and fonts without names non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0) @@ -431,7 +442,8 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None: def get_page_lines(self, idx: int) -> List[ProviderOutput]: return self.page_lines[idx] - def get_fontname(self, font) -> str: + @staticmethod + def _get_fontname(font) -> str: font_name = "" buffer_size = 256 @@ -449,7 +461,8 @@ def get_fontname(self, font) -> str: return font_name - def get_dest_position(self, dest) -> Optional[Tuple[float, float]]: + @staticmethod + def _get_dest_position(dest) -> Optional[Tuple[float, float]]: has_x = ctypes.c_int() has_y = ctypes.c_int() has_zoom = ctypes.c_int() @@ -471,7 +484,11 @@ def get_dest_position(self, dest) -> Optional[Tuple[float, float]]: else: return None - def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rotation) -> List[float]: + @staticmethod + def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]: + page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) + page_height = math.ceil(abs(page_bbox[1] - page_bbox[3])) + cx_start, cy_start, cx_end, cy_end = rect cx_start -= page_bbox[0] cx_end -= page_bbox[0] @@ -484,15 +501,14 @@ def rect_to_scaled_bbox(self, rect, page_bbox, page_height, page_width, page_rot bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)] return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox - def xy_to_scaled_pos(self, x, y, page_bbox, page_height, page_width, page_rotation, expand_by=1) -> List[float]: - return self.rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_height, page_width, page_rotation)[:2] + @staticmethod + def _xy_to_scaled_pos(x, y, page_bbox, page_rotation, expand_by=1) -> List[float]: + return PdfProvider._rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_rotation)[:2] def get_links(self, page_idx): urls = [] page = self.doc[page_idx] page_bbox: List[float] = page.get_bbox() - page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) - page_height = math.ceil(abs(page_bbox[1] - page_bbox[3])) page_rotation = 0 try: page_rotation = page.get_rotation() @@ -509,51 +525,54 @@ def get_links(self, page_idx): 'url': None, } annot = pdfium_c.FPDFPage_GetAnnot(page, i) - if pdfium_c.FPDFAnnot_GetSubtype(annot) == pdfium_c.FPDF_ANNOT_LINK: - fs_rect = pdfium_c.FS_RECTF() - success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect)) - if not success: + if pdfium_c.FPDFAnnot_GetSubtype(annot) != pdfium_c.FPDF_ANNOT_LINK: + continue + + fs_rect = pdfium_c.FS_RECTF() + success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect)) + if not success: + continue + + link['bbox'] = self._rect_to_scaled_bbox( + [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom], + page_bbox, page_rotation + ) + + link_obj = pdfium_c.FPDFAnnot_GetLink(annot) + + dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj) + if dest: + tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) + link['dest_page'] = tgt_page + dest_position = self._get_dest_position(dest) + if dest_position: + link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation) + + else: + action = pdfium_c.FPDFLink_GetAction(link_obj) + a_type = pdfium_c.FPDFAction_GetType(action) + + if a_type == pdfium_c.PDFACTION_UNSUPPORTED: continue - link['bbox'] = self.rect_to_scaled_bbox( - [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom], - page_bbox, page_height, page_width, page_rotation - ) - - link_obj = pdfium_c.FPDFAnnot_GetLink(annot) - - dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj) - if dest: - tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) - link['dest_page'] = tgt_page - dest_position = self.get_dest_position(dest) - if dest_position: - link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation) - - else: - action = pdfium_c.FPDFLink_GetAction(link_obj) - a_type = pdfium_c.FPDFAction_GetType(action) - - if a_type == pdfium_c.PDFACTION_UNSUPPORTED: - continue - - elif a_type == pdfium_c.PDFACTION_GOTO: - # Goto a page - dest = pdfium_c.FPDFAction_GetDest(self.doc, action) - if dest: - tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) - link['dest_page'] = tgt_page - dest_position = self.get_dest_position(dest) - if dest_position: - link['dest_pos'] = self.xy_to_scaled_pos(*dest_position, page_bbox, page_height, page_width, page_rotation) - - elif a_type == pdfium_c.PDFACTION_URI: - # External link - needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0) - if needed_len > 0: - buf = ctypes.create_string_buffer(needed_len) - pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len) - uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00') - link["url"] = uri - - urls.append(link) + + elif a_type == pdfium_c.PDFACTION_GOTO: + # Goto a page + dest = pdfium_c.FPDFAction_GetDest(self.doc, action) + if dest: + tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) + link['dest_page'] = tgt_page + dest_position = self._get_dest_position(dest) + if dest_position: + link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation) + + elif a_type == pdfium_c.PDFACTION_URI: + # External link + needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0) + if needed_len > 0: + buf = ctypes.create_string_buffer(needed_len) + pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len) + uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00') + link["url"] = uri + + urls.append(link) return urls From 5ac8b0db8e76009623ff9d2935c84a8831f86f5e Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 14 Jan 2025 12:31:53 +0000 Subject: [PATCH 11/14] add test for pdf link and reference --- tests/builders/test_pdf_links.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/builders/test_pdf_links.py diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py new file mode 100644 index 00000000..00bff7ee --- /dev/null +++ b/tests/builders/test_pdf_links.py @@ -0,0 +1,32 @@ +import pytest + +from marker.converters.pdf import PdfConverter +from marker.renderers.markdown import MarkdownOutput +from marker.schema import BlockTypes +from marker.schema.document import Document + + +@pytest.mark.filename("arxiv_test.pdf") +@pytest.mark.output_format("markdown") +@pytest.mark.config({"page_range": [1]}) +def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf): + first_page = pdf_document.pages[0] + + for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)): + if section_header_span.text == " II.": + assert section_header_span.url == "#page-1-0" + break + else: + raise ValueError("Could not find II. in the first page") + + section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0] + assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n' + + section_header_span = section_header_block.contained_blocks(pdf_document, (BlockTypes.Span,))[0] + assert section_header_span.anchors == ['page-1-0'] + + markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name) + markdown = markdown_output.markdown + + assert '[II.](#page-1-0)' in markdown + assert 'II. THEORETICAL FRAMEWORK' in markdown From 2049068e8e2cc5a8a03f3d6b30773ee1bef764d7 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 14 Jan 2025 12:39:32 +0000 Subject: [PATCH 12/14] more cleanup --- marker/providers/pdf.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index d9d344e4..ab441c5f 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -78,7 +78,6 @@ def __init__(self, filepath: str, config=None): self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath) self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))} - self.refs = {} if self.page_range is None: self.page_range = range(len(self.doc)) @@ -168,11 +167,15 @@ def pdftext_extraction(self) -> ProviderPageLines: LineClass: Line = get_block_class(BlockTypes.Line) if not self.disable_links: + refs = {} + + # we first go through the entire document and merge links and collect refs for page in page_char_blocks: - self.merge_links(page) + self.merge_links(page, refs) + # we can now merge the collected refs for each page for page in page_char_blocks: - self.merge_refs(page) + self.merge_refs(page, refs) for page in page_char_blocks: page_id = page["page"] @@ -219,7 +222,7 @@ def pdftext_extraction(self) -> ProviderPageLines: return page_lines - def merge_links(self, page): + def merge_links(self, page, refs): """ Merges links with spans. Some spans can also have multiple links associated with them. We break up the spans and reconstruct them taking the links into account. @@ -246,7 +249,7 @@ def merge_links(self, page): continue dest_page = link['dest_page'] - self.refs.setdefault(dest_page, []) + refs.setdefault(dest_page, []) link['url'] = f"#page-{dest_page}" if link['dest_pos']: dest_pos = link['dest_pos'] @@ -254,12 +257,13 @@ def merge_links(self, page): # Don't link to self if there is no dest_pos if dest_page == page_id: continue + # if we don't have a dest pos, we just link to the top of the page dest_pos = [0.0, 0.0] - if dest_pos not in self.refs[dest_page]: - self.refs[dest_page].append(dest_pos) + if dest_pos not in refs[dest_page]: + refs[dest_page].append(dest_pos) - link['url'] += f"-{self.refs[dest_page].index(dest_pos)}" + link['url'] += f"-{refs[dest_page].index(dest_pos)}" span_link_map.setdefault(max_intersection, []) span_link_map[max_intersection].append(link) @@ -276,15 +280,15 @@ def merge_links(self, page): span_idx += 1 line['spans'] = spans - def merge_refs(self, page): + def merge_refs(self, page, refs): """ We associate each reference to the nearest span. """ page_id = page["page"] - refs = self.refs.get(page_id, []) - if not refs: + page_refs = refs.get(page_id, []) + if not page_refs: return spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']] @@ -292,7 +296,7 @@ def merge_refs(self, page): return span_starts = np.array([span['bbox'][:2] for span in spans]) - ref_starts = np.array(refs) + ref_starts = np.array(page_refs) distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) From 0dc0e948ef6236a4b43e7cda2db812691560eb72 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 14 Jan 2025 17:06:31 +0000 Subject: [PATCH 13/14] move link code to pdftext --- marker/providers/pdf.py | 263 +------------------------------ tests/builders/test_pdf_links.py | 2 +- 2 files changed, 4 insertions(+), 261 deletions(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index ab441c5f..8232e742 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -1,15 +1,12 @@ import atexit import ctypes -import math import re -from typing import Annotated, List, Optional, Set, Tuple +from typing import Annotated, List, Optional, Set -import numpy as np import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from ftfy import fix_text from pdftext.extraction import dictionary_output -from pdftext.schema import Bbox from PIL import Image from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines @@ -19,7 +16,6 @@ from marker.schema.registry import get_block_class from marker.schema.text.line import Line from marker.schema.text.span import Span -from marker.util import matrix_intersection_area class PdfProvider(BaseProvider): @@ -159,24 +155,14 @@ def pdftext_extraction(self) -> ProviderPageLines: keep_chars=True, workers=self.pdftext_workers, flatten_pdf=self.flatten_pdf, - quote_loosebox=False + quote_loosebox=False, + disable_links=self.disable_links ) self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)} SpanClass: Span = get_block_class(BlockTypes.Span) LineClass: Line = get_block_class(BlockTypes.Line) - if not self.disable_links: - refs = {} - - # we first go through the entire document and merge links and collect refs - for page in page_char_blocks: - self.merge_links(page, refs) - - # we can now merge the collected refs for each page - for page in page_char_blocks: - self.merge_refs(page, refs) - for page in page_char_blocks: page_id = page["page"] lines: List[ProviderOutput] = [] @@ -222,133 +208,6 @@ def pdftext_extraction(self) -> ProviderPageLines: return page_lines - def merge_links(self, page, refs): - """ - Merges links with spans. Some spans can also have multiple links associated with them. - We break up the spans and reconstruct them taking the links into account. - """ - page_id = page["page"] - - links = self.get_links(page_id) - - spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']] - span_bboxes = [span['bbox'] for span in spans] - link_bboxes = [link['bbox'] for link in links] - intersection_matrix = matrix_intersection_area(link_bboxes, span_bboxes) - - span_link_map = {} - for link_idx, link in enumerate(links): - intersection_link = intersection_matrix[link_idx] - if intersection_link.sum() == 0: - continue - - max_intersection = intersection_link.argmax() - span = spans[max_intersection] - - if link['dest_page'] is None: - continue - - dest_page = link['dest_page'] - refs.setdefault(dest_page, []) - link['url'] = f"#page-{dest_page}" - if link['dest_pos']: - dest_pos = link['dest_pos'] - else: - # Don't link to self if there is no dest_pos - if dest_page == page_id: - continue - # if we don't have a dest pos, we just link to the top of the page - dest_pos = [0.0, 0.0] - - if dest_pos not in refs[dest_page]: - refs[dest_page].append(dest_pos) - - link['url'] += f"-{refs[dest_page].index(dest_pos)}" - - span_link_map.setdefault(max_intersection, []) - span_link_map[max_intersection].append(link) - - span_idx = 0 - for block in page["blocks"]: - for line in block["lines"]: - spans = [] - for span in line["spans"]: - if span_idx in span_link_map: - spans.extend(self._reconstruct_spans(span, span_link_map[span_idx])) - else: - spans.append(span) - span_idx += 1 - line['spans'] = spans - - def merge_refs(self, page, refs): - """ - We associate each reference to the nearest span. - """ - - page_id = page["page"] - - page_refs = refs.get(page_id, []) - if not page_refs: - return - - spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']] - if not spans: - return - - span_starts = np.array([span['bbox'][:2] for span in spans]) - ref_starts = np.array(page_refs) - - distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) - - for ref_idx in range(len(ref_starts)): - span_idx = np.argmin(distances[:, ref_idx]) - spans[span_idx].setdefault('anchors', []) - spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}") - - def _reconstruct_spans(self, orig_span: dict, links: List[dict]): - """ - Reconstructs the spans by breaking them up into smaller spans based on the links. - """ - spans = [] - span = None - link_bboxes = [Bbox(link['bbox']) for link in links] - - for char in orig_span['chars']: - char_bbox = Bbox(char['bbox']) - intersections = [] - for i, link_bbox in enumerate(link_bboxes): - area = link_bbox.intersection_area(char_bbox) - if area > 0: - intersections.append((area, links[i])) - - current_url = '' - if intersections: - intersections.sort(key=lambda x: x[0], reverse=True) - current_url = intersections[0][1]['url'] - - if not span or current_url != span['url']: - span = { - "bbox": char_bbox, - "text": char["char"], - "rotation": char["rotation"], - "font": char["font"], - "char_start_idx": char["char_idx"], - "char_end_idx": char["char_idx"], - "chars": [char], - "url": current_url - } - spans.append(span) - else: - span['text'] += char['char'] - span['char_end_idx'] = char['char_idx'] - span['bbox'] = span['bbox'].merge(char_bbox) - span['chars'].append(char) - - for span in spans: - span['bbox'] = span['bbox'].bbox - - return spans - def check_line_spans(self, page_lines: List[ProviderOutput]) -> bool: page_spans = [span for line in page_lines for span in line.spans] if len(page_spans) == 0: @@ -464,119 +323,3 @@ def _get_fontname(font) -> str: pass return font_name - - @staticmethod - def _get_dest_position(dest) -> Optional[Tuple[float, float]]: - has_x = ctypes.c_int() - has_y = ctypes.c_int() - has_zoom = ctypes.c_int() - x_coord = ctypes.c_float() - y_coord = ctypes.c_float() - zoom_level = ctypes.c_float() - success = pdfium_c.FPDFDest_GetLocationInPage( - dest, - ctypes.byref(has_x), - ctypes.byref(has_y), - ctypes.byref(has_zoom), - ctypes.byref(x_coord), - ctypes.byref(y_coord), - ctypes.byref(zoom_level) - ) - if success: - if has_x.value and has_y.value: - return x_coord.value, y_coord.value - else: - return None - - @staticmethod - def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]: - page_width = math.ceil(abs(page_bbox[2] - page_bbox[0])) - page_height = math.ceil(abs(page_bbox[1] - page_bbox[3])) - - cx_start, cy_start, cx_end, cy_end = rect - cx_start -= page_bbox[0] - cx_end -= page_bbox[0] - cy_start -= page_bbox[1] - cy_end -= page_bbox[1] - - ty_start = page_height - cy_start - ty_end = page_height - cy_end - - bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)] - return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox - - @staticmethod - def _xy_to_scaled_pos(x, y, page_bbox, page_rotation, expand_by=1) -> List[float]: - return PdfProvider._rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_rotation)[:2] - - def get_links(self, page_idx): - urls = [] - page = self.doc[page_idx] - page_bbox: List[float] = page.get_bbox() - page_rotation = 0 - try: - page_rotation = page.get_rotation() - except: - pass - - annot_count = pdfium_c.FPDFPage_GetAnnotCount(page) - for i in range(annot_count): - link = { - 'bbox': None, - 'page': page_idx, - 'dest_page': None, - 'dest_pos': None, - 'url': None, - } - annot = pdfium_c.FPDFPage_GetAnnot(page, i) - if pdfium_c.FPDFAnnot_GetSubtype(annot) != pdfium_c.FPDF_ANNOT_LINK: - continue - - fs_rect = pdfium_c.FS_RECTF() - success = pdfium_c.FPDFAnnot_GetRect(annot, ctypes.byref(fs_rect)) - if not success: - continue - - link['bbox'] = self._rect_to_scaled_bbox( - [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom], - page_bbox, page_rotation - ) - - link_obj = pdfium_c.FPDFAnnot_GetLink(annot) - - dest = pdfium_c.FPDFLink_GetDest(self.doc, link_obj) - if dest: - tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) - link['dest_page'] = tgt_page - dest_position = self._get_dest_position(dest) - if dest_position: - link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation) - - else: - action = pdfium_c.FPDFLink_GetAction(link_obj) - a_type = pdfium_c.FPDFAction_GetType(action) - - if a_type == pdfium_c.PDFACTION_UNSUPPORTED: - continue - - elif a_type == pdfium_c.PDFACTION_GOTO: - # Goto a page - dest = pdfium_c.FPDFAction_GetDest(self.doc, action) - if dest: - tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(self.doc, dest) - link['dest_page'] = tgt_page - dest_position = self._get_dest_position(dest) - if dest_position: - link['dest_pos'] = self._xy_to_scaled_pos(*dest_position, page_bbox, page_rotation) - - elif a_type == pdfium_c.PDFACTION_URI: - # External link - needed_len = pdfium_c.FPDFAction_GetURIPath(self.doc, action, None, 0) - if needed_len > 0: - buf = ctypes.create_string_buffer(needed_len) - pdfium_c.FPDFAction_GetURIPath(self.doc, action, buf, needed_len) - uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00') - link["url"] = uri - - urls.append(link) - return urls diff --git a/tests/builders/test_pdf_links.py b/tests/builders/test_pdf_links.py index 00bff7ee..72a97070 100644 --- a/tests/builders/test_pdf_links.py +++ b/tests/builders/test_pdf_links.py @@ -13,7 +13,7 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf first_page = pdf_document.pages[0] for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)): - if section_header_span.text == " II.": + if "II." in section_header_span.text: assert section_header_span.url == "#page-1-0" break else: From 54ab2ccb25d9f83c151b83021ddd03569dfaff4a Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 14 Jan 2025 17:26:16 +0000 Subject: [PATCH 14/14] keep_chars=False [skip ci] --- marker/providers/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 8232e742..f568cb42 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -152,7 +152,7 @@ def pdftext_extraction(self) -> ProviderPageLines: page_char_blocks = dictionary_output( self.filepath, page_range=self.page_range, - keep_chars=True, + keep_chars=False, workers=self.pdftext_workers, flatten_pdf=self.flatten_pdf, quote_loosebox=False,