Skip to content

Commit

Permalink
Merge pull request #30 from VikParuchuri/dev-mose/upgraded-links
Browse files Browse the repository at this point in the history
Improved References
  • Loading branch information
VikParuchuri authored Jan 17, 2025
2 parents 9af0711 + a060c2e commit e3595fc
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 52 deletions.
67 changes: 17 additions & 50 deletions pdftext/pdf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
import math
from typing import Dict, List, Optional, Tuple

import numpy as np
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import matrix_intersection_area
from pdftext.schema import Bbox, Link, Page, Pages, Span
from pdftext.schema import Bbox, Link, Page, PageReference, Pages, Span


def _get_dest_position(dest) -> Optional[Tuple[float, float]]:
Expand Down Expand Up @@ -122,7 +121,7 @@ def get_links(page_idx: int, pdf: pdfium.PdfDocument) -> List[Link]:
return urls


def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: dict):
def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: PageReference):
"""
Merges links with spans. Some spans can also have multiple links associated with them.
We break up the spans and reconstruct them taking the links into account.
Expand All @@ -146,25 +145,19 @@ def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: dict):
max_intersection = intersection_link.argmax()
span = spans[max_intersection]

if link['dest_page'] is None:
continue

dest_page = link['dest_page']
refs.setdefault(dest_page, [])

if link['dest_pos']:
dest_pos = link['dest_pos']
else:
# Don't link to self if there is no dest_pos
if dest_page == page_id:
continue
# if we don't have a dest pos, we just link to the top of the page
dest_pos = [0.0, 0.0]

if dest_pos not in refs[dest_page]:
refs[dest_page].append(dest_pos)

link['url'] = f"#page-{dest_page}-{refs[dest_page].index(dest_pos)}"
if dest_page is not None:
if link['dest_pos']:
dest_pos = link['dest_pos']
else:
# Don't link to self if there is no dest_pos
if dest_page == page_id:
continue
# if we don't have a dest pos, we just link to the top of the page
dest_pos = [0.0, 0.0]

ref = refs.add_ref(dest_page, dest_pos)
link['url'] = ref.url

span_link_map.setdefault(max_intersection, [])
span_link_map[max_intersection].append(link)
Expand All @@ -182,32 +175,6 @@ def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: dict):
line['spans'] = spans


def merge_refs(page: Page, refs):
    """
    Attach every reference registered for this page to its closest span.

    ``refs`` is looked up with ``refs.get(page_id, [])`` and each entry is
    treated as a 2-D point. For every point, the span whose ``bbox`` start
    (its first two values) lies at the smallest Euclidean distance receives
    an anchor named ``page-<page_id>-<ref_idx>`` in its ``anchors`` list.
    Spans are mutated in place; nothing is returned.
    """
    page_id = page["page"]

    ref_points = refs.get(page_id, [])
    if not ref_points:
        return

    # Flatten blocks -> lines -> spans, keeping document order.
    spans: List[Span] = []
    for block in page['blocks']:
        for line in block['lines']:
            spans.extend(line['spans'])
    if not spans:
        return

    span_origins = np.array([span['bbox'][:2] for span in spans])
    ref_origins = np.array(ref_points)

    # distances[i, j] is the distance from span i to reference j.
    distances = np.linalg.norm(span_origins[:, np.newaxis, :] - ref_origins[np.newaxis, :, :], axis=2)

    # One argmin per column gives the nearest span for each reference.
    nearest_spans = np.argmin(distances, axis=0)
    for ref_idx, span_idx in enumerate(nearest_spans):
        spans[span_idx].setdefault('anchors', []).append(f"page-{page_id}-{ref_idx}")


def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
"""
Reconstructs the spans by breaking them up into smaller spans based on the links.
Expand Down Expand Up @@ -239,7 +206,6 @@ def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
"char_end_idx": char["char_idx"],
"chars": [char],
"url": current_url,
"anchors": [],
}
spans.append(span)
else:
Expand All @@ -252,9 +218,10 @@ def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:


def add_links_and_refs(pages: Pages, pdf_doc: pdfium.PdfDocument):
    """
    Merge link annotations into every page and attach the references that
    point at each page.

    A single ``PageReference`` registry is shared across all pages. The
    stale pre-refactor statements are removed: the plain-dict ``refs`` was
    immediately overwritten, and ``merge_refs`` expects an object with
    ``.get``, which ``PageReference`` does not provide.
    """
    refs = PageReference()

    # First pass: merge_links receives the shared registry so that link
    # destinations from every page end up in the same place.
    for page in pages:
        merge_links(page, pdf_doc, refs)

    # Second pass: runs only after all pages were processed, so the list
    # stored on a page reflects links found anywhere in the document.
    for page in pages:
        page["refs"] = refs.get_refs(page["page"])
1 change: 0 additions & 1 deletion pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def span_break():
"char_end_idx": char["char_idx"],
"chars": [char],
"url": '',
"anchors": []
})

for char in chars:
Expand Down
41 changes: 40 additions & 1 deletion pdftext/schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, TypedDict, Union


Expand Down Expand Up @@ -134,7 +135,6 @@ class Span(TypedDict):
char_end_idx: int
rotation: int
url: str
anchors: List[str]


class Line(TypedDict):
Expand All @@ -156,6 +156,7 @@ class Page(TypedDict):
height: int
blocks: List[Block]
rotation: int
refs: List[Reference]


class TableCell(TypedDict):
Expand All @@ -176,6 +177,44 @@ class Link(TypedDict):
url: Optional[str]


@dataclass
class Reference:
    """A link destination on a page, addressable through a generated anchor."""

    # Index that, together with ``page``, makes up the anchor name.
    idx: int
    # Page number used in the anchor name.
    page: int
    # Coordinates of the destination.
    coord: List[float]

    @property
    def ref(self) -> str:
        """Anchor name, formatted as ``page-<page>-<idx>``."""
        return f"page-{self.page}-{self.idx}"

    @property
    def url(self) -> str:
        """Fragment URL for the anchor: ``ref`` prefixed with ``#``."""
        return f"#{self.ref}"


class PageReference:
    """
    Registry of link destinations, grouped by the page they point to.

    Each distinct coordinate on a page is stored once; its position in the
    per-page list becomes the ``idx`` of the resulting ``Reference``.
    """

    def __init__(self):
        # page number -> references registered for that page, in insertion order
        self.page_ref_map: Dict[int, List[Reference]] = {}

    def get_refs(self, page: int) -> List[Reference]:
        """Return the references registered for ``page`` (empty list if none)."""
        return self.page_ref_map.get(page, [])

    def add_ref(self, page: int, coord: List[float]) -> Reference:
        """
        Return the reference for ``coord`` on ``page``, creating it if needed.

        The coordinate is stored as a list copy, so later mutation of the
        caller's sequence cannot change a registered reference, and a tuple
        input is stored in the same shape as a list input.
        """
        existing = self.check_ref(page, coord)
        if existing is not None:
            return existing

        page_refs = self.page_ref_map.setdefault(page, [])
        ref = Reference(idx=len(page_refs), page=page, coord=list(coord))
        page_refs.append(ref)
        return ref

    def check_ref(self, page: int, coord: List[float]) -> Optional[Reference]:
        """
        Return the reference already registered for ``coord`` on ``page``,
        or ``None`` when there is none.

        Both sides are compared as lists: a tuple never equals a list in
        Python, so comparing raw values would register the same destination
        twice when one caller passes ``(x, y)`` and another ``[x, y]``.
        """
        target = list(coord)
        for ref in self.page_ref_map.get(page, []):
            if list(ref.coord) == target:
                return ref
        return None


Chars = List[Char]
Spans = List[Span]
Lines = List[Line]
Expand Down

0 comments on commit e3595fc

Please sign in to comment.