Skip to content

Commit

Permalink
Merge pull request #29 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Link support
  • Loading branch information
VikParuchuri authored Jan 21, 2025
2 parents 4671d86 + e3595fc commit 0a4f33c
Show file tree
Hide file tree
Showing 7 changed files with 324 additions and 12 deletions.
10 changes: 9 additions & 1 deletion pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pypdfium2 as pdfium

from pdftext.pdf.links import add_links_and_refs
from pdftext.pdf.pages import get_pages
from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks
from pdftext.schema import Pages, TableInputs, Tables
Expand Down Expand Up @@ -96,9 +97,16 @@ def dictionary_output(
keep_chars=False,
flatten_pdf=False,
quote_loosebox=True,
disable_links=False,
workers=None
) -> Pages:
pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)

if not disable_links:
pdf = _load_pdf(pdf_path, False)
add_links_and_refs(pages, pdf)
pdf.close()

for page in pages:
page_width, page_height = page["width"], page["height"]
for block in page["blocks"]:
Expand All @@ -122,6 +130,7 @@ def dictionary_output(
page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]
return pages


def table_output(
pdf_path: str,
table_inputs: TableInputs,
Expand All @@ -144,4 +153,3 @@ def table_output(
assert len(tables) == len(table_input["tables"]), "Number of tables and table inputs must match"
out_tables.append(tables)
return out_tables

4 changes: 2 additions & 2 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
ty_start = page_height - cy_start
ty_end = page_height - cy_end

bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
bbox = Bbox(bbox).rotate(page_width, page_height, page_rotation)
bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)

chars.append({
"bbox": bbox,
Expand Down
227 changes: 227 additions & 0 deletions pdftext/pdf/links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
import ctypes
import math
from typing import Dict, List, Optional, Tuple

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import matrix_intersection_area
from pdftext.schema import Bbox, Link, Page, PageReference, Pages, Span


def _get_dest_position(dest) -> Optional[Tuple[float, float]]:
    """
    Read the (x, y) location stored in a pdfium destination handle.

    Returns the coordinate pair only when the pdfium call reports success and
    flags both an x and a y value as present; otherwise returns None. The zoom
    outputs are passed to the call but are not used afterwards.
    """
    x_present, y_present, zoom_present = ctypes.c_int(), ctypes.c_int(), ctypes.c_int()
    x_val, y_val, zoom_val = ctypes.c_float(), ctypes.c_float(), ctypes.c_float()

    ok = pdfium_c.FPDFDest_GetLocationInPage(
        dest,
        x_present, y_present, zoom_present,
        x_val, y_val, zoom_val,
    )
    if not ok:
        return None
    if not (x_present.value and y_present.value):
        return None
    return x_val.value, y_val.value


def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
    """
    Convert a rectangle into a page-relative, rotated bounding box.

    The rectangle is shifted by the page bbox origin, its y values are flipped
    against the (ceiled) page height, the corners are ordered as
    [min_x, min_y, max_x, max_y], and the result is rotated with Bbox.rotate.
    """
    origin_x, origin_y = page_bbox[0], page_bbox[1]
    width = math.ceil(abs(page_bbox[2] - origin_x))
    height = math.ceil(abs(origin_y - page_bbox[3]))

    raw_x0, raw_y0, raw_x1, raw_y1 = rect
    x0 = raw_x0 - origin_x
    x1 = raw_x1 - origin_x

    # Flip the y axis: subtract the origin-relative y from the page height.
    y0 = height - (raw_y0 - origin_y)
    y1 = height - (raw_y1 - origin_y)

    ordered = [min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)]
    rotated = Bbox(ordered).rotate(width, height, page_rotation)
    return rotated.bbox


def _xy_to_scaled_pos(x, y, page_bbox, page_rotation, expand_by=1) -> List[float]:
    """
    Convert a single point into page-relative coordinates.

    The point is grown into a square reaching expand_by in every direction,
    converted with _rect_to_scaled_bbox, and the first two values of the
    resulting bbox are returned.
    """
    square = [x - expand_by, y - expand_by, x + expand_by, y + expand_by]
    scaled = _rect_to_scaled_bbox(square, page_bbox, page_rotation)
    return scaled[:2]


def _set_link_dest(link: Link, pdf: pdfium.PdfDocument, dest, page_bbox: List[float], page_rotation) -> None:
    """Store the destination page index and, when available, the scaled position on ``link``."""
    link['dest_page'] = pdfium_c.FPDFDest_GetDestPageIndex(pdf, dest)
    dest_position = _get_dest_position(dest)
    if dest_position:
        # NOTE(review): the position is scaled with the bbox/rotation of the page that
        # contains the link, not of the destination page - confirm this is intended.
        link['dest_pos'] = _xy_to_scaled_pos(*dest_position, page_bbox, page_rotation)


def _read_action_uri(pdf: pdfium.PdfDocument, action) -> Optional[str]:
    """Return the URI stored in a URI action, or None when pdfium reports no data."""
    # First call with no buffer asks pdfium how many bytes are needed.
    needed_len = pdfium_c.FPDFAction_GetURIPath(pdf, action, None, 0)
    if needed_len <= 0:
        return None
    buf = ctypes.create_string_buffer(needed_len)
    pdfium_c.FPDFAction_GetURIPath(pdf, action, buf, needed_len)
    return buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')


def _link_from_annot(annot, page_idx: int, pdf: pdfium.PdfDocument, page_bbox: List[float], page_rotation) -> Optional[Link]:
    """
    Build a Link dict from one annotation handle.

    Returns None when the annotation is not a link, its rectangle cannot be
    read, or its action type is PDFACTION_UNSUPPORTED.
    """
    if pdfium_c.FPDFAnnot_GetSubtype(annot) != pdfium_c.FPDF_ANNOT_LINK:
        return None

    fs_rect = pdfium_c.FS_RECTF()
    if not pdfium_c.FPDFAnnot_GetRect(annot, fs_rect):
        return None

    link: Link = {
        'page': page_idx,
        'bbox': _rect_to_scaled_bbox(
            [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom],
            page_bbox, page_rotation
        ),
        'dest_page': None,
        'dest_pos': None,
        'url': None,
    }

    link_obj = pdfium_c.FPDFAnnot_GetLink(annot)

    # A direct destination takes precedence over any action on the link.
    dest = pdfium_c.FPDFLink_GetDest(pdf, link_obj)
    if dest:
        _set_link_dest(link, pdf, dest, page_bbox, page_rotation)
        return link

    action = pdfium_c.FPDFLink_GetAction(link_obj)
    a_type = pdfium_c.FPDFAction_GetType(action)

    if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
        return None

    if a_type == pdfium_c.PDFACTION_GOTO:
        # Goto a page
        dest = pdfium_c.FPDFAction_GetDest(pdf, action)
        if dest:
            _set_link_dest(link, pdf, dest, page_bbox, page_rotation)
    elif a_type == pdfium_c.PDFACTION_URI:
        # External link
        uri = _read_action_uri(pdf, action)
        if uri is not None:
            link["url"] = uri

    # Other action types are still returned, with no destination and no url.
    return link


def get_links(page_idx: int, pdf: pdfium.PdfDocument) -> List[Link]:
    """
    Collect the link annotations of one page.

    Args:
        page_idx: Index of the page to inspect.
        pdf: Open pdfium document.

    Returns:
        One Link dict per link annotation whose rectangle could be read and
        whose action type is not PDFACTION_UNSUPPORTED. Each dict carries the
        page index, the scaled bbox, and whichever of dest_page / dest_pos /
        url could be resolved (the rest stay None).
    """
    links: List[Link] = []

    page = pdf.get_page(page_idx)
    page_bbox: List[float] = page.get_bbox()
    try:
        page_rotation = page.get_rotation()
    except Exception:
        # Best effort: treat the page as unrotated when the rotation cannot be read,
        # but do not swallow KeyboardInterrupt / SystemExit like a bare except would.
        page_rotation = 0

    annot_count = pdfium_c.FPDFPage_GetAnnotCount(page)
    for i in range(annot_count):
        annot = pdfium_c.FPDFPage_GetAnnot(page, i)
        if not annot:
            continue
        try:
            link = _link_from_annot(annot, page_idx, pdf, page_bbox, page_rotation)
        finally:
            # Every handle returned by FPDFPage_GetAnnot must be released again.
            pdfium_c.FPDFPage_CloseAnnot(annot)

        if link is not None:
            links.append(link)
    return links


def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: PageReference):
    """
    Merges links with spans. Some spans can also have multiple links associated with them.
    We break up the spans and reconstruct them taking the links into account.

    Each link is assigned to the span it overlaps most. Links that point to a
    page are turned into a reference url via ``refs.add_ref``. Links that end
    up without any url are ignored so they neither split spans nor write a
    ``None`` url. The page is modified in place; nothing is returned.
    """
    page_id = page["page"]

    links = get_links(page_id, pdf)

    flat_spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
    span_bboxes = [span['bbox'].bbox for span in flat_spans]
    link_bboxes = [link['bbox'] for link in links]

    # Rows are links, columns are spans (in flattened page order).
    intersection_matrix = matrix_intersection_area(link_bboxes, span_bboxes)

    span_link_map: Dict[int, List[Link]] = {}
    for link_idx, link in enumerate(links):
        intersection_link = intersection_matrix[link_idx]
        if intersection_link.sum() == 0:
            continue

        dest_page = link['dest_page']
        if dest_page is not None:
            if link['dest_pos']:
                dest_pos = link['dest_pos']
            else:
                # Don't link to self if there is no dest_pos
                if dest_page == page_id:
                    continue
                # if we don't have a dest pos, we just link to the top of the page
                dest_pos = [0.0, 0.0]

            ref = refs.add_ref(dest_page, dest_pos)
            link['url'] = ref.url

        if not link['url']:
            # Nothing to attach: no destination and no external url were resolved.
            continue

        best_span_idx = int(intersection_link.argmax())
        span_link_map.setdefault(best_span_idx, []).append(link)

    # Walk the spans in the same order used for flattening so indices line up.
    span_idx = 0
    for block in page["blocks"]:
        for line in block["lines"]:
            rebuilt_spans = []
            for span in line["spans"]:
                if span_idx in span_link_map:
                    rebuilt_spans.extend(_reconstruct_spans(span, span_link_map[span_idx]))
                else:
                    rebuilt_spans.append(span)
                span_idx += 1
            line['spans'] = rebuilt_spans


def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
    """
    Split a span into smaller spans according to the links covering its characters.

    Every character takes the url of the link it overlaps most ('' when it
    overlaps none). Consecutive characters with the same url are grouped into
    one new span.
    """
    link_boxes = [Bbox(link['bbox']) for link in links]
    rebuilt: List[Span] = []
    current: Span = None

    for char in orig_span['chars']:
        char_box = Bbox(char['bbox'])

        # Gather each overlapping link together with its overlap area.
        overlaps: List[Tuple[float, Link]] = []
        for candidate, candidate_box in zip(links, link_boxes):
            overlap = candidate_box.intersection_area(char_box)
            if overlap > 0:
                overlaps.append((overlap, candidate))

        if overlaps:
            # max() keeps the first of equally large overlaps, just like a
            # stable descending sort followed by taking the first element.
            char_url = max(overlaps, key=lambda pair: pair[0])[1]['url']
        else:
            char_url = ''

        if current and char_url == current['url']:
            # Same url as the running span: extend it with this character.
            current['text'] += char['char']
            current['char_end_idx'] = char['char_idx']
            current['bbox'] = current['bbox'].merge(char_box)
            current['chars'].append(char)
            continue

        # Url changed (or this is the first character): open a new span.
        current = {
            "bbox": char_box,
            "text": char["char"],
            "rotation": char["rotation"],
            "font": char["font"],
            "char_start_idx": char["char_idx"],
            "char_end_idx": char["char_idx"],
            "chars": [char],
            "url": char_url,
        }
        rebuilt.append(current)

    return rebuilt


def add_links_and_refs(pages: Pages, pdf_doc: pdfium.PdfDocument):
    """
    Merge links into every page and store each page's references under "refs".

    All pages are merged before any "refs" entry is written, so the shared
    PageReference has seen the links of every page by the time it is queried.
    """
    reference_store = PageReference()

    # First pass: merge links page by page, registering references as we go.
    for current_page in pages:
        merge_links(current_page, pdf_doc, reference_store)

    # Second pass: attach the references collected for each page.
    for current_page in pages:
        page_number = current_page["page"]
        current_page["refs"] = reference_store.get_refs(page_number)
3 changes: 2 additions & 1 deletion pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def span_break():
"font": char["font"],
"char_start_idx": char["char_idx"],
"char_end_idx": char["char_idx"],
"chars": [char]
"chars": [char],
"url": '',
})

for char in chars:
Expand Down
23 changes: 23 additions & 0 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from ctypes import byref, c_int, create_string_buffer
from typing import List

import numpy as np
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

Expand Down Expand Up @@ -34,3 +36,24 @@ def get_fontname(textpage, i):
except:
pass
return font_name_str, flags


def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
    """
    Compute the pairwise intersection areas between two lists of boxes.

    Boxes are given as [x0, y0, x1, y1]. The result has shape
    (len(boxes1), len(boxes2)); entry (i, j) is the overlap area of
    boxes1[i] and boxes2[j], or 0 when they do not overlap. If either list is
    empty, an all-zero array of that shape is returned.
    """
    n_first, n_second = len(boxes1), len(boxes2)
    if n_first == 0 or n_second == 0:
        return np.zeros((n_first, n_second))

    # Add singleton axes so the two sets broadcast against each other.
    first = np.array(boxes1)[:, None, :]    # (N, 1, 4)
    second = np.array(boxes2)[None, :, :]   # (1, M, 4)

    # Intersection rectangle: largest lower corner and smallest upper corner.
    lower_corner = np.maximum(first[..., :2], second[..., :2])      # (N, M, 2)
    upper_corner = np.minimum(first[..., 2:4], second[..., 2:4])    # (N, M, 2)

    # Negative extents mean no overlap along that axis; clamp them to zero.
    extent = np.maximum(0, upper_corner - lower_corner)

    return extent[..., 0] * extent[..., 1]  # (N, M)
7 changes: 4 additions & 3 deletions pdftext/postprocessing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import List, Dict
import unicodedata
from typing import List

from pdftext.pdf.utils import SPACES, LINE_BREAKS, TABS, WHITESPACE_CHARS
from pdftext.pdf.utils import LINE_BREAKS, SPACES, TABS, WHITESPACE_CHARS
from pdftext.schema import Page

LIGATURES = {
"ff": "ff",
Expand Down Expand Up @@ -91,7 +92,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List:
return sorted_page_blocks


def merge_text(page: Dict, sort=False, hyphens=False) -> str:
def merge_text(page: Page, sort=False, hyphens=False) -> str:
text = ""
if sort:
page["blocks"] = sort_blocks(page["blocks"])
Expand Down
Loading

0 comments on commit 0a4f33c

Please sign in to comment.