diff --git a/pdftext/extraction.py b/pdftext/extraction.py
index b01ffc3..9c4e01d 100644
--- a/pdftext/extraction.py
+++ b/pdftext/extraction.py
@@ -7,6 +7,7 @@
 
 import pypdfium2 as pdfium
 
+from pdftext.pdf.links import add_links_and_refs
 from pdftext.pdf.pages import get_pages
 from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks
 from pdftext.schema import Pages, TableInputs, Tables
@@ -96,9 +97,19 @@ def dictionary_output(
         keep_chars=False,
         flatten_pdf=False,
         quote_loosebox=True,
+        disable_links=False,
         workers=None
 ) -> Pages:
     pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
+
+    if not disable_links:
+        pdf = _load_pdf(pdf_path, False)
+        try:
+            add_links_and_refs(pages, pdf)
+        finally:
+            # Release the document even when link extraction fails
+            pdf.close()
+
     for page in pages:
         page_width, page_height = page["width"], page["height"]
         for block in page["blocks"]:
@@ -122,6 +133,7 @@ def dictionary_output(
             page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]
     return pages
 
+
 def table_output(
         pdf_path: str,
         table_inputs: TableInputs,
@@ -144,4 +156,3 @@ def table_output(
         assert len(tables) == len(table_input["tables"]), "Number of tables and table inputs must match"
         out_tables.append(tables)
     return out_tables
-
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
index 4f6c872..534d303 100644
--- a/pdftext/pdf/chars.py
+++ b/pdftext/pdf/chars.py
@@ -32,8 +32,8 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
         ty_start = page_height - cy_start
         ty_end = page_height - cy_end
 
-        bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
-        bbox = Bbox(bbox).rotate(page_width, page_height, page_rotation)
+        bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
+        bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)
 
         chars.append({
             "bbox": bbox,
diff --git a/pdftext/pdf/links.py b/pdftext/pdf/links.py
new file mode 100644
index 0000000..29119d6
--- /dev/null
+++ b/pdftext/pdf/links.py
@@ -0,0 +1,322 @@
+import ctypes
+import math
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+
+from pdftext.pdf.utils import matrix_intersection_area
+from pdftext.schema import Bbox, Link, Page, Pages, Span
+
+
+def _get_dest_position(dest) -> Optional[Tuple[float, float]]:
+    """
+    Returns the (x, y) location a destination points at, or None when PDFium
+    does not report both coordinates.
+    """
+    has_x = ctypes.c_int()
+    has_y = ctypes.c_int()
+    has_zoom = ctypes.c_int()
+    x_coord = ctypes.c_float()
+    y_coord = ctypes.c_float()
+    zoom_level = ctypes.c_float()
+    success = pdfium_c.FPDFDest_GetLocationInPage(
+        dest, has_x, has_y, has_zoom,
+        x_coord, y_coord, zoom_level
+    )
+    if success and has_x.value and has_y.value:
+        return x_coord.value, y_coord.value
+    return None
+
+
+def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
+    """
+    Converts a rect given in PDF page coordinates into a bbox that is
+    shifted by the page bbox origin, flipped vertically, ordered as [min_x, min_y, max_x, max_y]
+    and rotated with the page.
+    """
+    page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
+    page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
+
+    cx_start, cy_start, cx_end, cy_end = rect
+    cx_start -= page_bbox[0]
+    cx_end -= page_bbox[0]
+    cy_start -= page_bbox[1]
+    cy_end -= page_bbox[1]
+
+    ty_start = page_height - cy_start
+    ty_end = page_height - cy_end
+
+    bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
+    return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
+
+
+def _xy_to_scaled_pos(x, y, page_bbox, page_rotation, expand_by=1) -> List[float]:
+    """
+    Converts a point into a position by converting a small box around it
+    (expand_by on every side) and keeping the first two coordinates of the result.
+    """
+    return _rect_to_scaled_bbox([x - expand_by, y - expand_by, x + expand_by, y + expand_by], page_bbox, page_rotation)[:2]
+
+
+def _apply_dest(link: Link, dest, pdf: pdfium.PdfDocument, page_bbox, page_rotation) -> None:
+    """
+    Fills dest_page (and dest_pos when available) of a link from a PDFium destination handle.
+    """
+    tgt_page = pdfium_c.FPDFDest_GetDestPageIndex(pdf, dest)
+    if tgt_page < 0:
+        # PDFium returns -1 when the destination page cannot be resolved
+        return
+
+    link['dest_page'] = tgt_page
+    dest_position = _get_dest_position(dest)
+    if dest_position:
+        # NOTE(review): the position is converted with the geometry of the page holding the link,
+        # which presumably only matches the target page when both share size and rotation - confirm
+        link['dest_pos'] = _xy_to_scaled_pos(*dest_position, page_bbox, page_rotation)
+
+
+def _link_from_annot(annot, page_idx: int, pdf: pdfium.PdfDocument, page_bbox, page_rotation) -> Optional[Link]:
+    """
+    Builds a Link from one annotation handle.
+
+    Returns None for annotations that are not links, have no rect, or carry an unsupported action.
+    """
+    if pdfium_c.FPDFAnnot_GetSubtype(annot) != pdfium_c.FPDF_ANNOT_LINK:
+        return None
+
+    fs_rect = pdfium_c.FS_RECTF()
+    if not pdfium_c.FPDFAnnot_GetRect(annot, fs_rect):
+        return None
+
+    link: Link = {
+        'page': page_idx,
+        'bbox': _rect_to_scaled_bbox(
+            [fs_rect.left, fs_rect.top, fs_rect.right, fs_rect.bottom],
+            page_bbox, page_rotation
+        ),
+        'dest_page': None,
+        'dest_pos': None,
+        'url': None,
+    }
+
+    link_obj = pdfium_c.FPDFAnnot_GetLink(annot)
+
+    dest = pdfium_c.FPDFLink_GetDest(pdf, link_obj)
+    if dest:
+        _apply_dest(link, dest, pdf, page_bbox, page_rotation)
+        return link
+
+    action = pdfium_c.FPDFLink_GetAction(link_obj)
+    a_type = pdfium_c.FPDFAction_GetType(action)
+
+    if a_type == pdfium_c.PDFACTION_UNSUPPORTED:
+        return None
+
+    if a_type == pdfium_c.PDFACTION_GOTO:
+        # Goto a page
+        dest = pdfium_c.FPDFAction_GetDest(pdf, action)
+        if dest:
+            _apply_dest(link, dest, pdf, page_bbox, page_rotation)
+
+    elif a_type == pdfium_c.PDFACTION_URI:
+        # External link
+        needed_len = pdfium_c.FPDFAction_GetURIPath(pdf, action, None, 0)
+        if needed_len > 0:
+            buf = ctypes.create_string_buffer(needed_len)
+            pdfium_c.FPDFAction_GetURIPath(pdf, action, buf, needed_len)
+            uri = buf.raw[:needed_len].decode('utf-8', errors='replace').rstrip('\x00')
+            link["url"] = uri
+
+    return link
+
+
+def get_links(page_idx: int, pdf: pdfium.PdfDocument) -> List[Link]:
+    """
+    Collects the link annotations of a page.
+
+    Every returned link has a bbox; internal links carry dest_page (and dest_pos when PDFium reports one),
+    external links carry url.
+    """
+    urls = []
+
+    page = pdf.get_page(page_idx)
+    try:
+        page_bbox: List[float] = page.get_bbox()
+        page_rotation = 0
+        try:
+            page_rotation = page.get_rotation()
+        except Exception:
+            # Best effort: treat a page with an unreadable rotation as unrotated
+            pass
+
+        annot_count = pdfium_c.FPDFPage_GetAnnotCount(page)
+        for i in range(annot_count):
+            annot = pdfium_c.FPDFPage_GetAnnot(page, i)
+            if not annot:
+                continue
+            try:
+                link = _link_from_annot(annot, page_idx, pdf, page_bbox, page_rotation)
+            finally:
+                # Annotation handles are not released automatically
+                pdfium_c.FPDFPage_CloseAnnot(annot)
+            if link is not None:
+                urls.append(link)
+    finally:
+        page.close()
+    return urls
+
+
+def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: dict):
+    """
+    Merges links with spans. Some spans can also have multiple links associated with them.
+    We break up the spans and reconstruct them taking the links into account.
+
+    Internal links get a "#page-<dest_page>-<n>" url and register their target position in refs
+    (dest page -> list of positions); external links keep the url read from the PDF.
+    """
+    page_id = page["page"]
+
+    links = get_links(page_id, pdf)
+
+    spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
+    span_bboxes = [span['bbox'].bbox for span in spans]
+    link_bboxes = [link['bbox'] for link in links]
+
+    intersection_matrix = matrix_intersection_area(link_bboxes, span_bboxes)
+
+    span_link_map: Dict[int, List[Link]] = {}
+    for link_idx, link in enumerate(links):
+        intersection_link = intersection_matrix[link_idx]
+        if intersection_link.sum() == 0:
+            continue
+
+        # Attach the link to the span it overlaps the most
+        max_intersection = int(intersection_link.argmax())
+
+        dest_page = link['dest_page']
+        if dest_page is None:
+            # External links already carry their target; a link without any target is dropped
+            if not link['url']:
+                continue
+        else:
+            refs.setdefault(dest_page, [])
+
+            if link['dest_pos']:
+                dest_pos = link['dest_pos']
+            else:
+                # Don't link to self if there is no dest_pos
+                if dest_page == page_id:
+                    continue
+                # if we don't have a dest pos, we just link to the top of the page
+                dest_pos = [0.0, 0.0]
+
+            if dest_pos not in refs[dest_page]:
+                refs[dest_page].append(dest_pos)
+
+            link['url'] = f"#page-{dest_page}-{refs[dest_page].index(dest_pos)}"
+
+        span_link_map.setdefault(max_intersection, []).append(link)
+
+    span_idx = 0
+    for block in page["blocks"]:
+        for line in block["lines"]:
+            spans = []
+            for span in line["spans"]:
+                if span_idx in span_link_map:
+                    spans.extend(_reconstruct_spans(span, span_link_map[span_idx]))
+                else:
+                    spans.append(span)
+                span_idx += 1
+            line['spans'] = spans
+
+
+def merge_refs(page: Page, refs):
+    """
+    We associate each reference to the nearest span.
+
+    Every position registered for this page adds a "page-<page>-<n>" anchor to the span whose
+    first two bbox coordinates are closest to it.
+    """
+    page_id = page["page"]
+
+    page_refs = refs.get(page_id, [])
+    if not page_refs:
+        return
+
+    spans: List[Span] = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
+    if not spans:
+        return
+
+    span_starts = np.array([span['bbox'][:2] for span in spans])
+    ref_starts = np.array(page_refs)
+
+    distances = np.linalg.norm(span_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
+
+    for ref_idx in range(len(ref_starts)):
+        span_idx = np.argmin(distances[:, ref_idx])
+        spans[span_idx].setdefault('anchors', [])
+        spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")
+
+
+def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
+    """
+    Reconstructs the spans by breaking them up into smaller spans based on the links.
+
+    Consecutive chars that resolve to the same url (the link overlapping the char the most,
+    or '' when no link overlaps it) are grouped into one span.
+    """
+    spans: List[Span] = []
+    span: Optional[Span] = None
+    link_bboxes = [Bbox(link['bbox']) for link in links]
+
+    for char in orig_span['chars']:
+        # NOTE(review): schema.Char declares bbox as a Bbox already, so this wraps it a second time -
+        # confirm Bbox accepts another Bbox as input
+        char_bbox = Bbox(char['bbox'])
+        intersections: List[Tuple[float, Link]] = []
+        for i, link_bbox in enumerate(link_bboxes):
+            area = link_bbox.intersection_area(char_bbox)
+            if area > 0:
+                intersections.append((area, links[i]))
+
+        current_url = ''
+        if intersections:
+            intersections.sort(key=lambda x: x[0], reverse=True)
+            current_url = intersections[0][1]['url']
+
+        if not span or current_url != span['url']:
+            span = {
+                "bbox": char_bbox,
+                "text": char["char"],
+                "rotation": char["rotation"],
+                "font": char["font"],
+                "char_start_idx": char["char_idx"],
+                "char_end_idx": char["char_idx"],
+                "chars": [char],
+                "url": current_url,
+                "anchors": [],
+            }
+            spans.append(span)
+        else:
+            span['text'] += char['char']
+            span['char_end_idx'] = char['char_idx']
+            span['bbox'] = span['bbox'].merge(char_bbox)
+            span['chars'].append(char)
+
+    return spans
+
+
+def add_links_and_refs(pages: Pages, pdf_doc: pdfium.PdfDocument):
+    """
+    Adds link urls and anchors to the spans of all pages, in place.
+
+    Links are merged for every page first so that all references are known before anchors are placed.
+    """
+    refs: Dict[int, List[List[float]]] = {}
+
+    for page in pages:
+        merge_links(page, pdf_doc, refs)
+    for page in pages:
+        merge_refs(page, refs)
diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
index 166f1e4..15e0969 100644
--- a/pdftext/pdf/pages.py
+++ b/pdftext/pdf/pages.py
@@ -23,7 +23,9 @@ def span_break():
             "font": char["font"],
             "char_start_idx": char["char_idx"],
             "char_end_idx": char["char_idx"],
-            "chars": [char]
+            "chars": [char],
+            "url": '',
+            "anchors": []
         })
 
     for char in chars:
diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py
index 6719434..0412c39 100644
--- a/pdftext/pdf/utils.py
+++ b/pdftext/pdf/utils.py
@@ -1,5 +1,7 @@
 from ctypes import byref, c_int, create_string_buffer
+from typing import List
 
+import numpy as np
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 
@@ -34,3 +36,29 @@ def get_fontname(textpage, i):
     except:
         pass
     return font_name_str, flags
+
+
+def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
+    """
+    Computes the pairwise intersection areas of two lists of [x0, y0, x1, y1] boxes.
+
+    Returns an array of shape (len(boxes1), len(boxes2)); it is all zeros when either list is empty.
+    """
+    if len(boxes1) == 0 or len(boxes2) == 0:
+        return np.zeros((len(boxes1), len(boxes2)))
+
+    boxes1 = np.array(boxes1)
+    boxes2 = np.array(boxes2)
+
+    boxes1 = boxes1[:, np.newaxis, :]  # Shape: (N, 1, 4)
+    boxes2 = boxes2[np.newaxis, :, :]  # Shape: (1, M, 4)
+
+    min_x = np.maximum(boxes1[..., 0], boxes2[..., 0])  # Shape: (N, M)
+    min_y = np.maximum(boxes1[..., 1], boxes2[..., 1])
+    max_x = np.minimum(boxes1[..., 2], boxes2[..., 2])
+    max_y = np.minimum(boxes1[..., 3], boxes2[..., 3])
+
+    width = np.maximum(0, max_x - min_x)
+    height = np.maximum(0, max_y - min_y)
+
+    return width * height  # Shape: (N, M)
diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py
index d4c0d92..4174149 100644
--- a/pdftext/postprocessing.py
+++ b/pdftext/postprocessing.py
@@ -1,7 +1,8 @@
-from typing import List, Dict
 import unicodedata
+from typing import List
 
-from pdftext.pdf.utils import SPACES, LINE_BREAKS, TABS, WHITESPACE_CHARS
+from pdftext.pdf.utils import LINE_BREAKS, SPACES, TABS, WHITESPACE_CHARS
+from pdftext.schema import Page
 
 LIGATURES = {
     "ff": "ff",
@@ -91,7 +92,7 @@ def sort_blocks(blocks: List, tolerance=1.25) -> List:
     return sorted_page_blocks
 
 
-def merge_text(page: Dict, sort=False, hyphens=False) -> str:
+def merge_text(page: Page, sort=False, hyphens=False) -> str:
     text = ""
     if sort:
         page["blocks"] = sort_blocks(page["blocks"])
diff --git a/pdftext/schema.py b/pdftext/schema.py
index 3692720..cdc3051 100644
--- a/pdftext/schema.py
+++ b/pdftext/schema.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Dict, List, TypedDict, Union
+from typing import Any, Dict, List, Optional, TypedDict, Union
 
 
 class Bbox:
@@ -119,7 +119,7 @@ def rescale(self, img_size: List[int], page: Page) -> Bbox:
 
 class Char(TypedDict):
     bbox: Bbox
-    text: str
+    char: str
     rotation: float
     font: Dict[str, Union[Any, str]]
     char_idx: int
@@ -129,21 +129,24 @@ class Span(TypedDict):
     bbox: Bbox
     text: str
     font: Dict[str, Union[Any, str]]
-    font_weight: float
-    font_size: float
-    chars: List[Char] | None
+    chars: List[Char]
     char_start_idx: int
     char_end_idx: int
+    rotation: int
+    url: str
+    anchors: List[str]
 
 
 class Line(TypedDict):
     spans: List[Span]
     bbox: Bbox
+    rotation: int
 
 
 class Block(TypedDict):
     lines: List[Line]
     bbox: Bbox
+    rotation: int
 
 
 class Page(TypedDict):
@@ -154,15 +157,25 @@ class Page(TypedDict):
     blocks: List[Block]
     rotation: int
 
+
 class TableCell(TypedDict):
     text: str
     bbox: Bbox
 
+
 class TableInput(TypedDict):
     tables: List[List[int]]
     img_size: List[int]
 
 
+class Link(TypedDict):
+    page: int
+    bbox: List[float]
+    dest_page: Optional[int]
+    dest_pos: Optional[List[float]]
+    url: Optional[str]
+
+
 Chars = List[Char]
 Spans = List[Span]
 Lines = List[Line]