Skip to content

Commit

Permalink
clean up span definitions
Browse files: browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Jan 14, 2025
1 parent f6d7b28 commit 38f1500
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 12 deletions.
4 changes: 2 additions & 2 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
ty_start = page_height - cy_start
ty_end = page_height - cy_end

bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
bbox = Bbox(bbox).rotate(page_width, page_height, page_rotation)
bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)

chars.append({
"bbox": bbox,
Expand Down
15 changes: 8 additions & 7 deletions pdftext/pdf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import matrix_intersection_area
from pdftext.schema import Bbox, Link, Page, Pages
from pdftext.schema import Bbox, Link, Page, Pages, Span


def _get_dest_position(dest) -> Optional[Tuple[float, float]]:
Expand Down Expand Up @@ -187,7 +187,7 @@ def merge_links(page: Page, pdf: pdfium.PdfDocument, refs: dict):
line['spans'] = spans


def merge_refs(page, refs):
def merge_refs(page: Page, refs):
"""
We associate each reference to the nearest span.
"""
Expand All @@ -198,7 +198,7 @@ def merge_refs(page, refs):
if not page_refs:
return

spans = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
spans: List[Span] = [span for block in page['blocks'] for line in block['lines'] for span in line['spans']]
if not spans:
return

Expand All @@ -213,12 +213,12 @@ def merge_refs(page, refs):
spans[span_idx]['anchors'].append(f"page-{page_id}-{ref_idx}")


def _reconstruct_spans(orig_span: dict, links: List[Link]):
def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
"""
Reconstructs the spans by breaking them up into smaller spans based on the links.
"""
spans = []
span = None
spans: List[Span] = []
span: Span = None
link_bboxes = [Bbox(link['bbox']) for link in links]

for char in orig_span['chars']:
Expand All @@ -243,7 +243,8 @@ def _reconstruct_spans(orig_span: dict, links: List[Link]):
"char_start_idx": char["char_idx"],
"char_end_idx": char["char_idx"],
"chars": [char],
"url": current_url
"url": current_url,
"anchors": [],
}
spans.append(span)
else:
Expand Down
4 changes: 3 additions & 1 deletion pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def span_break():
"font": char["font"],
"char_start_idx": char["char_idx"],
"char_end_idx": char["char_idx"],
"chars": [char]
"chars": [char],
"url": '',
"anchors": []
})

for char in chars:
Expand Down
4 changes: 2 additions & 2 deletions pdftext/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@ class Span(TypedDict):
bbox: Bbox
text: str
font: Dict[str, Union[Any, str]]
font_weight: float
font_size: float
chars: List[Char]
char_start_idx: int
char_end_idx: int
rotation: int
url: str
anchors: List[str]


class Line(TypedDict):
Expand Down

0 comments on commit 38f1500

Please sign in to comment.