Skip to content

Commit

Permalink
Merge pull request #32 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Misc bugfixes and improvements
  • Loading branch information
VikParuchuri authored Jan 27, 2025
2 parents 0a4f33c + 77ba545 commit d14f873
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 132 deletions.
7 changes: 5 additions & 2 deletions pdftext/pdf/links.py
Original file line number Diff line number Diff line change
@@ -184,10 +184,13 @@ def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
link_bboxes = [Bbox(link['bbox']) for link in links]

for char in orig_span['chars']:
char_bbox = Bbox(char['bbox'])
char_bbox = Bbox(char['bbox'].bbox)
intersections: List[Tuple[float, Link]] = []
for i, link_bbox in enumerate(link_bboxes):
area = link_bbox.intersection_area(char_bbox)
if char_bbox.area > 0:
area = link_bbox.intersection_area(char_bbox)
else:
area = link_bbox.intersection_area(Bbox(char['bbox'].bbox, ensure_nonzero_area=True))
if area > 0:
intersections.append((area, links[i]))

Expand Down
7 changes: 6 additions & 1 deletion pdftext/schema.py
Original file line number Diff line number Diff line change
@@ -5,8 +5,13 @@


class Bbox:
def __init__(self, bbox: List[float]):
def __init__(self, bbox: List[float], ensure_nonzero_area=False):
if ensure_nonzero_area:
bbox = list(bbox)
bbox[2] = max(bbox[0], bbox[2] + 1)
bbox[3] = max(bbox[1], bbox[3] + 1)
self.bbox = bbox
self.ensure_nonzero_area = ensure_nonzero_area

def __getitem__(self, item):
return self.bbox[item]
Expand Down
Loading

0 comments on commit d14f873

Please sign in to comment.