Skip to content

Commit

Permalink
handle zero area chars when reconstructing spans
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Jan 25, 2025
1 parent b2a6ee9 commit 1452098
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
5 changes: 4 additions & 1 deletion pdftext/pdf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,10 @@ def _reconstruct_spans(orig_span: dict, links: List[Link]) -> List[Span]:
char_bbox = Bbox(char['bbox'].bbox)
intersections: List[Tuple[float, Link]] = []
for i, link_bbox in enumerate(link_bboxes):
- area = link_bbox.intersection_area(char_bbox)
+ if char_bbox.area > 0:
+ area = link_bbox.intersection_area(char_bbox)
+ else:
+ area = link_bbox.intersection_area(Bbox(char['bbox'].bbox, ensure_nonzero_area=True))
if area > 0:
intersections.append((area, links[i]))

Expand Down
6 changes: 5 additions & 1 deletion pdftext/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@


class Bbox:
- def __init__(self, bbox: List[float]):
+ def __init__(self, bbox: List[float], ensure_nonzero_area=False):
+ if ensure_nonzero_area:
+ bbox[2] = max(bbox[0], bbox[2] + 1)
+ bbox[3] = max(bbox[1], bbox[3] + 1)
  self.bbox = bbox
+ self.ensure_nonzero_area = ensure_nonzero_area

def __getitem__(self, item):
return self.bbox[item]
Expand Down

0 comments on commit 1452098

Please sign in to comment.