Skip to content

Commit

Permalink
Handle rotation with nonzero origin
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 26, 2024
1 parent 648f0af commit 250c513
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 9 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ The output will be a json list, with each item in the list corresponding to a si
- `name` - font name, may be None
- `flags` - font flags, in the format of the `PDF spec 1.7 Section 5.7.1 Font Descriptor Flags`

If the PDF is rotated, the bboxes will be relative to the rotated page (they are extracted first, then rotated to match the page rotation).

# Programmatic usage

Extract plain text:
Expand Down
Binary file modified models/dt.joblib
Binary file not shown.
35 changes: 27 additions & 8 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,40 @@ def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height):


def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, page_rotation: int, normalize=False):
orig_page_height, orig_page_width = page_height, page_width
if page_rotation in [90, 270]:
orig_page_height, orig_page_width = page_width, page_height

if bl_origin:
bbox = fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
if page_rotation > 0:
bbox = rotate_page_bbox(bbox, page_rotation, page_width, page_height)
else:
# Do not rotate here, since we'll do it manually later
bbox = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, 0)

if page_rotation > 0:
bbox = rotate_bbox(bbox, page_rotation, page_width, page_height)
bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
if page_rotation > 0:
bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)

if normalize:
bbox = [bbox[0] / page_width, bbox[1] / page_height, bbox[2] / page_width, bbox[3] / page_height]
return bbox


def rotate_bbox(bbox, angle_deg, width, height):
def rotate_pdfium_bbox(bbox, angle_deg, width, height):
x1, y1, x2, y2 = bbox
if angle_deg == 90:
bbox = [y1, x1, y2, x2]
bbox = [bbox[2], height - bbox[1], bbox[0], height - bbox[3]]
elif angle_deg == 180:
bbox = [x2, y2, x1, y1]
bbox = [width - bbox[0], height - bbox[1], width - bbox[2], height - bbox[3]]
elif angle_deg == 270:
bbox = rotate_pdfium_bbox(bbox, 90, width, height)
bbox = rotate_pdfium_bbox(bbox, 180, width, height)

return bbox


def rotate_page_bbox(bbox, angle_deg, width, height):
x1, y1, x2, y2 = bbox
if angle_deg == 90:
bbox = [y1, x1, y2, x2]
Expand All @@ -123,7 +142,7 @@ def rotate_bbox(bbox, angle_deg, width, height):
bbox = [x2, y2, x1, y1]
bbox = [width - bbox[0], height - bbox[1], width - bbox[2], height - bbox[3]]
elif angle_deg == 270:
bbox = rotate_bbox(bbox, 90, width, height)
bbox = rotate_bbox(bbox, 180, width, height)
bbox = rotate_page_bbox(bbox, 90, width, height)
bbox = rotate_page_bbox(bbox, 180, width, height)

return bbox
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.1.2"
version = "0.2.0"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 250c513

Please sign in to comment.