diff --git a/README.md b/README.md index 95e7348..4e188ac 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,8 @@ The output will be a json list, with each item in the list corresponding to a si - `name` - font name, may be None - `flags` - font flags, in the format of the `PDF spec 1.7 Section 5.7.1 Font Descriptor Flags` +If the pdf is rotated, the bboxes will be relative to the rotated page (they're rotated after being extracted). + # Programmatic usage Extract plain text: diff --git a/models/dt.joblib b/models/dt.joblib index 198d963..5aa9d22 100644 Binary files a/models/dt.joblib and b/models/dt.joblib differ diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py index fa4c3af..495a70d 100644 --- a/pdftext/pdf/utils.py +++ b/pdftext/pdf/utils.py @@ -100,21 +100,40 @@ def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height): def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, page_rotation: int, normalize=False): + orig_page_height, orig_page_width = page_height, page_width + if page_rotation in [90, 270]: + orig_page_height, orig_page_width = page_width, page_height + if bl_origin: bbox = fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height) + if page_rotation > 0: + bbox = rotate_page_bbox(bbox, page_rotation, page_width, page_height) else: - # Do not rotate here, since we'll do it manually later - bbox = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, 0) - - if page_rotation > 0: - bbox = rotate_bbox(bbox, page_rotation, page_width, page_height) + bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation) + if page_rotation > 0: + bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height) if normalize: bbox = [bbox[0] / page_width, bbox[1] / page_height, bbox[2] / page_width, bbox[3] / page_height] return bbox -def rotate_bbox(bbox, angle_deg, width, height): +def rotate_pdfium_bbox(bbox, angle_deg, width, height): + x1, y1, x2, y2 = bbox + if angle_deg == 90: + bbox = [y1, x1, y2, x2] + bbox = [bbox[2], height - bbox[1], bbox[0], height - bbox[3]] + elif angle_deg == 180: + bbox = [x2, y2, x1, y1] + bbox = [width - bbox[0], height - bbox[1], width - bbox[2], height - bbox[3]] + elif angle_deg == 270: + bbox = rotate_pdfium_bbox(bbox, 90, width, height) + bbox = rotate_pdfium_bbox(bbox, 180, width, height) + + return bbox + + +def rotate_page_bbox(bbox, angle_deg, width, height): x1, y1, x2, y2 = bbox if angle_deg == 90: bbox = [y1, x1, y2, x2] @@ -123,7 +142,7 @@ def rotate_bbox(bbox, angle_deg, width, height): bbox = [x2, y2, x1, y1] bbox = [width - bbox[0], height - bbox[1], width - bbox[2], height - bbox[3]] elif angle_deg == 270: - bbox = rotate_bbox(bbox, 90, width, height) - bbox = rotate_bbox(bbox, 180, width, height) + bbox = rotate_page_bbox(bbox, 90, width, height) + bbox = rotate_page_bbox(bbox, 180, width, height) return bbox \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index dbd5d28..2654c0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.1.2" +version = "0.2.0" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"