Skip to content

Commit

Permalink
Handle rotation properly
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 26, 2024
1 parent 7952861 commit 648f0af
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 43 deletions.
16 changes: 3 additions & 13 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,14 @@ def dictionary_output(pdf_path, sort=False, model=None):
for key in bad_keys:
del block[key]
for line in block["lines"]:
line_box = None
bad_keys = [key for key in line.keys() if key not in ["chars", "bbox"]]
for key in bad_keys:
del line[key]
for char in line["chars"]:
char["bbox"] = unnormalize_bbox(char["bbox"], page["bbox"])
char["bbox"] = unnormalize_bbox(char["bbox"], page["width"], page["height"])
char["char"] = postprocess_text(char["char"])
if line_box is None:
line_box = char["bbox"]
else:
line_box = [
min(line_box[0], char["bbox"][0]),
min(line_box[1], char["bbox"][1]),
max(line_box[2], char["bbox"][2]),
max(line_box[3], char["bbox"][3]),
]
line["bbox"] = line_box
block["bbox"] = unnormalize_bbox(block["bbox"], page["bbox"])
line["bbox"] = unnormalize_bbox(line["bbox"], page["width"], page["height"])
block["bbox"] = unnormalize_bbox(block["bbox"], page["width"], page["height"])
if sort:
page["blocks"] = sort_blocks(page["blocks"])
return pages
12 changes: 8 additions & 4 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,14 @@ def update_block(blocks, block):
def infer_single_page(text_chars):
prev_char = None

blocks = {"blocks": []}
blocks = {
"blocks": [],
"page": text_chars["page"],
"rotation": text_chars["rotation"],
"bbox": text_chars["bbox"],
"width": text_chars["width"],
"height": text_chars["height"],
}
block = {"lines": []}
line = {"spans": []}
span = {"chars": []}
Expand Down Expand Up @@ -125,9 +132,6 @@ def infer_single_page(text_chars):
if len(block["lines"]) > 0:
update_block(blocks, block)

blocks["page"] = text_chars["page"]
blocks["rotation"] = text_chars["rotation"]
blocks["bbox"] = text_chars["bbox"]
return blocks


Expand Down
29 changes: 22 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,32 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE
page = pdf.get_page(page_idx)
text_page = page.get_textpage()
mediabox = page.get_mediabox()
bl_origin = mediabox[0] == 0 and mediabox[1] == 0

page_rotation = page.get_rotation()
bbox = page.get_bbox()
page_width = math.ceil(bbox[2] - bbox[0])
page_width = math.ceil(abs(bbox[2] - bbox[0]))
page_height = math.ceil(abs(bbox[1] - bbox[3]))
bbox = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, page_rotation)

# Recalculate page width and height with new bboxes
page_width = math.ceil(abs(bbox[2] - bbox[0]))
page_height = math.ceil(abs(bbox[1] - bbox[3]))

# Flip width and height if rotated
if page_rotation == 90 or page_rotation == 270:
page_width, page_height = page_height, page_width

bl_origin = all([
mediabox[0] == 0,
mediabox[1] == 0
])

text_chars = {
"chars": [],
"page": page_idx,
"rotation": page.get_rotation(),
"bbox": pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
"rotation": page_rotation,
"bbox": bbox,
"width": page_width,
"height": page_height,
}

fontname = None
Expand All @@ -59,8 +74,8 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
coords = text_page.get_charbox(i, loose=True)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, normalize=True)
coords = text_page.get_charbox(i, loose=False)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True)

char_info = {
"font": {
Expand Down
65 changes: 46 additions & 19 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ def normalize_bbox(bbox, page_bound):
return x1, y1, x2, y2


def unnormalize_bbox(bbox, page_bound):
def unnormalize_bbox(bbox, page_width, page_height):
x1, y1, x2, y2 = bbox
x1 = round(x1 * page_bound[2], 1)
y1 = round(y1 * page_bound[3], 1)
x2 = round(x2 * page_bound[2], 1)
y2 = round(y2 * page_bound[3], 1)
x1 = round(x1 * page_width, 1)
y1 = round(y1 * page_height, 1)
x2 = round(x2 * page_width, 1)
y2 = round(y2 * page_height, 1)
return x1, y1, x2, y2


Expand All @@ -63,40 +63,67 @@ def get_fontname(textpage, char_index):
return decoded, flag_buffer.value


def page_to_device(page, x, y, page_width, page_height):
def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
if page_rotation == 90:
page_rotation = 1
elif page_rotation == 180:
page_rotation = 2
elif page_rotation == 270:
page_rotation = 3
else:
page_rotation = 0
device_x = ctypes.c_int()
device_y = ctypes.c_int()
device_x_ptr = ctypes.pointer(device_x)
device_y_ptr = ctypes.pointer(device_y)
rotation = pdfium_c.FPDFPage_GetRotation(page)
width = math.ceil(page_width)
height = math.ceil(page_height)
pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, rotation, x, y, device_x_ptr, device_y_ptr)
pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, page_rotation, x, y, device_x_ptr, device_y_ptr)
x = device_x.value
y = device_y.value
return x, y


def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height)
top_right = page_to_device(page, *bbox[2:], page_width, page_height)
def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, page_rotation):
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation)
top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation)

dev_bbox = [left_bottom[0], top_right[1], top_right[0], left_bottom[1]]
if normalize:
dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
return dev_bbox


def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height):
left, bottom, right, top = bbox

dev_bbox = [left, page_height-top, right, page_height-bottom]
if normalize:
dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
return dev_bbox


def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, normalize=False):
def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, page_rotation: int, normalize=False):
if bl_origin:
return fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize)
return pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize)
bbox = fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
else:
# Do not rotate here, since we'll do it manually later
bbox = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, 0)

if page_rotation > 0:
bbox = rotate_bbox(bbox, page_rotation, page_width, page_height)

if normalize:
bbox = [bbox[0] / page_width, bbox[1] / page_height, bbox[2] / page_width, bbox[3] / page_height]
return bbox


def rotate_bbox(bbox, angle_deg, width, height):
    """Rotate an axis-aligned bbox by a multiple of 90 degrees.

    ``width`` and ``height`` are the extents of the coordinate space the
    *input* bbox lives in (x in [0, width], y in [0, height]). A quarter
    turn swaps those extents, so a box rotated by 90 or 270 lives in a
    ``height`` x ``width`` space afterwards.

    Args:
        bbox: ``(x1, y1, x2, y2)`` with ``x1 <= x2`` and ``y1 <= y2``.
        angle_deg: Rotation in degrees. 90, 180 and 270 are handled; any
            other value leaves the bbox untouched.
        width: Extent of the input space along x.
        height: Extent of the input space along y.

    Returns:
        A new ``[x1, y1, x2, y2]`` list with min/max ordering preserved
        for 90, 180 and 270; the original ``bbox`` object otherwise.
    """
    # Unpack first so a malformed bbox fails even when no rotation applies.
    x1, y1, x2, y2 = bbox

    if angle_deg == 90:
        # (x, y) -> (height - y, x)
        return [height - y2, x1, height - y1, x2]

    if angle_deg == 180:
        # (x, y) -> (width - x, height - y)
        return [width - x2, height - y2, width - x1, height - y1]

    if angle_deg == 270:
        # (x, y) -> (y, width - x): the inverse of the 90 degree mapping.
        # Chaining the 90 and 180 cases with the same width/height is not
        # equivalent: after the quarter turn the extents are swapped, so the
        # 180 step would shift the box off the page unless width == height.
        return [y1, width - x2, y2, width - x1]

    # Unsupported angles (including 0) pass through unchanged.
    return bbox

0 comments on commit 648f0af

Please sign in to comment.