Skip to content

Commit

Permalink
Improve inference speed
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 26, 2024
1 parent fae2334 commit 7952861
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 32 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Here are the scores, run on an M1 MacBook, without multiprocessing:
| Library | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
|------------|-------------------|-----------------------------------------|
| pymupdf | 0.32 | -- |
| pdftext | 1.57 | 97.66 |
| pdftext | 1.4 | 97.76 |
| pdfplumber | 3.0 | 90.3 |

pdftext is approximately 2x slower than using pypdfium2 alone (assuming you extract all of the same character information with it).
Expand Down
Binary file modified models/dt.joblib
Binary file not shown.
56 changes: 31 additions & 25 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
def update_current(current, new_char):
bbox = new_char["bbox"]
if "bbox" not in current:
current_bbox = bbox
current_bbox = bbox.copy()
current["bbox"] = current_bbox
else:
current_bbox = current["bbox"]
Expand All @@ -18,17 +18,23 @@ def update_current(current, new_char):
current_bbox[3] = max(bbox[3], current_bbox[3])
current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2
return current


def create_training_row(char_info, prev_char, currblock, currline):
char = char_info["char"]
char_center_x = (char_info["bbox"][2] + char_info["bbox"][0]) / 2
char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2
x_gap = char_info["bbox"][0] - prev_char["bbox"][2]
y_gap = char_info["bbox"][1] - prev_char["bbox"][3]

# Store variables used multiple times
char_x1, char_y1, char_x2, char_y2 = char_info["bbox"]
prev_x1, prev_y1, prev_x2, prev_y2 = prev_char["bbox"]
char_center_x = (char_x2 + char_x1) / 2
char_center_y = (char_y2 + char_y1) / 2
x_gap = char_x1 - prev_x2
y_gap = char_y1 - prev_y2

char_font = char_info["font"]
prev_font = prev_char["font"]
font_match = all(
[char_info["font"][key] == prev_char["font"][key] for key in ["name", "size", "weight", "flags"]] +
[char_font[key] == prev_font[key] for key in ["name", "size", "weight", "flags"]] +
[char_info["rotation"] == prev_char["rotation"]]
)
is_space = any([
Expand All @@ -42,20 +48,20 @@ def create_training_row(char_info, prev_char, currblock, currline):
"x_gap": x_gap,
"y_gap": y_gap,
"font_match": font_match,
"x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0],
"y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1],
"x_outer_gap": char_x2 - prev_x1,
"y_outer_gap": char_y2 - prev_y1,
"line_x_center_gap": char_center_x - currline["center_x"],
"line_y_center_gap": char_center_y - currline["center_y"],
"line_x_gap": char_info["bbox"][0] - currline["bbox"][2],
"line_y_gap": char_info["bbox"][1] - currline["bbox"][3],
"line_x_start_gap": char_info["bbox"][0] - currline["bbox"][0],
"line_y_start_gap": char_info["bbox"][1] - currline["bbox"][1],
"line_x_gap": char_x1 - currline["bbox"][2],
"line_y_gap": char_y1 - currline["bbox"][3],
"line_x_start_gap": char_x1 - currline["bbox"][0],
"line_y_start_gap": char_y1 - currline["bbox"][1],
"block_x_center_gap": char_center_x - currblock["center_x"],
"block_y_center_gap": char_center_y - currblock["center_y"],
"block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
"block_y_gap": char_info["bbox"][1] - currblock["bbox"][3],
"block_x_start_gap": char_info["bbox"][0] - currblock["bbox"][0],
"block_y_start_gap": char_info["bbox"][1] - currblock["bbox"][1]
"block_x_gap": char_x1 - currblock["bbox"][2],
"block_y_gap": char_y1 - currblock["bbox"][3],
"block_x_start_gap": char_x1 - currblock["bbox"][0],
"block_y_start_gap": char_y1 - currblock["bbox"][1]
}

return training_row
Expand Down Expand Up @@ -91,7 +97,8 @@ def infer_single_page(text_chars):
for i, char_info in enumerate(text_chars["chars"]):
if prev_char:
training_row = create_training_row(char_info, prev_char, block, line)
training_row = [v for _, v in sorted(training_row.items())]
sorted_keys = sorted(training_row.keys())
training_row = [training_row[key] for key in sorted_keys]

prediction = yield training_row
if prediction == 0:
Expand All @@ -107,8 +114,8 @@ def infer_single_page(text_chars):
block = update_block(blocks, block)

span["chars"].append(char_info)
line = update_current(line, char_info)
block = update_current(block, char_info)
update_current(line, char_info)
update_current(block, char_info)

prev_char = char_info
if len(span["chars"]) > 0:
Expand Down Expand Up @@ -150,16 +157,15 @@ def inference(text_chars, model):
if len(page_blocks) == len(generators):
break

training_list = sorted(training_data.items())
training_rows = [tl[1] for tl in training_list]
training_idxs = [tl[0] for tl in training_list]
training_idxs = sorted(training_data.keys())
training_rows = [training_data[idx] for idx in training_idxs]

# Skip sklearn's NaN/infinity input validation for a small speedup
with sklearn.config_context(assume_finite=True):
predictions = model.predict(training_rows)
for pred, page_idx in zip(predictions, training_idxs):
next_prediction[page_idx] = pred
page_blocks = sorted(page_blocks.items())
page_blocks = [p[1] for p in page_blocks]
sorted_keys = sorted(page_blocks.keys())
page_blocks = [page_blocks[key] for key in sorted_keys]
assert len(page_blocks) == len(text_chars)
return page_blocks
4 changes: 3 additions & 1 deletion pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE
for page_idx in range(len(pdf)):
page = pdf.get_page(page_idx)
text_page = page.get_textpage()
mediabox = page.get_mediabox()
bl_origin = mediabox[0] == 0 and mediabox[1] == 0

bbox = page.get_bbox()
page_width = math.ceil(bbox[2] - bbox[0])
Expand Down Expand Up @@ -58,7 +60,7 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE
rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
coords = text_page.get_charbox(i, loose=True)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, normalize=True)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, normalize=True)

char_info = {
"font": {
Expand Down
15 changes: 10 additions & 5 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,25 @@ def page_to_device(page, x, y, page_width, page_height):


def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
bbox_width = bbox[2] - bbox[0]
bbox_height = bbox[3] - bbox[1]
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height)
top_right = page_to_device(page, *bbox[2:], page_width, page_height)

dev_bbox = [left_bottom[0], left_bottom[1] - bbox_height, left_bottom[0] + bbox_width, left_bottom[1]] # Convert to ltrb
dev_bbox = [left_bottom[0], top_right[1], top_right[0], left_bottom[1]]
if normalize:
dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
return dev_bbox


def page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
left, bottom, right, top = bbox

dev_bbox = [left, page_height-top, right, page_height-bottom]
if normalize:
dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
return dev_bbox
return dev_bbox


def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, normalize=False):
if bl_origin:
return fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize)
return pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize)

0 comments on commit 7952861

Please sign in to comment.