Improve inference speed

VikParuchuri · Apr 26, 2024 · 7952861
1 parent fae2334
commit 7952861
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -88,7 +88,7 @@ Here are the scores, run on an M1 Macbook, without multiprocessing:
 | Library    | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
 |------------|-------------------|-----------------------------------------|
 | pymupdf    | 0.32              | --                                      |
-| pdftext    | 1.57              | 97.66                                   |
+| pdftext    | 1.4               | 97.76                                   |
 | pdfplumber | 3.0               | 90.3                                    |
 
 pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same character information).

diff --git a/models/dt.joblib b/models/dt.joblib
diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -8,7 +8,7 @@
 def update_current(current, new_char):
     bbox = new_char["bbox"]
     if "bbox" not in current:
-        current_bbox = bbox
+        current_bbox = bbox.copy()
         current["bbox"] = current_bbox
     else:
         current_bbox = current["bbox"]
@@ -18,17 +18,23 @@ def update_current(current, new_char):
         current_bbox[3] = max(bbox[3], current_bbox[3])
     current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
     current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2
-    return current
 
 
 def create_training_row(char_info, prev_char, currblock, currline):
     char = char_info["char"]
-    char_center_x = (char_info["bbox"][2] + char_info["bbox"][0]) / 2
-    char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2
-    x_gap = char_info["bbox"][0] - prev_char["bbox"][2]
-    y_gap = char_info["bbox"][1] - prev_char["bbox"][3]
+
+    # Store variables used multiple times
+    char_x1, char_y1, char_x2, char_y2 = char_info["bbox"]
+    prev_x1, prev_y1, prev_x2, prev_y2 = prev_char["bbox"]
+    char_center_x = (char_x2 + char_x1) / 2
+    char_center_y = (char_y2 + char_y1) / 2
+    x_gap = char_x1 - prev_x2
+    y_gap = char_y1 - prev_y2
+
+    char_font = char_info["font"]
+    prev_font = prev_char["font"]
     font_match = all(
-        [char_info["font"][key] == prev_char["font"][key] for key in ["name", "size", "weight", "flags"]] +
+        [char_font[key] == prev_font[key] for key in ["name", "size", "weight", "flags"]] +
         [char_info["rotation"] == prev_char["rotation"]]
     )
     is_space = any([
@@ -42,20 +48,20 @@ def create_training_row(char_info, prev_char, currblock, currline):
         "x_gap": x_gap,
         "y_gap": y_gap,
         "font_match": font_match,
-        "x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0],
-        "y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1],
+        "x_outer_gap": char_x2 - prev_x1,
+        "y_outer_gap": char_y2 - prev_y1,
         "line_x_center_gap": char_center_x - currline["center_x"],
         "line_y_center_gap": char_center_y - currline["center_y"],
-        "line_x_gap": char_info["bbox"][0] - currline["bbox"][2],
-        "line_y_gap": char_info["bbox"][1] - currline["bbox"][3],
-        "line_x_start_gap": char_info["bbox"][0] - currline["bbox"][0],
-        "line_y_start_gap": char_info["bbox"][1] - currline["bbox"][1],
+        "line_x_gap": char_x1 - currline["bbox"][2],
+        "line_y_gap": char_y1 - currline["bbox"][3],
+        "line_x_start_gap": char_x1 - currline["bbox"][0],
+        "line_y_start_gap": char_y1 - currline["bbox"][1],
         "block_x_center_gap": char_center_x - currblock["center_x"],
         "block_y_center_gap": char_center_y - currblock["center_y"],
-        "block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
-        "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3],
-        "block_x_start_gap": char_info["bbox"][0] - currblock["bbox"][0],
-        "block_y_start_gap": char_info["bbox"][1] - currblock["bbox"][1]
+        "block_x_gap": char_x1 - currblock["bbox"][2],
+        "block_y_gap": char_y1 - currblock["bbox"][3],
+        "block_x_start_gap": char_x1 - currblock["bbox"][0],
+        "block_y_start_gap": char_y1 - currblock["bbox"][1]
     }
 
     return training_row
@@ -91,7 +97,8 @@ def infer_single_page(text_chars):
     for i, char_info in enumerate(text_chars["chars"]):
         if prev_char:
             training_row = create_training_row(char_info, prev_char, block, line)
-            training_row = [v for _, v in sorted(training_row.items())]
+            sorted_keys = sorted(training_row.keys())
+            training_row = [training_row[key] for key in sorted_keys]
 
             prediction = yield training_row
             if prediction == 0:
@@ -107,8 +114,8 @@ def infer_single_page(text_chars):
                 block = update_block(blocks, block)
 
         span["chars"].append(char_info)
-        line = update_current(line, char_info)
-        block = update_current(block, char_info)
+        update_current(line, char_info)
+        update_current(block, char_info)
 
         prev_char = char_info
     if len(span["chars"]) > 0:
@@ -150,16 +157,15 @@ def inference(text_chars, model):
         if len(page_blocks) == len(generators):
             break
 
-        training_list = sorted(training_data.items())
-        training_rows = [tl[1] for tl in training_list]
-        training_idxs = [tl[0] for tl in training_list]
+        training_idxs = sorted(training_data.keys())
+        training_rows = [training_data[idx] for idx in training_idxs]
 
         # Disable nan, etc, validation for a small speedup
         with sklearn.config_context(assume_finite=True):
             predictions = model.predict(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
-    page_blocks = sorted(page_blocks.items())
-    page_blocks = [p[1] for p in page_blocks]
+    sorted_keys = sorted(page_blocks.keys())
+    page_blocks = [page_blocks[key] for key in sorted_keys]
     assert len(page_blocks) == len(text_chars)
     return page_blocks
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -30,6 +30,8 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE
     for page_idx in range(len(pdf)):
         page = pdf.get_page(page_idx)
         text_page = page.get_textpage()
+        mediabox = page.get_mediabox()
+        bl_origin = mediabox[0] == 0 and mediabox[1] == 0
 
         bbox = page.get_bbox()
         page_width = math.ceil(bbox[2] - bbox[0])
@@ -58,7 +60,7 @@ def get_pdfium_chars(pdf_path, fontname_sample_freq=settings.FONTNAME_SAMPLE_FRE
             rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
             rotation = rotation * 180 / math.pi # convert from radians to degrees
             coords = text_page.get_charbox(i, loose=True)
-            device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, normalize=True)
+            device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, normalize=True)
 
             char_info = {
                 "font": {

diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py
@@ -78,20 +78,25 @@ def page_to_device(page, x, y, page_width, page_height):
 
 
 def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
-    bbox_width = bbox[2] - bbox[0]
-    bbox_height = bbox[3] - bbox[1]
     left_bottom = page_to_device(page, *bbox[:2], page_width, page_height)
+    top_right = page_to_device(page, *bbox[2:], page_width, page_height)
 
-    dev_bbox = [left_bottom[0], left_bottom[1] - bbox_height, left_bottom[0] + bbox_width, left_bottom[1]]   # Convert to ltrb
+    dev_bbox = [left_bottom[0], top_right[1], top_right[0], left_bottom[1]]
     if normalize:
         dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
     return dev_bbox
 
 
-def page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
+def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
     left, bottom, right, top = bbox
 
     dev_bbox = [left, page_height-top, right, page_height-bottom]
     if normalize:
         dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
-    return dev_bbox
+    return dev_bbox
+
+
+def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, normalize=False):
+    if bl_origin:
+        return fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize)
+    return pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize)