Minor performance optimization

VikParuchuri · May 24, 2024 · 51266d8 · 51266d8
1 parent 37d1caf
commit 51266d8
Show file tree

Hide file tree

Showing 6 changed files with 90 additions and 99 deletions.
diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -61,30 +61,30 @@ def paginated_plain_text_output(pdf_path, sort=False, model=None, hyphens=False,
     return text
 
 
+def _process_span(span, page_width, page_height, keep_chars):
+    """Finalize a single span dict in place for dictionary output.
+
+    - ``span["bbox"]`` is replaced by ``unnormalize_bbox`` of itself for the
+      given page width/height.
+    - ``span["text"]`` is passed through ``postprocess_text`` and then
+      ``handle_hyphens(..., keep_hyphens=True)``.
+    - If ``keep_chars`` is falsy, ``span["chars"]`` is deleted; otherwise the
+      ``"bbox"`` of every entry in ``span["chars"]`` is unnormalized too.
+
+    The span is mutated; nothing is returned.
+    """
+    span["bbox"] = unnormalize_bbox(span["bbox"], page_width, page_height)
+    span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
+    if not keep_chars:
+        del span["chars"]
+    else:
+        for char in span["chars"]:
+            char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)
+
+
 def dictionary_output(pdf_path, sort=False, model=None, page_range=None, keep_chars=False, workers=None):
+    """Return the page dicts from ``_get_pages``, cleaned up in place.
+
+    For every page, each block is reduced to its ``"lines"`` and ``"bbox"``
+    keys and each line to its ``"bbox"`` and ``"spans"`` keys. Block, line and
+    span bounding boxes are converted with ``unnormalize_bbox`` using the
+    page's ``"width"`` and ``"height"``; spans are finalized by
+    ``_process_span``.
+
+    Args:
+        pdf_path, model, page_range, workers: forwarded to ``_get_pages``.
+        sort: when truthy, ``page["blocks"]`` is replaced by
+            ``sort_blocks(page["blocks"])``.
+        keep_chars: forwarded to ``_process_span``; when falsy the per-span
+            ``"chars"`` list is dropped.
+
+    Returns:
+        The (mutated) list of pages produced by ``_get_pages``.
+    """
     pages = _get_pages(pdf_path, model, page_range, workers=workers)
     for page in pages:
+        page_width, page_height = page["width"], page["height"]
         for block in page["blocks"]:
+            # Prune keys in place. Rebinding the loop variable to a filtered
+            # copy would leave the dict stored in page["blocks"] untouched, so
+            # the extra keys and the normalized bbox would leak into the output.
             bad_keys = [key for key in block.keys() if key not in ["lines", "bbox"]]
             for key in bad_keys:
                 del block[key]
+            block["bbox"] = unnormalize_bbox(block["bbox"], page_width, page_height)
             for line in block["lines"]:
+                # Same in-place pruning for lines (see the block-level note).
                 bad_keys = [key for key in line.keys() if key not in ["bbox", "spans"]]
                 for key in bad_keys:
                     del line[key]
+                line["bbox"] = unnormalize_bbox(line["bbox"], page_width, page_height)
                 for span in line["spans"]:
-                    span["bbox"] = unnormalize_bbox(span["bbox"], page["width"], page["height"])
-                    span["text"] = postprocess_text(span["text"])
-                    span["text"] = handle_hyphens(span["text"], keep_hyphens=True)
-
-                    if not keep_chars:
-                        del span["chars"]
-                    else:
-                        for char in span["chars"]:
-                            char["bbox"] = unnormalize_bbox(char["bbox"], page["width"], page["height"])
-
-                line["bbox"] = unnormalize_bbox(line["bbox"], page["width"], page["height"])
-            block["bbox"] = unnormalize_bbox(block["bbox"], page["width"], page["height"])
+                    _process_span(span, page_width, page_height, keep_chars)
+
         if sort:
             page["blocks"] = sort_blocks(page["blocks"])
+
     return pages
diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -9,14 +9,14 @@
 def update_current(current, new_char):
+    """Extend ``current``'s bounding box to cover ``new_char`` and refresh its center.
+
+    If ``current`` has no ``"bbox"`` yet, a copy of ``new_char["bbox"]`` is
+    stored (a copy, so later in-place widening does not touch the char's own
+    bbox). Otherwise the stored bbox is widened in place: indices 0 and 1
+    take the minimum, indices 2 and 3 the maximum. ``current["center_x"]`` and
+    ``current["center_y"]`` are then set to the midpoints of the resulting box.
+    """
     bbox = new_char["bbox"]
     if "bbox" not in current:
-        current_bbox = bbox.copy()
-        current["bbox"] = current_bbox
+        current["bbox"] = bbox.copy()
     else:
         current_bbox = current["bbox"]
         current_bbox[0] = min(bbox[0], current_bbox[0])
         current_bbox[1] = min(bbox[1], current_bbox[1])
         current_bbox[2] = max(bbox[2], current_bbox[2])
         current_bbox[3] = max(bbox[3], current_bbox[3])
+    # Re-read so the name is bound on both branches before computing centers.
+    current_bbox = current["bbox"]
     current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
     current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2
 
@@ -25,23 +25,27 @@ def create_training_row(char_info, prev_char, currblock, currline):
     char = char_info["char"]
 
     # Store variables used multiple times
-    char_x1, char_y1, char_x2, char_y2 = char_info["bbox"]
-    prev_x1, prev_y1, prev_x2, prev_y2 = prev_char["bbox"]
+    char_bbox = char_info["bbox"]
+    prev_bbox = prev_char["bbox"]
+    currblock_bbox = currblock["bbox"]
+    currline_bbox = currline["bbox"]
+
+    char_x1, char_y1, char_x2, char_y2 = char_bbox
+    prev_x1, prev_y1, prev_x2, prev_y2 = prev_bbox
     char_center_x = (char_x2 + char_x1) / 2
     char_center_y = (char_y2 + char_y1) / 2
     x_gap = char_x1 - prev_x2
     y_gap = char_y1 - prev_y2
 
     char_font = char_info["font"]
     prev_font = prev_char["font"]
-    font_match = all(
-        [char_font[key] == prev_font[key] for key in ["name", "size", "weight", "flags"]] +
-        [char_info["rotation"] == prev_char["rotation"]]
-    )
-    is_space = any([
-        char in SPACES,
-        char in TABS,
-    ])
+    font_match = (char_font["name"] == prev_font["name"] and
+                  char_font["size"] == prev_font["size"] and
+                  char_font["weight"] == prev_font["weight"] and
+                  char_font["flags"] == prev_font["flags"] and
+                  char_info["rotation"] == prev_char["rotation"])
+
+    is_space = char in SPACES or char in TABS
 
     training_row = {
         "is_newline": char in LINE_BREAKS,
@@ -53,42 +57,49 @@ def create_training_row(char_info, prev_char, currblock, currline):
         "y_outer_gap": char_y2 - prev_y1,
         "line_x_center_gap": char_center_x - currline["center_x"],
         "line_y_center_gap": char_center_y - currline["center_y"],
-        "line_x_gap": char_x1 - currline["bbox"][2],
-        "line_y_gap": char_y1 - currline["bbox"][3],
-        "line_x_start_gap": char_x1 - currline["bbox"][0],
-        "line_y_start_gap": char_y1 - currline["bbox"][1],
+        "line_x_gap": char_x1 - currline_bbox[2],
+        "line_y_gap": char_y1 - currline_bbox[3],
+        "line_x_start_gap": char_x1 - currline_bbox[0],
+        "line_y_start_gap": char_y1 - currline_bbox[1],
         "block_x_center_gap": char_center_x - currblock["center_x"],
         "block_y_center_gap": char_center_y - currblock["center_y"],
-        "block_x_gap": char_x1 - currblock["bbox"][2],
-        "block_y_gap": char_y1 - currblock["bbox"][3],
-        "block_x_start_gap": char_x1 - currblock["bbox"][0],
-        "block_y_start_gap": char_y1 - currblock["bbox"][1]
+        "block_x_gap": char_x1 - currblock_bbox[2],
+        "block_y_gap": char_y1 - currblock_bbox[3],
+        "block_x_start_gap": char_x1 - currblock_bbox[0],
+        "block_y_start_gap": char_y1 - currblock_bbox[1]
     }
 
     return training_row
 
 
 def update_span(line, span):
+    """Finalize ``span``, attach it to ``line`` and return a fresh empty span.
+
+    When ``span["chars"]`` is non-empty the span receives:
+      - ``"font"`` and ``"rotation"`` copied from its first char,
+      - ``"bbox"`` as [min x1, min y1, max x2, max y2] over all char bboxes,
+      - ``"text"`` as the concatenation of every char's ``"char"``,
+      - ``"char_start_idx"`` / ``"char_end_idx"`` from the first / last char's
+        ``"char_idx"``.
+    Every char dict is then reduced to its ``"char"`` and ``"bbox"`` keys and
+    the span is appended to ``line["spans"]``.
+
+    NOTE(review): with the added lines an empty span is no longer appended to
+    ``line["spans"]``; the removed lines appended unconditionally.
+
+    Returns:
+        A new ``{"chars": []}`` dict to accumulate the next span.
+    """
-    if len(span["chars"]) > 0:
-        span["font"] = span["chars"][0]["font"]
-        span["rotation"] = span["chars"][0]["rotation"]
+    if span["chars"]:
+        first_char = span["chars"][0]
+        span["font"] = first_char["font"]
+        span["rotation"] = first_char["rotation"]
+
         char_bboxes = [char["bbox"] for char in span["chars"]]
-        span["bbox"] = [min([bbox[0] for bbox in char_bboxes]),
-                        min([bbox[1] for bbox in char_bboxes]),
-                        max([bbox[2] for bbox in char_bboxes]),
-                        max([bbox[3] for bbox in char_bboxes])]
-        span["text"] = "".join([char["char"] for char in span["chars"]])
-        span["char_start_idx"] = span["chars"][0]["char_idx"]
+        # Seed the running extremes from the first bbox, then fold in the rest
+        # in a single pass.
+        min_x, min_y, max_x, max_y = char_bboxes[0]
+
+        for bbox in char_bboxes[1:]:
+            min_x = min(min_x, bbox[0])
+            min_y = min(min_y, bbox[1])
+            max_x = max(max_x, bbox[2])
+            max_y = max(max_y, bbox[3])
+
+        span["bbox"] = [min_x, min_y, max_x, max_y]
+        span["text"] = "".join(char["char"] for char in span["chars"])
+        span["char_start_idx"] = first_char["char_idx"]
         span["char_end_idx"] = span["chars"][-1]["char_idx"]
 
-    # Remove unneeded keys from the characters
-    for char in span["chars"]:
-        del_keys = [k for k in list(char.keys()) if k not in ["char", "bbox"]]
-        for key in del_keys:
-            del char[key]
-    line["spans"].append(span)
-    span = {"chars": []}
-    return span
+        # Remove unneeded keys from the characters
+        for char in span["chars"]:
+            # Iterate over a snapshot of the keys: the dict is mutated below.
+            for key in list(char.keys()):
+                if key not in ["char", "bbox"]:
+                    del char[key]
+
+        line["spans"].append(span)
+    return {"chars": []}
 
 
 def update_line(block, line):
@@ -118,8 +129,10 @@ def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
     block = {"lines": []}
     line = {"spans": []}
     span = {"chars": []}
-    for i, char_info in enumerate(text_chars["chars"]):
-        font_info = f"{char_info['font']['name']}_{char_info['font']['size']}_{char_info['font']['weight']}_{char_info['font']['flags']}_{char_info['rotation']}"
+
+    for char_info in text_chars["chars"]:
+        font = char_info['font']
+        font_info = f"{font['name']}_{font['size']}_{font['weight']}_{font['flags']}_{char_info['rotation']}"
         if prev_char:
             training_row = create_training_row(char_info, prev_char, block, line)
             sorted_keys = sorted(training_row.keys())
@@ -147,11 +160,12 @@ def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
 
         prev_char = char_info
         prev_font_info = font_info
-    if len(span["chars"]) > 0:
+
+    if span["chars"]:
         update_span(line, span)
-    if len(line["spans"]) > 0:
+    if line["spans"]:
         update_line(block, line)
-    if len(block["lines"]) > 0:
+    if block["lines"]:
         update_block(blocks, block)
 
     return blocks

diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -1,24 +1,22 @@
-import decimal
 import math
-from typing import Dict
+from typing import Dict, List
 
-import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 
 from pdftext.pdf.utils import get_fontname, pdfium_page_bbox_to_device_bbox, page_bbox_to_device_bbox
 from pdftext.settings import settings
 
 
-def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int):
+def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int):
+    """Back-fill font name/flags on the chars preceding index ``i``.
+
+    Walks ``j`` backwards from ``i - 1`` down to ``min_update + 1`` (the range
+    end ``min_update = max(0, i - fontname_sample_freq)`` is exclusive). For
+    each ``j`` the font is re-read with ``get_fontname(text_page, j)``; the
+    walk stops at the first char whose name and flags both equal
+    ``prev_fontname`` / ``prev_fontflags``, otherwise
+    ``char_infos[j]["font"]["name"]`` and ``["flags"]`` are overwritten.
+
+    ``char_infos`` is mutated in place; nothing is returned.
+    """
     min_update = max(0, i - fontname_sample_freq) # Minimum index to update
     for j in range(i-1, min_update, -1): # Goes from i to min_update
         fontname, fontflags = get_fontname(text_page, j)
 
         # If we hit the region with the previous fontname, we can bail out
         if fontname == prev_fontname and fontflags == prev_fontflags:
             break
-        text_chars["chars"][j]["font"]["name"] = fontname
-        text_chars["chars"][j]["font"]["flags"] = fontflags
+        char_infos[j]["font"]["name"] = fontname
+        char_infos[j]["font"]["flags"] = fontflags
 
 
 def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
@@ -42,13 +40,9 @@ def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAM
         if page_rotation == 90 or page_rotation == 270:
             page_width, page_height = page_height, page_width
 
-        bl_origin = all([
-            mediabox[0] == 0,
-            mediabox[1] == 0
-        ])
+        bl_origin = (mediabox[0] == 0 and mediabox[1] == 0)
 
         text_chars = {
-            "chars": [],
             "page": page_idx,
             "rotation": page_rotation,
             "bbox": bbox,
@@ -59,6 +53,8 @@ def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAM
         fontname = None
         fontflags = None
         total_chars = text_page.count_chars()
+        char_infos = []
+
         for i in range(total_chars):
             char = pdfium_c.FPDFText_GetUnicode(text_page, i)
             char = chr(char)
@@ -69,7 +65,7 @@ def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAM
                 prev_fontflags = fontflags
                 fontname, fontflags = get_fontname(text_page, i)
                 if (fontname != prev_fontname or fontflags != prev_fontflags) and i > 0:
-                    update_previous_fonts(text_chars, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)
+                    update_previous_fonts(char_infos, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)
 
             rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
             rotation = rotation * 180 / math.pi # convert from radians to degrees
@@ -88,8 +84,9 @@ def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAM
                 "bbox": device_coords,
                 "char_idx": i
             }
-            text_chars["chars"].append(char_info)
+            char_infos.append(char_info)
 
+        text_chars["chars"] = char_infos
         text_chars["total_chars"] = total_chars
         blocks.append(text_chars)
     return blocks
diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py
@@ -2,37 +2,17 @@
 import ctypes
 import math
 
-from pdftext.settings import settings
-
 LINE_BREAKS = ["\n", "\u000D", "\u000A"]
 TABS = ["\t", "\u0009", "\x09"]
 SPACES = [" ", "\ufffe", "\uFEFF", "\xa0"]
 WHITESPACE_CHARS = ["\n", "\r", "\f", "\t", " "]
 
 
-def char_count(textpage, *rect):
-    args = (textpage, *rect)
-    n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0)
-    if n_chars <= 0:
-        return 0
-    return n_chars
-
-
-def normalize_bbox(bbox, page_bound):
-    x1, y1, x2, y2 = bbox
-    x1 = x1 / page_bound[2]
-    y1 = y1 / page_bound[3]
-    x2 = x2 / page_bound[2]
-    y2 = y2 / page_bound[3]
-    return x1, y1, x2, y2
-
-
 def unnormalize_bbox(bbox, page_width, page_height):
+    """Scale a bbox by the page size and round each coordinate to 1 decimal.
+
+    ``bbox[0]`` and ``bbox[2]`` are multiplied by ``page_width``, ``bbox[1]``
+    and ``bbox[3]`` by ``page_height``. Presumably the input coordinates are
+    fractions of the page size -- confirm against the producer of the bbox.
+
+    Returns:
+        A 4-tuple ``(x1, y1, x2, y2)``.
+    """
-    x1, y1, x2, y2 = bbox
-    x1 = round(x1 * page_width, 1)
-    y1 = round(y1 * page_height, 1)
-    x2 = round(x2 * page_width, 1)
-    y2 = round(y2 * page_height, 1)
+    x1 = round(bbox[0] * page_width, 1)
+    y1 = round(bbox[1] * page_height, 1)
+    x2 = round(bbox[2] * page_width, 1)
+    y2 = round(bbox[3] * page_height, 1)
     return x1, y1, x2, y2
 
 

diff --git a/pdftext/settings.py b/pdftext/settings.py
@@ -12,7 +12,7 @@ class Settings(BaseSettings):
 
     # Inference
     BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection
-    WORKER_PAGE_THRESHOLD: int = 30 # Min number of pages per worker in parallel
+    WORKER_PAGE_THRESHOLD: int = 10 # Min number of pages per worker in parallel
 
     # Benchmark
     RESULTS_FOLDER: str = "results"

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.3.8"
+version = "0.3.9"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"