def _retain_keys(mapping, wanted):
    """Delete, in place, every key of ``mapping`` that is not listed in ``wanted``."""
    # Collect first: a dict must not change size while it is being iterated.
    for key in [k for k in mapping if k not in wanted]:
        del mapping[key]


def _process_span(span, page_width, page_height, keep_chars):
    """Finalize a single span dict in place for dictionary output.

    The span's ``bbox`` is rescaled with ``unnormalize_bbox``, and its ``text``
    is passed through ``postprocess_text`` and then
    ``handle_hyphens(..., keep_hyphens=True)``.

    When ``keep_chars`` is false the ``chars`` entry is removed from the span;
    otherwise every character's ``bbox`` is rescaled the same way as the span's.
    """
    span["bbox"] = unnormalize_bbox(span["bbox"], page_width, page_height)
    span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
    if keep_chars:
        for char in span["chars"]:
            char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)
    else:
        del span["chars"]


def dictionary_output(pdf_path, sort=False, model=None, page_range=None, keep_chars=False, workers=None):
    """Return the pages of a PDF as nested dicts (page -> blocks -> lines -> spans).

    Pages come from ``_get_pages``. Each block is reduced to its ``lines`` and
    ``bbox`` keys, each line to its ``bbox`` and ``spans`` keys, and every
    bbox (block, line, span and, if kept, character) is rescaled with
    ``unnormalize_bbox`` using the page's ``width`` and ``height``.

    Args:
        pdf_path: Forwarded to ``_get_pages``.
        sort: When true, each page's blocks are reordered with ``sort_blocks``.
        model: Forwarded to ``_get_pages``.
        page_range: Forwarded to ``_get_pages``.
        keep_chars: Keep the per-character list on every span (with rescaled
            bboxes) instead of deleting it.
        workers: Forwarded to ``_get_pages`` as a keyword argument.

    Returns:
        The page list produced by ``_get_pages``, modified in place.
    """
    pages = _get_pages(pdf_path, model, page_range, workers=workers)
    for page in pages:
        page_width, page_height = page["width"], page["height"]
        for block in page["blocks"]:
            # Prune and rescale the dict that page["blocks"] actually holds.
            # Rebinding ``block`` to a filtered copy would leave the stored
            # dict untouched (extra keys kept, bbox never rescaled).
            _retain_keys(block, ("lines", "bbox"))
            block["bbox"] = unnormalize_bbox(block["bbox"], page_width, page_height)
            for line in block["lines"]:
                # Same reasoning: mutate the dict stored in block["lines"].
                _retain_keys(line, ("bbox", "spans"))
                line["bbox"] = unnormalize_bbox(line["bbox"], page_width, page_height)
                for span in line["spans"]:
                    _process_span(span, page_width, page_height, keep_chars)

        if sort:
            page["blocks"] = sort_blocks(page["blocks"])

    return pages
def update_span(line, span):
    """Close out ``span``, attach it to ``line`` and return a fresh empty span.

    A span with no characters is not attached. Otherwise the span receives:
      * ``font`` / ``rotation`` taken from its first character,
      * ``bbox`` as the union of all character bboxes,
      * ``text`` as the concatenation of each character's ``char`` value,
      * ``char_start_idx`` / ``char_end_idx`` from the first and last character.
    Every character dict is then reduced to its ``char`` and ``bbox`` keys and
    the span is appended to ``line["spans"]``.

    Returns:
        A new ``{"chars": []}`` dict to accumulate the next span into.
    """
    members = span["chars"]
    if not members:
        return {"chars": []}

    head = members[0]
    span["font"] = head["font"]
    span["rotation"] = head["rotation"]

    boxes = [member["bbox"] for member in members]
    x_lo, y_lo, x_hi, y_hi = boxes[0]
    for box in boxes[1:]:
        x_lo = min(x_lo, box[0])
        y_lo = min(y_lo, box[1])
        x_hi = max(x_hi, box[2])
        y_hi = max(y_hi, box[3])
    span["bbox"] = [x_lo, y_lo, x_hi, y_hi]

    span["text"] = "".join(member["char"] for member in members)
    # The indices must be read before the per-character cleanup below,
    # which deletes "char_idx" from every character dict.
    span["char_start_idx"] = head["char_idx"]
    span["char_end_idx"] = members[-1]["char_idx"]

    # Remove unneeded keys from the characters
    for member in members:
        for stale in [name for name in member if name not in ["char", "bbox"]]:
            del member[stale]

    line["spans"].append(span)
    return {"chars": []}
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index 01c3ad7..5b39892 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -1,15 +1,13 @@ -import decimal import math -from typing import Dict +from typing import Dict, List -import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from pdftext.pdf.utils import get_fontname, pdfium_page_bbox_to_device_bbox, page_bbox_to_device_bbox from pdftext.settings import settings -def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int): +def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int): min_update = max(0, i - fontname_sample_freq) # Minimum index to update for j in range(i-1, min_update, -1): # Goes from i to min_update fontname, fontflags = get_fontname(text_page, j) @@ -17,8 +15,8 @@ def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fon # If we hit the region with the previous fontname, we can bail out if fontname == prev_fontname and fontflags == prev_fontflags: break - text_chars["chars"][j]["font"]["name"] = fontname - text_chars["chars"][j]["font"]["flags"] = fontflags + char_infos[j]["font"]["name"] = fontname + char_infos[j]["font"]["flags"] = fontflags def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ): @@ -42,13 +40,9 @@ def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAM if page_rotation == 90 or page_rotation == 270: page_width, page_height = page_height, page_width - bl_origin = all([ - mediabox[0] == 0, - mediabox[1] == 0 - ]) + bl_origin = (mediabox[0] == 0 and mediabox[1] == 0) text_chars = { - "chars": [], "page": page_idx, "rotation": page_rotation, "bbox": bbox, @@ -59,6 +53,8 @@ def get_pdfium_chars(pdf, page_range, fontname_sample_freq=settings.FONTNAME_SAM fontname = None fontflags = None total_chars = text_page.count_chars() + 
def unnormalize_bbox(bbox, page_width, page_height):
    """Scale a bbox by the page dimensions and round to one decimal place.

    Elements 0 and 2 of ``bbox`` are multiplied by ``page_width``, elements 1
    and 3 by ``page_height``; each product is rounded to one decimal.

    Returns:
        A 4-tuple ``(x1, y1, x2, y2)`` of the scaled, rounded values.
    """
    # Scale factors in bbox order: x, y, x, y.
    factors = (page_width, page_height, page_width, page_height)
    return tuple(round(bbox[idx] * factor, 1) for idx, factor in enumerate(factors))