From 1a56805a29516dc8fc82ddf6bba32f483568a129 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 30 Apr 2024 10:51:39 -0700
Subject: [PATCH] Improve block handling

---
 README.md                 |  2 +-
 benchmark.py              |  2 +-
 pdftext/inference.py      | 14 ++++++++------
 pdftext/postprocessing.py |  4 ++--
 pdftext/settings.py       |  7 +++++--
 pyproject.toml            |  2 +-
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index e26eb65..d5ce939 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ The output will be a json list, with each item in the list corresponding to a si
 
 - `bbox` - the page bbox, in `[x1, y1, x2, y2]` format
 - `rotation` - how much the page is rotated, in degrees (`0`, `90`, `180`, or `270`)
-- `page_idx` - the index of the page
+- `page` - the index of the page
 - `blocks` - the blocks that make up the text in the pdf.  Approximately equal to a paragraph.
   - `bbox` - the block bbox, in `[x1, y1, x2, y2]` format
   - `lines` - the lines inside the block
diff --git a/benchmark.py b/benchmark.py
index d641439..d56c458 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -32,7 +32,7 @@ def pymupdf_inference(pdf_path):
                 for span in line["spans"]:
                     text += span["text"]
                 text = text.rstrip() + "\n"
-            text = text.rstrip() + "\n"
+            text = text.rstrip() + "\n\n"
        pages.append(text)
    return pages
 
diff --git a/pdftext/inference.py b/pdftext/inference.py
index 446ce83..2541d8a 100644
--- a/pdftext/inference.py
+++ b/pdftext/inference.py
@@ -3,6 +3,7 @@
 import sklearn
 
 from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
+from pdftext.settings import settings
 
 
 def update_current(current, new_char):
@@ -98,7 +99,7 @@ def update_block(blocks, block):
     return block
 
 
-def infer_single_page(text_chars):
+def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
     prev_char = None
     prev_font_info = None
 
@@ -120,14 +121,15 @@
         sorted_keys = sorted(training_row.keys())
         training_row = [training_row[key] for key in sorted_keys]
 
-        prediction = yield training_row
-        if prediction == 0:
+        prediction_probs = yield training_row
+        # First item is probability of same line/block, second is probability of new line, third is probability of new block
+        if prediction_probs[0] >= .5:
             pass
-        elif prediction == 2 and prev_char["char"] in LINE_BREAKS:
+        elif prediction_probs[2] > block_threshold:
             span = update_span(line, span)
             line = update_line(block, line)
             block = update_block(blocks, block)
-        elif prev_char["char"] in LINE_BREAKS and prediction == 1: # Look for newline character as a forcing signal for a new line
+        elif prev_char["char"] in LINE_BREAKS and prediction_probs[1] >= .5: # Look for newline character as a forcing signal for a new line
             span = update_span(line, span)
             line = update_line(block, line)
         elif prev_font_info != font_info:
@@ -180,7 +182,7 @@
 
         # Disable nan, etc, validation for a small speedup
         with sklearn.config_context(assume_finite=True):
-            predictions = model.predict(training_rows)
+            predictions = model.predict_proba(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
     sorted_keys = sorted(page_blocks.keys())
diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py
index 1716a28..7c38076 100644
--- a/pdftext/postprocessing.py
+++ b/pdftext/postprocessing.py
@@ -36,7 +36,7 @@ def handle_hyphens(text: str, keep_hyphens=False) -> str:
     new_text = ""
     found_hyphen = False
     for i in range(len(text) - 1):
-        if text[i] == HYPHEN_CHAR and text[i+1] in LINE_BREAKS:
+        if text[i] == HYPHEN_CHAR:
             found_hyphen = True
         elif found_hyphen:
             if text[i] in LINE_BREAKS:
@@ -106,7 +106,7 @@ def merge_text(page: Dict, sort=False, hyphens=False) -> str:
             line_text = line_text.rstrip() + "\n"
             block_text += line_text
-        block_text = block_text.rstrip() + "\n"
+        block_text = block_text.rstrip() + "\n\n"
         text += block_text
 
     text = handle_hyphens(text, keep_hyphens=hyphens)
     return text
diff --git a/pdftext/settings.py b/pdftext/settings.py
index 615fdde..f945f0c 100644
--- a/pdftext/settings.py
+++ b/pdftext/settings.py
@@ -7,10 +7,13 @@ class Settings(BaseSettings):
     BASE_PATH: str = os.path.dirname(os.path.dirname(__file__))
     MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.joblib")
 
-    # How many characters to buffer when reading a font name
-    FONT_BUFFER_SIZE: int = 1024
+    # Fonts
+    FONT_BUFFER_SIZE: int = 1024  # How many characters to buffer when reading a font name
     FONTNAME_SAMPLE_FREQ: int = 10
 
+    # Inference
+    BLOCK_THRESHOLD: float = 0.8  # Confidence threshold for block detection
+
     # Benchmark
     RESULTS_FOLDER: str = "results"
     BENCH_DATASET_NAME: str = "vikp/pdf_bench"
diff --git a/pyproject.toml b/pyproject.toml
index ff0abd9..a71f9de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.3.1"
+version = "0.3.2"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri "]
 license = "Apache-2.0"
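
For reference, the behavioral core of this patch is in infer_single_page: the decision tree now yields per-class probabilities (model.predict_proba) instead of hard labels, block breaks are gated on the new settings.BLOCK_THRESHOLD (0.8 by default), and line breaks still require a preceding newline character plus a 0.5 probability. Below is a minimal standalone sketch of that decision rule; decide_break and the simplified LINE_BREAKS set are illustrative stand-ins, not pdftext's actual API.

from typing import List

BLOCK_THRESHOLD = 0.8           # mirrors settings.BLOCK_THRESHOLD from the patch
LINE_BREAKS = {"\n", "\r"}      # simplified stand-in for pdftext.pdf.utils.LINE_BREAKS


def decide_break(prediction_probs: List[float], prev_char: str) -> str:
    # prediction_probs = [P(same line/block), P(new line), P(new block)]
    if prediction_probs[0] >= 0.5:
        return "none"           # keep extending the current span/line/block
    if prediction_probs[2] > BLOCK_THRESHOLD:
        return "block"          # close the span, the line, and the block
    if prev_char in LINE_BREAKS and prediction_probs[1] >= 0.5:
        return "line"           # close the span and the line, keep the block
    return "none"               # the real code falls through to font-change handling


print(decide_break([0.05, 0.05, 0.90], "\n"))   # block
print(decide_break([0.10, 0.80, 0.10], "\n"))   # line
print(decide_break([0.90, 0.05, 0.05], "a"))    # none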