From 1a56805a29516dc8fc82ddf6bba32f483568a129 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 30 Apr 2024 10:51:39 -0700
Subject: [PATCH] Improve block handling

---
 README.md                 |  2 +-
 benchmark.py              |  2 +-
 pdftext/inference.py      | 14 ++++++++------
 pdftext/postprocessing.py |  4 ++--
 pdftext/settings.py       |  7 +++++--
 pyproject.toml            |  2 +-
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index e26eb65..d5ce939 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ The output will be a json list, with each item in the list corresponding to a si
 
 - `bbox` - the page bbox, in `[x1, y1, x2, y2]` format
 - `rotation` - how much the page is rotated, in degrees (`0`, `90`, `180`, or `270`)
-- `page_idx` - the index of the page
+- `page` - the index of the page
 - `blocks` - the blocks that make up the text in the pdf.  Approximately equal to a paragraph.
   - `bbox` - the block bbox, in `[x1, y1, x2, y2]` format
   - `lines` - the lines inside the block
diff --git a/benchmark.py b/benchmark.py
index d641439..d56c458 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -32,7 +32,7 @@ def pymupdf_inference(pdf_path):
                 for span in line["spans"]:
                     text += span["text"]
                 text = text.rstrip() + "\n"
-            text = text.rstrip() + "\n"
+            text = text.rstrip() + "\n\n"
        pages.append(text)
    return pages
 
diff --git a/pdftext/inference.py b/pdftext/inference.py
index 446ce83..2541d8a 100644
--- a/pdftext/inference.py
+++ b/pdftext/inference.py
@@ -3,6 +3,7 @@
 import sklearn
 
 from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
+from pdftext.settings import settings
 
 
 def update_current(current, new_char):
@@ -98,7 +99,7 @@ def update_block(blocks, block):
     return block
 
 
-def infer_single_page(text_chars):
+def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
     prev_char = None
     prev_font_info = None
 
@@ -120,14 +121,15 @@
         sorted_keys = sorted(training_row.keys())
         training_row = [training_row[key] for key in sorted_keys]
 
-        prediction = yield training_row
-        if prediction == 0:
+        prediction_probs = yield training_row
+        # First item is probability of same line/block, second is probability of new line, third is probability of new block
+        if prediction_probs[0] >= .5:
             pass
-        elif prediction == 2 and prev_char["char"] in LINE_BREAKS:
+        elif prediction_probs[2] > block_threshold:
             span = update_span(line, span)
             line = update_line(block, line)
             block = update_block(blocks, block)
-        elif prev_char["char"] in LINE_BREAKS and prediction == 1: # Look for newline character as a forcing signal for a new line
+        elif prev_char["char"] in LINE_BREAKS and prediction_probs[1] >= .5: # Look for newline character as a forcing signal for a new line
             span = update_span(line, span)
             line = update_line(block, line)
         elif prev_font_info != font_info:
@@ -180,7 +182,7 @@
 
         # Disable nan, etc, validation for a small speedup
         with sklearn.config_context(assume_finite=True):
-            predictions = model.predict(training_rows)
+            predictions = model.predict_proba(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
     sorted_keys = sorted(page_blocks.keys())
diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py
index 1716a28..7c38076 100644
--- a/pdftext/postprocessing.py
+++ b/pdftext/postprocessing.py
@@ -36,7 +36,7 @@ def handle_hyphens(text: str, keep_hyphens=False) -> str:
     new_text = ""
     found_hyphen = False
     for i in range(len(text) - 1):
-        if text[i] == HYPHEN_CHAR and text[i+1] in LINE_BREAKS:
+        if text[i] == HYPHEN_CHAR:
             found_hyphen = True
         elif found_hyphen:
             if text[i] in LINE_BREAKS:
@@ -106,7 +106,7 @@ def merge_text(page: Dict, sort=False, hyphens=False) -> str:
             line_text = line_text.rstrip() + "\n"
             block_text += line_text
-        block_text = block_text.rstrip() + "\n"
+        block_text = block_text.rstrip() + "\n\n"
         text += block_text
 
     text = handle_hyphens(text, keep_hyphens=hyphens)
     return text
diff --git a/pdftext/settings.py b/pdftext/settings.py
index 615fdde..f945f0c 100644
--- a/pdftext/settings.py
+++ b/pdftext/settings.py
@@ -7,10 +7,13 @@ class Settings(BaseSettings):
     BASE_PATH: str = os.path.dirname(os.path.dirname(__file__))
     MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.joblib")
 
-    # How many characters to buffer when reading a font name
-    FONT_BUFFER_SIZE: int = 1024
+    # Fonts
+    FONT_BUFFER_SIZE: int = 1024  # How many characters to buffer when reading a font name
     FONTNAME_SAMPLE_FREQ: int = 10
 
+    # Inference
+    BLOCK_THRESHOLD: float = 0.8  # Confidence threshold for block detection
+
     # Benchmark
     RESULTS_FOLDER: str = "results"
     BENCH_DATASET_NAME: str = "vikp/pdf_bench"
diff --git a/pyproject.toml b/pyproject.toml
index ff0abd9..a71f9de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.3.1"
+version = "0.3.2"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri "]
 license = "Apache-2.0"
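
For reference, the behavioral core of this patch is in infer_single_page: the decision tree now yields per-class probabilities (model.predict_proba) instead of hard labels, block breaks are gated on the new settings.BLOCK_THRESHOLD (0.8 by default), and line breaks still require a preceding newline character plus a 0.5 probability. Below is a minimal standalone sketch of that decision rule; decide_break and the simplified LINE_BREAKS set are illustrative stand-ins, not pdftext's actual API.

from typing import List

BLOCK_THRESHOLD = 0.8           # mirrors settings.BLOCK_THRESHOLD from the patch
LINE_BREAKS = {"\n", "\r"}      # simplified stand-in for pdftext.pdf.utils.LINE_BREAKS


def decide_break(prediction_probs: List[float], prev_char: str) -> str:
    # prediction_probs = [P(same line/block), P(new line), P(new block)]
    if prediction_probs[0] >= 0.5:
        return "none"           # keep extending the current span/line/block
    if prediction_probs[2] > BLOCK_THRESHOLD:
        return "block"          # close the span, the line, and the block
    if prev_char in LINE_BREAKS and prediction_probs[1] >= 0.5:
        return "line"           # close the span and the line, keep the block
    return "none"               # the real code falls through to font-change handling


print(decide_break([0.05, 0.05, 0.90], "\n"))   # block
print(decide_break([0.10, 0.80, 0.10], "\n"))   # line
print(decide_break([0.90, 0.05, 0.05], "a"))    # none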