Skip to content

Commit

Permalink
Improve block handling
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 30, 2024
1 parent dc483f9 commit 1a56805
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ The output will be a json list, with each item in the list corresponding to a si

- `bbox` - the page bbox, in `[x1, y1, x2, y2]` format
- `rotation` - how much the page is rotated, in degrees (`0`, `90`, `180`, or `270`)
- `page_idx` - the index of the page
- `page` - the index of the page
- `blocks` - the blocks that make up the text in the PDF. Each block is approximately equal to a paragraph.
- `bbox` - the block bbox, in `[x1, y1, x2, y2]` format
- `lines` - the lines inside the block
Expand Down
2 changes: 1 addition & 1 deletion benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def pymupdf_inference(pdf_path):
for span in line["spans"]:
text += span["text"]
text = text.rstrip() + "\n"
text = text.rstrip() + "\n"
text = text.rstrip() + "\n\n"
pages.append(text)
return pages

Expand Down
14 changes: 8 additions & 6 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sklearn

from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
from pdftext.settings import settings


def update_current(current, new_char):
Expand Down Expand Up @@ -98,7 +99,7 @@ def update_block(blocks, block):
return block


def infer_single_page(text_chars):
def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
prev_char = None
prev_font_info = None

Expand All @@ -120,14 +121,15 @@ def infer_single_page(text_chars):
sorted_keys = sorted(training_row.keys())
training_row = [training_row[key] for key in sorted_keys]

prediction = yield training_row
if prediction == 0:
prediction_probs = yield training_row
# First item is probability of same line/block, second is probability of new line, third is probability of new block
if prediction_probs[0] >= .5:
pass
elif prediction == 2 and prev_char["char"] in LINE_BREAKS:
elif prediction_probs[2] > block_threshold:
span = update_span(line, span)
line = update_line(block, line)
block = update_block(blocks, block)
elif prev_char["char"] in LINE_BREAKS and prediction == 1: # Look for newline character as a forcing signal for a new line
elif prev_char["char"] in LINE_BREAKS and prediction_probs[1] >= .5: # Look for newline character as a forcing signal for a new line
span = update_span(line, span)
line = update_line(block, line)
elif prev_font_info != font_info:
Expand Down Expand Up @@ -180,7 +182,7 @@ def inference(text_chars, model):

# Disable nan, etc, validation for a small speedup
with sklearn.config_context(assume_finite=True):
predictions = model.predict(training_rows)
predictions = model.predict_proba(training_rows)
for pred, page_idx in zip(predictions, training_idxs):
next_prediction[page_idx] = pred
sorted_keys = sorted(page_blocks.keys())
Expand Down
4 changes: 2 additions & 2 deletions pdftext/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def handle_hyphens(text: str, keep_hyphens=False) -> str:
new_text = ""
found_hyphen = False
for i in range(len(text) - 1):
if text[i] == HYPHEN_CHAR and text[i+1] in LINE_BREAKS:
if text[i] == HYPHEN_CHAR:
found_hyphen = True
elif found_hyphen:
if text[i] in LINE_BREAKS:
Expand Down Expand Up @@ -106,7 +106,7 @@ def merge_text(page: Dict, sort=False, hyphens=False) -> str:
line_text = line_text.rstrip() + "\n"

block_text += line_text
block_text = block_text.rstrip() + "\n"
block_text = block_text.rstrip() + "\n\n"
text += block_text
text = handle_hyphens(text, keep_hyphens=hyphens)
return text
7 changes: 5 additions & 2 deletions pdftext/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@ class Settings(BaseSettings):
BASE_PATH: str = os.path.dirname(os.path.dirname(__file__))
MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.joblib")

# How many characters to buffer when reading a font name
FONT_BUFFER_SIZE: int = 1024
# Fonts
FONT_BUFFER_SIZE: int = 1024 # How many characters to buffer when reading a font name
FONTNAME_SAMPLE_FREQ: int = 10

# Inference
BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection

# Benchmark
RESULTS_FOLDER: str = "results"
BENCH_DATASET_NAME: str = "vikp/pdf_bench"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.1"
version = "0.3.2"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 1a56805

Please sign in to comment.