diff --git a/models/dt.joblib b/models/dt.joblib index 04cc2a9..fd5969f 100644 Binary files a/models/dt.joblib and b/models/dt.joblib differ diff --git a/pdftext/inference.py b/pdftext/inference.py index a2575bd..446ce83 100644 --- a/pdftext/inference.py +++ b/pdftext/inference.py @@ -123,13 +123,13 @@ def infer_single_page(text_chars): prediction = yield training_row if prediction == 0: pass - elif prev_char["char"] == "\n" or prediction == 2: # Look for newline character as a forcing signal for a new line + elif prediction == 2 and prev_char["char"] in LINE_BREAKS: span = update_span(line, span) line = update_line(block, line) - elif prediction == 3: + block = update_block(blocks, block) + elif prev_char["char"] in LINE_BREAKS and prediction == 1: # Look for newline character as a forcing signal for a new line span = update_span(line, span) line = update_line(block, line) - block = update_block(blocks, block) elif prev_font_info != font_info: span = update_span(line, span) diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py index acf8aee..cf77483 100644 --- a/pdftext/pdf/utils.py +++ b/pdftext/pdf/utils.py @@ -4,7 +4,7 @@ from pdftext.settings import settings -LINE_BREAKS = ["\n", "\u000D", "\u000A", "\u000C"] +LINE_BREAKS = ["\n", "\u000D", "\u000A"] TABS = ["\t", "\u0009", "\x09"] SPACES = [" ", "\ufffe", "\uFEFF", "\xa0"] WHITESPACE_CHARS = ["\n", "\r", "\f", "\t", " "] diff --git a/pdftext/postprocessing.py b/pdftext/postprocessing.py index 8854e33..b7505b6 100644 --- a/pdftext/postprocessing.py +++ b/pdftext/postprocessing.py @@ -19,11 +19,11 @@ def postprocess_text(text: str) -> str: + for old, new in REPLACEMENTS.items(): + text = text.replace(old, new) text = replace_special_chars(text) text = replace_control_chars(text) text = replace_ligatures(text) - for old, new in REPLACEMENTS.items(): - text = text.replace(old, new) return text