Skip to content

Commit

Permalink
Improve model
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed Apr 29, 2024
1 parent 936399f commit b636434
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 6 deletions.
Binary file modified models/dt.joblib
Binary file not shown.
6 changes: 3 additions & 3 deletions pdftext/inference.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -123,13 +123,13 @@ def infer_single_page(text_chars):
prediction = yield training_row
if prediction == 0:
pass
elif prev_char["char"] == "\n" or prediction == 2: # Look for newline character as a forcing signal for a new line
elif prediction == 2 and prev_char["char"] in LINE_BREAKS:
span = update_span(line, span)
line = update_line(block, line)
elif prediction == 3:
block = update_block(blocks, block)
elif prev_char["char"] in LINE_BREAKS and prediction == 1: # Look for newline character as a forcing signal for a new line
span = update_span(line, span)
line = update_line(block, line)
block = update_block(blocks, block)
elif prev_font_info != font_info:
span = update_span(line, span)

Expand Down
2 changes: 1 addition & 1 deletion pdftext/pdf/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,7 @@

from pdftext.settings import settings

LINE_BREAKS = ["\n", "\u000D", "\u000A", "\u000C"]
LINE_BREAKS = ["\n", "\u000D", "\u000A"]
TABS = ["\t", "\u0009", "\x09"]
SPACES = [" ", "\ufffe", "\uFEFF", "\xa0"]
WHITESPACE_CHARS = ["\n", "\r", "\f", "\t", " "]
Expand Down
4 changes: 2 additions & 2 deletions pdftext/postprocessing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,11 +19,11 @@


def postprocess_text(text: str) -> str:
for old, new in REPLACEMENTS.items():
text = text.replace(old, new)
text = replace_special_chars(text)
text = replace_control_chars(text)
text = replace_ligatures(text)
for old, new in REPLACEMENTS.items():
text = text.replace(old, new)
return text


Expand Down

0 comments on commit b636434

Please sign in to comment.