Skip to content

Commit

Permalink
break on newlines instead of crlf
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 10, 2024
1 parent d053ffa commit be036b2
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,16 @@ def line_break():
line_break()
continue

# we break if the previous span ends with a CRLF or hyphenation
if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\r\n", "\x02"]):
# we break if the previous span ends with a linebreak or hyphenation
if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\n", "\x02"]):
line_break()
continue

if span["rotation"] != line["rotation"]:
line_break()
continue

# sometimes pdfium doesn't inject a CRLF at the end of a line, so we check the span positions
# sometimes pdfium doesn't inject a linebreak, so we check the span positions
if span["bbox"].y_start > line["bbox"].y_end:
line_break()
continue
Expand Down

0 comments on commit be036b2

Please sign in to comment.