Skip to content

Commit

Permalink
Change how lines are broken
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 2, 2024
1 parent 65cc3c1 commit 34d3748
Show file tree
Hide file tree
Showing 4 changed files with 3 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
span = update_span(line, span)
line = update_line(block, line)
block = update_block(blocks, block)
- elif prev_char["char"] in LINE_BREAKS and prediction_probs[1] >= .5: # Look for newline character as a forcing signal for a new line
+ elif prev_char["char"] in LINE_BREAKS: # Look for newline character as a forcing signal for a new line
span = update_span(line, span)
line = update_line(block, line)
elif prev_font_info != font_info:
Expand Down
3 changes: 1 addition & 2 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@ def get_fontname(textpage, char_index):
# Re-interpret the type from char to unsigned short as required by the function
buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
flag_buffer = ctypes.c_int()
- flag_ptr = ctypes.pointer(flag_buffer)
- font_info = pdfium_c.FPDFText_GetFontInfo(textpage, char_index, buffer_ptr, n_bytes, flag_ptr)
+ font_info = pdfium_c.FPDFText_GetFontInfo(textpage, char_index, buffer_ptr, n_bytes, flag_buffer)
if font_info == 0:
return None, None
try:
Expand Down
1 change: 0 additions & 1 deletion pdftext/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class Settings(BaseSettings):
MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.joblib")

# Fonts
FONT_BUFFER_SIZE: int = 1024 # How many characters to buffer when reading a font name
FONTNAME_SAMPLE_FREQ: int = 10

# Inference
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
- version = "0.3.3"
+ version = "0.3.4"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 34d3748

Please sign in to comment.