Skip to content

Commit

Permalink
Fix how fontnames are pulled
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 2, 2024
1 parent 34d3748 commit 54a4218
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 10 deletions.
16 changes: 8 additions & 8 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
from pdftext.settings import settings


def update_previous_fonts(text_chars: Dict, i: int, fontname: str, fontflags: int, prev_fontname: str, text_page, fontname_sample_freq: int):
min_update = max(0, i - fontname_sample_freq + 1) # Minimum index to update
regather_font_info = fontname != prev_fontname
for j in range(min_update, i): # Goes from min_update to i - 1
if regather_font_info:
fontname, fontflags = get_fontname(text_page, j)
def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int):
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i-1, min_update, -1): # Goes from i - 1 down to min_update + 1
fontname, fontflags = get_fontname(text_page, j)

# If we hit the region with the previous fontname, we can bail out
if fontname == prev_fontname:
if fontname == prev_fontname and fontflags == prev_fontflags:
break
text_chars["chars"][j]["font"]["name"] = fontname
text_chars["chars"][j]["font"]["flags"] = fontflags
Expand Down Expand Up @@ -70,8 +68,10 @@ def get_pdfium_chars(pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ, pa
fontweight = round(pdfium_c.FPDFText_GetFontWeight(text_page, i), 1)
if fontname is None or i % fontname_sample_freq == 0:
prev_fontname = fontname
prev_fontflags = fontflags
fontname, fontflags = get_fontname(text_page, i)
update_previous_fonts(text_chars, i, fontname, fontflags, prev_fontname, text_page, fontname_sample_freq)
if (fontname != prev_fontname or fontflags != prev_fontflags) and i > 0:
update_previous_fonts(text_chars, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
Expand Down
2 changes: 1 addition & 1 deletion pdftext/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Settings(BaseSettings):
MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.joblib")

# Fonts
FONTNAME_SAMPLE_FREQ: int = 10
FONTNAME_SAMPLE_FREQ: int = 4

# Inference
BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.4"
version = "0.3.5"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 54a4218

Please sign in to comment.