From 54a421837c0f58eb8ae9541ab3d739da93c14181 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 2 May 2024 16:56:33 -0700 Subject: [PATCH] Fix how fontnames are pulled --- pdftext/pdf/chars.py | 16 ++++++++-------- pdftext/settings.py | 2 +- pyproject.toml | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index 0d9e298..f883744 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -9,15 +9,13 @@ from pdftext.settings import settings -def update_previous_fonts(text_chars: Dict, i: int, fontname: str, fontflags: int, prev_fontname: str, text_page, fontname_sample_freq: int): - min_update = max(0, i - fontname_sample_freq + 1) # Minimum index to update - regather_font_info = fontname != prev_fontname - for j in range(min_update, i): # Goes from min_update to i - 1 - if regather_font_info: - fontname, fontflags = get_fontname(text_page, j) +def update_previous_fonts(text_chars: Dict, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int): + min_update = max(0, i - fontname_sample_freq) # Minimum index to update + for j in range(i-1, min_update, -1): # Goes from i to min_update + fontname, fontflags = get_fontname(text_page, j) # If we hit the region with the previous fontname, we can bail out - if fontname == prev_fontname: + if fontname == prev_fontname and fontflags == prev_fontflags: break text_chars["chars"][j]["font"]["name"] = fontname text_chars["chars"][j]["font"]["flags"] = fontflags @@ -70,8 +68,10 @@ def get_pdfium_chars(pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ, pa fontweight = round(pdfium_c.FPDFText_GetFontWeight(text_page, i), 1) if fontname is None or i % fontname_sample_freq == 0: prev_fontname = fontname + prev_fontflags = fontflags fontname, fontflags = get_fontname(text_page, i) - update_previous_fonts(text_chars, i, fontname, fontflags, prev_fontname, text_page, fontname_sample_freq) + if (fontname != prev_fontname or fontflags != prev_fontflags) and i > 0: + update_previous_fonts(text_chars, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq) rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i) rotation = rotation * 180 / math.pi # convert from radians to degrees diff --git a/pdftext/settings.py b/pdftext/settings.py index 2c98678..0df433a 100644 --- a/pdftext/settings.py +++ b/pdftext/settings.py @@ -8,7 +8,7 @@ class Settings(BaseSettings): MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.joblib") # Fonts - FONTNAME_SAMPLE_FREQ: int = 10 + FONTNAME_SAMPLE_FREQ: int = 4 # Inference BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection diff --git a/pyproject.toml b/pyproject.toml index 491109b..c3f002c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.3.4" +version = "0.3.5" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"