From 46ba8bf0e02f5c77839044eae263e69830cc4a42 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 17 Oct 2024 15:52:53 -0400 Subject: [PATCH] Revert extraction --- pdftext/pdf/chars.py | 12 ++++++------ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index fb821c7..0ef3760 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -68,13 +68,13 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings fontname = None fontflags = None + total_chars = text_page.count_chars() char_infos = [] - rad_to_deg = 180 / math.pi - all_chars = text_page.get_text_bounded() - for char_idx, char in enumerate(all_chars): - i = pdfium_c.FPDFText_GetCharIndexFromTextIndex(text_page, char_idx) + for i in range(total_chars): + char = pdfium_c.FPDFText_GetUnicode(text_page, i) + char = chr(char) fontsize = round(pdfium_c.FPDFText_GetFontSize(text_page, i), 1) fontweight = round(pdfium_c.FPDFText_GetFontWeight(text_page, i), 1) if fontname is None or i % fontname_sample_freq == 0: @@ -99,11 +99,11 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings "rotation": rotation, "char": char, "bbox": device_coords, - "char_idx": char_idx + "char_idx": i } char_infos.append(char_info) text_chars["chars"] = char_infos - text_chars["total_chars"] = len(all_chars) + text_chars["total_chars"] = total_chars blocks.append(text_chars) return blocks \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2b28f7f..2f45b58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.3.15" +version = "0.3.16" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"