Skip to content

Commit

Permalink
Merge pull request #14 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Revert extraction
  • Loading branch information
VikParuchuri authored Oct 17, 2024
2 parents a7cd4fb + 46ba8bf commit c6a85c6
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 7 deletions.
12 changes: 6 additions & 6 deletions pdftext/pdf/chars.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -68,13 +68,13 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings

fontname = None
fontflags = None
+ total_chars = text_page.count_chars()
char_infos = []

rad_to_deg = 180 / math.pi
- all_chars = text_page.get_text_bounded()
-
- for char_idx, char in enumerate(all_chars):
- i = pdfium_c.FPDFText_GetCharIndexFromTextIndex(text_page, char_idx)
+ for i in range(total_chars):
+ char = pdfium_c.FPDFText_GetUnicode(text_page, i)
+ char = chr(char)
fontsize = round(pdfium_c.FPDFText_GetFontSize(text_page, i), 1)
fontweight = round(pdfium_c.FPDFText_GetFontWeight(text_page, i), 1)
if fontname is None or i % fontname_sample_freq == 0:
Expand All @@ -99,11 +99,11 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
"rotation": rotation,
"char": char,
"bbox": device_coords,
- "char_idx": char_idx
+ "char_idx": i
}
char_infos.append(char_info)

text_chars["chars"] = char_infos
- text_chars["total_chars"] = len(all_chars)
+ text_chars["total_chars"] = total_chars
blocks.append(text_chars)
return blocks
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
- version = "0.3.15"
+ version = "0.3.16"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit c6a85c6

Please sign in to comment.