Skip to content

Commit

Permalink
Merge pull request #12 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Improve line breaks, ignore special chars
  • Loading branch information
VikParuchuri authored Oct 17, 2024
2 parents 5915750 + 063af44 commit 7460bf4
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 9 deletions.
46 changes: 45 additions & 1 deletion pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,45 @@ def update_block(blocks, block):
return block


def get_dynamic_line_thresh(text_chars, rotation, default_thresh=.05, min_thresh=.0025, min_lines=5):
    """Estimate a per-page line-gap threshold from consecutive character boxes.

    Every adjacent pair of characters in ``text_chars["chars"]`` is compared
    along an axis chosen by ``rotation``. Gaps larger than ``min_thresh`` are
    collected as candidate line gaps, and their median becomes the threshold.

    Args:
        text_chars: Mapping with a ``"chars"`` sequence; each entry has a
            ``"bbox"`` indexable at positions 0-3.
        rotation: Page rotation; 90, 180 and 270 select dedicated axes, any
            other value uses the default (unrotated) comparison.
        default_thresh: Value returned when too few gaps were collected.
        min_thresh: Gaps at or below this value are ignored.
        min_lines: The median is used only when strictly more than this many
            gaps were collected.

    Returns:
        The 50th percentile of the collected gaps, or ``default_thresh``.
    """
    chars = text_chars["chars"]
    line_dists = []

    # Pair each character with its predecessor, starting at (chars[0], chars[1]),
    # so that the first gap on the page is not dropped.
    for prev_char, char_info in zip(chars, chars[1:]):
        prev_bbox = prev_char["bbox"]
        bbox = char_info["bbox"]

        if rotation == 90:
            # NOTE(review): this branch is not the mirror image of the 270
            # branch (indices 2/0 vs 0/2) -- confirm this is intentional.
            line_dist = bbox[2] - prev_bbox[0]
        elif rotation == 180:
            line_dist = prev_bbox[1] - bbox[3]
        elif rotation == 270:
            line_dist = bbox[0] - prev_bbox[2]
        else:
            line_dist = bbox[1] - prev_bbox[3]

        if line_dist > min_thresh:
            line_dists.append(line_dist)

    if len(line_dists) > min_lines:
        return np.percentile(line_dists, 50)
    return default_thresh


def is_same_line(char_bbox, line_box, space_thresh, rotation):
    """Report whether a character's center lies within ``space_thresh`` of a line's center.

    For rotations of 90 or 270 the centers are compared along x (bbox indices
    0 and 2); for every other rotation they are compared along y (indices 1
    and 3). The comparison is strict: a distance equal to ``space_thresh``
    does not count as the same line.
    """
    # Midpoints of the line box on both axes: (x, y).
    line_centers = (
        (line_box[0] + line_box[2]) / 2,
        (line_box[1] + line_box[3]) / 2,
    )

    # Axis 0 is x, axis 1 is y.
    axis = 0 if rotation in (90, 270) else 1
    char_center = (char_bbox[axis] + char_bbox[axis + 2]) / 2

    return abs(char_center - line_centers[axis]) < space_thresh


def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
prev_char = None
prev_font_info = None
Expand All @@ -127,6 +166,8 @@ def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
block = {"lines": []}
line = {"spans": []}
span = {"chars": []}
rotation = int(text_chars["rotation"])
line_thresh = get_dynamic_line_thresh(text_chars, rotation)

for char_info in text_chars["chars"]:
font = char_info['font']
Expand All @@ -144,7 +185,10 @@ def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
span = update_span(line, span)
line = update_line(block, line)
block = update_block(blocks, block)
elif prev_char["char"] in LINE_BREAKS: # Look for newline character as a forcing signal for a new line
elif (
prev_char["char"] in LINE_BREAKS or
not is_same_line(char_info["bbox"], line["bbox"], line_thresh, rotation)
): # Look for newline character as a forcing signal for a new line
span = update_span(line, span)
line = update_line(block, line)
elif prev_font_info != font_info:
Expand Down
15 changes: 8 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,13 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings

fontname = None
fontflags = None
total_chars = text_page.count_chars()
char_infos = []

for i in range(total_chars):
char = pdfium_c.FPDFText_GetUnicode(text_page, i)
char = chr(char)
rad_to_deg = 180 / math.pi
all_chars = text_page.get_text_bounded()

for char_idx, char in enumerate(all_chars):
i = pdfium_c.FPDFText_GetCharIndexFromTextIndex(text_page, char_idx)
fontsize = round(pdfium_c.FPDFText_GetFontSize(text_page, i), 1)
fontweight = round(pdfium_c.FPDFText_GetFontWeight(text_page, i), 1)
if fontname is None or i % fontname_sample_freq == 0:
Expand All @@ -84,7 +85,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
update_previous_fonts(char_infos, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
rotation = rotation * rad_to_deg # convert from radians to degrees
coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True)

Expand All @@ -98,11 +99,11 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
"rotation": rotation,
"char": char,
"bbox": device_coords,
"char_idx": i
"char_idx": char_idx
}
char_infos.append(char_info)

text_chars["chars"] = char_infos
text_chars["total_chars"] = total_chars
text_chars["total_chars"] = len(all_chars)
blocks.append(text_chars)
return blocks
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.13"
version = "0.3.14"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 7460bf4

Please sign in to comment.