Skip to content

Commit

Permalink
fixes, make changes self contained
Browse files (browse the repository at this point in the history)
  • Loading branch information
iammosespaulr committed Dec 9, 2024
1 parent 60b5a68 commit 85a5212
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 17 deletions.
8 changes: 4 additions & 4 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=


def _process_span(span, page_width, page_height, keep_chars):
- span["bbox"] = span["bbox"].unnormalize(page_width, page_height)
+ span["bbox"] = span["bbox"].unnormalize(page_width, page_height).bbox
span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
if not keep_chars:
del span["chars"]
else:
for char in span["chars"]:
- char["bbox"] = char["bbox"].unnormalize(page_width, page_height)
+ char["bbox"] = char["bbox"].unnormalize(page_width, page_height).bbox


def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
Expand All @@ -95,12 +95,12 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
for k in list(block.keys()):
if k not in ["lines", "bbox"]:
del block[k]
- block["bbox"] = block["bbox"].unnormalize(page_width, page_height)
+ block["bbox"] = block["bbox"].unnormalize(page_width, page_height).bbox
for line in block["lines"]:
for k in list(line.keys()):
if k not in ["spans", "bbox"]:
del line[k]
- line["bbox"] = line["bbox"].unnormalize(page_width, page_height)
+ line["bbox"] = line["bbox"].unnormalize(page_width, page_height).bbox
for span in line["spans"]:
_process_span(span, page_width, page_height, keep_chars)

Expand Down
8 changes: 1 addition & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars:
chars: Chars = []
- start_idx = 0
- end_idx = 1

x_start, y_start, x_end, y_end = page_bbox
page_width = math.ceil(abs(x_end - x_start))
Expand All @@ -19,7 +17,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
for i in range(textpage.count_chars()):
fontname, fontflag = get_fontname(textpage, i)
text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
- end_idx = start_idx + len(text)

rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
Expand Down Expand Up @@ -50,9 +47,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
"size": pdfium_c.FPDFText_GetFontSize(textpage, i),
"weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
},
- "char_idx": i,
- "char_start_idx": start_idx,
- "char_end_idx": end_idx
+ "char_idx": i
})
- start_idx = end_idx
return chars
6 changes: 3 additions & 3 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def span_break():
"text": char["char"],
"rotation": char["rotation"],
"font": char["font"],
- "char_start_idx": char["char_start_idx"],
- "char_end_idx": char["char_end_idx"],
+ "char_start_idx": char["char_idx"],
+ "char_end_idx": char["char_idx"],
"chars": [char]
})

Expand All @@ -47,7 +47,7 @@ def span_break():
continue

span['text'] += char['char']
- span['char_end_idx'] = char['char_end_idx']
+ span['char_end_idx'] = char['char_idx']
span['bbox'] = span['bbox'].merge(char['bbox'])
span['chars'].append(char)

Expand Down
5 changes: 2 additions & 3 deletions pdftext/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,6 @@ class Char(TypedDict):
rotation: float
font: Dict[str, Union[Any, str]]
char_idx: int
- char_start_idx: int
- char_end_idx: int


class Span(TypedDict):
Expand All @@ -135,7 +133,8 @@ class Span(TypedDict):
font_weight: float
font_size: float
chars: List[Char]

+ char_start_idx: int
+ char_end_idx: int

class Line(TypedDict):
spans: List[Span]
Expand Down

0 comments on commit 85a5212

Please sign in to comment.