From 85a521238d1f683a1e65e2cc49eb156e73602489 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Mon, 9 Dec 2024 21:20:08 +0000 Subject: [PATCH] fixes, make changes self contained --- pdftext/extraction.py | 8 ++++---- pdftext/pdf/chars.py | 8 +------- pdftext/pdf/pages.py | 6 +++--- pdftext/schema.py | 5 ++--- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/pdftext/extraction.py b/pdftext/extraction.py index 5b2ea6d..9eb8bbf 100644 --- a/pdftext/extraction.py +++ b/pdftext/extraction.py @@ -78,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range= def _process_span(span, page_width, page_height, keep_chars): - span["bbox"] = span["bbox"].unnormalize(page_width, page_height) + span["bbox"] = span["bbox"].unnormalize(page_width, page_height).bbox span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True) if not keep_chars: del span["chars"] else: for char in span["chars"]: - char["bbox"] = char["bbox"].unnormalize(page_width, page_height) + char["bbox"] = char["bbox"].unnormalize(page_width, page_height).bbox def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None): @@ -95,12 +95,12 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f for k in list(block.keys()): if k not in ["lines", "bbox"]: del block[k] - block["bbox"] = block["bbox"].unnormalize(page_width, page_height) + block["bbox"] = block["bbox"].unnormalize(page_width, page_height).bbox for line in block["lines"]: for k in list(line.keys()): if k not in ["spans", "bbox"]: del line[k] - line["bbox"] = line["bbox"].unnormalize(page_width, page_height) + line["bbox"] = line["bbox"].unnormalize(page_width, page_height).bbox for span in line["spans"]: _process_span(span, page_width, page_height, keep_chars) diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index 0b62d9c..ca377e3 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -9,8 +9,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars: chars: Chars = [] - start_idx = 0 - end_idx = 1 x_start, y_start, x_end, y_end = page_bbox page_width = math.ceil(abs(x_end - x_start)) @@ -19,7 +17,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio for i in range(textpage.count_chars()): fontname, fontflag = get_fontname(textpage, i) text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i)) - end_idx = start_idx + len(text) rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i) loosebox = rotation == 0 and (not text == "'" or quote_loosebox) @@ -50,9 +47,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio "size": pdfium_c.FPDFText_GetFontSize(textpage, i), "weight": pdfium_c.FPDFText_GetFontWeight(textpage, i), }, - "char_idx": i, - "char_start_idx": start_idx, - "char_end_idx": end_idx + "char_idx": i }) - start_idx = end_idx return chars diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py index ccef7b1..9c7322b 100644 --- a/pdftext/pdf/pages.py +++ b/pdftext/pdf/pages.py @@ -21,8 +21,8 @@ def span_break(): "text": char["char"], "rotation": char["rotation"], "font": char["font"], - "char_start_idx": char["char_start_idx"], - "char_end_idx": char["char_end_idx"], + "char_start_idx": char["char_idx"], + "char_end_idx": char["char_idx"], "chars": [char] }) @@ -47,7 +47,7 @@ def span_break(): continue span['text'] += char['char'] - span['char_end_idx'] = char['char_end_idx'] + span['char_end_idx'] = char['char_idx'] span['bbox'] = span['bbox'].merge(char['bbox']) span['chars'].append(char) diff --git a/pdftext/schema.py b/pdftext/schema.py index f184662..bfd9cf2 100644 --- a/pdftext/schema.py +++ b/pdftext/schema.py @@ -124,8 +124,6 @@ class Char(TypedDict): rotation: float font: Dict[str, Union[Any, str]] char_idx: int - char_start_idx: int - char_end_idx: int class Span(TypedDict): @@ -135,7 +133,8 @@ class Span(TypedDict): font_weight: float font_size: float chars: List[Char] - + char_start_idx: int + char_end_idx: int class Line(TypedDict): spans: List[Span]