From 85a521238d1f683a1e65e2cc49eb156e73602489 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Mon, 9 Dec 2024 21:20:08 +0000
Subject: [PATCH] fixes, make changes self-contained

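---
Note for reviewers: the meaning of a span's "char_end_idx" changes in this
patch. It used to be the running offset start_idx + len(text) taken from the
last char of the span; it is now the "char_idx" of that last char.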
 pdftext/extraction.py | 8 ++++----
 pdftext/pdf/chars.py  | 8 +-------
 pdftext/pdf/pages.py  | 6 +++---
 pdftext/schema.py     | 5 ++---
 4 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/pdftext/extraction.py b/pdftext/extraction.py
index 5b2ea6d..9eb8bbf 100644
--- a/pdftext/extraction.py
+++ b/pdftext/extraction.py
@@ -78,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=
 
 
 def _process_span(span, page_width, page_height, keep_chars):
-    span["bbox"] = span["bbox"].unnormalize(page_width, page_height)
+    span["bbox"] = span["bbox"].unnormalize(page_width, page_height).bbox
     span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
     if not keep_chars:
         del span["chars"]
     else:
         for char in span["chars"]:
-            char["bbox"] = char["bbox"].unnormalize(page_width, page_height)
+            char["bbox"] = char["bbox"].unnormalize(page_width, page_height).bbox
 
 
 def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
@@ -95,12 +95,12 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
             for k in list(block.keys()):
                 if k not in ["lines", "bbox"]:
                     del block[k]
-            block["bbox"] = block["bbox"].unnormalize(page_width, page_height)
+            block["bbox"] = block["bbox"].unnormalize(page_width, page_height).bbox
             for line in block["lines"]:
                 for k in list(line.keys()):
                     if k not in ["spans", "bbox"]:
                         del line[k]
-                line["bbox"] = line["bbox"].unnormalize(page_width, page_height)
+                line["bbox"] = line["bbox"].unnormalize(page_width, page_height).bbox
                 for span in line["spans"]:
                     _process_span(span, page_width, page_height, keep_chars)
 
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
index 0b62d9c..ca377e3 100644
--- a/pdftext/pdf/chars.py
+++ b/pdftext/pdf/chars.py
@@ -9,8 +9,6 @@
 
 def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars:
     chars: Chars = []
-    start_idx = 0
-    end_idx = 1
 
     x_start, y_start, x_end, y_end = page_bbox
     page_width = math.ceil(abs(x_end - x_start))
@@ -19,7 +17,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
     for i in range(textpage.count_chars()):
         fontname, fontflag = get_fontname(textpage, i)
         text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
-        end_idx = start_idx + len(text)
 
         rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
         loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
@@ -50,9 +47,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
                 "size": pdfium_c.FPDFText_GetFontSize(textpage, i),
                 "weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
             },
-            "char_idx": i,
-            "char_start_idx": start_idx,
-            "char_end_idx": end_idx
+            "char_idx": i
         })
-        start_idx = end_idx
     return chars
diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
index ccef7b1..9c7322b 100644
--- a/pdftext/pdf/pages.py
+++ b/pdftext/pdf/pages.py
@@ -21,8 +21,8 @@ def span_break():
             "text": char["char"],
             "rotation": char["rotation"],
             "font": char["font"],
-            "char_start_idx": char["char_start_idx"],
-            "char_end_idx": char["char_end_idx"],
+            "char_start_idx": char["char_idx"],
+            "char_end_idx": char["char_idx"],
             "chars": [char]
         })
 
@@ -47,7 +47,7 @@ def span_break():
             continue
 
         span['text'] += char['char']
-        span['char_end_idx'] = char['char_end_idx']
+        span['char_end_idx'] = char['char_idx']
         span['bbox'] = span['bbox'].merge(char['bbox'])
         span['chars'].append(char)
 
diff --git a/pdftext/schema.py b/pdftext/schema.py
index f184662..bfd9cf2 100644
--- a/pdftext/schema.py
+++ b/pdftext/schema.py
@@ -124,8 +124,6 @@ class Char(TypedDict):
     rotation: float
     font: Dict[str, Union[Any, str]]
     char_idx: int
-    char_start_idx: int
-    char_end_idx: int
 
 
 class Span(TypedDict):
@@ -135,7 +133,8 @@ class Span(TypedDict):
     font_weight: float
     font_size: float
     chars: List[Char]
-
+    char_start_idx: int
+    char_end_idx: int
 
 class Line(TypedDict):
     spans: List[Span]