Remove normalization now that we're not going through a model anymore

VikParuchuri · Dec 10, 2024 · 5e2ef20
1 parent be036b2
commit 5e2ef20
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 26 deletions.
diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -78,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=
 
 
 def _process_span(span, page_width, page_height, keep_chars):
-    span["bbox"] = span["bbox"].unnormalize(page_width, page_height).bbox
+    span["bbox"] = span["bbox"].bbox
     span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
     if not keep_chars:
         del span["chars"]
     else:
         for char in span["chars"]:
-            char["bbox"] = char["bbox"].unnormalize(page_width, page_height).bbox
+            char["bbox"] = char["bbox"].bbox
 
 
 def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
@@ -95,12 +95,12 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
             for k in list(block.keys()):
                 if k not in ["lines", "bbox"]:
                     del block[k]
-            block["bbox"] = block["bbox"].unnormalize(page_width, page_height).bbox
+            block["bbox"] = block["bbox"].bbox
             for line in block["lines"]:
                 for k in list(line.keys()):
                     if k not in ["spans", "bbox"]:
                         del line[k]
-                line["bbox"] = line["bbox"].unnormalize(page_width, page_height).bbox
+                line["bbox"] = line["bbox"].bbox
                 for span in line["spans"]:
                     _process_span(span, page_width, page_height, keep_chars)
 

diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -7,7 +7,7 @@
 from pdftext.schema import Bbox, Chars
 
 
-def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars:
+def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
     chars: Chars = []
 
     x_start, y_start, x_end, y_end = page_bbox
@@ -34,8 +34,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
 
         bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
         bbox = Bbox(bbox).rotate(page_width, page_height, page_rotation)
-        if normalize:
-            bbox = bbox.normalize(page_width, page_height)
 
         chars.append({
             "bbox": bbox,

diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
@@ -188,8 +188,7 @@ def get_pages(
     pdf: pdfium.PdfDocument,
     page_range: range,
     flatten_pdf: bool = True,
-    quote_loosebox=True,
-    normalize=True
+    quote_loosebox=True
 ) -> Pages:
     pages: Pages = []
 
@@ -211,7 +210,7 @@ def get_pages(
         except:
             pass
 
-        chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox, normalize)
+        chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
         spans = get_spans(chars)
         lines = get_lines(spans)
         blocks = get_blocks(lines)

diff --git a/pdftext/schema.py b/pdftext/schema.py
@@ -101,22 +101,6 @@ def rotate(self, page_width: float, page_height: float, rotation: int) -> Bbox:
 
         return Bbox(rotated_bbox)
 
-    def normalize(self, page_width, page_height):
-        return Bbox([
-            self.bbox[0] / page_width,
-            self.bbox[1] / page_height,
-            self.bbox[2] / page_width,
-            self.bbox[3] / page_height
-        ])
-
-    def unnormalize(self, page_width, page_height):
-        return Bbox([
-            self.bbox[0] * page_width,
-            self.bbox[1] * page_height,
-            self.bbox[2] * page_width,
-            self.bbox[3] * page_height
-        ])
-
 
 class Char(TypedDict):
     bbox: Bbox
@@ -136,6 +120,7 @@ class Span(TypedDict):
     char_start_idx: int
     char_end_idx: int
 
+
 class Line(TypedDict):
     spans: List[Span]
     bbox: Bbox