Skip to content

Commit

Permalink
remove normalization now that we're not going through a model anymore
Browse files: browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 10, 2024
1 parent be036b2 commit 5e2ef20
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 26 deletions.
8 changes: 4 additions & 4 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=


def _process_span(span, page_width, page_height, keep_chars):
span["bbox"] = span["bbox"].unnormalize(page_width, page_height).bbox
span["bbox"] = span["bbox"].bbox
span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
if not keep_chars:
del span["chars"]
else:
for char in span["chars"]:
char["bbox"] = char["bbox"].unnormalize(page_width, page_height).bbox
char["bbox"] = char["bbox"].bbox


def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
Expand All @@ -95,12 +95,12 @@ def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, f
for k in list(block.keys()):
if k not in ["lines", "bbox"]:
del block[k]
block["bbox"] = block["bbox"].unnormalize(page_width, page_height).bbox
block["bbox"] = block["bbox"].bbox
for line in block["lines"]:
for k in list(line.keys()):
if k not in ["spans", "bbox"]:
del line[k]
line["bbox"] = line["bbox"].unnormalize(page_width, page_height).bbox
line["bbox"] = line["bbox"].bbox
for span in line["spans"]:
_process_span(span, page_width, page_height, keep_chars)

Expand Down
4 changes: 1 addition & 3 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pdftext.schema import Bbox, Chars


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars:
def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
chars: Chars = []

x_start, y_start, x_end, y_end = page_bbox
Expand All @@ -34,8 +34,6 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio

bbox = [cx_start, min(ty_start, ty_end), cx_end, max(ty_start, ty_end)]
bbox = Bbox(bbox).rotate(page_width, page_height, page_rotation)
if normalize:
bbox = bbox.normalize(page_width, page_height)

chars.append({
"bbox": bbox,
Expand Down
5 changes: 2 additions & 3 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,7 @@ def get_pages(
pdf: pdfium.PdfDocument,
page_range: range,
flatten_pdf: bool = True,
quote_loosebox=True,
normalize=True
quote_loosebox=True
) -> Pages:
pages: Pages = []

Expand All @@ -211,7 +210,7 @@ def get_pages(
except:
pass

chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox, normalize)
chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
spans = get_spans(chars)
lines = get_lines(spans)
blocks = get_blocks(lines)
Expand Down
17 changes: 1 addition & 16 deletions pdftext/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,22 +101,6 @@ def rotate(self, page_width: float, page_height: float, rotation: int) -> Bbox:

return Bbox(rotated_bbox)

def normalize(self, page_width, page_height):
return Bbox([
self.bbox[0] / page_width,
self.bbox[1] / page_height,
self.bbox[2] / page_width,
self.bbox[3] / page_height
])

def unnormalize(self, page_width, page_height):
return Bbox([
self.bbox[0] * page_width,
self.bbox[1] * page_height,
self.bbox[2] * page_width,
self.bbox[3] * page_height
])


class Char(TypedDict):
bbox: Bbox
Expand All @@ -136,6 +120,7 @@ class Span(TypedDict):
char_start_idx: int
char_end_idx: int


class Line(TypedDict):
spans: List[Span]
bbox: Bbox
Expand Down

0 comments on commit 5e2ef20

Please sign in to comment.