Skip to content

Commit

Permalink
refactor for a cleaner diff
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Dec 9, 2024
1 parent 1f01db8 commit 60b5a68
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 56 deletions.
58 changes: 58 additions & 0 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import math

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import get_fontname
from pdftext.schema import Bbox, Chars


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars:
    """Build one record per character reported by ``textpage``.

    Args:
        textpage: Text page queried for the character count, character boxes,
            unicode values, angles and font information.
        page_bbox: Four values ``(x_start, y_start, x_end, y_end)``; the first
            two are subtracted from every character box, and the extents give
            the (ceil-rounded) page width and height.
        page_rotation: Forwarded unchanged to ``Bbox.rotate``.
        quote_loosebox: When falsy, the ``'`` character is fetched with
            ``loose=False`` even if its angle is 0.
        normalize: When truthy, ``Bbox.normalize`` is applied with the page
            width and height after rotation.

    Returns:
        A list of dicts with the keys ``bbox``, ``char``, ``rotation``,
        ``font`` (``name``, ``flags``, ``size``, ``weight``), ``char_idx``,
        ``char_start_idx`` and ``char_end_idx``.
    """
    origin_x, origin_y, far_x, far_y = page_bbox
    width = math.ceil(abs(far_x - origin_x))
    height = math.ceil(abs(far_y - origin_y))

    records: Chars = []
    offset = 0  # running start offset of the current character's text
    for idx in range(textpage.count_chars()):
        font_name, font_flags = get_fontname(textpage, idx)
        glyph = chr(pdfium_c.FPDFText_GetUnicode(textpage, idx))
        next_offset = offset + len(glyph)

        angle = pdfium_c.FPDFText_GetCharAngle(textpage, idx)
        # A loose box is only requested for characters whose angle is 0; the
        # apostrophe additionally needs quote_loosebox to be set.
        use_loose = angle == 0 and (glyph != "'" or quote_loosebox)

        raw_x0, raw_y0, raw_x1, raw_y1 = textpage.get_charbox(idx, loose=use_loose)

        # Make the horizontal coordinates relative to the page box origin.
        x0 = raw_x0 - origin_x
        x1 = raw_x1 - origin_x

        # Same shift vertically, then mirror against the page height.
        flipped_a = height - (raw_y0 - origin_y)
        flipped_b = height - (raw_y1 - origin_y)

        box = Bbox([x0, min(flipped_a, flipped_b), x1, max(flipped_a, flipped_b)])
        box = box.rotate(width, height, page_rotation)
        if normalize:
            box = box.normalize(width, height)

        font = {
            "name": font_name,
            "flags": font_flags,
            "size": pdfium_c.FPDFText_GetFontSize(textpage, idx),
            "weight": pdfium_c.FPDFText_GetFontWeight(textpage, idx),
        }
        records.append({
            "bbox": box,
            "char": glyph,
            "rotation": angle,
            "font": font,
            "char_idx": idx,
            "char_start_idx": offset,
            "char_end_idx": next_offset,
        })
        offset = next_offset
    return records
60 changes: 4 additions & 56 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,62 +5,10 @@
from typing import List

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import LINE_BREAKS, SPACES, TABS, flatten, get_fontname
from pdftext.schema import Bbox, Blocks, Chars, Line, Lines, Pages, Span, Spans
from pdftext.settings import settings


def _char_bbox_on_page(char_box, x_start, y_start, page_width, page_height, page_rotation, normalize):
    """Turn a raw character box into the ``Bbox`` stored on a char record."""
    cx0, cy0, cx1, cy1 = char_box

    # Express the box relative to the page box origin.
    cx0, cx1 = cx0 - x_start, cx1 - x_start
    cy0, cy1 = cy0 - y_start, cy1 - y_start

    # Mirror the vertical coordinates against the page height and order them.
    flipped = (page_height - cy0, page_height - cy1)
    result = Bbox([cx0, min(flipped), cx1, max(flipped)])

    result = result.rotate(page_width, page_height, page_rotation)
    return result.normalize(page_width, page_height) if normalize else result


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True, normalize=True) -> Chars:
    """Return a list with one dict per character on ``textpage``.

    Each dict carries the character's transformed ``bbox``, its text
    (``char``), the ``rotation`` reported by ``FPDFText_GetCharAngle``, a
    ``font`` sub-dict (``name``, ``flags``, ``size``, ``weight``), the
    character index (``char_idx``) and running text offsets
    (``char_start_idx`` / ``char_end_idx``).

    ``page_bbox`` is unpacked as ``(x_start, y_start, x_end, y_end)``.
    ``page_rotation`` is passed to ``Bbox.rotate``; ``normalize`` toggles the
    ``Bbox.normalize`` step; ``quote_loosebox`` controls whether ``'`` may be
    fetched with a loose character box.
    """
    x_start, y_start, x_end, y_end = page_bbox
    page_width = math.ceil(abs(x_end - x_start))
    page_height = math.ceil(abs(y_end - y_start))

    collected: Chars = []
    cursor = 0
    for char_idx in range(textpage.count_chars()):
        name, flags = get_fontname(textpage, char_idx)
        char = chr(pdfium_c.FPDFText_GetUnicode(textpage, char_idx))
        rotation = pdfium_c.FPDFText_GetCharAngle(textpage, char_idx)

        # Loose boxes are used only at angle 0, and for "'" only on request.
        loose = rotation == 0 and (char != "'" or quote_loosebox)
        bbox = _char_bbox_on_page(
            textpage.get_charbox(char_idx, loose=loose),
            x_start,
            y_start,
            page_width,
            page_height,
            page_rotation,
            normalize,
        )

        entry = {
            "bbox": bbox,
            "char": char,
            "rotation": rotation,
            "font": {
                "name": name,
                "flags": flags,
                "size": pdfium_c.FPDFText_GetFontSize(textpage, char_idx),
                "weight": pdfium_c.FPDFText_GetFontWeight(textpage, char_idx),
            },
            "char_idx": char_idx,
            "char_start_idx": cursor,
            "char_end_idx": cursor + len(char),
        }
        collected.append(entry)
        cursor = entry["char_end_idx"]
    return collected

from pdftext.pdf.chars import get_chars
from pdftext.pdf.utils import flatten
from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans


def get_spans(chars: Chars) -> Spans:
Expand Down

0 comments on commit 60b5a68

Please sign in to comment.