Skip to content

Commit

Permalink
Merge pull request #18 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Disable loosebox for quote characters and add page rotation patch
  • Loading branch information
VikParuchuri authored Dec 3, 2024
2 parents c065ac0 + 7336ed5 commit f26428a
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 20 deletions.
18 changes: 9 additions & 9 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ def _load_pdf(pdf, flatten_pdf):
# Must be called on the parent pdf, before the page was retrieved
if flatten_pdf:
pdf.init_forms()

return pdf


def _get_page_range(page_range, flatten_pdf=False):
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True):
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pages = inference(text_chars, model)
return pages

Expand All @@ -44,17 +44,17 @@ def worker_init(pdf_path, flatten_pdf):
atexit.register(partial(worker_shutdown, pdf_doc))


def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None):
pdf_doc = _load_pdf(pdf_path, flatten_pdf)
if page_range is None:
page_range = range(len(pdf_doc))

if workers is not None:
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference

if workers is None or workers <= 1:
model = get_model()
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pdf_doc.close()
return inference(text_chars, model)

Expand All @@ -65,7 +65,7 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)]

with ProcessPoolExecutor(max_workers=workers, initializer=worker_init, initargs=(pdf_path, flatten_pdf)) as executor:
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf)))
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf), repeat(quote_loosebox)))

ordered_pages = [page for sublist in pages for page in sublist]
return ordered_pages
Expand Down Expand Up @@ -94,8 +94,8 @@ def _process_span(span, page_width, page_height, keep_chars):
char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)


def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, workers=None):
pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf)
def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
for page in pages:
page_width, page_height = page["width"], page["height"]
for block in page["blocks"]:
Expand Down
25 changes: 15 additions & 10 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import math
from collections import defaultdict
from typing import Dict, List
from typing import List

import pypdfium2.raw as pdfium_c
from pypdfium2 import PdfiumError
Expand All @@ -10,8 +9,8 @@


def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int):
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i-1, min_update, -1): # Goes from i to min_update
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i - 1, min_update, -1):  # Walks backwards from i - 1 down to min_update + 1 (min_update itself is excluded)
fontname, fontflags = get_fontname(text_page, j)

# If we hit the region with the previous fontname, we can bail out
Expand All @@ -27,7 +26,7 @@ def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
raise PdfiumError("Failed to flatten annotations / form fields.")


def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
def get_pdfium_chars(pdf, page_range, flatten_pdf, quote_loosebox=True, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
blocks = []

for page_idx in page_range:
Expand All @@ -40,9 +39,14 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
# Flattening invalidates existing handles to the page.
# It is necessary to re-initialize the page handle after flattening.
page = pdf.get_page(page_idx)

text_page = page.get_textpage()
page_rotation = page.get_rotation()
try:
page_rotation = page.get_rotation()
except KeyError:
# On some PDFs FPDFPage_GetRotation returns -1, so the pdfium_i.RotationToDegrees[pdfium_c.FPDFPage_GetRotation(self)] lookup inside page.get_rotation() raises KeyError(-1); treat such pages as unrotated.
page_rotation = 0

bbox = page.get_bbox()
page_width = math.ceil(abs(bbox[2] - bbox[0]))
page_height = math.ceil(abs(bbox[1] - bbox[3]))
Expand Down Expand Up @@ -87,8 +91,9 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
update_previous_fonts(char_infos, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * rad_to_deg # convert from radians to degrees
coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
rotation = rotation * rad_to_deg # convert from radians to degrees
use_loosebox = rotation == 0 and (not char == "'" or quote_loosebox) # Loose doesn't work properly when charbox is rotated or when it's a quote
coords = text_page.get_charbox(i, loose=use_loosebox)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True)

char_info = {
Expand All @@ -108,4 +113,4 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
text_chars["chars"] = char_infos
text_chars["total_chars"] = total_chars
blocks.append(text_chars)
return blocks
return blocks
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.19"
version = "0.3.20"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit f26428a

Please sign in to comment.