Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disable loosebox for quote characters and add page rotation patch #18

Merged
merged 4 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ def _load_pdf(pdf, flatten_pdf):
# Must be called on the parent pdf, before the page was retrieved
if flatten_pdf:
pdf.init_forms()

return pdf


def _get_page_range(page_range, flatten_pdf=False):
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True):
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pages = inference(text_chars, model)
return pages

Expand All @@ -44,17 +44,17 @@ def worker_init(pdf_path, flatten_pdf):
atexit.register(partial(worker_shutdown, pdf_doc))


def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None):
pdf_doc = _load_pdf(pdf_path, flatten_pdf)
if page_range is None:
page_range = range(len(pdf_doc))

if workers is not None:
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference

if workers is None or workers <= 1:
model = get_model()
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pdf_doc.close()
return inference(text_chars, model)

Expand All @@ -65,7 +65,7 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)]

with ProcessPoolExecutor(max_workers=workers, initializer=worker_init, initargs=(pdf_path, flatten_pdf)) as executor:
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf)))
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf), repeat(quote_loosebox)))

ordered_pages = [page for sublist in pages for page in sublist]
return ordered_pages
Expand Down Expand Up @@ -94,8 +94,8 @@ def _process_span(span, page_width, page_height, keep_chars):
char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height)


def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, workers=None):
pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf)
def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None):
pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
for page in pages:
page_width, page_height = page["width"], page["height"]
for block in page["blocks"]:
Expand Down
25 changes: 15 additions & 10 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import math
from collections import defaultdict
from typing import Dict, List
from typing import List

import pypdfium2.raw as pdfium_c
from pypdfium2 import PdfiumError
Expand All @@ -10,8 +9,8 @@


def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int):
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i-1, min_update, -1): # Goes from i to min_update
min_update = max(0, i - fontname_sample_freq) # Minimum index to update
for j in range(i - 1, min_update, -1): # Goes from i to min_update
fontname, fontflags = get_fontname(text_page, j)

# If we hit the region with the previous fontname, we can bail out
Expand All @@ -27,7 +26,7 @@ def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
raise PdfiumError("Failed to flatten annotations / form fields.")


def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
def get_pdfium_chars(pdf, page_range, flatten_pdf, quote_loosebox=True, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
blocks = []

for page_idx in page_range:
Expand All @@ -40,9 +39,14 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
# Flattening invalidates existing handles to the page.
# It is necessary to re-initialize the page handle after flattening.
page = pdf.get_page(page_idx)

text_page = page.get_textpage()
page_rotation = page.get_rotation()
try:
page_rotation = page.get_rotation()
except KeyError:
# On some PDFs, page.get_rotation() raises KeyError(-1): pdfium_c.FPDFPage_GetRotation(self) returns -1, which is not a key in pdfium_i.RotationToDegrees. Fall back to a rotation of 0.
page_rotation = 0

bbox = page.get_bbox()
page_width = math.ceil(abs(bbox[2] - bbox[0]))
page_height = math.ceil(abs(bbox[1] - bbox[3]))
Expand Down Expand Up @@ -87,8 +91,9 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
update_previous_fonts(char_infos, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq)

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * rad_to_deg # convert from radians to degrees
coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
rotation = rotation * rad_to_deg # convert from radians to degrees
use_loosebox = rotation == 0 and (not char == "'" or quote_loosebox) # Loose doesn't work properly when charbox is rotated or when it's a quote
coords = text_page.get_charbox(i, loose=use_loosebox)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True)

char_info = {
Expand All @@ -108,4 +113,4 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
text_chars["chars"] = char_infos
text_chars["total_chars"] = total_chars
blocks.append(text_chars)
return blocks
return blocks
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.19"
version = "0.3.20"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down
Loading