From 9f6dcf8a486faa486e0d561bed8fab46e8627feb Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Thu, 28 Nov 2024 14:26:50 +0000 Subject: [PATCH 1/4] update loosebox condition --- pdftext/pdf/chars.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index 9a52f1f..eb14e25 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -1,6 +1,5 @@ import math -from collections import defaultdict -from typing import Dict, List +from typing import List import pypdfium2.raw as pdfium_c from pypdfium2 import PdfiumError @@ -88,7 +87,8 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i) rotation = rotation * rad_to_deg # convert from radians to degrees - coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated + use_loosebox = rotation == 0 and not char == "'" # Loose doesn't work properly when charbox is rotated or when it's a quote + coords = text_page.get_charbox(i, loose=use_loosebox) device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True) char_info = { From 9001bc28d81382486ee3b38910d55203919df84e Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 3 Dec 2024 09:23:56 -0500 Subject: [PATCH 2/4] Page rotation patch --- pdftext/pdf/chars.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index eb14e25..a074544 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -41,7 +41,12 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings page = pdf.get_page(page_idx) text_page = page.get_textpage() - page_rotation = page.get_rotation() + try: + page_rotation = page.get_rotation() + except KeyError: + # This happens on some PDFs, where pdfium_i.RotationToDegrees[ pdfium_c.FPDFPage_GetRotation(self) ] throws a KeyError -1 + page_rotation = 0 + bbox = page.get_bbox() page_width = math.ceil(abs(bbox[2] - bbox[0])) page_height = math.ceil(abs(bbox[1] - bbox[3])) From 0881a418621613966abc546fd7132939d0bbc900 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 3 Dec 2024 16:13:30 +0000 Subject: [PATCH 3/4] loosebox for quotes by default, unless explicitly disabled --- pdftext/extraction.py | 18 +++++++++--------- pdftext/pdf/chars.py | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pdftext/extraction.py b/pdftext/extraction.py index a007a9c..d9ae1b1 100644 --- a/pdftext/extraction.py +++ b/pdftext/extraction.py @@ -20,12 +20,12 @@ def _load_pdf(pdf, flatten_pdf): # Must be called on the parent pdf, before the page was retrieved if flatten_pdf: pdf.init_forms() - + return pdf -def _get_page_range(page_range, flatten_pdf=False): - text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf) +def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True): + text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox) pages = inference(text_chars, model) return pages @@ -44,17 +44,17 @@ def worker_init(pdf_path, flatten_pdf): atexit.register(partial(worker_shutdown, pdf_doc)) -def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None): +def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None): pdf_doc = _load_pdf(pdf_path, flatten_pdf) if page_range is None: page_range = range(len(pdf_doc)) if workers is not None: - workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference + workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference if workers is None or workers <= 1: model = get_model() - text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf) + text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf, quote_loosebox) pdf_doc.close() return inference(text_chars, model) @@ -65,7 +65,7 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None): page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)] with ProcessPoolExecutor(max_workers=workers, initializer=worker_init, initargs=(pdf_path, flatten_pdf)) as executor: - pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf))) + pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf), repeat(quote_loosebox))) ordered_pages = [page for sublist in pages for page in sublist] return ordered_pages @@ -94,8 +94,8 @@ def _process_span(span, page_width, page_height, keep_chars): char["bbox"] = unnormalize_bbox(char["bbox"], page_width, page_height) -def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, workers=None): - pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf) +def dictionary_output(pdf_path, sort=False, page_range=None, keep_chars=False, flatten_pdf=False, quote_loosebox=True, workers=None): + pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox) for page in pages: page_width, page_height = page["width"], page["height"] for block in page["blocks"]: diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index a074544..2990a2a 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -9,8 +9,8 @@ def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fontflags: int, text_page, fontname_sample_freq: int): - min_update = max(0, i - fontname_sample_freq) # Minimum index to update - for j in range(i-1, min_update, -1): # Goes from i to min_update + min_update = max(0, i - fontname_sample_freq) # Minimum index to update + for j in range(i - 1, min_update, -1): # Goes from i to min_update fontname, fontflags = get_fontname(text_page, j) # If we hit the region with the previous fontname, we can bail out @@ -26,7 +26,7 @@ def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY): raise PdfiumError("Failed to flatten annotations / form fields.") -def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ): +def get_pdfium_chars(pdf, page_range, flatten_pdf, quote_loosebox=True, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ): blocks = [] for page_idx in page_range: @@ -39,7 +39,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings # Flattening invalidates existing handles to the page. # It is necessary to re-initialize the page handle after flattening. page = pdf.get_page(page_idx) - + text_page = page.get_textpage() try: page_rotation = page.get_rotation() @@ -91,8 +91,8 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings update_previous_fonts(char_infos, i, prev_fontname, prev_fontflags, text_page, fontname_sample_freq) rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i) - rotation = rotation * rad_to_deg # convert from radians to degrees - use_loosebox = rotation == 0 and not char == "'" # Loose doesn't work properly when charbox is rotated or when it's a quote + rotation = rotation * rad_to_deg # convert from radians to degrees + use_loosebox = rotation == 0 and (not char == "'" or quote_loosebox) # Loose doesn't work properly when charbox is rotated or when it's a quote coords = text_page.get_charbox(i, loose=use_loosebox) device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True) @@ -113,4 +113,4 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings text_chars["chars"] = char_infos text_chars["total_chars"] = total_chars blocks.append(text_chars) - return blocks \ No newline at end of file + return blocks From 7336ed59841bfb2f8b1ebb9fa8d679155f392a93 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 3 Dec 2024 15:36:07 -0500 Subject: [PATCH 4/4] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8468264..fd3a731 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.3.19" +version = "0.3.20" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"