From ae6b518be77acca56f8ecd642e22c00609ea702d Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 19 Nov 2024 13:25:38 -0500 Subject: [PATCH] Fix warnings --- extract_text.py | 4 +++- pdftext/extraction.py | 11 ++++++++++- pyproject.toml | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/extract_text.py b/extract_text.py index 6c90424..19e2e00 100644 --- a/extract_text.py +++ b/extract_text.py @@ -22,7 +22,9 @@ def main(): if args.pages is not None: pdf_doc = pdfium.PdfDocument(args.pdf_path) pages = [int(p) for p in args.pages.split(",")] - assert all(p <= len(pdf_doc) for p in pages), "Invalid page number(s) provided" + doc_len = len(pdf_doc) + pdf_doc.close() + assert all(p <= doc_len for p in pages), "Invalid page number(s) provided" if args.json: text = dictionary_output(args.pdf_path, sort=args.sort, page_range=pages, flatten_pdf=args.flatten_pdf, keep_chars=args.keep_chars, workers=args.workers) diff --git a/pdftext/extraction.py b/pdftext/extraction.py index 39741d2..a007a9c 100644 --- a/pdftext/extraction.py +++ b/pdftext/extraction.py @@ -1,3 +1,5 @@ +import atexit +from functools import partial from itertools import repeat from typing import List from concurrent.futures import ProcessPoolExecutor @@ -28,6 +30,10 @@ def _get_page_range(page_range, flatten_pdf=False): return pages +def worker_shutdown(pdf_doc): + pdf_doc.close() + + def worker_init(pdf_path, flatten_pdf): global model global pdf_doc @@ -35,6 +41,8 @@ def worker_init(pdf_path, flatten_pdf): pdf_doc = _load_pdf(pdf_path, flatten_pdf) model = get_model() + atexit.register(partial(worker_shutdown, pdf_doc)) + def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None): pdf_doc = _load_pdf(pdf_path, flatten_pdf) @@ -47,8 +55,10 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None): if workers is None or workers <= 1: model = get_model() text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf) + pdf_doc.close() return inference(text_chars, model) + pdf_doc.close() page_range = list(page_range) pages_per_worker = math.ceil(len(page_range) / workers) @@ -58,7 +68,6 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None): pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf))) ordered_pages = [page for sublist in pages for page in sublist] - return ordered_pages diff --git a/pyproject.toml b/pyproject.toml index ff75626..8468264 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.3.18" +version = "0.3.19" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"