Skip to content

Commit

Permalink
Merge pull request #17 from VikParuchuri/dev
Browse files: browse the repository at this point in the history
Fix memory leak warnings
  • Loading branch information
VikParuchuri authored Nov 19, 2024
2 parents 10d979b + ae6b518 commit c065ac0
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
4 changes: 3 additions & 1 deletion extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ def main():
if args.pages is not None:
pdf_doc = pdfium.PdfDocument(args.pdf_path)
pages = [int(p) for p in args.pages.split(",")]
assert all(p <= len(pdf_doc) for p in pages), "Invalid page number(s) provided"
doc_len = len(pdf_doc)
pdf_doc.close()
assert all(p <= doc_len for p in pages), "Invalid page number(s) provided"

if args.json:
text = dictionary_output(args.pdf_path, sort=args.sort, page_range=pages, flatten_pdf=args.flatten_pdf, keep_chars=args.keep_chars, workers=args.workers)
Expand Down
11 changes: 10 additions & 1 deletion pdftext/extraction.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import atexit
from functools import partial
from itertools import repeat
from typing import List
from concurrent.futures import ProcessPoolExecutor
Expand Down Expand Up @@ -28,13 +30,19 @@ def _get_page_range(page_range, flatten_pdf=False):
return pages


def worker_shutdown(pdf_doc):
    """Close the PDF document held by a worker.

    Args:
        pdf_doc: The document object to release; only its ``close()``
            method is used here.
    """
    pdf_doc.close()


def worker_init(pdf_path, flatten_pdf):
    """Load the PDF and the model into module-level globals.

    Sets the globals ``pdf_doc`` (from ``_load_pdf``) and ``model`` (from
    ``get_model``), then registers ``worker_shutdown`` so the document is
    closed when the interpreter exits.

    Args:
        pdf_path: Location of the PDF, forwarded to ``_load_pdf``.
        flatten_pdf: Flag forwarded to ``_load_pdf``.
    """
    global model
    global pdf_doc

    pdf_doc = _load_pdf(pdf_path, flatten_pdf)
    model = get_model()

    # atexit.register forwards extra positional arguments to the callback,
    # so no functools.partial wrapper is needed.
    # NOTE(review): presumably this runs as a process-pool initializer;
    # multiprocessing children may exit via os._exit(), which skips atexit
    # handlers -- confirm this hook actually fires in worker processes.
    atexit.register(worker_shutdown, pdf_doc)


def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
pdf_doc = _load_pdf(pdf_path, flatten_pdf)
Expand All @@ -47,8 +55,10 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
if workers is None or workers <= 1:
model = get_model()
text_chars = get_pdfium_chars(pdf_doc, page_range, flatten_pdf)
pdf_doc.close()
return inference(text_chars, model)

pdf_doc.close()
page_range = list(page_range)

pages_per_worker = math.ceil(len(page_range) / workers)
Expand All @@ -58,7 +68,6 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf)))

ordered_pages = [page for sublist in pages for page in sublist]

return ordered_pages


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.18"
version = "0.3.19"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit c065ac0

Please sign in to comment.