diff --git a/py_pdf_parser/loaders.py b/py_pdf_parser/loaders.py index b7352a4e..1449f5a4 100644 --- a/py_pdf_parser/loaders.py +++ b/py_pdf_parser/loaders.py @@ -44,6 +44,7 @@ def load_file( def load( pdf_file: IO, pdf_file_path: Optional[str] = None, + password: Optional[str] = None, la_params: Optional[Dict] = None, **kwargs: Any, ) -> PDFDocument: @@ -52,13 +53,15 @@ def load( Args: pdf_file (io): The PDF file. + pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation + for `PDFDocument`. + password (str, optional): Password for the encrypted PDF. Required if the + PDF is encrypted. la_params (dict): The layout parameters passed to PDF Miner for analysis. See the PDFMiner documentation here: https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams. Note that py_pdf_parser will re-order the elements it receives from PDFMiner so options relating to element ordering will have no effect. - pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation - for `PDFDocument`. kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`. Returns: @@ -69,7 +72,9 @@ def load( la_params = {**DEFAULT_LA_PARAMS, **la_params} pages: Dict[int, Page] = {} - for page in extract_pages(pdf_file, laparams=LAParams(**la_params)): + for page in extract_pages( + pdf_file, laparams=LAParams(**la_params), password=password + ): elements = [element for element in page if isinstance(element, LTTextBox)] # If all_texts=True then we may get some text from inside figures diff --git a/tests/data/pdfs/test_protected.pdf b/tests/data/pdfs/test_protected.pdf new file mode 100644 index 00000000..cc80d045 Binary files /dev/null and b/tests/data/pdfs/test_protected.pdf differ diff --git a/tests/test_loaders.py b/tests/test_loaders.py index ef116a74..123f9436 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -1,6 +1,8 @@ import os from unittest import TestCase +from pdfminer.pdfdocument import PDFPasswordIncorrect + from py_pdf_parser.components import PDFDocument from py_pdf_parser.loaders import load, load_file @@ -11,6 +13,20 @@ def test_load_file(self): document = load_file(file_path) self.assertIsInstance(document, PDFDocument) + def test_load_protected_file(self): + file_path = os.path.join( + os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf" + ) + document = load_file(file_path, password="p4ssword") + self.assertIsInstance(document, PDFDocument) + + def test_load_protected_file_wrong_password(self): + file_path = os.path.join( + os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf" + ) + with self.assertRaises(PDFPasswordIncorrect): + load_file(file_path, password="wrong_password") + def test_load(self): file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "test.pdf") with open(file_path, "rb") as in_file: