Skip to content

Commit

Permalink
Merge pull request #350 from dantecalderon/feature/support-open-protected-files-by-password
Browse files Browse the repository at this point in the history

feat: Added support for opening password-protected files
  • Loading branch information
jstockwin authored Nov 10, 2023
2 parents 3b5b3dc + 4fcca50 commit 4985242
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
11 changes: 8 additions & 3 deletions py_pdf_parser/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def load_file(
def load(
pdf_file: IO,
pdf_file_path: Optional[str] = None,
password: Optional[str] = None,
la_params: Optional[Dict] = None,
**kwargs: Any,
) -> PDFDocument:
Expand All @@ -52,13 +53,15 @@ def load(
Args:
pdf_file (io): The PDF file.
pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
for `PDFDocument`.
password (str, optional): Password for the encrypted PDF. Required if the
PDF is encrypted.
la_params (dict): The layout parameters passed to PDF Miner for analysis. See
the PDFMiner documentation here:
https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams.
Note that py_pdf_parser will re-order the elements it receives from PDFMiner
so options relating to element ordering will have no effect.
pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
for `PDFDocument`.
kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.
Returns:
Expand All @@ -69,7 +72,9 @@ def load(
la_params = {**DEFAULT_LA_PARAMS, **la_params}

pages: Dict[int, Page] = {}
for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
for page in extract_pages(
pdf_file, laparams=LAParams(**la_params), password=password
):
elements = [element for element in page if isinstance(element, LTTextBox)]

# If all_texts=True then we may get some text from inside figures
Expand Down
Binary file added tests/data/pdfs/test_protected.pdf
Binary file not shown.
16 changes: 16 additions & 0 deletions tests/test_loaders.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from unittest import TestCase

from pdfminer.pdfdocument import PDFPasswordIncorrect

from py_pdf_parser.components import PDFDocument
from py_pdf_parser.loaders import load, load_file

Expand All @@ -11,6 +13,20 @@ def test_load_file(self):
document = load_file(file_path)
self.assertIsInstance(document, PDFDocument)

def test_load_protected_file(self):
    """Check that `load_file` returns a PDFDocument when given the password."""
    # Build the fixture path in two steps: the shared PDF data directory
    # first, then the password-protected sample inside it.
    pdf_dir = os.path.join(os.path.dirname(__file__), "data", "pdfs")
    protected_pdf = os.path.join(pdf_dir, "test_protected.pdf")

    loaded = load_file(protected_pdf, password="p4ssword")
    self.assertIsInstance(loaded, PDFDocument)

def test_load_protected_file_wrong_password(self):
    """Check that `load_file` raises PDFPasswordIncorrect for a wrong password."""
    pdf_dir = os.path.join(os.path.dirname(__file__), "data", "pdfs")
    protected_pdf = os.path.join(pdf_dir, "test_protected.pdf")

    # Callable form of assertRaises: load_file is invoked with the given
    # arguments and must raise PDFPasswordIncorrect.
    self.assertRaises(
        PDFPasswordIncorrect, load_file, protected_pdf, password="wrong_password"
    )

def test_load(self):
file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "test.pdf")
with open(file_path, "rb") as in_file:
Expand Down

0 comments on commit 4985242

Please sign in to comment.