forked from lachlanpage/Markov-Chain-Sentence-Generator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_utilities.py
57 lines (40 loc) · 1.99 KB
/
pdf_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os

import PyPDF4
from colorama import Fore, Style

from config import Config
def extract_pdf_text(pdf_file_path):
    """
    Read every page of a PDF file and return the combined text.

    Args:
        pdf_file_path (str): The path of the PDF file to read.

    Returns:
        str: The text of all pages, concatenated in page order with no
            separator inserted between pages.

    Raises:
        Exception: Nothing is caught here; any error raised while opening the
            file or while PyPDF4 reads it propagates to the caller.
    """
    with open(pdf_file_path, 'rb') as source:
        reader = PyPDF4.PdfFileReader(source)
        # Pages are fetched by ascending index so the output follows the
        # document's page order.
        page_texts = [
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        ]
    return "".join(page_texts)
def convert_pdf_to_text_file(pdf_file_path):
    """
    Extract the text of a PDF file and save it to a ``.txt`` file next to it.

    The output path is derived from the input path: a trailing ``.pdf``
    extension (matched case-insensitively) is swapped for ``.txt``; any other
    path gets ``.txt`` appended. Only the final extension is touched, so the
    output path can never equal the input path and the source file is never
    overwritten.

    Args:
        pdf_file_path (str): The path of the PDF file to convert.

    Returns:
        str: The path of the text file that was written.

    Raises:
        Exception: Nothing is caught here; any error raised during text
            extraction or while writing the output file propagates.
    """
    # Swap only the real extension. A plain str.replace('.pdf', '.txt') would
    # also rewrite '.pdf' inside directory names and, for 'FILE.PDF' or a path
    # without the extension, would leave the path unchanged so that the write
    # below truncated the source PDF.
    root, extension = os.path.splitext(pdf_file_path)
    if extension.lower() == '.pdf':
        text_file_path = root + '.txt'
    else:
        text_file_path = pdf_file_path + '.txt'

    # Extract before opening the output so a failed extraction does not leave
    # an empty text file behind.
    extracted_text = extract_pdf_text(pdf_file_path)
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(extracted_text)

    # Report the output location only when verbose output is enabled.
    if Config.VERBOSE:
        print(f"{Fore.GREEN}[+] Saved the training corpus to '{text_file_path}'{Style.RESET_ALL}.")
    return text_file_path