-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpdf2Text.py
121 lines (98 loc) · 3.59 KB
/
pdf2Text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Search for arbitrary text inside a PDF.
The text in the PDF is a mix of English, Telugu and Urdu.
The font used for Telugu is Shreelipi and the font used for Urdu is Noori Nastaliq.
Some of the data in the PDF is stored as Unicode text and some of it is embedded as images.
The search must cover English, Telugu and Urdu data in both the Unicode text
and the image form.
A sample data set has been uploaded.
"""
import time
import PIL.Image
import pdf2image
import pytesseract
from googletrans import Translator
# Module-level settings. Except where noted, each value is forwarded unchanged
# by pdftopil() to pdf2image.convert_from_path() as the keyword shown.

# Not read by the functions in this file: pdftopil()/imageWork() take the path
# as a parameter and main() assigns its own local PDF_PATH.
PDF_PATH = ""
DPI = 200  # passed as dpi=
OUTPUT_FOLDER = None  # passed as output_folder=
FIRST_PAGE = None  # passed as first_page=
LAST_PAGE = None  # passed as last_page=
FORMAT = 'jpg'  # passed as fmt=
THREAD_COUNT = 1  # passed as thread_count=
USERPWD = None  # passed as userpw=
USE_CROPBOX = False  # passed as use_cropbox=
STRICT = False  # passed as strict=
# Not read by the functions in this file; each of them uses its own local
# variable named `index`.
index = 0
def pdftopil(PDF_PATH):
    """Convert the PDF at ``PDF_PATH`` into images via pdf2image.

    All conversion options are taken from the module-level settings
    (DPI, OUTPUT_FOLDER, FIRST_PAGE, ...). The elapsed wall-clock time of the
    conversion is printed, and the value returned by
    ``pdf2image.convert_from_path`` is handed back to the caller.
    """
    began = time.time()
    conversion_options = {
        "dpi": DPI,
        "output_folder": OUTPUT_FOLDER,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }
    pages = pdf2image.convert_from_path(PDF_PATH, **conversion_options)
    elapsed = time.time() - began
    print("Time taken for Image Generation: " + str(elapsed))
    return pages
def save_images(pil_images):
    """Save every page image into the pages folder as ``page_<n>.jpg``.

    Pages are numbered from 1 in iteration order. The number of saved pages
    is printed, and the function returns that count plus one (the next unused
    page number), which textWork() uses as the exclusive upper bound.
    """
    page_prefix = "C:\\Users\\Kingsmanvk\\PycharmProjects\\selfPRO\\sih\\pages\\page_"
    saved = 0
    for saved, page in enumerate(pil_images, start=1):
        page.save(page_prefix + str(saved) + ".jpg")
    print("Number of pages :", saved)
    return saved + 1
def imageWork(PDF_PATH):
    """Render the PDF to images, save them to disk, and return save_images()' result."""
    return save_images(pdftopil(PDF_PATH))
def textWork(index):
    """OCR the saved page images and return their combined text.

    Reads ``page_1.jpg`` through ``page_<index - 1>.jpg`` from the pages
    folder, runs Tesseract on each one with ``lang='urd+tel+eng'``, and
    appends a separator line after every page. The combined text and the
    elapsed wall-clock time are printed.

    Args:
        index: Exclusive upper bound of the page numbers to read (the value
            returned by save_images()).

    Returns:
        The OCR text of all pages, each followed by the separator line.
        An empty string when ``index`` is 1 or less.
    """
    start_time = time.time()
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract'
    # NOTE(review): the original assigned a local variable named
    # TESSDATA_PREFIX here that was never read, so it had no effect and has
    # been dropped. If the tessdata location really needs overriding, it
    # presumably has to be set as an environment variable - confirm.
    separator = "\n______________________________________________________________________\n"
    page_prefix = 'C:\\Users\\Kingsmanvk\\PycharmProjects\\selfPRO\\sih\\pages\\page_'

    page_texts = []
    for page_number in range(1, index):
        # The context manager closes the image file as soon as OCR is done
        # instead of leaving that to garbage collection.
        with PIL.Image.open(page_prefix + str(page_number) + '.jpg') as page_image:
            text = pytesseract.image_to_string(page_image.convert("RGB"), lang='urd+tel+eng')
        page_texts.append(text + separator)
    output = "".join(page_texts)

    print("\nText retrieved :\n")
    print(output)
    print("Time taken for OCR Operation: " + str(time.time() - start_time))
    return output
def search(output, s):
    """Report whether every word of the query occurs in the text.

    Both arguments are split on whitespace and words are compared exactly
    (case-sensitive, whole words only). The original loop overwrote its flag
    on every iteration, so only the last query word decided the result; now
    all query words must be present.

    Args:
        output: The text to search in.
        s: The query, one or more whitespace-separated words.

    Returns:
        1, after printing "Found", when the query contains at least one word
        and all of its words occur in ``output``; otherwise 0, after printing
        "Not Found".
    """
    query_words = s.split()
    # Split the text once and use a set so each lookup is O(1).
    text_words = set(output.split())
    if query_words and all(word in text_words for word in query_words):
        print("Found")
        return 1
    print("Not Found")
    return 0
def translate(s, dest):
    """Translate ``s`` into the language ``dest`` using a new googletrans Translator.

    Returns the object produced by ``Translator.translate``; main() prints
    its ``.text`` attribute.
    """
    return Translator().translate(s, dest=dest)
def main():
    """Run OCR on the demo PDF, then serve an interactive search/translate menu.

    The PDF is converted to page images and OCR'd once up front. The menu
    then repeats until the user enters 0:

    * 1 - prompt for a query and search the OCR text for it.
    * 2 - prompt for text plus a target language and print the translation.
    * 0 - print "Thank You." and stop.

    Input that is not a number, or a number other than 0, 1 or 2, prints a
    hint and shows the menu again instead of crashing or printing the
    farewell while continuing to loop.
    """
    # The input path is hard-coded; replace this with input() to ask the user.
    PDF_PATH = "C:\\Users\\Kingsmanvk\\PycharmProjects\\selfPRO\\sih\\demo.pdf"
    index = imageWork(PDF_PATH)
    output = textWork(index)

    while True:
        choice = input("Enter 1 to have a search or 2 for translate operation else enter 0. ")
        try:
            n = int(choice)
        except ValueError:
            print("Please enter 0, 1 or 2.")
            continue

        if n == 1:
            s = input("Enter the word to search.")
            search(output, s)
        elif n == 2:
            request = input("Enter the the word to translate and the language to which you wish to translate it.")
            # The last token is the target language; everything before it is
            # the text, so multi-word phrases no longer break the unpacking.
            parts = request.rsplit(maxsplit=1)
            if len(parts) != 2:
                print("Please enter the text followed by the target language.")
                continue
            s, language = parts
            print(translate(s, language).text)
        elif n == 0:
            print("Thank You.")
            break
        else:
            print("Please enter 0, 1 or 2.")
# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()