-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprogram.py
130 lines (107 loc) · 5.87 KB
/
program.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from flask import Flask, jsonify, render_template, redirect, url_for, request, flash, json
from flask_pymongo import PyMongo
from pymongo import MongoClient
from werkzeug.utils import secure_filename
import os
import PIL
from PIL import Image, ImageDraw
import pytesseract as pt
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
import transliterate
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Flask application object; every route in this module is registered on it.
program = Flask(__name__)

# Flask signs the session cookie (which flash() relies on) with this key.
# SECRET_KEY from the environment takes precedence so deployments do not have
# to ship the well-known development value; "key" remains the fallback.
program.secret_key = os.environ.get("SECRET_KEY", "key")

# Database connection
program.config["MONGO_URI"] = "mongodb://localhost:27017/MyDB"
# Connect with the configured URI so the setting above and the client
# actually used by the routes cannot drift apart.
client = MongoClient(program.config["MONGO_URI"])
db = client.MyDB  # database MyDB
collection = db.user  # inside MyDB, a collection called user

UPLOAD_FOLDER = 'static/saved_files'  # uploaded files are stored here
program.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# program.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 #maximum size a file can have i.e. 16 MB
# File extensions (lower case, without the dot) that may be uploaded.
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name without any dot is never allowed.
    """
    _stem, dot, extension = filename.rpartition('.')
    # rpartition yields an empty separator when the name contains no dot.
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
# Path of the Tesseract executable used by pytesseract. Set the TESSERACT_CMD
# environment variable to point at a different installation; the original
# developer-machine path is kept as the default.
pt.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Users\ap888\Desktop\Internship CLIDE\Transliteration_flask\env\Tesseract.exe',
)
@program.route('/')  # main page, e.g. http://127.0.0.1:5000/
def index():
    """Render the ``upload.html`` template as the landing page."""
    landing_page = render_template('upload.html')
    return landing_page
def _extract_text(file_path, extension):
    """Return the text content of the stored upload at *file_path*.

    *extension* is the lower-cased file extension without the dot:
    ``"pdf"`` files are rasterised page by page and OCR'd, ``"txt"`` files
    are read as UTF-8, and anything else is opened as an image and OCR'd.
    OCR uses Tesseract's Hindi language data (``lang="hin"``).
    """
    if extension == "pdf":
        pages = convert_from_path(file_path)
        # The leading space matches the text layout produced for PDFs so far.
        return " " + "".join(pt.image_to_string(page, lang="hin") for page in pages)
    if extension == "txt":
        # The context manager closes the handle even if reading fails.
        with open(file_path, 'r', encoding='utf-8') as text_file:
            return text_file.read()
    with Image.open(file_path) as img:
        return pt.image_to_string(img, lang="hin")


@program.route('/upload', methods=['POST'])  # upload page
def upload():
    """Store an uploaded file and index its text content in MongoDB.

    Reads the ``inputFile`` form field, saves it under a sanitised name in
    the configured upload folder, extracts its text and inserts a
    ``{'file_name', 'content'}`` document into ``db.user``. Always redirects
    to the main page with a flash message describing the outcome.
    """
    file = request.files.get('inputFile')
    # secure_filename may strip characters (or the whole name), so validate
    # the sanitised name: it is the one the file is stored and reopened under.
    filename = secure_filename(file.filename) if file else ''
    if not filename or not allowed_file(filename):
        flash('Invalid Uplaod only txt, pdf, png, jpg, jpeg, gif')
        return redirect('/')

    # One path is used for both saving and reading, so the two can never
    # disagree and no OS-specific separator is hard-coded.
    file_path = os.path.join(program.config['UPLOAD_FOLDER'], filename)
    file.save(file_path)

    extension = filename.rsplit('.', 1)[1].lower()
    text = _extract_text(file_path, extension)
    # Record the on-disk name so the database entry refers to the stored file.
    db.user.insert_one({'file_name': filename, 'content': text})

    flash(file.filename + ' is successfully uploaded to the database!')
    return redirect('/')
# This function reads the search text submitted by the user
@program.route("/search", methods=["POST", "GET"])
def search():
    """Transliterate the submitted query and redirect to its results page.

    A POST reads the ``user_search`` form field, strips surrounding
    whitespace, passes it to ``transliterate.transliteration`` with the Hindi
    code and redirects to ``transliteration_search``. Any other method
    redirects to the main page.
    """
    if request.method != "POST":
        return redirect('/')

    # Transliteration codes can be found by inspecting the HTTP response of
    # the Google Input Tools page.
    lang = {'hindi': "hi-t-i0-und"}
    query = request.form["user_search"].strip()
    transliterated = transliterate.transliteration(query, lang['hindi'])
    return redirect(url_for("transliteration_search", search=transliterated))
def _best_matching_line(search, search_terms, content):
    """Return the line of *content* that best matches the query, or None.

    Candidate lines are ranked by fuzzywuzzy's ``partial_ratio`` against
    *search*; among those, the first line containing the most *search_terms*
    wins, provided it contains at least half of them (rounded up).
    """
    # Lines shorter than the query are discarded. A new list is built because
    # removing items from a list while iterating over it skips elements.
    candidates = [line for line in content.split("\n") if len(line) >= len(search)]
    if not candidates:
        return None

    # extract() returns (line, score) pairs holding the original candidate
    # strings, so no second length filter is needed on its output.
    matches = process.extract(search, candidates, scorer=fuzz.partial_ratio)

    required = len(search_terms) - (len(search_terms) // 2)
    best_line, best_count = None, -1
    for line, _score in matches:
        count = sum(1 for term in search_terms if term in line)
        # Strict comparison keeps the earliest line on ties.
        if count >= required and count > best_count:
            best_line, best_count = line, count
    return best_line


@program.route("/<search>")
def transliteration_search(search):
    """Search every stored document for *search* and render the results.

    For each document in ``collection`` the best matching line is located;
    documents whose best line scores at least 25 with ``fuzz.partial_ratio``
    are listed in ``result.html``, ordered by descending score, together with
    the matching line.
    """
    search_terms = [term for term in search.split(" ") if term]
    matching_content = {}
    result = {}
    for doc in collection.find():
        best_line = _best_matching_line(search, search_terms, doc["content"])
        if best_line is None:
            continue
        accuracy = fuzz.partial_ratio(best_line, search)
        if accuracy >= 25:
            result[doc["file_name"]] = accuracy  # file name -> matching accuracy
            matching_content[doc["file_name"]] = best_line  # file name -> matching line
    sort_result = sorted(result.items(), key=lambda item: item[1], reverse=True)
    return render_template(
        'result.html',
        file_names=sort_result,
        searching_for=search,
        number=len(sort_result),
        content=matching_content,
    )
# Start the development server only when this file is executed directly.
if __name__ == "__main__":
    # debug=True turns on Flask's debug mode; intended for local development.
    program.run(debug=True)