-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathapp.py
136 lines (92 loc) · 3.54 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from flask import Flask, jsonify, request, render_template
import sys
sys.path.append('Scraper/')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from wordcloud import WordCloud
from sklearn.linear_model import SGDClassifier, LogisticRegression
import string
import re
import spacy
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from database_module import DatabaseConnector
# Shared text-processing resources consumed by spacy_tokenizer.
# Blank English pipeline, used only to split sentences into tokens.
parser = English()
# spaCy's built-in English stop-word set.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# ASCII punctuation as ONE string, so `x in punctuations` is a substring test.
punctuations = string.punctuation
class Predictors(TransformerMixin):
    """Stateless pipeline step that runs ``clean_text`` over every document.

    The class learns nothing from the data: ``fit`` is a no-op and
    ``get_params`` reports no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` so the step can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(doc)`` for each ``doc`` in ``X``."""
        # Cleaning Text
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: this transformer has no tunable parameters."""
        return dict()
# Basic function to clean the text
def clean_text(text):
    """Normalise a raw document into lower-case, single-space-separated text.

    Steps, in order:
      1. Coerce the input to ``str`` (so ``None`` becomes ``"None"``).
      2. Turn literal backslash-n / -t / -r sequences into newlines and drop
         ``'b`` fragments and `` nan `` tokens.
      3. Remove literal ``\\xNN`` escapes, runs of two or more digits, URLs,
         the substrings ``RT`` / ``cc``, hashtags, mentions and ASCII
         punctuation.
      4. Lower-case the text and drop ``xx...`` placeholder runs.
      5. Collapse all whitespace to single spaces and strip the ends.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text)

    # These are two-character sequences (a backslash followed by a letter),
    # not real control characters; they become whitespace that is collapsed
    # at the end.
    for escaped in ('\\n', '\\t', '\\r'):
        text = text.replace(escaped, '\n')
    text = text.replace("'b", ' ')

    text = re.sub(r' nan ', ' ', text)
    text = re.sub(r'\\x[0-9a-z]{2}', ' ', text)  # literal "\xNN" escapes
    text = re.sub(r'[0-9]{2,}', ' ', text)  # runs of 2+ digits
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    # NOTE(review): no word boundaries, so "cc" inside a word is removed too.
    # Kept as-is to stay consistent with the text the model was trained on.
    text = re.sub(r'RT|cc', ' ', text)  # remove RT and cc
    text = re.sub(r'#\S+', ' ', text)  # remove hashtags
    text = re.sub(r'@\S+', ' ', text)  # remove mentions
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  # remove punctuations

    # str.lower() returns a new string; the result must be assigned.
    text = text.lower()
    # After lower-casing, one pattern covers both "xx" and "XX" placeholders.
    text = re.sub(r'xx+', ' ', text)

    # Collapse whitespace last so earlier substitutions cannot leave
    # runs of spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
# Tokenizer function
def spacy_tokenizer(sentence):
    """Split ``sentence`` into normalised terms, dropping stop words and punctuation.

    Each token produced by the module-level ``parser`` is mapped to its
    lower-cased, stripped lemma. Tokens whose lemma is the ``"-PRON-"``
    placeholder use their lower-cased surface form instead. Terms found in
    ``stop_words`` or in ``punctuations`` are discarded.

    Returns:
        A list of the surviving terms, in document order.
    """
    kept_terms = []
    for token in parser(sentence):
        if token.lemma_ != "-PRON-":
            term = token.lemma_.lower().strip()
        else:
            term = token.lower_
        # `punctuations` is a plain string, so this is a substring test.
        if term in stop_words or term in punctuations:
            continue
        kept_terms.append(term)
    return kept_terms
# Vectorizer
# Bag-of-words and TF-IDF vectorizers; both tokenise with spacy_tokenizer
# and keep at most 3500 features.
bow_vector = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
    max_features=3500,
)
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    max_features=3500,
)

# SGD-trained linear classifier with hinge loss and L2 penalty.
# NOTE(review): bow_vector, tfidf_vector and classifier are not referenced by
# the routes visible in this file -- presumably left over from training.
classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# Flask application object that the route decorators register on.
app = Flask(__name__)
@app.route('/')
def index():
    """Render ``index.html`` with the profiles returned by ``DatabaseConnector.readDB()``."""
    connector = DatabaseConnector()
    all_profiles = connector.readDB()
    return render_template("index.html", profiles=all_profiles)
@app.route('/predict', methods=['GET'])
def predict():
    """Classify the submitted skills and render the matching profiles.

    Query parameters:
        skills: Free text passed to the pickled pipeline for classification.
        location: Passed to ``DatabaseConnector.search_query`` together with
            the predicted class.

    Returns:
        The rendered ``index.html`` template with ``profiles`` set to the
        result of the search.
    """
    data = {
        'skills': request.args.get('skills'),
        'location': request.args.get('location'),
    }
    # One-row frame; only the `skills` column (a Series) is fed to the model.
    testing_data = pd.DataFrame(data, index=[0]).skills

    # Forward slashes work on Windows and POSIX alike; the previous
    # backslash-separated literal only resolved on Windows.
    # The `with` block guarantees the file handle is closed after loading.
    # NOTE: pickle.load executes code from the file; it must stay a trusted,
    # locally produced artefact.
    with open('Classifier/Models/localmodel2.pickle', 'rb') as pickle_in:
        pipe = pickle.load(pickle_in)

    predictions = pipe.predict(testing_data)

    db = DatabaseConnector()
    # Only a single row was submitted, so the first prediction is the answer.
    profiles = db.search_query(predictions[0], data['location'])
    return render_template("index.html", profiles=profiles)
# Start Flask's built-in server only when this file is run as a script.
# NOTE(review): debug=True turns on Flask's debug mode; intended for local
# development only -- confirm it is not used for a public deployment.
if __name__ == '__main__':
    app.run(debug=True)