'''
CS410 Final Project, UIUC - Fall 2019
Team Wolfram

@Authors:
- Pranav Velamakanni (pranavv2@illinois.edu)
- Tarik Koric (koric1@illinois.edu)

Summary:
Requirements: Python 3+
Modules: see README.md for a complete list

This project provides sentiment analysis of live tweets fetched from Twitter.
Predictions are made by 3 models we trained on a set of 1.6 million tweets.
All of the models below have been trained and pickled to files that are loaded here.
Refer to TrainModel.ipynb for the training code.
# LogisticRegression - accuracy ~77%
# Naive-Bayes - accuracy ~76%
# Neural Network (single layer with 100 units) - accuracy ~71%
'''

import logging
import argparse
import pickle
import os
import time
from collections import Counter

import tweepy
from tweepy import OAuthHandler
from nltk.corpus import stopwords  # run nltk.download('stopwords') once before this import
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt

### Pickled models
base_pickle_dir = 'Pickled data'  # This directory contains the pickled models
models = ['LR.pickle', 'nn.pickle', 'naive-bayes.pickle']  # File names of the models
models_path = [os.path.join(base_pickle_dir, file) for file in models]


class Model:
    '''
    This class creates objects for the pre-trained models.
    '''

    ## TF-IDF vectorizer required to transform the tweet
    # Vectorizer specifications: max features - 10,000, n-gram range - (1, 2)
    vector = None

    def __init__(self, model=None):
        self.models = dict(zip(['LogisticRegression', 'NeuralNetwork', 'NaiveBayes'], models_path))
        if not model or model not in self.models:
            model = 'LogisticRegression'
        self.model = self.import_model(self.models.get(model))
        if not Model.vector:
            Model.vector = self.init_vector()

    def import_model(self, model_path):
        '''
        Loads the corresponding model from its pickle file.
        '''
        with open(model_path, 'rb') as md:
            return pickle.load(md)

    def init_vector(self):
        '''
        Loads the trained TF-IDF vectorizer from its pickle file.
        '''
        with open(os.path.join(base_pickle_dir, 'vector.pickle'), 'rb') as vc:
            return pickle.load(vc)

    def label_prediction(self, prediction):
        '''
        Converts an integer prediction to a string label.
        '''
        return 'Positive' if prediction[-1] == 1 else 'Negative'

    def predict(self, data):
        '''
        Cleans and transforms a tweet (or an iterable of tweets) and predicts sentiment.
        '''
        if isinstance(data, str):
            return self.label_prediction(self.model.predict(Model.vector.transform([Twitter.clean(data)])))
        result = list()
        for tweet in data:
            result.append(self.label_prediction(self.model.predict(Model.vector.transform([Twitter.clean(tweet)]))))
        return result
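
# Minimal usage sketch (hypothetical tweet text; assumes the pickled models
# exist under 'Pickled data/'):
#   model = Model('LogisticRegression')
#   model.predict('I loved the movie!')            # -> 'Positive' or 'Negative'
#   model.predict(['great day', 'awful service'])  # -> list of labels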


class Twitter:
    '''
    This class creates an object that enables interaction with the Twitter API.
    '''

    def __init__(self):
        '''
        Initializes a Twitter object with authentication.
        '''
        # Keys and tokens from the Twitter Dev Console
        consumer_key = 'gLRNOAhuVMPDvtr5aOvYqZ6Ze'
        consumer_secret = '6n9F6Ieedd97SrtvZFiRvf5k5uognXDEYTUabsnIidKHH3PaDA'
        access_token = '919706112261312512-fP82zHMs27OeeIsVtVXpNrVEt2CBSBH'
        access_token_secret = 'zPUl78nX5hNONyBy6ei943TTKgonzN0JXhLfteYVQ5YKS'
        # Attempt authentication
        try:
            # Create OAuthHandler object
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            # Set access token and secret
            self.auth.set_access_token(access_token, access_token_secret)
            # Create tweepy API object to fetch tweets
            self.api = tweepy.API(self.auth)
        except Exception:
            logging.fatal('Error: Authentication failed')

    @classmethod
    def clean(cls, tweet):
        '''
        Classmethod: cleans a tweet by removing stopwords, hashtag symbols,
        @ mentions and websites, then stems each remaining word.
        '''
        stemmer = PorterStemmer()
        stage1 = [word for word in tweet.lower().split() if word not in stopwords.words('english')]  # stopword removal
        stage2 = [word[1:] if word.startswith('#') else word for word in stage1]  # hashtag symbol removal
        stage3 = [stemmer.stem(word) for word in stage2
                  if not any([word.startswith('@'), word.startswith('http'), word.startswith('www')])]  # drop @ mentions and websites, then stem
        return ' '.join(stage3)
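
    # Example (hypothetical input; exact output depends on the stemmer):
    #   Twitter.clean('Loving #NLP with @friend http://t.co/x')
    #   -> 'love nlp'  (stopword dropped, '#' stripped, mention and URL removed, words stemmed)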

    def get_tweets_by_user(self, user, count=10):
        '''
        Fetches tweets for a given user (by screen name).
        The maximum count supported by Twitter is 200.
        '''
        try:
            tweet_object = self.api.user_timeline(screen_name=user, count=count)
            return [tweet.text for tweet in tweet_object]
        except tweepy.TweepError as e:
            logging.exception('Failed to fetch tweets: {}'.format(str(e)))

    def search_tweets(self, query, count=10):
        '''
        Searches for tweets matching a query.
        NOTE: This does not perform an exhaustive search.
        '''
        try:
            tweet_object = self.api.search(q=query, count=count)
            return [tweet.text for tweet in tweet_object]
        except tweepy.TweepError as e:
            logging.exception('Failed to fetch tweets: {}'.format(str(e)))
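
# Minimal usage sketch (hypothetical handle and query; requires valid credentials):
#   tw = Twitter()
#   tweets = tw.get_tweets_by_user('nasa', count=5)
#   results = tw.search_tweets('machine learning', count=5)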


class Classifier:
    '''
    Creates an object that loads all the models to perform analysis.
    '''

    def __init__(self):
        self.LR = Model('LogisticRegression')
        self.NB = Model('NaiveBayes')
        self.NN = Model('NeuralNetwork')
        self.models = {'LogisticRegression': self.LR,
                       'NaiveBayes': self.NB,
                       'NeuralNetwork': self.NN}

    def weighted_average(self, data):
        '''
        The prediction of the Classifier is weighted to give models with
        higher accuracy more weight.
        Ranking based on accuracy: 1 - LR, 2 - NB, 3 - NN.
        '''
        # Weights correspond to the ranking above.
        weights = {'LogisticRegression': 0.40,
                   'NaiveBayes': 0.35,
                   'NeuralNetwork': 0.25}
        total = {'Positive': 0, 'Negative': 0}
        for model, score in data.items():
            total[score] += weights[model]
        winner = max(total, key=total.get)
        return (winner, total[winner])
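
    # Worked example: the predictions {'LogisticRegression': 'Positive',
    # 'NaiveBayes': 'Negative', 'NeuralNetwork': 'Positive'} give
    # Positive = 0.40 + 0.25 = 0.65 and Negative = 0.35,
    # so the weighted result is ('Positive', 0.65).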

    def predict(self, text, generate_summary=True):
        '''
        Predicts the sentiment of a tweet using all the imported models.
        '''
        predictions = dict()
        if isinstance(text, str):
            for name, model in self.models.items():
                predictions[name] = model.predict(text)
            return self.get_summary(predictions) if generate_summary else predictions

    def get_summary(self, predictions):
        '''
        Using the raw predictions, generates a weighted pretty-printed summary.
        '''
        result = str()
        for name, score in predictions.items():
            result += '{}: {}\n'.format(name, score)
        final_score = self.weighted_average(predictions)
        result += 'Prediction: {} with a probability of {}%\n'.format(final_score[0], final_score[-1] * 100)
        return result

    def visualize(self, results, title=None, save_to_file=False):
        '''
        Provides a pie chart that summarizes the predicted sentiments.
        '''
        final_scores_combined = Counter([self.weighted_average(result)[0] for result in results])
        labels = list(final_scores_combined.keys())
        sizes = list(final_scores_combined.values())
        colors = ['lightcoral' if label == 'Negative' else 'yellowgreen' for label in labels]
        explode = [0.1] + [0] * (len(labels) - 1)  # offset the first slice; handles one or two labels
        plt.pie(sizes, labels=labels, explode=explode, colors=colors, shadow=True, autopct='%1.1f%%')
        plt.axis('equal')
        if title:
            plt.title(title)
        if save_to_file:
            plt.savefig('TweetSummaryPlot.png')
            logging.info('Plot saved as TweetSummaryPlot.png')
        else:
            plt.show()

    def process_data(self, data, user=None, save_to_file=False, visualize=False):
        '''
        Prints predictions for a list of tweets, optionally saving them to a
        file and optionally providing a visual summary.
        '''
        predictions, results = list(), list()
        for tweet in data:
            print('Tweet: {}'.format(tweet))
            prediction = self.predict(tweet, generate_summary=False)
            result = self.get_summary(prediction)
            predictions.append(prediction)
            results.append(result)
            print(result)
        if save_to_file:
            with open('TweetAnalysis-{}.txt'.format(user), 'w') as file:
                for tweet, result in zip(data, results):
                    file.write(tweet + '\n')
                    file.write(result)
            logging.info('File saved as TweetAnalysis-{}.txt'.format(user))
        if visualize:
            self.visualize(predictions, title=user, save_to_file=save_to_file)
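
# Minimal usage sketch (hypothetical tweet text; assumes the pickled models
# and valid Twitter credentials are in place):
#   classifier = Classifier()
#   print(classifier.predict('What a fantastic launch!'))  # weighted summary string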


class TwitterStreamListener(tweepy.StreamListener):
    '''
    Creates a Twitter stream listener that classifies tweets as they arrive.
    '''

    def __init__(self, topic=None, classifier=None, save_to_file=False, time_limit=20, visualize=False):
        self.predictions = list()
        self.limit = time_limit
        self.start = time.time()
        self.model = classifier if classifier else Classifier()
        self.save_to_file = save_to_file
        self.visualize = visualize
        self.topic = topic
        if self.save_to_file:
            self.open_file = open('TweetStreamAnalysis.txt', 'w')
        super(TwitterStreamListener, self).__init__()

    def on_status(self, data):
        '''
        Processes the incoming stream of tweets until the time limit expires.
        '''
        if (time.time() - self.start) < self.limit:
            prediction = self.model.predict(str(data.text), generate_summary=False)
            summary = self.model.get_summary(prediction)
            self.predictions.append(prediction)
            print('Tweet: {}'.format(data.text))
            print(summary)
            if self.save_to_file:
                self.open_file.write('Tweet: {}\n'.format(data.text))
                self.open_file.write(summary)
            time.sleep(0.25)  # This sleep ensures that stdout is not flooded with tweets.
            return True
        else:
            if self.save_to_file:
                self.open_file.close()
                logging.info('File saved as TweetStreamAnalysis.txt')
            if self.visualize:
                self.model.visualize(self.predictions, title=self.topic, save_to_file=self.save_to_file)
            return False

    def on_error(self, status):
        '''
        Handles an error status by terminating the stream.
        '''
        logging.error('Terminating program, error: {}'.format(status))
        return False


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tweet sentiment analyzer')
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--user', '-u', type=str, default=None, help='Twitter username whose tweets to fetch')
    group.add_argument('--stream', nargs='+', type=str, default=None, help='Stream a list of topics from Twitter')
    parser.add_argument('--file', action='store_true', default=False, help='Store tweets and analysis to a file')
    parser.add_argument('--visualize', action='store_true', default=False, help='Provide a pie chart summarizing the predictions')
    parser.add_argument('--count', '-c', type=lambda x: 200 if int(x) > 200 else abs(int(x)), default=10, help='Number of tweets to fetch (capped at 200)')
    parser.add_argument('--time', type=int, default=20, help='Time (in seconds) to stream a topic')
    args = parser.parse_args()

    if args.user:
        # Initialize Twitter object
        tw_connection = Twitter()
        tweets = tw_connection.get_tweets_by_user(user=args.user, count=args.count)
        # Initialize classifier
        model = Classifier()
        model.process_data(data=tweets, user=args.user, save_to_file=args.file, visualize=args.visualize)

    if args.stream:
        tw_connection = Twitter()
        # Initialize stream object
        streamListener = TwitterStreamListener(topic=args.stream, save_to_file=args.file, time_limit=args.time, visualize=args.visualize)
        stream = tweepy.Stream(auth=tw_connection.api.auth, listener=streamListener)
        stream.filter(track=args.stream)
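
# Usage examples (hypothetical handle and topics):
#   python app.py --user nasa --count 50 --visualize
#   python app.py --stream python machinelearning --time 30 --file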