-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSVM_model.py
644 lines (424 loc) · 18.5 KB
/
SVM_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
# coding: utf-8
# # Amazon Fine Food Reviews Analysis
#
#
# Data Source: https://www.kaggle.com/snap/amazon-fine-food-reviews <br>
#
# EDA: https://nycdatascience.com/blog/student-works/amazon-fine-foods-visualization/
#
#
# The Amazon Fine Food Reviews dataset consists of reviews of fine foods from Amazon.<br>
#
# Number of reviews: 568,454<br>
# Number of users: 256,059<br>
# Number of products: 74,258<br>
# Timespan: Oct 1999 - Oct 2012<br>
# Number of Attributes/Columns in data: 10
#
# Attribute Information:
#
# 1. Id
# 2. ProductId - unique identifier for the product
# 3. UserId - unique identifier for the user
# 4. ProfileName
# 5. HelpfulnessNumerator - number of users who found the review helpful
# 6. HelpfulnessDenominator - number of users who indicated whether they found the review helpful or not
# 7. Score - rating between 1 and 5
# 8. Time - timestamp for the review
# 9. Summary - brief summary of the review
# 10. Text - text of the review
#
#
# #### Objective:
# Given a review, determine whether the review is positive (rating of 4 or 5) or negative (rating of 1 or 2).
#
# <br>
# [Q] How to determine if a review is positive or negative?<br>
# <br>
# [Ans] We could use Score/Rating. A rating of 4 or 5 can be considered as a positive review. A rating of 1 or 2 can be considered as negative one. A review of rating 3 is considered neutral and such reviews are ignored from our analysis. This is an approximate and proxy way of determining the polarity (positivity/negativity) of a review.
#
#
#
# # [1]. Reading Data
# ## [1.1] Loading the data
#
# The dataset is available in two forms
# 1. .csv file
# 2. SQLite Database
#
# In order to load the data, We have used the SQLITE dataset as it is easier to query the data and visualise the data efficiently.
# <br>
#
# Here as we only want to get the global sentiment of the recommendations (positive or negative), we will purposefully ignore all Scores equal to 3. If the score is above 3, then the recommendation will be set to "positive". Otherwise, it will be set to "negative".
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore")
# In[2]:
# using the SQLite Table to read data.
# NOTE(review): expects database.sqlite in the current working directory.
con = sqlite3.connect('database.sqlite')
#filtering only positive and negative reviews i.e.
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query("""SELECT * FROM Reviews WHERE Score != 3""", con)
# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    """Map a raw 1-5 star Score to a binary sentiment label.

    Scores below 3 (i.e. 1 or 2) are negative -> 0; everything else
    (4 or 5, since Score==3 rows were filtered out upstream) -> 1.
    """
    return 0 if x < 3 else 1
#changing reviews with score less than 3 to be 0 and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition)
# Overwrite Score in place: the column is now the binary class label.
filtered_data['Score'] = positiveNegative
# In[3]:
# Shape and preview after binarising the label.
print(filtered_data.shape)
filtered_data.head()
# # [2] Exploratory Data Analysis
# ## [2.1] Data Cleaning: Deduplication
#
# It is observed (as shown in the table below) that the reviews data had many duplicate entries. Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data. Following is an example:
# In[4]:
# Example of duplication: one user with identical review fields across
# several ProductIds (the same product in different flavours/sizes).
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display
# As it can be seen above that same user has multiple reviews with same values for HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary and Text and on doing analysis it was found that <br>
# <br>
# ProductId=B000HDOPZG was Loacker Quadratini Vanilla Wafer Cookies, 8.82-Ounce Packages (Pack of 8)<br>
# <br>
# ProductId=B000HDL1RQ was Loacker Quadratini Lemon Wafer Cookies, 8.82-Ounce Packages (Pack of 8) and so on<br>
#
# It was inferred after analysis that reviews with same parameters other than ProductId belonged to the same product just having different flavour or quantity. Hence in order to reduce redundancy it was decided to eliminate the rows having same parameters.<br>
#
# The method used for the same was that we first sort the data according to ProductId and then just keep the first similar product review and delete the others. e.g. in the above just the review for ProductId=B000HDL1RQ remains. This method ensures that there is only one representative for each product and deduplication without sorting would lead to possibility of different representatives still existing for the same product.
# In[5]:
#Sorting data according to ProductId in ascending order
# Sorting first makes keep='first' deterministic: exactly one
# representative row survives per duplicate group.
sorted_data=filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
# In[6]:
#Deduplication of entries
# Rows identical in (UserId, ProfileName, Time, Text) are the same review
# cross-posted to product variants; keep only the first occurrence.
final = sorted_data.drop_duplicates(subset = {"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape
# In[7]:
#Checking to see how much % of data still remains
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100
# <b>Observation:-</b> It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calculations
# In[8]:
# NOTE(review): in SQL, AND binds tighter than OR, so the Score filter
# applies only to the Id=44737 branch here; harmless for this display.
display = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""", con)
display
# In[9]:
# Drop rows with impossible helpfulness counts (numerator > denominator).
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
# In[10]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)
#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()
# # [3] Preprocessing
# ## [3.1]. Preprocessing Review Text
#
# Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.
#
# Hence in the Preprocessing phase we do the following in the order below:-
#
# 1. Begin by removing the html tags
# 2. Remove any punctuations or limited set of special characters like , or . or # etc.
# 3. Check if the word is made up of english letters and is not alpha-numeric
# 4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
# 5. Convert the word to lowercase
# 6. Remove Stopwords
# 7. Finally Snowball Stemming the word (it was observed to be better than Porter Stemming)<br>
#
# After which we collect the words used to describe positive and negative reviews
# In[11]:
# Sanity check: print the index and text of the first review that still
# contains HTML tags, motivating the cleanhtml() step below.
# (Rewritten with enumerate instead of a manual counter + stray
# semicolons; behaviour is unchanged.)
import re
for i, sent in enumerate(final['Text'].values):
    if re.findall('<.*?>', sent):
        print(i)
        print(sent)
        break
# In[12]:
# NOTE(review): requires the nltk 'stopwords' corpus to be downloaded
# (nltk.download('stopwords')) — confirm the environment has it.
stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer
def cleanhtml(sentence):
    """Return *sentence* with every HTML tag replaced by a single space."""
    tag_pattern = re.compile('<.*?>')
    return re.sub(tag_pattern, ' ', sentence)
def cleanpunc(sentence):
    """Strip punctuation from *sentence*.

    Quote-like characters (? ! ' " # and literal |) are deleted outright;
    separator-like ones (. , ( ) \\ / and literal |) become spaces.
    """
    without_quotes = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_quotes)
    return spaced
# Quick sanity check: show the stopword set and one sample stem.
print(stop)
print('************************************')
print(sno.stem('tasty'))
# In[13]:
# Step-by-step text cleaning for every review: strip HTML, strip
# punctuation, keep alphabetic words longer than 2 chars, drop stopwords,
# Snowball-stem, and collect per-class word lists. This iterates over the
# whole corpus, so it takes a while.
#
# Changes vs the original cell (behaviour is identical):
#   * final['Score'].values is hoisted out of the loop instead of being
#     rebuilt once per word;
#   * two no-op `else: continue` branches were removed;
#   * `&` on plain booleans replaced with the idiomatic `and`;
#   * manual index counter replaced with enumerate.
str1 = ' '
final_string = []        # cleaned, stemmed text of every review (bytes)
all_positive_words = []  # stems seen in positive (Score == 1) reviews
all_negative_words = []  # stems seen in negative (Score == 0) reviews
s = ''
scores = final['Score'].values  # loop-invariant: one label per review
for i, sent in enumerate(final['Text'].values):
    filtered_sentence = []
    sent = cleanhtml(sent)  # remove HTML tags first
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            # keep purely alphabetic words longer than 2 characters
            if cleaned_words.isalpha() and len(cleaned_words) > 2:
                if cleaned_words.lower() not in stop:
                    s = (sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if scores[i] == 1:
                        all_positive_words.append(s)
                    if scores[i] == 0:
                        all_negative_words.append(s)
    str1 = b" ".join(filtered_sentence)  # final string of cleaned words
    final_string.append(str1)
# In[14]:
# Attach the cleaned/stemmed text as a new column (bytes, space-joined).
final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review
# In[15]:
final.head(3)
# In[16]:
# store final table into an SQlLite table for future.
conn = sqlite3.connect('final.sqlite')
c=conn.cursor()  # NOTE(review): this cursor is never used
conn.text_factory = str
# NOTE(review): conn is never closed explicitly; to_sql commits its own
# transaction, but a conn.close() afterwards would be tidier.
final.to_sql('Reviews', conn, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)
# In[17]:
# Reload the cleaned snapshot from disk (re-running from here does not
# require repeating the expensive preprocessing loop above).
con = sqlite3.connect("final.sqlite")
cleaned_data = pd.read_sql_query("select * from Reviews", con)
# In[18]:
print(cleaned_data.shape)
cleaned_data.head()
# In[19]:
# Class balance after cleaning (1 = positive, 0 = negative).
cleaned_data['Score'].value_counts()
# In[20]:
# Randomly sample reviews for modelling.
# NOTE(review): an earlier comment said 60k, but n=100000 is what runs.
random_sample_data = final.sample(n=100000)
random_sample_data.shape
# In[21]:
# Sort data based on time
# Time ordering enables the temporal train/cv/test split below, so the
# model is always evaluated on reviews later than those it trained on.
final_sorted_time=random_sample_data.sort_values('Time',ascending=True,axis=0)
# In[22]:
final_sorted_time.head(3)
# In[23]:
#data splitting
# Temporal split of the 100k sample: 49k train / 21k cv / 30k test.
y_train=final_sorted_time['Score'][0:49000]
y_cv=final_sorted_time['Score'][49000:70000]
y_test=final_sorted_time['Score'][70000:100000]
# In[24]:
# Matching text splits (same positional slices as the labels above).
train_data=final_sorted_time['CleanedText'][0:49000]
cv_data=final_sorted_time['CleanedText'][49000:70000]
test_data=final_sorted_time['CleanedText'][70000:100000]
# # [4] Featurization
# ## BAG OF WORDS
# ## [4.1] Bi-Grams and n-Grams.
# In[25]:
# Bag-of-words featurisation with unigrams + bigrams. The vocabulary is
# fit on the training split only, then applied unchanged to the CV and
# test splits to avoid leakage.
count_vector = CountVectorizer(ngram_range=(1,2))
bow_train = count_vector.fit_transform(train_data)
bow_cv = count_vector.transform(cv_data)
bow_test= count_vector.transform(test_data)
# In[50]:
# Persist the fitted vectorizer so predict() below can reuse it.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; prefer the standalone joblib package (a hard sklearn
# dependency) and fall back only for very old installs.
try:
    import joblib
except ImportError:  # pragma: no cover - legacy scikit-learn environments
    from sklearn.externals import joblib
joblib.dump(count_vector, 'count_vector.pkl')
# In[26]:
# Shapes are (n_documents, vocabulary_size) for each split.
print('No. of datapoints in training data of BOW',bow_train.shape)
print('No. of datapoints in cross validation data of BOW',bow_cv.shape)
print('No. of datapoints in testing data of BOW',bow_test.shape)
# # Project : SVM
# # Applying SVM
# ## Linear SVM
# ### [5.1.1] Applying Linear SVM on BOW,<font color='red'> SET 1</font>
# In[27]:
# Hyperparameter tuning:
# Grid-search over alpha (regularisation strength) x penalty for a
# linear SVM trained with SGD (hinge loss). CalibratedClassifierCV wraps
# the SVM so predict_proba is available for ROC-AUC scoring.
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, auc
penalties = ['l1', 'l2']
# NOTE(review): these values are passed as SGD's alpha; the variable and
# the print label call them "C", but alpha plays the inverse role of C.
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
cv_scores = []
tr_scores = []
i = 0
for alpha in C:
    for p in penalties:
        m = SGDClassifier(loss='hinge', penalty=p, alpha=alpha, n_jobs=-1)
        # NOTE(review): base_estimator was renamed to estimator in
        # scikit-learn 1.2 — confirm against the installed version.
        clf = CalibratedClassifierCV(base_estimator=m, cv=None)
        clf.fit(bow_train, y_train)
        # AUC on the held-out CV split
        y_score=clf.predict_proba(bow_cv)[:,1]
        scores = roc_auc_score(y_cv, y_score)
        cv_scores.append(scores)
        # AUC on the training split (to eyeball over/under-fitting)
        y_score=clf.predict_proba(bow_train)[:,1]
        scores = roc_auc_score(y_train, y_score)
        tr_scores.append(scores)
        print("CV ROC_AUC Score : ", cv_scores[i], " Train ROC_AUC Score : ", tr_scores[i], "C : ", alpha, " penalty : ", p)
        i += 1
# In[32]:
# Tick labels: one per (alpha, penalty) combination, in the same order
# the tuning loop above appended its scores.
alpha = ['0.0001+L1', '0.0001+L2', '0.001+L1', '0.001+L2', '0.01+L1', '0.01+L2',
         '0.1+L1', '0.1+L2', '1+L1', '1+L2', '10+L1', '10+L2', '100+L1', '100+L2','1000+L1','1000+L2']
plt.figure(figsize=(15,3))
plt.plot(range(len(C)*len(penalties)), tr_scores)
plt.plot(range(len(C)*len(penalties)), cv_scores)
plt.xticks(range(len(C)*len(penalties)), alpha, rotation = 45)
plt.legend(['Train AUC', 'CV AUC'])
plt.xlabel('Hyperparameter(alpha+Regularizer)')
plt.ylabel('AUC value')
plt.title('AUC value VS Hyperparameter Plot\n',size=18)
plt.show()
# From above observation, plot corresponding 0.01 with L2 regularizer shows nice tradeoff of train auc and cv auc.
# # Now Lets see result on Test data i.e on unseen data
# In[38]:
# Retrain with the chosen hyperparameters (alpha=0.01, penalty='l2') and
# wrap in a calibrator so predict_proba is available.
om = SGDClassifier(loss = 'hinge', penalty='l2', alpha=0.01)
model = CalibratedClassifierCV(base_estimator=om, cv=None)
model.fit(bow_train, y_train)
# In[51]:
# Persist the fitted model for the predict() helper below.
joblib.dump(model, 'model.pkl')
# ### Confusion matrix of test data
# In[39]:
# Hard predictions on the held-out test split.
pred_test= model.predict(bow_test)
cm = confusion_matrix(y_test, pred_test)
cm
# In[40]:
# plot confusion matrix to describe the performance of classifier.
import seaborn as sns
class_label = ["negative", "positive"]
df_cm = pd.DataFrame(cm, index = class_label, columns = class_label)
sns.heatmap(df_cm, annot = True, fmt = "d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
# ### ROC Curve
# In[42]:
# ROC curves for train vs test, with AUC values in the legend.
train_fpr, train_tpr, thresholds = roc_curve(y_train, model.predict_proba(bow_train)[:,1])
test_fpr, test_tpr, thresholds = roc_curve(y_test, model.predict_proba(bow_test)[:,1])
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.plot([0,1],[0,1])  # chance diagonal
plt.legend()
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title("ROC Curve")
plt.show()
# In[45]:
# Test-set AUC from calibrated probabilities (reported in conclusions).
pred = model.predict(bow_test)
pred_prob = model.predict_proba(bow_test)[:,1]
test_value= roc_auc_score(y_test, pred_prob)
print("AUC value on test data:",test_value)
# In[44]:
# To show main classification report
# Per-class precision/recall/F1 on the test split.
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
# In[57]:
# BeautifulSoup is used by clean_text() below to strip HTML.
from bs4 import BeautifulSoup
# In[61]:
def decontracted(phrase):
    """Expand common English contractions in *phrase*.

    The two irregular forms (won't, can't) are handled first so the
    generic suffix rules below cannot mangle them.
    """
    rules = [
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    ]
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
# In[65]:
# Custom stopword list (note it includes 'br', the artefact left by
# <br/> tags). NOTE(review): this assignment shadows the `stopwords`
# module imported from nltk.corpus at the top of the file.
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])
# In[62]:
def clean_text(sentance):
    """Normalise a raw review: drop URLs, strip HTML, expand
    contractions, remove digit-bearing tokens and non-letters, then
    lowercase and drop stopwords.
    """
    sentance = re.sub(r"http\S+", "", sentance)            # drop URLs
    sentance = BeautifulSoup(sentance, 'lxml').get_text()  # strip HTML
    sentance = decontracted(sentance)                      # expand contractions
    sentance = re.sub("\S*\d\S*", "", sentance).strip()    # drop words containing digits
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)         # letters only
    # https://gist.github.com/sebleier/554280
    kept = []
    for word in sentance.split():
        lowered = word.lower()
        if lowered not in stopwords:
            kept.append(lowered)
    return ' '.join(kept).strip()
# In[63]:
def predict(string):
    """Classify a raw review string as "Positive" or "Negative".

    Loads the persisted calibrated SVM ('model.pkl') and CountVectorizer
    ('count_vector.pkl') from the working directory, cleans the input the
    same way the training text was cleaned, and returns the label.

    Bug fixes vs the original:
      * the cleanhtml() result was discarded (clean_text was called on
        the raw string) — now the cleaned text is actually chained;
      * the freshly loaded vectorizer was ignored in favour of the
        in-memory global count_vector — now count_vect is used, so the
        function works in a fresh process.
    """
    clf = joblib.load('model.pkl')
    count_vect = joblib.load('count_vector.pkl')
    review_text = cleanhtml(string)
    review_text = clean_text(review_text)
    test_vect = count_vect.transform([review_text])
    pred = clf.predict(test_vect)
    print(pred[0])
    # label 1 -> positive, 0 -> negative (see partition() above)
    if pred[0]:
        prediction = "Positive"
    else:
        prediction = "Negative"
    return prediction
# In[66]:
# End-to-end smoke test of the saved model + vectorizer pipeline.
print(predict('Have been having this since years. Much better option than Bru.Nescafe still managing to do well in market with all the competitors breathing down it\'s neck. Good one!'))
# # [6] Conclusions
# In[48]:
# Summary table of the final model, rendered with PrettyTable.
from prettytable import PrettyTable

# (heading, single-row column values) for the one model trained above.
summary_columns = [
    ("S.NO.", [1]),
    ("MODEL", ['Linear SVM with BOW']),
    ("Best Alpha", [0.01]),
    ("Testing's AUC", [test_value]),
    ("Penalty", ['L2']),
]
ptable = PrettyTable()
for heading, values in summary_columns:
    ptable.add_column(heading, values)
ptable.align = "c"  # centre-align every column
print(ptable)