-
Notifications
You must be signed in to change notification settings - Fork 0
/
EmailSpamDetection.py
144 lines (131 loc) · 4.85 KB
/
EmailSpamDetection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#creator : AbssZy
import nltk
import random
import os
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
import string
from warnings import simplefilter
simplefilter(action='ignore',category=FutureWarning)
from warnings import simplefilter
simplefilter(action='ignore',category=FutureWarning)
def pr(word, msg):
    """Return 1 when *word* is contained in *msg*, otherwise 0.

    Containment is whatever the ``in`` operator means for *msg*: a
    substring test for a string, an element test for a list or set.
    """
    return 1 if word in msg else 0
def find_feature(word_features, message):
    """Build the presence-feature mapping of *message* over a vocabulary.

    Args:
        word_features: Iterable of vocabulary words; each becomes a key.
        message: The message to inspect; passed unchanged to ``pr``.

    Returns:
        A dict mapping every vocabulary word to the 1/0 flag that ``pr``
        reports for it against *message*.
    """
    return {term: pr(term, message) for term in word_features}
def create_mnb_classifier(trainingset, testingset):
    """Train a Multinomial Naive Bayes classifier and print its accuracy.

    Args:
        trainingset: Labeled samples handed to ``SklearnClassifier.train``.
        testingset: Iterable of samples where ``sample[0]`` is the feature
            mapping and ``sample[1]`` is the expected label.

    Returns:
        The trained ``SklearnClassifier`` wrapping ``MultinomialNB``.
    """
    print("\nMultinomial Naive Bayes classifier is being trained and created....")
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(trainingset)

    total = 0
    correct = 0
    for sample in testingset:
        total += 1
        if MNB_classifier.classify(sample[0]) == sample[1]:
            correct += 1

    # Guard the division so an empty testing set reports 0.0 instead of
    # raising ZeroDivisionError.
    accuracy = correct / total * 100 if total else 0.0
    print("Multinomial Classifier accuracy = " + str(accuracy))
    return MNB_classifier
def create_bnb_classifier(trainingset, testingset):
    """Train a Bernoulli Naive Bayes classifier and print its accuracy.

    Args:
        trainingset: Labeled samples handed to ``SklearnClassifier.train``.
        testingset: Iterable of samples where ``sample[0]`` is the feature
            mapping and ``sample[1]`` is the expected label.

    Returns:
        The trained ``SklearnClassifier`` wrapping ``BernoulliNB``.
    """
    print("\nBernoulli Naive Bayes classifier is being trained and created...")
    BNB_classifier = SklearnClassifier(BernoulliNB())
    BNB_classifier.train(trainingset)

    total = 0
    correct = 0
    for sample in testingset:
        total += 1
        if BNB_classifier.classify(sample[0]) == sample[1]:
            correct += 1

    # Guard the division so an empty testing set reports 0.0 instead of
    # raising ZeroDivisionError.
    accuracy = correct / total * 100 if total else 0.0
    print("BernoulliNB accuracy precent = " + str(accuracy))
    return BNB_classifier
def create_logistic_regression_classifier(trainingset, testingset):
    """Train a Logistic Regression classifier and print its accuracy.

    Args:
        trainingset: Labeled samples handed to ``SklearnClassifier.train``.
        testingset: Iterable of samples where ``sample[0]`` is the feature
            mapping and ``sample[1]`` is the expected label.

    Returns:
        The trained ``SklearnClassifier`` wrapping ``LogisticRegression``.
    """
    print("\nLogistic Regreesion classifier is being trained and created...")
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(trainingset)

    total = 0
    correct = 0
    for sample in testingset:
        total += 1
        if LogisticRegression_classifier.classify(sample[0]) == sample[1]:
            correct += 1

    # Guard the division so an empty testing set reports 0.0 instead of
    # raising ZeroDivisionError.
    accuracy = correct / total * 100 if total else 0.0
    print("Logistic Regression classifier accuracy = " + str(accuracy))
    return LogisticRegression_classifier
def create_training_testing():
    """Read the SMS corpus and build the vocabulary and train/test feature sets.

    Each line of ``SMSSpamCollection.txt`` is read as ``<label>\\t<text>``.
    A label of ``"spam"`` is kept; any other label is treated as ``"ham"``.
    Lines without a tab (including blank lines) carry no label column and
    are skipped.

    The message text is lower-cased and kept as a plain string, so that
    ``find_feature`` sees the same kind of value during training as it does
    for a lower-cased message entered by the user.

    Returns:
        A tuple ``(word_features, featureset, trainingset, testingset)``:
        ``word_features`` is the list of up to 2000 most frequent
        non-stop-words; ``featureset`` is a shuffled list of
        ``(feature_dict, label)`` pairs; ``trainingset`` is its first three
        quarters and ``testingset`` the remainder.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding. NOTE(review): assumes the corpus is UTF-8; bytes
    # that are not are replaced rather than aborting the run.
    with open("SMSSpamCollection.txt", encoding="utf-8", errors="replace") as f:
        messages = f.read().split('\n')

    print("Creating bag of words. ")
    # Loop-invariant work: load the stop words once (as a set for fast
    # lookups) and build a single punctuation -> space translation table.
    stop = set(stopwords.words('english'))
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )

    all_message = []
    all_words = []
    for message in messages:
        label, separator, text = message.partition('\t')
        if not separator:
            continue
        category = "spam" if label == "spam" else "ham"

        # Keep only the text (not the label column) so the label cannot
        # leak into the features.
        text = text.lower()
        all_message.append([text, category])

        # Lower-casing happens before the stop-word test so capitalised
        # stop words are filtered too; split() drops empty tokens.
        for word in text.translate(punctuation_to_space).split():
            if word not in stop:
                all_words.append(word)
    print("Bag of words created.")

    random.shuffle(all_message)

    # most_common() ranks by frequency; keys() would only give the first
    # words encountered.
    all_words = nltk.FreqDist(all_words)
    word_features = [word for word, _count in all_words.most_common(2000)]

    print("\nCreating feature set. ")
    featureset = [
        (find_feature(word_features, message), category)
        for (message, category) in all_message
    ]
    print("Feature set created.")

    split_at = int(len(featureset) * 3 / 4)
    trainingset = featureset[:split_at]
    testingset = featureset[split_at:]
    print("\nLength of feature set ", len(featureset))
    print("Length of training set", len(trainingset))
    print("Length of testing set", len(testingset))
    return word_features, featureset, trainingset, testingset
def main():
    """Train the three classifiers and classify one user-supplied message.

    Builds the data sets, trains the Multinomial NB, Bernoulli NB and
    Logistic Regression classifiers (each prints its own accuracy), reads a
    message from standard input, prints every classifier's label for it and
    finally prints the majority verdict: "ham" when at least two of the
    three classifiers say "ham", otherwise "spam".
    """
    word_features, _featureset, trainingset, testingset = create_training_testing()

    # List items are evaluated left to right, so the training order (and
    # therefore the order of the accuracy reports) is MNB, BNB, LR.
    classifiers = [
        ("Multinomial Naive Bayes", create_mnb_classifier(trainingset, testingset)),
        ("Bernoulli Naive Bayes", create_bnb_classifier(trainingset, testingset)),
        ("Logistic Regression", create_logistic_regression_classifier(trainingset, testingset)),
    ]

    mail = input('enter message:').lower()
    # The feature mapping depends only on the vocabulary and the message,
    # so it is built once and shared by all classifiers.
    feature = find_feature(word_features, mail)

    ham_votes = 0
    for title, classifier in classifiers:
        print("\n")
        print(title)
        print(" ")
        # Classify once and reuse the label for both the report and the vote.
        label = classifier.classify(feature)
        print(label)
        if label == "ham":
            ham_votes += 1

    if ham_votes >= 2:
        print("\n*******\n message is classified as ham\n*******\n")
    else:
        print("\n*******\n message is classified as spam\n*******\n")
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not train the models or block on input().
if __name__ == "__main__":
    main()