-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbaiseclassifier.py
162 lines (124 loc) · 6.12 KB
/
baiseclassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# load some useful Python libratries
import pandas as pd # the library for working with data tables
import re
#to download files to local machine
# from google.colab import files
# files.download("data_NBC_1.csv")
from collections import Counter # a class for counting objects (words and text labels, in our case)
# enable pandas to display large texts and look into our data
pd.options.display.max_colwidth = 300
data=pd.read_csv('C:/Users/asaelb/PycharmProjects/yandex_preparation/week1/data_NBC_1.csv')
print(data.shape) # number of rows and columns
# enable pandas to display large texts and look into our data
pd.options.display.max_colwidth = 300
#print(data.shape) # number of rows and columns
#data.head(5)
import nltk
from nltk.tokenize import word_tokenize
#nltk.download()
from nltk.stem import PorterStemmer
porter=PorterStemmer()
# def stemSentence(sentence):
# token_words=word_tokenize(sentence)
# token_words
# stem_sentence=[]
# for word in token_words:
# stem_sentence.append(porter.stem(word))
# stem_sentence.append(" ")
# return "".join(stem_sentence)
#
# import re
from string import digits
# remove_digits = str.maketrans('', '', digits)
# res = text.translate(remove_digits)
def get_words(text):
""" This function converts the given text into an unordered and uncounted bag of words. """
text=text.lower()
return set(re.split('\W+', text)).difference({''})
# just an example
#get_words("this message is a long, long, long, long long long one.")
# apply this logic to texts of all messages
bags_of_words = [get_words(text) for text in data.text]
n_train = 3000
train_x, test_x, train_y, test_y = bags_of_words[:n_train], bags_of_words[n_train:], data.target[:n_train], data.target[n_train:]
# this counter will keep the number of spam and ham texts
label_counter = Counter()
# import pdb
# these counters will keep the frequency of each word in ham and spam texts
word_counters = {
'spam': Counter(),
'ham': Counter()}
all_words = set()
for label, words in zip(train_y, train_x):
all_words.update(words)
# TODO: use the `update` methods of all 3 counters, to calculate total number of
label_counter.update([label])
# pdb.set_trace()
word_counters[label].update(words)
# # TODO: use the `update` methods of all 3 counters, to calculate total number of
# label_counter[].update(label)
# # pdb.set_trace()
# word_counters[label].update(label)
#assert label_counter['spam'] == 409
#assert word_counters['ham']['hello'] >= 2
def prior_probability_of_label(label):
""" This function evaluates probability of the given label (it can be 'spam' or 'ham'), using the counters. """
# TODO: calculate and return this probability as ratio of number of texts with this labels to number all texts
P_label = label_counter[label] / n_train
return P_label
p_label=prior_probability_of_label('ham')
#assert round(prior_probability_of_label('spam'), 2) == 0.14
#assert round(prior_probability_of_label('ham'), 2) == 0.86
#non zero version of w_prob
def word_probability_given_label(word, label):
""" This function calculates probability of a word occurence in text, conditional on the label of this text. """
# TODO: calculate and return this probability
# as ratio of number of texts with this word and label to number of texts with this label
p=0.1
alpha=10**-1
p_word_given_label=(word_counters[label][word]+alpha*p)/(label_counter[label]+p)
return p_word_given_label
# P(text)=[text_probability_given_label(text, label) * prior_probability_of_label(label) + text_probability_given_label(text, complement label) * prior_probability_of_label(complement label))]
def text_probability_given_label(text, label):
""" This function calculates probability of the text conditional on its label. """
if isinstance(text, str):
text = get_words(text)
probability = 1.0
# TODO: calculate the probability of text given label.
# use a function defined above and the naive assumption of word independence
for word in all_words:
if word in text:
probability=probability*word_probability_given_label(word, label)
else:
probability=probability*(1-word_probability_given_label(word, label))
return probability
greeting1 = 'hello how are you'
greeting2 = 'hello teacher how are you'
#assert text_probability_given_label(greeting1, 'ham') > 0
#assert text_probability_given_label(greeting1, 'ham') < 0.0001
#assert text_probability_given_label(greeting2, 'ham') < text_probability_given_label(greeting1, 'ham')
def label_probability_given_text(text, label):
""" This function calculates probability of the label (spam or ham) conditional on the text. """
# TODO: calculate label probability conditional on text
# use the Bayes rule and the functions defined above
if label=='ham':
other_label='spam'
else:
other_label='ham'
p_text=prior_probability_of_label(label)*text_probability_given_label(text, label)+(prior_probability_of_label(other_label))*(text_probability_given_label(text, other_label))
p_label_given_text=prior_probability_of_label(label)*text_probability_given_label(text, label)/p_text
return p_label_given_text
text1 = 'hello how r you'
text2 = 'only today you can buy our book with 50% discount!'
#assert label_probability_given_text(text1, 'ham') + label_probability_given_text(text1, 'spam') == 1.0
#assert label_probability_given_text(text1, 'ham') > label_probability_given_text(text1, 'spam')
#assert label_probability_given_text(text1, 'ham') > label_probability_given_text(text2, 'ham')
thresholds = [0.002]
alpha=[10**-1]
P=[0.1]
for threshold in thresholds:
test_spam_probabilities = [label_probability_given_text(text, 'spam') for text in test_x]
test_predictions = ['spam' if spamness > threshold else 'ham' for spamness in test_spam_probabilities]
accuracy = sum(1 if pred == fact else 0 for pred, fact in zip(test_predictions, test_y)) /len(test_y)
print(accuracy)
assert accuracy > 0.9