-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifier.py
74 lines (54 loc) · 1.41 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# coding: utf8
# By Maher AMARA
from optimisation import deep_split
def all_words():
    """Return the module-level ``size_tot`` value.

    ``size_tot`` is the third integer on the first line of ``sheet.txt``,
    read when this module is loaded (presumably the overall word total --
    confirm against whatever writes ``sheet.txt``).
    """
    total = size_tot
    return total
def card(feature):
    """Return the cardinality of *feature*, i.e. ``len(feature)``."""
    size = len(feature)
    return size
def Prob_word(word, feature):
    """Return the add-one smoothed weight of *word* within *feature*.

    *feature* is a mapping from word to count; the count is passed through
    ``int`` so it may be stored as a numeric string. A word missing from
    *feature* is treated as having a count of zero.

    The result is ``(count + 1) / (len(feature) + all_words())``.
    """
    # An unseen word contributes only the "+1" smoothing term.
    occurrences = int(feature[word]) if word in feature else 0
    return (occurrences + 1) / (len(feature) + all_words())
def Prob_SMS(feature, words):
    """Return the product of ``Prob_word(word, feature)`` over *words*.

    This is the "naive" part of the classifier: every word of the message
    is assumed to be independent of the others, so the per-word values are
    simply multiplied together. An empty *words* yields ``1``.
    """
    # NOTE(review): a long message multiplies many small floats and may
    # underflow towards 0.0 -- confirm this is acceptable for callers.
    likelihood = 1
    for token in words:
        likelihood = likelihood * Prob_word(token, feature)
    return likelihood
def Prob(feature, words):
    """Return the score that the message *words* belongs to *feature*.

    This is the "Bayes" step of the classifier: the value returned by
    ``Prob_SMS(feature, words)`` is divided by ``len(feature)``.
    """
    # NOTE(review): dividing by the number of entries in *feature* is not
    # the usual class prior of a Bayes classifier -- confirm the intent.
    likelihood = Prob_SMS(feature, words)
    return likelihood / len(feature)
def comparateur(words):
    """Compare the ham and spam scores of *words*.

    Returns ``True`` when ``Prob(d_ham, words)`` is greater than or equal
    to ``Prob(d_spam, words)`` (so a tie counts as ham), else ``False``.
    """
    ham_score = Prob(d_ham, words)
    spam_score = Prob(d_spam, words)
    return ham_score >= spam_score
def classifier(SMS):
    """Classify *SMS* and return the label ``"Ham"`` or ``"Spam"``.

    The message is first passed to ``deep_split`` (imported from
    ``optimisation``; presumably it tokenizes the text into words), and
    the result is handed to ``comparateur`` to pick the label.
    """
    tokens = deep_split(SMS)
    return "Ham" if comparateur(tokens) else "Spam"
# main
def _load_counts(path):
    """Read a two-column, whitespace-separated table from *path* into a dict.

    Each non-blank line must split into exactly two fields: the first
    becomes the key and the second the value (kept as a string, exactly
    as read). Blank or whitespace-only lines are skipped so that a stray
    empty line in the data file does not abort loading.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    with open(path, 'r') as handle:
        return dict(line.split() for line in handle if line.strip())


# Word tables for each class, loaded once when the module is imported.
d_ham = _load_counts("ham_d1.txt")
d_spam = _load_counts("spam_d1.txt")

# The first line of sheet.txt holds three integers, read in this order:
# size_ham, size_spam, size_tot.
with open("sheet.txt", "r") as sheet:
    counts = sheet.readline().split()
size_ham = int(counts[0])
size_spam = int(counts[1])
size_tot = int(counts[2])