-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeans_text_model.py
30 lines (26 loc) · 1005 Bytes
/
kmeans_text_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Parts of the code from https://github.com/MihailSalnikov/tf-idf_and_k-means
# TF - Term Frequency in a document
# IDF - Inverse Document Frequency (the importance of a term in a set of documents)
# TF-IDF - Term Frequency-Inverse Document Frequency
class TFIDF:
    """Compute TF, IDF and TF-IDF weights over bag-of-words dictionaries.

    The methods are stateless: each one builds and returns a new ``dict``
    mapping a word to its weight and stores nothing on the instance.
    """

    def compute_idf(self, strings_list):
        """Return the inverse document frequency of every word.

        Args:
            strings_list: Sequence of per-document ``{word: count}`` dicts.

        Returns:
            A dict ``{word: log(n / df)}`` where ``n`` is the number of
            documents and ``df`` is the number of documents in which the
            word's count is positive. A word whose count is never positive
            gets ``0.0``. An empty ``strings_list`` yields ``{}``.
        """
        # Imported locally so the method does not depend on a module-level
        # ``log`` name being present.
        from math import log

        n = len(strings_list)

        # Collect the vocabulary from every document (not only the first)
        # so a word that is absent from document 0 cannot raise KeyError.
        doc_freq = {}
        for word_counts in strings_list:
            for word, count in word_counts.items():
                doc_freq.setdefault(word, 0)
                if count > 0:
                    doc_freq[word] += 1

        # df == 0 would divide by zero; such a word carries no weight.
        return {
            word: log(n / float(df)) if df else 0.0
            for word, df in doc_freq.items()
        }

    def compute_tf(self, word_dict, l):
        """Return the term frequency of every word of one document.

        Args:
            word_dict: ``{word: count}`` dict for the document.
            l: The document's token sequence; only its length is used, as
                the normalising total.

        Returns:
            A dict ``{word: count / len(l)}``. When ``l`` is empty every
            word maps to ``0.0`` instead of dividing by zero.
        """
        total = len(l)
        if not total:
            return dict.fromkeys(word_dict, 0.0)
        # float() keeps this a true division regardless of interpreter.
        return {
            word: count / float(total)
            for word, count in word_dict.items()
        }

    def compute_tf_idf(self, tf, idf):
        """Return the TF-IDF weight of every word in ``tf``.

        Args:
            tf: ``{word: term frequency}`` dict.
            idf: ``{word: inverse document frequency}`` dict; it must
                contain every key of ``tf``.

        Returns:
            A dict ``{word: tf[word] * idf[word]}``.

        Raises:
            KeyError: If a word of ``tf`` is missing from ``idf``.
        """
        return {word: weight * idf[word] for word, weight in tf.items()}