-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtf_idf_analysis.py
executable file
·126 lines (99 loc) · 3.8 KB
/
tf_idf_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
#
# Performs an analysis of U.S. Presidental Inauguration Speeches based
# on TF-IDF information. The script performs multiple things:
#
# 1. Extraction of the five most relevant words (according to the
# TF-IDF weights) of each speech.
#
# 2. Using LDA to obtain 5 different topics present in the corpus
# of speeches and assigning every document to its model.
#
# Original author: Bastian Rieck
import numpy
import os
import sys
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
num_topics = 5  # number of latent topics the LDA model should extract
num_words = 5   # number of top TF-IDF words reported per speech
"""
Extracts the top n words from a row in the TF-IDF matrix. This requires
knowledge about the feature names, i.e. the actual words present in the
text.
"""
def get_top_words(row, feature_names, weights, n=5):
top_ids = numpy.argsort(row)[:-n-1:-1]
top_names = [feature_names[i] for i in top_ids]
top_weights = [row[i] / weights[i] for i in top_ids]
return top_names, top_weights
"""
Gets all topics from a given topic model and describes them using
a number of words.
"""
def get_topics(model, feature_names, n=3):
topics = []
for index, topic in enumerate(model.components_):
words = " ".join( feature_names[i] for i in topic.argsort()[:-n-1:-1] )
topics.append( words )
return topics
"""
Extracts a year and a name from a filename.
"""
def get_year_and_name(filename):
basename = os.path.basename(filename)
name = os.path.splitext(basename)[0]
name = name.replace("_", " ")
year = name[:4]
name = name[5:]
return year, name
"""
main
"""
if __name__ == "__main__":
# First, some pre-processing to make the output a little bit more
# shiny and nice.
filenames = sys.argv[1:]
filename_to_name = dict()
filename_to_year = dict()
for filename in filenames:
year, name = get_year_and_name(filename)
filename_to_name[filename] = name
filename_to_year[filename] = year
#####################################################################
# TF-IDF analysis
#####################################################################
tf_idf_vectorizer = TfidfVectorizer(input='filename',
stop_words='english',
max_df=0.95,
strip_accents='unicode')
tf_idf = tf_idf_vectorizer.fit_transform(filenames)
f = open("/tmp/Topics_gnuplot.txt", "w")
for index, filename in enumerate(filenames):
year = filename_to_year[filename]
name = filename_to_name[filename]
top_words, top_weights = get_top_words( tf_idf[index].toarray().ravel(), tf_idf_vectorizer.get_feature_names(), tf_idf_vectorizer.idf_, num_words)
print("%s (%s): %s" % (year, name, " ".join(top_words)))
for index, (word, weight) in enumerate(zip(top_words, top_weights)):
if max(top_weights) != min(top_weights):
weight = ( weight - min(top_weights) ) / ( max(top_weights) - min(top_weights) )
else:
weight = 1.0
print("%s %d \"%s\" %s %f" % (year, index, name, word, weight), file=f)
print("\n", file=f)
f.close()
#####################################################################
# LDA analysis
#####################################################################
lda = LatentDirichletAllocation(n_topics=num_topics,
learning_method="online",
max_iter=20,
random_state=42)
lda.fit(tf_idf)
topics = get_topics(lda, tf_idf_vectorizer.get_feature_names())
for index, topic in enumerate(topics):
print("Topic %d: %s" % (index, "".join(topic)))
for index, filename in enumerate(filenames):
scores = lda.transform(tf_idf[index]).ravel()
topic = numpy.argsort(scores)[-2]
print(topics[topic])