-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathedurate_gensim.py
104 lines (86 loc) · 3.44 KB
/
edurate_gensim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""Analyze the responses to the Edurate evaluation using Gensim."""
import inspect
import logging
import warnings
import webbrowser
from pathlib import Path

import gensim
import pyLDAvis
import pyLDAvis.gensim
from colorama import Fore, Style
from gensim import corpora
from profanity import profanity
from six import viewitems
from stop_words import get_stop_words
def gensim_analysis(list_responses, q_count, num_of_topics):
    """Run the Gensim pipeline over the responses to one question.

    The responses are tokenized, a dictionary is built from the tokens,
    every token list is converted to a bag-of-words vector, and the result
    is handed to ``corp_eval`` together with ``q_count`` and
    ``num_of_topics``. Nothing is returned.
    """
    # Installs a global "ignore" filter for warnings.
    warnings.filterwarnings('ignore')
    token_lists = create_tokens(list_responses)
    vocabulary = dictionary_create(token_lists)
    bow_corpus = list(map(vocabulary.doc2bow, token_lists))
    corp_eval(vocabulary, token_lists, bow_corpus, q_count, num_of_topics)
    logging.info("Analyzes gensim and returns the repeated words")
def create_tokens(list_responses):
    """Split each response into word tokens, dropping unwanted words.

    A word is kept when it is not in the English stop-word list, is not the
    literal ``"I"``, and is not reported as profanity. Matching against the
    stop-word list is an exact, case-sensitive string comparison. Responses
    with no remaining words are left out of the result.

    Args:
        list_responses: Iterable of response strings.

    Returns:
        A list containing one list of kept words per response that still
        has at least one word after filtering.
    """
    logging.info("Creating tokens")
    # Build the exclusion set once so each membership test in the inner
    # loop is a hash lookup instead of a scan of the stop-word list.
    excluded = set(get_stop_words('en'))
    excluded.add("I")
    tokens = []
    for res in list_responses:
        # str.split() only ever yields strings, so no integer check is
        # needed. The cheap set lookup runs before the profanity check.
        kept = [
            word for word in res.split()
            if word not in excluded
            and not profanity.contains_profanity(word)
        ]
        if kept:
            tokens.append(kept)
    return tokens
def dictionary_create(tokens):
    """Build a Gensim ``Dictionary`` from the token lists and return it."""
    vocabulary = corpora.Dictionary(tokens)
    logging.info("Created a dictionary using the tokens")
    return vocabulary
def corp_eval(dictionary, tokens, corpus, q_count, num_of_topics):
    """Train an LDA model and open its pyLDAvis visualization in a browser.

    The visualization is written to ``vis<q_count>.html`` in the current
    working directory and that same file is then opened in a new browser
    tab.

    Args:
        dictionary: Gensim dictionary built from ``tokens``.
        tokens: List of token lists, one per response.
        corpus: Bag-of-words corpus the LDA model is trained on.
        q_count: Question identifier; printed and used in the HTML file name.
        num_of_topics: Number of topics the LDA model is trained with.

    Returns:
        ``dictionary.dfs`` (Gensim's per-token document frequencies).
    """
    lda = gensim.models.ldamodel.LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=num_of_topics,
        passes=1,
        alpha='symmetric',
        eta=None)
    # The visualization uses a corpus rebuilt from ``tokens`` rather than
    # the ``corpus`` argument the model was trained on.
    vis_corpus = [dictionary.doc2bow(token) for token in tokens]
    logging.debug(dictionary.token2id)
    logging.debug(viewitems(dictionary.dfs))
    print(
        Fore.GREEN +
        "Producing LDA analysis for question: ",
        q_count,
        Style.RESET_ALL)
    print(lda)
    vis = pyLDAvis.gensim.prepare(lda, vis_corpus, dictionary)
    print(Fore.YELLOW + "These are the current topics: " + Style.RESET_ALL)
    # NOTE(review): the number of responses is passed as the number of
    # topics to print -- confirm this is intended rather than num_of_topics.
    print(lda.print_topics(len(tokens)))
    print(Fore.CYAN +
          "Opening up visualization in a new tab in the browser...",
          Style.RESET_ALL)
    # The HTML is written to a file instead of using pyLDAvis' show
    # function, because show starts a server, which allows only one file to
    # be displayed at once.
    vis_html_text = pyLDAvis.prepared_data_to_html(vis)
    vis_html_path = Path("vis" + str(q_count) + ".html")
    # write_text closes the file before the browser is launched, so the
    # browser never sees a partially written page.
    vis_html_path.write_text(vis_html_text, encoding="utf-8")
    # Open exactly the file that was just written, wherever the current
    # working directory is, using a properly formed file:// URI.
    webbrowser.open(vis_html_path.resolve().as_uri(), new=2)
    logging.info("Gensim visualization has been displayed.")
    return dictionary.dfs