-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMalay_lda(FINAL).py
188 lines (145 loc) · 6.51 KB
/
Malay_lda(FINAL).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# =============================================================================
# MALAY CORPUS:
# 1. Tuned LDA model for train data
# 2. Use tuned lda model to infer topic distribution on test data
# =============================================================================
"Execute preprocessing functions"
# Run the two helper scripts before anything else in this file.
# NOTE(review): runfile is not a Python builtin and is not defined in the lines visible
# here; it looks like the helper the Spyder IDE console provides - confirm.
# NOTE(review): no import statements are visible in this file, yet later cells use pd, np,
# gensim, Dictionary, LdaModel, CoherenceModel, pyLDAvis, csv, tqdm and pprint, plus
# helpers such as standardize4, make_bigrams, belong and compute_coherence_values.
# Presumably these two scripts bring them into scope - confirm.
# Both the script paths and the working directory are absolute and machine-specific.
runfile('C:/Users/shash/RP_LDA_functions.py', wdir='C:/Users/shash')
runfile('C:/Users/shash/RP_functions.py', wdir='C:/Users/shash')
#%%
# Load the training corpus; the cells below read and rewrite its 'textfin' column.
dfmal = pd.read_csv("Malay_traindata.csv")
#%%
"Anonymize train terms e.g. monorail, rapidkl etc."
"Remove prominent stop words"
"Anonymize station names"
# The three cleaning passes titled above are applied element-wise to 'textfin',
# in that order. The helpers themselves are defined outside this file.
dfmal['textfin'] = (
    dfmal['textfin']
    .apply(standardize4)
    .apply(remove_stopwords2)
    .apply(replace_stn_names)
)
#%%
"PREP elements for LDA: tokenized text, dictionary and vectorized corpus"
# One token list per document, split on single spaces.
corpus_mal = dfmal['textfin'].tolist()
train_texts = [sentence.split(" ") for sentence in corpus_mal]

# Phrase detectors: the trigram model is trained on the bigram-transformed texts.
# higher threshold fewer phrases.
bigram = gensim.models.Phrases(train_texts, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[train_texts], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
# NOTE(review): neither *_mod object is referenced again in this file; presumably
# make_bigrams (defined elsewhere) reads bigram_mod as a global - confirm.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Both names are kept bound to the same object, as later cells use `texts`.
texts = data_words_bigrams = make_bigrams(train_texts)

# Vocabulary built from the bigram-merged token lists.
dictionary = Dictionary(texts)
print("\n")
print("total terms in dictionary", len(dictionary), sep="\n")

# Bag-of-words representation of every document.
corpus = list(map(dictionary.doc2bow, texts))
#%%
"plot optimization graph for basic lda model"
"LONG RUNTIME!!"
# evaluate_graph is defined outside this file; it is called with limit=10 and its
# two return values are kept as lmlist and c_v.
lmlist, c_v = evaluate_graph(
    dictionary=dictionary,
    corpus=corpus,
    texts=texts,
    limit=10,
)
#%%
"Run pre tuning"
# Untuned baseline model, left disabled:
#ldamodel = LdaModel(corpus=corpus, num_topics=4, id2word=dictionary, passes=100,random_state=100)
#%%
"Run post tuning "
# Tuned model: 8 topics, asymmetric alpha, eta=0.91, fixed seed for repeatability.
ldamodel = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=8,
    alpha="asymmetric",
    eta=0.91,
    passes=50,
    chunksize=100,
    random_state=100,
)

# Compute Coherence & perplexity Score
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
print('\nPerplexity:', ldamodel.log_perplexity(corpus))

# Results recorded from earlier runs (kept verbatim):
"n=6, stopwords2 removed, full ano, alpha=asymmetric, eta=0.91, coherence:0.4075572 , perplex: -7.11937651 "
"n=8, stopwords2 removed, full ano, alpha=asymmetric, eta=0.91, coherence:0.4403929 , perplex: -7.16848867 "
"n=8, stopwords2 removed, full ano, alpha=0.61, eta=0.91, coherence:0.4172946 , perplex: -7.422954886 "
#%%
"SAVE lda multicore"
# Persist the tuned model so later cells can run without retraining.
ldamodel.save('Malay_tunedLDA(FINAL).model')
#%%
"Load saved model"
# Every later cell in this file works from this reloaded copy.
loading = LdaModel.load('Malay_tunedLDA(FINAL).model')
#%%
"VIEW & SAVE top terms of topics of lda model"
# Top 30 terms per topic; each element of `stored` becomes one CSV row.
stored = loading.print_topics(num_words=30)
pprint(stored)
# The encoding is pinned to UTF-8 so the export does not depend on the platform's
# default code page (which can raise UnicodeEncodeError on non-ASCII terms).
# newline="" lets the csv module control line endings itself.
with open("Malay_lda_topictermList.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(stored)
#%%
"create & save pyLDAvis"
# Build the visualisation data from the reloaded model together with the training
# bag-of-words corpus and its dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
prepared_vis_dat = pyLDAvis.gensim.prepare(loading, corpus, dictionary)
# Interactive display is left disabled; the result is only written to an HTML file.
#pyLDAvis.show(prepared_vis_dat)
pyLDAvis.save_html(prepared_vis_dat,'Malay_pyLDAvis(FINAL).html')
#%%
"Infering our tuned lda model on test data"
# NOTE(review): unlike the training text, none of the cleaning helpers
# (standardize4 / remove_stopwords2 / replace_stn_names) is applied here -
# presumably this file is already cleaned; confirm.
df_test_mal = pd.read_csv("Malay_testdata(labelled_Cleaned).csv") # all data

# belong and topic_belong are defined outside this file.
df_test_mal['LDAtotal'] = df_test_mal['textfin'].apply(belong)


def g(x):
    # Wrap the per-document result in a Series so apply() expands it into columns.
    return pd.Series(topic_belong(x))


# One output column per topic of the 8-topic model.
topic_columns = ["topic1", "topic2", "topic3", "topic4",
                 "topic5", "topic6", "topic7", "topic8"]
df_test_mal[topic_columns] = df_test_mal['textfin'].apply(g)
df_test_mal.to_csv("Malay_test_LDATopics_infer_V2(FINAL).csv", index=False)
#%%
# Write out the anonymized training frame, then read it back for inference.
dfmal.to_csv("Malay_traindata_Ano.csv", index=False)
#%%
df_tren = pd.read_csv("Malay_traindata_Ano.csv")
#%%
# Force every 'textfin' entry to str before handing it to the helpers.
# NOTE(review): presumably guards against empty cells read back as NaN - confirm.
df_tren['textfin'] = df_tren['textfin'].astype(str)

# belong and topic_belong are defined outside this file.
df_tren['LDAtotal'] = df_tren['textfin'].apply(belong)


def g(x):
    # Wrap the per-document result in a Series so apply() expands it into columns.
    return pd.Series(topic_belong(x))


# One output column per topic of the 8-topic model.
topic_columns = ["topic1", "topic2", "topic3", "topic4",
                 "topic5", "topic6", "topic7", "topic8"]
df_tren[topic_columns] = df_tren['textfin'].apply(g)
df_tren.to_csv("Malay_traindata_Ano_infer(cloud).csv", index=False)
#%%
"TUNING HYPERPARAMETER OF LDA MULTICORE MODEL"
"SUPER LONG RUNTIME!!"
# Grid search over topic count, alpha and beta; one coherence score is recorded
# per combination and the full table is written to CSV at the end.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 8
step_size = 1
# NOTE(review): range() excludes its upper bound, so k runs from 2 to 7 and the
# 8-topic setting used for the final model above is never scored here - confirm
# whether max_topics was meant to be inclusive.
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: numeric values from np.arange plus the two named options.
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter: numeric values from np.arange plus 'symmetric'.
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets (only the full corpus is active; the clipped subsets are disabled).
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25),
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75),
               corpus]
corpus_title = ['100% Corpus']

# Column-oriented results table; every list grows by one entry per evaluated model.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                 }

# Can take a long time to run; set to False to skip the search.
RUN_TUNING = True
if RUN_TUNING:
    total_runs = len(beta) * len(alpha) * len(topics_range) * len(corpus_title)
    # The context manager closes the progress bar even if a model fit raises.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses, each paired with its title
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        # (compute_coherence_values is defined outside this file)
                        cv = compute_coherence_values(corpus=corpus_set,
                                                      dictionary=dictionary,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)
                        pbar.update(1)
    # Written only after the whole grid has completed.
    pd.DataFrame(model_results).to_csv('lda_tuning_results_MAL.csv', index=False)