-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTM_custom_models.py
198 lines (167 loc) · 7.64 KB
/
TM_custom_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# library OCTIS
from octis.models.CTM import CTM
from octis.models.LDA import LDA
from octis.models.NMF import NMF
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.models.model import save_model_output
# libraries datascience, NLP
import pandas as pd
import numpy as np
import re
# Other, utilities
import os
import time
import pickle
from tqdm import tqdm
import shutil
from gensim.models import nmf, TfidfModel
import gensim.corpora as corpora
import gensim.corpora as corpora
import octis.configuration.citations as citations
import octis.configuration.defaults as defaults
from gensim.matutils import corpus2csc
import numpy as np
### IMPORTS FOR OCTIS LDA
from octis.models.model import AbstractModel
import numpy as np
from gensim.models import ldamodel
import gensim.corpora as corpora
import octis.configuration.citations as citations
import octis.configuration.defaults as defaults
### DEFINE NEW MODEL FOR NMF, TF-IDF
class NMF_TFIDF(NMF):
    """OCTIS NMF model that trains on a TF-IDF-weighted corpus instead of raw BoW."""

    def train_model(self, dataset, hyperparameters=None, top_words=10):
        """
        Train the NMF model on a TF-IDF-weighted corpus and return its output.

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparameters : hyperparameters to build the model (mutated in place:
            'corpus' and 'id2word' keys are added before training)
        top_words : if greater than 0, also return the most significant words
            for each topic

        Returns
        -------
        result : dictionary with 'topics', 'topic-word-matrix' and
            'topic-document-matrix' entries, plus 'test-*' counterparts when
            partitions are used
        """
        if hyperparameters is None:
            hyperparameters = {}
        # NOTE(review): removed a leftover debug `print(hyperparameters)` that
        # fired whenever explicit hyperparameters were supplied.
        if self.use_partitions:
            partition = dataset.get_partitioned_corpus(use_validation=False)
        else:
            partition = [dataset.get_corpus(), []]
        if self.id2word is None:
            self.id2word = corpora.Dictionary(dataset.get_corpus())
        if self.id_corpus is None:
            self.id_corpus = [self.id2word.doc2bow(
                document) for document in partition[0]]
        # Re-weight the BoW corpus with TF-IDF (SMART 'ntc' scheme) and convert
        # it to a sparse column matrix before handing it to gensim's Nmf.
        self.tfidf = TfidfModel(self.id_corpus, smartirs='ntc')
        self.tfidfcorpus = corpus2csc(self.tfidf[self.id_corpus])
        hyperparameters["corpus"] = self.tfidfcorpus
        hyperparameters["id2word"] = self.id2word
        self.hyperparameters.update(hyperparameters)
        self.trained_model = nmf.Nmf(**self.hyperparameters)
        result = {}
        result["topic-word-matrix"] = self.trained_model.get_topics()
        if top_words > 0:
            topics_output = []
            for topic in result["topic-word-matrix"]:
                # Indices of the `top_words` largest weights, highest first.
                top_k = np.argsort(topic)[-top_words:]
                top_k_words = list(reversed([self.id2word[i] for i in top_k]))
                topics_output.append(top_k_words)
            result["topics"] = topics_output
        result["topic-document-matrix"] = self._get_topic_document_matrix()
        if self.use_partitions:
            # NOTE(review): the test partition is plain BoW, not TF-IDF-weighted
            # like the training corpus — confirm this asymmetry is intentional.
            new_corpus = [self.id2word.doc2bow(
                document) for document in partition[1]]
            if self.update_with_test:
                self.trained_model.update(new_corpus)
                self.id_corpus.extend(new_corpus)
                result["test-topic-word-matrix"] = self.trained_model.get_topics()
                if top_words > 0:
                    topics_output = []
                    for topic in result["test-topic-word-matrix"]:
                        top_k = np.argsort(topic)[-top_words:]
                        top_k_words = list(
                            reversed([self.id2word[i] for i in top_k]))
                        topics_output.append(top_k_words)
                    result["test-topics"] = topics_output
                result["test-topic-document-matrix"] = self._get_topic_document_matrix()
            else:
                result["test-topic-document-matrix"] = self._get_topic_document_matrix(new_corpus)
        return result
### DEFINE NEW MODEL FOR LDA, TF-IDF
class LDA_TFIDF(LDA):
    """OCTIS LDA model that trains on a TF-IDF-weighted corpus instead of raw BoW."""

    def train_model(self, dataset, hyperparams=None, top_words=10):
        """
        Train the model and return output

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparams : hyperparameters to build the model (mutated in place:
            'corpus' and 'id2word' keys are added before training)
        top_words : if greater than 0 returns the most significant words for
            each topic in the output (Default True)

        Returns
        -------
        result : dictionary with up to 3 entries,
                 'topics', 'topic-word-matrix' and
                 'topic-document-matrix'
        """
        if hyperparams is None:
            hyperparams = {}

        if self.use_partitions:
            train_corpus, test_corpus = dataset.get_partitioned_corpus(use_validation=False)
        else:
            train_corpus = dataset.get_corpus()

        # Build (or reuse) the dictionary and the cached BoW training corpus.
        if self.id2word is None:
            self.id2word = corpora.Dictionary(dataset.get_corpus())
        if self.id_corpus is None:
            self.id_corpus = [self.id2word.doc2bow(doc) for doc in train_corpus]

        # Re-weight the BoW corpus with TF-IDF (SMART 'ntc' scheme, normalized)
        # and train the LDA model on the weighted stream.
        self.tfidf = TfidfModel(self.id_corpus, normalize=True, smartirs='ntc')
        self.tfidfcorpus = self.tfidf[self.id_corpus]

        hyperparams.setdefault("num_topics", self.hyperparameters["num_topics"])
        # A scalar alpha means a symmetric prior: expand it to one value per topic.
        alpha = hyperparams.get("alpha")
        if isinstance(alpha, float):
            hyperparams["alpha"] = [alpha] * hyperparams["num_topics"]

        hyperparams["corpus"] = self.tfidfcorpus
        hyperparams["id2word"] = self.id2word
        self.hyperparameters.update(hyperparams)
        self.trained_model = ldamodel.LdaModel(**self.hyperparameters)

        def _top_words(word_matrix):
            # Most probable `top_words` terms per topic, strongest first.
            topics = []
            for row in word_matrix:
                best = np.argsort(row)[-top_words:]
                topics.append([self.id2word[idx] for idx in reversed(best)])
            return topics

        result = {}
        result["topic-word-matrix"] = self.trained_model.get_topics()
        if top_words > 0:
            result["topics"] = _top_words(result["topic-word-matrix"])
        result["topic-document-matrix"] = self._get_topic_document_matrix()

        if self.use_partitions:
            new_corpus = [self.id2word.doc2bow(doc) for doc in test_corpus]
            if self.update_with_test:
                # Fold the test documents into the model, then report the
                # updated matrices.
                self.trained_model.update(new_corpus)
                self.id_corpus.extend(new_corpus)
                result["test-topic-word-matrix"] = self.trained_model.get_topics()
                if top_words > 0:
                    result["test-topics"] = _top_words(result["test-topic-word-matrix"])
                result["test-topic-document-matrix"] = self._get_topic_document_matrix()
            else:
                # Pure inference: densify each document's sparse topic
                # distribution into a fixed-length vector.
                rows = []
                for doc in new_corpus:
                    weights = np.zeros(self.hyperparameters["num_topics"])
                    for topic_id, prob in self.trained_model[doc]:
                        weights[topic_id] = prob
                    rows.append(weights)
                result["test-topic-document-matrix"] = np.array(rows).transpose()
        return result