# Run the vanilla CTM on B7 (there is an issue with B8, which uses the phraser)
# Preprocessing functions
from os.path import isdir, exists
from nlp_preprocessing import make_TM_dataset
# CTM library
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file, TopicModelDataPreparation
from contextualized_topic_models.datasets.dataset import CTMDataset
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO, CoherenceCV, TopicDiversity
from gensim.corpora.dictionary import Dictionary
from gensim.models import ldamodel
import pyLDAvis
# Data-science libraries
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
# Other NLP libraries (preprocessing, ...)
import nltk
import gensim
from string import punctuation
from bs4 import BeautifulSoup
import re
# Preprocessing helpers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string
from nltk.corpus import stopwords
# Runtime setup and reproducibility
import os
import random
import sys
import warnings
import torch

os.environ["TOKENIZERS_PARALLELISM"] = "false"
# SettingWithCopyWarning moved to pandas.errors in pandas >= 1.5
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    from pandas.core.common import SettingWithCopyWarning
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
################################################## PARAMETER SELECTION ##################################################
infile = "datasets/beltweets_nlp.csv"
start = "2020-08-06"
end = "2020-08-08"
split = "1D"
path = 'TM/CTM_vanilla'
# Where to store the preprocessed dataset files (mandatory for OCTIS-based models)
data_path = path + '/datasets'
langs = ['english']
batch_size = 32
# Base BERTs to run (the LLM component of CTM, see https://aclanthology.org/2021.acl-short.96/)
# models = ['cardiffnlp/twitter-roberta-base', 'bert-base-nli-mean-tokens', "paraphrase-distilroberta-base-v2"]
models = ['cardiffnlp/twitter-roberta-base']
# lang = 'russian'
# models = ['cardiffnlp/twitter-xlm-roberta-base', 'cointegrated/rubert-tiny2', 'DeepPavlov/rubert-base-cased-sentence']
num_epochs = 5
# Values of k (number of topics) to try. Can be a contiguous range:
# k_range = range(3, 8)
# a custom list:
# k_range = [5, 15, 25, 50]
# or a fancier range:
# k_range = [n for n in range(6, 15, 1)] + [n for n in range(15, 33, 3)]
k_range = [5, 6]
# TODO Run each model this many times, to account for the random initialization of weights.
# We don't do this for the vanilla CTM so far: weights are initialized with fixed seeds so the model can be reloaded later.
# model_runs = 2
##########################################################################################################################
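
# Pipeline: for each language, slice [start, end) into windows of length `split`;
# for each window, preprocess the tweets, embed them with the base BERT, train one
# CTM per value of k, score each model (coherence / diversity), pick the best k,
# and export a pyLDAvis view of that model.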
for lang in langs:
    try:
        # Training CTM
        # results = {'russian': {'topics': [], 'models': [], 'topic_distrib': []}, 'english': {'topics': [], 'models': [], 'topic_distrib': []}}
        # import torch
        # from transformers import AutoTokenizer, AutoModel
        # tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
        # model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
        # model.cuda()
        if lang == 'russian':
            # qt = TopicModelDataPreparation("cointegrated/rubert-tiny2")
            qt = TopicModelDataPreparation("DeepPavlov/rubert-base-cased-sentence")
        if lang == 'english':
            qt = TopicModelDataPreparation("cardiffnlp/twitter-roberta-base")
        start = pd.Timestamp(start, tz='utc')
        end = pd.Timestamp(end, tz='utc')
        if split is None:
            split = end - start
        else:
            split = pd.Timedelta(split)
        while start < end:
            results = {'topics': [], 'topic_distrib': [], 'coherence': [], 'diversity': []}
            print(f'Preprocessing {lang} data from {start} to {start+split} (end = {end})')
            preprocessed, unpreprocessed, vocab, deleted = make_TM_dataset(
                infile=infile, start=start, end=start+pd.Timedelta(split), lang=lang, save_path=data_path,
                vocabulary_size=2000, make_ngrams=False, max_df=0.7, min_df=2, seuil_ngram=8, min_count_ngram=2)
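            # Keep only the documents that survived preprocessing (`deleted` marks documents dropped):
            # a = preprocessed texts (for the bag of words), b = raw texts (for the contextual embeddings).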
            a = [t for i, t in enumerate(preprocessed) if not deleted[i]]
            b = [t for i, t in enumerate(unpreprocessed) if not deleted[i]]
            # if not exists(data_path+f'/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}.csv'):
            #     print(f'Preprocessing {lang} data from {start.strftime("%Y%m%d")} to {start+split} (end = {end.strftime("%Y%m%d")})')
            #     preprocessed, unpreprocessed, vocab, deleted = make_TM_dataset(
            #         infile=infile, start=start, end=start+pd.Timedelta(split), lang=lang, save_path=data_path,
            #         vocabulary_size=2000, make_ngrams=False, max_df=0.7, min_df=2, seuil_ngram=8, min_count_ngram=2)
            #     a = [t for i, t in enumerate(preprocessed) if not deleted[i]]
            #     b = [t for i, t in enumerate(unpreprocessed) if not deleted[i]]
            #     c = vocab
            # else:
            #     # Use this if you want to make topics from files saved by make_TM_dataset
            #     data = pd.read_csv(data_path+f'/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}.csv')
            #     with open(data_path+f'/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}/octis_preprocessed/vocabulary.txt') as file:
            #         lines = [line.rstrip() for line in file]
            #     # a = preprocessed, b = unpreprocessed, feed them in this order
            #     (a, b, c) = (data[~data.deleted]['preproc'].to_list(),
            #                  data[~data.deleted]['unpreproc'].to_list(),
            #                  lines)
            # Work around a Torch bug when it is fed a batch of size 1:
            # double the batch size until no leftover batch of exactly one document remains.
            while len(a) % batch_size == 1:
                batch_size *= 2
            print('batch size : ', batch_size)
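            # (Likely cause: the CTM network uses BatchNorm layers, and torch's BatchNorm
            # cannot compute statistics over a single-sample batch in training mode.)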
            print(f'Making base Bert dataset for {lang}...')
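            # qt.fit takes the raw texts first (they feed the contextual sentence embeddings)
            # and the preprocessed texts second (they build the bag of words).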
            training = qt.fit(b, a)
            print("training.X_bow.shape = ", training.X_bow.shape,
                  "training.X_contextual.shape = ", training.X_contextual.shape)
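            # X_bow is (n_docs x vocabulary_size); X_contextual is (n_docs x 768),
            # one sentence embedding per document.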
            # Train one model per value of n_topics
            for k in k_range:
                print(f'training {lang} model for period {start.strftime("%Y%m%d")} to {end.strftime("%Y%m%d")}: {k} topics')
                # Fix all the random seeds so that training is deterministic
                # and the saved model can be reloaded later.
                torch.manual_seed(10)
                torch.cuda.manual_seed(10)
                np.random.seed(10)
                random.seed(10)
                torch.backends.cudnn.enabled = False
                torch.backends.cudnn.deterministic = True
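                # (Disabling cuDNN avoids its non-deterministic kernels, at some speed cost.)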
                # ctm = CombinedTM(bow_size=len(qt.vocab), contextual_size=768, n_components=k, num_epochs=30)  # base was 15
                # ctm = CombinedTM(bow_size=len(qt.vocab), contextual_size=768, n_components=k, num_epochs=50, batch_size=batch_size)
                ctm = CombinedTM(bow_size=len(qt.vocab), contextual_size=768, n_components=k,
                                 num_epochs=num_epochs, batch_size=batch_size)
                print(f'fitting {lang} model for data from {start.strftime("%Y%m%d")} to {start+split}')
                ctm.fit(training, n_samples=5)
                # ctm.fit(training)
                # path = path+"bench/results/B7_2k_095_POS/CTM_vanilla/%s/%d/" % (lang, i)
                # path = path+"/%s/%d/" % (lang, i)
                ctm.save(models_dir=path+f'/models/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}_{k}topics')
                # results['models'].append(ctm)
                print('getting topic list....')
                # Output the topics to a separate CSV
                topicslist = ctm.get_topic_lists(10)
                topics = pd.Series({i: t for (i, t) in enumerate(topicslist)})
                topics.to_csv(path+f'/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}_{k}topics.csv')
                # Collect the results in a dict
                results['topics'].append(topicslist)
                print("results['topics']", results['topics'])
                print(f'getting topic distrib for k={k}')
                tweet_topics = ctm.get_doc_topic_distribution(training, n_samples=5)
                results['topic_distrib'].append(tweet_topics)
                print("results['topic_distrib']", results['topic_distrib'])
                coherence = CoherenceCV(topics=topicslist, texts=[t.split() for t in a])
                results['coherence'].append(coherence.score())
                print("results['coherence']", results['coherence'])
                diversity = TopicDiversity(topics=ctm.get_topic_lists(25))
                results['diversity'].append(diversity.score())
                print("results['diversity']", results['diversity'])
                print('lengths of keys : ', [(key, len(results[key])) for key in results])
            ############### GET PYLDAVIS #########################
            # Taken from B7_checktopics_en
            # datapath = "bench/orig_datasets/B7_2k_095_POS/"
            # resultpath = 'bench/results/B7_2k_095_POS/CTM_vanilla/'
            # Check vocab file:
            # vocab_path = f"datasets/B7_2k_095_POS/{d}/{lang}/vocabulary_n.txt"
            # with open(vocab_path) as file:
            #     lines = [line.rstrip() for line in file]
            # vocab_fromfile = lines
            # Choose the model (best k value) to load
            # if lang == "russian":
            #     optimal_iter = [4, 4, 6, 7, 4][i]
            # else:
            #     optimal_iter = 5
            res_df = pd.DataFrame(results, index=k_range)
            # Put coherence and diversity on a common scale (z-scores)
            norm_df = res_df[['coherence', 'diversity']]
            norm_df = (norm_df - norm_df.mean()) / norm_df.std()
            # If the model shows a trade-off between diversity and coherence (diversity falling
            # while coherence rises as k grows), take the lowest k after the curves intersect
            if (norm_df['diversity'].iloc[0] > norm_df['diversity'].iloc[-1]) and (norm_df['coherence'].iloc[0] < norm_df['coherence'].iloc[-1]):
                optimal_iter = norm_df[norm_df['coherence'] >= norm_df['diversity']].iloc[0].name
                # TODO allow for runs
                # optimal_run = res_df.loc[optimal_iter].reset_index()['coherence'].idxmax()
            else:
                # Otherwise simply take the k with the best coherence
                # print(norm_df[norm_df['coherence'] >= norm_df['diversity']].iloc[0])
                optimal_iter = norm_df.groupby(norm_df.index)['coherence'].median().idxmax()
                # TODO allow for runs
                # optimal_run = res_df.loc[optimal_iter].reset_index()['coherence'].idxmax()
            # topics = pd.read_csv(resultpath+f'{lang}/{d}/topics_{lang}_{d}_{optimal_iter}.csv', lineterminator='\n')
            topics = res_df['topics'].loc[optimal_iter]
            # Do something with the topics
            for i, t in enumerate(topics):
                print(i, t)
            # print(f'retraining embeddings for {lang}, {d}')
            # # Retrain the embeddings
            # data = pd.read_csv(datapath+f'{d}_{lang}.csv', lineterminator='\n')
            # (a, b) = (data[~data.deleted]['preproc'].to_list(),
            #           data[~data.deleted]['unpreproc'].to_list())
            # training = qt.fit(b, a)
            vocab_fromtraining = qt.vocab
            # print(f'loading model for {lang}, {d}')
            # Load the "best k" model
            # ctm = CombinedTM(bow_size=len(vocab_fromtraining), contextual_size=768, n_components=30, num_epochs=50)
            # The optimal model sits in a subfolder with a very long auto-generated name; grab it
            modelfolder = [f for f in os.scandir(path+f'/models/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}_{optimal_iter}topics')][0]
            ctm.load(modelfolder.path, epoch=num_epochs-1)
            print(f'loading best {lang} model ({optimal_iter} topics) and making PyLDAVis')
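            # get_ldavis_data_format returns the keyword arguments that pyLDAvis.prepare expects
            # (topic-term and doc-topic matrices, document lengths, vocabulary, term frequencies)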
            lda = ctm.get_ldavis_data_format(vocab_fromtraining, training, 10)
            vis_data1 = pyLDAvis.prepare(**lda)
            pyLDAvis.save_html(vis_data1, path+f'/{start.strftime("%Y%m%d")}_{end.strftime("%Y%m%d")}_{lang}_{optimal_iter}topics.html')
            # Move on to the next time window
            start += split
    except ValueError as v:
        # Dump the batching state for debugging (mirrors the batch-size workaround above)
        print(v)
        print('batch size : ', batch_size)
        print('len texts', len(a))
        print(len(a) % batch_size)
        print(batch_size % 2)