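"""adaptive_cluster.py

Clusters scene-graph predicate words with k-means over BERT input embeddings,
labels each cluster with a representative word, and provides elbow, silhouette,
Calinski-Harabasz, and gap-statistic diagnostics for choosing the number of
clusters.
"""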
import json
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import torch
from transformers import BertModel, BertTokenizerFast
from models.MLM.utils import fineTuningDataset
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
def SSE_clu(v1, v2):
    # Sum of squared errors between a sample and its cluster centroid.
    return np.sum(np.power(v1 - v2, 2))
class WordsCluster(object):
def __init__(self, embedding_file, sim_threshold=0.55, del_sim_threshold=0.2, ignore_keywords=[], prep_words=[]):
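        """
        embedding_file: path to a pretrained BERT checkpoint (tokenizer + model).
        sim_threshold / del_sim_threshold: cosine-similarity cut-offs used when
        labelling clusters and filtering noisy words. ignore_keywords and
        prep_words are stored for the calling code.
        """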
self.sim_threshold = sim_threshold
self.del_sim_threshold = del_sim_threshold
self.model = BertModel.from_pretrained(embedding_file)
self.tokenizer = BertTokenizerFast.from_pretrained(embedding_file)
self.embedding_model = self.model.get_input_embeddings()
self.ignore_keywords = ignore_keywords
self.prep_words = prep_words
self.unexist = set()
    # Initialize the embedding info for all keywords.
def initial_embedding_info(self, keywords):
all_keywords_w2v_list = []
all_keywords_embeddings = torch.tensor([])
        # Embed each predicate as the mean of the BERT input-embedding vectors of all of its triplets.
for keyword in keywords:
with torch.no_grad():
predicate_embedding = []
for triplet in keywords[keyword]:
total_embedding = []
for w in triplet:
                        input_ids = self.tokenizer.encode(w, return_tensors="pt", add_special_tokens=False)
                        embedding_token = self.embedding_model(input_ids)
word_embedding = torch.mean(embedding_token, dim=1)
total_embedding.append(word_embedding)
total_embedding = torch.cat(total_embedding, dim=0)
total_embedding = torch.sum(total_embedding, dim=0)
total_embedding = total_embedding / len(triplet)
predicate_embedding.append(total_embedding)
predicate_embedding = torch.stack(predicate_embedding, dim=0)
predicate_embedding = torch.mean(predicate_embedding, dim=0).unsqueeze(0)
all_keywords_w2v_list.append((keyword, predicate_embedding))
all_keywords_embeddings = torch.cat((all_keywords_embeddings, predicate_embedding), dim=0)
self.words_w2v_dic = dict(all_keywords_w2v_list)
return all_keywords_embeddings
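    # Expected `keywords` structure (hypothetical values), matching how the
    # __main__ block builds predicate_dict below:
    #   keywords = {
    #       "on":      [["on"], ["cat", "on", "mat"]],
    #       "holding": [["holding"], ["man", "holding", "cup"]],
    #   }
    # Each predicate maps to a list of word lists (the predicate itself plus the
    # triplets it occurs in); every list is averaged into one embedding.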
    # Pick the representative word of a cluster.
    def get_class_represent_word(self, collection):
        if len(collection['words']) == 0:
            return None
        if len(collection['words']) == 1:
            # A single word is trivially its own representative.
            return collection['words'][0], 1.0
        # Average the cluster's embeddings, then return the word whose embedding
        # is most similar (by cosine) to that mean.
        mean_w2v = torch.mean(torch.stack(collection['words_w2v']), dim=0)
        sim_list = [torch.cosine_similarity(key_w2v, mean_w2v, dim=1).item() for key_w2v in collection['words_w2v']]
        max_sim = max(sim_list)
        max_sim_index = sim_list.index(max_sim)
        represent_word = collection["words"][max_sim_index]
        return represent_word, max_sim
    # Filter noisy words out of a cluster by their similarity to the rest of it.
    def filt_noise_words(self, collection):
        delete_noise_opinion_words_indexes = []
        # Stack the per-word embeddings into one matrix and score each word by
        # its mean dot product with every word in the cluster.
        mat = torch.cat(collection['words_w2v'], dim=0)
        scores = torch.matmul(mat, mat.T).mean(dim=1).tolist()
        # Drop words whose score falls below the deletion threshold.
        for i in range(len(scores)):
            if scores[i] <= self.del_sim_threshold:
                delete_noise_opinion_words_indexes.append(i)
        collection['words_w2v'] = [x for (i, x) in enumerate(collection['words_w2v']) if
                                   i not in delete_noise_opinion_words_indexes]
        collection['words'] = [x for (i, x) in enumerate(collection['words']) if
                               i not in delete_noise_opinion_words_indexes]
    # t-SNE visualization of the word embeddings.
    def t_sne_kmeans(self, words_w2v_embeddings):
        vecArr = np.array(words_w2v_embeddings)
        tsneData = TSNE().fit_transform(vecArr)
        # Plot the 2-D projection.
        plt.figure(figsize=(10, 10))
        ax = plt.subplot(aspect='equal')
        ax.scatter(tsneData[:, 0], tsneData[:, 1])
        plt.xlim(-50, 50)
        plt.ylim(-50, 50)
        ax.axis('off')
        ax.axis('tight')
        plt.savefig('tsne.png')
    # Compute the Euclidean distance from every sample to every centroid.
    def calcDis(self, dataSet, centroids, k):
        clalist = []
        for data in dataSet:
            # np.tile(data, (k, 1)) repeats the sample k times row-wise, e.g.
            # np.tile([0, 1, 2], (2, 1)) -> [[0, 1, 2], [0, 1, 2]], so one
            # subtraction covers all centroids at once.
            diff = np.tile(data, (k, 1)) - centroids
            squaredDiff = diff ** 2
            squaredDist = np.sum(squaredDiff, axis=1)  # sum over the feature axis
            distance = squaredDist ** 0.5
            clalist.append(distance)
        clalist = np.array(clalist)  # len(dataSet) x k array of sample-to-centroid distances
        return clalist
    # One k-means step: reassign samples and recompute the centroids.
    def classify(self, dataSet, centroids, k):
        # Distance from every sample to every centroid.
        clalist = self.calcDis(dataSet, centroids, k)
        # Group the samples by their nearest centroid and average each group.
        minDistIndices = np.argmin(clalist, axis=1)  # index of the nearest centroid per row
        newCentroids = pd.DataFrame(dataSet).groupby(minDistIndices).mean()
        newCentroids = newCentroids.values
        # How far the centroids moved in this step.
        changed = newCentroids - centroids
        return changed, newCentroids
def cluster(self, keywords, num_clusters, sim_threshold=None):
        '''
        Cluster the keywords with k-means over their embeddings.
        :param keywords: dict mapping each predicate to its list of triplet word lists
        :param num_clusters: number of k-means clusters
        :param sim_threshold: optional override of the instance-level threshold
        :return: dict mapping each cluster id to its representative word, its similarity, and its member words
        '''
result = []
if sim_threshold is None:
sim_threshold = self.sim_threshold
# we collect all keywords embedding in triplet-level, and get the mean of all relevant triplets as the embedding
all_keyfeatures = self.initial_embedding_info(keywords)
st = StandardScaler()
# all_keyfeatures = st.fit_transform(all_keyfeatures.numpy())
sk_kmeans = KMeans(n_clusters=num_clusters)
result_list = sk_kmeans.fit(all_keyfeatures)
centroids = result_list.cluster_centers_
closest_centroids_ids = result_list.labels_
# centroids, closest_centroids_ids = self.train(all_keyfeatures.numpy(), num_clusters, max_iterations=10)
# find the represented word
        cluster_dict = dict()
        for i, centroid in enumerate(centroids):
            similarity = -1.0
            centroid_label = None
            for k in self.words_w2v_dic:
                cur_similarity = torch.cosine_similarity(torch.tensor(centroid), self.words_w2v_dic[k], dim=-1).item()
                if cur_similarity > similarity:
                    similarity = cur_similarity
                    centroid_label = k
            cluster_dict[str(i)] = dict()
            cluster_dict[str(i)]["represent_word"] = centroid_label
            cluster_dict[str(i)]["words"] = []
            cluster_dict[str(i)]["represent_word_sim"] = similarity
for m, k in enumerate(self.words_w2v_dic):
if closest_centroids_ids[m] == i:
cluster_dict[str(i)]["words"].append(k)
return cluster_dict
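    # Shape of the returned dict (illustrative, hypothetical values):
    #   {"0": {"represent_word": "on",
    #          "represent_word_sim": 0.91,
    #          "words": ["on", "sitting on", "standing on"]}, ...}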
    # NOTE: this method targets an older cluster() interface that took a list of
    # seed collections and returned (collections, un_seg_words); it is not called
    # by the current pipeline and is kept for reference.
    def upgrade_cluster(self, keywords, collection_words_list=[], sim_threshold=None):
        if not sim_threshold:
            sim_threshold = self.sim_threshold
        # Progressively lower the similarity threshold from 0.7 downwards.
        for i in range(7, min(1, int(sim_threshold * 10) - 3), -1):
            collections, un_seg_words = self.cluster(keywords, collection_words_list, i * 0.1)
            keywords = un_seg_words
            collection_words_list = [x['words'] for x in collections]
        return collections, un_seg_words
    def sim(self, w1, w2):
        # Similarity between two words via their mean input embeddings; unknown
        # words (recorded in self.unexist) score -1.
        if w1 in self.unexist or w2 in self.unexist:
            return -1
        w1_w2v = self.get_embedding([w1])
        w2_w2v = self.get_embedding([w2])
        return torch.dot(w1_w2v, w2_w2v).item()
    def get_embedding(self, triplet):
        # Mean input embedding over all words in the triplet.
        total_embedding = []
        for w in triplet:
            input_ids = self.tokenizer.encode(w, return_tensors="pt", add_special_tokens=False)
            embedding_token = self.embedding_model(input_ids)
            word_embedding = torch.mean(embedding_token, dim=1)  # average over sub-word tokens
            total_embedding.append(word_embedding)
        total_embedding = torch.cat(total_embedding, dim=0)
        total_embedding = torch.mean(total_embedding, dim=0)
        return total_embedding
    def train(self, data, num_clusters, max_iterations):
        # 1. Initialize the centroids by sampling rows of the data.
        centroids = self.centroids_init(data, num_clusters)
        # 2. Iterate: assign every sample to its nearest centroid, then move each
        #    centroid to the mean of its assigned samples.
        num_examples = data.shape[0]
        closest_centroids_ids = np.empty((num_examples, 1))
        for i in range(max_iterations):
            closest_centroids_ids = self.centroids_find_closest(data, centroids)
            centroids = self.centroids_compute(data, closest_centroids_ids, num_clusters)
        return centroids, closest_centroids_ids
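    # Illustrative call of the hand-rolled k-means above, mirroring the
    # commented-out alternative in cluster():
    #   centroids, ids = self.train(all_keyfeatures.numpy(), num_clusters, max_iterations=10)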
def centroids_init(self, data, num_clusters):
num_examples=data.shape[0]
random_ids=np.random.permutation(num_examples) # shuffle the id and select the random centroids
centroids=data[random_ids[:num_clusters],:]
return centroids
def centroids_find_closest(self,data,centroids):
num_examples = data.shape[0]
num_centroids = centroids.shape[0]
closest_centroids_ids = np.zeros((num_examples,1))
        for example_index in range(num_examples):
distance = np.zeros((num_centroids,1))
for centroid_index in range(num_centroids):
distance_diff = data[example_index,:] - centroids[centroid_index,:]
distance[centroid_index] = np.sum(distance_diff**2)
closest_centroids_ids[example_index] = np.argmin(distance)
return closest_centroids_ids
def centroids_compute(self, data, closest_centroids_ids, num_clusters):
num_features = data.shape[1]
centroids = np.zeros((num_clusters,num_features))
        for centroid_id in range(num_clusters):
closest_ids = np.where(closest_centroids_ids == centroid_id)[0]
centroids[centroid_id] = np.mean(data[closest_ids,:],axis=0)
return centroids
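    # Note: if a centroid loses all of its samples, np.mean over the empty slice
    # yields NaN for that row; the sklearn KMeans path used in cluster() handles
    # this case internally.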
    def evaluate_func(self, keywords):
        # Sweep k and record SSE (elbow method), silhouette, and Calinski-Harabasz scores.
        all_keyfeatures = self.initial_embedding_info(keywords)
        ls_k = range(10, 562, 10)
        ls_sil = []
        ls_ch = []
        ls_elbows = []
        ls_gs = []
        st = StandardScaler()
        all_keyfeatures = st.fit_transform(all_keyfeatures.numpy())
        for i in ls_k:
            ls_elbow = []
            sk_kmeans = KMeans(n_clusters=i)
            result_list = sk_kmeans.fit(all_keyfeatures)
            res2 = result_list.cluster_centers_
            res1 = result_list.labels_
            # ls_gs.append(self.gap(all_keyfeatures, i))  # optional gap statistic
            for j in range(len(res1)):
                choose_label = res2[int(res1[j]), :]
                sse = SSE_clu(all_keyfeatures[j, :], choose_label)  # elbow method
                ls_elbow.append(sse)
            ls_sil.append(metrics.silhouette_score(all_keyfeatures, res1))  # silhouette coefficient
            ls_ch.append(metrics.calinski_harabasz_score(all_keyfeatures, res1))  # Calinski-Harabasz score
            ls_elbows.append(sum(ls_elbow))
        return ls_elbows, ls_sil, ls_ch, ls_gs
def sum_distance(self, data, k):
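        # Within-cluster dispersion: the total distance of every sample to the
        # centre of the cluster k-means assigned it to.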
model = KMeans(n_clusters=k)
result_list = model.fit(data)
res1 = result_list.labels_
res2 = result_list.cluster_centers_
disp = 0
for m in range(data.shape[0]):
disp += np.linalg.norm(data[m] - res2[res1[m]], axis=0)
return disp
    def gap(self, data, k):
        # Gap statistic: compare the within-cluster dispersion of the data with
        # that of a uniform reference sample drawn from the data's bounding box.
        shape = data.shape
        tops = data.max(axis=0)
        bots = data.min(axis=0)
        dists = np.diag(tops - bots)
        rands = np.random.random_sample(size=(shape[0], shape[1]))
        rands = rands @ dists + bots
        disp = self.sum_distance(data, k)
        refdisps = self.sum_distance(rands, k)
        gap = np.log(np.mean(refdisps)) - np.log(disp)
        return gap
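    # In formula form (Tibshirani et al.): Gap(k) = E*[log(W_k^ref)] - log(W_k),
    # where W_k is the within-cluster dispersion; a single reference draw stands
    # in for the expectation E* here.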
    def monte_carlo(self, keywords, epochs=10):
        # 56 columns = len(range(10, 562, 10)), the k values swept by evaluate_func.
        matx_elbows = np.mat(np.zeros((epochs, 56)))
        matx_sil = np.mat(np.zeros((epochs, 56)))
        matx_ch = np.mat(np.zeros((epochs, 56)))
        matx_gs = np.mat(np.zeros((epochs, 56)))
for i in range(epochs):
Repoch = self.evaluate_func(keywords)
matx_elbows[i, :] = Repoch[0]
matx_sil[i, :] = Repoch[1]
matx_ch[i, :] = Repoch[2]
# matx_gs[i, :] = Repoch[3]
mean_elbows = matx_elbows.sum(axis=0) / epochs
mean_sil = matx_sil.sum(axis=0) / epochs
mean_ch = matx_ch.sum(axis=0) / epochs
# matx_gs = matx_gs.sum(axis=0) / epochs
        # mean_ch = StandardScaler().fit_transform(mean_ch)  # optional CH normalization
        # mean_ch = mean_ch / max(mean_ch.tolist()[0])
        print('SSE', mean_elbows.tolist()[0])
        print('Silhouette coefficient', mean_sil.tolist()[0])
        print('Norm CH score', mean_ch.tolist()[0])
        # print('Gap Statistic', matx_gs.tolist()[0])
        fig = plt.figure(figsize=(15, 8))
        ax1 = fig.add_subplot(1, 1, 1)
        ax2 = ax1.twinx()
        X = range(10, 562, 10)  # must match ls_k in evaluate_func
ax1.plot(X, mean_elbows.tolist()[0], marker='o', label='Elbow')
ax2.plot(X, mean_sil.tolist()[0], 'r', marker='*', label='Silhouette Coefficient')
# ax2.plot(X, mean_ch.tolist()[0], 'g', marker='*', label='CH norm')
# ax2.bar(X, matx_gs.tolist()[0], label='Gap Statistic')
ax1.set_ylabel('SSE', fontsize=20)
ax1.set_xlabel('K', fontsize=20)
ax2.set_ylabel('Value', fontsize=20)
ax1.tick_params(labelsize=20)
ax2.tick_params(labelsize=20)
ax1.legend(loc='lower left', fontsize=20)
ax2.legend(loc='upper right',fontsize=20)
# plt.show()
plt.savefig('centroids_4.png')
if __name__ == '__main__':
vg_dataset = fineTuningDataset('datasets/image_caption_triplet_all.json',"/home/qifan/datasets/coco/train2014/",'train')
# train_dataset = fineTuningDataset('gqa_triplets.json',"/home/qifan/datasets/GQA/images/",'train')
data_loader = DataLoader(vg_dataset, batch_size=8, shuffle=True)
predicate_words = vg_dataset.predicates_words
predicate_dict = dict()
for p in predicate_words:
predicate_dict[p] = [[p]]
for triplet_info in vg_dataset.triplets:
triplet = triplet_info['triplet']
predicate_dict[triplet[1].lower()].append(triplet)
prep_words = []
ignore_words = []
kmeans = WordsCluster('/home/qifan/FG-SGG_from_LM/bert-base-uncased', ignore_keywords=ignore_words, prep_words=prep_words)
    # kmeans.monte_carlo(predicate_dict)
    # use sim_threshold to initialize the number of clusters for all classes:
    # cluster_dict = kmeans.cluster(predicate_dict, num_clusters=230, sim_threshold=0.7)
    # json.dump(cluster_dict, open('utils_data/cluster/CaCao_all_cluster_dict_07.json', 'w'))
    # use sim_threshold to initialize the number of clusters for the target classes:
    cluster_dict = kmeans.cluster(predicate_dict, num_clusters=39, sim_threshold=0.7)
    with open('utils_data/cluster/CaCao_map50_dict_07.json', 'w') as f:
        json.dump(cluster_dict, f)