-
Notifications
You must be signed in to change notification settings - Fork 0
/
util_visualization.py
83 lines (70 loc) · 3.35 KB
/
util_visualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gc
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import re
import matplotlib.pyplot as plt
def get_dataset_labels(df, columns=None):
    """Derive per-class soft-label columns and return the selected columns.

    Three columns are added to ``df`` in place:

    * ``soft_label_1`` -- the value stored under key ``'1'`` of each
      ``soft_label`` entry.
    * ``soft_label_0`` -- the value stored under key ``'0'`` of each
      ``soft_label`` entry.
    * ``disagreement`` -- ``1`` when ``soft_label_0`` equals 0 or 1,
      otherwise ``0``.

    Args:
        df: DataFrame with a ``soft_label`` column whose entries can be
            indexed with the keys ``'0'`` and ``'1'``.
        columns: Column selection applied to ``df`` for the return value.
            When omitted, ``text``, ``hard_label``, ``soft_label_0``,
            ``soft_label_1`` and ``disagreement`` are returned.

    Returns:
        ``df[columns]``.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is shared between calls.
    if columns is None:
        columns = ['text', 'hard_label', 'soft_label_0', 'soft_label_1', 'disagreement']

    df['soft_label_1'] = df['soft_label'].apply(lambda x: x['1'])
    df['soft_label_0'] = df['soft_label'].apply(lambda x: x['0'])
    # NOTE(review): this flag is 1 when the soft label is unanimous (0 or 1),
    # which reads as the opposite of "disagreement" -- confirm the intended
    # polarity with the callers before relying on the name.
    df['disagreement'] = df['soft_label_0'].apply(lambda x: int(x == 0 or x == 1))
    return df[columns]
# Matches a parenthesised span such as "(aside)". The original pattern had the
# raw-string prefix typed inside the quotes, so it never matched such spans.
_PARENTHESIZED = re.compile(r'\([^)]*\)')


def _clean_text(text):
    """Return one tweet string with markup and punctuation stripped, lower-cased."""
    # The order of the steps matters: the multi-character markers contain
    # characters ('"', ':', '_') that later steps delete one by one.
    for marker, replacement in (
        ("<user>", ""),
        ('prev_user', '.'),
        ('"prev_agent":', '.'),
        ('"agent":', '.'),
        ('"user":', '.'),
    ):
        text = text.replace(marker, replacement)

    for char in '"={}:,]':
        text = text.replace(char, '')

    text = _PARENTHESIZED.sub('', text)

    for token in ("-", "_", "RT", "<url>"):
        text = text.replace(token, '')

    for token in ('...', '\n', '&'):
        text = text.replace(token, " ")

    return text.strip().lower()


def clean_tweet(df):
    """Clean every entry of the ``text`` column of ``df`` in place.

    Each text has the ``<user>`` / ``<url>`` placeholders, the speaker
    markers (``prev_user``, ``"prev_agent":``, ``"agent":``, ``"user":``),
    a set of punctuation characters, parenthesised spans and ``RT`` removed
    or replaced; ellipses, newlines and ``&`` become spaces; the result is
    stripped and lower-cased.

    The whole column is rewritten in a single assignment, so the function
    works for any index (the previous loop assumed labels ``1..len(df)``)
    and does not rely on chained assignment.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The same ``df`` object, with ``text`` replaced by the cleaned values.
    """
    df["text"] = df["text"].map(_clean_text)
    return df
def create_labels(labels):
    """Map each label value to a plotting colour.

    Args:
        labels: Object exposing ``items()`` that yields ``(key, value)``
            pairs; only the value of each pair is inspected.

    Returns:
        A list with ``'blue'`` for every value equal to 0 and ``'red'`` for
        every other value, in iteration order.
    """
    return ['blue' if value == 0 else 'red' for _, value in labels.items()]
def plot_tsne_pca(title, word_embedding_train, word_embeddings_test, color_train, color_test):
    """Project train and test embeddings to 2-D with PCA + t-SNE and plot them.

    Each split is reduced independently: a 10-component PCA is fitted on the
    split, then a 2-component t-SNE is fitted on the PCA output. The two
    projections are drawn side by side as scatter plots and the figure is
    shown with ``plt.show()``.

    Args:
        title: Figure title; also appended to the ``'Train '`` / ``'Test '``
            panel titles.
        word_embedding_train: Embeddings of the training split (presumably
            taken from the model's last layer, per the original comment --
            TODO confirm).
        word_embeddings_test: Embeddings of the test split.
        color_train: Colours passed as ``c`` to the train scatter plot.
        color_test: Colours passed as ``c`` to the test scatter plot.

    Returns:
        Tuple ``(hidden_states, hidden_states_test)`` holding the 2-D t-SNE
        projections of the train and test splits.
    """
    # The t-SNE perplexity of each split is 5% of its sample count, rounded.
    train_perplexity = round((len(word_embedding_train) / 100) * 5)
    test_perplexity = round((len(word_embeddings_test) / 100) * 5)

    # PCA is fitted separately on each split before t-SNE.
    train_reduced = PCA(n_components=10, random_state=42).fit_transform(word_embedding_train)
    test_reduced = PCA(n_components=10, random_state=42).fit_transform(word_embeddings_test)

    # NOTE: a UMAP projection was previously tried here in place of t-SNE.
    hidden_states = TSNE(
        n_components=2,
        perplexity=train_perplexity,
        random_state=42,
    ).fit_transform(train_reduced)
    # Only the test projection overrides the iteration count.
    hidden_states_test = TSNE(
        n_components=2,
        perplexity=test_perplexity,
        random_state=42,
        n_iter=3000,
    ).fit_transform(test_reduced)

    fig, axes = plt.subplots(1, 2, figsize=(50, 30))
    fig.suptitle(title)
    panels = (
        ('Train ', hidden_states, color_train),
        ('Test ', hidden_states_test, color_test),
    )
    for axis, (prefix, points, colors) in zip(axes, panels):
        axis.scatter(points[:, 0], points[:, 1], c=colors)
        axis.title.set_text(prefix + title)
    plt.show()

    gc.collect()
    return hidden_states, hidden_states_test