-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
202 lines (183 loc) · 12.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from functions.my_LLE import My_LLE
from functions.my_GLLE import My_GLLE
from functions.my_GLLE_DirectSampling import My_GLLE_DirectSampling
import functions.load_datasets as load_datasets
import functions.utils as utils
from sklearn import manifold, datasets
import numpy as np
from sklearn.preprocessing import StandardScaler
from numpy import genfromtxt
import json
# Load the run configuration once, at import time; main() reads this
# module-level dict. The path is relative to the current working directory.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open('settings.json', encoding='utf-8') as json_file:
    settings = json.load(json_file)
# ##################################### options for settings in the json file #####################################
# ================ method:
# LLE_ready ---> it is the ready-made LLE of scikit-learn (manifold.locally_linear_embedding)
# LLE ---> it is the original deterministic LLE
# GLLE ---> it is GLLE with EM algorithm
# GLLE_DirectSampling ---> it is GLLE with direct sampling
# ================ dataset:
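# Swiss_roll, Swiss_roll_hole, S_curve, Sphere, Sphere_small ---> these are ready toy datasets
# digits, MNIST, ORL_glasses ---> these are labeled image datasets (their embeddings are plotted with the labels)
# User_data ---> the user can put their dataset as a csv file named "data.csv" (row-wise data), with optional "color.csv" and "labels.csv" files, in the "datasets/User_data/" folder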
# ================ make_dataset_again:
# True ---> generate the ready toy datasets again (possibly with new data settings)
# False ---> load the previously generated toy dataset (it will throw an error if you have not generated the dataset before)
# ================ embed_again:
# True ---> train the embedding (unfolding) again
# False ---> do not train again; load the result of the previous training phase instead. This is useful when the user wants to generate several unfoldings without training again
# ================ generate_embedding_again:
# True ---> generate [multiple] unfoldings (embeddings)
# False ---> do not generate unfoldings (embeddings)
# ================ analyze_covariance_scales:
# True ---> generate unfoldings for various scales of the covariance matrix, for the sake of analysis
# False ---> do not generate unfoldings for various scales of the covariance matrix
# ================ n_generation_of_embedding:
# A positive integer ---> it is the number of unfoldings (embeddings) to generate
# ================ max_iterations:
# A positive integer ---> maximum number of iterations for EM algorithm in stochastic linear reconstruction of GLLE
# ================ n_components:
# A positive integer (between 1 and dimensionality of data) ---> the dimensionality of unfolding (embedding)
# ================ verbosity:
# 0 ---> do not print logging information
# 1 ---> print logging information of level one
# 2 ---> print logging information of levels one and two
# Datasets that come with a per-point color and are plotted as 3D point clouds.
_TOY_DATASETS = ("Swiss_roll", "Swiss_roll_hole", "S_curve", "Sphere", "Sphere_small")
# Datasets that come with class labels instead of colors.
_LABELED_DATASETS = ("digits", "MNIST", "ORL_glasses")
# Methods whose fitted model can re-sample an embedding via generate_again().
_STOCHASTIC_METHODS = ("GLLE", "GLLE_DirectSampling")
_KNOWN_METHODS = ("LLE_ready", "LLE") + _STOCHASTIC_METHODS


def _as_bool(value):
    """Interpret a settings flag as a boolean.

    The settings file stores flags as the strings "True"/"False". A native
    JSON boolean is accepted as well, so that ``true`` is not silently read
    as False. Any other value is treated as False.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() == "true"


def _load_user_data():
    """Read the user-provided CSV files from ``datasets/User_data/``.

    Returns:
        (X, color, labels). ``color`` and ``labels`` are None when their
        file cannot be read.

    Raises:
        ValueError: if ``data.csv`` cannot be read; the original error is
            chained as the cause.
    """
    try:
        X = genfromtxt("datasets/User_data/data.csv", delimiter=',')
    except Exception as ex:
        raise ValueError("There is no any user data in the dataset folder!") from ex
    # The color and label files are optional: any read failure means "absent".
    try:
        color = genfromtxt("datasets/User_data/color.csv", delimiter=',')
    except Exception:
        color = None
    try:
        labels = genfromtxt("datasets/User_data/labels.csv", delimiter=',')
    except Exception:
        labels = None
    return X, color, labels


def _generate_dataset(dataset):
    """Create (or read from its raw source) the named dataset and save it.

    Saves ``X`` and, when present, ``color`` / ``labels`` (and the raw digits
    object) under ``./datasets/<dataset>/``. Toy datasets are also plotted
    in 3D.

    Returns:
        (X, color, labels), with ``color`` or ``labels`` None when the
        dataset does not provide them.

    Raises:
        ValueError: if ``dataset`` is not one of the names handled here.
    """
    dataset_dir = './datasets/' + dataset + "/"
    color, labels, digits = None, None, None
    if dataset == "Swiss_roll":
        X, color = datasets.make_swiss_roll(n_samples=5000)
    elif dataset == "Swiss_roll_hole":
        X, color = load_datasets.make_swiss_roll_with_hole(n_samples=4950)
    elif dataset == "S_curve":
        X, color = datasets.make_s_curve(n_samples=5000, random_state=0)
    elif dataset == "Sphere":
        X, color = load_datasets.make_sphere_dataset(n_samples=5000, severed_poles=True)
    elif dataset == "Sphere_small":
        X, color = load_datasets.make_sphere_dataset(n_samples=1000, severed_poles=True)
    elif dataset == "digits":
        # https://scikit-learn.org/stable/datasets/toy_dataset.html#digits-dataset
        # https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
        # https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits
        # https://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html
        digits = datasets.load_digits(n_class=6)
        X = digits.data
        labels = digits.target
    elif dataset == "MNIST":
        _, labels, X = load_datasets.read_MNIST_dataset(MNIST_subset_cardinality_training=100, read_dataset_again=True)
        X = X.T  # --> make it row-wise
    elif dataset == "ORL_glasses":
        X, labels = load_datasets.read_ORL_glasses_dataset(scale=0.5)
        X = X.T  # --> make it row-wise
    else:
        # Previously this fell through and failed later with an unbound X.
        raise ValueError("Unknown dataset in settings: " + str(dataset))

    if dataset in _TOY_DATASETS:
        utils.plot_3D(X, color, path_to_save=dataset_dir, name="dataset")
    utils.save_variable(variable=X, name_of_variable="X", path_to_save=dataset_dir)
    if color is not None:
        utils.save_variable(variable=color, name_of_variable="color", path_to_save=dataset_dir)
    if labels is not None:
        utils.save_variable(variable=labels, name_of_variable="labels", path_to_save=dataset_dir)
    if digits is not None:
        # The raw digits object is persisted too, although this script does
        # not read it back.
        utils.save_variable(variable=digits, name_of_variable="digits", path_to_save=dataset_dir)
    return X, color, labels


def _load_saved_dataset(dataset):
    """Load a dataset previously saved under ``./datasets/<dataset>/``.

    ``X`` is required (a failure to load it propagates). ``color`` and
    ``labels`` are optional and become None when they cannot be loaded.

    Returns:
        (X, color, labels)
    """
    dataset_dir = './datasets/' + dataset + "/"
    X = utils.load_variable(name_of_variable="X", path=dataset_dir)
    try:
        color = utils.load_variable(name_of_variable="color", path=dataset_dir)
        # Plotting stays inside the try on purpose: as before, a failure
        # here is treated as "no color available" rather than aborting.
        utils.plot_3D(X, color, path_to_save=dataset_dir, name="dataset")
    except Exception:
        color = None
    try:
        labels = utils.load_variable(name_of_variable="labels", path=dataset_dir)
    except Exception:
        labels = None
    return X, color, labels


def main():
    """Run the pipeline described by the module-level ``settings`` dict.

    Steps:
      1. Read the options from ``settings``.
      2. Obtain the dataset: user CSV files, a freshly generated dataset, or
         a previously saved one.
      3. Compute the embedding with the selected method.
      4. Plot the embedding.
      5. For GLLE / GLLE_DirectSampling only, optionally generate further
         embeddings and/or embeddings for several covariance scales, saving
         each one.

    Outputs are written under ``./datasets/<dataset>/`` and
    ``./saved_files/<method>/<dataset>/``.

    Raises:
        ValueError: for an unknown method or dataset name, or when the user
            data file cannot be read.
    """
    ##################################### loading settings #####################################
    method = settings["method"]
    dataset = settings["dataset"]
    make_dataset_again = _as_bool(settings["make_dataset_again"])
    embed_again = _as_bool(settings["embed_again"])
    generate_embedding_again = _as_bool(settings["generate_embedding_again"])
    analyze_covariance_scales = _as_bool(settings["analyze_covariance_scales"])
    n_generation_of_embedding = settings["n_generation_of_embedding"]
    max_iterations = settings["max_iterations"]
    n_components = settings["n_components"]
    verbosity = settings["verbosity"]

    # Fail fast, before any dataset is generated or loaded.
    if method not in _KNOWN_METHODS:
        raise ValueError("Unknown method in settings: " + str(method))

    ##################################### loading or generating dataset #####################################
    if dataset == "User_data":
        # User data is always read from its CSV files; there is nothing to
        # regenerate, so make_dataset_again does not apply.
        X, color, labels = _load_user_data()
    elif make_dataset_again:
        X, color, labels = _generate_dataset(dataset)
    else:
        X, color, labels = _load_saved_dataset(dataset)

    ##################################### training the GLLE method #####################################
    save_dir = "./saved_files/" + method + "/" + dataset + "/"
    model = None  # stays None for "LLE_ready", which returns the embedding directly
    if method == "LLE_ready":
        # https://scikit-learn.org/stable/auto_examples/manifold/plot_swissroll.html
        # https://scikit-learn.org/stable/modules/generated/sklearn.manifold.locally_linear_embedding.html#sklearn.manifold.locally_linear_embedding
        Y, _ = manifold.locally_linear_embedding(X, n_neighbors=10, n_components=n_components, eigen_solver="dense")
    else:
        # The project models are given X.T and their result is transposed
        # back (presumably they work on column-wise data -- confirm in
        # functions/).
        if method == "LLE":
            model = My_LLE(X.T, n_neighbors=10, n_components=n_components, path_save=save_dir, verbosity=verbosity)
        elif method == "GLLE":
            model = My_GLLE(X.T, n_neighbors=10, n_components=n_components, path_save=save_dir, max_itr_reconstruction=max_iterations, verbosity=verbosity)
        else:  # "GLLE_DirectSampling"
            model = My_GLLE_DirectSampling(X.T, n_neighbors=10, n_components=n_components, path_save=save_dir, verbosity=verbosity)
        Y = model.fit_transform(calculate_again=embed_again).T

    ##################################### plot the trained unfolding #####################################
    if dataset in _TOY_DATASETS or dataset == "User_data":
        utils.plot_2D(Y, color, path_to_save=save_dir, name="embedding")
    elif dataset in _LABELED_DATASETS:
        utils.plot_embedding_with_labels(Y, labels, path_to_save=save_dir, name="embedding_numbers")
        utils.plot_2D_with_labels(Y, labels, path_to_save=save_dir, name="embedding")

    is_stochastic = method in _STOCHASTIC_METHODS

    ##################################### generating unfoldings #####################################
    if is_stochastic and generate_embedding_again:
        generation_dir = save_dir + "generation/"
        for itr in range(n_generation_of_embedding):
            X_transformed = model.generate_again()
            Y = X_transformed.T
            if color is not None:
                utils.plot_2D(Y, color, path_to_save=generation_dir, name="embedding_gen" + str(itr))
            if labels is not None:
                utils.plot_embedding_with_labels(Y, labels, path_to_save=generation_dir, name="embedding_numbers_gen" + str(itr))
                utils.plot_2D_with_labels(Y, labels, path_to_save=generation_dir, name="embedding_gen" + str(itr))
            utils.save_variable(variable=X_transformed, name_of_variable="X_transformed", path_to_save=generation_dir + "gen" + str(itr) + "/")

    ##################################### analyzing the impact of covariance scales #####################################
    if is_stochastic and analyze_covariance_scales:
        interpolation_dir = save_dir + "interpolation/"
        grid_ = [0.01, 0.1, 1, 5, 10]
        for itr, sigma_i_multiplication in enumerate(grid_):
            # The covariance attribute is re-read on every iteration, as in
            # the original code, in case generate_again() updates it.
            if method == "GLLE":
                base_covariance = model.Sigma_linearReconstruction
            else:  # "GLLE_DirectSampling"
                base_covariance = model.Cov_weights_linearReconstruction
            Sigma_linearReconstruction = base_covariance[:, :, :] * sigma_i_multiplication
            X_transformed = model.generate_again(Sigma_linearReconstruction)
            Y = X_transformed.T
            utils.plot_2D(Y, color, path_to_save=interpolation_dir, name="embedding_gen" + str(itr), title="sigma_multipler = " + str(sigma_i_multiplication))
            utils.save_variable(variable=X_transformed, name_of_variable="X_transformed", path_to_save=interpolation_dir + "itr" + str(itr) + "/")
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()