diff --git a/ChangePDB.py b/ChangePDB.py
new file mode 100755
index 0000000..2adf933
--- /dev/null
+++ b/ChangePDB.py
@@ -0,0 +1,252 @@
+import argparse
+import sys
+import numpy as np
+import re
+import matplotlib.pyplot as plotter
+import os
+
+parser = argparse.ArgumentParser(epilog='Change PDB files, e.g. the beta field. Annie Westerlund 2017.');
+parser.add_argument('-top','--topology_file',help='Input topology file (.gro, .pdb, etc.)',type=str,default='');
+parser.add_argument('-beta','--change_beta',help='Flag for writing beta of residues (optional).',action='store_true');
+parser.add_argument('-hel_beta','--change_helix_beta',help='Flag for writing beta of helices (optional).',action='store_true');
+parser.add_argument('-rename','--rename_atoms',help='Flag for renaming residues from CHARMM-GUI to Gromacs (optional).',action='store_true');
+
+parser.add_argument('-renumber','--renumber_residues',help='Flag for renumbering residues from sequential to chain numbering. Requires the number of chains, or the first resID, to be set (optional).',action='store_true');
+parser.add_argument('-n_chains','--number_of_chains',help='Number of chains to use when renumbering the residues.',default=1);
+parser.add_argument('-first_resid','--first_resID',help='Start resID in the structure.',default=1);
+
+parser.add_argument('-f','--in_file',help='Input file.',default='');
+parser.add_argument('-f_conv','--helix_ind_conv_file',help='File containing residue numbers of helices (optional).',default=0);
+parser.add_argument('-fe','--file_end_name',type=str,help='Output file end name (optional).',default='');
+parser.add_argument('-od','--out_directory',type=str,help='The directory where data should be saved (optional).',default='');
+parser.add_argument('-o','--out_file',help='Output file name (optional).',default='structure');
+
+args = parser.parse_args();
+
+# Put / at the end of the out directory if not present. Check that the folder exists, otherwise create it.
+if args.out_directory != '':
+    if args.out_directory[-1] != '/':
+        args.out_directory += '/';
+
+    if not os.path.exists(args.out_directory):
+        os.makedirs(args.out_directory);
+
+# Change residue names
+if args.rename_atoms:
+    print('Changing residue names to match Gromacs names');
+    fID1 = open(args.topology_file,'r');
+    fID2 = open(args.out_directory + args.out_file + '_gmx_' + args.file_end_name + '.pdb','w');
+    first_line = True;
+
+    for line in fID1:
+        if line[0:3] == 'END':
+            continue;
+
+        if first_line:
+            first_line = False;
+            fID2.write(line);
+        else:
+            atom = line[13:17];
+            residue = line[17:21];
+
+            # Change the names of atoms and residues (both fields are 4 characters wide)
+            if atom == 'CL  ':
+                atom = 'CLA ';
+                residue = 'CLA ';
+            elif atom == 'K   ':
+                atom = 'POT ';
+                residue = 'POT ';
+            elif atom == 'OH2 ':
+                atom = 'OW  ';
+                residue = 'SOL ';
+            elif atom == 'H1  ':
+                atom = 'HW1 ';
+                residue = 'SOL ';
+            elif atom == 'H2  ':
+                atom = 'HW2 ';
+                residue = 'SOL ';
+
+            tmpLine = line[0:13] + atom + residue + line[21::];
+            fID2.write(tmpLine);
+
+    fID1.close();
+    fID2.close();
+
+
+# Change beta columns, general
+if args.change_beta:
+    print('Changing beta column ' + args.file_end_name);
+    values = np.loadtxt(args.in_file);
+    print('Max value: ' + str(np.max(values)));
+    print('Min value: ' + str(np.min(values)));
+
+
+    counter = 0;
+    resid_counter = -1;
+    current_resid = -1;
+    next_resid_write = -1;
+    fID1 = open(args.topology_file,'r');
+    fID2 = open(args.out_directory + args.out_file + '_beta_' + args.file_end_name + '.pdb','w');
+    first_line = True;
+
+    for line in fID1:
+        if line[0:3] == 'END':
+            continue;
+
+        if first_line:
+            first_line = False;
+            fID2.write(line);
+        else:
+            if line[0:3] == 'END' or line[0:3] == 'TER':
+                continue;
+            resid = int(line[22:26]);  # residue number, PDB columns 23-26
+
+            if resid != current_resid:
+                current_resid = resid;
+                resid_counter += 1;
+
+            if resid_counter < len(values):
+                fID2.write(line[0:60]);
+                fID2.write(' ');
+                fID2.write('%.3f' % values[resid_counter]);
+                fID2.write(line[66::]);
+            else:
+                fID2.write(line[0:60]);
+                fID2.write('  ');
+                fID2.write('0.00');
+                fID2.write(line[66::]);
+
+
+    fID1.close();
+    fID2.close();
+
+# Change beta column of helices
+if args.change_helix_beta:
+    print('Changing beta column for each helix');
+    helix_resid_convert_ind = np.loadtxt(args.helix_ind_conv_file);
+
+    values = np.loadtxt(args.in_file);
+    values -= np.min(values);
+    values /= np.max(values);
+    values = np.floor(1000*values)/100;
+
+    counter = 0;
+    resid_counter = -1;
+    current_resid = -1;
+    current_resid_min = helix_resid_convert_ind[counter,0];
+    current_resid_max = helix_resid_convert_ind[counter,1];
+    fID1 = open(args.topology_file,'r');
+    fID2 = open(args.out_directory + args.out_file + '_helix_beta_' + args.file_end_name + '.pdb','w');
+    first_line = True;
+
+    for line in fID1:
+        if line[0:3] == 'END' or line[0:3] == 'TER':
+            continue;
+
+        if first_line:
+            first_line = False;
+            fID2.write(line);
+        else:
+            resid = int(line[22:26]);
+            if resid != current_resid:
+                current_resid = resid;
+                resid_counter += 1;
+
+            if resid_counter >= current_resid_min and resid_counter <= current_resid_max:
+                fID2.write(line[0:62]);
+                fID2.write(str(values[counter]));
+                fID2.write(line[66::]);
+            else:
+                fID2.write(line[0:62]);
+                fID2.write('0.00');
+                fID2.write(line[66::]);
+
+            if resid_counter > current_resid_max:
+                counter += 1;
+                if counter < len(helix_resid_convert_ind[::,0]):
+                    current_resid_min = helix_resid_convert_ind[counter,0];
+                    current_resid_max = helix_resid_convert_ind[counter,1];
+                else:
+                    current_resid_min = -1;
+                    current_resid_max = -1;
+
+    fID1.close();
+    fID2.close();
+
+if args.renumber_residues:
+
+    chain_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+    n_chains = float(args.number_of_chains);
+    first_resid = int(args.first_resID);
+
+    fID1 = open(args.topology_file,'r');
+    fID2 = open(args.out_directory + args.out_file + '_renumbered_' + args.file_end_name + '.pdb','w');
+
+    lines = [];
+    # Collect all lines
+    for line in fID1:
+        lines.append(line);
+
+    # Count the number of residues (skip the header line)
+    counter = 0;
+    resid_counter = 0;
+    current_resid = -1;
+    first_line = True;
+
+    for i in range(1,len(lines)):
+        line = lines[i];
+
+        if line[0:3] == 'END' or line[0:3] == 'TER':
+            continue;
+
+        resid = int(line[22:26]);
+        if resid != current_resid:
+            current_resid = resid;
+            resid_counter += 1;
+
+    # Compute total number of residues per chain
+    nResiduesPerChain = np.floor(resid_counter/n_chains);
+
+    # Renumber residues
+    counter = 0;
+    resid_counter = 0;
+    current_resid = -1;
+    first_line = True;
+    new_resid = first_resid - 1;
+    chain_id = 0;
+
+    for i in range(0,len(lines)):
+        line = lines[i];
+        if line[0:3] == 'END' or line[0:3] == 'TER':
+            fID2.write(line);
+            continue;
+
+        if first_line:
+            first_line = False;
+            fID2.write(line);
+            continue;
+
+        resid = int(line[22:26]);
+        if resid != current_resid:
+            current_resid = resid;
+            resid_counter += 1;
+            new_resid += 1;
+            if resid_counter > nResiduesPerChain:
+                new_resid = first_resid;
+                resid_counter = 1;
+                chain_id += 1;
+                fID2.write('TER\n');
+
+        # Right-justify the new resID in columns 23-26 (chain ID in column 22)
+        if new_resid < 10:
+            fID2.write(line[0:21]+chain_letters[chain_id]+'   '+str(new_resid)+line[26::]);
+        elif new_resid < 100:
+            fID2.write(line[0:21]+chain_letters[chain_id]+'  '+str(new_resid)+line[26::]);
+        else:
+            fID2.write(line[0:21]+chain_letters[chain_id]+' '+str(new_resid)+line[26::]);
+
+    print(nResiduesPerChain)
+
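Note (not part of the patch): ChangePDB.py relies on the fixed-column PDB format, which is what the string slices above encode. A minimal sketch of the mapping, using a hypothetical ATOM record — Python slices are 0-based and end-exclusive, so 1-based PDB columns 61-66 become line[60:66]:

    # Hypothetical ATOM record, for illustration only
    line = "ATOM      1  OH2 TIP3    1      11.111  22.222  33.333  1.00  0.00"
    atom_name = line[13:17]    # atom-name field rewritten by -rename
    res_name = line[17:21]     # residue-name field rewritten by -rename
    res_id = int(line[22:26])  # residue sequence number used by -beta/-renumber
    beta = line[60:66]         # B-factor (beta) column rewritten by -beta
    print(atom_name, res_name, res_id, beta)
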
diff --git a/benchmarking/computing.py b/benchmarking/computing.py
index 312ae8b..e11909a 100644
--- a/benchmarking/computing.py
+++ b/benchmarking/computing.py
@@ -27,7 +27,8 @@ def compute(extractor_type,
             overwrite=False,
             accuracy_method='mse',
             displacement=1e-1,
-            visualize=True,
+            visualize=False,
+            natoms=100,
             noise_level=1e-2,  # [1e-2, 1e-2, 2e-1, 2e-1],
             output_dir="output/benchmarking/"):
     """
@@ -49,20 +50,24 @@ def compute(extractor_type,
                                                          n_iterations=n_iterations)
     n_extractors = len(extractor_names)
     for iter in range(iterations_per_model):
-        modeldir = "{output_dir}/{extractor_type}/{feature_type}/{test_model}/noise-{noise_level}/iter-{iter}/".format(
+        modeldir = "{output_dir}/{extractor_type}/{feature_type}/{test_model}/noise-{noise_level}/atoms-{natoms}/iter-{iter}/".format(
             output_dir=output_dir,
             extractor_type=extractor_type,
             feature_type=feature_type,
             test_model=test_model,
             noise_level=noise_level,
-            iter=iter)
+            natoms=natoms,
+            iter=iter
+        )
         finished_extractors = []
         for name in extractor_names:
-            if not overwrite and os.path.exists(modeldir):
+            if os.path.exists(modeldir):
                 filepath = "{}/{}/importance_per_residue.npy".format(modeldir, name)
                 existing_files = glob.glob(filepath)
-                if len(existing_files) > 0:
+                if overwrite and len(existing_files) > 0:
+                    logger.debug("File %s already exists. overwriting", existing_files[0])
+                elif len(existing_files) > 0:
                     logger.debug("File %s already exists. skipping computations", existing_files[0])
                     finished_extractors.append(name)
                 else:
@@ -70,10 +75,11 @@ def compute(extractor_type,
             else:
                 os.makedirs(modeldir)
         needs_computations = len(finished_extractors) < n_extractors
-        dg = DataGenerator(natoms=100,
+        atoms_per_cluster = max(int(natoms / 10 + 0.5), 1)
+        dg = DataGenerator(natoms=natoms,
                            nclusters=3,
-                           natoms_per_cluster=[10, 10, 10],
-                           nframes_per_cluster=1200 if needs_computations else 2,
+                           natoms_per_cluster=[atoms_per_cluster, atoms_per_cluster, atoms_per_cluster],
+                           # Faster generation for postprocessing purposes when we don't need the frames
+                           nframes_per_cluster=12 * natoms if needs_computations else 2,
                            test_model=test_model,
                            noise_natoms=None,
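For reference, the new natoms parameter scales the toy system as follows — a quick trace of the two expressions introduced above, nothing more:

    for natoms in (10, 100, 200):
        atoms_per_cluster = max(int(natoms / 10 + 0.5), 1)  # ~10% of the atoms per cluster, at least 1
        nframes_per_cluster = 12 * natoms                   # frames grow linearly with system size
        print(natoms, atoms_per_cluster, nframes_per_cluster)
    # -> (10, 1, 120), (100, 10, 1200), (200, 20, 2400); natoms=100 reproduces the
    #    old hard-coded natoms_per_cluster=[10, 10, 10] and 1200 frames per cluster.
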
diff --git a/benchmarking/configuration.py b/benchmarking/configuration.py
index bba3f80..82331f4 100644
--- a/benchmarking/configuration.py
+++ b/benchmarking/configuration.py
@@ -62,24 +62,47 @@ def create_rand_feature_extractors(extractor_kwargs):
     ]
 
 
-def create_RF_feature_extractors(extractor_kwargs, n_estimators=[10, 100, 200, 1000]):
-    extractors = []
-    for one_vs_rest in [True, False]:
-        suffix = "" if one_vs_rest else "_multiclass"
-        for nest in n_estimators:
-            extractors.append(
-                fe.RandomForestFeatureExtractor(
-                    name="{}-estimators{}".format(nest, suffix),
-                    classifier_kwargs={
-                        'n_estimators': nest
-                    },
-                    one_vs_rest=one_vs_rest,
-                    **extractor_kwargs)
-            )
-
+def _shuffle_and_shorten(extractors, random_state=6, max_number_extractors=12):
+    extractors = np.array(extractors)
+    if random_state is not None:
+        np.random.seed(random_state)
+    np.random.shuffle(extractors)
+    extractors = extractors[:max_number_extractors]
+    if random_state is not None:
+        np.random.seed()  # reset seed
     return extractors
 
 
+def create_RF_feature_extractors(extractor_kwargs,
+                                 n_estimators=[10, 100, 1000],
+                                 min_samples_leaves=[0.25, 0.1, 1],
+                                 max_depths=[None, 1, 10, 100]):
+    extractors = []
+    for one_vs_rest in [False, True]:
+        # We only consider min_samples_leaf and max_depth if we have real multiclass
+        for md in [None] if one_vs_rest else max_depths:
+            for msl in [1] if one_vs_rest else min_samples_leaves:
+                suffix = "" if one_vs_rest else "_multiclass"
+                if md is not None:  # None is the default value
+                    suffix += "_max_depth{}".format(md)
+                if msl != 1:  # 1 is the default value
+                    suffix += "_min_samples_leaf{}".format(msl)
+                for nest in n_estimators:
+                    extractors.append(
+                        fe.RandomForestFeatureExtractor(
+                            name="{}-estimators{}".format(nest, suffix),
+                            classifier_kwargs={
+                                'n_estimators': nest,
+                                'min_samples_leaf': msl,
+                                'max_depth': md
+                            },
+                            one_vs_rest=one_vs_rest,
+                            **extractor_kwargs)
+                    )
+    return _shuffle_and_shorten(extractors)
+    # return extractors
+
+
 def create_PCA_feature_extractors(extractor_kwargs,
                                   variance_cutoffs=["auto", "1_components", "2_components", 50, 100]):
     return [
         fe.PCAFeatureExtractor(
@@ -156,48 +179,59 @@ def create_MLP_feature_extractors(extractor_kwargs,
 
 def create_AE_feature_extractors(extractor_kwargs,
                                  alpha_hidden_layers=[
-                                     # (0.001, [1, ]),  # new
-                                     (0.001, [10, ]),
-                                     # (0.00001, [10, ]),  # new
+                                     (1e-2, "auto"),
+                                     (0.01, [10, ]),
+                                     (0.001, [100, ]),
+                                     (0.00001, [100, ]),
                                      (0.01, [10, 7, 5, 2, 5, 7, 10, ]),
-                                     (0.001, [10, 7, 5, 2, 5, 7, 10, ]),
-                                     (0.01, [20, 10, 7, 5, 2, 5, 7, 10, 20]),
-                                     (0.01, [20, 5, 20, ]),
-                                     # New values
-                                     # (0.0001, [8, 2, 8, ]),
-                                     # (0.0001, [16, ]),
-                                     # (0.1, [8, 2, 8, ]),
-                                     # (0.0001, [1, ]),
-
-                                     # Benchmarking layer size
-                                     # (0.0001, [10, 2, 10, ]),
-                                     # (0.0001, [100, 25, 100, ]),  # actually used in both benchmarks
-                                     # (0.0001, [100, 25, 5, 25, 100, ]),
-                                     # # benchmarking alpha
-                                     # (0.001, [100, 25, 100, ]),
-                                     # (0.01, [100, 25, 100, ]),
-                                     # (0.1, [100, 25, 100, ]),
-
-                                 ]
+
+                                 ],
+                                 batch_sizes=["auto", 10, 100],  # 1000
+                                 learning_rates=["adaptive"],
+                                 # batch_sizes=["auto"],
+                                 max_iters=[20, 200],  # 2000
+                                 # max_iters=[200],
+                                 parameter_set_default=None,  # "original",
+                                 # Used to enhance performance; just using one set of hyperparams
                                  ):
+    if parameter_set_default is not None:
+        learning_rates = ["adaptive"]
+        batch_sizes = ["auto"]
+        max_iters = [200]
+        if parameter_set_default == 'original':
+            # Default values used in the bioRxiv version
+            alpha_hidden_layers = [(0.001, [10, ]),
+                                   (0.01, [10, 7, 5, 2, 5, 7, 10, ]),
+                                   (0.001, [10, 7, 5, 2, 5, 7, 10, ]),
+                                   (0.01, [20, 10, 7, 5, 2, 5, 7, 10, 20]),
+                                   (0.01, [20, 5, 20, ]), ]
+        elif parameter_set_default == 'auto':
+            # Simple setup if you just want to run with one parameter set
+            alpha_hidden_layers = [(0.01, "auto")]
+
     feature_extractors = []
     for alpha, layers in alpha_hidden_layers:
-        name = "{}-alpha_{}-layers".format(alpha, "x".join([str(l) for l in layers]))
-        feature_extractors.append(
-            fe.MlpAeFeatureExtractor(
-                name=name,
-                classifier_kwargs={
-                    'hidden_layer_sizes': layers,
-                    'max_iter': 200,
-                    'learning_rate': 'adaptive',
-                    'alpha': alpha,
-                    'solver': "adam",
-                    'early_stopping': True,
-                    'tol': 1e-2,
-                    'warm_start': False
-                },
-                activation="logistic",
-                use_reconstruction_for_lrp=True,
-                **extractor_kwargs)
-        )
-    return feature_extractors
+        for bs in batch_sizes:
+            for lr in learning_rates:
+                for mi in max_iters:
+                    layers_str = layers if isinstance(layers, str) else "x".join([str(l) for l in layers])
+                    name = "{}-alpha_{}-layers".format(alpha, layers_str)
+                    name += "_{}-batchsize_{}-maxiter_{}-learningrate".format(bs, mi, lr)
+                    feature_extractors.append(
+                        fe.MlpAeFeatureExtractor(
+                            name=name,
+                            classifier_kwargs={
+                                'hidden_layer_sizes': layers,
+                                'max_iter': mi,
+                                'learning_rate': lr,  # not used with the adam solver
+                                'batch_size': bs,
+                                'alpha': alpha,
+                                'solver': "adam",
+                                'early_stopping': True,
+                                'tol': 1e-2,
+                                'warm_start': False
+                            },
+                            activation="logistic",
+                            use_reconstruction_for_lrp=True,
+                            **extractor_kwargs)
+                    )
+    return _shuffle_and_shorten(feature_extractors, max_number_extractors=6)
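The new _shuffle_and_shorten helper keeps these hyperparameter sweeps bounded: it shuffles the grid with a fixed seed, so the same subset is picked on every run, then restores an unseeded RNG. A standalone sketch of the same idea (a reimplementation for illustration, not the patched code itself):

    import numpy as np

    def shuffle_and_shorten(items, random_state=6, max_number=12):
        items = np.array(items)
        if random_state is not None:
            np.random.seed(random_state)  # fixed seed -> deterministic subset
        np.random.shuffle(items)
        items = items[:max_number]
        if random_state is not None:
            np.random.seed()  # re-randomize the global RNG afterwards
        return items

    print(shuffle_and_shorten(list(range(20))))  # same 12 values on every call
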
{}".format(nclusters)) + if max(natoms_per_cluster) >= natoms: + raise Exception("parameter natoms_per_cluster should be less than the number of atoms in the system") self.natoms = natoms self.nclusters = nclusters self.natoms_per_cluster = natoms_per_cluster @@ -271,9 +273,9 @@ def _to_compact_dist(self, conf): def _to_cartesian(self, conf): - if "_rot" in self.feature_type: + if "rot" in self.feature_type : conf = self._random_rotation(conf) - if "_trans" in self.feature_type: + if "trans" in self.feature_type: conf = self._random_translation(conf) feats = np.empty((self.nfeatures)) diff --git a/modules/feature_extraction/mlp_ae_feature_extractor.py b/modules/feature_extraction/mlp_ae_feature_extractor.py index 4f50408..988531d 100644 --- a/modules/feature_extraction/mlp_ae_feature_extractor.py +++ b/modules/feature_extraction/mlp_ae_feature_extractor.py @@ -27,12 +27,26 @@ def __init__(self, self.use_reconstruction_for_lrp = use_reconstruction_for_lrp logger.debug("Initializing MLP AE with the following parameters:" " use_reconstruction_for_lrp %s", use_reconstruction_for_lrp) + # If no hidden layers are specific or it is set to auto we will set this hyperparameter on the fly + self._automatic_layer_size = self.classifier_kwargs.get("hidden_layer_sizes", "auto") in [None, "auto"] def train(self, train_set, train_labels): logger.debug("Training %s with %s samples and %s features ...", self.name, train_set.shape[0], train_set.shape[1]) classifier_kwargs = self.classifier_kwargs - classifier_kwargs['hidden_layer_sizes'] = list(classifier_kwargs['hidden_layer_sizes']) + [train_set.shape[1]] + if self._automatic_layer_size: + # Use a rule of thumb with decreasing the layer half at the time + layer_sizes = [] + nfeatures = train_set.shape[1] + n_hidden_nodes = nfeatures / 2 + while n_hidden_nodes > 0 and n_hidden_nodes > nfeatures / 8: + layer_sizes.append(int(n_hidden_nodes + 0.5)) + n_hidden_nodes /= 2 + else: + layer_sizes = list(classifier_kwargs['hidden_layer_sizes']) + # add output layer same size as features + layer_sizes += [train_set.shape[1]] + classifier_kwargs['hidden_layer_sizes'] = layer_sizes classifier = sklearn.neural_network.MLPRegressor(**classifier_kwargs) classifier.fit(train_set, train_set) # note same output as input return classifier diff --git a/modules/visualization.py b/modules/visualization.py index 96fb6b7..91d05cb 100755 --- a/modules/visualization.py +++ b/modules/visualization.py @@ -363,24 +363,35 @@ def visualize(postprocessors, plt.clf() +def _sort_by_accuracy(accuracy, xlabels, postprocessors): + mean_accuracy = accuracy.mean(axis=0) + sorted_indices = [i for i, a in sorted(enumerate(mean_accuracy), key=lambda tpl: tpl[1], reverse=True)] + return accuracy[:, sorted_indices], np.array(xlabels)[sorted_indices], postprocessors[:, sorted_indices] + + def _show_performance(postprocessors, xlabels=None, title=None, filename=None, accuracy_limits=None, - supervised=False, + plot_per_state_accuracy=False, output_dir=None, - accuracy_method=None): + sort_by_accuracy=False, + accuracy_method=None, + width_factor=0.75): if len(postprocessors) == 0: return if not os.path.exists(output_dir): os.makedirs(output_dir) - nrows = 2 if supervised else 1 + + nrows = 2 if plot_per_state_accuracy else 1 fig, axs = plt.subplots(nrows, 1, sharex=True, sharey=False, squeeze=False, - figsize=(int(0.5 * postprocessors.shape[0]), 2 * nrows), + figsize=(int(width_factor * postprocessors.shape[0]), 2 * nrows), constrained_layout=True) fig.subplots_adjust(bottom=0.2) accuracy = 
diff --git a/modules/feature_extraction/mlp_ae_feature_extractor.py b/modules/feature_extraction/mlp_ae_feature_extractor.py
index 4f50408..988531d 100644
--- a/modules/feature_extraction/mlp_ae_feature_extractor.py
+++ b/modules/feature_extraction/mlp_ae_feature_extractor.py
@@ -27,12 +27,26 @@ def __init__(self,
         self.use_reconstruction_for_lrp = use_reconstruction_for_lrp
         logger.debug("Initializing MLP AE with the following parameters:"
                      " use_reconstruction_for_lrp %s", use_reconstruction_for_lrp)
+        # If no hidden layers are specified, or it is set to 'auto', we set this hyperparameter on the fly
+        self._automatic_layer_size = self.classifier_kwargs.get("hidden_layer_sizes", "auto") in [None, "auto"]
 
     def train(self, train_set, train_labels):
         logger.debug("Training %s with %s samples and %s features ...", self.name, train_set.shape[0],
                      train_set.shape[1])
         classifier_kwargs = self.classifier_kwargs
-        classifier_kwargs['hidden_layer_sizes'] = list(classifier_kwargs['hidden_layer_sizes']) + [train_set.shape[1]]
+        if self._automatic_layer_size:
+            # Rule of thumb: halve the layer size for every consecutive hidden layer
+            layer_sizes = []
+            nfeatures = train_set.shape[1]
+            n_hidden_nodes = nfeatures / 2
+            while n_hidden_nodes > 0 and n_hidden_nodes > nfeatures / 8:
+                layer_sizes.append(int(n_hidden_nodes + 0.5))
+                n_hidden_nodes /= 2
+        else:
+            layer_sizes = list(classifier_kwargs['hidden_layer_sizes'])
+        # Add an output layer of the same size as the feature input
+        layer_sizes += [train_set.shape[1]]
+        classifier_kwargs['hidden_layer_sizes'] = layer_sizes
         classifier = sklearn.neural_network.MLPRegressor(**classifier_kwargs)
         classifier.fit(train_set, train_set)  # note same output as input
         return classifier
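The "auto" branch above picks hidden-layer sizes by halving the feature count until it drops to about an eighth of the input width, then appends an output layer of the original width. Traced by hand (illustration only):

    def auto_layers(nfeatures):
        layer_sizes = []
        n_hidden_nodes = nfeatures / 2
        while n_hidden_nodes > 0 and n_hidden_nodes > nfeatures / 8:
            layer_sizes.append(int(n_hidden_nodes + 0.5))
            n_hidden_nodes /= 2
        return layer_sizes + [nfeatures]  # decoder output matches the input size

    print(auto_layers(100))  # -> [50, 25, 100]
    print(auto_layers(10))   # -> [5, 3, 10]
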
diff --git a/modules/visualization.py b/modules/visualization.py
index 96fb6b7..91d05cb 100755
--- a/modules/visualization.py
+++ b/modules/visualization.py
@@ -363,24 +363,35 @@ def visualize(postprocessors,
     plt.clf()
 
 
+def _sort_by_accuracy(accuracy, xlabels, postprocessors):
+    mean_accuracy = accuracy.mean(axis=0)
+    sorted_indices = [i for i, a in sorted(enumerate(mean_accuracy), key=lambda tpl: tpl[1], reverse=True)]
+    return accuracy[:, sorted_indices], np.array(xlabels)[sorted_indices], postprocessors[:, sorted_indices]
+
+
 def _show_performance(postprocessors,
                       xlabels=None,
                       title=None,
                       filename=None,
                       accuracy_limits=None,
-                      supervised=False,
+                      plot_per_state_accuracy=False,
                       output_dir=None,
-                      accuracy_method=None):
+                      sort_by_accuracy=False,
+                      accuracy_method=None,
+                      width_factor=0.75):
     if len(postprocessors) == 0:
         return
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    nrows = 2 if supervised else 1
+
+    nrows = 2 if plot_per_state_accuracy else 1
     fig, axs = plt.subplots(nrows, 1, sharex=True, sharey=False, squeeze=False,
-                            figsize=(int(0.5 * postprocessors.shape[0]), 2 * nrows),
+                            figsize=(int(width_factor * postprocessors.shape[0]), 2 * nrows),
                             constrained_layout=True)
     fig.subplots_adjust(bottom=0.2)
     accuracy = utils.to_accuracy(postprocessors)
+    if sort_by_accuracy:
+        accuracy, xlabels, postprocessors = _sort_by_accuracy(accuracy, xlabels, postprocessors)
     ax0 = axs[0, 0]
     if title is not None:
         ax0.set_title(title)
@@ -395,7 +406,7 @@ def _show_performance(postprocessors,
     elif accuracy_method == 'relevant_fraction':
         accuracy_label = "Accuracy:\nignoring irrelevant"
     ax0.set_ylabel(accuracy_label)
-    if supervised:
+    if plot_per_state_accuracy:
         # Per state
         ax0.get_shared_y_axes().join(ax0, axs[1, 0])  # share y range with regular accuracy
         axs[1, 0].boxplot(utils.to_accuracy_per_cluster(postprocessors),
@@ -434,7 +445,8 @@ def show_single_extractor_performance(postprocessors,
                       xlabels=xlabels,
                       # title=extractor_type,
                       filename=filename,
-                      supervised=postprocessors[0, 0].extractor.supervised,
+                      sort_by_accuracy=True,
+                      plot_per_state_accuracy=postprocessors[0, 0].extractor.supervised,
                       output_dir=output_dir,
                       accuracy_method=accuracy_method)
 
@@ -460,11 +472,58 @@ def show_all_extractors_performance(postprocessors,
                       # title=title,
                       filename=filename,
                       accuracy_limits=[0.2, 1.1],
-                      supervised=supervised,
+                      plot_per_state_accuracy=supervised,
                       output_dir=output_dir,
+                      width_factor=0.4,
                       accuracy_method=accuracy_method)
 
 
+def show_system_size_dependence(n_atoms,
+                                postprocessors,
+                                extractor_types,
+                                test_model=None,
+                                noise_level=None,
+                                feature_type=None,
+                                filename=None,
+                                output_dir="output/benchmarking/",
+                                accuracy_method=None):
+    if len(postprocessors) == 0:
+        return
+    if filename is None:
+        filename = "{feature_type}_{test_model}_{noise_level}noise_{natoms}atoms_{accuracy_method}_{extractor_types}.svg".format(
+            feature_type=feature_type,
+            test_model=test_model,
+            extractor_types="&".join(extractor_types),
+            noise_level=noise_level,
+            accuracy_method=accuracy_method,
+            natoms="{}to{}".format(min(n_atoms), max(n_atoms))
+        )
+    fig, axs = plt.subplots(1, 1, sharex=True, sharey=False, squeeze=False,
+                            # figsize=(int(0.5 * postprocessors.shape[0]), 2 * nrows),
+                            constrained_layout=True)
+    ax = axs[0, 0]
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    accuracy_label = "Accuracy"
+    if accuracy_method == 'mse':
+        accuracy_label = "Accuracy:\nfinding all"
+    elif accuracy_method == 'relevant_fraction':
+        accuracy_label = "Accuracy:\nignoring irrelevant"
+    ax.set_ylabel(accuracy_label)
+    ax.set_xlabel("Number of atoms in toy model")
+    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
+    for extractor_idx, extractor in enumerate(extractor_types):
+        accuracy = utils.to_accuracy(postprocessors[:, extractor_idx])
+        y = accuracy.mean(axis=1)
+        ystd = accuracy.std(axis=1)
+        ax.plot(n_atoms, y, label=extractor)
+        ax.fill_between(n_atoms, y - ystd, y + ystd, color="gray", alpha=0.2)
+    plt.tight_layout(pad=0.3)
+    plt.legend()
+    plt.savefig(output_dir + filename)
+    plt.clf()
+
+
 def _to_settings_string(extractor):
     parts = []
     if isinstance(extractor, fe.MlpFeatureExtractor):
@@ -472,11 +531,19 @@ def _to_settings_string(extractor):
         if alpha is not None:
             parts.append(utils.to_scientific_number_format(alpha))
         layers = extractor.classifier_kwargs.get('hidden_layer_sizes', (100,))
-        ls = []
-        for idx, l in enumerate(layers):
-            if idx == 0 or l < layers[idx - 1]:  # assuming decreasing layer size here, not showing decoding layers
-                ls.append(l)
-        parts.append("x".join([str(l) for l in ls]))
+        if isinstance(layers, str):
+            parts.append(layers)
+        else:
+            ls = []
+            for idx, l in enumerate(layers):
+                if idx == 0 or l < layers[idx - 1]:  # assuming decreasing layer size here, not showing decoding layers
+                    ls.append(l)
+            parts.append("x".join([str(l) for l in ls]))
+        if isinstance(extractor, fe.MlpAeFeatureExtractor):
+            batch_size = extractor.classifier_kwargs.get('batch_size', None)
+            # learning_rate = extractor.classifier_kwargs.get('learning_rate', None)  # not relevant when solver = 'adam'
+            max_iter = extractor.classifier_kwargs.get('max_iter', None)
+            parts += [str(s) for s in [batch_size, max_iter]]
     elif isinstance(extractor, fe.RbmFeatureExtractor):
         learning_rate = extractor.classifier_kwargs.get('learning_rate', None)
         if learning_rate is not None:
@@ -488,6 +555,10 @@ def _to_settings_string(extractor):
             parts.append(str(nest))
         binary = extractor.one_vs_rest
         parts.append("BC" if binary else "MC")
+        min_samples_leaf = extractor.classifier_kwargs.get('min_samples_leaf', 1)
+        max_depth = extractor.classifier_kwargs.get('max_depth', None)
+        parts.append("{}".format(min_samples_leaf))
+        parts.append("{}".format(max_depth if max_depth is not None else "-"))
     elif isinstance(extractor, fe.PCAFeatureExtractor):
         cutoff = extractor.variance_cutoff
         if isinstance(cutoff, int) or isinstance(cutoff, float):
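The new _sort_by_accuracy helper orders extractors by their mean accuracy across iterations so the box plots read best-to-worst. On toy numbers (illustration only):

    import numpy as np

    accuracy = np.array([[0.5, 0.9, 0.7],
                         [0.3, 0.8, 0.6]])  # rows: iterations, columns: extractors
    mean_accuracy = accuracy.mean(axis=0)   # -> [0.4, 0.85, 0.65]
    order = [i for i, a in sorted(enumerate(mean_accuracy), key=lambda t: t[1], reverse=True)]
    print(order)               # -> [1, 2, 0]: best extractor first
    print(accuracy[:, order])  # columns reordered the same way, as are xlabels/postprocessors
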
diff --git a/run_benchmarks.py b/run_benchmarks.py
deleted file mode 100644
index 1e8dc4f..0000000
--- a/run_benchmarks.py
+++ /dev/null
@@ -1,117 +0,0 @@
-from __future__ import absolute_import, division, print_function
-
-import logging
-import sys
-
-logging.basicConfig(
-    stream=sys.stdout,
-    level=logging.DEBUG,
-    format='%(asctime)s %(name)s-%(levelname)s: %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S')
-import matplotlib as mpl
-
-mpl.use('Agg')  # TO DISABLE GUI. USEFUL WHEN RUNNING ON CLUSTER WITHOUT X SERVER
-import argparse
-import numpy as np
-from benchmarking import computing
-from modules import visualization, utils
-
-logger = logging.getLogger("benchmarking")
-
-
-def _fix_extractor_type(extractor_types):
-    extractor_types = utils.make_list(extractor_types)
-    if len(extractor_types) == 1:
-        et = extractor_types[0]
-        if et == "supervised":
-            return ["KL", "RF", "MLP", "RAND"]
-        elif et == "unsupervised":
-            return ["PCA", "RBM", "AE", "RAND"]
-        elif et == "all":
-            return ["PCA", "RBM", "AE", "KL", "RF", "MLP", "RAND"]
-    return extractor_types
-
-
-def create_argparser():
-    _bool_lambda = lambda x: (str(x).lower() == 'true')
-    parser = argparse.ArgumentParser(
-        epilog='Benchmarking for demystifying')
-    parser.add_argument('--extractor_type', nargs='+', help='list of extractor types (MLP, KL, PCA, ..)', type=str,
-                        required=True)
-    parser.add_argument('--output_dir', type=str, help='Root directory for output files',
-                        default="output/benchmarking/")
-    parser.add_argument('--test_model', nargs='+', type=str, help='Toy model displacement: linear or non-linear',
-                        default="linear")
-    parser.add_argument('--feature_type', nargs='+',
-                        type=str, help='Toy model feature type: cartesian_rot, inv-dist, etc.',
-                        default="cartesian_rot")
-    parser.add_argument('--noise_level', nargs='+', type=float,
-                        help='Strength of noise added to atomic coordinates at each frame',
-                        default=1e-2)
-    parser.add_argument('--displacement', type=float, help='Strength of displacement for important atoms', default=1e-1)
-    parser.add_argument('--overwrite', type=_bool_lambda,
-                        help='Overwrite existing results with new (if set to False no new computations will be performed)',
-                        default=False)
-    parser.add_argument('--visualize', type=_bool_lambda, help='Generate output figures', default=True)
-    parser.add_argument('--iterations_per_model', type=int, help='', default=10)
-    parser.add_argument('--accuracy_method', nargs='+', type=str, help='', default='mse')
-
-    return parser
-
-
-def do_run(args, extractor_types, noise_level, test_model, feature_type, accuracy_method):
-    visualize = args.visualize
-    output_dir = args.output_dir
-    fig_filename = "{feature_type}_{test_model}_{noise_level}noise_{accuracy_method}.svg".format(
-        feature_type=feature_type,
-        test_model=test_model,
-        noise_level=noise_level,
-        accuracy_method=accuracy_method)
-    best_processors = []
-    for et in extractor_types:
-        try:
-            postprocessors = computing.compute(extractor_type=et,
-                                               output_dir=output_dir,
-                                               feature_type=feature_type,
-                                               overwrite=args.overwrite,
-                                               accuracy_method=accuracy_method,
-                                               iterations_per_model=args.iterations_per_model,
-                                               noise_level=noise_level,
-                                               visualize=visualize,
-                                               test_model=test_model)
-            if visualize:
-                visualization.show_single_extractor_performance(postprocessors=postprocessors,
-                                                                extractor_type=et,
-                                                                filename=fig_filename,
-                                                                output_dir=output_dir,
-                                                                accuracy_method=accuracy_method)
-            best_processors.append(utils.find_best(postprocessors))
-        except Exception as ex:
-            logger.exception(ex)
-            logger.warn("Failed for extractor %s ", et)
-            raise ex
-    if visualize:
-        fig_filename = fig_filename.replace(".svg", "_{}.svg".format("-".join(extractor_types)))
-        visualization.show_all_extractors_performance(np.array(best_processors),
-                                                      extractor_types,
-                                                      feature_type=feature_type,
-                                                      filename=fig_filename,
-                                                      output_dir=output_dir,
-                                                      accuracy_method=accuracy_method)
-
-
-def run_all(args):
-    extractor_types = _fix_extractor_type(args.extractor_type)
-    for noise_level in utils.make_list(args.noise_level):
-        for test_model in utils.make_list(args.test_model):
-            for feature_type in utils.make_list(args.feature_type):
-                for accuracy_method in utils.make_list(args.accuracy_method):
-                    do_run(args, extractor_types, noise_level, test_model, feature_type, accuracy_method)
-
-
-if __name__ == "__main__":
-    parser = create_argparser()
-    args = parser.parse_args()
-    logger.info("Starting script run_benchmarks with arguments %s", args)
-    run_all(args)
-    logger.info("Done!")