Rewrite SMAD's training and tuning scripts as suggested by the reviewer (we now use MCC as the performance metric and a different method for hyper-parameter calibration)
antoineBarbez · Sep 6, 2019 · 9f48bcc · 9f48bcc
1 parent b00044f
commit 9f48bcc
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 71 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/approaches/smad/model.py b/approaches/smad/model.py
@@ -13,6 +13,7 @@ def __init__(self, shape, input_size):
 		# Placeholders for training parameters
 		self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
 		self.beta          = tf.placeholder(tf.float32, name="beta")
+		self.gamma         = tf.placeholder(tf.float32, name="gamma")
 
 		# L2 regularization & initialization
 		l2_reg = tf.contrib.layers.l2_regularizer(scale=self.beta)
@@ -42,28 +43,25 @@ def __init__(self, shape, input_size):
 
 		# Loss function
 		with tf.name_scope("loss"):
-			self.loss = loss(self.logits, self.input_y)
+			self.loss = 1 - mcc(self.logits, self.input_y, self.gamma)
 			l2_loss = tf.losses.get_regularization_loss()
 			loss_reg = self.loss + l2_loss
 
 		# Learning mechanism
 		self.learning_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss_reg)
 
+def mcc(logits, labels, gamma):
+	'''
+	This function returns a differentiable approximation of the Matthews Correlation
+	Coefficient (MCC), so that "1 - mcc(...)" can be minimized by gradient descent.
 
-def loss(logits, labels):
-	''' 
-	This function implements the Differentiable approximation of the f-measure from:
-	Martin Jansche (2005):
-	    [Maximum Expected F-Measure Training of Logistic Regression Models]
-
-	true_positive:  sum(sigmoid(gamma*logits)) for label = +1
-	detected: sum(sigmoid(gamma*logits))
-	gamma > 0
+	It approximates the network's prediction as:
+	floor(logits + 0.5) ~ sigmoid(gamma*logits) with gamma > 0
+
+	logits: tensor of the network's raw outputs
+	        (assumed to hold one value per instance -- TODO confirm)
+	labels: tensor of the expected outputs, same shape as logits
+	        (assumed to be 1 for positive instances and 0 otherwise -- TODO confirm)
+	gamma:  steepness of the sigmoid used to soften the predictions
+
+	Returns a scalar tensor. A small constant is added under the square root so
+	that the result (and its gradient) stays finite when the denominator vanishes,
+	i.e. when all labels are identical or when every sigmoid saturates to 0 or 1.
 	'''
-	gamma = 4
 
-	true_positive = tf.reduce_sum(tf.multiply(labels, tf.nn.sigmoid(gamma*logits)))
-	positive = tf.reduce_sum(labels)
-	detected = tf.reduce_sum(tf.nn.sigmoid(gamma*logits))
+	# Keeps the denominator strictly positive: without it a degenerate batch yields
+	# a 0/0 = NaN loss whose NaN gradients would corrupt every weight of the network.
+	epsilon = 1e-7
+
+	# Soft (differentiable) predictions, computed once and shared by P and TP
+	soft_predictions = tf.nn.sigmoid(gamma*logits)
+
+	N = tf.cast(tf.size(logits), tf.float32) # TN + TP + FN + FP
+	S = tf.reduce_sum(labels)/N # (TP + FN) / N
+	P = tf.reduce_sum(soft_predictions)/N # (TP + FP) / N
+	TP = tf.reduce_sum(tf.multiply(labels, soft_predictions))
 
-	return 1 - 2*true_positive/(positive+detected)
+	return ((TP/N) - S*P)/(P*S*(1-S)*(1-P) + epsilon)**0.5
diff --git a/experiments/training/train_smad.py b/experiments/training/train_smad.py
@@ -26,6 +26,7 @@ def parse_args():
 	parser.add_argument("test_system", help="The name of the system to be used for testing.\n Hence, the training will be performed using all the systems except this one.")
 	parser.add_argument("-lr", type=float, help="The learning rate to be used for training.")
 	parser.add_argument("-beta", type=float, help="The L2 regularization scale to be used for training.")
+	parser.add_argument("-gamma", type=int, help="Learning hyper-parameter, used to compute the network's loss function (to compute a differentiable approximation of the Matheus Correlation Coefficient)")
 	parser.add_argument('-dense_sizes', nargs='+', type=int, help="The sizes of each (dense) hidden layer in the network.")
 	parser.add_argument("-n_net", type=int, default=10, help="The number of distinct networks to be trained and saved.")
 	parser.add_argument("-n_step", type=int, default=200, help="The number of training steps.")
@@ -34,7 +35,7 @@ def parse_args():
 	return parser.parse_args()
 
 # Train a single network
-def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr, beta, decay_step, lr_decay):
+def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr, beta, gamma, decay_step, lr_decay):
 	learning_rate = start_lr
 	losses_train = []
 	losses_test  = []
@@ -47,12 +48,13 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
 					model.input_x: x_train,
 					model.input_y: y_train,
 					model.learning_rate:learning_rate,
-					model.beta:beta}
+					model.beta:beta,
+					model.gamma: gamma}
 
 		session.run(model.learning_step, feed_dict=feed_dict_train)
 
-		loss_train = session.run(model.loss, feed_dict={model.input_x:x_train, model.input_y:y_train})
-		loss_test  = session.run(model.loss, feed_dict={model.input_x:x_test, model.input_y:y_test})
+		loss_train = session.run(model.loss, feed_dict={model.input_x:x_train, model.input_y:y_train, model.gamma: gamma})
+		loss_test  = session.run(model.loss, feed_dict={model.input_x:x_test, model.input_y:y_test, model.gamma: gamma})
 		losses_train.append(loss_train)
 		losses_test.append(loss_test)
 	return losses_train, losses_test
@@ -94,6 +96,7 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
 				num_step=args.n_step,
 				start_lr=args.lr,
 				beta=args.beta,
+				gamma=args.gamma,
 				decay_step=args.decay_step,
 				lr_decay=args.lr_decay)
 
@@ -115,6 +118,7 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
 	print('Precision: ' + str(nnUtils.precision(ensemble_prediction, y_test)))
 	print('Recall   : ' + str(nnUtils.recall(ensemble_prediction, y_test)))
 	print('F-Mesure : ' + str(nnUtils.f_measure(ensemble_prediction, y_test)))
+	print('MCC      : ' + str(nnUtils.mcc(ensemble_prediction, y_test)))
 
 	# Plot learning curves
 	nnUtils.plot_learning_curves(all_losses_train, all_losses_test)
diff --git a/experiments/tuning/tune_smad.py b/experiments/tuning/tune_smad.py
@@ -1,15 +1,15 @@
 from context import ROOT_DIR, nnUtils, md
 
-import tensorflow        as tf
-import numpy             as np
+import tensorflow as tf
+import numpy      as np
 
 import argparse
 import os
 import progressbar
 import random
 os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 
-training_systems = {
+systems = {
 	'android-frameworks-opt-telephony',
 	'android-platform-support',
 	'apache-ant',
@@ -24,14 +24,14 @@ def parse_args():
 	parser = argparse.ArgumentParser()
 	parser.add_argument("antipattern", help="Either 'god_class' or 'feature_envy'")
 	parser.add_argument("test_system", help="The name of the system to be used for testing.\n Hence, the cross-validation will be performed using all the systems except this one.")
-	parser.add_argument("-n_fold", type=int, default=5, help="Number of folds (k) for a k-fold-cross-validation")
 	parser.add_argument("-n_step", type=int, default=100, help="Number of training steps (i.e., epochs) to be performed for each fold")
 	parser.add_argument("-n_test", type=int, default=200, help="Number of random hyper-parameters sets to be tested")
 	return parser.parse_args()
 
 def generateRandomHyperParameters():
 	learning_rate = 10**-random.uniform(0.0, 2.5)
 	beta = 10**-random.uniform(0.0, 2.5)
+	gamma = random.randint(1, 10)
 
 	minBound = 4
 	maxBound = 100
@@ -42,77 +42,73 @@ def generateRandomHyperParameters():
 		dense_sizes.append(dense_size)
 		maxBound = dense_size
 
-	return learning_rate, beta, dense_sizes
+	return learning_rate, beta, gamma, dense_sizes
 
-def train(session, model, x_train, y_train, num_step, lr, beta):
+def train(session, model, x_train, y_train, num_step, lr, beta, gamma):
+	'''
+	Train `model` inside `session` for `num_step` steps.
+
+	Each step runs `model.learning_step` once, feeding the whole training set
+	(`x_train`, `y_train`) together with the hyper-parameters of the current trial:
+	`lr` (fed to the `learning_rate` placeholder), `beta` and `gamma`.
+
+	Returns nothing: the caller queries the trained model through `session`.
+	'''
 	for step in range(num_step):
 		feed_dict_train = {
 					model.input_x: x_train,
 					model.input_y: y_train,
 					model.learning_rate:lr,
-					model.beta:beta}
+					model.beta:beta,
+					model.gamma:gamma}
 
 		session.run(model.learning_step, feed_dict=feed_dict_train)
 
 if __name__ == "__main__":
 	args = parse_args()
 
-	# Remove the test system from the training set and build dataset
-	training_systems.remove(args.test_system)
-	dataset_x, dataset_y = nnUtils.build_dataset(args.antipattern, training_systems)
+	# Remove the test system from the set of systems
+	systems.remove(args.test_system)
 
 	bar = progressbar.ProgressBar(maxval=args.n_test, \
 		widgets=['Performing cross validation for ' + args.test_system + ': ' ,progressbar.Percentage()])
 	bar.start()
 
-	output_file_path = os.path.join(ROOT_DIR, 'experiments', 'tuning', 'results', 'smad', args.antipattern, args.test_system + '.csv')
+	output_file_path = os.path.join(ROOT_DIR, 'experiments', 'tuning', 'results_2', 'smad', args.antipattern, args.test_system + '.csv')
 
 	params = []
 	perfs  = []
 	for i in range(args.n_test):
-		learning_rate, beta, dense_sizes = generateRandomHyperParameters()
-		params.append([learning_rate, beta, dense_sizes])
-
-		# Due to the randomness of the process, repeat the cross validation 3 times 
-		# per hyper-pameters' set and take the average performance value.
-		performances = []
-		for j in range(3):
-			data_x, data_y = nnUtils.shuffle(dataset_x, dataset_y)
-
-			predictions = np.empty(shape=[0, 1])
-			for k in range(args.n_fold):
-				# Create the training and testing datasets for this fold
-				x_train, y_train, x_test, y_test = nnUtils.get_cross_validation_dataset(data_x, data_y, k, args.n_fold)
-
-				# New graph
-				tf.reset_default_graph()
-
-				# Create model
-				model = md.SMAD(
-					shape=dense_sizes, 
-					input_size=x_train.shape[-1])
-
-				with tf.Session() as session:
-					# Initialize the variables of the TensorFlow graph.
-					session.run(tf.global_variables_initializer())
-
-					# Train the model
-					train(
-						session=session,
-						model=model,
-						x_train=x_train,
-						y_train=y_train,
-						num_step=args.n_step,
-						lr=learning_rate,
-						beta=beta)
-
-					predictions = np.concatenate((predictions, session.run(model.inference, feed_dict={model.input_x: x_test})), axis=0)
-			performances.append(nnUtils.f_measure(predictions, data_y))
-		perfs.append(np.mean(np.array(performances), axis=0))
+		learning_rate, beta, gamma, dense_sizes = generateRandomHyperParameters()
+		params.append([learning_rate, beta, gamma, dense_sizes])
+
+		predictions = np.empty(shape=[0, 1])
+		labels      = np.empty(shape=[0, 1])
+		for validation_system in systems:
+			x_train, y_train = nnUtils.build_dataset(args.antipattern, [s for s in systems if s != validation_system])
+			x_valid, y_valid = nnUtils.build_dataset(args.antipattern, [validation_system])
+			labels = np.concatenate((labels, y_valid), axis=0)
+
+			# New graph
+			tf.reset_default_graph()
+
+			# Create model
+			model = md.SMAD(
+				shape=dense_sizes, 
+				input_size=x_train.shape[-1])
+
+			with tf.Session() as session:
+				# Initialize the variables of the TensorFlow graph.
+				session.run(tf.global_variables_initializer())
+
+				# Train the model
+				train(
+					session=session,
+					model=model,
+					x_train=x_train,
+					y_train=y_train,
+					num_step=args.n_step,
+					lr=learning_rate,
+					beta=beta,
+					gamma=gamma)
+
+				predictions = np.concatenate((predictions, session.run(model.inference, feed_dict={model.input_x: x_valid})), axis=0)
+		perfs.append(nnUtils.mcc(predictions, labels))
 
 		indexes = np.argsort(np.array(perfs))
 		with open(output_file_path, 'w') as file:
-			file.write("Learning rate;Beta;Dense sizes;F-measure\n")
+			file.write("Learning rate;Beta;Gamma;Dense sizes;MCC\n")
 			for j in reversed(indexes):
 				for k in range(len(params[j])):
 					file.write(str(params[j][k]) + ';')

diff --git a/utils/nnUtils.py b/utils/nnUtils.py
@@ -40,6 +40,15 @@ def accuracy(output, labels):
 
 	return true/size
 
+def mcc(output, labels):
+	'''
+	Return the Matthews Correlation Coefficient (MCC) of `output` w.r.t. `labels`.
+
+	output: the predictions to be evaluated
+	labels: the expected values (an array exposing `.size`)
+
+	The counts come from the `positive`, `detected` and `true_positive` helpers
+	(presumably defined in this module and returning counts -- TODO confirm).
+
+	When the coefficient is undefined (no instance at all, or a zero denominator
+	because there are no positives, no detections, only positives or only
+	detections), 0.0 is returned, i.e. "no better than chance". Returning NaN
+	(or raising ZeroDivisionError) instead would break the ranking of the
+	hyper-parameter sets, since NaN values end up first once sorted and reversed.
+	'''
+	N = labels.size
+	if N == 0:
+		return 0.0
+
+	S = positive(labels)/N # (TP + FN) / N
+	P = detected(output)/N # (TP + FP) / N
+	TP = true_positive(output, labels)
+
+	denominator = (P*S*(1-S)*(1-P))**0.5
+	if denominator == 0:
+		return 0.0
+
+	return ((TP/N) - S*P)/denominator
+
+
 
 ### UTILS ###
 
@@ -94,7 +103,7 @@ def get_optimal_hyperparameters(tuning_file):
 
 # Get the path of a trained model for a given approach (smad or asci)
 def get_save_path(approach, antipattern, test_system, model_number):
+	'''
+	Return the path '<directory>/model_<model_number>', where <directory> is
+	ROOT_DIR/approaches/<approach>/trained_models_2/<antipattern>/<test_system>.
+
+	The directory is created (with its missing parents) if it does not exist yet.
+	'''
-	directory = os.path.join(ROOT_DIR, 'approaches', approach, 'trained_models', antipattern, test_system)
+	directory = os.path.join(ROOT_DIR, 'approaches', approach, 'trained_models_2', antipattern, test_system)
 	if not os.path.exists(directory):
 			os.makedirs(directory)
 	return os.path.join(directory, 'model_' + str(model_number))