Skip to content

Commit

Permalink
rewrite SMAD's training and tuning scripts as suggested by the review…
Browse files Browse the repository at this point in the history
…er (we now use MCC as a performance metric + different method for hyper-parameters calibration)
  • Loading branch information
antoineBarbez committed Sep 6, 2019
1 parent b00044f commit 9f48bcc
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 71 deletions.
Binary file modified .DS_Store
Binary file not shown.
28 changes: 13 additions & 15 deletions approaches/smad/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(self, shape, input_size):
# Placeholders for training parameters
self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
self.beta = tf.placeholder(tf.float32, name="beta")
self.gamma = tf.placeholder(tf.float32, name="gamma")

# L2 regularization & initialization
l2_reg = tf.contrib.layers.l2_regularizer(scale=self.beta)
Expand Down Expand Up @@ -42,28 +43,25 @@ def __init__(self, shape, input_size):

# Loss function
with tf.name_scope("loss"):
self.loss = loss(self.logits, self.input_y)
self.loss = 1 - mcc(self.logits, self.input_y, self.gamma)
l2_loss = tf.losses.get_regularization_loss()
loss_reg = self.loss + l2_loss

# Learning mechanism
self.learning_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss_reg)

def mcc(logits, labels, gamma):
'''
This function returns a differentiable approximation of the Matthews Correlation
Coefficient (MCC).
def loss(logits, labels):
'''
This function implements the Differentiable approximation of the f-measure from:
Martin Jansche (2005):
[Maximum Expected F-Measure Training of Logistic Regression Models]
true_positive: sum(sigmoid(gamma*logits)) for label = +1
detected: sum(sigmoid(gamma*logits))
gamma > 0
It approximates the network's prediction as:
floor(logits + 0.5) ~ sigmoid(gamma*logits) with gamma > 0
'''
gamma = 4

true_positive = tf.reduce_sum(tf.multiply(labels, tf.nn.sigmoid(gamma*logits)))
positive = tf.reduce_sum(labels)
detected = tf.reduce_sum(tf.nn.sigmoid(gamma*logits))
N = tf.cast(tf.size(logits), tf.float32) # TN + TP + FN + FP
S = tf.reduce_sum(labels)/N # (TP + FN) / N
P = tf.reduce_sum(tf.nn.sigmoid(gamma*logits))/N # (TP + FP) / N
TP = tf.reduce_sum(tf.multiply(labels, tf.nn.sigmoid(gamma*logits)))

return 1 - 2*true_positive/(positive+detected)
return ((TP/N) - S*P)/(P*S*(1-S)*(1-P))**0.5
12 changes: 8 additions & 4 deletions experiments/training/train_smad.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def parse_args():
parser.add_argument("test_system", help="The name of the system to be used for testing.\n Hence, the training will be performed using all the systems except this one.")
parser.add_argument("-lr", type=float, help="The learning rate to be used for training.")
parser.add_argument("-beta", type=float, help="The L2 regularization scale to be used for training.")
parser.add_argument("-gamma", type=int, help="Learning hyper-parameter, used to compute the network's loss function (to compute a differentiable approximation of the Matheus Correlation Coefficient)")
parser.add_argument('-dense_sizes', nargs='+', type=int, help="The sizes of each (dense) hidden layer in the network.")
parser.add_argument("-n_net", type=int, default=10, help="The number of distinct networks to be trained and saved.")
parser.add_argument("-n_step", type=int, default=200, help="The number of training steps.")
Expand All @@ -34,7 +35,7 @@ def parse_args():
return parser.parse_args()

# Train a single network
def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr, beta, decay_step, lr_decay):
def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr, beta, gamma, decay_step, lr_decay):
learning_rate = start_lr
losses_train = []
losses_test = []
Expand All @@ -47,12 +48,13 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
model.input_x: x_train,
model.input_y: y_train,
model.learning_rate:learning_rate,
model.beta:beta}
model.beta:beta,
model.gamma: gamma}

session.run(model.learning_step, feed_dict=feed_dict_train)

loss_train = session.run(model.loss, feed_dict={model.input_x:x_train, model.input_y:y_train})
loss_test = session.run(model.loss, feed_dict={model.input_x:x_test, model.input_y:y_test})
loss_train = session.run(model.loss, feed_dict={model.input_x:x_train, model.input_y:y_train, model.gamma: gamma})
loss_test = session.run(model.loss, feed_dict={model.input_x:x_test, model.input_y:y_test, model.gamma: gamma})
losses_train.append(loss_train)
losses_test.append(loss_test)
return losses_train, losses_test
Expand Down Expand Up @@ -94,6 +96,7 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
num_step=args.n_step,
start_lr=args.lr,
beta=args.beta,
gamma=args.gamma,
decay_step=args.decay_step,
lr_decay=args.lr_decay)

Expand All @@ -115,6 +118,7 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
print('Precision: ' + str(nnUtils.precision(ensemble_prediction, y_test)))
print('Recall : ' + str(nnUtils.recall(ensemble_prediction, y_test)))
print('F-Mesure : ' + str(nnUtils.f_measure(ensemble_prediction, y_test)))
print('MCC : ' + str(nnUtils.mcc(ensemble_prediction, y_test)))

# Plot learning curves
nnUtils.plot_learning_curves(all_losses_train, all_losses_test)
98 changes: 47 additions & 51 deletions experiments/tuning/tune_smad.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from context import ROOT_DIR, nnUtils, md

import tensorflow as tf
import numpy as np
import tensorflow as tf
import numpy as np

import argparse
import os
import progressbar
import random
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

training_systems = {
systems = {
'android-frameworks-opt-telephony',
'android-platform-support',
'apache-ant',
Expand All @@ -24,14 +24,14 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("antipattern", help="Either 'god_class' or 'feature_envy'")
parser.add_argument("test_system", help="The name of the system to be used for testing.\n Hence, the cross-validation will be performed using all the systems except this one.")
parser.add_argument("-n_fold", type=int, default=5, help="Number of folds (k) for a k-fold-cross-validation")
parser.add_argument("-n_step", type=int, default=100, help="Number of training steps (i.e., epochs) to be performed for each fold")
parser.add_argument("-n_test", type=int, default=200, help="Number of random hyper-parameters sets to be tested")
return parser.parse_args()

def generateRandomHyperParameters():
learning_rate = 10**-random.uniform(0.0, 2.5)
beta = 10**-random.uniform(0.0, 2.5)
gamma = random.randint(1, 10)

minBound = 4
maxBound = 100
Expand All @@ -42,77 +42,73 @@ def generateRandomHyperParameters():
dense_sizes.append(dense_size)
maxBound = dense_size

return learning_rate, beta, dense_sizes
return learning_rate, beta, gamma, dense_sizes

def train(session, model, x_train, y_train, num_step, lr, beta):
def train(session, model, x_train, y_train, num_step, lr, beta, gamma):
for step in range(num_step):
feed_dict_train = {
model.input_x: x_train,
model.input_y: y_train,
model.learning_rate:lr,
model.beta:beta}
model.beta:beta,
model.gamma:gamma}

session.run(model.learning_step, feed_dict=feed_dict_train)

if __name__ == "__main__":
args = parse_args()

# Remove the test system from the training set and build dataset
training_systems.remove(args.test_system)
dataset_x, dataset_y = nnUtils.build_dataset(args.antipattern, training_systems)
# Remove the test system from the set of systems
systems.remove(args.test_system)

bar = progressbar.ProgressBar(maxval=args.n_test, \
widgets=['Performing cross validation for ' + args.test_system + ': ' ,progressbar.Percentage()])
bar.start()

output_file_path = os.path.join(ROOT_DIR, 'experiments', 'tuning', 'results', 'smad', args.antipattern, args.test_system + '.csv')
output_file_path = os.path.join(ROOT_DIR, 'experiments', 'tuning', 'results_2', 'smad', args.antipattern, args.test_system + '.csv')

params = []
perfs = []
for i in range(args.n_test):
learning_rate, beta, dense_sizes = generateRandomHyperParameters()
params.append([learning_rate, beta, dense_sizes])

# Due to the randomness of the process, repeat the cross validation 3 times
# per hyper-pameters' set and take the average performance value.
performances = []
for j in range(3):
data_x, data_y = nnUtils.shuffle(dataset_x, dataset_y)

predictions = np.empty(shape=[0, 1])
for k in range(args.n_fold):
# Create the training and testing datasets for this fold
x_train, y_train, x_test, y_test = nnUtils.get_cross_validation_dataset(data_x, data_y, k, args.n_fold)

# New graph
tf.reset_default_graph()

# Create model
model = md.SMAD(
shape=dense_sizes,
input_size=x_train.shape[-1])

with tf.Session() as session:
# Initialize the variables of the TensorFlow graph.
session.run(tf.global_variables_initializer())

# Train the model
train(
session=session,
model=model,
x_train=x_train,
y_train=y_train,
num_step=args.n_step,
lr=learning_rate,
beta=beta)

predictions = np.concatenate((predictions, session.run(model.inference, feed_dict={model.input_x: x_test})), axis=0)
performances.append(nnUtils.f_measure(predictions, data_y))
perfs.append(np.mean(np.array(performances), axis=0))
learning_rate, beta, gamma, dense_sizes = generateRandomHyperParameters()
params.append([learning_rate, beta, gamma, dense_sizes])

predictions = np.empty(shape=[0, 1])
labels = np.empty(shape=[0, 1])
for validation_system in systems:
x_train, y_train = nnUtils.build_dataset(args.antipattern, [s for s in systems if s != validation_system])
x_valid, y_valid = nnUtils.build_dataset(args.antipattern, [validation_system])
labels = np.concatenate((labels, y_valid), axis=0)

# New graph
tf.reset_default_graph()

# Create model
model = md.SMAD(
shape=dense_sizes,
input_size=x_train.shape[-1])

with tf.Session() as session:
# Initialize the variables of the TensorFlow graph.
session.run(tf.global_variables_initializer())

# Train the model
train(
session=session,
model=model,
x_train=x_train,
y_train=y_train,
num_step=args.n_step,
lr=learning_rate,
beta=beta,
gamma=gamma)

predictions = np.concatenate((predictions, session.run(model.inference, feed_dict={model.input_x: x_valid})), axis=0)
perfs.append(nnUtils.mcc(predictions, labels))

indexes = np.argsort(np.array(perfs))
with open(output_file_path, 'w') as file:
file.write("Learning rate;Beta;Dense sizes;F-measure\n")
file.write("Learning rate;Beta;Gamma;Dense sizes;MCC\n")
for j in reversed(indexes):
for k in range(len(params[j])):
file.write(str(params[j][k]) + ';')
Expand Down
11 changes: 10 additions & 1 deletion utils/nnUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ def accuracy(output, labels):

return true/size

def mcc(output, labels):
    '''
    Returns the Matthews Correlation Coefficient (MCC) of the predictions
    `output` with respect to the ground truth `labels`.

    The coefficient is computed from rates rather than raw counts:
        S   = positive(labels) / N    -> (TP + FN) / N
        P   = detected(output) / N    -> (TP + FP) / N
        MCC = (TP/N - S*P) / sqrt(P*S*(1-S)*(1-P))

    When the denominator is zero (all predictions or all labels belong to a
    single class) the MCC is mathematically undefined. In that case, and for
    empty input, 0.0 is returned, i.e. "no better than chance", instead of
    NaN or a division error. A NaN score would otherwise be ordered last by
    np.argsort and could be mistaken for the best score when results are
    listed in descending order.

    NOTE(review): `positive`, `detected` and `true_positive` are helpers
    defined elsewhere in this module; they are presumed to return scalar
    counts -- confirm against their definitions.
    '''
    N = labels.size
    if N == 0:
        # Nothing to evaluate: avoid dividing by zero below.
        return 0.0

    S = positive(labels)/N
    P = detected(output)/N
    TP = true_positive(output, labels)

    denominator = P*S*(1-S)*(1-P)
    if denominator <= 0:
        # Degenerate confusion matrix (one row or one column is empty).
        return 0.0

    return ((TP/N) - S*P)/denominator**0.5



### UTILS ###

Expand Down Expand Up @@ -94,7 +103,7 @@ def get_optimal_hyperparameters(tuning_file):

# Get the path of a trained model for a given approach (smad or asci)
def get_save_path(approach, antipattern, test_system, model_number):
directory = os.path.join(ROOT_DIR, 'approaches', approach, 'trained_models', antipattern, test_system)
directory = os.path.join(ROOT_DIR, 'approaches', approach, 'trained_models_2', antipattern, test_system)
if not os.path.exists(directory):
os.makedirs(directory)
return os.path.join(directory, 'model_' + str(model_number))
Expand Down

0 comments on commit 9f48bcc

Please sign in to comment.