implemented new training, tuning and predict scripts for smad and asci

antoineBarbez · Apr 23, 2019 · 3ce21cc · 3ce21cc
1 parent f998f5e
commit 3ce21cc
Show file tree

Hide file tree

Showing 36 changed files with 460 additions and 185 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/detection_tools/feature_envy/hist.py b/detection_tools/feature_envy/hist.py
@@ -86,4 +86,18 @@ def getSmells(systemName, alpha=2.6):
 				smells.append(m + ';' + c)
 
 
-	return smells
+	return smells
+
+
+def predict(systemName):
+	entities = dataUtils.getEntities('feature_envy', systemName)
+	smells = getSmells(systemName)
+
+	prediction = []
+	for entity in entities:
+		if entity in smells:
+			prediction.append([1.])
+		else:
+			prediction.append([0.])
+
+	return np.array(prediction)
diff --git a/detection_tools/feature_envy/incode.py b/detection_tools/feature_envy/incode.py
@@ -1,6 +1,8 @@
 from __future__ import division
 from context    import ROOT_DIR, dataUtils, entityUtils
 
+import numpy as np
+
 import csv
 import os
 
@@ -84,4 +86,18 @@ def getEnviedClasses(className, classAttributeMap, atfd, laa, fdp):
 	return enviedClass
 
 
+def predict(systemName):
+	entities = dataUtils.getEntities('feature_envy', systemName)
+	smells = getSmells(systemName)
+
+	prediction = []
+	for entity in entities:
+		if entity in smells:
+			prediction.append([1.])
+		else:
+			prediction.append([0.])
+
+	return np.array(prediction)
+
+
 
diff --git a/detection_tools/feature_envy/jdeodorant.py b/detection_tools/feature_envy/jdeodorant.py
@@ -1,6 +1,8 @@
 from __future__ import division
 from context    import ROOT_DIR, dataUtils, entityUtils
 
+import numpy as np
+
 import os
 
 def getSmells(systemName):
@@ -23,3 +25,16 @@ def getSmells(systemName):
 
 	return list(set(smells))
 
+def predict(systemName):
+	entities = dataUtils.getEntities('feature_envy', systemName)
+	smells = getSmells(systemName)
+
+	prediction = []
+	for entity in entities:
+		if entity in smells:
+			prediction.append([1.])
+		else:
+			prediction.append([0.])
+
+	return np.array(prediction)
+
diff --git a/detection_tools/god_class/decor.py b/detection_tools/god_class/decor.py
@@ -1,5 +1,7 @@
 from __future__ import division
-from context    import ROOT_DIR
+from context    import ROOT_DIR, dataUtils
+
+import numpy as np
 
 import csv
 import os
@@ -31,3 +33,17 @@ def getSmells(systemName):
 					smells.append(row['ClassName'])
 
 	return smells
+
+
+def predict(systemName):
+	entities = dataUtils.getEntities('god_class', systemName)
+	smells = getSmells(systemName)
+
+	prediction = []
+	for entity in entities:
+		if entity in smells:
+			prediction.append([1.])
+		else:
+			prediction.append([0.])
+
+	return np.array(prediction)
diff --git a/detection_tools/god_class/hist.py b/detection_tools/god_class/hist.py
@@ -42,3 +42,16 @@ def getSmells(systemName, alpha=8.0):
 
 	return smells
 
+def predict(systemName):
+	entities = dataUtils.getEntities('god_class', systemName)
+	smells = getSmells(systemName)
+
+	prediction = []
+	for entity in entities:
+		if entity in smells:
+			prediction.append([1.])
+		else:
+			prediction.append([0.])
+
+	return np.array(prediction)
+
diff --git a/detection_tools/god_class/jdeodorant.py b/detection_tools/god_class/jdeodorant.py
@@ -1,4 +1,6 @@
-from context import ROOT_DIR
+from context import ROOT_DIR, dataUtils
+
+import numpy as np
 
 import os
 
@@ -8,3 +10,16 @@ def getSmells(systemName):
 
 	with open(JDBlobFile, 'r') as file:
 		return list(set([line.split()[0] for line in file]))
+
+def predict(systemName):
+	entities = dataUtils.getEntities('god_class', systemName)
+	smells = getSmells(systemName)
+
+	prediction = []
+	for entity in entities:
+		if entity in smells:
+			prediction.append([1.])
+		else:
+			prediction.append([0.])
+
+	return np.array(prediction)
diff --git a/experiments/.DS_Store b/experiments/.DS_Store
diff --git a/experiments/training/.DS_Store b/experiments/training/.DS_Store
diff --git a/experiments/training/context.py b/experiments/training/context.py
@@ -5,6 +5,7 @@
 sys.path.insert(0, ROOT_DIR)
 
 import utils.dataUtils as dataUtils
-import utils.nnUtils as nnUtils
+import utils.nnUtils   as nnUtils
 
-import neural_networks.smad.model as md
+import neural_networks.asci.predict as asci
+import neural_networks.smad.model   as md
diff --git a/experiments/training/train_asci.py b/experiments/training/train_asci.py
@@ -0,0 +1,101 @@
+from context import nnUtils, asci
+from sklearn import tree
+
+import numpy as np
+
+import argparse
+import pickle
+
+training_systems = {
+	'android-frameworks-opt-telephony',
+	'android-platform-support',
+	'apache-ant',
+	'lucene',
+	'apache-tomcat',
+	'argouml',
+	'jedit',
+	'xerces-2_7_0'
+}
+
+def parse_args():
+	parser = argparse.ArgumentParser()
+	parser.add_argument("antipattern", help="Either 'god_class' or 'feature_envy'.")
+	parser.add_argument("test_system", help="The name of the system to be used for testing.\n Hence, the training will be performed using all the systems except this one.")
+	parser.add_argument("-n_tree", type=int, default=10, help="The number of distinct trees to be trained and saved.")
+	parser.add_argument("-min_samples_split", type=int, default=5)
+	parser.add_argument("-max_features", default='log2')
+	parser.add_argument("-max_depth", type=int, default=None)
+	parser.add_argument("-min_samples_leaf", type=int, default=2)
+	return parser.parse_args()
+
+# Build the dataset for asci, i.e., the labels are the indexes of the best tool for each input instance.
+# The order of the tools is given by the function asci.getToolsPredictions(...):
+# idx = 0: DECOR (god_class) or InCode (feature_envy)
+# idx = 1: HIST
+# idx = 2: JDeodorant
+def build_asci_dataset(antipattern, systems):
+	# Get real instances and labels
+	instances, labels = nnUtils.build_dataset(antipattern, systems)
+
+	# Compute the performances of each tool in order to sort them accordingly
+	nb_tools = 3
+	toolsOverallPredictions = [np.empty(shape=[0, 1]) for _ in range(nb_tools)]
+	for system in systems:
+		toolsPredictions = asci.getToolsPredictions(antipattern, system)
+		for i in range(nb_tools):
+			toolsOverallPredictions[i] = np.concatenate((toolsOverallPredictions[i], toolsPredictions[i]), axis=0)
+
+	toolsPerformances = [nnUtils.f_measure(pred, labels) for pred in toolsOverallPredictions]
+
+	# Indexes of the tools, sorted according to their performances on the training set
+	toolsSortedIndexes = np.argsort(np.array(toolsPerformances))
+
+	# Assign to each instance the index of the tool that correctly predicted its label.
+	# In case of conflict (several tools are correct), assign the index of the tool that performed best overall.
+
+	# Initialize with the index of the best tool as default index
+	toolsIndexes = [toolsSortedIndexes[-1] for _ in instances]
+	for i, label in enumerate(labels):
+		for toolIndex in toolsSortedIndexes:
+			if toolsOverallPredictions[toolIndex][i] == label:
+				toolsIndexes[i] = toolIndex
+
+	return instances, np.array(toolsIndexes)
+
+if __name__ == "__main__":
+	args = parse_args()
+
+	# Remove the test system from the training set and build dataset
+	training_systems.remove(args.test_system)
+	x_train, y_train = build_asci_dataset(args.antipattern, training_systems)
+
+	# Test dataset. Note that here y_test contains the real labels, while y_train contains the tools' indexes.
+	x_test, y_test = nnUtils.build_dataset(args.antipattern, [args.test_system])
+	toolsPredictions = asci.getToolsPredictions(args.antipattern, args.test_system)
+
+	# Train and compute ensemble prediction on test set
+	predictions = np.zeros((args.n_tree, x_test.shape[0], 1))
+	for i in range(args.n_tree):
+		clf = tree.DecisionTreeClassifier(
+			min_samples_split=args.min_samples_split,
+			max_features=args.max_features,
+			max_depth=args.max_depth,
+			min_samples_leaf=args.min_samples_leaf)
+		clf = clf.fit(x_train, y_train)
+
+		# Save the tree
+		with open(nnUtils.get_save_path('asci', args.antipattern, args.test_system, i), 'wb') as save_file:
+			pickle.dump(clf, save_file)
+
+		# Compute the prediction of the current tree
+		predictedToolIndexes = clf.predict(x_test)
+		for j, toolIndex in enumerate(predictedToolIndexes): 
+			predictions[i, j, 0] = toolsPredictions[toolIndex][j]
+
+	ensemble_prediction = np.mean(predictions, axis=0)
+
+	# Print Ensemble performances
+	print("\nPerformances on " + args.test_system + ": ")
+	print('Precision: ' + str(nnUtils.precision(ensemble_prediction, y_test)))
+	print('Recall   : ' + str(nnUtils.recall(ensemble_prediction, y_test)))
+	print('F-Mesure : ' + str(nnUtils.f_measure(ensemble_prediction, y_test)))
diff --git a/experiments/training/train_smad.py b/experiments/training/train_smad.py
@@ -33,23 +33,6 @@ def parse_args():
 	parser.add_argument("-lr_decay", type=float, default=0.5, help="The factor by which the learning rate is multiplied every 'decay_step' steps")
 	return parser.parse_args()
 
-# Get the path of a trained model
-def get_save_path(antipattern, test_system, net_number):
-	directory = os.path.join(ROOT_DIR, 'neural_networks', 'smad', 'trained_models', antipattern, test_system)
-	if not os.path.exists(directory):
-			os.makedirs(directory)
-	return os.path.join(directory, 'network' + str(net_number))
-
-def build_dataset(antipattern, systems):
-	input_size = {'god_class':8, 'feature_envy':9}
-	X = np.empty(shape=[0, input_size[antipattern]])
-	Y = np.empty(shape=[0, 1])
-	for systemName in systems:
-		X = np.concatenate((X, nnUtils.getInstances(systemName, antipattern)), axis=0)
-		Y = np.concatenate((Y, nnUtils.getLabels(systemName, antipattern)), axis=0)
-
-	return X, Y
-
 # Train a single network
 def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr, beta, decay_step, lr_decay):
 	learning_rate = start_lr
@@ -79,8 +62,8 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
 
 	# Remove the test system from the training set and build dataset
 	training_systems.remove(args.test_system)
-	x_train, y_train = build_dataset(args.antipattern, training_systems)
-	x_test, y_test = build_dataset(args.antipattern, [args.test_system])
+	x_train, y_train = nnUtils.build_dataset(args.antipattern, training_systems)
+	x_test, y_test = nnUtils.build_dataset(args.antipattern, [args.test_system])
 
 	# Create model
 	model = md.SMAD(
@@ -118,13 +101,13 @@ def train(session, model, x_train, y_train, x_test, y_test, num_step, start_lr,
 			all_losses_test.append(losses_test)
 
 			# Save the model
-			saver.save(sess=session, save_path=get_save_path(args.antipattern, args.test_system, i))
+			saver.save(sess=session, save_path=nnUtils.get_save_path('smad', args.antipattern, args.test_system, i))
 
 
 	# Compute the ensemble prediction on the test system
 	ensemble_prediction = nnUtils.ensemble_prediction(
 		model=model, 
-		save_paths=[get_save_path(args.antipattern, args.test_system, i) for i in range(args.n_net)], 
+		save_paths=[nnUtils.get_save_path('smad', args.antipattern, args.test_system, i) for i in range(args.n_net)], 
 		input_x=x_test)
 
 	# Print Ensemble performances

diff --git a/experiments/tuning/.DS_Store b/experiments/tuning/.DS_Store
diff --git a/experiments/tuning/context.py b/experiments/tuning/context.py
@@ -5,5 +5,6 @@
 sys.path.insert(0, ROOT_DIR)
 
 import utils.nnUtils as nnUtils
-import experiments.training.train_smad as train_smad
-import neural_networks.smad.model as md
+
+import neural_networks.asci.predict as asci
+import neural_networks.smad.model   as md
diff --git a/experiments/tuning/results/.DS_Store b/experiments/tuning/results/.DS_Store
diff --git a/experiments/tuning/results/asci/.DS_Store b/experiments/tuning/results/asci/.DS_Store
diff --git a/experiments/tuning/results/asci/god_class/.DS_Store b/experiments/tuning/results/asci/god_class/.DS_Store
diff --git a/experiments/tuning/results/smad/.DS_Store b/experiments/tuning/results/smad/.DS_Store
diff --git a/experiments/tuning/results/smad/god_class/.DS_Store b/experiments/tuning/results/smad/god_class/.DS_Store