add some work about word embedding and historical antipattern detection

antoineBarbez · Nov 15, 2017 · 6571cda · 6571cda
1 parent e18d9ea
commit 6571cda
Show file tree

Hide file tree

Showing 30 changed files with 482,277 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/historical_anti-pattern_detection/.DS_Store b/historical_anti-pattern_detection/.DS_Store
diff --git a/historical_anti-pattern_detection/data/.DS_Store b/historical_anti-pattern_detection/data/.DS_Store
diff --git a/historical_anti-pattern_detection/data/blob/apache-ant.csv b/historical_anti-pattern_detection/data/blob/apache-ant.csv
@@ -0,0 +1,8 @@
+proposal.sandbox.antlib.src.main.org.apache.tools.ant.Project;
+src.main.org.apache.tools.ant.Project;
+src.main.org.apache.tools.ant.taskdefs.Javadoc;
+org.apache.tools.ant.taskdefs.Zip;
+org.apache.tools.ant.types.Commandline;
+org.apache.tools.ant.taskdefs.Java;
+org.apache.tools.ant.types.Path;
+src.main.org.apache.tools.ant.taskdefs.optional.net.FTP;
diff --git a/historical_anti-pattern_detection/data/blob/apache-tomcat.csv b/historical_anti-pattern_detection/data/blob/apache-tomcat.csv
@@ -0,0 +1,5 @@
+java.org.apache.el.parser.ELParser;
+java.org.apache.catalina.core.StandardContext;
+java.org.apache.catalina.loader.WebappClassLoader;
+java.org.apache.catalina.deploy.WebXml;
+java.org.apache.catalina.servlets.WebdavServlet;
diff --git a/historical_anti-pattern_detection/data/blob/frameworks-base.csv b/historical_anti-pattern_detection/data/blob/frameworks-base.csv
@@ -0,0 +1,35 @@
+com.android.ddmlib.Device;
+
+services.java.com.android.server.WindowManagerService;
+
+com.android.ddmlib.Client;
+
+com.android.sdklib.devices.DeviceParser;
+
+com.android.ide.common.resources.ValueResourceParser;
+
+com.android.tools.lint.checks.TestLintClient;
+
+com.android.sdklib.internal.repository.PackageTest;
+
+com.android.ddmlib.testrunner.XmlTestRunListener;
+
+com.android.ddmlib.AndroidDebugBridge;
+
+core.java.android.app.ActivityThread;
+
+core.java.android.app.Activity;
+
+core.java.android.view.View;
+
+services.java.com.android.server.am.ActivityManagerService;
+
+core.java.com.android.internal.os.BatteryStatsImpl;
+
+core.java.android.widget.AbsListView;
+
+core.java.android.provider.Settings;
+
+opengl.java.android.opengl.GLLogWrapper;
+
+telephony.java.android.telephony.PhoneNumberUtils;
diff --git a/historical_anti-pattern_detection/data/blob/frameworks-opt-telephony.csv b/historical_anti-pattern_detection/data/blob/frameworks-opt-telephony.csv
@@ -0,0 +1,13 @@
+src.java.com.android.internal.telephony.DataConnection;
+src.java.com.android.internal.telephony.CallManager;
+src.java.com.android.internal.telephony.DataConnectionTracker;
+com.android.internal.telephony.PhoneBase;
+com.android.internal.telephony.gsm.SIMRecords;
+com.android.internal.telephony.IccCard;
+com.android.internal.telephony.cdma.CdmaLteServiceStateTracker;
+com.android.internal.telephony.DataConnectionTracker;
+com.android.internal.telephony.cdma.CdmaServiceStateTracker;
+com.android.internal.telephony.gsm.GsmServiceStateTracker;
+com.android.internal.telephony.gsm.GsmDataConnectionTracker;
+com.android.internal.telephony.cdma.CdmaDataConnectionTracker;
+com.android.internal.telephony.cdma.sms.BearerData;
diff --git a/historical_anti-pattern_detection/data/blob/frameworks-sdk.csv b/historical_anti-pattern_detection/data/blob/frameworks-sdk.csv
@@ -0,0 +1,10 @@
+com.android.sdkmanager.Main;
+com.android.sdkuilib.internal.repository.UpdaterWindowImpl;
+com.android.ide.eclipse.adt.internal.editors.layout.configuration.ConfigurationComposite;
+com.android.ide.eclipse.adt.internal.editors.uimodel.UiElementNode;
+com.android.sdklib.SdkManager;
+com.android.ddmlib.testrunner.InstrumentationResultParser;
+sdkmanager.libs.sdkuilib.tests.com.android.sdkuilib.internal.repository.sdkman2.PackagesDiffLogicTest;
+traceview.src.com.android.traceview.TimeLineView;
+ddms.libs.ddmuilib.src.com.android.ddmuilib.logcat.LogPanel;
+hierarchyviewer.src.com.android.hierarchyviewer.ui.Workspace;
diff --git a/historical_anti-pattern_detection/data/blob/frameworks-support.csv b/historical_anti-pattern_detection/data/blob/frameworks-support.csv
@@ -0,0 +1,5 @@
+android.support.v4.app.FragmentActivity;
+v4.java.android.support.v4.app.Fragment;
+v4.java.android.support.v4.app.FragmentManagerImpl;
+v4.java.android.support.v4.view.ViewPager;
+android.support.v13.view.ViewPager;
diff --git a/historical_anti-pattern_detection/data/blob/frameworks-tool-base.csv b/historical_anti-pattern_detection/data/blob/frameworks-tool-base.csv
diff --git a/historical_anti-pattern_detection/data/blob/jedit.csv b/historical_anti-pattern_detection/data/blob/jedit.csv
@@ -0,0 +1,5 @@
+bsh.ParserTokenManager;
+bsh.Parser;
+com.microstar.xml.XmlParser;
+org.gjt.sp.jedit.Buffer;
+org.gjt.sp.jedit.textarea.JEditTextArea;
diff --git a/historical_anti-pattern_detection/data/systems_history/.DS_Store b/historical_anti-pattern_detection/data/systems_history/.DS_Store
diff --git a/historical_anti-pattern_detection/data/systems_history/apache-ant.csv b/historical_anti-pattern_detection/data/systems_history/apache-ant.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/apache-tomcat.csv b/historical_anti-pattern_detection/data/systems_history/apache-tomcat.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/frameworks-base.csv b/historical_anti-pattern_detection/data/systems_history/frameworks-base.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/frameworks-opt-telephony.csv b/historical_anti-pattern_detection/data/systems_history/frameworks-opt-telephony.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/frameworks-sdk.csv b/historical_anti-pattern_detection/data/systems_history/frameworks-sdk.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/frameworks-support.csv b/historical_anti-pattern_detection/data/systems_history/frameworks-support.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/frameworks-tool-base.csv b/historical_anti-pattern_detection/data/systems_history/frameworks-tool-base.csv
diff --git a/historical_anti-pattern_detection/data/systems_history/jedit.csv b/historical_anti-pattern_detection/data/systems_history/jedit.csv
diff --git a/historical_anti-pattern_detection/methode2vec.py b/historical_anti-pattern_detection/methode2vec.py
@@ -0,0 +1,17 @@
+from pyspark.mllib.feature import Word2Vec
+from pyspark import SparkContext
+
+#import collections
+import reader
+
+# Train a Word2Vec embedding with Spark MLlib on the text file ./data.txt.
+sc = SparkContext()
+# Every line of the file becomes one list of tokens, split on single spaces.
+# NOTE(review): presumably data.txt is the file produced by reader.data2Text
+# (one commit per line) -- confirm.
+rdd = sc.textFile("./data.txt").map(lambda row: row.split(" "))
+# Disabled alternative that built the RDD directly from the reader module:
+#data = reader.getData('./systems_history/frameworks-base.csv')
+#rdd = sc.parallelize(List(data)).collect()
+
+
+# Configure the trainer (minimum token count 25, learning rate 0.025,
+# 8-dimensional vectors) and fit it on the tokenized lines.
+word2vec = Word2Vec()
+word2vec.setMinCount(25)
+word2vec.setLearningRate(0.025)
+word2vec.setVectorSize(8)
+# NOTE(review): the fitted model is neither saved nor queried in this script
+# and the SparkContext is never stopped -- confirm this is intentional.
+model = word2vec.fit(rdd)
diff --git a/historical_anti-pattern_detection/reader.py b/historical_anti-pattern_detection/reader.py
@@ -0,0 +1,158 @@
+from __future__ import print_function
+from __future__ import division
+
+import csv
+import os
+import sys
+import fnmatch
+
+import numpy as np
+
+
+def read(csvFile):
+	"""Return the METHOD-level changes stored in a ';'-separated history file.
+
+	Args:
+		csvFile (str): Path of a CSV file whose header row provides at least
+			the 'Entity', 'Snapshot' and 'Code' columns.
+
+	Returns:
+		list of dict: One {'Snapshot': ..., 'Methode': ...} entry per row
+			whose 'Entity' column equals 'METHOD', in file order.
+	"""
+	# The csv module needs a byte stream on Python 2 but a text stream opened
+	# with newline='' on Python 3; a hard-coded 'rb' only works on Python 2.
+	if sys.version_info[0] < 3:
+		csvfile = open(csvFile, 'rb')
+	else:
+		csvfile = open(csvFile, 'r', newline='')
+
+	with csvfile:
+		reader = csv.DictReader(csvfile, delimiter=';')
+		return [
+			{'Snapshot': row['Snapshot'], 'Methode': row['Code']}
+			for row in reader
+			if row['Entity'] == 'METHOD'
+		]
+
+def readHistory(csvFile):
+	"""Return the class-level change history stored in a ';'-separated file.
+
+	METHOD rows are attributed to their owning class (the 'Code' value with
+	its last dot-separated component removed); CLASS rows are kept as they
+	are. Rows with any other 'Entity' value are ignored.
+
+	Args:
+		csvFile (str): Path of a CSV file whose header row provides at least
+			the 'Entity', 'Snapshot' and 'Code' columns.
+
+	Returns:
+		list of dict: One {'Snapshot': ..., 'Class': ...} entry per retained
+			row, in file order.
+	"""
+	# The csv module needs a byte stream on Python 2 but a text stream opened
+	# with newline='' on Python 3; a hard-coded 'rb' only works on Python 2.
+	if sys.version_info[0] < 3:
+		csvfile = open(csvFile, 'rb')
+	else:
+		csvfile = open(csvFile, 'r', newline='')
+
+	changes = []
+	with csvfile:
+		for row in csv.DictReader(csvfile, delimiter=';'):
+			entity = row['Entity']
+			if entity == 'METHOD':
+				# Drop the trailing method name to get the owning class
+				className = '.'.join(row['Code'].split('.')[:-1])
+			elif entity == 'CLASS':
+				className = row['Code']
+			else:
+				continue
+			changes.append({'Snapshot': row['Snapshot'], 'Class': className})
+
+	return changes
+
+
+# Return the co-occurrence matrix of the different classes
+def getCoocMatrix(changes):
+	"""Count, for every pair of classes, the commits in which both changed.
+
+	A commit is a run of consecutive entries sharing the same 'Snapshot'
+	value, so the entries of one commit must be adjacent in `changes`.
+
+	Args:
+		changes (list of dict): Entries with a 'Snapshot' and a 'Class' key.
+
+	Returns:
+		numpy array: Square int32 matrix. Entry (i, j) is the number of
+			commits that touched both class i and class j, hence entry (i, i)
+			is the number of commits that touched class i. Rows and columns
+			follow the sorted class names so the layout is reproducible.
+			An empty `changes` list gives a 0x0 matrix.
+	"""
+	# Collect the distinct classes of each commit
+	commits = []
+	snapshot = None
+	for change in changes:
+		if not commits or change['Snapshot'] != snapshot:
+			snapshot = change['Snapshot']
+			commits.append(set())
+		commits[-1].add(change['Class'])
+
+	# Sorted rather than raw set order: set iteration order is arbitrary and
+	# would make the row/column layout differ from one run to the next.
+	classes = sorted(set().union(*commits))
+	size = len(classes)
+	reverseDictionnary = {className: i for i, className in enumerate(classes)}
+
+	coocMatrix = np.zeros((size, size), dtype=np.int32)
+
+	for commit in commits:
+		indices = [reverseDictionnary[className] for className in commit]
+		# Every pair of classes of this commit (each class with itself
+		# included) co-occurs once; the indices are unique, so the in-place
+		# increment counts each cell exactly once.
+		coocMatrix[np.ix_(indices, indices)] += 1
+
+	return coocMatrix
+
+# Return the conditional probability matrix of the different classes
+def getCPM(csvFile):
+	"""Compute the conditional change-probability matrix of a history file.
+
+	Considering ci = True when class i changes in a commit,
+	CPM(i, j) = P(cj | ci) = coocMatrix(i, j) / coocMatrix(i, i).
+
+	Args:
+		csvFile (str): Path of the history CSV file, read with readHistory.
+
+	Returns:
+		numpy array: Square float matrix laid out like the matrix returned
+			by getCoocMatrix; every diagonal entry equals 1.
+	"""
+	changes = readHistory(csvFile)
+	coocMatrix = getCoocMatrix(changes)
+
+	# coocMatrix(i, i) is the number of commits touching class i. It is at
+	# least 1 for every class in the matrix, so the division is always defined.
+	commitsPerClass = np.diag(coocMatrix).astype(np.float64)
+
+	# Divide every ROW i by coocMatrix(i, i). The former implementation
+	# scaled column j by 1/coocMatrix(j, j) instead, which yields P(ci | cj),
+	# i.e. the transpose of the matrix documented above.
+	CPM = coocMatrix / commitsPerClass[:, np.newaxis]
+
+	return CPM
+
+
+def data2Text(csvFile):
+	"""Append the method-level history of a CSV file to 'data.txt'.
+
+	One line is written per commit (run of consecutive changes sharing the
+	same 'Snapshot' value); it holds the changed methods separated by single
+	spaces.
+
+	Args:
+		csvFile (str): Path of the history CSV file, read with read().
+	"""
+	changes = read(csvFile)
+
+	# Group the changed methods commit by commit
+	commits = []
+	snapshot = None
+	for change in changes:
+		if not commits or change['Snapshot'] != snapshot:
+			snapshot = change['Snapshot']
+			commits.append([])
+		commits[-1].append(change['Methode'])
+
+	# 'with' closes the file even if a write fails. Writing after grouping
+	# also emits the LAST commit, which the former loop never flushed, and
+	# join() avoids the leading space that used to start every line (it
+	# produced an empty first token when the line was split on ' ').
+	with open('data.txt', 'a') as f:
+		for methods in commits:
+			f.write(' '.join(methods) + '\n')
+
+
+if __name__ == "__main__":
+	# Disabled experiment, kept as a bare string literal (never executed):
+	# for every CSV under ./systems_history it printed the file name, the
+	# number of distinct snapshots and the ratio of method changes to
+	# distinct methods.
+	'''for path,dirs,files in os.walk('./systems_history'):
+		for f in fnmatch.filter(files,'*.csv'):
+			fullname = os.path.abspath(os.path.join(path,f))
+			changes = read(fullname)
+
+			snapshots = []
+			methods = []
+
+			for change in changes:
+				snapshots.append(change['Snapshot'])
+				methods.append(change['Methode'])
+
+			ratio = (len(methods)/len(set(methods)))
+			print('system name :', f)
+			print('nb snapshot :',len(set(snapshots)))
+			print('methods :', ratio)'''
+
+	# Disabled: export of one system's history to data.txt
+	#data2Text('./systems_history/frameworks-base.csv')
+
+	# Disabled hand-made example (string literal, never executed) used to
+	# try getCoocMatrix on four small commits.
+	'''changes = [
+		{'Snapshot': '1', 'Class':'a'},
+		{'Snapshot': '1', 'Class':'b'},
+		{'Snapshot': '2', 'Class':'a'},
+		{'Snapshot': '3', 'Class':'a'},
+		{'Snapshot': '3', 'Class':'a'},
+		{'Snapshot': '3', 'Class':'c'},
+		{'Snapshot': '3', 'Class':'b'},
+		{'Snapshot': '4', 'Class':'b'},
+		{'Snapshot': '4', 'Class':'b'},
+		{'Snapshot': '4', 'Class':'c'},
+	]
+
+	coocMatrix = getCoocMatrix(changes)'''
+	# The only live code: compute and print the conditional probability
+	# matrix of the frameworks-support history (path relative to the
+	# current working directory).
+	CPM = getCPM('./data/systems_history/frameworks-support.csv')
+	print(CPM)
+
+
+
+
diff --git a/word_embedding/.DS_Store b/word_embedding/.DS_Store
diff --git a/word_embedding/assess.py b/word_embedding/assess.py
@@ -0,0 +1,34 @@
+from sklearn.manifold import TSNE
+from model import Java2Vec
+import matplotlib.pyplot as plt
+
+
+def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
+  """Scatter-plot 2-D points, annotate each one and save the figure.
+
+  Args:
+    low_dim_embs: 2-D array with at least as many rows as there are labels;
+      row i holds the (x, y) position of labels[i].
+    labels: Sequence of texts to draw next to the points.
+    filename: Path handed to matplotlib's savefig.
+  """
+  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
+  plt.figure(figsize=(18, 18))  # in inches
+
+  # Placement shared by every annotation
+  annotation_style = dict(
+      xytext=(5, 2),
+      textcoords='offset points',
+      ha='right',
+      va='bottom')
+
+  for row, text in enumerate(labels):
+    x, y = low_dim_embs[row, :]
+    plt.scatter(x, y)
+    plt.annotate(text, xy=(x, y), **annotation_style)
+
+  plt.savefig(filename)
+
+def visualize_embedding(pickle_file='embedding_model.pickle', nb_word=50):
+  """Project the first words of an embedding to 2-D with t-SNE and plot them.
+
+  The figure is written by plot_with_labels under its default file name.
+
+  Args:
+    pickle_file: Path of the pickled model loaded through Java2Vec.
+    nb_word: Maximum number of words (first rows of the vector space) to
+      plot; it is capped at the number of words in the model.
+  """
+  model = Java2Vec(pickle_file)
+
+  # Never ask for more words than the model holds: the slice below would
+  # silently truncate while the label lookup raised a KeyError.
+  plot_only = min(nb_word, len(model.line_to_word))
+
+  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
+  low_dim_embs = tsne.fit_transform(model.vector_space[:plot_only, :])
+  # range() instead of xrange(), which only exists on Python 2
+  labels = [model.line_to_word[i] for i in range(plot_only)]
+  plot_with_labels(low_dim_embs, labels)
+
+
+
+if __name__ == "__main__":
+  # Script entry point: plot the first 84 words of embedding_model.pickle
+  visualize_embedding('embedding_model.pickle', nb_word=84)
diff --git a/word_embedding/assess.pyc b/word_embedding/assess.pyc
diff --git a/word_embedding/model.py b/word_embedding/model.py
@@ -0,0 +1,94 @@
+import pickle
+import numpy as np
+
+
+class Java2Vec(object):
+    """Load a pickled word-embedding model and provide access to the vectors
+    and vector space.
+    Provides access to word vectors as if it were a dictionary:
+        java2vec = Java2Vec('file')
+        java2vec['word']
+    Unrecognized words will return the Null vector (all 0).
+    Also provides a way to find the closest words to a vector in the vector
+    space.
+    Args:
+        pickle_file (str): Location of the pickled model, expected to hold a
+            dict mapping each word to its vector.
+    Attributes:
+        line_to_word (dict): Maps a row number of vector_space to its word.
+        vector_space (numpy array): The vectors of all words, one per row.
+        null_vector (numpy array): The null (0) vector as a numpy array. It has
+            the correct size for the model's vector space.
+        vector_size (int): The number of dimensions in the vector space (0 for
+            an empty model).
+    """
+    # String base type on both Python 2 (basestring) and Python 3 (str)
+    try:
+        _string_types = basestring
+    except NameError:
+        _string_types = str
+
+    def __init__(self, pickle_file):
+        self.__model = {}
+        self.line_to_word = {}
+        space = []
+
+        # Pickles are binary data, so the file must be opened in 'rb' mode
+        # (text mode fails on Python 3).
+        # NOTE: unpickling can execute arbitrary code -- only load model files
+        # coming from a trusted source.
+        with open(pickle_file, 'rb') as open_file:
+            self.__model = pickle.load(open_file)
+
+        for line_number, key_word in enumerate(self.__model):
+            self.line_to_word[line_number] = key_word
+            space.append(self.__model[key_word])
+
+        # Set up a vector space so we can quickly find the closest vector
+        self.vector_space = np.array(space)
+
+        # Null vector for unrecognized words; an empty model gets size 0
+        # instead of failing on an undefined loop variable.
+        self.vector_size = len(space[-1]) if space else 0
+        self.null_vector = np.zeros(self.vector_size)
+
+    def __getitem__(self, key):
+        """Return the vector representation of a word.
+        Args:
+            key (str): A word to locate in the vector space.
+        Returns:
+            numpy array: The location of the word in the vector space, or the
+                null (0) vector if the word is not found.
+        """
+        return self.__model.get(key, self.null_vector)
+
+    def closest_words(self, input_arg, n=1):
+        """Return the n closest words to a given word or vector.
+        Args:
+            input_arg (str or numpy array): Either a string of a word in the
+                model, or a vector of the same dimension as the vector space.
+                A word is looked up with its exact spelling first, then
+                lower-cased.
+            n (Optional[int]): The number of values to return. Defaults to 1.
+                Fewer values are returned when the model is too small.
+        Returns:
+            list of tuples: A list containing tuples of the form:
+                (distance, word), sorted by increasing distance, where the
+                distance is the squared Euclidean distance rounded to 3
+                decimals. None is returned if a string was provided as an
+                argument that is not in the model.
+        """
+        # If you gave us a word, find the vector, otherwise if the word is not
+        # in the model return None.
+        if isinstance(input_arg, self._string_types):
+            # Exact key first, consistent with __getitem__; the lower-cased
+            # form is kept as a fallback for existing callers.
+            vector = self.__model.get(input_arg)
+            if vector is None:
+                vector = self.__model.get(input_arg.lower())
+            if vector is None:
+                return None
+        else:
+            vector = input_arg
+
+        if not self.line_to_word:
+            return []
+
+        squares = (self.vector_space - vector)**2
+        distances = np.sum(squares, axis=1)
+
+        # Find the closest vectors, note that we use n+1 because we sometimes
+        # discard the vector with distance == 0 and we still want to have n
+        # results. The count is capped at the vocabulary size because
+        # argpartition raises when asked for an out-of-range position.
+        candidates = min(n + 1, len(distances))
+        if candidates <= 0:
+            return []
+        line_numbers = np.argpartition(distances, candidates - 1)[:candidates]
+
+        # argpartition only guarantees that the smallest values come first,
+        # not their order, so we have to sort.
+        output = []
+        for line_number in line_numbers:
+            dist = distances[line_number]
+            # Throw out identical vectors, there should be only one
+            if dist == 0:
+                continue
+
+            word = self.line_to_word[line_number]
+            output.append((round(dist, 3), word))
+
+        return sorted(output)[:n]
diff --git a/word_embedding/model.pyc b/word_embedding/model.pyc