Skip to content

Commit

Permalink
add some work about word embedding and historical antipattern detection
Browse files Browse the repository at this point in the history
  • Loading branch information
antoineBarbez committed Nov 15, 2017
1 parent e18d9ea commit 6571cda
Show file tree
Hide file tree
Showing 30 changed files with 482,277 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added historical_anti-pattern_detection/.DS_Store
Binary file not shown.
Binary file not shown.
8 changes: 8 additions & 0 deletions historical_anti-pattern_detection/data/blob/apache-ant.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
proposal.sandbox.antlib.src.main.org.apache.tools.ant.Project;
src.main.org.apache.tools.ant.Project;
src.main.org.apache.tools.ant.taskdefs.Javadoc;
org.apache.tools.ant.taskdefs.Zip;
org.apache.tools.ant.types.Commandline;
org.apache.tools.ant.taskdefs.Java;
org.apache.tools.ant.types.Path;
src.main.org.apache.tools.ant.taskdefs.optional.net.FTP;
5 changes: 5 additions & 0 deletions historical_anti-pattern_detection/data/blob/apache-tomcat.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
java.org.apache.el.parser.ELParser;
java.org.apache.catalina.core.StandardContext;
java.org.apache.catalina.loader.WebappClassLoader;
java.org.apache.catalina.deploy.WebXml;
java.org.apache.catalina.servlets.WebdavServlet;
35 changes: 35 additions & 0 deletions historical_anti-pattern_detection/data/blob/frameworks-base.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
com.android.ddmlib.Device;

services.java.com.android.server.WindowManagerService;

com.android.ddmlib.Client;

com.android.sdklib.devices.DeviceParser;

com.android.ide.common.resources.ValueResourceParser;

com.android.tools.lint.checks.TestLintClient;

com.android.sdklib.internal.repository.PackageTest;

com.android.ddmlib.testrunner.XmlTestRunListener;

com.android.ddmlib.AndroidDebugBridge;

core.java.android.app.ActivityThread;

core.java.android.app.Activity;

core.java.android.view.View;

services.java.com.android.server.am.ActivityManagerService;

core.java.com.android.internal.os.BatteryStatsImpl;

core.java.android.widget.AbsListView;

core.java.android.provider.Settings;

opengl.java.android.opengl.GLLogWrapper;

telephony.java.android.telephony.PhoneNumberUtils;
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
src.java.com.android.internal.telephony.DataConnection;
src.java.com.android.internal.telephony.CallManager;
src.java.com.android.internal.telephony.DataConnectionTracker;
com.android.internal.telephony.PhoneBase;
com.android.internal.telephony.gsm.SIMRecords;
com.android.internal.telephony.IccCard;
com.android.internal.telephony.cdma.CdmaLteServiceStateTracker;
com.android.internal.telephony.DataConnectionTracker;
com.android.internal.telephony.cdma.CdmaServiceStateTracker;
com.android.internal.telephony.gsm.GsmServiceStateTracker;
com.android.internal.telephony.gsm.GsmDataConnectionTracker;
com.android.internal.telephony.cdma.CdmaDataConnectionTracker;
com.android.internal.telephony.cdma.sms.BearerData;
10 changes: 10 additions & 0 deletions historical_anti-pattern_detection/data/blob/frameworks-sdk.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
com.android.sdkmanager.Main;
com.android.sdkuilib.internal.repository.UpdaterWindowImpl;
com.android.ide.eclipse.adt.internal.editors.layout.configuration.ConfigurationComposite;
com.android.ide.eclipse.adt.internal.editors.uimodel.UiElementNode;
com.android.sdklib.SdkManager;
com.android.ddmlib.testrunner.InstrumentationResultParser;
sdkmanager.libs.sdkuilib.tests.com.android.sdkuilib.internal.repository.sdkman2.PackagesDiffLogicTest;
traceview.src.com.android.traceview.TimeLineView;
ddms.libs.ddmuilib.src.com.android.ddmuilib.logcat.LogPanel;
hierarchyviewer.src.com.android.hierarchyviewer.ui.Workspace;
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
android.support.v4.app.FragmentActivity;
v4.java.android.support.v4.app.Fragment;
v4.java.android.support.v4.app.FragmentManagerImpl;
v4.java.android.support.v4.view.ViewPager;
android.support.v13.view.ViewPager;
Empty file.
5 changes: 5 additions & 0 deletions historical_anti-pattern_detection/data/blob/jedit.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
bsh.ParserTokenManager;
bsh.Parser;
com.microstar.xml.XmlParser;
org.gjt.sp.jedit.Buffer;
org.gjt.sp.jedit.textarea.JEditTextArea;
Binary file not shown.
25,118 changes: 25,118 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/apache-ant.csv

Large diffs are not rendered by default.

7,795 changes: 7,795 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/apache-tomcat.csv

Large diffs are not rendered by default.

301,951 changes: 301,951 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/frameworks-base.csv

Large diffs are not rendered by default.

Large diffs are not rendered by default.

98,423 changes: 98,423 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/frameworks-sdk.csv

Large diffs are not rendered by default.

5,366 changes: 5,366 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/frameworks-support.csv

Large diffs are not rendered by default.

8,746 changes: 8,746 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/frameworks-tool-base.csv

Large diffs are not rendered by default.

27,371 changes: 27,371 additions & 0 deletions historical_anti-pattern_detection/data/systems_history/jedit.csv

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions historical_anti-pattern_detection/methode2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pyspark.mllib.feature import Word2Vec
from pyspark import SparkContext

#import collections
import reader

# Load the corpus: every line of data.txt is split on single spaces into a
# list of tokens (presumably one commit per line, as produced by
# reader.data2Text -- confirm before relying on it).
sc = SparkContext()
rdd = sc.textFile("./data.txt").map(lambda line: line.split(" "))

# Alternative input path, currently disabled.
# NOTE(review): reader.py as committed defines no getData function.
#data = reader.getData('./systems_history/frameworks-base.csv')
#rdd = sc.parallelize(List(data)).collect()

# Configure the trainer: ignore tokens seen fewer than 25 times and learn
# 8-dimensional vectors with an initial learning rate of 0.025.
word2vec = (
    Word2Vec()
    .setMinCount(25)
    .setLearningRate(0.025)
    .setVectorSize(8)
)
model = word2vec.fit(rdd)
158 changes: 158 additions & 0 deletions historical_anti-pattern_detection/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
from __future__ import print_function
from __future__ import division

import csv
import os
import sys
import fnmatch

import numpy as np


def read(csvFile):
    """Return the method-level changes recorded in a history CSV file.

    The file is parsed as a ';'-delimited CSV with a header row. Only the
    rows whose 'Entity' column equals 'METHOD' are kept.

    Args:
        csvFile (str): Path of the CSV file to read.

    Returns:
        list of dict: One dict per kept row, in file order, with the keys
        'Snapshot' (value of the 'Snapshot' column) and 'Methode' (value of
        the 'Code' column).
    """
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; 'rb' alone breaks DictReader on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(csvFile, 'r', newline='')
    else:
        csvfile = open(csvFile, 'rb')

    with csvfile:
        rows = csv.DictReader(csvfile, delimiter=';')
        return [
            {'Snapshot': row['Snapshot'], 'Methode': row['Code']}
            for row in rows
            if row['Entity'] == 'METHOD'
        ]

def readHistory(csvFile):
    """Return the class-level change history recorded in a CSV file.

    The file is parsed as a ';'-delimited CSV with a header row. Rows whose
    'Entity' column is 'METHOD' or 'CLASS' are kept; every other row is
    ignored.

    Args:
        csvFile (str): Path of the CSV file to read.

    Returns:
        list of dict: One dict per kept row, in file order, with the keys
        'Snapshot' (value of the 'Snapshot' column) and 'Class'. For a
        'CLASS' row, 'Class' is the 'Code' column unchanged; for a 'METHOD'
        row it is the 'Code' column with its last '.'-separated component
        removed (an empty string when 'Code' contains no dot).
    """
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; 'rb' alone breaks DictReader on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(csvFile, 'r', newline='')
    else:
        csvfile = open(csvFile, 'rb')

    changes = []
    with csvfile:
        for row in csv.DictReader(csvfile, delimiter=';'):
            entity = row['Entity']
            if entity == 'METHOD':
                # Strip the trailing component (presumably the method name)
                # to obtain the owning class.
                # NOTE(review): assumes 'Code' carries no dotted parameter
                # types after the method name -- confirm against the data.
                className = row['Code'].rpartition('.')[0]
            elif entity == 'CLASS':
                className = row['Code']
            else:
                continue
            changes.append({'Snapshot': row['Snapshot'], 'Class': className})

    return changes


def getCoocMatrix(changes):
    """Return the co-occurrence matrix of the classes found in *changes*.

    Consecutive entries of *changes* that share the same 'Snapshot' value are
    grouped into one commit. Entry (i, j) of the result is the number of
    commits in which class i and class j both appear, so the diagonal entry
    (i, i) is the number of commits that touch class i. A class listed
    several times in one commit is counted once for that commit.

    Args:
        changes (list of dict): Changes with the keys 'Snapshot' and 'Class',
            e.g. the output of readHistory.

    Returns:
        numpy array: Square int32 matrix whose rows and columns follow the
        sorted order of the distinct class names. It has shape (0, 0) when
        *changes* is empty.
    """
    # Group consecutive changes of the same snapshot into sets of classes.
    commits = []
    previous = None
    for change in changes:
        snapshot = change['Snapshot']
        if not commits or snapshot != previous:
            commits.append(set())
            previous = snapshot
        commits[-1].add(change['Class'])

    # Sorted order makes the row/column layout reproducible; plain set
    # iteration order is arbitrary.
    classes = sorted(set().union(*commits))
    index = {name: position for position, name in enumerate(classes)}
    size = len(classes)

    coocMatrix = np.zeros((size, size), dtype=np.int32)
    for commit in commits:
        rows = np.array([index[name] for name in commit], dtype=np.intp)
        # Increment every (i, j) pair of classes in this commit. The indices
        # are unique, so the in-place add touches each cell exactly once.
        coocMatrix[rows[:, np.newaxis], rows] += 1

    return coocMatrix

def getCPM(csvFile):
    """Return the conditional probability matrix of the classes of a system.

    Considering ci to be true when class i changes in a commit, the result
    is defined as

        CPM(i, j) = P(cj | ci) = coocMatrix(i, j) / coocMatrix(i, i)

    where coocMatrix is the co-occurrence matrix built by getCoocMatrix from
    the history returned by readHistory.

    Args:
        csvFile (str): Path of the history CSV file.

    Returns:
        numpy array: Square float matrix with the same row/column order as
        the co-occurrence matrix; its diagonal is all ones.
    """
    coocMatrix = getCoocMatrix(readHistory(csvFile))

    # coocMatrix(i, i) is the number of commits touching class i. It is never
    # zero because every class in the matrix appears in at least one commit.
    occurrences = coocMatrix.diagonal().astype(np.float64)

    # Divide ROW i by coocMatrix(i, i). The previous implementation divided
    # column j by coocMatrix(j, j), which yields the transpose of the
    # documented matrix.
    return coocMatrix / occurrences[:, np.newaxis]


def data2Text(csvFile):
    """Append the method-level history of *csvFile* to 'data.txt'.

    Consecutive changes returned by read() that share the same 'Snapshot'
    value are grouped into one commit. Each commit becomes one line of
    'data.txt' holding its 'Methode' values separated by single spaces.

    The final commit is written too (it used to be dropped), and lines no
    longer start with a space, so splitting a line on ' ' gives no empty
    leading token. Nothing is written when the history contains no method
    change.

    Args:
        csvFile (str): Path of the history CSV file.
    """
    changes = read(csvFile)

    # Group consecutive changes of the same snapshot into commits.
    commits = []
    previous = None
    for change in changes:
        snapshot = change['Snapshot']
        if not commits or snapshot != previous:
            commits.append([])
            previous = snapshot
        commits[-1].append(change['Methode'])

    if not commits:
        return

    # 'a' keeps what earlier calls wrote, so several systems can be
    # accumulated in the same corpus file.
    with open('data.txt', 'a') as f:
        f.writelines(' '.join(commit) + '\n' for commit in commits)


if __name__ == "__main__":
    # Disabled experiment, kept as an inert string literal: for every CSV
    # under ./systems_history it printed the system name, the number of
    # distinct snapshots and the ratio of method changes to distinct methods.
    '''for path,dirs,files in os.walk('./systems_history'):
for f in fnmatch.filter(files,'*.csv'):
fullname = os.path.abspath(os.path.join(path,f))
changes = read(fullname)
snapshots = []
methods = []
for change in changes:
snapshots.append(change['Snapshot'])
methods.append(change['Methode'])
ratio = (len(methods)/len(set(methods)))
print('system name :', f)
print('nb snapshot :',len(set(snapshots)))
print('methods :', ratio)'''

    # Disabled: dump the frameworks-base history to data.txt.
    #data2Text('./systems_history/frameworks-base.csv')

    # Disabled experiment, kept as an inert string literal: a small
    # hand-written history used to try getCoocMatrix.
    '''changes = [
{'Snapshot': '1', 'Class':'a'},
{'Snapshot': '1', 'Class':'b'},
{'Snapshot': '2', 'Class':'a'},
{'Snapshot': '3', 'Class':'a'},
{'Snapshot': '3', 'Class':'a'},
{'Snapshot': '3', 'Class':'c'},
{'Snapshot': '3', 'Class':'b'},
{'Snapshot': '4', 'Class':'b'},
{'Snapshot': '4', 'Class':'b'},
{'Snapshot': '4', 'Class':'c'},
]
coocMatrix = getCoocMatrix(changes)'''
    # Active code: compute and print the matrix returned by getCPM for the
    # frameworks-support history.
    CPM = getCPM('./data/systems_history/frameworks-support.csv')
    print(CPM)




Binary file added word_embedding/.DS_Store
Binary file not shown.
34 changes: 34 additions & 0 deletions word_embedding/assess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from sklearn.manifold import TSNE
from model import Java2Vec
import matplotlib.pyplot as plt


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Draw labelled 2-D points and save the figure to *filename*.

    Args:
        low_dim_embs: Array with one (x, y) row per point; it must have at
            least as many rows as there are labels.
        labels: Text to print next to each point; the k-th label annotates
            the k-th row of *low_dim_embs*.
        filename (str): Path of the image file to write. Defaults to
            'tsne.png'.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'

    # Annotation placement shared by every point: text is offset by (5, 2)
    # points from the marker and anchored by its bottom-right corner.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )

    plt.figure(figsize=(18, 18))  # in inches
    for label, point in zip(labels, low_dim_embs):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotation_style)

    plt.savefig(filename)

def visualize_embedding(pickle_file='embedding_model.pickle', nb_word=50):
    """Project the first words of an embedding model to 2-D and plot them.

    The model is loaded with Java2Vec, the first *nb_word* rows of its
    vector space are reduced to two dimensions with t-SNE, and the result is
    drawn by plot_with_labels (which saves it under its default filename).

    Args:
        pickle_file (str): Path of the pickled embedding model.
        nb_word (int): Maximum number of words to plot. It is capped at the
            number of words in the model; asking for more used to raise a
            KeyError while collecting the labels.
    """
    model = Java2Vec(pickle_file)

    # Never request more rows than the model actually holds.
    plot_only = min(nb_word, len(model.vector_space))

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    low_dim_embs = tsne.fit_transform(model.vector_space[:plot_only, :])

    # range() instead of the Python-2-only xrange(); the list is small.
    labels = [model.line_to_word[i] for i in range(plot_only)]
    plot_with_labels(low_dim_embs, labels)



# Script entry point: plot up to 84 words of the model stored in
# 'embedding_model.pickle' (path relative to the working directory).
if __name__ == "__main__":
    visualize_embedding('embedding_model.pickle', nb_word=84)
Binary file added word_embedding/assess.pyc
Binary file not shown.
94 changes: 94 additions & 0 deletions word_embedding/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import pickle
import numpy as np


class Java2Vec(object):
    """Load a pickled word-embedding model and give access to its vectors.

    The pickle file must contain a mapping from word to vector.
    Provides access to word vectors as if it were a dictionary:
        java2vec = Java2Vec('embedding_model.pickle')
        java2vec['word']
    Unrecognized words will return the null vector (all 0).
    Also provides a way to find the closest words to a vector in the vector
    space.

    Args:
        pickle_file (str): Location of the pickled model.

    Attributes:
        line_to_word (dict): Maps a row index of vector_space to its word.
        vector_space (numpy array): One row per word of the model.
        null_vector (numpy array): The null (0) vector as a numpy array. It
            has the correct size for the model's vector space.
        vector_size (int): The number of dimensions in the vector space
            (0 when the model is empty).
    """

    # basestring only exists on Python 2; fall back to str on Python 3.
    try:
        _STRING_TYPES = basestring
    except NameError:
        _STRING_TYPES = str

    def __init__(self, pickle_file):
        self.__model = {}
        self.line_to_word = {}
        space = []

        # Pickle data is binary, so the file must be opened in 'rb' mode:
        # text mode fails on Python 3 and can corrupt data on Windows.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that come from a trusted source.
        with open(pickle_file, 'rb') as open_file:
            self.__model = pickle.load(open_file)

        for line_number, key_word in enumerate(self.__model):
            self.line_to_word[line_number] = key_word
            space.append(self.__model[key_word])

        # Set up a vector space so we can quickly find the closest vector.
        # An empty model gets an empty 2-D array instead of crashing.
        self.vector_space = np.array(space) if space else np.zeros((0, 0))

        # Null vector for unrecognized words
        self.vector_size = len(space[-1]) if space else 0
        self.null_vector = np.zeros(self.vector_size)

    def __getitem__(self, key):
        """Return the vector representation of a word.

        Args:
            key (str): A word to locate in the vector space.

        Returns:
            numpy array: The location of the word in the vector space, or the
            null (0) vector if the word is not found.
        """
        return self.__model.get(key, self.null_vector)

    def closest_words(self, input_arg, n=1):
        """Return the n closest words to a given word or vector.

        Args:
            input_arg (str or numpy array): Either a string of a word in the
                model (it is lower-cased before the lookup), or a vector of
                the same dimension as the vector space.
            n (Optional[int]): The number of values to return. Defaults to 1.

        Returns:
            list of tuples: At most n tuples of the form
            (squared Euclidean distance rounded to 3 decimals, word), sorted
            by increasing distance. Words at distance 0 are left out. Fewer
            than n tuples are returned when the model is too small. None is
            returned if a string was provided as an argument that is not in
            the model.
        """
        # If you gave us a word, find the vector, otherwise if the word is not
        # in the model return None.
        if isinstance(input_arg, self._STRING_TYPES):
            vector = self.__model.get(input_arg.lower(), None)
            if vector is None:
                return None
        else:
            vector = input_arg

        total = len(self.vector_space)
        if total == 0:
            return []

        distances = np.sum((self.vector_space - vector) ** 2, axis=1)

        # Look at n+1 candidates because the vector at distance 0 (the query
        # word itself) is discarded and we still want n results. The count is
        # capped at the vocabulary size: argpartition raises when asked for a
        # position outside the array.
        count = min(n + 1, total)
        if count <= 0:
            return []
        line_numbers = np.argpartition(distances, count - 1)[:count]

        # argpartition only guarantees that the first `count` entries are the
        # smallest ones, not their order, so we have to sort.
        output = []
        for line_number in line_numbers:
            dist = distances[line_number]
            # Throw out identical vectors, there should be only one
            if dist == 0:
                continue
            output.append((round(dist, 3), self.line_to_word[line_number]))

        return sorted(output)[:n]
Binary file added word_embedding/model.pyc
Binary file not shown.
Loading

0 comments on commit 6571cda

Please sign in to comment.