Commit
Add some work on word embedding and historical anti-pattern detection
1 parent e18d9ea
commit 6571cda
Showing 30 changed files with 482,277 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,8 @@
proposal.sandbox.antlib.src.main.org.apache.tools.ant.Project;
src.main.org.apache.tools.ant.Project;
src.main.org.apache.tools.ant.taskdefs.Javadoc;
org.apache.tools.ant.taskdefs.Zip;
org.apache.tools.ant.types.Commandline;
org.apache.tools.ant.taskdefs.Java;
org.apache.tools.ant.types.Path;
src.main.org.apache.tools.ant.taskdefs.optional.net.FTP;
5 changes: 5 additions & 0 deletions
historical_anti-pattern_detection/data/blob/apache-tomcat.csv
@@ -0,0 +1,5 @@
java.org.apache.el.parser.ELParser;
java.org.apache.catalina.core.StandardContext;
java.org.apache.catalina.loader.WebappClassLoader;
java.org.apache.catalina.deploy.WebXml;
java.org.apache.catalina.servlets.WebdavServlet;
35 changes: 35 additions & 0 deletions
historical_anti-pattern_detection/data/blob/frameworks-base.csv
@@ -0,0 +1,35 @@
com.android.ddmlib.Device;
services.java.com.android.server.WindowManagerService;
com.android.ddmlib.Client;
com.android.sdklib.devices.DeviceParser;
com.android.ide.common.resources.ValueResourceParser;
com.android.tools.lint.checks.TestLintClient;
com.android.sdklib.internal.repository.PackageTest;
com.android.ddmlib.testrunner.XmlTestRunListener;
com.android.ddmlib.AndroidDebugBridge;
core.java.android.app.ActivityThread;
core.java.android.app.Activity;
core.java.android.view.View;
services.java.com.android.server.am.ActivityManagerService;
core.java.com.android.internal.os.BatteryStatsImpl;
core.java.android.widget.AbsListView;
core.java.android.provider.Settings;
opengl.java.android.opengl.GLLogWrapper;
telephony.java.android.telephony.PhoneNumberUtils;
13 changes: 13 additions & 0 deletions
historical_anti-pattern_detection/data/blob/frameworks-opt-telephony.csv
@@ -0,0 +1,13 @@
src.java.com.android.internal.telephony.DataConnection;
src.java.com.android.internal.telephony.CallManager;
src.java.com.android.internal.telephony.DataConnectionTracker;
com.android.internal.telephony.PhoneBase;
com.android.internal.telephony.gsm.SIMRecords;
com.android.internal.telephony.IccCard;
com.android.internal.telephony.cdma.CdmaLteServiceStateTracker;
com.android.internal.telephony.DataConnectionTracker;
com.android.internal.telephony.cdma.CdmaServiceStateTracker;
com.android.internal.telephony.gsm.GsmServiceStateTracker;
com.android.internal.telephony.gsm.GsmDataConnectionTracker;
com.android.internal.telephony.cdma.CdmaDataConnectionTracker;
com.android.internal.telephony.cdma.sms.BearerData;
10 changes: 10 additions & 0 deletions
historical_anti-pattern_detection/data/blob/frameworks-sdk.csv
@@ -0,0 +1,10 @@
com.android.sdkmanager.Main;
com.android.sdkuilib.internal.repository.UpdaterWindowImpl;
com.android.ide.eclipse.adt.internal.editors.layout.configuration.ConfigurationComposite;
com.android.ide.eclipse.adt.internal.editors.uimodel.UiElementNode;
com.android.sdklib.SdkManager;
com.android.ddmlib.testrunner.InstrumentationResultParser;
sdkmanager.libs.sdkuilib.tests.com.android.sdkuilib.internal.repository.sdkman2.PackagesDiffLogicTest;
traceview.src.com.android.traceview.TimeLineView;
ddms.libs.ddmuilib.src.com.android.ddmuilib.logcat.LogPanel;
hierarchyviewer.src.com.android.hierarchyviewer.ui.Workspace;
5 changes: 5 additions & 0 deletions
historical_anti-pattern_detection/data/blob/frameworks-support.csv
@@ -0,0 +1,5 @@
android.support.v4.app.FragmentActivity;
v4.java.android.support.v4.app.Fragment;
v4.java.android.support.v4.app.FragmentManagerImpl;
v4.java.android.support.v4.view.ViewPager;
android.support.v13.view.ViewPager;
Empty file.
@@ -0,0 +1,5 @@
bsh.ParserTokenManager;
bsh.Parser;
com.microstar.xml.XmlParser;
org.gjt.sp.jedit.Buffer;
org.gjt.sp.jedit.textarea.JEditTextArea;
Binary file not shown.
25,118 changes: 25,118 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/apache-ant.csv
Large diffs are not rendered by default.
7,795 changes: 7,795 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/apache-tomcat.csv
Large diffs are not rendered by default.
301,951 changes: 301,951 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/frameworks-base.csv
Large diffs are not rendered by default.
6,694 changes: 6,694 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/frameworks-opt-telephony.csv
Large diffs are not rendered by default.
98,423 changes: 98,423 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/frameworks-sdk.csv
Large diffs are not rendered by default.
5,366 changes: 5,366 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/frameworks-support.csv
Large diffs are not rendered by default.
8,746 changes: 8,746 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/frameworks-tool-base.csv
Large diffs are not rendered by default.
27,371 changes: 27,371 additions & 0 deletions
historical_anti-pattern_detection/data/systems_history/jedit.csv
Large diffs are not rendered by default.
@@ -0,0 +1,17 @@
from pyspark.mllib.feature import Word2Vec
from pyspark import SparkContext

#import collections
import reader

sc = SparkContext()
# Each line of data.txt holds one commit: the names of the methods changed in that
# commit, separated by spaces (see data2Text in reader.py).
rdd = sc.textFile("./data.txt").map(lambda row: row.split(" "))
#data = reader.getData('./systems_history/frameworks-base.csv')
#rdd = sc.parallelize(List(data)).collect()


# Train a small word2vec embedding over the co-change "sentences".
word2vec = Word2Vec()
word2vec.setMinCount(25)
word2vec.setLearningRate(0.025)
word2vec.setVectorSize(8)
model = word2vec.fit(rdd)
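The commit does not show how the trained embedding is inspected or exported, so here is a minimal sketch, assuming the standard pyspark.mllib Word2VecModel API (findSynonyms, save); the query token is taken from data/blob/frameworks-base.csv and is only illustrative, since it has to survive the minCount=25 filter to be in the vocabulary.

# Sketch only, not part of the commit: inspect and persist the trained model.
# findSynonyms returns (word, cosine similarity) pairs for the nearest neighbours.
for word, similarity in model.findSynonyms('core.java.android.app.Activity', 5):
    print('%s\t%f' % (word, similarity))

# Persist the MLlib model so it can be reloaded without retraining.
model.save(sc, './word2vec_model')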
@@ -0,0 +1,158 @@
from __future__ import print_function
from __future__ import division

import csv
import os
import sys
import fnmatch

import numpy as np


def read(csvFile):
    with open(csvFile, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        changes = []
        for row in reader:
            if row['Entity'] == 'METHOD':
                change = {}
                change['Snapshot'] = row['Snapshot']
                change['Methode'] = row['Code']
                changes.append(change)

        return changes

def readHistory(csvFile):
    with open(csvFile, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        changes = []
        for row in reader:
            if row['Entity'] == 'METHOD':
                code = row['Code'].split('.')
                code.pop()
                className = '.'.join(code)

                change = {}
                change['Snapshot'] = row['Snapshot']
                change['Class'] = className
                changes.append(change)

            if row['Entity'] == 'CLASS':
                change = {}
                change['Snapshot'] = row['Snapshot']
                change['Class'] = row['Code']
                changes.append(change)

        return changes


# return the co-occurrence matrix of the different classes
def getCoocMatrix(changes):

    data = []
    classOcc = []
    commit = []
    commitNumber = changes[0]['Snapshot']
    for i, change in enumerate(changes):
        classOcc.append(change['Class'])
        if commitNumber != change['Snapshot']:
            data.append(commit)
            commit = []
            commitNumber = change['Snapshot']

        commit.append(change['Class'])
        if i == len(changes)-1:
            data.append(commit)

    classes = list(set(classOcc))
    size = len(classes)
    reverseDictionnary = {classes[i]: i for i in xrange(size)}

    coocMatrix = np.zeros((size,size), dtype=np.int32)

    for commit in data:
        oneHotCommit = np.zeros(size, dtype=np.int32)
        for className in set(commit):
            oneHotCommit[reverseDictionnary[className]] = 1

        c = oneHotCommit.reshape(size,1)
        coocMatrix += c.dot(c.T)

    return coocMatrix

# return the conditional probability matrix of the different classes
def getCPM(csvFile):
    changes = readHistory(csvFile)
    coocMatrix = getCoocMatrix(changes)

    '''considering ci=True if class i changes in a commit,
    CPM(i,j) = P(cj|ci) = coocMatrix(i,j)/coocMatrix(i,i)'''
    size = len(coocMatrix)
    eye = np.identity(size)
    ones = np.ones(size).reshape(size,1)

    # DIV is a matrix whose column values are 1/coocMatrix(i,i)
    DIV = ones.dot(np.divide(ones,(coocMatrix*eye).dot(ones)).T)
    CPM = coocMatrix*DIV

    return CPM


def data2Text(csvFile):
    changes = read(csvFile)

    f = open('data.txt','a')

    commit = ''
    commitNumber = changes[0]['Snapshot']
    for change in changes:
        if commitNumber != change['Snapshot']:
            f.write(commit + '\n')
            commit = ''
            commitNumber = change['Snapshot']

        commit = commit + ' ' + change['Methode']

    f.close()


if __name__ == "__main__":
    '''for path,dirs,files in os.walk('./systems_history'):
        for f in fnmatch.filter(files,'*.csv'):
            fullname = os.path.abspath(os.path.join(path,f))
            changes = read(fullname)
            snapshots = []
            methods = []
            for change in changes:
                snapshots.append(change['Snapshot'])
                methods.append(change['Methode'])
            ratio = (len(methods)/len(set(methods)))
            print('system name :', f)
            print('nb snapshot :',len(set(snapshots)))
            print('methods :', ratio)'''

    #data2Text('./systems_history/frameworks-base.csv')

    '''changes = [
        {'Snapshot': '1', 'Class':'a'},
        {'Snapshot': '1', 'Class':'b'},
        {'Snapshot': '2', 'Class':'a'},
        {'Snapshot': '3', 'Class':'a'},
        {'Snapshot': '3', 'Class':'a'},
        {'Snapshot': '3', 'Class':'c'},
        {'Snapshot': '3', 'Class':'b'},
        {'Snapshot': '4', 'Class':'b'},
        {'Snapshot': '4', 'Class':'b'},
        {'Snapshot': '4', 'Class':'c'},
    ]
    coocMatrix = getCoocMatrix(changes)'''
    CPM = getCPM('./data/systems_history/frameworks-support.csv')
    print(CPM)
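To make the conditional-probability matrix concrete, here is a small worked example; it is a slightly simplified restatement of the toy history commented out in __main__ (duplicate rows removed) and the numbers follow directly from the definition CPM(i,j) = coocMatrix(i,j)/coocMatrix(i,i) given in getCPM.

# Worked example (illustration only): four commits over three classes.
changes = [
    {'Snapshot': '1', 'Class': 'a'},
    {'Snapshot': '1', 'Class': 'b'},
    {'Snapshot': '2', 'Class': 'a'},
    {'Snapshot': '3', 'Class': 'a'},
    {'Snapshot': '3', 'Class': 'c'},
    {'Snapshot': '3', 'Class': 'b'},
    {'Snapshot': '4', 'Class': 'b'},
    {'Snapshot': '4', 'Class': 'c'},
]
coocMatrix = getCoocMatrix(changes)
# a changes in commits 1, 2, 3; b in 1, 3, 4; c in 3, 4. Hence, up to the row/column
# ordering that list(set(...)) happens to pick, cooc(a,a)=3, cooc(b,b)=3, cooc(c,c)=2,
# cooc(a,b)=2, cooc(a,c)=1 and cooc(b,c)=2. By the definition above,
# P(b|c) = cooc(c,b)/cooc(c,c) = 2/2 = 1: every commit that touches c also touches b,
# while P(c|b) = 2/3. That asymmetry is exactly what the CPM is meant to capture.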
Binary file not shown.
@@ -0,0 +1,34 @@
from sklearn.manifold import TSNE
from model import Java2Vec
import matplotlib.pyplot as plt


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)

def visualize_embedding(pickle_file='embedding_model.pickle', nb_word=50):

    model = Java2Vec(pickle_file)

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_only = nb_word
    low_dim_embs = tsne.fit_transform(model.vector_space[:plot_only, :])
    labels = [model.line_to_word[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)



if __name__ == "__main__":
    visualize_embedding('embedding_model.pickle', nb_word=84)
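One usage note, offered as a hedged sketch rather than part of the commit: nb_word=84 assumes the pickled model holds at least 84 words, otherwise the line_to_word lookup raises a KeyError; capping it at the vocabulary size avoids that.

# Sketch only: cap the number of plotted words at the vocabulary size.
# (The model is loaded a second time inside visualize_embedding; fine for a sketch.)
model = Java2Vec('embedding_model.pickle')
visualize_embedding('embedding_model.pickle', nb_word=min(84, len(model.line_to_word)))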
Binary file not shown.
@@ -0,0 +1,94 @@
import pickle
import numpy as np


class Java2Vec(object):
    """Load a pickled embedding model and provide access to the vectors and
    vector space.
    Provides access to word vectors as if it were a dictionary:
        java2vec = Java2Vec('file')
        java2vec['word']
    Unrecognized words will return the null vector (all 0).
    Also provides a way to find the closest word to a vector in the vector
    space.
    Args:
        pickle_file (str): Location of the pickled model file.
    Attributes:
        null_vector (numpy array): The null (0) vector as a numpy array. It has
            the correct size for the model's vector space.
        vector_size (int): The number of dimensions in the vector space.
    """
    def __init__(self, pickle_file):
        self.__model = {}
        self.line_to_word = {}
        space = []

        # Load the embedding data from a file
        with open(pickle_file, 'r') as open_file:
            #tmp_model = json.load(open_file)
            #self.__model = {k: np.array(v) for k, v in tmp_model.iteritems()}
            self.__model = pickle.load(open_file)

        for line_number, key_word in enumerate(self.__model):
            vector = self.__model[key_word]
            self.line_to_word[line_number] = key_word
            space.append(vector)

        # Set up a vector space so we can quickly find the closest vector
        self.vector_space = np.array(space)

        # Null vector for unrecognized words
        self.vector_size = len(vector)
        self.null_vector = np.zeros(self.vector_size)

    def __getitem__(self, key):
        """Return the vector representation of a word.
        Args:
            key (str): A word to locate in the vector space.
        Returns:
            numpy array: The location of the word in the vector space, or the
                null (0) vector if the word is not found.
        """
        return self.__model.get(key, self.null_vector)

    def closest_words(self, input_arg, n=1):
        """Return the n closest words to a given vector.
        Args:
            input_arg (str or numpy array): Either a string of a word in the
                model, or a vector of the same dimension as the vector space.
            n (Optional[int]): The number of values to return. Defaults to 1.
        Returns:
            list of tuples: A list containing tuples of the form:
                (distance, word). None is returned if a string was provided as
                an argument that is not in the model.
        """
        # If given a word, look up its vector (string lookups are lower-cased);
        # if the word is not in the model, return None.
        if isinstance(input_arg, basestring):
            key = input_arg.lower()
            vector = self.__model.get(key, None)
            if vector is None:
                return None
        else:
            vector = input_arg

        # Find the closest vectors; note that we use n+1 because we sometimes
        # discard the vector with distance == 0 and we still want to have n
        # results.
        squares = (self.vector_space - vector)**2
        distances = np.sum(squares, axis=1)
        line_numbers = np.argpartition(distances, n+1)[:n+1]

        # argpartition partitions the list around the nth element, but does not
        # guarantee the order is correct, so we have to sort.
        output = []
        for line_number in line_numbers:
            dist = distances[line_number]
            # Throw out identical vectors, there should be only one
            if dist == 0:
                continue

            word = self.line_to_word[line_number]
            output.append((round(dist, 3), word))

        return sorted(output)[:n]
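The commit does not show how embedding_model.pickle is produced, so here is a minimal end-to-end sketch under the assumption that Java2Vec simply expects a pickled dict mapping each word to its numpy vector (which is what __init__ reads back); the words, vectors, and file name below are made up for illustration.

import pickle
import numpy as np
from model import Java2Vec

# Build a toy {word: vector} dict and pickle it the way Java2Vec expects to read it.
toy_model = {
    'a.B': np.array([1.0, 0.0, 0.0]),
    'a.C': np.array([0.9, 0.1, 0.0]),
    'x.Y': np.array([0.0, 0.0, 1.0]),
}
with open('toy_model.pickle', 'w') as out_file:   # text mode matches the 'r' used in __init__
    pickle.dump(toy_model, out_file)

java2vec = Java2Vec('toy_model.pickle')
vector = java2vec['a.B']                      # direct lookup; unknown words give the null vector
print(java2vec.closest_words(vector, n=1))    # [(0.02, 'a.C')] -- nearest other word, squared distance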
Binary file not shown.