-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarity.py
115 lines (100 loc) · 3.64 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import json
import time
from config import *
from algo.algo_veo import vertex_edge_overlap
from algo.algo_vs import vertex_edge_vector_similarity
from algo.algo_seqs import sequence_similarity
# Similarity methods to run (any of "veo", "vs", "seqs")
sim_methods = ["veo", "vs", "seqs"]

# Similarity score at or above which a result is printed in verbose mode
sim_threshold = 1.0

# Set to a truthy value to print similarity scores during execution
verbose = 0
def main():
    """Run the pairwise similarity experiment for every parameter combination.

    Iterates over all combinations of vertex count, graph density, graph type
    and maximum edge weight (these lists are not defined in this file;
    presumably they come from the star import of ``config``), loads graph
    sets "A" and "B" for each combination, scores them against each other
    and saves the scores to the result folder.
    """
    # Create the result directory, including any missing parents. Using
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # before calling os.mkdir.
    os.makedirs(result_folder, exist_ok=True)

    if verbose:
        print("Similarity threshold :", sim_threshold)

    # Load dataset for each combination of parameters
    for n in v_counts:
        for d in graph_densities:
            for t in graph_types:
                for w in max_edge_weights:
                    print("-" * 10)
                    print("v_count :", n)
                    print("density :", d)
                    print("type :", t)
                    print("max_weight :", w)

                    data1 = load_data(n, d, t, w, "A")
                    data2 = load_data(n, d, t, w, "B")
                    print("Data loaded")

                    r = pairwise_similarity(data1, data2, t)
                    save_results(r, n, d, t, w)
def pairwise_similarity(set1, set2, g_type):
    """Score every graph of set1 against every graph of set2.

    Args:
        set1: Sequence of graphs (loaded as set "A" by main).
        set2: Sequence of graphs (loaded as set "B" by main).
        g_type: Graph type; passed unchanged to each similarity function.

    Returns:
        Nested dict ``result[i][j] = {"method": {...}, "time": {...}}`` where
        i and j are positions in set1 and set2. Both inner dicts are keyed by
        the name of each method enabled in ``sim_methods``; "method" holds
        the similarity score and "time" the elapsed seconds of that call.
    """
    # Methods are always evaluated in this fixed order; only those listed in
    # sim_methods are run.
    available = (
        ("veo", vertex_edge_overlap),
        ("vs", vertex_edge_vector_similarity),
        ("seqs", sequence_similarity),
    )
    enabled = [(name, func) for name, func in available if name in sim_methods]

    result = {}
    for i, graph_a in enumerate(set1):
        row = result[i] = {}
        for j, graph_b in enumerate(set2):
            scores = {}
            timings = {}
            row[j] = {"method": scores, "time": timings}

            for name, func in enabled:
                # perf_counter is monotonic and high-resolution, so the
                # measured interval cannot be skewed (or go negative) when
                # the system clock is adjusted, unlike time.time().
                start = time.perf_counter()
                score = func(graph_a, graph_b, g_type)
                elapsed = time.perf_counter() - start

                scores[name] = score
                timings[name] = elapsed

                if verbose and score >= sim_threshold:
                    if name == "seqs":
                        # NOTE(review): the pair header is only printed
                        # together with the seqs line, i.e. after any
                        # veo/vs lines for the same pair. Kept as-is;
                        # confirm whether that ordering is intended.
                        print("A", i, "and B", j, ":")
                    print(f" {name} :", score)
    return result
def load_data(n, d, t, w, set_label):
    """Load one graph set from JSON and add a vertex index list to each graph.

    The file read is ``data_folder + "<n>_<d>_<t>_<w>_<set_label>.json"``.

    Args:
        n: Vertex count; part of the file name and the length of each
            graph's vertex index list.
        d: Graph density label (must be a string).
        t: Graph type label (must be a string).
        w: Maximum edge weight; part of the file name.
        set_label: Set suffix of the file name (main passes "A" and "B").

    Returns:
        The parsed JSON content, with a "v_index" entry added to every graph.
    """
    f_title = "_".join((str(n), d, t, str(w)))
    file = data_folder + f_title + "_" + set_label + ".json"
    print("Loading : " + file, end=" . . . ")

    # JSON text is UTF-8 by specification; decode explicitly rather than
    # relying on the platform's locale default.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)

    for x in data:
        generate_vertex_index_list(x, n)

    print("Done !")
    return data
def save_results(results, n, d, t, w):
    """Write the results of one parameter combination to a JSON file.

    The file written is ``result_folder + "<n>_<d>_<t>_<w>.json"``; d and t
    must be strings.
    """
    base_name = "_".join((str(n), d, t, str(w)))
    file = result_folder + base_name + ".json"
    print("Saving : " + file, end=" . . . ")

    with open(file, 'w') as out:
        serialized = json.dumps(results)
        out.write(serialized)

    print("Done !\n")
def generate_vertex_index_list(g, v_count):
    """Attach to g a list which maps each vertex label to its matrix index.

    The list has v_count entries. Entry ``v`` holds the position of label
    ``v`` within ``g["vertices"]``; labels that do not occur keep the value
    -1. The list is stored in place under ``g["v_index"]``; nothing is
    returned.
    """
    label_to_index = [-1] * v_count
    for position, label in enumerate(g["vertices"]):
        label_to_index[label] = position
    g["v_index"] = label_to_index
# Run the experiment only when this file is executed as a script
if __name__ == "__main__":
    main()