-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathdata_utils.py
176 lines (161 loc) · 7.44 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import pickle as pkl
import numpy as np
from sklearn.metrics import ndcg_score
# Seed numpy's global RNG when this module is loaded so that the
# np.random.choice negative sampling done in evaluate() is repeatable.
np.random.seed(0)
def _as_dense_array(matrix):
    """Return `matrix` as a plain numpy ndarray."""
    if hasattr(matrix, "toarray"):
        # Containers exposing toarray() (e.g. scipy sparse matrices) are
        # densified first so they can be indexed row by row below.
        matrix = matrix.toarray()
    # An np.matrix row stays 2-D when indexed, so np.where(row)[0] would
    # return row numbers (all zeros) instead of item indices. Converting to
    # ndarray makes every row 1-D.
    return np.asarray(matrix)


def evaluate(pred_rating_matrix, training_matrix, ground_truth_matrix, user_specific=False, for_csv=False,
             num_negatives=99):
    """
    Rank each held-out positive item against sampled negatives and print HR / MRR / NDCG.

    For each user, the testing items are the positive entries of
    `ground_truth_matrix` that are not positive in `training_matrix`.
    Each testing item is scored together with `num_negatives` items drawn
    without replacement (global numpy RNG) from the user's zero entries in
    `ground_truth_matrix`, and HR@{5,10,20}, MRR and NDCG@{5,10,20} are
    computed from the position of the testing item in that candidate list.

    One row is printed for every (user, testing item) pair:
    user index, #training items, #testing items, hr@5, hr@10, hr@20,
    ndcg@5, ndcg@10, ndcg@20, mrr.

    Parameters
    -----------------------
    pred_rating_matrix: (users x items) predicted scores, higher = better
    training_matrix: (users x items) interactions used for training (> 0 is positive)
    ground_truth_matrix: (users x items) all interactions (> 0 positive, == 0 negative)
    user_specific: if True, first average the scores over each user's testing
        items, then average over all `user_size` users (users without testing
        items contribute 0); otherwise average over all tested pairs
    for_csv: if True, print only the per-pair rows and skip the summary
    num_negatives: negatives sampled per testing item (default 99). When a user
        has fewer zero entries than this, all of them are used; users with
        no zero entries are skipped.

    Returns
    -----------------------
    None; results are printed.

    Raises
    -----------------------
    ValueError: if `num_negatives` is smaller than 1.
    """
    if num_negatives < 1:
        raise ValueError("num_negatives must be >= 1, got {}".format(num_negatives))

    pred = _as_dense_array(pred_rating_matrix)
    train = _as_dense_array(training_matrix)
    truth = _as_dense_array(ground_truth_matrix)
    user_size = pred.shape[0]

    # Running sums, in the order used by the summary line:
    # hr@5, hr@10, hr@20, mrr, ndcg@5, ndcg@10, ndcg@20
    totals = np.zeros(7)
    valid_testing_size = 0  # number of (one positive + sampled negatives) lists scored

    for i in range(user_size):
        # Negative candidates and held-out positives for the current user.
        neg_indices = np.where(truth[i] == 0)[0]
        training_indices = np.where(train[i] > 0)[0]
        # np.where yields sorted unique indices, so setdiff1d keeps their order.
        testing_indices = np.setdiff1d(np.where(truth[i] > 0)[0], training_indices)

        # Never ask for more negatives than exist (np.random.choice would raise).
        n_neg = min(num_negatives, neg_indices.size)
        if testing_indices.size == 0 or n_neg == 0:
            continue

        user_totals = np.zeros(7)
        for ti in testing_indices:
            selected_neg_indices = np.random.choice(neg_indices, n_neg, replace=False)
            # The positive item is always the last candidate (position n_neg).
            indices = np.append(selected_neg_indices, ti)
            scores = pred[i][indices]
            indices_sorted = np.argsort(scores)

            # argsort is ascending, so the best-scored candidate sits at the end;
            # convert the positive's position into a 1-based rank from the top.
            position = int(np.where(indices_sorted == n_neg)[0][0])
            rank = len(indices_sorted) - position

            hr5_ex, hr10_ex, hr20_ex = (1. if rank <= k else 0. for k in (5, 10, 20))
            mrr_ex = 1. / rank

            # NDCG@K with a single relevant document (the positive item).
            y_true = np.asarray([[0.] * n_neg + [1]])
            y_pred = np.asarray([scores])
            ndcg5_ex, ndcg10_ex, ndcg20_ex = (ndcg_score(y_true, y_pred, k=k) for k in (5, 10, 20))

            user_totals += (hr5_ex, hr10_ex, hr20_ex, mrr_ex, ndcg5_ex, ndcg10_ex, ndcg20_ex)
            valid_testing_size += 1
            print(i, len(training_indices), len(testing_indices),
                  hr5_ex, hr10_ex, hr20_ex, ndcg5_ex, ndcg10_ex, ndcg20_ex, mrr_ex)

        if user_specific:
            totals += user_totals / float(len(testing_indices))
        else:
            totals += user_totals

    if for_csv:
        return

    denominator = user_size if user_specific else valid_testing_size
    print("valid testing size:{}".format(denominator))
    if denominator == 0:
        # Nothing was evaluated; averaging would divide by zero.
        print("no valid testing pairs; metrics are undefined")
        return
    print("hr@5:{:7.4f} hr@10:{:7.4f} hr@20:{:7.4f} mrr:{:7.4f} ndcg@5:{:7.4f} ndcg@10:{:7.4f} ndcg@20:{:7.4f}".format(
        *(totals / denominator)
    ))
def intersection(lst1, lst2):
    """Return the elements of `lst1` that also occur in `lst2`.

    The order of `lst1` is kept and repeated elements of `lst1` are
    repeated in the result.
    """
    common = []
    for candidate in lst1:
        if candidate in lst2:
            common.append(candidate)
    return common
if __name__ == "__main__":
    # Ground truth
    training_matrix_f = './data/adjacency_matrix.p'
    ground_truth_matrix_f = './data/user_action.p'
    # Prediction file
    pred_matrix_f = './output/m_rating_pred_bestmrr.p'
    # -------------------------------------------
    # Load matrix
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point these paths at files from a trusted source.
    with open(training_matrix_f, 'rb') as f:
        training_matrix = pkl.load(f)
    if hasattr(training_matrix, 'todense'):
        training_matrix = training_matrix.todense()
    # todense() yields an np.matrix, whose rows stay 2-D when indexed; evaluate()
    # indexes rows with np.where(row)[0], which needs 1-D ndarray rows, so always
    # finish with a plain ndarray.
    training_matrix = np.asarray(training_matrix)

    with open(ground_truth_matrix_f, 'rb') as f:
        ground_truth_matrix = pkl.load(f)
    if hasattr(ground_truth_matrix, 'todense'):
        ground_truth_matrix = ground_truth_matrix.todense()
    ground_truth_matrix = np.asarray(ground_truth_matrix)

    with open(pred_matrix_f, 'rb') as f:
        pred_matrix = pkl.load(f)

    # Evaluation
    evaluate(pred_matrix, training_matrix, ground_truth_matrix, user_specific=False, for_csv=False)