This repository has been archived by the owner on Dec 13, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsvdcur.py
403 lines (346 loc) · 15.4 KB
/
svdcur.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
import numpy as np
from collections import defaultdict
import time
ratings = defaultdict(dict)
MAX_USERS = 6040
MAX_ITEMS = 3952
with open('./dataset/ratings.dat', 'r') as file:
for line in file:
user_id,movie_id,rating_value,timestamp = [*map(int, line.split('::'))]
ratings[user_id][movie_id] = rating_value
def normalise(ratings):
for user,movie_ratings in ratings.items():
mean = sum(movie_ratings.values()) / len(movie_ratings.values())
for movie_id in movie_ratings:
if(movie_ratings[movie_id] > 0):
movie_ratings[movie_id] -= mean
return ratings
ratings = normalise(ratings)
def RMSE(pred, truth):
'''
Calculate Root Mean Square Error (RMSE).
Inputs:
pred (1D numpy array): numpy array containing predicted values.
truth (1D numpy array): numpy array containing the ground truth values.
Returns:
rmse (float): The Root Mean Square Error.
'''
return np.sqrt(np.sum(np.square(pred-truth)/float(pred.shape[0])))
def RMSE_mat(matA, matB):
'''
Calculate Root Mean Square Error (RMSE) between two matrices. Mainly used
to find error original and reconstructed matrices while working with
matrix decompositions.
Inputs:
matA (2D numpy array): Matrix A
matB (2D numpy array): Matrix B
Returns:
rmse (float): Root Mean Square Error.
'''
return np.sqrt(np.sum(np.square(matA-matB))/(matA.shape[0]*matA.shape[1]))
def MAE(matA, matB):
return np.sum(np.abs(matA-matB)) /(matA.shape[0]*matA.shape[1])
#SVD
class SVD_topk:
"""
A Recommender System model based on the Singular Value Decomposition concepts.
Normalizes the Utility matrix consisting of users, movies and their ratings by subtracting values in a row
by their row mean , which handles the problem of strict and generous raters.
SVD factorizes the utility matrix into U(m x m), Sigma(m X n) and V* (V-transpose) (n X n).
The dot product U.Sigma.V* in the reduced form gives the prediction matrix.
"""
def __init__(self, matrix, K):
self.K = K
self.matrix = matrix
self.MAX_USERS = len(self.matrix)
self.MAX_ITEMS = len(self.matrix[0])
self.generate_svd_matrices()
self.tryDimReduction()
self.multiply()
def generate_svd_matrices(self):
""" Performs SVD on the normalized utility matrix and factorizes it into U, S and VT
"""
A = np.array(self.matrix)
AAt = np.dot(A,A.T)
self.eigen_values_AAt, self.eigen_vectors_AAt = np.linalg.eigh(AAt)
self.rank_AAt = np.linalg.matrix_rank(AAt)
self.index_AAt = sorted(range(len(self.eigen_values_AAt)),
key=lambda k: self.eigen_values_AAt[k], reverse=True)[:self.rank_AAt]
self.U = np.zeros(shape=(self.MAX_USERS,self.rank_AAt))
self.eigen_values_AAt = self.eigen_values_AAt[::-1]
self.eigen_values_AAt = self.eigen_values_AAt[:self.rank_AAt]
for i in range(self.rank_AAt):
self.U[:,i] = self.eigen_vectors_AAt[:,self.index_AAt[i]]
self.sigma = np.zeros(shape=(self.rank_AAt, self.rank_AAt))
for i in range(self.rank_AAt):
self.sigma[i,i] = self.eigen_values_AAt[i] ** 0.5
AtA = (A.T).dot(A)
self.eigen_values_AtA, self.eigen_vectors_AtA = np.linalg.eigh(AtA)
self.rank_AtA = np.linalg.matrix_rank(AtA)
self.index_AtA = sorted(range(len(self.eigen_values_AtA)), key=lambda k: self.eigen_values_AtA[k], reverse=True)[:self.rank_AtA]
self.V = np.zeros(shape=(self.MAX_ITEMS,self.rank_AtA))
self.eigen_values_AtA = self.eigen_values_AtA[::-1]
self.eigen_values_AtA = self.eigen_values_AtA[:self.rank_AtA]
for i in range(self.rank_AtA):
self.V[:,i] = self.eigen_vectors_AtA[:,self.index_AtA[i]]
self.V = self.V.T
def tryDimReduction(self):
dk = len(self.sigma) - self.K
self.sigma = self.sigma[:self.K, :self.K]
self.U = self.U[:,:-dk]
self.V = self.V[:-dk,:]
def multiply(self):
""" Multiplies the U, V and sigma matrices to get the matrix of predicted ratings.
"""
self.result = np.dot((np.dot(self.U, self.sigma)), self.V) * (-1)
matrix = np.zeros(shape=(3000, 3000))
for i in range(3000):
for j in range(3000):
if i in ratings and j in ratings[i]:
matrix[i-1][j-1] = ratings[i][j]
start = time.time()
svd = SVD_topk(matrix,7)
pred = svd.result
end = time.time()
print("SVD took : ", end-start, "secs to complete")
print("RMSE for SVD: ", RMSE_mat(pred,matrix))
print("MAE for SVD: ", MAE(pred,matrix))
#svd 90%
class SVD_90:
"""
A Recommender System model based on the Singular Value Decomposition concepts.
Normalizes the Utility matrix consisting of users, movies and their ratings by subtracting values in a row
by their row mean , which handles the problem of strict and generous raters.
SVD factorizes the utility matrix into U(m x m), Sigma(m X n) and V* (V-transpose) (n X n).
The dot product U.Sigma.V* in the reduced form gives the prediction matrix.
"""
def __init__(self, matrix):
self.matrix = matrix
self.MAX_USERS = len(self.matrix)
self.MAX_ITEMS = len(self.matrix[0])
self.generate_svd_matrices()
self.tryDimReduction()
self.multiply()
def generate_svd_matrices(self):
""" Performs SVD on the normalized utility matrix and factorizes it into U, S and VT
"""
A = np.array(self.matrix)
AAt = np.dot(A,A.T)
self.eigen_values_AAt, self.eigen_vectors_AAt = np.linalg.eigh(AAt)
self.rank_AAt = np.linalg.matrix_rank(AAt)
self.index_AAt = sorted(range(len(self.eigen_values_AAt)),
key=lambda k: self.eigen_values_AAt[k], reverse=True)[:self.rank_AAt]
self.U = np.zeros(shape=(self.MAX_USERS,self.rank_AAt))
self.eigen_values_AAt = self.eigen_values_AAt[::-1]
self.eigen_values_AAt = self.eigen_values_AAt[:self.rank_AAt]
for i in range(self.rank_AAt):
self.U[:,i] = self.eigen_vectors_AAt[:,self.index_AAt[i]]
self.sigma = np.zeros(shape=(self.rank_AAt, self.rank_AAt))
for i in range(self.rank_AAt):
self.sigma[i,i] = self.eigen_values_AAt[i] ** 0.5
AtA = (A.T).dot(A)
self.eigen_values_AtA, self.eigen_vectors_AtA = np.linalg.eigh(AtA)
self.rank_AtA = np.linalg.matrix_rank(AtA)
self.index_AtA = sorted(range(len(self.eigen_values_AtA)), key=lambda k: self.eigen_values_AtA[k], reverse=True)[:self.rank_AtA]
self.V = np.zeros(shape=(self.MAX_ITEMS,self.rank_AtA))
self.eigen_values_AtA = self.eigen_values_AtA[::-1]
self.eigen_values_AtA = self.eigen_values_AtA[:self.rank_AtA]
for i in range(self.rank_AtA):
self.V[:,i] = self.eigen_vectors_AtA[:,self.index_AtA[i]]
self.V = self.V.T
def tryDimReduction(self):
""" Tries to reduce the dimension of the matrics by following 90 %
retain energy rule.
"""
while True:
total_E = 0
size = np.shape(self.sigma)[0]
for i in range(size):
total_E += self.sigma[i,i]**2
retained_E = 0
if size > 0:
retained_E = total_E - self.sigma[size-1,size-1]**2
if total_E == 0 or retained_E/total_E < 0.9:
break
else:
self.U = self.U[:,:-1:]
self.V = self.V[:-1,:]
self.sigma = self.sigma[:,:-1]
self.sigma = self.sigma[:-1,:]
def multiply(self):
""" Multiplies the U, V and sigma matrices to get the matrix of predicted ratings.
"""
self.result = np.dot((np.dot(self.U, self.sigma)), self.V) * (-1)
matrix = np.zeros(shape=(3000, 3000))
for i in range(3000):
for j in range(3000):
if i in ratings and j in ratings[i]:
matrix[i-1][j-1] = ratings[i][j]
start = time.time()
svd = SVD_90(matrix)
pred = svd.result
end = time.time()
print("SVD with 90% energy took : ", end-start, "secs to complete")
print("RMSE for SVD with 90% energy: ", RMSE_mat(pred,matrix))
print("MAE for SVD with 90% energy: ", MAE(pred,matrix))
#cur
class CUR_topk:
"""
A Recommender System model based on CUR approximation.
Normalizes the Utility matrix consisting of users, movies and their ratings by subtracting values in a row
by their row mean , which handles the problem of strict and generous raters.
CUR factorizes the utility matrix into C(m x k), U(k X k) and R(k X n)..
The dot product C.U.R in the reduced form gives the prediction matrix.
"""
def __init__(self, matrix, K):
self.K = K
self.matrix = np.array(matrix)
self.numRows = np.shape(self.matrix)[0]
self.numCols = np.shape(self.matrix)[1]
self.getProbDistribution()
self.find_C_U_R(1,50)
self.multiply()
def getProbDistribution(self):
""" Gets the probability distribution for the rows and columns to be used in
random list generation.This list enables us to randomly select rows and columns.
"""
self.totalSum = sum(sum(self.matrix ** 2))
self.rowSum = (sum((self.matrix.T)**2))/self.totalSum
self.colSum = (sum(self.matrix ** 2))/self.totalSum
def generateRandomNos(self, probDist, size, sampleSize, choice):
""" The method generates sampleSize number of random numbers which are in the range of
size. It samples out them using the given probability distribution
"""
if choice == 0: return np.random.choice(np.arange(0,size), sampleSize, p=probDist)
else: return np.random.choice(np.arange(0,size), sampleSize, replace=False,p=probDist)
def find_C_U_R(self, choice, sampleSize):
""" Method to compute C,U and R matrix in the CUR decomposition
"""
rand_no = self.generateRandomNos(self.colSum, self.numCols, 50, choice)
self.c_indices = rand_no
self.C = (self.matrix[:,rand_no]).astype(float)
colIdx, idx = 0, 0
while colIdx < sampleSize:
for rowIdx in range(0, self.numRows):
self.C[rowIdx, colIdx] /= (sampleSize*self.colSum[rand_no[idx]])**0.5
idx += 1
colIdx += 1
rand_no = self.generateRandomNos(self.rowSum, self.numRows, 50, choice)
self.R = (self.matrix[rand_no,:]).astype(float)
rowIdx, idx = 0, 0
while rowIdx < sampleSize:
for colIdx in range(0, self.numCols):
self.R[rowIdx, colIdx] /= (sampleSize*self.rowSum[rand_no[idx]])**0.5
idx += 1
rowIdx += 1
self.U = self.R[:,self.c_indices]
svd = SVD_topk(self.U, self.K)
Y = svd.V.T
sigma_sq_plus = 1/(svd.sigma ** 2)
sigma_sq_plus[sigma_sq_plus == np.inf] = 0
sigma_sq_plus[sigma_sq_plus == -np.inf] = 0
sigma_sq_plus[sigma_sq_plus == np.nan] = 0
X = svd.U.T
self.U = np.dot(Y,np.dot(sigma_sq_plus,X))
self.U[self.U == np.inf] = 0
self.U[self.U == -np.inf] = 0
self.U[self.U == np.nan] = 0
def multiply(self):
""" Multiplies the C, U and R matrices to get the matrix of predicted ratings.
"""
self.result = np.dot((np.dot(self.C, self.U)), self.R)
self.result[self.result == np.inf] = 0
self.result[self.result == -np.inf] = 0
self.result[self.result == np.nan] = 0
matrix = np.zeros(shape=(3000, 3000))
for i in range(3000):
for j in range(3000):
if i in ratings and j in ratings[i]:
matrix[i-1][j-1] = ratings[i][j]
start = time.time()
cur = CUR_topk(matrix,6)
pred = cur.result
end = time.time()
print("CUR took : ", end-start, "secs to complete")
print("RMSE for CUR : ", RMSE_mat(pred,matrix))
print("MAE for CUR: ", MAE(pred,matrix))
#cur 90%
class CUR_90:
"""
A Recommender System model based on CUR approximation with 90% energy.
Normalizes the Utility matrix consisting of users, movies and their ratings by subtracting values in a row
by their row mean , which handles the problem of strict and generous raters.
CUR factorizes the utility matrix into C(m x k), U(k X k) and R(k X n)..
The dot product C.U.R in the reduced form gives the prediction matrix.
"""
def __init__(self, matrix):
self.matrix = np.array(matrix)
self.numRows = np.shape(self.matrix)[0]
self.numCols = np.shape(self.matrix)[1]
self.getProbDistribution()
self.find_C_U_R(1,50)
self.multiply()
def getProbDistribution(self):
""" Gets the probability distribution for the rows and columns to be used in
random list generation.
"""
self.totalSum = sum(sum(self.matrix ** 2))
self.rowSum = (sum((self.matrix.T)**2))/self.totalSum
self.colSum = (sum(self.matrix ** 2))/self.totalSum
def generateRandomNos(self, probDist, size, sampleSize, choice):
""" The method generates sampleSsize number of random numbers which are in the range of
size. It samples out them using the given probability distribution
"""
if choice == 0: return np.random.choice(np.arange(0,size), sampleSize, p=probDist)
else: return np.random.choice(np.arange(0,size), sampleSize, replace=False,p=probDist)
def find_C_U_R(self, choice, sampleSize):
""" Method to compute C,U and R matrix in the CUR decomposition
"""
rand_no = self.generateRandomNos(self.colSum, self.numCols, 50, choice)
self.c_indices = rand_no
self.C = (self.matrix[:,rand_no]).astype(float)
colIdx, idx = 0, 0
while colIdx < sampleSize:
for rowIdx in range(0, self.numRows):
self.C[rowIdx, colIdx] /= (sampleSize*self.colSum[rand_no[idx]])**0.5
idx += 1
colIdx += 1
rand_no = self.generateRandomNos(self.rowSum, self.numRows, 50, choice)
self.R = (self.matrix[rand_no,:]).astype(float)
rowIdx, idx = 0, 0
while rowIdx < sampleSize:
for colIdx in range(0, self.numCols):
self.R[rowIdx, colIdx] /= (sampleSize*self.rowSum[rand_no[idx]])**0.5
idx += 1
rowIdx += 1
self.U = self.R[:,self.c_indices]
svd = SVD_90(self.U)
Y = svd.V.T
sigma_sq_plus = 1/(svd.sigma ** 2)
sigma_sq_plus[sigma_sq_plus == np.inf] = 0
sigma_sq_plus[sigma_sq_plus == -np.inf] = 0
sigma_sq_plus[sigma_sq_plus == np.nan] = 0
X = svd.U.T
self.U = np.dot(Y,np.dot(sigma_sq_plus,X))
self.U[self.U == np.inf] = 0
self.U[self.U == -np.inf] = 0
self.U[self.U == np.nan] = 0
def multiply(self):
""" Multiplies the C, U and R matrices to get the matrix of predicted ratings.
"""
self.result = np.dot((np.dot(self.C, self.U)), self.R)
self.result[self.result == np.inf] = 0
self.result[self.result == -np.inf] = 0
self.result[self.result == np.nan] = 0
matrix = np.zeros(shape=(3000, 3000))
for i in range(3000):
for j in range(3000):
if i in ratings and j in ratings[i]:
matrix[i-1][j-1] = ratings[i][j]
start = time.time()
cur = CUR_90(matrix)
pred = cur.result
end = time.time()
print(" CUR with 90% energy took : ", end-start, "secs to complete")
print("RMSE for CUR with 90% energy: ", RMSE_mat(pred,matrix))
print("MAE for CUR with 90% energy: ", MAE(pred,matrix))