-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCollaborative_filtering.py
83 lines (67 loc) · 2.68 KB
/
Collaborative_filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the training ratings and arrange them as a User_id x Movie_id matrix
# (one row per user, one column per movie, NaN where a user has no rating).
data = pd.read_csv("netflix/TrainingRatings.txt",
                   names=["Movie_id", "User_id", "Ratings"])
df = data.pivot(index="User_id", columns="Movie_id", values="Ratings")
df = df.sort_index(axis=0)

# Mean rating of every user, computed over the movies that user rated.
mean_values = df.mean(axis=1).to_numpy()

# Center each user's ratings on that user's mean, then replace the NaN
# (unrated) entries with zero so they contribute nothing to dot products.
matrix = df.to_numpy()
matrix = matrix - mean_values[:, np.newaxis]
matrix = np.nan_to_num(matrix)

# The matrix is deliberately kept in float64: astype() returns a converted
# copy instead of converting in place, so calling it without assigning the
# result only allocates a throw-away array, and float16 would cost precision
# in the norms and dot products computed from this matrix.
normal = np.linalg.norm(matrix, axis=1)

# The pivoted frame is no longer needed; release it before the large
# user-by-user weight matrix is built.
del df
# Calculating the weights for all the users: the cosine similarity between
# every pair of mean-centered rating rows. The result is a square
# (n_users x n_users) array whose size follows the data rather than a
# hard-coded user count.
print("Calculating the weights. It will take 10-15 mins.")

# A user whose centered row is all zeros has norm 0. Dividing by 1 instead
# keeps that user's weights at 0 (their dot products are 0 as well) and
# avoids a division by zero.
safe_norms = np.where(normal == 0, 1, normal)

# One matrix product yields all pairwise dot products; the normalisation is
# applied in place so no second n_users x n_users array is allocated.
weights = np.dot(matrix, matrix.T)
weights /= safe_norms[:, np.newaxis]
weights /= safe_norms[np.newaxis, :]

# A user must not count as their own neighbour.
np.fill_diagonal(weights, 0)
# Importing the training data again to recover which user / movie id sits at
# which row / column position of the rating matrix.
df1 = pd.read_csv("netflix/TrainingRatings.txt",
                  names=["Movie_id", "User_id", "Ratings"])
data = df1.pivot(index="User_id", columns="Movie_id", values="Ratings")
data = data.sort_index(axis=0)

# Take the id arrays from the pivoted frame itself. It is built with the same
# pivot + sort_index steps as the rating matrix, so position i of `rows` /
# `cols` is exactly row / column i of that matrix. pd.unique() would return
# the ids in order of first appearance in the file, which does not match.
rows = data.index.to_numpy()
cols = data.columns.to_numpy()

# Importing the test data
test_data = pd.read_csv("netflix/TestingRatings.txt",
                        names=["Movie_id", "User_id", "Ratings"])
test_data = test_data.to_numpy()
# Filtering the weights to use only the weights of the neighbors: negative
# similarities are zeroed. The clip is done in place to avoid allocating a
# second n_users x n_users array.
np.clip(weights, 0, 1000, out=weights)

# Calculating the kappa: the per-user normaliser 1 / sum(|w|). The weights
# are non-negative after the clip, so the plain sum equals the absolute sum.
# Users without any positive-weight neighbour get kappa = 0, which leaves
# their prediction at their own mean rating instead of dividing by zero.
weight_sums = np.sum(weights, axis=1)
k = np.divide(1, weight_sums,
              out=np.zeros_like(weight_sums), where=weight_sums != 0)

# Making the predictions over all the users:
#   prediction = user mean + kappa * sum_over_neighbours(weight * centered rating)
pred = np.dot(weights, matrix)
pred *= k[:, np.newaxis]
pred += mean_values[:, np.newaxis]
# Ratings are reported as whole numbers.
pred = np.round(pred)
# Getting the predicted values for the target users only.
# Map every id to its position in `rows` / `cols` once, so each test record
# is resolved with two dictionary lookups instead of two full array scans.
user_position = {
    user_id: position
    for position, user_id in enumerate(np.asarray(rows).tolist())
}
movie_position = {
    movie_id: position
    for position, movie_id in enumerate(np.asarray(cols).tolist())
}

predicted_y = []
for record in test_data:
    # Each test record is laid out as (Movie_id, User_id, Ratings).
    movie_id, user_id = record[0], record[1]
    try:
        user_idx = user_position[user_id]
        movie_idx = movie_position[movie_id]
    except KeyError:
        # No prediction exists for an id that never occurs in the training
        # data; fail with a message that names the offending record.
        raise IndexError(
            f"Test record refers to an id that is missing from the training "
            f"data: Movie_id={movie_id}, User_id={user_id}"
        ) from None
    predicted_y.append(pred[user_idx, movie_idx])
# Calculating the RMSE and MAE values.
true_y = test_data[:, 2]
# mean_squared_error returns the mean squared error; the root has to be
# taken explicitly to obtain the RMSE that is reported below.
rmse = np.sqrt(mean_squared_error(true_y, predicted_y))
mae = mean_absolute_error(true_y, predicted_y)
print("RMSE is :", rmse)
print("MAE is :", mae)