'''
Movie Recommendation System
We build a basic content-based recommendation system and collaborative filtering based on user ratings using the MovieLens dataset.
MovieLens dataset: http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
The dataset contains 100,000 ratings on 9,000 movies by 600 users.
movies.csv lists every movie with its id, title, and genres.
To describe the users' relation with movies we use ratings.csv, which identifies every user
by 'userId' and records their 'rating' for each 'movieId'.
First we use this dataset to recommend movies with a content-based system; then we train a model and
recommend movies with model-based collaborative filtering using the KNN algorithm and with matrix factorization (SVD).
We also used another small dataset to create a GUI for the content-based system (gui.py).
IMDB Movie Dataset: https://data.world/promptcloud/imdb-data-from-2006-to-2016
Gizem Kurnaz & Ayşe Ceren Çiçek
'''
# Libraries
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from fuzzywuzzy import process
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import cross_validate
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Import data #
movies = pd.read_csv('MovieLensDataset/movies.csv')
ratings = pd.read_csv('MovieLensDataset/ratings.csv')
# Merge the two dataframes on the movieId attribute
merged_data = pd.merge(ratings, movies, on='movieId')
# We will not use the timestamp attribute, so we drop that column
merged_data.drop(columns=['timestamp'], inplace=True)
# Average rating per movie; restrict the groupby to the rating column so the string columns do not break the mean
movie_mean = merged_data.groupby('title')[['rating']].mean()
movie_mean['counts'] = merged_data.groupby('title')['rating'].count()
# Plot a histogram to see how the average rating values are distributed
plt.figure(figsize=[10, 6])
movie_mean.rating.hist(bins=50)
plt.xlabel('Rating value')
plt.ylabel('Number of movies')
plt.savefig('ratingDistribution1.png')
plt.show(block=False)
plt.close()
# Data Preprocessing #
movie_id_count = pd.DataFrame(merged_data.groupby('movieId').size(), columns=['count'])
# Collect the ids of all movies with at least 10 ratings in a list called popular_movies
popular_movies = list(set(movie_id_count.query('count >= 10').index))
# Keep only the rows of merged_data whose movie is in popular_movies
drop_movies = merged_data[merged_data.movieId.isin(popular_movies)]
print('\nOriginal data: ', merged_data.shape)
print('Data after dropping unpopular movies: ', drop_movies.shape)
# Now we have a dataset that only includes movies with at least 10 ratings.
# We apply the same filter to users, dropping those with fewer than 10 ratings.
df_users_cnt = pd.DataFrame(drop_movies.groupby('userId').size(), columns=['count'])
active_users = list(set(df_users_cnt.query('count >= 10').index))
drop_users = drop_movies[drop_movies.userId.isin(active_users)]
print('Original data: ', merged_data.shape)
print('Data after dropping both unpopular movies and inactive users: ', drop_users.shape, "\n\n\n")
# The filtered dataset is smaller but less noisy, so the models below see more reliable signals.
# The same histogram on the filtered data; each bar now counts individual ratings
plt.figure(figsize=[10, 6])
drop_users.rating.hist(bins=50)
plt.xlabel('Rating value')
plt.ylabel('Number of ratings')
plt.savefig('ratingDistribution2.png')
plt.show(block=False)
plt.close()
# Group movies by title and count how many ratings each received
print(drop_users.groupby('title')['rating'].count().sort_values(ascending=False).head())
# Create a dataframe that stores each movie's average rating and its vote count
ratings_dataframe = pd.DataFrame(drop_users.groupby('title')['rating'].mean())
ratings_dataframe.rename(columns={'rating': 'avg_rating'}, inplace=True)
ratings_dataframe['votes'] = drop_users.groupby('title')['rating'].count()
print(ratings_dataframe.sort_values(by='votes', ascending=False).head())
print("\n\n-------------------------------------------------------------------------------------------")
print(" --- CONTENT BASED FILTERING --- \n")
# Content Based #
# Create a spreadsheet-style pivot table: one row per user, one column per movie title
user_movie_rates = drop_users.pivot_table(index='userId', columns='title', values='rating')
# Select a movie (Zombieland (2009)) to calculate similarity with other movies and tune our approach on it
zombieland_rates = user_movie_rates['Zombieland (2009)']
# Build a profile for each movie based on how the whole user base rated it.
# corrwith() computes the correlation between the selected movie's column and every other column.
zombieland_corr = user_movie_rates.corrwith(zombieland_rates)
# Store the correlation values in a dataframe
zombieland_corr = pd.DataFrame(zombieland_corr, columns=['Correlation'])
# Drop NaN values (movies with no raters in common)
zombieland_corr.dropna(inplace=True)
# Add the number of votes for each movie
zombieland_corr_count = zombieland_corr.join(ratings_dataframe['votes'])
# Keep movies with a correlation above 0.4 and more than 100 votes, sorted by correlation
new_zombieland_corr = zombieland_corr_count[zombieland_corr_count['votes'] > 100]
new_zombieland_corr = new_zombieland_corr[new_zombieland_corr['Correlation'] > 0.4]
print(new_zombieland_corr.sort_values('Correlation', ascending=False))
# Based on the logic above, we define a function that finds movies similar to a given movie.
def contentBasedRecommender(movie_name):
    movie_user_ratings = user_movie_rates[movie_name]
    # Pandas series of correlations between the given movie and every other movie
    movie_corr = user_movie_rates.corrwith(movie_user_ratings)
    # Convert to a dataframe
    movie_corr = pd.DataFrame(movie_corr, columns=['Correlation'])
    # Drop nulls
    movie_corr.dropna(inplace=True)
    # Add a column for the number of votes
    movie_corr = movie_corr.join(ratings_dataframe['votes'])
    # Keep movies with at least 40 votes and a correlation strictly between 0.5 and 1.0
    new_movie_corr = movie_corr[movie_corr['votes'] >= 40]
    new_movie_corr = new_movie_corr[new_movie_corr['Correlation'] > 0.5]
    new_movie_corr = new_movie_corr[new_movie_corr['Correlation'] != 1.0]
    # Sort in descending order
    return new_movie_corr.sort_values('Correlation', ascending=False).head(20)
movie_name = "Fargo (1996)"
print("20 similar movies to ", movie_name, ": \n")
print(contentBasedRecommender(movie_name))
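# The recommender above is driven purely by co-rating correlations. A more literal
# content-based variant (a minimal sketch using the 'genres' column of movies.csv;
# it is not used by the rest of this script) compares movies by their pipe-separated
# genre strings with CountVectorizer and cosine similarity.
genre_matrix = CountVectorizer(tokenizer=lambda s: s.split('|')).fit_transform(movies['genres'])
def genreBasedRecommender(movie_title, n=10):
    # Fuzzy-match the title, then rank every movie by genre cosine similarity
    idx = process.extractOne(movie_title, movies['title'])[2]
    sims = cosine_similarity(genre_matrix[idx], genre_matrix).ravel()
    order = sims.argsort()[::-1]
    return movies['title'].iloc[[i for i in order if i != idx][:n]]
print("\n10 movies with the most similar genres to ", movie_name, ": \n")
print(genreBasedRecommender(movie_name))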
print("\n\n-------------------------------------------------------------------------------------------")
print(" --- COLLABORATIVE FILTERING with KNN --- ")
# Item-Based Collaborative Filtering with KNN #
# Build a matrix with movieId as rows and userId as columns, holding each user's rating for each movie
movie_users = drop_users.pivot(index='movieId', columns='userId', values='rating')
# Replace null values (NaN) with 0
movie_users = movie_users.fillna(0.0)
# Convert to a compressed sparse row matrix with the csr_matrix method
matrix_movie_users = csr_matrix(movie_users.values)
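# Quick sanity check: the rating matrix is overwhelmingly empty, which is why a
# compressed sparse format pays off here. (An illustrative check, not required below.)
n_cells = matrix_movie_users.shape[0] * matrix_movie_users.shape[1]
print('Rating matrix sparsity: {:.2%}'.format(1 - matrix_movie_users.nnz / n_cells))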
# Fit a KNN model on the matrix, using cosine similarity as the distance metric
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=25)
knn_model.fit(matrix_movie_users)
# Our recommender takes a movie name from the user and shows the nearest neighbors (smallest cosine distance).
# We use the fuzzywuzzy library for string matching, so slightly misspelled titles still resolve.
# The first neighbor is always the movie itself, so we skip it when printing.
# Note: the matrix rows correspond to the filtered movies, so we build a title series
# aligned with those rows instead of indexing into the full movies table.
matrix_titles = movies.set_index('movieId')['title'].loc[movie_users.index].reset_index(drop=True)
def movieRecommender(movie_name, data, model, n_recommendations):
    model.fit(data)
    index = process.extractOne(movie_name, matrix_titles)[2]
    print('Selected movie is: ', matrix_titles[index])
    print('\nRecommendations are:\n')
    distances, indices = model.kneighbors(data[index], n_neighbors=n_recommendations)
    for i in indices[0]:
        if i != index:
            print(matrix_titles[i])
movie_name = "toy story"
movieRecommender(movie_name, matrix_movie_users, knn_model, 15 + 1)
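# kneighbors() also returns the cosine distances themselves. As a small illustrative
# variation (not part of the original pipeline), we can print each neighbor with its
# distance: 0 means identical rating vectors, values near 1 mean almost no overlap.
def movieRecommenderWithDistances(movie_name, data, model, n_recommendations):
    index = process.extractOne(movie_name, matrix_titles)[2]
    distances, indices = model.kneighbors(data[index], n_neighbors=n_recommendations)
    for dist, i in zip(distances[0], indices[0]):
        if i != index:
            print('{:.3f}  {}'.format(dist, matrix_titles[i]))
print('\nSame recommendations with their cosine distances:\n')
movieRecommenderWithDistances(movie_name, matrix_movie_users, knn_model, 15 + 1)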
print("\n\n-------------------------------------------------------------------------------------------")
print(" --- MATRIX FACTORIZATION BASED COLLABORATIVE FILTERING WITH SVD METHOD --- ")
# Matrix Factorization Based Collaborative Filtering with SVD Method #
# Matrix factorization is another class of collaborative filtering algorithms used in recommender systems.
# We will use the Singular Value Decomposition (SVD) method to implement it.
# Data Preprocessing
data = ratings.drop('timestamp', axis=1)
# Check for null values and inspect the shape
print(data.isna().sum())
print(data.shape)
# Check for unique number of movies and users
movie = data['movieId'].nunique()
users = data['userId'].nunique()
print('Number of movies =', movie)
print('Number of users =', users, "\n\n")
# Distribution of ratings (how many times each rating value was given)
data['rating'].value_counts().plot(kind='bar', colormap='terrain')
plt.xlabel('Rating value')
plt.ylabel('Number of ratings')
plt.savefig('ratingDistribution3.png')
plt.show(block=False)
plt.close()
# To get more accurate results, we will only use movies and users which have been rated more than 20 times
filtered_movies = data['movieId'].value_counts() > 20
filtered_movies = filtered_movies[filtered_movies].index.tolist()
filtered_users = data['userId'].value_counts() > 20
filtered_users = filtered_users[filtered_users].index.tolist()
# Now we have filtered users and movies as a list, and we will only use those values on our dataframe
data = data[(data['movieId'].isin(filtered_movies)) & (data['userId'].isin(filtered_users))]
# After filtering, the data contains 66,405 rows (34,431 rows were removed).
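# Quick verification of the row count quoted above (illustrative only)
print('Rows after filtering:', data.shape[0])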
# The Reader object tells surprise the rating scale used in the data.
reader = Reader(rating_scale=(0.5, 5))
# First create a surprise Dataset from the dataframe, then build the full trainset.
columns = ['userId', 'movieId', 'rating']
dataset = Dataset.load_from_df(data[columns], reader)
trainset = dataset.build_full_trainset()
# The anti-testset holds every (user, movie) pair NOT rated in the trainset;
# predicting those pairs yields recommendation candidates for each user.
testset = trainset.build_anti_testset()
# Use the SVD method
model = SVD(n_epochs=25, verbose=True)
cross_validate(model, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# cross_validate fits only on the folds, so refit on the full trainset before predicting
model.fit(trainset)
print('Training Done\n\n')
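# Besides cross-validation, we can report error on a single held-out split.
# A minimal sketch: train_test_split comes from surprise.model_selection, and
# the 25% test size is an arbitrary choice, not something the project specifies.
from surprise.model_selection import train_test_split
holdout_train, holdout_test = train_test_split(dataset, test_size=0.25)
holdout_model = SVD(n_epochs=25)
holdout_model.fit(holdout_train)
holdout_predictions = holdout_model.test(holdout_test)
accuracy.rmse(holdout_predictions)
accuracy.mae(holdout_predictions)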
# Now we can predict
prediction = model.test(testset)
print("\n TOP 5 MOVIE RECOMMENDATIONS FOR EACH USER \n")
# We define a function to recommend movies based on the predictions for each user.
def get_Recommendations(prediction, n):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in prediction:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n
top_n = get_Recommendations(prediction, n=5)
for uid, user_ratings in top_n.items():
    print("Recommendations for User {}:".format(uid))
    for (iid, ratings) in user_ratings:
        print("\t", movies[movies.movieId == iid]["title"].values[0])
    print("\n")
print("\n WE MADE 5 MOVIE RECOMMENDATIONS FOR EACH USER WITH MATRIX FACTORIZATION METHOD \n")