-
Notifications
You must be signed in to change notification settings - Fork 0
/
Content_Based_recommender.py
80 lines (52 loc) · 2.7 KB
/
Content_Based_recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
import os
# NOTE: machine-specific absolute path; the CSV below is resolved relative to it.
os.chdir('C:\\Users\\deeks\\Downloads\\ipnynb_files\\data\\recommender')
meta_data = pd.read_csv('movies_metadata.csv')
# Work on an explicit copy so the column assignments below modify know_data
# itself rather than a slice of meta_data (avoids chained-assignment issues).
know_data = meta_data[['title', 'release_date', 'vote_average', 'vote_count', 'runtime', 'genres']].copy()
# Convert release_date into pandas datetime format; unparseable values become NaT
know_data['release_date'] = pd.to_datetime(know_data['release_date'], errors='coerce')
# Extract the year from the datetime. The previous `x != np.nan` guard was
# always True (NaN never compares equal to anything), so missing dates were
# turned into the string 'NaT'. The .dt accessor yields NaN for them instead.
know_data['year'] = know_data['release_date'].dt.year
def convert_int(x):
    """Return ``int(x)``, or 0 when *x* cannot be converted to an integer.

    Values such as ``None``, NaN, infinity or non-numeric strings map to 0.
    Only the conversion errors raised by ``int()`` are handled, so signals
    like ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: bad string or NaN; TypeError: None/unsupported type;
        # OverflowError: +/- infinity.
        return 0
# Apply convert_int to the year feature (missing/unparseable years become 0)
know_data['year'] = know_data['year'].apply(convert_int)
# Drop the release_date column
know_data = know_data.drop('release_date', axis=1)
# Parse the stringified list of genre dicts and keep only the lower-cased names
know_data['genres'] = know_data['genres'].fillna('[]')
know_data['genres'] = know_data['genres'].apply(literal_eval)
know_data['genres'] = know_data['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])
# Add the useful features into the cleaned dataframe
know_data['overview'], know_data['id'] = meta_data['overview'], meta_data['id']
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')
# Missing overviews are treated as empty documents
know_data['overview'] = know_data['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(know_data['overview'])
from sklearn.metrics.pairwise import linear_kernel
# TF-IDF rows are L2-normalised by default, so the dot product is the cosine
# similarity. NOTE: this materialises a full (n_rows x n_rows) matrix, which
# can need a lot of memory on large datasets.
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
# `df` was referenced below (and by content_recommender) but never defined;
# bind it to the cleaned dataframe.
df = know_data
# Construct a reverse mapping from movie title to row position. Positions
# (not index labels) are used because cosine_sim rows and .iloc are positional.
indices = pd.Series(np.arange(len(df)), index=df['title'])
# Drop duplicate titles, keeping the first occurrence. Series.drop_duplicates()
# would compare the (already unique) positions, not the titles.
indices = indices[~indices.index.duplicated(keep='first')]
def content_recommender(title, cosine_sim=cosine_sim, df=know_data, indices=indices):
    """Return the titles of the 10 movies most similar to *title*.

    Parameters
    ----------
    title : str
        Movie title to look up in *indices*.
    cosine_sim : array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Dataframe with a ``'title'`` column, row-aligned with *cosine_sim*.
    indices : pandas.Series
        Mapping from title to row position.

    Returns
    -------
    pandas.Series
        Up to 10 titles ordered from most to least similar, never including
        the queried movie itself.

    Raises
    ------
    KeyError
        If *title* is not present in *indices*.
    """
    # Obtain the row position of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies yields a Series of positions; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Pairwise similarity scores of every movie with the queried movie
    scores = np.asarray(cosine_sim[idx])
    # Rank by descending score; a stable sort keeps ties in row order
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie explicitly: with tied scores (e.g. empty or
    # identical overviews) it is not guaranteed to be ranked first.
    movie_indices = ranked[ranked != idx][:10]
    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]
# Read the query and normalise it for a case-insensitive comparison
inp_movie = input('Type movie name to find similar ones').strip().lower()
# The title index keeps the original mixed-case titles, so a lower-cased query
# would almost never match directly; find the stored title it corresponds to.
matching_titles = [t for t in indices.index if isinstance(t, str) and t.lower() == inp_movie]
if matching_titles:
    print(content_recommender(matching_titles[0]))
else:
    # Report a miss instead of crashing with a KeyError traceback
    print(f'No movie titled {inp_movie!r} was found')