forked from czwong/Movie_Machine_Learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandomForestRegressor.py
141 lines (109 loc) · 4.25 KB
/
randomForestRegressor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import numpy
import matplotlib.pyplot as plt
movie_dataset = pd.read_csv(r'''Resources/movie_metadata.csv''')
# Remove unnecessary characters
for index, row in movie_dataset.iterrows():
movie_dataset.loc[index, "movie_title"] = row["movie_title"].split('\xa0')[0]
# Drop duplicate rows
movies = movie_dataset.drop_duplicates(subset="movie_title")
# Select needed columns
movies = movies[["num_critic_for_reviews", "content_rating", "gross", "num_voted_users", "cast_total_facebook_likes", "num_user_for_reviews", "budget", "movie_facebook_likes", "imdb_score", "duration", "genres"]]
# Drop rows with missing values
movies = movies.dropna(how="any")
# Select rows that don't have zero values for the facebook likes variables
movies = movies.loc[(movies["cast_total_facebook_likes"] != 0) & (movies["movie_facebook_likes"] != 0)]
# Dummy encode genres
genres = movies["genres"].str.get_dummies(sep='|')
# Drop old genres column
movies = movies.drop("genres", axis=1)
# Merge new genres dataframe with the original one
movies_df = genres \
.merge(movies, left_index = True, right_index = True)
# Select X and y values
X = movies_df.drop("gross", axis=1)
y = movies_df['gross']
data = X.copy()
# Dummy encode content rating
data_binary_encoded = pd.get_dummies(data)
# Remove some of the content_rating variables
data_binary_encoded = data_binary_encoded.drop(["content_rating_Not Rated", "content_rating_Passed", "content_rating_Unrated", "content_rating_Approved"], axis=1)
# Separate data into training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_binary_encoded, y, random_state=42)
# Fit the training data into the RandomForestRegressor model
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=8, random_state=42, n_estimators=300)
regr.fit(X_train, y_train)
# Quantify the model
training_score = regr.score(X_train, y_train)
testing_score = regr.score(X_test, y_test)
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")
from sklearn.metrics import mean_squared_error
predictions = regr.predict(X_test)
MSE = mean_squared_error(y_test, predictions)
r2 = regr.score(X_test, y_test)
print(f"MSE: {MSE}, R2: {r2}")
# Actual gross vs predicted gross
df = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
new = pd.merge(df, movie_dataset, left_index=True, right_index=True)
new = new[["Actual", "Predicted", "movie_title"]]
# Round gross to a whole number
new = new.round({"Predicted":0})
# Send data to SQLite table
import sqlite3
def create_connection(db_file):
""" create a database connection to the SQLite database
specified by db_file
:param db_file: database file
:return: Connection object or None
"""
try:
conn = sqlite3.connect(db_file)
return conn
except Error as e:
print(e)
return None
def create_table(conn, create_table_sql):
""" create a table from the create_table_sql statement
:param conn: Connection object
:param create_table_sql: a CREATE TABLE statement
:return:
"""
try:
c = conn.cursor()
c.execute("DROP TABLE IF EXISTS predictions;")
c.execute(create_table_sql)
except Error:
print("bad")
return None
def create_project(conn, project):
"""
Create a new project into the predictions table
:param conn:
:param project:
:return: project id
"""
sql = ''' INSERT INTO predictions(Actual, Predicted, movie_title)
VALUES(?,?,?) '''
cur = conn.cursor()
cur.execute(sql, project)
return cur.lastrowid
# Create table statement
p_table = """ CREATE TABLE predictions (Actual integer, Predicted integer, movie_title text); """
# Database
database = "db/newfinaldata.sqlite"
# Create connection to database
conn = create_connection(database)
if conn is not None:
# create predictions table
create_table(conn, p_table)
else:
print("Cannot create the database connection.")
for index, row in new.iterrows():
with conn:
# create a new prediction row
predict = (row["Actual"], row["Predicted"], row["movie_title"])
create_project(conn, predict)
#project_id = create_project(conn, project)