diff --git a/ecommerce b/ecommerce new file mode 100644 index 0000000..9a9ecb2 --- /dev/null +++ b/ecommerce @@ -0,0 +1,44 @@ +# Import the libraries +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error, mean_absolute_error +import math +import pylab +import scipy.stats as stats + +df = pd.read_csv("/Users/tikkybamiro/Documents/linear practice/ecommerce.csv") +df.head() +df.describe() + +lm = LinearRegression() + +X = df[['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']] +Y = df['Yearly Amount Spent'] + +X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42) + +lm.fit(X_train, Y_train) + +cdf = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficients']) +cdf + +predictions = lm.predict(X_test) + +sns.scatterplot(x = predictions, y= Y_test) +plt.xlabel("Predictions") + +print("Mean Absolute Error: ", mean_absolute_error(Y_test, predictions)) +print("Mean Squared Error: ", mean_squared_error(Y_test, predictions)) +print("RMSE: ", math.sqrt(mean_squared_error(Y_test, predictions))) + +residuals = Y_test - predictions + +sns.displot(residuals, bins=20, kde=True) +plt.ylabel("Frequency") +plt.xlabel("Residuals") + +stats.probplot(residuals, dist="norm", plot=pylab) +pylab.show()