regression_analysis.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from itertools import combinations
# Load the dataset
file_path = r"your_file_path"
data = pd.read_excel(file_path)
# Define the target variable
y = data['TotalScore'].apply(pd.to_numeric, errors='coerce')
# Get all columns except the target variable
all_columns = data.columns.tolist()
all_columns.remove('TotalScore')
# Generate all possible combinations of predictors
for r in range(1, len(all_columns) + 1):
    for subset in combinations(all_columns, r):
        subset = list(subset)
        X = data[subset].apply(pd.to_numeric, errors='coerce')

        # Drop rows with NaN values, working on a copy of y so the full
        # target series is not shrunk across loop iterations
        X = X.dropna()
        y_subset = y.dropna()

        # Ensure that X and y_subset share the same index after dropping NaNs
        X, y_subset = X.align(y_subset, join='inner', axis=0)

        # Check that X and y_subset are not empty
        if not X.empty and not y_subset.empty:
            # Add a constant to the model for the intercept term in statsmodels
            X = sm.add_constant(X)

            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = train_test_split(X, y_subset, test_size=0.2, random_state=42)

            # Fit the statsmodels OLS model
            sm_model = sm.OLS(y_train, X_train).fit()

            # Parse the p-values into a human-readable summary
            p_values = sm_model.pvalues
            significant_vars = p_values[p_values < 0.05].index.tolist()
            nonsignificant_vars = p_values[p_values >= 0.05].index.tolist()

            # Remove 'const' from the lists if present
            significant_vars = [var for var in significant_vars if var != 'const']
            nonsignificant_vars = [var for var in nonsignificant_vars if var != 'const']

            # Build the summary statement
            summary_statement = ""
            if 'const' in p_values and p_values['const'] < 0.05:
                summary_statement += "The model intercept is statistically significant. "
            else:
                summary_statement += "The model intercept is not statistically significant. "
            if significant_vars:
                summary_statement += f"The variables {', '.join(significant_vars)} have a significant correlation with TotalScore. "
            else:
                summary_statement += "No predictor variables have a significant correlation with TotalScore. "
            print(summary_statement)

            # Fit a scikit-learn linear regression model on the same split
            model = LinearRegression()
            model.fit(X_train.drop('const', axis=1), y_train)  # Drop the constant term

            # Predict the 'TotalScore' on the test set
            y_pred = model.predict(X_test.drop('const', axis=1))

            # Evaluate the model performance
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Print out the performance metrics
            print(f"\nMean Squared Error (MSE): {mse}")
            print(f"R-squared (R2): {r2}")

            # Plot the actual vs predicted values
            plt.scatter(y_test, y_pred)
            plt.plot([y_subset.min(), y_subset.max()], [y_subset.min(), y_subset.max()], 'k--', lw=2)
            plt.xlabel('Actual')
            plt.ylabel('Predicted')
            plt.title('Actual vs Predicted Total Scores')
            plt.show()
        else:
            print(f"Skipping combination {subset} due to insufficient data after dropping NaNs.")