import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.inspection import PartialDependenceDisplay  # plot_partial_dependence was removed in scikit-learn 1.2
import shap
import lime
import lime.lime_tabular
# Optional: Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# Step 1: Read the Data from CSV
data = pd.read_csv('data.csv', header=1)  # header=1: column names are read from the second row; use header=0 if the first row holds the headers
print(data.head())
# Step 2: Exploratory Data Analysis (EDA)
# Summary statistics
print(data.describe())
# Check for missing values
print(data.isnull().sum())
# Visualize missing values
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()
# Correlation matrix
corr_matrix = data.corr(numeric_only=True)  # numeric_only avoids errors if any non-numeric column is present
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
# Pairplot
sns.pairplot(data)
plt.show()
# Distribution of features
data.hist(bins=30, figsize=(20, 15))
plt.show()
# Step 3: Clean the Data
data.fillna(data.mean(numeric_only=True), inplace=True)  # impute missing values with column means
# Step 4: Split the Data into Features and Target
X = data[['SA (m2/g)', 'TPV (cm3/g)', 'MPV (cm3/g)', 'C%', 'H%', 'N%', 'O%', 'T (℃)', 'P (bar)']]  # feature columns; names must match the CSV headers exactly
y = data['CO2 uptake (mmol/g)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 5: Standardize the Features (if necessary)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    # MAPE is undefined if y_test contains zeros; the CO2 uptake target is assumed to be strictly positive
    mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100
    return r2, mae, mse, rmse, mape
# Hyperparameter Tuning and Cross-Validation
def hyperparameter_tuning(model, param_grid):
    # Uses the globally defined scaled training split (X_train_scaled, y_train)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_model = grid_search.best_estimator_
    print(f'Best Parameters: {grid_search.best_params_}')
    return best_model
# Define parameter grids for each model
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'GBDT': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'LightGBM': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'CatBoost': {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
}
# Train and Evaluate Various Models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'GBDT': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
    'AdaBoost': AdaBoostRegressor(random_state=42)
}
results = {}
best_models = {}
for name, model in models.items():
    print(f'Tuning {name}...')
    tuned_model = hyperparameter_tuning(model, param_grids[name])
    best_models[name] = tuned_model
    results[name] = evaluate_model(tuned_model, X_test_scaled, y_test)
# Display the results
for name, metrics in results.items():
    print(f"{name} - R^2: {metrics[0]:.2f}, MAE: {metrics[1]:.2f}, MSE: {metrics[2]:.2f}, RMSE: {metrics[3]:.2f}, MAPE: {metrics[4]:.2f}%")
# SHAP and LIME Explanations, and Partial Dependence Plots for each model
features = [
('SA (m2/g)', 'TPV (cm3/g)'), ('SA (m2/g)', 'MPV (cm3/g)'), ('SA (m2/g)', 'N%'), ('SA (m2/g)', 'O%'), ('SA (m2/g)', 'T (℃)'),
('MPV (cm3/g)', 'TPV (cm3/g)'), ('C%', 'H%'), ('N%', 'H%')
]
for name, model in best_models.items():
    # shap.TreeExplainer supports the tree ensembles above but not AdaBoost, so only the SHAP section is skipped for it
    if name != 'AdaBoost':
        print(f"Generating SHAP values for {name}...")
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_scaled)
        shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns, show=False)
        plt.title(f'SHAP Summary Plot for {name}')
        plt.show()
        # Generate SHAP force plot for the first test instance (raw feature values shown for readability)
        shap.initjs()
        shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0], matplotlib=True)
        plt.title(f'SHAP Force Plot for {name} - Instance 0')
        plt.show()
    # LIME Explanation (model-agnostic, so it also covers AdaBoost)
    lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=X.columns, mode='regression')
    print(f"Generating LIME explanations for {name}...")
    for i in range(5):  # Explaining the first 5 test instances
        exp = lime_explainer.explain_instance(X_test_scaled[i], model.predict, num_features=10)
        exp.show_in_notebook(show_table=True)  # requires a Jupyter notebook; use print(exp.as_list()) when running as a script
    # Partial Dependence Plots for the feature pairs defined above
    print(f"Generating Partial Dependence Plots for {name}...")
    PartialDependenceDisplay.from_estimator(model, X_train_scaled, features, feature_names=X.columns, grid_resolution=50)
    plt.suptitle(f'Partial Dependence Plots for {name}')
    plt.show()