# -*- coding: utf-8 -*-
"""240623_CarbonCapture_DecisionTrees
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1B_zlnpcZGuZBEBSh1uRd7R3R4F6cJaZ-
"""
# pip install --upgrade pandas numpy matplotlib seaborn scikit-learn shap lime xgboost lightgbm catboost dask dask-ml scipy
import shap
import lime
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import lime.lime_tabular
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
# Enables the experimental IterativeImputer (must be imported before IterativeImputer)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.inspection import PartialDependenceDisplay
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
# Optional: Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# Load the data (try cp1252 first, then fall back to utf-8)
try:
    df = pd.read_csv('/content/data.csv', encoding='cp1252')
except Exception:
    try:
        df = pd.read_csv('/content/data.csv', encoding='utf-8')
    except Exception as e:
        print(f"Please check the file encoding: {e}")
        exit()
# Print the first few rows of the DataFrame
print(df.head())
# Print the column names
print(df.columns)
# Print the data type of each column
print(df.dtypes)
# Print the number of rows and columns
print(df.shape)
# Print a summary of the DataFrame
print(df.info())
# Print the descriptive statistics of the numeric columns
print(df.describe())
# Separate the first 4 columns and the last 11 columns
first_4_columns = df.iloc[:, :4]
last_11_columns = df.iloc[:, 4:]
# Print the first 4 columns
print(first_4_columns.head())
# Print the last 11 columns
print(last_11_columns.head())
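# Base learners for a stacking ensemble; the stacked model is then used as the
# regressor inside IterativeImputer to fill the missing values below.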
estimators = [
    ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
    ('gbdt', GradientBoostingRegressor(n_estimators=10, random_state=42)),
    ('xgboost', XGBRegressor(n_estimators=10, random_state=42)),
    ('lightgbm', LGBMRegressor(n_estimators=10, random_state=42)),
    ('catboost', CatBoostRegressor(iterations=10, random_state=42, verbose=0)),
    ('ada_boost', AdaBoostRegressor(n_estimators=10, random_state=42)),
    ('gpr', GaussianProcessRegressor(kernel=C(1.0) * RBF(1.0))),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)),
    ('svr', SVR()),
    ('knn', KNeighborsRegressor()),
    ('decision_tree', DecisionTreeRegressor()),
    ('extra_trees', ExtraTreesRegressor()),
]
final_estimator = LinearRegression()
stacked_regressor = StackingRegressor(estimators=estimators, final_estimator=final_estimator)
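# IterativeImputer fills each column that has missing values by regressing it on the
# remaining columns, cycling through the columns for up to max_iter rounds; here the
# stacking ensemble above serves as that regressor.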
# Create the imputer
imputer = IterativeImputer(estimator=stacked_regressor, max_iter=10, random_state=0)
# imputer = IterativeImputer(estimator=ExtraTreesRegressor(), max_iter=10, random_state=0)
print("Imputing Data...")
# Fit and transform the last 11 columns
imputed_data = imputer.fit_transform(last_11_columns)
# Convert the imputed data back to a pandas DataFrame
df_imputed_11 = pd.DataFrame(imputed_data, columns=last_11_columns.columns)
# Combine the first 4 columns with the imputed last 11 columns
df_final = pd.concat([first_4_columns.reset_index(drop=True), df_imputed_11.reset_index(drop=True)], axis=1)
print("Saving imputed data into file...")
# If you want to save the result
df_final.to_csv('/content/imputed_data.csv', index=False, encoding='utf-8')
print(df_final.head())
df = df_final
# Step 1: Use the imputed DataFrame as the working dataset
data = df
print(data.head())
# Step 2: Exploratory Data Analysis (EDA)
# Summary statistics
print(data.describe())
# Check for missing values
print(data.isnull().sum())
# Visualize missing values
plt.figure(figsize=(12, 8))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()
# Correlation matrix for numerical columns
numerical_cols = data.select_dtypes(include=[np.number]).columns
corr_matrix = data[numerical_cols].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()
# Pairplot coloured by feedstock, using the 5 features most positively correlated with the target
target = 'CO2_uptake (mmol/g)'
top_features = corr_matrix[target].sort_values(ascending=False)[1:6].index.tolist()
sns.pairplot(data[['feedstock'] + top_features + [target]], hue='feedstock')
plt.show()
# Distribution of numerical features
data[numerical_cols].hist(bins=30, figsize=(20, 15))
plt.suptitle('Feature Distributions')
plt.tight_layout()
plt.show()
# Step 3: Feature Selection
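# SelectKBest with f_regression scores each feature by the univariate F-statistic of a
# linear fit against the target and keeps the k highest-scoring features.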
X = data[numerical_cols].drop(target, axis=1)
y = data[target]
feedstock = data['feedstock']
selector = SelectKBest(f_regression, k=11) # Select top 11 features
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()
print("Selected features:", selected_features)
# Step 4: Split the Data
X_train, X_test, y_train, y_test, feedstock_train, feedstock_test = train_test_split(
    X[selected_features], y, feedstock, test_size=0.2, random_state=42)
# Step 5: Standardize the Features
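# Fit the scaler on the training split only and reuse it on the test split,
# so no test-set information leaks into the preprocessing.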
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100
    evs = explained_variance_score(y_test, predictions)
    return r2, mae, mse, rmse, mape, evs
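# Note: MAPE = (100 / n) * sum(|y_i - yhat_i| / |y_i|); it is undefined if y_test contains zeros.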
# Hyperparameter Tuning and Cross-Validation
def hyperparameter_tuning(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f'Best Parameters: {grid_search.best_params_}')
    return best_model
# Define parameter grids for each model
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    'GBDT': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'LightGBM': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'num_leaves': [31, 63, 127]},
    'CatBoost': {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'depth': [4, 6, 8]},
    'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
}
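# Each grid is searched exhaustively with 5-fold CV; e.g. the Random Forest grid has
# 3 * 4 * 3 = 36 parameter combinations, i.e. 180 model fits per tuning run.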
# Train and Evaluate Various Models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'GBDT': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
    'AdaBoost': AdaBoostRegressor(random_state=42)
}
results = {}
best_models = {}
for name, model in tqdm(models.items(), desc="Training models"):
    print(f'Tuning {name}...')
    try:
        tuned_model = hyperparameter_tuning(model, param_grids[name], X_train_scaled, y_train)
        best_models[name] = tuned_model
        results[name] = evaluate_model(tuned_model, X_test_scaled, y_test)
        # Cross-validation
        cv_scores = cross_val_score(tuned_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
        print(f"Cross-validation MSE: {-cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    except Exception as e:
        print(f"Error occurred while tuning {name}: {str(e)}")
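# cross_val_score clones the tuned estimator and refits it on each of the 5 training
# folds, so the reported CV MSE is estimated on the training split only.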
# Display the results
for name, metrics in results.items():
    print(f"{name} - R^2: {metrics[0]:.2f}, MAE: {metrics[1]:.2f}, MSE: {metrics[2]:.2f}, "
          f"RMSE: {metrics[3]:.2f}, MAPE: {metrics[4]:.2f}%, Explained Variance: {metrics[5]:.2f}")
# SHAP and LIME Explanations, and Partial Dependence Plots for each model
feature_pairs = [(selected_features[i], selected_features[j])
                 for i in range(len(selected_features))
                 for j in range(i + 1, len(selected_features))][:8]
for name, model in best_models.items():
    if name != 'AdaBoost':  # skip SHAP for AdaBoost, which TreeExplainer does not support
        print(f"Generating SHAP values for {name}...")
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_scaled)
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test_scaled, feature_names=selected_features, show=False)
        plt.title(f'SHAP Summary Plot for {name}')
        plt.tight_layout()
        plt.show()
        # Generate SHAP force plot for the first instance
        plt.figure(figsize=(12, 4))
        shap.initjs()
        shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0],
                        feature_names=selected_features, matplotlib=True)
        plt.title(f'SHAP Force Plot for {name} - Instance 0 (feedstock: {feedstock_test.iloc[0]})')
        plt.tight_layout()
        plt.show()
    # LIME Explanation
    lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=selected_features, mode='regression')
    print(f"Generating LIME explanations for {name}...")
    for i in range(5):  # Explaining first 5 instances
        exp = lime_explainer.explain_instance(X_test_scaled[i], model.predict, num_features=10)
        exp.as_pyplot_figure()
        plt.title(f'LIME Explanation for {name} - Instance {i} (feedstock: {feedstock_test.iloc[i]})')
        plt.tight_layout()
        plt.show()
    # Partial Dependence Plots (computed on the scaled features the models were fitted on)
    print(f"Generating Partial Dependence Plots for {name}...")
    fig, ax = plt.subplots(figsize=(12, 8))
    PartialDependenceDisplay.from_estimator(model, pd.DataFrame(X_train_scaled, columns=selected_features),
                                            feature_pairs, feature_names=selected_features, ax=ax)
    plt.suptitle(f'Partial Dependence Plots for {name}')
    plt.tight_layout()
    plt.show()
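# A partial dependence plot shows the model's average prediction as a chosen pair of
# features varies jointly, with the remaining features marginalised over.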
# Feature Importance Plot
for name, model in best_models.items():
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title(f"Feature Importances for {name}")
        plt.bar(range(X_train.shape[1]), importances[indices])
        plt.xticks(range(X_train.shape[1]), [selected_features[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.show()
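# These are each library's built-in tree importances (impurity- or split/gain-based,
# depending on the library), so they may rank features differently from the SHAP plots.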
# Scatter plot of predicted vs actual values, colored by feedstock
for name, model in best_models.items():
    predictions = model.predict(X_test_scaled)
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(y_test, predictions, c=pd.Categorical(feedstock_test).codes, cmap='viridis')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual CO2 uptake (mmol/g)')
    plt.ylabel('Predicted CO2 uptake (mmol/g)')
    plt.title(f'Actual vs Predicted CO2 Uptake for {name}')
    plt.colorbar(scatter, label='feedstock')
    plt.tight_layout()
    plt.show()
# Residual Plot
for name, model in best_models.items():
    predictions = model.predict(X_test_scaled)
    residuals = y_test - predictions
    plt.figure(figsize=(10, 6))
    plt.scatter(predictions, residuals, c=pd.Categorical(feedstock_test).codes, cmap='viridis')
    plt.xlabel('Predicted CO2 uptake (mmol/g)')
    plt.ylabel('Residuals')
    plt.title(f'Residual Plot for {name}')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.colorbar(label='feedstock')
    plt.tight_layout()
    plt.show()