# -*- coding: utf-8 -*-
"""240623_CarbonCapture_DecisionTrees
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1B_zlnpcZGuZBEBSh1uRd7R3R4F6cJaZ-
"""
# pip install --upgrade pandas numpy matplotlib seaborn scikit-learn shap lime xgboost lightgbm catboost dask dask-ml scipy
import shap
import lime
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import lime.lime_tabular
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
# Enables the experimental IterativeImputer (must be imported before IterativeImputer)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.inspection import PartialDependenceDisplay
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
# Optional: Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# Load the data (try cp1252 first, then fall back to utf-8)
try:
    df = pd.read_csv('/content/data.csv', encoding='cp1252')
except Exception:
    try:
        df = pd.read_csv('/content/data.csv', encoding='utf-8')
    except Exception as e:
        print(f"Please check the file encoding: {e}")
        exit()
# Print the first few rows of the DataFrame
print(df.head())
# Print the column names
print(df.columns)
# Print the data type of each column
print(df.dtypes)
# Print the number of rows and columns
print(df.shape)
# Print a summary of the DataFrame
print(df.info())
# Print the descriptive statistics of the numeric columns
print(df.describe())
# Separate the first 4 columns and the last 11 columns
first_4_columns = df.iloc[:, :4]
last_11_columns = df.iloc[:, 4:]
# Print the first 4 columns
print(first_4_columns.head())
# Print the last 11 columns
print(last_11_columns.head())
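# Base learners for a stacking ensemble; the stacked model is then used as the
# regressor inside IterativeImputer to fill the missing values below.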
estimators = [
    ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
    ('gbdt', GradientBoostingRegressor(n_estimators=10, random_state=42)),
    ('xgboost', XGBRegressor(n_estimators=10, random_state=42)),
    ('lightgbm', LGBMRegressor(n_estimators=10, random_state=42)),
    ('catboost', CatBoostRegressor(iterations=10, random_state=42, verbose=0)),
    ('ada_boost', AdaBoostRegressor(n_estimators=10, random_state=42)),
    ('gpr', GaussianProcessRegressor(kernel=C(1.0) * RBF(1.0))),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)),
    ('svr', SVR()),
    ('knn', KNeighborsRegressor()),
    ('decision_tree', DecisionTreeRegressor()),
    ('extra_trees', ExtraTreesRegressor()),
]
final_estimator = LinearRegression()
stacked_regressor = StackingRegressor(estimators=estimators, final_estimator=final_estimator)
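# IterativeImputer fills each column that has missing values by regressing it on the
# remaining columns, cycling through the columns for up to max_iter rounds; here the
# stacking ensemble above serves as that regressor.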
# Create the imputer
imputer = IterativeImputer(estimator=stacked_regressor, max_iter=10, random_state=0)
# imputer = IterativeImputer(estimator=ExtraTreesRegressor(), max_iter=10, random_state=0)
print("Imputing Data...")
# Fit and transform the last 11 columns
imputed_data = imputer.fit_transform(last_11_columns)
# Convert the imputed data back to a pandas DataFrame
df_imputed_11 = pd.DataFrame(imputed_data, columns=last_11_columns.columns)
# Combine the first 4 columns with the imputed last 11 columns
df_final = pd.concat([first_4_columns.reset_index(drop=True), df_imputed_11.reset_index(drop=True)], axis=1)
print("Saving imputed data into file...")
# If you want to save the result
df_final.to_csv('/content/imputed_data.csv', index=False, encoding='utf-8')
print(df_final.head())
df = df_final
# Step 1: Use the imputed DataFrame as the working dataset
data = df
print(data.head())
# Step 2: Exploratory Data Analysis (EDA)
# Summary statistics
print(data.describe())
# Check for missing values
print(data.isnull().sum())
# Visualize missing values
plt.figure(figsize=(12, 8))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()
# Correlation matrix for numerical columns
numerical_cols = data.select_dtypes(include=[np.number]).columns
corr_matrix = data[numerical_cols].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()
# Pairplot coloured by feedstock, using the 5 features most positively correlated with the target
target = 'CO2_uptake (mmol/g)'
top_features = corr_matrix[target].sort_values(ascending=False)[1:6].index.tolist()
sns.pairplot(data[['feedstock'] + top_features + [target]], hue='feedstock')
plt.show()
# Distribution of numerical features
data[numerical_cols].hist(bins=30, figsize=(20, 15))
plt.suptitle('Feature Distributions')
plt.tight_layout()
plt.show()
# Step 3: Feature Selection
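# SelectKBest with f_regression scores each feature by the univariate F-statistic of a
# linear fit against the target and keeps the k highest-scoring features.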
X = data[numerical_cols].drop(target, axis=1)
y = data[target]
feedstock = data['feedstock']
selector = SelectKBest(f_regression, k=11) # Select top 11 features
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()
print("Selected features:", selected_features)
# Step 4: Split the Data
X_train, X_test, y_train, y_test, feedstock_train, feedstock_test = train_test_split(
    X[selected_features], y, feedstock, test_size=0.2, random_state=42)
# Step 5: Standardize the Features
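# Fit the scaler on the training split only and reuse it on the test split,
# so no test-set information leaks into the preprocessing.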
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - predictions) / y_test)) * 100
    evs = explained_variance_score(y_test, predictions)
    return r2, mae, mse, rmse, mape, evs
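# Note: MAPE = (100 / n) * sum(|y_i - yhat_i| / |y_i|); it is undefined if y_test contains zeros.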
# Hyperparameter Tuning and Cross-Validation
def hyperparameter_tuning(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f'Best Parameters: {grid_search.best_params_}')
    return best_model
# Define parameter grids for each model
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    'GBDT': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'LightGBM': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'num_leaves': [31, 63, 127]},
    'CatBoost': {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'depth': [4, 6, 8]},
    'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
}
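# Each grid is searched exhaustively with 5-fold CV; e.g. the Random Forest grid has
# 3 * 4 * 3 = 36 parameter combinations, i.e. 180 model fits per tuning run.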
# Train and Evaluate Various Models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'GBDT': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
    'AdaBoost': AdaBoostRegressor(random_state=42)
}
results = {}
best_models = {}
for name, model in tqdm(models.items(), desc="Training models"):
    print(f'Tuning {name}...')
    try:
        tuned_model = hyperparameter_tuning(model, param_grids[name], X_train_scaled, y_train)
        best_models[name] = tuned_model
        results[name] = evaluate_model(tuned_model, X_test_scaled, y_test)
        # Cross-validation
        cv_scores = cross_val_score(tuned_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
        print(f"Cross-validation MSE: {-cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    except Exception as e:
        print(f"Error occurred while tuning {name}: {str(e)}")
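# cross_val_score clones the tuned estimator and refits it on each of the 5 training
# folds, so the reported CV MSE is estimated on the training split only.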
# Display the results
for name, metrics in results.items():
    print(f"{name} - R^2: {metrics[0]:.2f}, MAE: {metrics[1]:.2f}, MSE: {metrics[2]:.2f}, "
          f"RMSE: {metrics[3]:.2f}, MAPE: {metrics[4]:.2f}%, Explained Variance: {metrics[5]:.2f}")
# SHAP and LIME Explanations, and Partial Dependence Plots for each model
feature_pairs = [(selected_features[i], selected_features[j])
                 for i in range(len(selected_features))
                 for j in range(i + 1, len(selected_features))][:8]
for name, model in best_models.items():
    if name != 'AdaBoost':  # skip SHAP for AdaBoost, which TreeExplainer does not support
        print(f"Generating SHAP values for {name}...")
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_scaled)
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test_scaled, feature_names=selected_features, show=False)
        plt.title(f'SHAP Summary Plot for {name}')
        plt.tight_layout()
        plt.show()
        # Generate SHAP force plot for the first instance
        plt.figure(figsize=(12, 4))
        shap.initjs()
        shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0],
                        feature_names=selected_features, matplotlib=True)
        plt.title(f'SHAP Force Plot for {name} - Instance 0 (feedstock: {feedstock_test.iloc[0]})')
        plt.tight_layout()
        plt.show()
    # LIME Explanation
    lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=selected_features, mode='regression')
    print(f"Generating LIME explanations for {name}...")
    for i in range(5):  # Explaining first 5 instances
        exp = lime_explainer.explain_instance(X_test_scaled[i], model.predict, num_features=10)
        exp.as_pyplot_figure()
        plt.title(f'LIME Explanation for {name} - Instance {i} (feedstock: {feedstock_test.iloc[i]})')
        plt.tight_layout()
        plt.show()
    # Partial Dependence Plots (computed on the scaled features the models were fitted on)
    print(f"Generating Partial Dependence Plots for {name}...")
    fig, ax = plt.subplots(figsize=(12, 8))
    PartialDependenceDisplay.from_estimator(model, pd.DataFrame(X_train_scaled, columns=selected_features),
                                            feature_pairs, feature_names=selected_features, ax=ax)
    plt.suptitle(f'Partial Dependence Plots for {name}')
    plt.tight_layout()
    plt.show()
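# A partial dependence plot shows the model's average prediction as a chosen pair of
# features varies jointly, with the remaining features marginalised over.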
# Feature Importance Plot
for name, model in best_models.items():
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title(f"Feature Importances for {name}")
        plt.bar(range(X_train.shape[1]), importances[indices])
        plt.xticks(range(X_train.shape[1]), [selected_features[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.show()
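# These are each library's built-in tree importances (impurity- or split/gain-based,
# depending on the library), so they may rank features differently from the SHAP plots.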
# Scatter plot of predicted vs actual values, colored by feedstock
for name, model in best_models.items():
    predictions = model.predict(X_test_scaled)
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(y_test, predictions, c=pd.Categorical(feedstock_test).codes, cmap='viridis')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual CO2 uptake (mmol/g)')
    plt.ylabel('Predicted CO2 uptake (mmol/g)')
    plt.title(f'Actual vs Predicted CO2 Uptake for {name}')
    plt.colorbar(scatter, label='feedstock')
    plt.tight_layout()
    plt.show()
# Residual Plot
for name, model in best_models.items():
    predictions = model.predict(X_test_scaled)
    residuals = y_test - predictions
    plt.figure(figsize=(10, 6))
    plt.scatter(predictions, residuals, c=pd.Categorical(feedstock_test).codes, cmap='viridis')
    plt.xlabel('Predicted CO2 uptake (mmol/g)')
    plt.ylabel('Residuals')
    plt.title(f'Residual Plot for {name}')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.colorbar(label='feedstock')
    plt.tight_layout()
    plt.show()