Model Explainability (#18)

Save learned weights / coefficients for further analysis. Closes #17. Also saves the full set of model parameters used during training (previously only the grid search's best parameters were saved, which was not comprehensive).
s2t2 · Nov 10, 2023 · 3c6505d · 3c6505d
1 parent c88f749
commit 3c6505d
Show file tree

Hide file tree

Showing 39 changed files with 35,861 additions and 230 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,11 @@
 
 .DS_Store
 
+
+google-credentials.json
+google-credentials-shared.json
+
+
 data/*/*.csv
 data/*/*.csv.gz
 

diff --git a/app/classification/README.md b/app/classification/README.md
@@ -25,6 +25,7 @@ python -m app.classification.decision_tree
 
 ```sh
 python -m app.classification.random_forest
+FIG_SHOW=false FIG_SAVE=false python -m app.classification.random_forest
 ```
 
 
@@ -33,4 +34,6 @@ python -m app.classification.random_forest
 
 ```sh
 python -m app.classification.xgboost
+
+FIG_SHOW=false FIG_SAVE=false python -m app.classification.xgboost
 ```
diff --git a/app/classification/logistic_regression.py b/app/classification/logistic_regression.py
@@ -1,9 +1,13 @@
+import os
 
 # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
 from sklearn.linear_model import LogisticRegression
+from pandas import Series
 
+from app.classification import save_results_json
 from app.classification.pipeline import ClassificationPipeline
 
+
 class LogisticRegressionPipeline(ClassificationPipeline):
 
     def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
@@ -34,6 +38,22 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
         }
 
 
+    @property
+    def explainability_json(self):
+        return {
+            "intercept": self.intercept.round(4),
+           # "coefs": self.coefs.round(4).tolist(),
+            "coefs": self.coefs.round(4).to_dict(), # includes feature names!!
+        }
+
+    @property
+    def coefs(self):
+        return Series(self.model.coef_[0], index=self.model.feature_names_in_) #.sort_values(ascending=False) # don't sort? preserve order with features?
+
+    @property
+    def intercept(self):
+        return self.model.intercept_[0]
+
 
 if __name__ == "__main__":
 

diff --git a/app/classification/pipeline.py b/app/classification/pipeline.py
@@ -85,6 +85,7 @@ def model_type(self):
     def perform(self):
         self.train_eval()
         self.save_results()
+        self.save_coefs()
         self.save_predictions()
         self.plot_confusion_matrix()
 
@@ -93,6 +94,8 @@ def perform(self):
         else:
             self.plot_roc_curve()
 
+        #self.save_and_upload_model()
+
 
     def train_eval(self):
 
@@ -124,18 +127,12 @@ def train_eval(self):
         print("-----------------")
         print("BEST PARAMS:", self.gs.best_params_)
         print("BEST SCORE:", self.gs.best_score_)
-        clf = self.gs.best_estimator_.named_steps["classifier"]
-
-        self.class_names = self.class_names or list(clf.classes_)
-        self.class_labels = self.class_labels or class_labels(y_col=self.y_col, class_names=self.class_names)
 
-        # for logistic and xgboost:
-        #print("COEFS:")
-        #coefs = Series(model.coef_[0], index=features).sort_values(ascending=False)
+        # replace self.model with the classifier step of the grid search's best estimator, because it has the learned coefs, whereas the previously assigned self.model does not!
+        self.model = self.gs.best_estimator_.named_steps["classifier"]
 
-        # for xgboost:
-        # model.feature_importances_
-        # Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)
+        self.class_names = self.class_names or list(self.model.classes_)
+        self.class_labels = self.class_labels or class_labels(y_col=self.y_col, class_names=self.class_names)
 
         print("-----------------")
         print("EVALUATION...")
@@ -157,11 +154,17 @@ def train_eval(self):
                 "param_grid": self.param_grid,
                 "best_params": self.gs.best_params_,
                 "best_score": self.gs.best_score_
-            }
+            },
+            "model_params": self.model.get_params() # all params used by the model!
         }
         self.results_json = {**self.results.as_json, **self.results_json} # merge dicts
         pprint(self.results_json)
 
+    @property
+    def explainability_json(self) -> dict:
+        """implement this in child class"""
+        raise NotImplementedError("Please implement in child class. Return a serializable dictionary for JSON conversion.")
+
 
     @cached_property
     def predictions_df(self) -> DataFrame:
@@ -178,6 +181,7 @@ def predictions_df(self) -> DataFrame:
         df = text_and_labels.merge(df, how="right", left_index=True, right_index=True)
         return df
 
+
     def save_results(self):
         json_filepath = os.path.join(self.results_dirpath, "results.json")
         save_results_json(self.results_json, json_filepath)
@@ -190,6 +194,11 @@ def save_predictions(self): # confusion_only=False
         #    csv_filepath = os.path.join(self.results_dirpath, "confusions.csv")
         df.to_csv(csv_filepath, index=False)
 
+    def save_coefs(self):
+        json_filepath = os.path.join(self.results_dirpath, "explainability.json")
+        save_results_json(self.explainability_json, json_filepath)
+
+
     @cached_property
     def results_dirpath(self):
         dirpath = self._results_dirpath or os.path.join(CLASSIFICATION_RESULTS_DIRPATH, self.y_col, self.model_dirname)
@@ -220,8 +229,6 @@ def plot_confusion_matrix(self, fig_show=FIG_SHOW, fig_save=FIG_SAVE, showscale=
             fig.write_html(os.path.join(self.results_dirpath, "confusion.html"))
 
 
-
-
     def plot_roc_curve(self, fig_show=FIG_SHOW, fig_save=FIG_SAVE, height=500):
         """Plots the ROC characteristic and the AUC Score
 

diff --git a/app/classification/random_forest.py b/app/classification/random_forest.py
@@ -1,5 +1,6 @@
 # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
 from sklearn.ensemble import RandomForestClassifier
+from pandas import Series
 
 from app.classification.pipeline import ClassificationPipeline
 
@@ -53,6 +54,21 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
         }
 
 
+    @property
+    def explainability_json(self):
+        return {
+            "coefs": self.coefs.round(4).to_dict(), # includes feature names!!
+        }
+
+    @property
+    def coefs(self):
+        """random forest has .feature_importances_ instead of .coef_ """
+        return Series(self.model.feature_importances_, index=self.model.feature_names_in_) #.sort_values(ascending=False) # don't sort? preserve order with features?
+
+
+
+
+
 if __name__ == "__main__":
 
     from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS

diff --git a/app/classification/xgboost.py b/app/classification/xgboost.py
@@ -14,6 +14,7 @@
 # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
 
 from xgboost import XGBClassifier
+from pandas import Series
 
 from app.classification.pipeline import ClassificationPipeline
 
@@ -124,6 +125,17 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
         }
 
 
+    @property
+    def explainability_json(self):
+        return {
+            "coefs": self.coefs.round(4).to_dict(), # includes feature names!!
+        }
+
+    @property
+    def coefs(self):
+        """xgboost (like random forest) provides .feature_importances_, which is used here instead of .coef_ """
+        return Series(self.model.feature_importances_, index=self.model.feature_names_in_) #.sort_values(ascending=False) # don't sort? preserve order with features?
+