Skip to content

Commit

Permalink
Model Explainability (#18)
Browse files Browse the repository at this point in the history
Save learned weights / coefficients for further analysis. Closes #17. 

Adds actual model parameters used during training (previously only saved best parameter results of grid search, which were not comprehensive).
  • Loading branch information
s2t2 authored Nov 10, 2023
1 parent c88f749 commit 3c6505d
Show file tree
Hide file tree
Showing 39 changed files with 35,861 additions and 230 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@

.DS_Store


google-credentials.json
google-credentials-shared.json


data/*/*.csv
data/*/*.csv.gz

Expand Down
3 changes: 3 additions & 0 deletions app/classification/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ python -m app.classification.decision_tree

```sh
python -m app.classification.random_forest
FIG_SHOW=false FIG_SAVE=false python -m app.classification.random_forest
```


Expand All @@ -33,4 +34,6 @@ python -m app.classification.random_forest

```sh
python -m app.classification.xgboost

FIG_SHOW=false FIG_SAVE=false python -m app.classification.xgboost
```
20 changes: 20 additions & 0 deletions app/classification/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import os

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.linear_model import LogisticRegression
from pandas import Series

from app.classification import save_results_json
from app.classification.pipeline import ClassificationPipeline


class LogisticRegressionPipeline(ClassificationPipeline):

def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
Expand Down Expand Up @@ -34,6 +38,22 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
}


@property
def explainability_json(self):
    """Return the learned intercept and coefficients, rounded to four places.

    The "coefs" entry is a dict keyed by feature name rather than a bare
    list, so each weight stays attached to the feature it belongs to.
    """
    rounded_intercept = self.intercept.round(4)
    coefs_by_feature = self.coefs.round(4).to_dict()
    return {"intercept": rounded_intercept, "coefs": coefs_by_feature}

@property
def coefs(self):
    """Return the first row of the fitted model's ``coef_`` as a Series indexed by feature name.

    Values are left in the model's own feature order (no sorting).

    NOTE(review): only ``coef_[0]`` is read. If the model was fit on more
    than two classes, ``coef_`` may hold one row per class and the other rows
    would be dropped here -- confirm whether multiclass targets reach this.
    NOTE(review): scikit-learn only sets ``feature_names_in_`` when the
    estimator is fit on input with string feature names -- confirm upstream
    pipeline steps hand the classifier a DataFrame.
    """
    fitted = self.model
    first_row = fitted.coef_[0]
    return Series(first_row, index=fitted.feature_names_in_)

@property
def intercept(self):
    """Return the first element of the fitted model's ``intercept_`` array."""
    intercepts = self.model.intercept_
    return intercepts[0]


if __name__ == "__main__":

Expand Down
33 changes: 20 additions & 13 deletions app/classification/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def model_type(self):
def perform(self):
self.train_eval()
self.save_results()
self.save_coefs()
self.save_predictions()
self.plot_confusion_matrix()

Expand All @@ -93,6 +94,8 @@ def perform(self):
else:
self.plot_roc_curve()

#self.save_and_upload_model()


def train_eval(self):

Expand Down Expand Up @@ -124,18 +127,12 @@ def train_eval(self):
print("-----------------")
print("BEST PARAMS:", self.gs.best_params_)
print("BEST SCORE:", self.gs.best_score_)
clf = self.gs.best_estimator_.named_steps["classifier"]

self.class_names = self.class_names or list(clf.classes_)
self.class_labels = self.class_labels or class_labels(y_col=self.y_col, class_names=self.class_names)

# for logistic and xgboost:
#print("COEFS:")
#coefs = Series(model.coef_[0], index=features).sort_values(ascending=False)
# overwriting / updating self.model here, because this has the learned coefs, whereas the old self.model does not!
self.model = self.gs.best_estimator_.named_steps["classifier"]

# for xgboost:
# model.feature_importances_
# Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)
self.class_names = self.class_names or list(self.model.classes_)
self.class_labels = self.class_labels or class_labels(y_col=self.y_col, class_names=self.class_names)

print("-----------------")
print("EVALUATION...")
Expand All @@ -157,11 +154,17 @@ def train_eval(self):
"param_grid": self.param_grid,
"best_params": self.gs.best_params_,
"best_score": self.gs.best_score_
}
},
"model_params": self.model.get_params() # all params used by the model!
}
self.results_json = {**self.results.as_json, **self.results_json} # merge dicts
pprint(self.results_json)

@property
def explainability_json(self) -> dict:
    """Return a JSON-serializable dict describing the learned model parameters.

    This base implementation is only a placeholder: child pipelines are
    expected to override it.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    message = "Please implement in child class. Return a serializable dictionary for JSON conversion."
    raise NotImplementedError(message)


@cached_property
def predictions_df(self) -> DataFrame:
Expand All @@ -178,6 +181,7 @@ def predictions_df(self) -> DataFrame:
df = text_and_labels.merge(df, how="right", left_index=True, right_index=True)
return df


def save_results(self):
    """Write ``self.results_json`` to ``results.json`` inside ``self.results_dirpath``."""
    destination = os.path.join(self.results_dirpath, "results.json")
    payload = self.results_json
    save_results_json(payload, destination)
Expand All @@ -190,6 +194,11 @@ def save_predictions(self): # confusion_only=False
# csv_filepath = os.path.join(self.results_dirpath, "confusions.csv")
df.to_csv(csv_filepath, index=False)

def save_coefs(self):
    """Write ``self.explainability_json`` to ``explainability.json`` inside ``self.results_dirpath``.

    Any exception raised while reading ``explainability_json`` propagates to
    the caller; nothing is caught here.
    """
    destination = os.path.join(self.results_dirpath, "explainability.json")
    payload = self.explainability_json
    save_results_json(payload, destination)


@cached_property
def results_dirpath(self):
dirpath = self._results_dirpath or os.path.join(CLASSIFICATION_RESULTS_DIRPATH, self.y_col, self.model_dirname)
Expand Down Expand Up @@ -220,8 +229,6 @@ def plot_confusion_matrix(self, fig_show=FIG_SHOW, fig_save=FIG_SAVE, showscale=
fig.write_html(os.path.join(self.results_dirpath, "confusion.html"))




def plot_roc_curve(self, fig_show=FIG_SHOW, fig_save=FIG_SAVE, height=500):
"""Plots the ROC characteristic and the AUC Score
Expand Down
16 changes: 16 additions & 0 deletions app/classification/random_forest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier
from pandas import Series

from app.classification.pipeline import ClassificationPipeline

Expand Down Expand Up @@ -53,6 +54,21 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
}


@property
def explainability_json(self):
    """Return the values from ``self.coefs``, rounded to four places, keyed by feature name."""
    rounded = self.coefs.round(4)
    return {"coefs": rounded.to_dict()}

@property
def coefs(self):
    """Return the fitted model's ``feature_importances_`` as a Series indexed by feature name.

    Random forest exposes ``.feature_importances_`` instead of ``.coef_``.
    Values are left in the model's own feature order (no sorting).
    """
    fitted = self.model
    importances = fitted.feature_importances_
    return Series(importances, index=fitted.feature_names_in_)





if __name__ == "__main__":

from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
Expand Down
12 changes: 12 additions & 0 deletions app/classification/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier

from xgboost import XGBClassifier
from pandas import Series

from app.classification.pipeline import ClassificationPipeline

Expand Down Expand Up @@ -124,6 +125,17 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
}


@property
def explainability_json(self):
    """Return the values from ``self.coefs``, rounded to four places, keyed by feature name.

    Returns:
        dict: ``{"coefs": {feature_name: importance}}`` with plain Python
        floats as values.
    """
    # XGBoost's sklearn wrapper reports importances as float32. Rounding in
    # float32 and then widening to a Python float produces values such as
    # 0.12349999696016312, which defeats the 4-place rounding in the saved
    # output. Widen to float64 first so the rounded values come out clean.
    rounded = self.coefs.astype(float).round(4)
    return {"coefs": rounded.to_dict()}

@property
def coefs(self):
    """Return the fitted model's ``feature_importances_`` as a Series indexed by feature name.

    The classifier is read through ``.feature_importances_`` rather than
    ``.coef_``. Values are left in the model's own feature order (no sorting).
    """
    fitted = self.model
    importances = fitted.feature_importances_
    return Series(importances, index=fitted.feature_names_in_)




Expand Down
Loading

0 comments on commit 3c6505d

Please sign in to comment.