From 3f8000342989b4ab86282de302f355f11f2be522 Mon Sep 17 00:00:00 2001 From: Amanda Paulson Date: Thu, 16 Jan 2025 16:42:43 -0800 Subject: [PATCH] exclude all-NaN columns from mordred features and impute the column mean for other NaNs to calculate AD index --- atomsci/ddm/pipeline/model_pipeline.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/atomsci/ddm/pipeline/model_pipeline.py b/atomsci/ddm/pipeline/model_pipeline.py index 7d49092a..2ae45c56 100644 --- a/atomsci/ddm/pipeline/model_pipeline.py +++ b/atomsci/ddm/pipeline/model_pipeline.py @@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= pred_data = self.predict_embedding(dset_df, dset_params=dset_params) else: pred_data = copy.deepcopy(self.data.dataset.X) + + if self.featurization.descriptor_type=='mordred_filtered': + pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)] + pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data) try: if not hasattr(self, 'featurized_train_data'): @@ -926,6 +930,9 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= train_dset = dc.data.NumpyDataset(train_X) self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset) else: + if self.featurization.descriptor_type=='mordred_filtered': + train_X = train_X[:,~np.isnan(train_X).all(axis=0)] + train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X) self.featurized_train_data = train_X if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric: @@ -933,7 +940,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses= self.train_pair_dis_metric = dist_metric self.log.debug("Calculating AD index.") - + if AD_method == "local_density": result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric) else: