Skip to content

Commit

Permalink
exclude all-NaN columns from mordred features and impute the column mean for other NaNs to calculate AD index
Browse files Browse the repository at this point in the history
  • Loading branch information
paulsonak committed Jan 17, 2025
1 parent 3fb2f45 commit 3f80003
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion atomsci/ddm/pipeline/model_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
pred_data = self.predict_embedding(dset_df, dset_params=dset_params)
else:
pred_data = copy.deepcopy(self.data.dataset.X)

if self.featurization.descriptor_type=='mordred_filtered':
pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)

try:
if not hasattr(self, 'featurized_train_data'):
Expand All @@ -926,14 +930,17 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
train_dset = dc.data.NumpyDataset(train_X)
self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
else:
if self.featurization.descriptor_type=='mordred_filtered':
train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
self.featurized_train_data = train_X

if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric:
self.train_pair_dis = pairwise_distances(X=self.featurized_train_data, metric=dist_metric)
self.train_pair_dis_metric = dist_metric

self.log.debug("Calculating AD index.")

if AD_method == "local_density":
result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric)
else:
Expand Down

0 comments on commit 3f80003

Please sign in to comment.