Commit 94f93d0

Merge pull request #385 from ATOMScience-org/bug_transformer_fitting

Transformers are now fitted using only the training data.

stewarthe6 authored Jan 15, 2025
2 parents eadd9d4 + d309761, commit 94f93d0

Showing 20 changed files with 1,673 additions and 463 deletions.
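The bug class this merge addresses: when a feature or response transformer is fitted on the full dataset, statistics from the validation and test folds leak into training. A minimal sketch of the corrected pattern, in plain numpy rather than AMPL's own transformer classes:

    import numpy as np

    rng = np.random.default_rng(0)
    y = rng.normal(loc=5.0, scale=2.0, size=100)
    train, test = y[:80], y[80:]

    # Fit the normalization statistics on the training split only...
    mu, sigma = train.mean(), train.std()

    # ...then apply the same transform to every split.
    train_z = (train - mu) / sigma
    test_z = (test - mu) / sigma

    # Model outputs in z-space are mapped back ("untransformed")
    # using the same training-split statistics.
    test_pred_orig = test_z * sigma + mu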
atomsci/ddm/pipeline/featurization.py (2 changes: 1 addition & 1 deletion)

@@ -1021,7 +1021,7 @@ def featurize_data(self, dset_df, params, contains_responses):
weights = input_dataset.w
attr = input_model_dataset.attr

- input_dataset = self.embedding_pipeline.model_wrapper.transform_dataset(input_dataset)
+ input_dataset = self.embedding_pipeline.model_wrapper.transform_dataset(input_dataset, fold='final')

# Run the embedding model to generate features.
embedding = self.embedding_pipeline.model_wrapper.generate_embeddings(input_dataset)
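The new fold argument selects which set of fitted transformers transform_dataset applies; 'final' refers to the transformers fitted on the training data for the final model, as opposed to the per-fold transformers used during k-fold CV. A hypothetical sketch of the idea (class, attribute, and key names are illustrative assumptions, not AMPL's actual model_wrapper API):

    # Hypothetical sketch of a fold-keyed transformer store.
    class FoldTransformerStore:
        def __init__(self):
            # e.g. {0: [...], 1: [...], 'final': [...]}; each value is a
            # list of transformer objects fitted on that fold's training data.
            self.transformers = {}

        def transform_dataset(self, dataset, fold):
            # Apply, in order, the transformers fitted for the requested fold.
            for transformer in self.transformers.get(fold, []):
                dataset = transformer.transform(dataset)
            return dataset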
atomsci/ddm/pipeline/model_datasets.py (54 changes: 47 additions & 7 deletions)

@@ -6,7 +6,6 @@
from deepchem.data import NumpyDataset
import numpy as np
import pandas as pd
- import deepchem as dc
import uuid
from atomsci.ddm.pipeline import featurization as feat
from atomsci.ddm.pipeline import splitting as split
@@ -252,6 +251,19 @@ class ModelDataset(object):
- combined_train_valid_data (dc.Dataset): A dataset object (initialized as None), of the merged train
- and valid splits
+ combined_train_valid_data (dc.NumpyDataset): Cache for combined training and validation data,
+ used by k-fold CV code
+ subset_response_dict (dictionary): Cache for subset-specific response values matched to IDs,
+ used by k-fold CV code
+ subset_weight_dict (dictionary): Cache for subset-specific weights matched to IDs,
+ used by k-fold CV code
+ untransformed_response_dict (dictionary): Cache for untransformed response values
+ matched to IDs, used by k-fold CV code
set in get_featurized_data:
dataset: A new featurized DeepChem Dataset.
@@ -317,8 +329,11 @@ def __init__(self, params, featurization):
self.combined_train_valid_data = None
# Cache for subset-specific response values matched to IDs, used by k-fold CV code
self.subset_response_dict = {}
- # Cache for subset-specific response values matched to IDs, used by k-fold CV code
+ # Cache for subset-specific weights matched to IDs, used by k-fold CV code
self.subset_weight_dict = {}
+ # Cache for untransformed response values matched to IDs, used by k-fold CV code
+ self.untransformed_response_dict = {}


# ****************************************************************************************
def load_full_dataset(self):
@@ -353,6 +368,7 @@ def get_featurized_data(self, params=None):
n_features: The count of features (int)
vals: The response col after featurization (np.array)
attr: A pd.dataframe containing the compound ids and smiles
+ untransformed_dataset: A NumpyDataset containing untransformed data
"""

if params is None:
@@ -379,6 +395,7 @@
if params.prediction_type=='classification':
w = w.astype(np.float32)

+ self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
self.log.info("Using prefeaturized data; number of features = " + str(self.n_features))
return
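get_featurized_data now keeps a pristine, untransformed copy of the featurized data alongside self.dataset, so untransformed response values can be recovered later without undoing transformers. The pattern in miniature, using DeepChem's NumpyDataset with made-up values:

    import numpy as np
    from deepchem.data import NumpyDataset

    features = np.random.rand(4, 8)            # 4 compounds, 8 features
    vals = np.random.rand(4, 1)                # 1 response column
    ids = np.array(['c1', 'c2', 'c3', 'c4'])

    # Copy kept aside before any transformers run...
    untransformed_dataset = NumpyDataset(features, vals, ids=ids)
    # ...and the working copy that transformers will later overwrite.
    dataset = NumpyDataset(features, vals, ids=ids)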
@@ -404,6 +421,7 @@
self.log.debug("Number of features: " + str(self.n_features))

# Create the DeepChem dataset
+ self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
# Checking for minimum number of rows
if len(self.dataset) < params.min_compound_number:
@@ -681,15 +699,13 @@ def has_all_feature_columns(self, dset_df):

# *************************************************************************************

- def get_subset_responses_and_weights(self, subset, transformers):
+ def get_subset_responses_and_weights(self, subset):
"""Returns a dictionary mapping compound IDs in the given dataset subset to arrays of response values
and weights. Used by the perf_data module under k-fold CV.
Args:
subset (string): Label of subset, 'train', 'test', or 'valid'
- transformers: Transformers object for full dataset
Returns:
tuple(response_dict, weight_dict)
(response_dict): dictionary mapping compound ids to arrays of per-task untransformed response values
@@ -703,16 +719,38 @@ def get_subset_responses_and_weights(self, subset):
else:
raise ValueError('Unknown dataset subset type "%s"' % subset)

- y = dc.trans.undo_transforms(dataset.y, transformers)
+ response_vals = dict(zip(dataset.ids, self.get_untransformed_responses(dataset.ids)))

w = dataset.w
- response_vals = dict([(id, y[i,:]) for i, id in enumerate(dataset.ids)])
weights = dict([(id, w[i,:]) for i, id in enumerate(dataset.ids)])
self.subset_response_dict[subset] = response_vals
self.subset_weight_dict[subset] = weights
return self.subset_response_dict[subset], self.subset_weight_dict[subset]

+ # *************************************************************************************
+
+ def get_untransformed_responses(self, ids):
+     """Returns a numpy array of untransformed response values for the given IDs.
+
+     Parameters:
+         ids (list or np.ndarray): List or array of IDs for which to retrieve untransformed response values.
+
+     Returns:
+         np.ndarray: A numpy array of untransformed response values corresponding to the given IDs.
+     """
+     response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
+     if len(self.untransformed_response_dict) == 0:
+         self.untransformed_response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
+
+     for i, id in enumerate(ids):
+         response_vals[i] = self.untransformed_response_dict[id]
+
+     return response_vals
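The new method builds its ID-to-response cache lazily on the first call and reuses it afterwards. The same pattern in miniature (standalone, with invented IDs and values):

    import numpy as np

    ids = np.array(['c1', 'c2', 'c3'])
    y = np.array([[1.0], [2.0], [3.0]])
    cache = {}

    def get_untransformed(requested_ids):
        # Build the lookup dict once, on first use; reuse it afterwards.
        if len(cache) == 0:
            cache.update(zip(ids, y))
        return np.array([cache[i] for i in requested_ids])

    print(get_untransformed(['c3', 'c1']))   # [[3.] [1.]]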

# *************************************************************************************

def _get_split_key(self):
"""Creates the proper CSV name for a split file
@@ -828,6 +866,8 @@ def get_featurized_data(self, dset_df, is_featurized=False):
params, self.contains_responses)
self.log.warning("Done")
self.n_features = self.featurization.get_feature_count()

+ self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids)

# ****************************************************************************************
atomsci/ddm/pipeline/model_pipeline.py (15 changes: 3 additions & 12 deletions)

@@ -301,17 +301,10 @@ def load_featurize_data(self, params=None):
# is fitted to the training data only. The transformers are then applied to the training,
# validation and test sets separately.
if not params.split_only:
- self.model_wrapper.create_transformers(self.data)
+ self.model_wrapper.create_transformers(trans.get_all_training_datasets(self.data))
else:
self.run_mode = ''

- if self.run_mode == 'training':
-     for i, (train, valid) in enumerate(self.data.train_valid_dsets):
-         train = self.model_wrapper.transform_dataset(train)
-         valid = self.model_wrapper.transform_dataset(valid)
-         self.data.train_valid_dsets[i] = (train, valid)
-     self.data.test_dset = self.model_wrapper.transform_dataset(self.data.test_dset)
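With this change, create_transformers sees only training data: trans.get_all_training_datasets presumably gathers the training half of each train/valid fold, plus the final training set, so a transformer can be fitted per fold, and the explicit transform loop removed above is no longer needed. A hypothetical sketch of the gathering step (the function body and the combined_training_data helper are assumptions, not the actual pipeline.transformations code):

    def get_all_training_datasets_sketch(model_dataset):
        # Map each fold index to its training subset; 'final' maps to the
        # training data used for the last retraining pass (assumed helper).
        train_datasets = {}
        for fold, (train, _valid) in enumerate(model_dataset.train_valid_dsets):
            train_datasets[fold] = train
        train_datasets['final'] = model_dataset.combined_training_data()
        return train_datasets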

# ****************************************************************************************

def create_model_metadata(self):
@@ -863,8 +856,6 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
raise Exception("response_cols missing from model params")
# Get features for each compound and construct a DeepChem Dataset from them
self.data.get_featurized_data(dset_df, is_featurized)
- # Transform the features and responses if needed
- self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)

# Note that at this point, the dataset may contain fewer rows than the input. Typically this happens because
# of invalid SMILES strings. Remove any rows from the input dataframe corresponding to SMILES strings that were
@@ -995,7 +986,7 @@ def predict_embedding(self, dset_df, dset_params=None):
self.data = model_datasets.create_minimal_dataset(self.params, self.featurization)
self.data.get_featurized_data(dset_df, is_featurized=False)
# Not sure the following is necessary
- self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+ self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

# Get the embeddings as a numpy array
embeddings = self.model_wrapper.generate_embeddings(self.data.dataset)
@@ -1577,7 +1568,7 @@ def ensemble_predict(model_uuids, collections, dset_df, labels=None, dset_params
raise Exception("response_cols missing from model params")
is_featurized = (len(set(pipe.featurization.get_feature_columns()) - set(dset_df.columns.values)) == 0)
pipe.data.get_featurized_data(dset_df, is_featurized)
- pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset)
+ pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset, fold='final')

# Create a temporary data frame to hold the compound IDs and predictions. The model may not
# return predictions for all the requested compounds, so we have to outer join the predictions
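The comment above notes that the model may return predictions for only some of the requested compounds, hence the outer join. A small pandas illustration of that bookkeeping (column names invented):

    import pandas as pd

    input_df = pd.DataFrame({'compound_id': ['c1', 'c2', 'c3'],
                             'smiles': ['CCO', 'c1ccccc1', 'CC(=O)O']})
    pred_df = pd.DataFrame({'compound_id': ['c1', 'c3'],
                            'pred': [0.12, 0.87]})

    # The outer join keeps compounds with no returned prediction;
    # their 'pred' column is left as NaN.
    result_df = input_df.merge(pred_df, on='compound_id', how='outer')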
(Diffs for the remaining 17 changed files are not shown.)
