
Commit

Removed unused 'fold' parameter. Added documentation for this PR
stewarthe6 committed Jan 7, 2025
1 parent 3fb2f45 commit d309761
Showing 7 changed files with 288 additions and 27 deletions.
28 changes: 24 additions & 4 deletions atomsci/ddm/pipeline/model_datasets.py
@@ -251,6 +251,19 @@ class ModelDataset(object):
combined_train_valid_data (dc.Dataset): A dataset object (initialized as None), of the merged train
and valid splits
combined_train_valid_data (dc.NumpyDataset): Cache for combined training and validation data,
used by k-fold CV code
subset_response_dict (dictionary): Cache for subset-specific response values matched to IDs,
used by k-fold CV code
subset_weight_dict (dictionary): Cache for subset-specific weights matched to IDs,
used by k-fold CV code
untransformed_response_dict (dictionary): Cache for untransformed response values
matched to IDs, used by k-fold CV code
set in get_featurized_data:
dataset: A new featurized DeepChem Dataset.
@@ -316,7 +329,7 @@ def __init__(self, params, featurization):
self.combined_train_valid_data = None
# Cache for subset-specific response values matched to IDs, used by k-fold CV code
self.subset_response_dict = {}
# Cache for subset-specific response values matched to IDs, used by k-fold CV code
# Cache for subset-specific weights matched to IDs, used by k-fold CV code
self.subset_weight_dict = {}
# Cache for untransformed response values matched to IDs, used by k-fold CV code
self.untransformed_response_dict = {}
@@ -355,6 +368,7 @@ def get_featurized_data(self, params=None):
n_features: The count of features (int)
vals: The response col after featurization (np.array)
attr: A pd.dataframe containing the compound ids and smiles
untransformed_dataset: A NumpyDataset containing untransformed data
"""

if params is None:
@@ -692,8 +706,6 @@ def get_subset_responses_and_weights(self, subset):
Args:
subset (string): Label of subset, 'train', 'test', or 'valid'
transformers: Transformers object for full dataset
Returns:
tuple(response_dict, weight_dict)
(response_dict): dictionary mapping compound ids to arrays of per-task untransformed response values
@@ -718,8 +730,16 @@ def get_subset_responses_and_weights(self, subset):
# *************************************************************************************

def get_untransformed_responses(self, ids):
""" Returns a numpy array of untransformed response values
"""
Returns a numpy array of untransformed response values for the given IDs.
Parameters:
ids (list or np.ndarray): List or array of IDs for which to retrieve untransformed response values.
Returns:
np.ndarray: A numpy array of untransformed response values corresponding to the given IDs.
"""

response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
if len(self.untransformed_response_dict) == 0:
self.untransformed_response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
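The hunk above cuts off inside get_untransformed_responses; a minimal sketch of the lookup pattern it implies (the fill loop after the cache build is an assumption, since the diff is truncated):

import numpy as np

def get_untransformed_responses(self, ids):
    """Return an array of untransformed response values for the given IDs."""
    response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
    if len(self.untransformed_response_dict) == 0:
        # Build the ID -> response row cache once, on first use
        self.untransformed_response_dict = dict(
            zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
    for i, cid in enumerate(ids):
        # Assumed completion: look up each compound's cached response row
        response_vals[i] = self.untransformed_response_dict[cid]
    return response_vals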
51 changes: 33 additions & 18 deletions atomsci/ddm/pipeline/model_wrapper.py
@@ -239,11 +244,16 @@ class ModelWrapper(object):
output_dir (str): The parent path of the model directory
transformers (list): Initialized as an empty list, stores the transformers on the response cols
transformers (dict of lists): Initialized using transformers.get_blank_transformations.
Keyed using integer fold numbers or 'final', e.g., {0:[], 1:[], 'final':[]}.
Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
using all training and validation data. Without k-fold validation, transformers for 0 and 'final'
are the same.
transformers_x (list): Initialized as an empty list, stores the transformers on the features
transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
transformers_w (list): Initialized as an empty list, stores the transformers on the weights
transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
set in setup_model_dirs:
best_model_dir (str): The subdirectory under output_dir that contains the best model. Created in setup_model_dirs
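For orientation, a sketch of the fold-keyed layout these docstrings describe; the two-fold setup and the NormalizationTransformer are illustrative assumptions, not taken from the diff:

import deepchem as dc
import numpy as np

# Hypothetical fold-0 training subset, for illustration only
fold0_train = dc.data.NumpyDataset(X=np.random.rand(8, 4), y=np.random.rand(8, 1))

# Dict-of-lists layout keyed by fold number plus 'final'
transformers = {0: [], 1: [], 'final': []}
transformers[0].append(
    dc.trans.NormalizationTransformer(transform_y=True, dataset=fold0_train))
# Fold 1 and 'final' would be fitted on their own training subsets; without
# k-fold CV, the fold-0 and 'final' transformer lists end up identical.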
@@ -269,11 +274,17 @@ def __init__(self, params, featurizer, ds_client):
output_dir (str): The parent path of the model directory
transformers (list): Initialized as an empty list, stores the transformers on the response cols
transformers (dict of lists): Initialized using transformers.get_blank_transformations.
Keyed using integer fold numbers or 'final', e.g., {0:[], 1:[], 'final':[]}.
Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
using all training and validation data. Without k-fold validation, transformers for 0 and 'final'
are the same.
transformers_x (list): Initialized as an empty list, stores the transformers on the features
transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
transformers_w (list): Initialized as an empty list, stores the transformers on the weights
"""
self.params = params
@@ -328,7 +339,7 @@ def _create_output_transformers(self, dataset):
"""Initialize transformers for responses and persist them for later.
Args:
model_dataset: The ModelDataset object that handles the current dataset
dataset: A dc.Dataset object
Side effects:
Overwrites the attributes:
@@ -346,7 +357,7 @@ def _create_feature_transformers(self, dataset):
"""Initialize transformers for features, and persist them for later.
Args:
model_dataset: The ModelDataset object that handles the current dataset
dataset: A dc.Dataset object
Side effects:
Overwrites the attributes:
@@ -361,17 +372,21 @@ def create_transformers(self, training_datasets):
"""Initialize transformers for responses, features and weights, and persist them for later.
Args:
training_datasets: The ModelDataset object that handles the current dataset
training_datasets: A dictionary of dc.Datasets containing the training data from
each fold. Generated using transformers.get_all_training_datasets.
Side effects:
Overwrites the attributes:
transformers: A list of deepchem transformation objects on responses, only if conditions are met
transformers_x: A list of deepchem transformation objects on features, only if conditions are met.
transformers (dict of lists): Initialized using transformers.get_blank_transformations.
Keyed using integer fold numbers or 'final', e.g., {0:[], 1:[], 'final':[]}.
Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
using all training and validation data. Without k-fold validation, transformers for 0 and 'final'
are the same.
transformers_w: A list of deepchem transformation objects on weights, only if conditions are met.
transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
params.transformer_key: A string pointing to the dataset key containing the transformer in the datastore, or the path to the transformer
transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
"""
total_transformers = 0
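The rest of create_transformers is collapsed here; a hedged sketch of the per-fold fitting loop it implies, reusing the module-level helpers documented in transformations.py below (the exact wiring and the self.featurization attribute name are assumptions):

from atomsci.ddm.pipeline import transformations as trans

# Assumed sketch: fit one set of transformers per training subset, with keys
# matching transformations.get_transformer_keys.
for key, train_dset in training_datasets.items():
    # Assumed: response transformers are filled in by the documented helper
    self._create_output_transformers(train_dset)
    self.transformers_x[key] = trans.create_feature_transformers(
        self.params, self.featurization, train_dset)
    self.transformers_w[key] = trans.create_weight_transformers(self.params, train_dset)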
@@ -459,7 +474,7 @@ def transform_dataset(self, dataset, fold):
Args:
dataset: The DeepChem DiskDataset to be transformed
fold (int): Which fold is being transformed.
fold (int/str): Which fold is being transformed.
Returns:
transformed_dataset: The transformed DeepChem DiskDataset
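The body of transform_dataset is collapsed; the chained application it performs presumably reduces to the standard DeepChem pattern, sketched here under that assumption:

def transform_dataset(self, dataset, fold):
    # fold is an int during k-fold CV, or 'final' for the production model
    transformed = dataset
    for txfm in (self.transformers[fold] + self.transformers_x[fold]
                 + self.transformers_w[fold]):
        transformed = txfm.transform(transformed)
    return transformed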
@@ -511,7 +526,7 @@ def get_train_valid_pred_results(self, perf_data):
return perf_data.get_prediction_results()

# ****************************************************************************************
def get_test_perf_data(self, model_dir, model_dataset, fold):
def get_test_perf_data(self, model_dir, model_dataset):
"""Returns the predicted values and metrics for the current test dataset against
the version of the model stored in model_dir, as a PerfData object.
@@ -553,7 +568,7 @@ def get_test_pred_results(self, model_dir, model_dataset):
return perf_data.get_prediction_results()

# ****************************************************************************************
def get_full_dataset_perf_data(self, model_dataset, fold):
def get_full_dataset_perf_data(self, model_dataset):
"""Returns the predicted values and metrics from the current model for the full current dataset,
as a PerfData object.
@@ -1617,7 +1632,7 @@ def _create_output_transformers(self, dataset):
"""Initialize transformers for responses and persist them for later.
Args:
model_dataset: The ModelDataset object that handles the current dataset
dataset: The dc.Dataset object that contains the current training dataset
Side effects:
Overwrites the attributes:
25 changes: 22 additions & 3 deletions atomsci/ddm/pipeline/transformations.py
@@ -70,9 +70,12 @@ def create_feature_transformers(params, featurization, train_dset):
DeepChem transformer object holding its parameters.
Args:
params (argparse.namespace: Object containing the parameter list
params (argparse.namespace): Object containing the parameter list
featurization (featurization.Featurization): A Featurization object that will be used with
the train_dset object.
model_dataset (ModelDataset): Contains the dataset to be transformed.
train_dset (dc.Dataset): Contains the dataset used to fit the transformers.
Returns:
(list of DeepChem transformer objects): list of transformers for the feature matrix
@@ -102,7 +105,7 @@ def create_weight_transformers(params, dataset):
Args:
params (argparse.namespace): Object containing the parameter list
model_dataset (ModelDataset): Contains the dataset to be transformed.
dataset (dc.Dataset): Contains the dataset to be transformed.
Returns:
(list of DeepChem transformer objects): list of transformers for the weight matrix
@@ -146,6 +149,12 @@ def get_transformer_keys(params):
There is one set of transformers for each fold, plus one fitted on the
combined training and validation sets. AMPL automatically trains a final
model using all training and validation data at the end of the training loop.
Args:
params (argparse.namespace): Object containing the parameter list
Returns:
(list): A list of all keys used in transformer dictionaries.
"""
if params.split_strategy != 'k_fold_cv':
return [0, 'final']
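The k-fold branch of get_transformer_keys is collapsed in this hunk; a plausible reconstruction, assuming the fold count is carried in params.num_folds:

def get_transformer_keys(params):
    """Return the keys used in the per-fold transformer dictionaries."""
    if params.split_strategy != 'k_fold_cv':
        return [0, 'final']
    # Assumed reconstruction: one key per fold, plus 'final' for the model
    # retrained on the combined training and validation data.
    return list(range(params.num_folds)) + ['final']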
@@ -156,6 +165,9 @@
def get_blank_transformations():
"""Get empty transformations dictionary
These keys must always exist, even when there are no transformations
Returns:
(dict): A dictionary containing empty lists. Used when no transformers are needed
"""
return {0:[], 'final':[]}

@@ -165,6 +177,13 @@ def get_all_training_datasets(model_dataset):
This takes a model_dataset and returns a dictionary of all
datasets that will need a transformer. The keys will match
what is returned by get_transformer_keys
Args:
model_dataset: A model_datasets.ModelDataset object containing the current dataset.
Returns:
dict of dc.Datasets: A dictionary keyed by fold numbers and 'final'. Contains
the training data for each fold and the combined training+validation set used for the final model.
"""
result = {}
if model_dataset.splitting is None:
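The body of get_all_training_datasets is truncated after the splitting-is-None guard; a sketch of the dictionary its docstring promises, assuming the train_valid_dsets list of (train, valid) pairs seen in the tests below and the combined_train_valid_data cache documented above:

def get_all_training_datasets(model_dataset):
    """Return the training dataset to fit each transformer key against."""
    result = {}
    if model_dataset.splitting is None:
        # Assumed: with no split defined, only the 'final' key is populated
        result['final'] = model_dataset.dataset
        return result
    # Assumed reconstruction: keys mirror get_transformer_keys -- one training
    # subset per fold, plus the merged train+valid data under 'final'.
    for fold, (train_dset, _valid_dset) in enumerate(model_dataset.train_valid_dsets):
        result[fold] = train_dset
    # Assumed to be populated by this point in the pipeline
    result['final'] = model_dataset.combined_train_valid_data
    return result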
(fourth changed file; filename not shown in this view)
@@ -16,6 +16,9 @@
logger = logging.getLogger(__name__)

def test_balancing_transformer():
"""
Test the balancing transformer to ensure that it correctly adjusts weights for imbalanced datasets.
"""
dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv')

res_dir = tempfile.mkdtemp()
@@ -32,6 +35,10 @@ def test_all_transformers():
assert weight == 1

def test_all_transformers():
"""
Test all transformers to ensure they work correctly with the dataset.
"""
res_dir = tempfile.mkdtemp()
dskey = os.path.join(res_dir, 'special_test_dset.csv')
params = read_params(
@@ -88,13 +95,19 @@ def test_all_transformers():
assert (valid_weight1*valid_count1*4 - valid_weight2*valid_count2) < 1e-4

def make_pipeline(params):
"""
Generates a pipeline given parameters
"""
pparams = parse.wrapper(params)
model_pipeline = mp.ModelPipeline(pparams)
model_pipeline.train_model()

return model_pipeline

def make_pipeline_and_get_weights(params):
"""
Generates the pipeline and gets the weights given parameters
"""
model_pipeline = make_pipeline(params)
model_wrapper = model_pipeline.model_wrapper
train_dataset = model_pipeline.data.train_valid_dsets[0][0]
@@ -103,20 +116,37 @@ def make_pipeline_and_get_weights(params):
return transformed_data.w

def make_relative_to_file(relative_path):
"""
Generates the full path relative to the location of this file.
"""

script_path = os.path.dirname(os.path.realpath(__file__))
result = os.path.join(script_path, relative_path)

return result

def read_params(json_file, tmp_dskey, res_dir):
"""
Read parameters from a JSON file and update them with the dataset key and result directory.
Parameters:
json_file (str): Path to the JSON file containing parameters.
tmp_dskey (str): Temporary dataset key.
res_dir (str): Result directory.
Returns:
dict: Updated parameters.
"""
with open(json_file, 'r') as file:
params = json.load(file)
params['result_dir'] = res_dir
params['dataset_key'] = tmp_dskey
return params

def params_wo_balan(dset_key, res_dir):
# Train classification models without balancing weights. Repeat this several times so we can get some statistics on the performance metrics.
"""
Reads params for models without balancing weights.
"""
params = read_params(
make_relative_to_file('jsons/wo_balancing_transformer.json'),
dset_key,
@@ -125,7 +155,9 @@ def params_wo_balan(dset_key, res_dir):
return params

def params_w_balan(dset_key, res_dir):
# Now train models on the same dataset with balancing weights
"""
Reads params for models with balancing weights.
"""
params = read_params(
make_relative_to_file('jsons/balancing_transformer.json'),
dset_key,
@@ -135,6 +167,9 @@ def params_w_balan(dset_key, res_dir):
return params

def test_kfold_transformers():
"""
Test transformers for a kfold classification model
"""
res_dir = tempfile.mkdtemp()
dskey = os.path.join(res_dir, 'special_test_dset.csv')
params = read_params(
@@ -217,6 +252,10 @@ def test_kfold_transformers():
np.testing.assert_array_almost_equal(transformer_x.X_means, np.ones_like(transformer_x.X_means)*expected_mean)

def test_kfold_regression_transformers():
"""
Tests transformers for each fold of a kfold regression model. Ensures
that the transformers are correct for each fold.
"""
res_dir = tempfile.mkdtemp()
dskey = os.path.join(res_dir, 'special_test_dset.csv')
params = read_params(
(3 more changed files not shown in this view)