From d3097619f4dc09efd1457dfbac2e1008ffac5e71 Mon Sep 17 00:00:00 2001
From: "he6@llnl.gov"
Date: Tue, 7 Jan 2025 14:39:52 -0800
Subject: [PATCH] Removed unused 'fold' parameter. Added documentation for this
 PR

---
 atomsci/ddm/pipeline/model_datasets.py     | 28 ++++++--
 atomsci/ddm/pipeline/model_wrapper.py      | 51 +++++++++-----
 atomsci/ddm/pipeline/transformations.py    | 25 ++++++-
 .../test_balancing_transformer.py          | 43 +++++++++++-
 .../test/integrative/make_test_datasets.py | 37 ++++++++++
 atomsci/ddm/test/unit/test_perf_data.py    | 61 ++++++++++++++++
 atomsci/ddm/test/unit/test_transformers.py | 70 +++++++++++++++++++
 7 files changed, 288 insertions(+), 27 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py
index e89b45bb..8ebd7aa8 100644
--- a/atomsci/ddm/pipeline/model_datasets.py
+++ b/atomsci/ddm/pipeline/model_datasets.py
@@ -251,6 +251,19 @@ class ModelDataset(object):
         combined_train_valid_data (dc.Dataset): A dataset object (initialized as None), of the merged train
         and valid splits
 
+        combined_train_valid_data (dc.NumpyDataset): Cache for combined training and validation data,
+        used by k-fold CV code
+
+        subset_response_dict (dictionary): Cache for subset-specific response values matched to IDs,
+        used by k-fold CV code
+
+        subset_weight_dict (dictionary): Cache for subset-specific weights matched to IDs,
+        used by k-fold CV code
+
+        untransformed_response_dict (dictionary): Cache for untransformed response values
+        matched to IDs, used by k-fold CV code
+
+
     set in get_featurized_data:
 
         dataset: A new featurized DeepChem Dataset.
@@ -316,7 +329,7 @@ def __init__(self, params, featurization):
         self.combined_train_valid_data = None
         # Cache for subset-specific response values matched to IDs, used by k-fold CV code
         self.subset_response_dict = {}
-        # Cache for subset-specific response values matched to IDs, used by k-fold CV code
+        # Cache for subset-specific weights matched to IDs, used by k-fold CV code
         self.subset_weight_dict = {}
         # Cache for untransformed response values matched to IDs, used by k-fold CV code
         self.untransformed_response_dict = {}
@@ -355,6 +368,7 @@ def get_featurized_data(self, params=None):
             n_features: The count of features (int)
             vals: The response col after featurization (np.array)
             attr: A pd.dataframe containing the compound ids and smiles
+            untransformed_dataset: A NumpyDataset containing untransformed data
 
         """
         if params is None:
@@ -692,8 +706,6 @@ def get_subset_responses_and_weights(self, subset):
         Args:
             subset (string): Label of subset, 'train', 'test', or 'valid'
 
-            transformers: Transformers object for full dataset
-
         Returns:
             tuple(response_dict, weight_dict)
                 (response_dict): dictionary mapping compound ids to arrays of per-task untransformed response values
@@ -718,8 +730,16 @@ def get_subset_responses_and_weights(self, subset):
 
     # *************************************************************************************
     def get_untransformed_responses(self, ids):
-        """ Returns a numpy array of untransformed response values
         """
+        Returns a numpy array of untransformed response values for the given IDs.
+
+        Parameters:
+            ids (list or np.ndarray): List or array of IDs for which to retrieve untransformed response values.
+
+        Returns:
+            np.ndarray: A numpy array of untransformed response values corresponding to the given IDs.
+        """
+
         response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
         if len(self.untransformed_response_dict) == 0:
             self.untransformed_response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
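For readers new to this code path, the ID-keyed response cache that `get_untransformed_responses` documents above can be pictured with the following minimal sketch. This is hypothetical standalone code, not AMPL's implementation: the dict is built once from the untransformed dataset, then answers lookups for any subset of IDs in any order.

```python
import numpy as np

ids = np.array(['c1', 'c2', 'c3'])
y_untransformed = np.array([[5.1], [6.2], [7.3]])   # one response task

# Built on first use, then reused by the k-fold CV code.
response_dict = dict(zip(ids, y_untransformed))

def get_untransformed_responses(query_ids):
    """Return untransformed responses for query_ids, in query order."""
    vals = np.zeros((len(query_ids), y_untransformed.shape[1]))
    for i, cid in enumerate(query_ids):
        vals[i] = response_dict[cid]
    return vals

print(get_untransformed_responses(['c3', 'c1']))    # [[7.3], [5.1]]
```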
diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py
index b4e4d4ab..59e515b6 100644
--- a/atomsci/ddm/pipeline/model_wrapper.py
+++ b/atomsci/ddm/pipeline/model_wrapper.py
@@ -239,11 +239,16 @@ class ModelWrapper(object):
 
         output_dir (str): The parent path of the model directory
 
-        transformers (list): Initialized as an empty list, stores the transformers on the response cols
+        transformers (dict of lists): Initialized using transformers.get_blank_transformations.
+        Keyed using integer fold numbers or 'final', e.g. {0:[], 1:[], 'final':[]}.
+        Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
+        the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
+        using all training and validation data. Without k-fold validation, the transformers for 0 and 'final'
+        are the same.
 
-        transformers_x (list): Initialized as an empty list, stores the transformers on the features
+        transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
 
-        transformers_w (list): Initialized as an empty list, stores the transformers on the weights
+        transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
 
     set in setup_model_dirs:
         best_model_dir (str): The subdirectory under output_dir that contains the best model. Created in setup_model_dirs
@@ -269,11 +274,17 @@ def __init__(self, params, featurizer, ds_client):
 
             output_dir (str): The parent path of the model directory
 
-            transformers (list): Initialized as an empty list, stores the transformers on the response cols
+            transformers (dict of lists): Initialized using transformers.get_blank_transformations.
+            Keyed using integer fold numbers or 'final', e.g. {0:[], 1:[], 'final':[]}.
+            Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
+            the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
+            using all training and validation data. Without k-fold validation, the transformers for 0 and 'final'
+            are the same.
 
-            transformers_x (list): Initialized as an empty list, stores the transformers on the features
+            transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
+
+            transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
 
-            transformers_w (list): Initialized as an empty list, stores the transformers on the weights
 
         """
         self.params = params
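The per-fold transformer dictionaries described above are easiest to see with a small standalone sketch. `MeanCenter` here is a toy stand-in for a fitted DeepChem response transformer, not an AMPL class; the dict layout (one list per fold, plus 'final' fitted on train+valid together) mirrors the docstring:

```python
class MeanCenter:
    """Toy stand-in for a fitted DeepChem response transformer."""
    def __init__(self, y):
        self.mean = sum(y) / len(y)
    def transform(self, y):
        return [v - self.mean for v in y]

# One transformer list per fold, plus 'final' fitted on all train+valid data.
fold_ys = {0: [1.0, 2.0, 3.0], 1: [2.0, 4.0, 6.0]}
transformers = {fold: [MeanCenter(y)] for fold, y in fold_ys.items()}
transformers['final'] = [MeanCenter(fold_ys[0] + fold_ys[1])]

def transform_y(fold, y):
    """Apply a fold's response transformers in order."""
    for t in transformers[fold]:
        y = t.transform(y)
    return y

print(transform_y(0, [3.0]))        # [1.0]  (fold-0 mean is 2.0)
print(transform_y('final', [3.0]))  # [0.0]  (mean over all data is 3.0)
```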
@@ -328,7 +339,7 @@ def _create_output_transformers(self, dataset):
         """Initialize transformers for responses and persist them for later.
 
         Args:
-            model_dataset: The ModelDataset object that handles the current dataset
+            dataset: A dc.Dataset object
 
         Side effects:
             Overwrites the attributes:
@@ -346,7 +357,7 @@ def _create_feature_transformers(self, dataset):
         """Initialize transformers for features, and persist them for later.
 
         Args:
-            model_dataset: The ModelDataset object that handles the current dataset
+            dataset: A dc.Dataset object
 
         Side effects:
             Overwrites the attributes:
@@ -361,17 +372,21 @@ def create_transformers(self, training_datasets):
         """Initialize transformers for responses, features and weights, and persist them for later.
 
         Args:
-            training_datasets: The ModelDataset object that handles the current dataset
+            training_datasets: A dictionary of dc.Datasets containing the training data from
+            each fold. Generated using transformers.get_all_training_datasets.
 
         Side effects:
             Overwrites the attributes:
-            transformers: A list of deepchem transformation objects on responses, only if conditions are met
-
-            transformers_x: A list of deepchem transformation objects on features, only if conditions are met.
+            transformers (dict of lists): Initialized using transformers.get_blank_transformations.
+            Keyed using integer fold numbers or 'final', e.g. {0:[], 1:[], 'final':[]}.
+            Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
+            the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
+            using all training and validation data. Without k-fold validation, the transformers for 0 and 'final'
+            are the same.
+
 
-            transformers_w: A list of deepchem transformation objects on weights, only if conditions are met.
+            transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
 
-            params.transformer_key: A string pointing to the dataset key containing the transformer in the datastore, or the path to the transformer
+            transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
 
         """
         total_transformers = 0
@@ -459,7 +474,7 @@ def transform_dataset(self, dataset, fold):
 
         Args:
             dataset: The DeepChem DiskDataset that contains a dataset
-            fold (int): Which fold is being transformed.
+            fold (int/str): Which fold is being transformed.
 
         Returns:
             transformed_dataset: The transformed DeepChem DiskDataset
@@ -511,7 +526,7 @@ def get_train_valid_pred_results(self, perf_data):
         return perf_data.get_prediction_results()
 
     # ****************************************************************************************
-    def get_test_perf_data(self, model_dir, model_dataset, fold):
+    def get_test_perf_data(self, model_dir, model_dataset):
         """Returns the predicted values and metrics for the current test dataset against
         the version of the model stored in model_dir, as a PerfData object.
@@ -553,7 +568,7 @@ def get_test_pred_results(self, model_dir, model_dataset):
         return perf_data.get_prediction_results()
 
     # ****************************************************************************************
-    def get_full_dataset_perf_data(self, model_dataset, fold):
+    def get_full_dataset_perf_data(self, model_dataset):
         """Returns the predicted values and metrics from the current model for the full
         current dataset, as a PerfData object.
@@ -1617,7 +1632,7 @@ def _create_output_transformers(self, dataset):
         """Initialize transformers for responses and persist them for later.
 
         Args:
-            model_dataset: The ModelDataset object that handles the current dataset
+            dataset: The dc.Dataset object that contains the current training dataset
 
         Side effects:
             Overwrites the attributes:
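Taken together, `create_transformers` fits one transformer set per key in `training_datasets`, and `transform_dataset` looks the set up by fold. A minimal sketch of that flow, assuming a simple normalization over 1-D response arrays (illustrative names and shapes, not AMPL's actual API):

```python
import numpy as np

training_datasets = {
    0: np.array([1.0, 2.0, 3.0]),        # fold 0 training responses
    1: np.array([2.0, 4.0, 6.0]),        # fold 1 training responses
    'final': np.array([1.0, 2.0, 3.0, 2.0, 4.0, 6.0]),  # train+valid
}

# "Fit" one normalization transformer per key, mirroring the dict layout.
transformers = {k: {'mean': y.mean(), 'std': y.std()}
                for k, y in training_datasets.items()}

def transform_dataset(y, fold):
    """Normalize y using the transformer fitted on the given fold's training data."""
    t = transformers[fold]
    return (y - t['mean']) / t['std']

print(transform_dataset(np.array([3.0]), fold=0))        # fold-0 statistics
print(transform_dataset(np.array([3.0]), fold='final'))  # train+valid statistics
```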
Args: - params (argparse.namespace: Object containing the parameter list + params (argparse.namespace): Object containing the parameter list + + featurization (featurization.Featurization): A Featurization object that will be used with + the train_dset object. - model_dataset (ModelDataset): Contains the dataset to be transformed. + train_dset (dc.Dataset): Contains the dataset used to fit the the transformers. Returns: (list of DeepChem transformer objects): list of transformers for the feature matrix @@ -102,7 +105,7 @@ def create_weight_transformers(params, dataset): Args: params (argparse.namespace: Object containing the parameter list - model_dataset (ModelDataset): Contains the dataset to be transformed. + dataset (dc.Dataset): Contains the dataset to be transformed. Returns: (list of DeepChem transformer objects): list of transformers for the weight matrix @@ -146,6 +149,12 @@ def get_transformer_keys(params): There is one set of transformers for each fold and then one transformer for both validation and training sets. AMPL automatically trains a model using all validation and training data at the end of the training loop. + + Args: + params (argparse.namespace: Object containing the parameter list + + Returns: + (list): A list of all keys used in transformer dictionaries. """ if params.split_strategy != 'k_fold_cv': return [0, 'final'] @@ -156,6 +165,9 @@ def get_transformer_keys(params): def get_blank_transformations(): """Get empty transformations dictionary These keys must always exist, even when there are no transformations + + Returns: + (dict): A dictionary containing empty lists. Used when no transformers are needed """ return {0:[], 'final':[]} @@ -165,6 +177,13 @@ def get_all_training_datasets(model_dataset): This takes a model_dataset and returns a dictionary of all datasets that will need a transformer. The keys will match what is returned by get_transformer_keys + + Args: + model_dataset: A model_datasets.ModelDataset object containing the current dataset. + + Returns: + dict of dc.Datasets: A dictionary keyed using keys fold numbers and 'final'. Contains + the training data for each fold and the final training+validation training set. """ result = {} if model_dataset.splitting is None: diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index 7ddf0b2d..f09369e7 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -16,6 +16,9 @@ logger = logging.getLogger(__name__) def test_balancing_transformer(): + """ + Test the balancing transformer to ensure that it correctly adjusts weights for imbalanced datasets. + """ dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv') res_dir = tempfile.mkdtemp() @@ -32,6 +35,10 @@ def test_balancing_transformer(): assert weight == 1 def test_all_transformers(): + """ + Test all transformers to ensure they work correctly with the dataset. 
diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py
index 7ddf0b2d..f09369e7 100644
--- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py
+++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py
@@ -16,6 +16,9 @@ logger = logging.getLogger(__name__)
 
 def test_balancing_transformer():
+    """
+    Test the balancing transformer to ensure that it correctly adjusts weights for imbalanced datasets.
+    """
     dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv')
 
     res_dir = tempfile.mkdtemp()
@@ -32,6 +35,10 @@ def test_balancing_transformer():
         assert weight == 1
 
 def test_all_transformers():
+    """
+    Test all transformers to ensure they work correctly with the dataset.
+
+    """
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(res_dir, 'special_test_dset.csv')
     params = read_params(
@@ -88,6 +95,9 @@ def test_all_transformers():
         assert (valid_weight1*valid_count1*4 - valid_weight2*valid_count2) < 1e-4
 
 def make_pipeline(params):
+    """
+    Generates a pipeline given parameters.
+    """
     pparams = parse.wrapper(params)
     model_pipeline = mp.ModelPipeline(pparams)
     model_pipeline.train_model()
 
     return model_pipeline
 
 def make_pipeline_and_get_weights(params):
+    """
+    Generates the pipeline and gets the weights given parameters.
+    """
     model_pipeline = make_pipeline(params)
     model_wrapper = model_pipeline.model_wrapper
     train_dataset = model_pipeline.data.train_valid_dsets[0][0]
@@ -103,12 +116,27 @@ def make_pipeline_and_get_weights(params):
     return transformed_data.w
 
 def make_relative_to_file(relative_path):
+    """
+    Generates the full path relative to the location of this file.
+    """
+
     script_path = os.path.dirname(os.path.realpath(__file__))
     result = os.path.join(script_path, relative_path)
 
     return result
 
 def read_params(json_file, tmp_dskey, res_dir):
+    """
+    Read parameters from a JSON file and update them with the dataset key and result directory.
+
+    Parameters:
+        json_file (str): Path to the JSON file containing parameters.
+        tmp_dskey (str): Temporary dataset key.
+        res_dir (str): Result directory.
+
+    Returns:
+        dict: Updated parameters.
+    """
     with open(json_file, 'r') as file:
         params = json.load(file)
     params['result_dir'] = res_dir
@@ -116,7 +144,9 @@ def read_params(json_file, tmp_dskey, res_dir):
     return params
 
 def params_wo_balan(dset_key, res_dir):
-    # Train classification models without balancing weights. Repeat this several times so we can get some statistics on the performance metrics.
+    """
+    Reads params for models without balancing weights.
+    """
     params = read_params(
         make_relative_to_file('jsons/wo_balancing_transformer.json'),
         dset_key,
@@ -125,7 +155,9 @@ def params_wo_balan(dset_key, res_dir):
     return params
 
 def params_w_balan(dset_key, res_dir):
-    # Now train models on the same dataset with balancing weights
+    """
+    Reads params for models with balancing weights.
+    """
     params = read_params(
         make_relative_to_file('jsons/balancing_transformer.json'),
         dset_key,
@@ -135,6 +167,9 @@ def params_w_balan(dset_key, res_dir):
     return params
 
 def test_kfold_transformers():
+    """
+    Test transformers for a k-fold classification model.
+    """
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(res_dir, 'special_test_dset.csv')
     params = read_params(
@@ -217,6 +252,10 @@ def test_kfold_transformers():
         np.testing.assert_array_almost_equal(transformer_x.X_means, np.ones_like(transformer_x.X_means)*expected_mean)
 
 def test_kfold_regression_transformers():
+    """
+    Tests transformers for each fold of a k-fold regression model. Ensures
+    that the transformers are correct for each fold.
+    """
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(res_dir, 'special_test_dset.csv')
     params = read_params(
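The weight assertions in these tests rest on inverse-frequency balancing: after weighting, each class should contribute the same total weight. A quick standalone check of that arithmetic, using one common balancing scheme (illustrative, not the test's code):

```python
import numpy as np

# With a 4:1 majority:minority split, the minority class gets ~4x the weight,
# so each class contributes equal total weight.
labels = np.array([0]*80 + [1]*20)                 # 4:1 imbalance
counts = np.bincount(labels)                       # [80, 20]
weights_per_class = len(labels) / (2.0 * counts)   # [0.625, 2.5]

w = weights_per_class[labels]                      # per-sample weights
total_majority = w[labels == 0].sum()              # 80 * 0.625 = 50
total_minority = w[labels == 1].sum()              # 20 * 2.5   = 50
assert abs(total_majority - total_minority) < 1e-8
print(weights_per_class)
```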
diff --git a/atomsci/ddm/test/integrative/make_test_datasets.py b/atomsci/ddm/test/integrative/make_test_datasets.py
index e6ed8764..2b7767db 100644
--- a/atomsci/ddm/test/integrative/make_test_datasets.py
+++ b/atomsci/ddm/test/integrative/make_test_datasets.py
@@ -19,6 +19,9 @@ def get_absolute_path(relative_path):
     return absolute_path
 
 def get_features(feature_type):
+    """
+    Gets the feature columns given a feature_type, e.g. rdkit_raw.
+    """
 
     desc_spec_file = get_absolute_path('../../data/descriptor_sets_sources_by_descr_type.csv')
     desc_spec_df = pd.read_csv(desc_spec_file, index_col=False)
@@ -157,6 +160,17 @@ def make_split_df(train_ids, valid_ids, test_ids):
     return split_df
 
 def make_test_dataset_and_split(dataset_key, feature_types):
+    """
+    Given a dataset key and a feature type, create a featurized
+    csv file and split.
+
+    Args:
+        dataset_key (str): Where to save the newly generated test DataFrame
+
+        feature_types (str): A feature type, e.g. rdkit_raw
+
+
+    """
     features = get_features(feature_types)
     df, train_ids, valid_ids, test_ids = make_test_dataset(features)
@@ -211,6 +225,17 @@ def make_kfold_test_dataset(features, fold_size=1000, num_test=1000, num_folds=5
     return df, train_valid_ids, test_ids
 
 def make_kfold_split_df(train_valid_ids, test_ids):
+    """
+    Given lists of train_valid_ids and test_ids create a split DataFrame
+
+    Args:
+        train_valid_ids (list of lists): A list of ids
+
+        test_ids (list): Ids for test compounds
+
+    Returns:
+        DataFrame: A split DataFrame that can be read by AMPL
+    """
     fold_dfs = []
     for i, tvi in enumerate(train_valid_ids):
         data = {'cmpd_id':tvi}
@@ -230,6 +255,18 @@ def make_kfold_split_df(train_valid_ids, test_ids):
     return split_df
 
 def make_kfold_dataset_and_split(dataset_key, feature_types, num_folds=3):
+    """
+    Given a dataset key, a feature type, and a number of folds, create a featurized
+    csv file and split.
+
+    Args:
+        dataset_key (str): Where to save the newly generated test DataFrame
+
+        feature_types (str): A feature type, e.g. rdkit_raw
+
+        num_folds (int): Number of folds
+
+    """
     features = get_features(feature_types)
     df, train_ids, test_ids = make_kfold_test_dataset(features, num_folds=num_folds)
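The split DataFrame returned by `make_kfold_split_df` pairs each compound ID with a subset label and fold index. A hedged sketch of the table shape follows; the 'cmpd_id' column name is taken from the code above, while the 'subset'/'fold' columns and the 'train_valid'/'test' labels are assumptions based on AMPL's split-file convention:

```python
import pandas as pd

train_valid_ids = [['c1', 'c2'], ['c3', 'c4']]   # one id list per fold
test_ids = ['c5']

rows = []
for fold, ids in enumerate(train_valid_ids):
    rows += [{'cmpd_id': cid, 'subset': 'train_valid', 'fold': fold} for cid in ids]
rows += [{'cmpd_id': cid, 'subset': 'test', 'fold': 0} for cid in test_ids]

split_df = pd.DataFrame(rows)
print(split_df)   # one row per compound: id, subset label, fold index
```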
diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py
index 20275fe2..f5752abc 100644
--- a/atomsci/ddm/test/unit/test_perf_data.py
+++ b/atomsci/ddm/test/unit/test_perf_data.py
@@ -10,10 +10,28 @@ import json
 
 def copy_to_temp(dskey, res_dir):
+    """
+    Copy a dataset to a temporary directory.
+
+    Parameters:
+        dskey (str): Path to the original dataset.
+        res_dir (str): Path to the temporary directory.
+
+    Returns:
+        str: Path to the copied dataset in the temporary directory.
+    """
     new_dskey = shutil.copy(dskey, res_dir)
     return new_dskey
 
 def setup_paths():
+    """
+    Set up the paths for the test, including creating a temporary directory and copying the dataset to it.
+
+    Returns:
+        tuple: A tuple containing:
+            - res_dir (str): Path to the temporary result directory.
+            - tmp_dskey (str): Path to the copied dataset in the temporary directory.
+    """
     script_path = os.path.dirname(os.path.realpath(__file__))
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(script_path, '../test_datasets/aurka_chembl_base_smiles_union.csv')
@@ -22,6 +40,18 @@ def setup_paths():
     return res_dir, tmp_dskey
 
 def read_params(json_file, res_dir, tmp_dskey):
+    """
+    Read parameters from a JSON file and update them with the result directory and dataset key.
+
+    Parameters:
+        json_file (str): Path to the JSON file containing parameters.
+        res_dir (str): Path to the result directory.
+        tmp_dskey (str): Path to the copied dataset in the temporary directory.
+
+    Returns:
+        dict: Updated parameters.
+    """
+
     with open(json_file, 'r') as file:
         params = json.load(file)
     params['result_dir'] = res_dir
@@ -29,12 +59,25 @@ def read_params(json_file, res_dir, tmp_dskey):
     return params
 
 def make_relative_to_file(relative_path):
+    """
+    Generates the full path relative to the location of this file.
+
+    Parameters:
+        relative_path (str): The relative path to convert.
+
+    Returns:
+        str: The absolute path corresponding to the relative path.
+    """
     script_path = os.path.dirname(os.path.realpath(__file__))
     result = os.path.join(script_path, relative_path)
 
     return result
 
 def test_KFoldRegressionPerfData():
+    """
+    Test the KFoldRegressionPerfData class to ensure it correctly handles k-fold regression performance data.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(make_relative_to_file('config_perf_data_KFoldRegressoinPerfData.json'),
@@ -78,6 +121,9 @@
     assert r2_std==0
 
 def test_KFoldRegressionPerfDataMulti():
+    """
+    Test the KFoldRegressionPerfData class for multi-fold regression performance data.
+    """
     res_dir, tmp_dskey = setup_paths()
 
     # duplicate pIC50 column
@@ -126,6 +172,10 @@
     assert r2_std==0
 
 def test_KFoldClassificationPerfData():
+    """
+    Test the KFoldClassificationPerfData functionality.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(
@@ -175,6 +225,10 @@
     assert roc_auc_std==0
 
 def test_SimpleRegressionPerfData():
+    """
+    Test the SimpleRegressionPerfData class for correct performance data creation and metrics computation.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(
@@ -216,6 +270,13 @@
     assert r2_mean == 1
 
 def test_SimpleClassificationPerfData():
+    """
+    Test function for SimpleClassificationPerfData.
+
+    This function sets up a model pipeline, trains a model, and creates performance
+    data for a simple classification task, then verifies the resulting predictions and metrics.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(
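All of these tests share the same fixture pattern: copy the dataset into a scratch directory, then rewrite the params to point at it. In outline (an illustrative sketch; the two dictionary keys are taken from the code above):

```python
import json
import shutil
import tempfile

def setup_scratch(dataset_path):
    """Copy the dataset into a fresh temp directory; return both paths."""
    res_dir = tempfile.mkdtemp()
    tmp_dskey = shutil.copy(dataset_path, res_dir)
    return res_dir, tmp_dskey

def load_params(json_file, res_dir, tmp_dskey):
    """Load a params JSON and repoint it at the scratch copies."""
    with open(json_file, 'r') as fh:
        params = json.load(fh)
    params['result_dir'] = res_dir
    params['dataset_key'] = tmp_dskey
    return params
```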
diff --git a/atomsci/ddm/test/unit/test_transformers.py b/atomsci/ddm/test/unit/test_transformers.py
index 46a3fbd3..69cfebd6 100644
--- a/atomsci/ddm/test/unit/test_transformers.py
+++ b/atomsci/ddm/test/unit/test_transformers.py
@@ -4,6 +4,18 @@
 
 def test_no_missing_values():
+    """
+    Test the `get_statistics_missing_ydata` function from the `trans` module
+    to ensure it correctly calculates the mean and standard deviation of the
+    y-values when there are no missing values in the dataset.
+
+    The test creates a dataset with no missing y-values and checks that the
+    calculated means and standard deviations match the expected values.
+
+    Assertions:
+        - The means of the y-values should be [3.0, 4.0].
+        - The standard deviations of the y-values should be approximately [1.632993, 1.632993].
+    """
     y = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
     w = np.array([[1, 1], [1, 1], [1, 1]])
     x = np.ones_like(y)
@@ -14,6 +26,20 @@ def test_no_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [1.632993, 1.632993])
 
 def test_some_missing_values():
+    """
+    Test the handling of missing values in the dataset.
+
+    This test creates a dataset with some missing values in the target variable `y`
+    and verifies that the `get_statistics_missing_ydata` function correctly computes
+    the means and standard deviations of the non-missing values.
+
+    The test checks that the computed means and standard deviations of the non-missing
+    values in `y` match the expected values.
+
+    Assertions:
+        - The means of the non-missing values in `y` should be approximately [3.0, 5.0].
+        - The standard deviations of the non-missing values in `y` should be approximately [1.632993, 1.0].
+    """
     y = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
     w = np.array([[1, 0], [1, 1], [1, 1]])
     x = np.ones_like(y)
@@ -24,6 +50,16 @@ def test_some_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [1.632993, 1.0])
 
 def test_all_missing_values():
+    """
+    Test the `get_statistics_missing_ydata` function with a dataset where all y-values are missing (NaN).
+
+    This test creates a dataset with all missing y-values and checks if the function correctly computes
+    the means and standard deviations of the y-values, which should both be arrays of zeros.
+
+    The test asserts that:
+        - The means of the y-values are [0.0, 0.0].
+        - The standard deviations of the y-values are [0.0, 0.0].
+    """
     y = np.array([[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]])
     w = np.array([[0, 0], [0, 0], [0, 0]])
     x = np.ones_like(y)
@@ -34,6 +70,18 @@ def test_all_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [0.0, 0.0])
 
 def test_one_task_no_missing_values():
+    """
+    Test the `get_statistics_missing_ydata` function with a single-task dataset that has no missing values.
+
+    This test creates a dataset with no missing values and checks if the mean and standard deviation
+    of the y-values are calculated correctly.
+
+    The expected mean of y-values is [3.0] and the expected standard deviation is [1.632993].
+
+    Asserts:
+        - The calculated mean of y-values is almost equal to [3.0].
+        - The calculated standard deviation of y-values is almost equal to [1.632993].
+    """
     y = np.array([[1.0], [3.0], [5.0]])
     w = np.array([[1], [1], [1]])
     x = np.ones_like(y)
@@ -44,6 +92,16 @@ def test_one_task_no_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [1.632993])
 
 def test_normalization_transformer_missing_data():
+    """
+    Test the NormalizationTransformerMissingData class for handling missing data in the target variable.
+
+    The expected means and standard deviations for `y` are:
+        - Means: [3.0, 5.0]
+        - Standard deviations: [1.632993, 1.0]
+
+    The expected transformed `y` values are:
+        - [[-1.224745, 0], [0.0, -1.0], [1.224745, 1.0]]
+    """
     # Create a mock dataset
     X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
     y = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
@@ -69,6 +127,18 @@ def test_normalization_transformer_missing_data():
     np.testing.assert_array_almost_equal(transformed_dataset.y, expected_transformed_y, decimal=6)
 
 def test_normalization_transformer_missing_data_transform_X():
+    """
+    Test the NormalizationTransformerMissingData with transform_X=True.
+
+    This test verifies the following:
+        1. The means and standard deviations of the features in the dataset are correctly computed.
+        2. The transformation is correctly applied to the dataset.
+
+    Assertions:
+        - The computed means of the features should be [3.0, 4.0].
+        - The computed standard deviations of the features should be approximately [1.632993, 1.632993].
+        - The transformed feature values should match the expected transformed values with a precision of 6 decimal places.
+    """
     # Create a mock dataset
     X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
     y = np.array([[1.0], [3.0], [5.0]])
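The expected statistics in these docstrings are easy to verify by hand with numpy's NaN-aware reductions (population standard deviation, ddof=0); missing entries are zeroed after the z-score, matching the expected transformed `y`:

```python
import numpy as np

y = np.array([[1.0, np.nan],
              [3.0, 4.0],
              [5.0, 6.0]])

means = np.nanmean(y, axis=0)   # [3.0, 5.0]
stds = np.nanstd(y, axis=0)     # [1.632993, 1.0]  (sqrt(8/3) for task 0)

# z-score per task, then zero out the missing entries, as the expected
# transformed y in test_normalization_transformer_missing_data shows.
z = (y - means) / stds
z[np.isnan(z)] = 0.0
print(means, stds)
print(z)   # [[-1.224745, 0.], [0., -1.], [1.224745, 1.]]
```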