From d3097619f4dc09efd1457dfbac2e1008ffac5e71 Mon Sep 17 00:00:00 2001
From: "he6@llnl.gov"
Date: Tue, 7 Jan 2025 14:39:52 -0800
Subject: [PATCH] Removed unused 'fold' parameter. Added documentation for this
 PR

---
 atomsci/ddm/pipeline/model_datasets.py     | 28 ++++++--
 atomsci/ddm/pipeline/model_wrapper.py      | 51 +++++++++-----
 atomsci/ddm/pipeline/transformations.py    | 25 ++++++-
 .../test_balancing_transformer.py          | 43 +++++++++++-
 .../test/integrative/make_test_datasets.py | 37 ++++++++++
 atomsci/ddm/test/unit/test_perf_data.py    | 61 ++++++++++++++++
 atomsci/ddm/test/unit/test_transformers.py | 70 +++++++++++++++++++
 7 files changed, 288 insertions(+), 27 deletions(-)

diff --git a/atomsci/ddm/pipeline/model_datasets.py b/atomsci/ddm/pipeline/model_datasets.py
index e89b45bb..8ebd7aa8 100644
--- a/atomsci/ddm/pipeline/model_datasets.py
+++ b/atomsci/ddm/pipeline/model_datasets.py
@@ -251,6 +251,19 @@ class ModelDataset(object):
         combined_train_valid_data (dc.Dataset): A dataset object (initialized as None), of the merged train
         and valid splits
 
+        combined_train_valid_data (dc.NumpyDataset): Cache for combined training and validation data,
+        used by k-fold CV code
+
+        subset_response_dict (dictionary): Cache for subset-specific response values matched to IDs,
+        used by k-fold CV code
+
+        subset_weight_dict (dictionary): Cache for subset-specific weights matched to IDs,
+        used by k-fold CV code
+
+        untransformed_response_dict (dictionary): Cache for untransformed response values
+        matched to IDs, used by k-fold CV code
+
+
     set in get_featurized_data:
 
         dataset: A new featurized DeepChem Dataset.
@@ -316,7 +329,7 @@ def __init__(self, params, featurization):
         self.combined_train_valid_data = None
         # Cache for subset-specific response values matched to IDs, used by k-fold CV code
         self.subset_response_dict = {}
-        # Cache for subset-specific response values matched to IDs, used by k-fold CV code
+        # Cache for subset-specific weights matched to IDs, used by k-fold CV code
         self.subset_weight_dict = {}
         # Cache for untransformed response values matched to IDs, used by k-fold CV code
         self.untransformed_response_dict = {}
@@ -355,6 +368,7 @@ def get_featurized_data(self, params=None):
             n_features: The count of features (int)
             vals: The response col after featurization (np.array)
             attr: A pd.dataframe containing the compound ids and smiles
+            untransformed_dataset: A NumpyDataset containing untransformed data
 
         """
         if params is None:
@@ -692,8 +706,6 @@ def get_subset_responses_and_weights(self, subset):
         Args:
             subset (string): Label of subset, 'train', 'test', or 'valid'
 
-            transformers: Transformers object for full dataset
-
         Returns:
             tuple(response_dict, weight_dict)
                 (response_dict): dictionary mapping compound ids to arrays of per-task untransformed response values
@@ -718,8 +730,16 @@ def get_subset_responses_and_weights(self, subset):
 
     # *************************************************************************************
     def get_untransformed_responses(self, ids):
-        """ Returns a numpy array of untransformed response values
         """
+        Returns a numpy array of untransformed response values for the given IDs.
+
+        Parameters:
+            ids (list or np.ndarray): List or array of IDs for which to retrieve untransformed response values.
+
+        Returns:
+            np.ndarray: A numpy array of untransformed response values corresponding to the given IDs.
+        """
+
         response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
         if len(self.untransformed_response_dict) == 0:
             self.untransformed_response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
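For readers new to this code path, the ID-keyed response cache that `get_untransformed_responses` documents above can be pictured with the following minimal sketch. This is hypothetical standalone code, not AMPL's implementation: the dict is built once from the untransformed dataset, then answers lookups for any subset of IDs in any order.

```python
import numpy as np

ids = np.array(['c1', 'c2', 'c3'])
y_untransformed = np.array([[5.1], [6.2], [7.3]])   # one response task

# Built on first use, then reused by the k-fold CV code.
response_dict = dict(zip(ids, y_untransformed))

def get_untransformed_responses(query_ids):
    """Return untransformed responses for query_ids, in query order."""
    vals = np.zeros((len(query_ids), y_untransformed.shape[1]))
    for i, cid in enumerate(query_ids):
        vals[i] = response_dict[cid]
    return vals

print(get_untransformed_responses(['c3', 'c1']))    # [[7.3], [5.1]]
```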
diff --git a/atomsci/ddm/pipeline/model_wrapper.py b/atomsci/ddm/pipeline/model_wrapper.py
index b4e4d4ab..59e515b6 100644
--- a/atomsci/ddm/pipeline/model_wrapper.py
+++ b/atomsci/ddm/pipeline/model_wrapper.py
@@ -239,11 +239,16 @@ class ModelWrapper(object):
 
         output_dir (str): The parent path of the model directory
 
-        transformers (list): Initialized as an empty list, stores the transformers on the response cols
+        transformers (dict of lists): Initialized using transformers.get_blank_transformations.
+        Keyed using integer fold numbers or 'final', e.g. {0:[], 1:[], 'final':[]}.
+        Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
+        the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
+        using all training and validation data. Without k-fold validation, the transformers for 0 and 'final'
+        are the same.
 
-        transformers_x (list): Initialized as an empty list, stores the transformers on the features
+        transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
 
-        transformers_w (list): Initialized as an empty list, stores the transformers on the weights
+        transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
 
     set in setup_model_dirs:
         best_model_dir (str): The subdirectory under output_dir that contains the best model. Created in setup_model_dirs
@@ -269,11 +274,17 @@ def __init__(self, params, featurizer, ds_client):
 
             output_dir (str): The parent path of the model directory
 
-            transformers (list): Initialized as an empty list, stores the transformers on the response cols
+            transformers (dict of lists): Initialized using transformers.get_blank_transformations.
+            Keyed using integer fold numbers or 'final', e.g. {0:[], 1:[], 'final':[]}.
+            Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
+            the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
+            using all training and validation data. Without k-fold validation, the transformers for 0 and 'final'
+            are the same.
 
-            transformers_x (list): Initialized as an empty list, stores the transformers on the features
+            transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
+
+            transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
 
-            transformers_w (list): Initialized as an empty list, stores the transformers on the weights
 
         """
         self.params = params
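The per-fold transformer dictionaries described above are easiest to see with a small standalone sketch. `MeanCenter` here is a toy stand-in for a fitted DeepChem response transformer, not an AMPL class; the dict layout (one list per fold, plus 'final' fitted on train+valid together) mirrors the docstring:

```python
class MeanCenter:
    """Toy stand-in for a fitted DeepChem response transformer."""
    def __init__(self, y):
        self.mean = sum(y) / len(y)
    def transform(self, y):
        return [v - self.mean for v in y]

# One transformer list per fold, plus 'final' fitted on all train+valid data.
fold_ys = {0: [1.0, 2.0, 3.0], 1: [2.0, 4.0, 6.0]}
transformers = {fold: [MeanCenter(y)] for fold, y in fold_ys.items()}
transformers['final'] = [MeanCenter(fold_ys[0] + fold_ys[1])]

def transform_y(fold, y):
    """Apply a fold's response transformers in order."""
    for t in transformers[fold]:
        y = t.transform(y)
    return y

print(transform_y(0, [3.0]))        # [1.0]  (fold-0 mean is 2.0)
print(transform_y('final', [3.0]))  # [0.0]  (mean over all data is 3.0)
```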
@@ -328,7 +339,7 @@ def _create_output_transformers(self, dataset):
         """Initialize transformers for responses and persist them for later.
 
         Args:
-            model_dataset: The ModelDataset object that handles the current dataset
+            dataset: A dc.Dataset object
 
         Side effects:
             Overwrites the attributes:
@@ -346,7 +357,7 @@ def _create_feature_transformers(self, dataset):
         """Initialize transformers for features, and persist them for later.
 
         Args:
-            model_dataset: The ModelDataset object that handles the current dataset
+            dataset: A dc.Dataset object
 
         Side effects:
             Overwrites the attributes:
@@ -361,17 +372,21 @@ def create_transformers(self, training_datasets):
         """Initialize transformers for responses, features and weights, and persist them for later.
 
         Args:
-            training_datasets: The ModelDataset object that handles the current dataset
+            training_datasets: A dictionary of dc.Datasets containing the training data from
+            each fold. Generated using transformers.get_all_training_datasets.
 
         Side effects:
             Overwrites the attributes:
-            transformers: A list of deepchem transformation objects on responses, only if conditions are met
-
-            transformers_x: A list of deepchem transformation objects on features, only if conditions are met.
+            transformers (dict of lists): Initialized using transformers.get_blank_transformations.
+            Keyed using integer fold numbers or 'final', e.g. {0:[], 1:[], 'final':[]}.
+            Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
+            the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
+            using all training and validation data. Without k-fold validation, the transformers for 0 and 'final'
+            are the same.
+
 
-            transformers_w: A list of deepchem transformation objects on weights, only if conditions are met.
+            transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
 
-            params.transformer_key: A string pointing to the dataset key containing the transformer in the datastore, or the path to the transformer
+            transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
 
         """
         total_transformers = 0
@@ -459,7 +474,7 @@ def transform_dataset(self, dataset, fold):
 
         Args:
             dataset: The DeepChem DiskDataset that contains a dataset
-            fold (int): Which fold is being transformed.
+            fold (int/str): Which fold is being transformed.
 
         Returns:
             transformed_dataset: The transformed DeepChem DiskDataset
@@ -511,7 +526,7 @@ def get_train_valid_pred_results(self, perf_data):
         return perf_data.get_prediction_results()
 
     # ****************************************************************************************
-    def get_test_perf_data(self, model_dir, model_dataset, fold):
+    def get_test_perf_data(self, model_dir, model_dataset):
         """Returns the predicted values and metrics for the current test dataset against
         the version of the model stored in model_dir, as a PerfData object.
@@ -553,7 +568,7 @@ def get_test_pred_results(self, model_dir, model_dataset):
         return perf_data.get_prediction_results()
 
     # ****************************************************************************************
-    def get_full_dataset_perf_data(self, model_dataset, fold):
+    def get_full_dataset_perf_data(self, model_dataset):
         """Returns the predicted values and metrics from the current model for the full
         current dataset, as a PerfData object.
@@ -1617,7 +1632,7 @@ def _create_output_transformers(self, dataset):
         """Initialize transformers for responses and persist them for later.
 
         Args:
-            model_dataset: The ModelDataset object that handles the current dataset
+            dataset: The dc.Dataset object that contains the current training dataset
 
         Side effects:
             Overwrites the attributes:
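Taken together, `create_transformers` fits one transformer set per key in `training_datasets`, and `transform_dataset` looks the set up by fold. A minimal sketch of that flow, assuming a simple normalization over 1-D response arrays (illustrative names and shapes, not AMPL's actual API):

```python
import numpy as np

training_datasets = {
    0: np.array([1.0, 2.0, 3.0]),        # fold 0 training responses
    1: np.array([2.0, 4.0, 6.0]),        # fold 1 training responses
    'final': np.array([1.0, 2.0, 3.0, 2.0, 4.0, 6.0]),  # train+valid
}

# "Fit" one normalization transformer per key, mirroring the dict layout.
transformers = {k: {'mean': y.mean(), 'std': y.std()}
                for k, y in training_datasets.items()}

def transform_dataset(y, fold):
    """Normalize y using the transformer fitted on the given fold's training data."""
    t = transformers[fold]
    return (y - t['mean']) / t['std']

print(transform_dataset(np.array([3.0]), fold=0))        # fold-0 statistics
print(transform_dataset(np.array([3.0]), fold='final'))  # train+valid statistics
```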
Args: - params (argparse.namespace: Object containing the parameter list + params (argparse.namespace): Object containing the parameter list + + featurization (featurization.Featurization): A Featurization object that will be used with + the train_dset object. - model_dataset (ModelDataset): Contains the dataset to be transformed. + train_dset (dc.Dataset): Contains the dataset used to fit the the transformers. Returns: (list of DeepChem transformer objects): list of transformers for the feature matrix @@ -102,7 +105,7 @@ def create_weight_transformers(params, dataset): Args: params (argparse.namespace: Object containing the parameter list - model_dataset (ModelDataset): Contains the dataset to be transformed. + dataset (dc.Dataset): Contains the dataset to be transformed. Returns: (list of DeepChem transformer objects): list of transformers for the weight matrix @@ -146,6 +149,12 @@ def get_transformer_keys(params): There is one set of transformers for each fold and then one transformer for both validation and training sets. AMPL automatically trains a model using all validation and training data at the end of the training loop. + + Args: + params (argparse.namespace: Object containing the parameter list + + Returns: + (list): A list of all keys used in transformer dictionaries. """ if params.split_strategy != 'k_fold_cv': return [0, 'final'] @@ -156,6 +165,9 @@ def get_transformer_keys(params): def get_blank_transformations(): """Get empty transformations dictionary These keys must always exist, even when there are no transformations + + Returns: + (dict): A dictionary containing empty lists. Used when no transformers are needed """ return {0:[], 'final':[]} @@ -165,6 +177,13 @@ def get_all_training_datasets(model_dataset): This takes a model_dataset and returns a dictionary of all datasets that will need a transformer. The keys will match what is returned by get_transformer_keys + + Args: + model_dataset: A model_datasets.ModelDataset object containing the current dataset. + + Returns: + dict of dc.Datasets: A dictionary keyed using keys fold numbers and 'final'. Contains + the training data for each fold and the final training+validation training set. """ result = {} if model_dataset.splitting is None: diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py index 7ddf0b2d..f09369e7 100644 --- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py +++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py @@ -16,6 +16,9 @@ logger = logging.getLogger(__name__) def test_balancing_transformer(): + """ + Test the balancing transformer to ensure that it correctly adjusts weights for imbalanced datasets. + """ dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv') res_dir = tempfile.mkdtemp() @@ -32,6 +35,10 @@ def test_balancing_transformer(): assert weight == 1 def test_all_transformers(): + """ + Test all transformers to ensure they work correctly with the dataset. 
diff --git a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py
index 7ddf0b2d..f09369e7 100644
--- a/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py
+++ b/atomsci/ddm/test/integrative/balancing_trans/test_balancing_transformer.py
@@ -16,6 +16,9 @@ logger = logging.getLogger(__name__)
 
 def test_balancing_transformer():
+    """
+    Test the balancing transformer to ensure that it correctly adjusts weights for imbalanced datasets.
+    """
     dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv')
 
     res_dir = tempfile.mkdtemp()
@@ -32,6 +35,10 @@ def test_balancing_transformer():
         assert weight == 1
 
 def test_all_transformers():
+    """
+    Test all transformers to ensure they work correctly with the dataset.
+
+    """
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(res_dir, 'special_test_dset.csv')
     params = read_params(
@@ -88,6 +95,9 @@ def test_all_transformers():
         assert (valid_weight1*valid_count1*4 - valid_weight2*valid_count2) < 1e-4
 
 def make_pipeline(params):
+    """
+    Generates a pipeline given parameters.
+    """
     pparams = parse.wrapper(params)
     model_pipeline = mp.ModelPipeline(pparams)
     model_pipeline.train_model()
 
     return model_pipeline
 
 def make_pipeline_and_get_weights(params):
+    """
+    Generates the pipeline and gets the weights given parameters.
+    """
     model_pipeline = make_pipeline(params)
     model_wrapper = model_pipeline.model_wrapper
     train_dataset = model_pipeline.data.train_valid_dsets[0][0]
@@ -103,12 +116,27 @@ def make_pipeline_and_get_weights(params):
     return transformed_data.w
 
 def make_relative_to_file(relative_path):
+    """
+    Generates the full path relative to the location of this file.
+    """
+
     script_path = os.path.dirname(os.path.realpath(__file__))
     result = os.path.join(script_path, relative_path)
 
     return result
 
 def read_params(json_file, tmp_dskey, res_dir):
+    """
+    Read parameters from a JSON file and update them with the dataset key and result directory.
+
+    Parameters:
+        json_file (str): Path to the JSON file containing parameters.
+        tmp_dskey (str): Temporary dataset key.
+        res_dir (str): Result directory.
+
+    Returns:
+        dict: Updated parameters.
+    """
     with open(json_file, 'r') as file:
         params = json.load(file)
     params['result_dir'] = res_dir
@@ -116,7 +144,9 @@ def read_params(json_file, tmp_dskey, res_dir):
     return params
 
 def params_wo_balan(dset_key, res_dir):
-    # Train classification models without balancing weights. Repeat this several times so we can get some statistics on the performance metrics.
+    """
+    Reads params for models without balancing weights.
+    """
     params = read_params(
         make_relative_to_file('jsons/wo_balancing_transformer.json'),
         dset_key,
@@ -125,7 +155,9 @@ def params_wo_balan(dset_key, res_dir):
     return params
 
 def params_w_balan(dset_key, res_dir):
-    # Now train models on the same dataset with balancing weights
+    """
+    Reads params for models with balancing weights.
+    """
     params = read_params(
         make_relative_to_file('jsons/balancing_transformer.json'),
         dset_key,
@@ -135,6 +167,9 @@ def params_w_balan(dset_key, res_dir):
     return params
 
 def test_kfold_transformers():
+    """
+    Test transformers for a k-fold classification model.
+    """
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(res_dir, 'special_test_dset.csv')
     params = read_params(
@@ -217,6 +252,10 @@ def test_kfold_transformers():
         np.testing.assert_array_almost_equal(transformer_x.X_means, np.ones_like(transformer_x.X_means)*expected_mean)
 
 def test_kfold_regression_transformers():
+    """
+    Tests transformers for each fold of a k-fold regression model. Ensures
+    that the transformers are correct for each fold.
+    """
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(res_dir, 'special_test_dset.csv')
     params = read_params(
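The weight assertions in these tests rest on inverse-frequency balancing: after weighting, each class should contribute the same total weight. A quick standalone check of that arithmetic, using one common balancing scheme (illustrative, not the test's code):

```python
import numpy as np

# With a 4:1 majority:minority split, the minority class gets ~4x the weight,
# so each class contributes equal total weight.
labels = np.array([0]*80 + [1]*20)                 # 4:1 imbalance
counts = np.bincount(labels)                       # [80, 20]
weights_per_class = len(labels) / (2.0 * counts)   # [0.625, 2.5]

w = weights_per_class[labels]                      # per-sample weights
total_majority = w[labels == 0].sum()              # 80 * 0.625 = 50
total_minority = w[labels == 1].sum()              # 20 * 2.5   = 50
assert abs(total_majority - total_minority) < 1e-8
print(weights_per_class)
```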
diff --git a/atomsci/ddm/test/integrative/make_test_datasets.py b/atomsci/ddm/test/integrative/make_test_datasets.py
index e6ed8764..2b7767db 100644
--- a/atomsci/ddm/test/integrative/make_test_datasets.py
+++ b/atomsci/ddm/test/integrative/make_test_datasets.py
@@ -19,6 +19,9 @@ def get_absolute_path(relative_path):
     return absolute_path
 
 def get_features(feature_type):
+    """
+    Gets the feature columns given a feature_type, e.g. rdkit_raw.
+    """
 
     desc_spec_file = get_absolute_path('../../data/descriptor_sets_sources_by_descr_type.csv')
     desc_spec_df = pd.read_csv(desc_spec_file, index_col=False)
@@ -157,6 +160,17 @@ def make_split_df(train_ids, valid_ids, test_ids):
     return split_df
 
 def make_test_dataset_and_split(dataset_key, feature_types):
+    """
+    Given a dataset key and a feature type, create a featurized
+    csv file and split.
+
+    Args:
+        dataset_key (str): Where to save the newly generated test DataFrame
+
+        feature_types (str): A feature type, e.g. rdkit_raw
+
+
+    """
     features = get_features(feature_types)
     df, train_ids, valid_ids, test_ids = make_test_dataset(features)
@@ -211,6 +225,17 @@ def make_kfold_test_dataset(features, fold_size=1000, num_test=1000, num_folds=5
     return df, train_valid_ids, test_ids
 
 def make_kfold_split_df(train_valid_ids, test_ids):
+    """
+    Given lists of train_valid_ids and test_ids create a split DataFrame
+
+    Args:
+        train_valid_ids (list of lists): A list of ids
+
+        test_ids (list): Ids for test compounds
+
+    Returns:
+        DataFrame: A split DataFrame that can be read by AMPL
+    """
     fold_dfs = []
     for i, tvi in enumerate(train_valid_ids):
         data = {'cmpd_id':tvi}
@@ -230,6 +255,18 @@ def make_kfold_split_df(train_valid_ids, test_ids):
     return split_df
 
 def make_kfold_dataset_and_split(dataset_key, feature_types, num_folds=3):
+    """
+    Given a dataset key, a feature type, and a number of folds, create a featurized
+    csv file and split.
+
+    Args:
+        dataset_key (str): Where to save the newly generated test DataFrame
+
+        feature_types (str): A feature type, e.g. rdkit_raw
+
+        num_folds (int): Number of folds
+
+    """
     features = get_features(feature_types)
     df, train_ids, test_ids = make_kfold_test_dataset(features, num_folds=num_folds)
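The split DataFrame returned by `make_kfold_split_df` pairs each compound ID with a subset label and fold index. A hedged sketch of the table shape follows; the 'cmpd_id' column name is taken from the code above, while the 'subset'/'fold' columns and the 'train_valid'/'test' labels are assumptions based on AMPL's split-file convention:

```python
import pandas as pd

train_valid_ids = [['c1', 'c2'], ['c3', 'c4']]   # one id list per fold
test_ids = ['c5']

rows = []
for fold, ids in enumerate(train_valid_ids):
    rows += [{'cmpd_id': cid, 'subset': 'train_valid', 'fold': fold} for cid in ids]
rows += [{'cmpd_id': cid, 'subset': 'test', 'fold': 0} for cid in test_ids]

split_df = pd.DataFrame(rows)
print(split_df)   # one row per compound: id, subset label, fold index
```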
diff --git a/atomsci/ddm/test/unit/test_perf_data.py b/atomsci/ddm/test/unit/test_perf_data.py
index 20275fe2..f5752abc 100644
--- a/atomsci/ddm/test/unit/test_perf_data.py
+++ b/atomsci/ddm/test/unit/test_perf_data.py
@@ -10,10 +10,28 @@ import json
 
 def copy_to_temp(dskey, res_dir):
+    """
+    Copy a dataset to a temporary directory.
+
+    Parameters:
+        dskey (str): Path to the original dataset.
+        res_dir (str): Path to the temporary directory.
+
+    Returns:
+        str: Path to the copied dataset in the temporary directory.
+    """
     new_dskey = shutil.copy(dskey, res_dir)
     return new_dskey
 
 def setup_paths():
+    """
+    Set up the paths for the test, including creating a temporary directory and copying the dataset to it.
+
+    Returns:
+        tuple: A tuple containing:
+            - res_dir (str): Path to the temporary result directory.
+            - tmp_dskey (str): Path to the copied dataset in the temporary directory.
+    """
     script_path = os.path.dirname(os.path.realpath(__file__))
     res_dir = tempfile.mkdtemp()
     dskey = os.path.join(script_path, '../test_datasets/aurka_chembl_base_smiles_union.csv')
@@ -22,6 +40,18 @@ def setup_paths():
     return res_dir, tmp_dskey
 
 def read_params(json_file, res_dir, tmp_dskey):
+    """
+    Read parameters from a JSON file and update them with the result directory and dataset key.
+
+    Parameters:
+        json_file (str): Path to the JSON file containing parameters.
+        res_dir (str): Path to the result directory.
+        tmp_dskey (str): Path to the copied dataset in the temporary directory.
+
+    Returns:
+        dict: Updated parameters.
+    """
+
     with open(json_file, 'r') as file:
         params = json.load(file)
     params['result_dir'] = res_dir
@@ -29,12 +59,25 @@ def read_params(json_file, res_dir, tmp_dskey):
     return params
 
 def make_relative_to_file(relative_path):
+    """
+    Generates the full path relative to the location of this file.
+
+    Parameters:
+        relative_path (str): The relative path to convert.
+
+    Returns:
+        str: The absolute path corresponding to the relative path.
+    """
     script_path = os.path.dirname(os.path.realpath(__file__))
     result = os.path.join(script_path, relative_path)
 
     return result
 
 def test_KFoldRegressionPerfData():
+    """
+    Test the KFoldRegressionPerfData class to ensure it correctly handles k-fold regression performance data.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(make_relative_to_file('config_perf_data_KFoldRegressoinPerfData.json'),
@@ -78,6 +121,9 @@
     assert r2_std==0
 
 def test_KFoldRegressionPerfDataMulti():
+    """
+    Test the KFoldRegressionPerfData class for multi-fold regression performance data.
+    """
     res_dir, tmp_dskey = setup_paths()
 
     # duplicate pIC50 column
@@ -126,6 +172,10 @@
     assert r2_std==0
 
 def test_KFoldClassificationPerfData():
+    """
+    Test the KFoldClassificationPerfData functionality.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(
@@ -175,6 +225,10 @@
     assert roc_auc_std==0
 
 def test_SimpleRegressionPerfData():
+    """
+    Test the SimpleRegressionPerfData class for correct performance data creation and metrics computation.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(
@@ -216,6 +270,13 @@
     assert r2_mean == 1
 
 def test_SimpleClassificationPerfData():
+    """
+    Test function for SimpleClassificationPerfData.
+
+    This function sets up a model pipeline, trains a model, and creates performance
+    data for a simple classification task, then verifies the resulting predictions and metrics.
+
+    """
     res_dir, tmp_dskey = setup_paths()
 
     params = read_params(
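All of these tests share the same fixture pattern: copy the dataset into a scratch directory, then rewrite the params to point at it. In outline (an illustrative sketch; the two dictionary keys are taken from the code above):

```python
import json
import shutil
import tempfile

def setup_scratch(dataset_path):
    """Copy the dataset into a fresh temp directory; return both paths."""
    res_dir = tempfile.mkdtemp()
    tmp_dskey = shutil.copy(dataset_path, res_dir)
    return res_dir, tmp_dskey

def load_params(json_file, res_dir, tmp_dskey):
    """Load a params JSON and repoint it at the scratch copies."""
    with open(json_file, 'r') as fh:
        params = json.load(fh)
    params['result_dir'] = res_dir
    params['dataset_key'] = tmp_dskey
    return params
```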
diff --git a/atomsci/ddm/test/unit/test_transformers.py b/atomsci/ddm/test/unit/test_transformers.py
index 46a3fbd3..69cfebd6 100644
--- a/atomsci/ddm/test/unit/test_transformers.py
+++ b/atomsci/ddm/test/unit/test_transformers.py
@@ -4,6 +4,18 @@
 
 def test_no_missing_values():
+    """
+    Test the `get_statistics_missing_ydata` function from the `trans` module
+    to ensure it correctly calculates the mean and standard deviation of the
+    y-values when there are no missing values in the dataset.
+
+    The test creates a dataset with no missing y-values and checks that the
+    calculated means and standard deviations match the expected values.
+
+    Assertions:
+        - The means of the y-values should be [3.0, 4.0].
+        - The standard deviations of the y-values should be approximately [1.632993, 1.632993].
+    """
     y = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
     w = np.array([[1, 1], [1, 1], [1, 1]])
     x = np.ones_like(y)
@@ -14,6 +26,20 @@ def test_no_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [1.632993, 1.632993])
 
 def test_some_missing_values():
+    """
+    Test the handling of missing values in the dataset.
+
+    This test creates a dataset with some missing values in the target variable `y`
+    and verifies that the `get_statistics_missing_ydata` function correctly computes
+    the means and standard deviations of the non-missing values.
+
+    The test checks that the computed means and standard deviations of the non-missing
+    values in `y` match the expected values.
+
+    Assertions:
+        - The means of the non-missing values in `y` should be approximately [3.0, 5.0].
+        - The standard deviations of the non-missing values in `y` should be approximately [1.632993, 1.0].
+    """
     y = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
     w = np.array([[1, 0], [1, 1], [1, 1]])
     x = np.ones_like(y)
@@ -24,6 +50,16 @@ def test_some_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [1.632993, 1.0])
 
 def test_all_missing_values():
+    """
+    Test the `get_statistics_missing_ydata` function with a dataset where all y-values are missing (NaN).
+
+    This test creates a dataset with all missing y-values and checks if the function correctly computes
+    the means and standard deviations of the y-values, which should both be arrays of zeros.
+
+    The test asserts that:
+        - The means of the y-values are [0.0, 0.0].
+        - The standard deviations of the y-values are [0.0, 0.0].
+    """
     y = np.array([[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]])
     w = np.array([[0, 0], [0, 0], [0, 0]])
     x = np.ones_like(y)
@@ -34,6 +70,18 @@ def test_all_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [0.0, 0.0])
 
 def test_one_task_no_missing_values():
+    """
+    Test the `get_statistics_missing_ydata` function with a single-task dataset that has no missing values.
+
+    This test creates a dataset with no missing values and checks if the mean and standard deviation
+    of the y-values are calculated correctly.
+
+    The expected mean of y-values is [3.0] and the expected standard deviation is [1.632993].
+
+    Asserts:
+        - The calculated mean of y-values is almost equal to [3.0].
+        - The calculated standard deviation of y-values is almost equal to [1.632993].
+    """
     y = np.array([[1.0], [3.0], [5.0]])
     w = np.array([[1], [1], [1]])
     x = np.ones_like(y)
@@ -44,6 +92,16 @@ def test_one_task_no_missing_values():
     np.testing.assert_array_almost_equal(y_stds, [1.632993])
 
 def test_normalization_transformer_missing_data():
+    """
+    Test the NormalizationTransformerMissingData class for handling missing data in the target variable.
+
+    The expected means and standard deviations for `y` are:
+        - Means: [3.0, 5.0]
+        - Standard deviations: [1.632993, 1.0]
+
+    The expected transformed `y` values are:
+        - [[-1.224745, 0], [0.0, -1.0], [1.224745, 1.0]]
+    """
     # Create a mock dataset
     X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
     y = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
@@ -69,6 +127,18 @@ def test_normalization_transformer_missing_data():
     np.testing.assert_array_almost_equal(transformed_dataset.y, expected_transformed_y, decimal=6)
 
 def test_normalization_transformer_missing_data_transform_X():
+    """
+    Test the NormalizationTransformerMissingData with transform_X=True.
+
+    This test verifies the following:
+        1. The means and standard deviations of the features in the dataset are correctly computed.
+        2. The transformation is correctly applied to the dataset.
+
+    Assertions:
+        - The computed means of the features should be [3.0, 4.0].
+        - The computed standard deviations of the features should be approximately [1.632993, 1.632993].
+        - The transformed feature values should match the expected transformed values with a precision of 6 decimal places.
+    """
     # Create a mock dataset
     X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
     y = np.array([[1.0], [3.0], [5.0]])
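The expected statistics in these docstrings are easy to verify by hand with numpy's NaN-aware reductions (population standard deviation, ddof=0); missing entries are zeroed after the z-score, matching the expected transformed `y`:

```python
import numpy as np

y = np.array([[1.0, np.nan],
              [3.0, 4.0],
              [5.0, 6.0]])

means = np.nanmean(y, axis=0)   # [3.0, 5.0]
stds = np.nanstd(y, axis=0)     # [1.632993, 1.0]  (sqrt(8/3) for task 0)

# z-score per task, then zero out the missing entries, as the expected
# transformed y in test_normalization_transformer_missing_data shows.
z = (y - means) / stds
z[np.isnan(z)] = 0.0
print(means, stds)
print(z)   # [[-1.224745, 0.], [0., -1.], [1.224745, 1.]]
```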