Inserted Sklearn RobustScaler and PowerTransformer into the pipeline and added tests
stewarthe6 committed Jan 21, 2025
1 parent 2479de0 commit 9e8dfce
Showing 10 changed files with 355 additions and 17 deletions.
59 changes: 54 additions & 5 deletions atomsci/ddm/docs/PARAMETERS.md
@@ -593,16 +593,17 @@ the model will train for max_epochs regardless of validation error.|

|||
|-|-|
|*Description:*|type of transformation for the features|
|*Default:*|normalization|
|*Type:*|Choice|
|*Description:*|type of transformation for the features. Choices are {"normalization", "RobustScaler", "PowerTransformer"}.|
|*Default:*|"normalizaton"|
|*Type:*|choice|

- **response\_transform\_type**

|||
|-|-|
|*Description:*|type of transformation for the response column (defaults to "normalization") TODO: Not currently implemented|
|*Default:*|normalization|
|*Description:*|type of transformation for the response column. Choices are {"normalization"}.|
|*Default:*|"normalization"|
|*Type:*|choice|

- **transformer\_bucket**

@@ -632,6 +633,54 @@ the model will train for max_epochs regardless of validation error.|
|*Default:*|TRUE|
|*Type:*|Bool|

- **robustscaler_with_centering**

|||
|-|-|
|*Description:*|If `True`, center the data before scaling. This will cause `transform` to raise an exception when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.|
|*Default:*|TRUE|
|*Type:*|Bool|

- **robustscaler_with_scaling**

|||
|-|-|
|*Description:*|If `True`, scale the data to interquartile range.|
|*Default:*|TRUE|
|*Type:*|Bool|

- **robustscaler_quartile_range**

|||
|-|-|
|*Description:*|Quantile range used to calculate `scale_`. By default this is equal to the IQR, i.e., `q_min` is the first quantile and `q_max` is the third quantile. `(q_min, q_max), 0.0 < q_min < q_max < 100.0`|
|*Default:*|(25.0, 75.0)|
|*Type:*|List|
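For intuition, `scale_` with the default `(25.0, 75.0)` range is just the per-feature interquartile range. A quick standalone check against sklearn (illustration only, not part of this commit):

```python
import numpy as np
from sklearn.preprocessing import RobustScaler

# One skewed feature with an outlier; quantile_range defaults to the IQR.
X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])
rs = RobustScaler(quantile_range=(25.0, 75.0)).fit(X)

q25, q75 = np.percentile(X, [25.0, 75.0])
assert np.isclose(rs.scale_[0], q75 - q25)   # scale_ is the per-feature IQR
print(rs.center_[0], rs.scale_[0])           # median and IQR of the feature
```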

- **robustscaler_unit_variance**

|||
|-|-|
|*Description:*|If `True`, scale data so that normally distributed features have a variance of 1. In general, if the difference between the x-values of `q_max` and `q_min` for a standard normal distribution is greater than 1, the dataset will be scaled down. If less than 1, the dataset will be scaled up.|
|*Default:*|FALSE|
|*Type:*|Bool|

- **powertransformer_method**

|||
|-|-|
|*Description:*|The power transform method. "yeo-johnson" works with positive and negative values; "box-cox" only works with strictly positive values. Choices are {"yeo-johnson", "box-cox"}.|
|*Default:*|"yeo-johnson"|
|*Type:*|choice|

- **powertransformer_standardize**

|||
|-|-|
|*Description:*|Set to True to apply zero-mean, unit-variance normalization to the transformed output.|
|*Default:*|TRUE|
|*Type:*|Bool|
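For reference, a standalone sketch of what the documented PowerTransformer defaults do, using sklearn directly (illustration only, not part of this commit):

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer

# A strongly right-skewed feature.
X = np.array([[0.1], [1.0], [10.0], [100.0]])

# The documented defaults: yeo-johnson, with standardization on.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
Xt = pt.fit_transform(X)
print(Xt.mean(), Xt.std())   # ~0 and ~1 after the zero-mean, unit-variance step
```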

---

<a name="XGBoost"></a>
37 changes: 32 additions & 5 deletions atomsci/ddm/pipeline/featurization.py
@@ -22,6 +22,8 @@
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.preprocessing import RobustScaler, PowerTransformer

subclassed_mordred_classes = ['EState', 'MolecularDistanceEdge']
try:
from mordred import Calculator, descriptors
@@ -615,12 +617,13 @@ def get_feature_count(self):
raise NotImplementedError

# ****************************************************************************************
def create_feature_transformer(self, dataset):
def create_feature_transformer(self, dataset, params):
"""Fit a scaling and centering transformation to the feature matrix of the given dataset, and return a
DeepChem transformer object holding its parameters.
Args:
dataset (deepchem.Dataset): featurized dataset
params (Namespace): Contains parameters used to instantiate the featurizer.
Returns:
Empty list
@@ -814,12 +817,13 @@ def featurize_data(self, dset_df, params, contains_responses):
return features, ids, vals, attr, w, featurized_dset_df

# ****************************************************************************************
def create_feature_transformer(self, dataset):
def create_feature_transformer(self, dataset, params):
"""Fit a scaling and centering transformation to the feature matrix of the given dataset, and return a
DeepChem transformer object holding its parameters.
Args:
dataset (deepchem.Dataset): featurized dataset
params (Namespace): Contains parameters used to instantiate the featurizer.
Returns:
Empty list since we will not be transforming the features of a DynamicFeaturization object
@@ -1150,12 +1154,13 @@ def featurize_data(self, dset_df, params, contains_responses):
raise NotImplementedError

# ****************************************************************************************
def create_feature_transformer(self, dataset):
def create_feature_transformer(self, dataset, params):
"""Fit a scaling and centering transformation to the feature matrix of the given dataset, and return a
DeepChem transformer object holding its parameters.
Args:
dataset (deepchem.Dataset): featurized dataset
params (Namespace): Contains parameters used to instantiate the featurizer.
"""
# Leave it to subclasses to determine if features should be scaled and centered.
@@ -1588,17 +1593,39 @@ def get_feature_count(self):
return len(self.get_feature_columns())

# ****************************************************************************************
def create_feature_transformer(self, dataset):
def create_feature_transformer(self, dataset, params):
"""Fit a scaling and centering transformation to the feature matrix of the given dataset, and return a
DeepChem transformer object holding its parameters.
Args:
dataset (deepchem.Dataset): featurized dataset
params (Namespace): Contains parameters used to instantiate the featurizer.
Returns:
(list of DeepChem transformer objects): list of transformers for the feature matrix
"""
transformers_x = [trans.NormalizationTransformerMissingData(transform_X=True, dataset=dataset)]
if params.feature_transform_type == 'normalization':
transformers_x = [trans.NormalizationTransformerMissingData(transform_X=True, dataset=dataset)]
elif params.feature_transform_type == 'RobustScaler':
transformers_x = [
trans.SklearnTransformerWrapper(transform_X=True, dataset=dataset,
sklearn_transformer=RobustScaler(
with_centering=params.robustscaler_with_centering,
with_scaling=params.robustscaler_with_scaling,
quantile_range=params.robustscaler_quartile_range,
unit_variance=params.robustscaler_unit_variance
))
]
elif params.feature_transform_type == 'PowerTransformer':
transformers_x = [
trans.SklearnTransformerWrapper(transform_X=True, dataset=dataset,
sklearn_transformer=PowerTransformer(
method=params.powertransformer_method,
standardize=params.powertransformer_standardize
))
]
else:
transformers_x = []
return transformers_x
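
`SklearnTransformerWrapper` is defined in `atomsci/ddm/pipeline/transformations.py` and its implementation is not shown in this diff. A minimal sketch of what such a wrapper could look like, assuming the standard DeepChem `Transformer` interface (names and details here are assumptions, not the committed code):

```python
# Sketch only -- the real SklearnTransformerWrapper lives in
# atomsci/ddm/pipeline/transformations.py and is not shown in this diff.
from deepchem.trans.transformers import Transformer

class SklearnTransformerWrapperSketch(Transformer):
    """Fit an sklearn transformer on a DeepChem dataset's feature matrix
    and expose it through the DeepChem Transformer interface."""

    def __init__(self, transform_X=False, dataset=None, sklearn_transformer=None):
        super().__init__(transform_X=transform_X, dataset=dataset)
        # Fit once on the training dataset's features, as the calls above imply.
        self.sklearn_transformer = sklearn_transformer.fit(dataset.X)

    def transform_array(self, X, y, w, ids):
        # Only the feature matrix is transformed; y, w and ids pass through.
        return self.sklearn_transformer.transform(X), y, w, ids
```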


44 changes: 41 additions & 3 deletions atomsci/ddm/pipeline/parameter_parser.py
@@ -533,7 +533,8 @@ def get_list_args(self):
"xgb_min_child_weight",
"xgb_subsample",
"xgb_colsample_bytree",
"ki_convert_ratio"
"ki_convert_ratio",
"robustscaler_quartile_range"
}
convert_to_int_list = {'layer_sizes','rf_max_features','rf_estimators', 'rf_max_depth',
'layer_nums', 'node_nums',
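
Adding `robustscaler_quartile_range` to this set means the comma-separated CLI string is coerced to a list of floats before reaching `RobustScaler`. A hypothetical helper illustrating the conversion (the actual implementation in `parameter_parser.py` is not shown in this hunk):

```python
# Hypothetical helper mirroring what membership in convert_to_float_list implies:
# a comma-separated CLI string becomes a list of floats.
def to_float_list(value):
    """Convert '25.0,75.0' -> [25.0, 75.0]."""
    return [float(tok) for tok in str(value).split(',')]

assert to_float_list('25.0,75.0') == [25.0, 75.0]
```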
@@ -1252,11 +1253,13 @@ def get_parser():
# **********************************************************************************************************
# model_building_parameters: transformers
parser.add_argument(
'--feature_transform_type', dest='feature_transform_type', choices=['normalization'],
'--feature_transform_type', dest='feature_transform_type',
choices=['normalization', 'RobustScaler', 'PowerTransformer'],
default='normalization', help='type of transformation for the features')
parser.add_argument(
'--response_transform_type', dest='response_transform_type', default='normalization',
help='type of normalization for the response column TODO: Not currently implemented')
choices=['normalization'],
help='type of normalization for the response column.')
parser.add_argument(
'--weight_transform_type', dest='weight_transform_type', choices=[None, 'None', 'balancing'], default=None,
help='type of normalization for the weights')
@@ -1274,6 +1277,41 @@
parser.add_argument(
'--transformers', dest='transformers', action='store_false',
help='Boolean switch for using transformation on regression output. Default is True')

# RobustScaler parameters
parser.add_argument(
'--robustscaler_with_centering', action='store_false',
help='If True, center the data before scaling. '
'This will cause transform to raise an exception when attempted on sparse matrices, '
'because centering them entails building a dense matrix which in common use '
'cases is likely to be too large to fit in memory. Default is True')
parser.add_argument(
'--robustscaler_with_scaling', action='store_false',
help='If True, scale the data to interquartile range. Default is True')
parser.add_argument(
'--robustscaler_quartile_range', type=str, default='25.0,75.0',
help='Quantile range used to calculate scale_. '
'By default this is equal to the IQR, i.e., '
'q_min is the first quantile and q_max is the third quantile. '
'(q_min, q_max), 0.0 < q_min < q_max < 100.0. Default is "25.0,75.0"')
parser.add_argument(
'--robustscaler_unit_variance', action='store_true',
help='If True, scale data so that normally distributed features have a variance of 1. '
'In general, if the difference between the x-values of q_max and q_min for a standard '
'normal distribution is greater than 1, the dataset will be scaled down. '
'If less than 1, the dataset will be scaled up. Default is False.')

# PowerTransformer parameters
parser.add_argument(
'--powertransformer_method', choices=['yeo-johnson', 'box-cox'], default='yeo-johnson',
help='The power transform method. "yeo-johnson" works with positive and negative values; '
'"box-cox" only works with strictly positive values. Default is "yeo-johnson".')
parser.add_argument(
'--powertransformer_standardize', action='store_false',
help='Set to True to apply zero-mean, unit-variance normalization to the transformed output. '
'Default is True.')

parser.set_defaults(transformers=True)
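
A note on the flag conventions above: with `action='store_false'` an option defaults to `True` and supplying the flag turns it off, while `action='store_true'` does the opposite. A standalone argparse demo (illustration only, not AMPL code):

```python
# Standalone demo of the flag conventions used above:
# action='store_false' => default True, passing the flag sets False;
# action='store_true'  => default False, passing the flag sets True.
import argparse

p = argparse.ArgumentParser()
p.add_argument('--robustscaler_with_centering', action='store_false')
p.add_argument('--robustscaler_unit_variance', action='store_true')

print(p.parse_args([]))
# Namespace(robustscaler_with_centering=True, robustscaler_unit_variance=False)
print(p.parse_args(['--robustscaler_with_centering', '--robustscaler_unit_variance']))
# Namespace(robustscaler_with_centering=False, robustscaler_unit_variance=True)
```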

# **********************************************************************************************************
18 changes: 14 additions & 4 deletions atomsci/ddm/pipeline/transformations.py
@@ -7,8 +7,7 @@
import numpy as np

from deepchem.trans.transformers import Transformer, NormalizationTransformer, BalancingTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler


logging.basicConfig(format='%(asctime)-15s %(message)s')
log = logging.getLogger('ATOM')
@@ -84,7 +83,7 @@ def create_feature_transformers(params, featurization, train_dset):
# response_transform_type and feature_transform_type, rather than params.transformers.

# Scale and center feature matrix if featurization type calls for it
transformers_x = featurization.create_feature_transformer(train_dset)
transformers_x = featurization.create_feature_transformer(train_dset, params)
else:
transformers_x = []

@@ -126,7 +125,18 @@ def get_transformer_specific_metadata(params):
transformer.
"""
meta_dict = {}

if params.feature_transform_type == 'RobustScaler':
    robustscaler_dict = dict(
        robustscaler_with_centering=params.robustscaler_with_centering,
        robustscaler_with_scaling=params.robustscaler_with_scaling,
        robustscaler_quartile_range=params.robustscaler_quartile_range,
        robustscaler_unit_variance=params.robustscaler_unit_variance)
    meta_dict['robustscaler_specific'] = robustscaler_dict
elif params.feature_transform_type == 'PowerTransformer':
    powertransformer_dict = dict(
        powertransformer_method=params.powertransformer_method,
        powertransformer_standardize=params.powertransformer_standardize)
    meta_dict['powertransformer_specific'] = powertransformer_dict
return meta_dict
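
For a RobustScaler run with the documented defaults, the returned metadata would have roughly this shape (a sketch based on the code above):

```python
# Sketch of the metadata returned for a RobustScaler configuration,
# using the documented defaults.
meta_dict = {
    'robustscaler_specific': {
        'robustscaler_with_centering': True,
        'robustscaler_with_scaling': True,
        'robustscaler_quartile_range': [25.0, 75.0],
        'robustscaler_unit_variance': False,
    }
}
```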

# ****************************************************************************************
28 changes: 28 additions & 0 deletions …/jsons/PowerTransformer_transformer.json
@@ -0,0 +1,28 @@
{
"dataset_key" : "replaced",
"datastore" : "False",
"uncertainty": "False",
"splitter": "scaffold",
"split_valid_frac": "0.20",
"split_test_frac": "0.20",
"split_strategy": "train_valid_test",
"prediction_type": "classification",
"model_choice_score_type": "roc_auc",
"response_cols" : "active",
"id_col": "compound_id",
"smiles_col" : "rdkit_smiles",
"result_dir": "replaced",
"system": "LC",
"transformers": "True",
"model_type": "NN",
"featurizer": "ecfp",
"feature_transform_type": "PowerTransformer",
"learning_rate": ".0007",
"layer_sizes": "20,10",
"dropouts": "0.3,0.3",
"save_results": "False",
"max_epochs": "2",
"early_stopping_patience": "2",
"verbose": "False",
"seed":"0"
}
32 changes: 32 additions & 0 deletions …/jsons/RobustScaler_transformer.json
@@ -0,0 +1,32 @@
{
"dataset_key" : "replaced",
"datastore" : "False",
"uncertainty": "False",
"splitter": "scaffold",
"split_valid_frac": "0.20",
"split_test_frac": "0.20",
"split_strategy": "train_valid_test",
"prediction_type": "classification",
"model_choice_score_type": "roc_auc",
"response_cols" : "active",
"id_col": "compound_id",
"smiles_col" : "rdkit_smiles",
"result_dir": "replaced",
"system": "LC",
"transformers": "True",
"model_type": "NN",
"featurizer": "ecfp",
"feature_transform_type": "RobustScaler",
"robustscaler_with_center": "True",
"robustscaler_with_scaling": "True",
"robustscaler_with_quartile_range": "30.0,80.0",
"robustscaler_unit_variance": "True",
"learning_rate": ".0007",
"layer_sizes": "20,10",
"dropouts": "0.3,0.3",
"save_results": "False",
"max_epochs": "2",
"early_stopping_patience": "2",
"verbose": "False",
"seed":"0"
}
@@ -7,6 +7,8 @@
import os
import json

from sklearn.preprocessing import RobustScaler, PowerTransformer

import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import make_test_datasets
@@ -328,6 +330,36 @@ def test_kfold_regression_transformers():
# transformer means should be around expected_mean
np.testing.assert_array_almost_equal(transformer.y_means, expected_y_means)

def test_sklearn_transformers():
"""
Test that the sklearn RobustScaler and PowerTransformer feature transformers are correctly instantiated and attached to the pipeline.
"""
dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv')
res_dir = tempfile.mkdtemp()

robustscaler_params = read_params(
make_relative_to_file('jsons/RobustScaler_transformer.json'),
dset_key,
res_dir
)

robustscaler_pipe = make_pipeline(robustscaler_params)
transformers_x = robustscaler_pipe.model_wrapper.transformers_x
assert len(transformers_x)==1
assert isinstance(transformers_x[0], trans.SklearnTransformerWrapper)
assert isinstance(transformers_x[0].sklearn_transformer, RobustScaler)

powertransformer_params = read_params(
make_relative_to_file('jsons/PowerTransformer_transformer.json'),
dset_key,
res_dir
)

powertransformer_pipe = make_pipeline(powertransformer_params)
transformers_x = powertransformer_pipe.model_wrapper.transformers_x
assert len(transformers_x)==1
assert isinstance(transformers_x[0], trans.SklearnTransformerWrapper)
assert isinstance(transformers_x[0].sklearn_transformer, PowerTransformer)

if __name__ == '__main__':
test_kfold_regression_transformers()
