
Commit

Removed unused 'fold' parameter. Added documentation for this PR
stewarthe6 committed Jan 7, 2025
1 parent 3fb2f45 commit d309761
Showing 7 changed files with 288 additions and 27 deletions.
28 changes: 24 additions & 4 deletions atomsci/ddm/pipeline/model_datasets.py
@@ -251,6 +251,19 @@ class ModelDataset(object):
combined_train_valid_data (dc.Dataset): A dataset object (initialized as None), of the merged train
and valid splits
combined_train_valid_data (dc.NumpyDataset): Cache for combined training and validation data,
used by k-fold CV code
subset_response_dict (dictionary): Cache for subset-specific response values matched to IDs,
used by k-fold CV code
subset_weight_dict (dictionary): Cache for subset-specific weights matched to IDs,
used by k-fold CV code
untransformed_response_dict (dictionary): Cache for untransformed response values
matched to IDs, used by k-fold CV code
set in get_featurized_data:
dataset: A new featurized DeepChem Dataset.
@@ -316,7 +329,7 @@ def __init__(self, params, featurization):
self.combined_train_valid_data = None
# Cache for subset-specific response values matched to IDs, used by k-fold CV code
self.subset_response_dict = {}
# Cache for subset-specific response values matched to IDs, used by k-fold CV code
# Cache for subset-specific weights matched to IDs, used by k-fold CV code
self.subset_weight_dict = {}
# Cache for untransformed response values matched to IDs, used by k-fold CV code
self.untransformed_response_dict = {}
@@ -355,6 +368,7 @@ def get_featurized_data(self, params=None):
n_features: The count of features (int)
vals: The response col after featurization (np.array)
attr: A pd.dataframe containing the compound ids and smiles
untransformed_dataset: A NumpyDataset containing untransformed data
"""

if params is None:
@@ -692,8 +706,6 @@ def get_subset_responses_and_weights(self, subset):
Args:
subset (string): Label of subset, 'train', 'test', or 'valid'
transformers: Transformers object for full dataset
Returns:
tuple(response_dict, weight_dict)
(response_dict): dictionary mapping compound ids to arrays of per-task untransformed response values
@@ -718,8 +730,16 @@ def get_subset_responses_and_weights(self, subset):
# *************************************************************************************

def get_untransformed_responses(self, ids):
""" Returns a numpy array of untransformed response values
"""
Returns a numpy array of untransformed response values for the given IDs.
Parameters:
ids (list or np.ndarray): List or array of IDs for which to retrieve untransformed response values.
Returns:
np.ndarray: A numpy array of untransformed response values corresponding to the given IDs.
"""

response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
if len(self.untransformed_response_dict) == 0:
self.untransformed_response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
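The hunk above cuts off inside get_untransformed_responses; a minimal sketch of the lookup pattern it implies (the fill loop after the cache build is an assumption, since the diff is truncated):

import numpy as np

def get_untransformed_responses(self, ids):
    """Return an array of untransformed response values for the given IDs."""
    response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
    if len(self.untransformed_response_dict) == 0:
        # Build the ID -> response row cache once, on first use
        self.untransformed_response_dict = dict(
            zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
    for i, cid in enumerate(ids):
        # Assumed completion: look up each compound's cached response row
        response_vals[i] = self.untransformed_response_dict[cid]
    return response_vals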
51 changes: 33 additions & 18 deletions atomsci/ddm/pipeline/model_wrapper.py
@@ -239,11 +244,16 @@ class ModelWrapper(object):
output_dir (str): The parent path of the model directory
transformers (list): Initialized as an empty list, stores the transformers on the response cols
transformers (dict of lists): Initialized using transformers.get_blank_transformations.
Keyed using integer fold numbers or 'final', e.g., {0:[], 1:[], 'final':[]}.
Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
using all training and validation data. Without k-fold validation, transformers for 0 and 'final'
are the same.
transformers_x (list): Initialized as an empty list, stores the transformers on the features
transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
transformers_w (list): Initialized as an empty list, stores the transformers on the weights
transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
set in setup_model_dirs:
best_model_dir (str): The subdirectory under output_dir that contains the best model. Created in setup_model_dirs
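For orientation, a sketch of the fold-keyed layout these docstrings describe; the two-fold setup and the NormalizationTransformer are illustrative assumptions, not taken from the diff:

import deepchem as dc
import numpy as np

# Hypothetical fold-0 training subset, for illustration only
fold0_train = dc.data.NumpyDataset(X=np.random.rand(8, 4), y=np.random.rand(8, 1))

# Dict-of-lists layout keyed by fold number plus 'final'
transformers = {0: [], 1: [], 'final': []}
transformers[0].append(
    dc.trans.NormalizationTransformer(transform_y=True, dataset=fold0_train))
# Fold 1 and 'final' would be fitted on their own training subsets; without
# k-fold CV, the fold-0 and 'final' transformer lists end up identical.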
@@ -269,11 +274,17 @@ def __init__(self, params, featurizer, ds_client):
output_dir (str): The parent path of the model directory
transformers (list): Initialized as an empty list, stores the transformers on the response cols
transformers (dict of lists): Initialized using transformers.get_blank_transformations.
Keyed using integer fold numbers or 'final', e.g., {0:[], 1:[], 'final':[]}.
Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
using all training and validation data. Without k-fold validation, transformers for 0 and 'final'
are the same.
transformers_x (list): Initialized as an empty list, stores the transformers on the features
transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
transformers_w (list): Initialized as an empty list, stores the transformers on the weights
"""
self.params = params
@@ -328,7 +339,7 @@ def _create_output_transformers(self, dataset):
"""Initialize transformers for responses and persist them for later.
Args:
model_dataset: The ModelDataset object that handles the current dataset
dataset: A dc.Dataset object
Side effects:
Overwrites the attributes:
@@ -346,7 +357,7 @@ def _create_feature_transformers(self, dataset):
"""Initialize transformers for features, and persist them for later.
Args:
model_dataset: The ModelDataset object that handles the current dataset
dataset: A dc.Dataset object
Side effects:
Overwrites the attributes:
@@ -361,17 +372,21 @@ def create_transformers(self, training_datasets):
"""Initialize transformers for responses, features and weights, and persist them for later.
Args:
training_datasets: The ModelDataset object that handles the current dataset
training_datasets: A dictionary of dc.Datasets containing the training data from
each fold. Generated using transformers.get_all_training_datasets.
Side effects:
Overwrites the attributes:
transformers: A list of deepchem transformation objects on responses, only if conditions are met
transformers_x: A list of deepchem transformation objects on features, only if conditions are met.
transformers (dict of lists): Initialized using transformers.get_blank_transformations.
Keyed using integer fold numbers or 'final', e.g., {0:[], 1:[], 'final':[]}.
Stores deepchem transformation objects on the response cols for each fold and uses the 'final' key for
the transformer fitted for the final model. When using k-fold validation, 'final' is fitted
using all training and validation data. Without k-fold validation, transformers for 0 and 'final'
are the same.
transformers_w: A list of deepchem transformation objects on weights, only if conditions are met.
transformers_x (dict of lists): Same as transformers, but stores the transformers on the features
params.transformer_key: A string pointing to the dataset key containing the transformer in the datastore, or the path to the transformer
transformers_w (dict of lists): Same as transformers, but stores the transformers on the weights
"""
total_transformers = 0
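The rest of create_transformers is collapsed here; a hedged sketch of the per-fold fitting loop it implies, reusing the module-level helpers documented in transformations.py below (the exact wiring and the self.featurization attribute name are assumptions):

from atomsci.ddm.pipeline import transformations as trans

# Assumed sketch: fit one set of transformers per training subset, with keys
# matching transformations.get_transformer_keys.
for key, train_dset in training_datasets.items():
    # Assumed: response transformers are filled in by the documented helper
    self._create_output_transformers(train_dset)
    self.transformers_x[key] = trans.create_feature_transformers(
        self.params, self.featurization, train_dset)
    self.transformers_w[key] = trans.create_weight_transformers(self.params, train_dset)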
@@ -459,7 +474,7 @@ def transform_dataset(self, dataset, fold):
Args:
dataset: The DeepChem DiskDataset to be transformed
fold (int): Which fold is being transformed.
fold (int/str): Which fold is being transformed.
Returns:
transformed_dataset: The transformed DeepChem DiskDataset
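The body of transform_dataset is collapsed; the chained application it performs presumably reduces to the standard DeepChem pattern, sketched here under that assumption:

def transform_dataset(self, dataset, fold):
    # fold is an int during k-fold CV, or 'final' for the production model
    transformed = dataset
    for txfm in (self.transformers[fold] + self.transformers_x[fold]
                 + self.transformers_w[fold]):
        transformed = txfm.transform(transformed)
    return transformed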
@@ -511,7 +526,7 @@ def get_train_valid_pred_results(self, perf_data):
return perf_data.get_prediction_results()

# ****************************************************************************************
def get_test_perf_data(self, model_dir, model_dataset, fold):
def get_test_perf_data(self, model_dir, model_dataset):
"""Returns the predicted values and metrics for the current test dataset against
the version of the model stored in model_dir, as a PerfData object.
@@ -553,7 +568,7 @@ def get_test_pred_results(self, model_dir, model_dataset):
return perf_data.get_prediction_results()

# ****************************************************************************************
def get_full_dataset_perf_data(self, model_dataset, fold):
def get_full_dataset_perf_data(self, model_dataset):
"""Returns the predicted values and metrics from the current model for the full current dataset,
as a PerfData object.
@@ -1617,7 +1632,7 @@ def _create_output_transformers(self, dataset):
"""Initialize transformers for responses and persist them for later.
Args:
model_dataset: The ModelDataset object that handles the current dataset
dataset: The dc.Dataset object that contains the current training dataset
Side effects:
Overwrites the attributes:
25 changes: 22 additions & 3 deletions atomsci/ddm/pipeline/transformations.py
@@ -70,9 +70,12 @@ def create_feature_transformers(params, featurization, train_dset):
DeepChem transformer object holding its parameters.
Args:
params (argparse.namespace: Object containing the parameter list
params (argparse.namespace): Object containing the parameter list
featurization (featurization.Featurization): A Featurization object that will be used with
the train_dset object.
model_dataset (ModelDataset): Contains the dataset to be transformed.
train_dset (dc.Dataset): Contains the dataset used to fit the transformers.
Returns:
(list of DeepChem transformer objects): list of transformers for the feature matrix
@@ -102,7 +105,7 @@ def create_weight_transformers(params, dataset):
Args:
params (argparse.namespace): Object containing the parameter list
model_dataset (ModelDataset): Contains the dataset to be transformed.
dataset (dc.Dataset): Contains the dataset to be transformed.
Returns:
(list of DeepChem transformer objects): list of transformers for the weight matrix
@@ -146,6 +149,12 @@ def get_transformer_keys(params):
There is one set of transformers for each fold, plus one fitted on the
combined training and validation sets. AMPL automatically trains a final
model using all training and validation data at the end of the training loop.
Args:
params (argparse.namespace): Object containing the parameter list
Returns:
(list): A list of all keys used in transformer dictionaries.
"""
if params.split_strategy != 'k_fold_cv':
return [0, 'final']
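The k-fold branch of get_transformer_keys is collapsed in this hunk; a plausible reconstruction, assuming the fold count is carried in params.num_folds:

def get_transformer_keys(params):
    """Return the keys used in the per-fold transformer dictionaries."""
    if params.split_strategy != 'k_fold_cv':
        return [0, 'final']
    # Assumed reconstruction: one key per fold, plus 'final' for the model
    # retrained on the combined training and validation data.
    return list(range(params.num_folds)) + ['final']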
@@ -156,6 +165,9 @@
def get_blank_transformations():
"""Get empty transformations dictionary
These keys must always exist, even when there are no transformations
Returns:
(dict): A dictionary containing empty lists. Used when no transformers are needed
"""
return {0:[], 'final':[]}

@@ -165,6 +177,13 @@ def get_all_training_datasets(model_dataset):
This takes a model_dataset and returns a dictionary of all
datasets that will need a transformer. The keys will match
what is returned by get_transformer_keys
Args:
model_dataset: A model_datasets.ModelDataset object containing the current dataset.
Returns:
dict of dc.Datasets: A dictionary keyed by fold numbers and 'final'. Contains
the training data for each fold and the combined training+validation set used for the final model.
"""
result = {}
if model_dataset.splitting is None:
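The body of get_all_training_datasets is truncated after the splitting-is-None guard; a sketch of the dictionary its docstring promises, assuming the train_valid_dsets list of (train, valid) pairs seen in the tests below and the combined_train_valid_data cache documented above:

def get_all_training_datasets(model_dataset):
    """Return the training dataset to fit each transformer key against."""
    result = {}
    if model_dataset.splitting is None:
        # Assumed: with no split defined, only the 'final' key is populated
        result['final'] = model_dataset.dataset
        return result
    # Assumed reconstruction: keys mirror get_transformer_keys -- one training
    # subset per fold, plus the merged train+valid data under 'final'.
    for fold, (train_dset, _valid_dset) in enumerate(model_dataset.train_valid_dsets):
        result[fold] = train_dset
    # Assumed to be populated by this point in the pipeline
    result['final'] = model_dataset.combined_train_valid_data
    return result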
(fourth changed file; filename not shown in this view)
@@ -16,6 +16,9 @@
logger = logging.getLogger(__name__)

def test_balancing_transformer():
"""
Test the balancing transformer to ensure that it correctly adjusts weights for imbalanced datasets.
"""
dset_key = make_relative_to_file('../../test_datasets/MRP3_dataset.csv')

res_dir = tempfile.mkdtemp()
@@ -32,6 +35,10 @@ def test_all_transformers():
assert weight == 1

def test_all_transformers():
"""
Test all transformers to ensure they work correctly with the dataset.
"""
res_dir = tempfile.mkdtemp()
dskey = os.path.join(res_dir, 'special_test_dset.csv')
params = read_params(
@@ -88,13 +95,19 @@ def test_all_transformers():
assert (valid_weight1*valid_count1*4 - valid_weight2*valid_count2) < 1e-4

def make_pipeline(params):
"""
Generates a pipeline given parameters
"""
pparams = parse.wrapper(params)
model_pipeline = mp.ModelPipeline(pparams)
model_pipeline.train_model()

return model_pipeline

def make_pipeline_and_get_weights(params):
"""
Generates the pipeline and gets the weights given parameters
"""
model_pipeline = make_pipeline(params)
model_wrapper = model_pipeline.model_wrapper
train_dataset = model_pipeline.data.train_valid_dsets[0][0]
@@ -103,20 +116,37 @@ def make_pipeline_and_get_weights(params):
return transformed_data.w

def make_relative_to_file(relative_path):
"""
Generates the full path relative to the location of this file.
"""

script_path = os.path.dirname(os.path.realpath(__file__))
result = os.path.join(script_path, relative_path)

return result

def read_params(json_file, tmp_dskey, res_dir):
"""
Read parameters from a JSON file and update them with the dataset key and result directory.
Parameters:
json_file (str): Path to the JSON file containing parameters.
tmp_dskey (str): Temporary dataset key.
res_dir (str): Result directory.
Returns:
dict: Updated parameters.
"""
with open(json_file, 'r') as file:
params = json.load(file)
params['result_dir'] = res_dir
params['dataset_key'] = tmp_dskey
return params

def params_wo_balan(dset_key, res_dir):
# Train classification models without balancing weights. Repeat this several times so we can get some statistics on the performance metrics.
"""
Reads params for models without balancing weights.
"""
params = read_params(
make_relative_to_file('jsons/wo_balancing_transformer.json'),
dset_key,
@@ -125,7 +155,9 @@ def params_wo_balan(dset_key, res_dir):
return params

def params_w_balan(dset_key, res_dir):
# Now train models on the same dataset with balancing weights
"""
Reads params for models with balancing weights.
"""
params = read_params(
make_relative_to_file('jsons/balancing_transformer.json'),
dset_key,
@@ -135,6 +167,9 @@ def params_w_balan(dset_key, res_dir):
return params

def test_kfold_transformers():
"""
Test transformers for a kfold classification model
"""
res_dir = tempfile.mkdtemp()
dskey = os.path.join(res_dir, 'special_test_dset.csv')
params = read_params(
@@ -217,6 +252,10 @@ def test_kfold_transformers():
np.testing.assert_array_almost_equal(transformer_x.X_means, np.ones_like(transformer_x.X_means)*expected_mean)

def test_kfold_regression_transformers():
"""
Tests transformers for each fold of a kfold regression model. Ensures
that the transformers are correct for each fold.
"""
res_dir = tempfile.mkdtemp()
dskey = os.path.join(res_dir, 'special_test_dset.csv')
params = read_params(
(3 more changed files not shown in this view)