Commit 94f93d0

Merge pull request #385 from ATOMScience-org/bug_transformer_fitting

Transformers are now fitted using only the training data.

stewarthe6 authored Jan 15, 2025
2 parents eadd9d4 + d309761, commit 94f93d0

Showing 20 changed files with 1,673 additions and 463 deletions.
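The bug class this merge addresses: when a feature or response transformer is fitted on the full dataset, statistics from the validation and test folds leak into training. A minimal sketch of the corrected pattern, in plain numpy rather than AMPL's own transformer classes:

    import numpy as np

    rng = np.random.default_rng(0)
    y = rng.normal(loc=5.0, scale=2.0, size=100)
    train, test = y[:80], y[80:]

    # Fit the normalization statistics on the training split only...
    mu, sigma = train.mean(), train.std()

    # ...then apply the same transform to every split.
    train_z = (train - mu) / sigma
    test_z = (test - mu) / sigma

    # Model outputs in z-space are mapped back ("untransformed")
    # using the same training-split statistics.
    test_pred_orig = test_z * sigma + mu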
atomsci/ddm/pipeline/featurization.py (2 changes: 1 addition & 1 deletion)

@@ -1021,7 +1021,7 @@ def featurize_data(self, dset_df, params, contains_responses):
weights = input_dataset.w
attr = input_model_dataset.attr

- input_dataset = self.embedding_pipeline.model_wrapper.transform_dataset(input_dataset)
+ input_dataset = self.embedding_pipeline.model_wrapper.transform_dataset(input_dataset, fold='final')

# Run the embedding model to generate features.
embedding = self.embedding_pipeline.model_wrapper.generate_embeddings(input_dataset)
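The new fold argument selects which set of fitted transformers transform_dataset applies; 'final' refers to the transformers fitted on the training data for the final model, as opposed to the per-fold transformers used during k-fold CV. A hypothetical sketch of the idea (class, attribute, and key names are illustrative assumptions, not AMPL's actual model_wrapper API):

    # Hypothetical sketch of a fold-keyed transformer store.
    class FoldTransformerStore:
        def __init__(self):
            # e.g. {0: [...], 1: [...], 'final': [...]}; each value is a
            # list of transformer objects fitted on that fold's training data.
            self.transformers = {}

        def transform_dataset(self, dataset, fold):
            # Apply, in order, the transformers fitted for the requested fold.
            for transformer in self.transformers.get(fold, []):
                dataset = transformer.transform(dataset)
            return dataset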
atomsci/ddm/pipeline/model_datasets.py (54 changes: 47 additions & 7 deletions)

@@ -6,7 +6,6 @@
from deepchem.data import NumpyDataset
import numpy as np
import pandas as pd
- import deepchem as dc
import uuid
from atomsci.ddm.pipeline import featurization as feat
from atomsci.ddm.pipeline import splitting as split
@@ -252,6 +251,19 @@ class ModelDataset(object):
- combined_train_valid_data (dc.Dataset): A dataset object (initialized as None), of the merged train
- and valid splits
+ combined_train_valid_data (dc.NumpyDataset): Cache for combined training and validation data,
+ used by k-fold CV code
+ subset_response_dict (dictionary): Cache for subset-specific response values matched to IDs,
+ used by k-fold CV code
+ subset_weight_dict (dictionary): Cache for subset-specific weights matched to IDs,
+ used by k-fold CV code
+ untransformed_response_dict (dictionary): Cache for untransformed response values
+ matched to IDs, used by k-fold CV code
set in get_featurized_data:
dataset: A new featurized DeepChem Dataset.
@@ -317,8 +329,11 @@ def __init__(self, params, featurization):
self.combined_train_valid_data = None
# Cache for subset-specific response values matched to IDs, used by k-fold CV code
self.subset_response_dict = {}
- # Cache for subset-specific response values matched to IDs, used by k-fold CV code
+ # Cache for subset-specific weights matched to IDs, used by k-fold CV code
self.subset_weight_dict = {}
+ # Cache for untransformed response values matched to IDs, used by k-fold CV code
+ self.untransformed_response_dict = {}


# ****************************************************************************************
def load_full_dataset(self):
@@ -353,6 +368,7 @@ def get_featurized_data(self, params=None):
n_features: The count of features (int)
vals: The response col after featurization (np.array)
attr: A pd.dataframe containing the compound ids and smiles
+ untransformed_dataset: A NumpyDataset containing untransformed data
"""

if params is None:
@@ -379,6 +395,7 @@
if params.prediction_type=='classification':
w = w.astype(np.float32)

+ self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
self.log.info("Using prefeaturized data; number of features = " + str(self.n_features))
return
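get_featurized_data now keeps a pristine, untransformed copy of the featurized data alongside self.dataset, so untransformed response values can be recovered later without undoing transformers. The pattern in miniature, using DeepChem's NumpyDataset with made-up values:

    import numpy as np
    from deepchem.data import NumpyDataset

    features = np.random.rand(4, 8)            # 4 compounds, 8 features
    vals = np.random.rand(4, 1)                # 1 response column
    ids = np.array(['c1', 'c2', 'c3', 'c4'])

    # Copy kept aside before any transformers run...
    untransformed_dataset = NumpyDataset(features, vals, ids=ids)
    # ...and the working copy that transformers will later overwrite.
    dataset = NumpyDataset(features, vals, ids=ids)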
@@ -404,6 +421,7 @@
self.log.debug("Number of features: " + str(self.n_features))

# Create the DeepChem dataset
+ self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
# Checking for minimum number of rows
if len(self.dataset) < params.min_compound_number:
@@ -681,15 +699,13 @@ def has_all_feature_columns(self, dset_df):

# *************************************************************************************

- def get_subset_responses_and_weights(self, subset, transformers):
+ def get_subset_responses_and_weights(self, subset):
"""Returns a dictionary mapping compound IDs in the given dataset subset to arrays of response values
and weights. Used by the perf_data module under k-fold CV.
Args:
subset (string): Label of subset, 'train', 'test', or 'valid'
- transformers: Transformers object for full dataset
Returns:
tuple(response_dict, weight_dict)
(response_dict): dictionary mapping compound ids to arrays of per-task untransformed response values
@@ -703,16 +719,38 @@ def get_subset_responses_and_weights(self, subset):
else:
raise ValueError('Unknown dataset subset type "%s"' % subset)

- y = dc.trans.undo_transforms(dataset.y, transformers)
+ response_vals = dict(zip(dataset.ids, self.get_untransformed_responses(dataset.ids)))

w = dataset.w
- response_vals = dict([(id, y[i,:]) for i, id in enumerate(dataset.ids)])
weights = dict([(id, w[i,:]) for i, id in enumerate(dataset.ids)])
self.subset_response_dict[subset] = response_vals
self.subset_weight_dict[subset] = weights
return self.subset_response_dict[subset], self.subset_weight_dict[subset]

+ # *************************************************************************************
+
+ def get_untransformed_responses(self, ids):
+     """Returns a numpy array of untransformed response values for the given IDs.
+
+     Parameters:
+         ids (list or np.ndarray): List or array of IDs for which to retrieve untransformed response values.
+
+     Returns:
+         np.ndarray: A numpy array of untransformed response values corresponding to the given IDs.
+     """
+     response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
+     if len(self.untransformed_response_dict) == 0:
+         self.untransformed_response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
+
+     for i, id in enumerate(ids):
+         response_vals[i] = self.untransformed_response_dict[id]
+
+     return response_vals
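The new method builds its ID-to-response cache lazily on the first call and reuses it afterwards. The same pattern in miniature (standalone, with invented IDs and values):

    import numpy as np

    ids = np.array(['c1', 'c2', 'c3'])
    y = np.array([[1.0], [2.0], [3.0]])
    cache = {}

    def get_untransformed(requested_ids):
        # Build the lookup dict once, on first use; reuse it afterwards.
        if len(cache) == 0:
            cache.update(zip(ids, y))
        return np.array([cache[i] for i in requested_ids])

    print(get_untransformed(['c3', 'c1']))   # [[3.] [1.]]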

# *************************************************************************************

def _get_split_key(self):
"""Creates the proper CSV name for a split file
@@ -828,6 +866,8 @@ def get_featurized_data(self, dset_df, is_featurized=False):
params, self.contains_responses)
self.log.warning("Done")
self.n_features = self.featurization.get_feature_count()

+ self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
self.dataset = NumpyDataset(features, self.vals, ids=ids)

# ****************************************************************************************
atomsci/ddm/pipeline/model_pipeline.py (15 changes: 3 additions & 12 deletions)

@@ -301,17 +301,10 @@ def load_featurize_data(self, params=None):
# is fitted to the training data only. The transformers are then applied to the training,
# validation and test sets separately.
if not params.split_only:
- self.model_wrapper.create_transformers(self.data)
+ self.model_wrapper.create_transformers(trans.get_all_training_datasets(self.data))
else:
self.run_mode = ''

- if self.run_mode == 'training':
-     for i, (train, valid) in enumerate(self.data.train_valid_dsets):
-         train = self.model_wrapper.transform_dataset(train)
-         valid = self.model_wrapper.transform_dataset(valid)
-         self.data.train_valid_dsets[i] = (train, valid)
-     self.data.test_dset = self.model_wrapper.transform_dataset(self.data.test_dset)
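With this change, create_transformers sees only training data: trans.get_all_training_datasets presumably gathers the training half of each train/valid fold, plus the final training set, so a transformer can be fitted per fold, and the explicit transform loop removed above is no longer needed. A hypothetical sketch of the gathering step (the function body and the combined_training_data helper are assumptions, not the actual pipeline.transformations code):

    def get_all_training_datasets_sketch(model_dataset):
        # Map each fold index to its training subset; 'final' maps to the
        # training data used for the last retraining pass (assumed helper).
        train_datasets = {}
        for fold, (train, _valid) in enumerate(model_dataset.train_valid_dsets):
            train_datasets[fold] = train
        train_datasets['final'] = model_dataset.combined_training_data()
        return train_datasets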

# ****************************************************************************************

def create_model_metadata(self):
@@ -863,8 +856,6 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
raise Exception("response_cols missing from model params")
# Get features for each compound and construct a DeepChem Dataset from them
self.data.get_featurized_data(dset_df, is_featurized)
- # Transform the features and responses if needed
- self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)

# Note that at this point, the dataset may contain fewer rows than the input. Typically this happens because
# of invalid SMILES strings. Remove any rows from the input dataframe corresponding to SMILES strings that were
@@ -995,7 +986,7 @@ def predict_embedding(self, dset_df, dset_params=None):
self.data = model_datasets.create_minimal_dataset(self.params, self.featurization)
self.data.get_featurized_data(dset_df, is_featurized=False)
# Not sure the following is necessary
- self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+ self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

# Get the embeddings as a numpy array
embeddings = self.model_wrapper.generate_embeddings(self.data.dataset)
@@ -1577,7 +1568,7 @@ def ensemble_predict(model_uuids, collections, dset_df, labels=None, dset_params
raise Exception("response_cols missing from model params")
is_featurized = (len(set(pipe.featurization.get_feature_columns()) - set(dset_df.columns.values)) == 0)
pipe.data.get_featurized_data(dset_df, is_featurized)
- pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset)
+ pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset, fold='final')

# Create a temporary data frame to hold the compound IDs and predictions. The model may not
# return predictions for all the requested compounds, so we have to outer join the predictions
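The comment above notes that the model may return predictions for only some of the requested compounds, hence the outer join. A small pandas illustration of that bookkeeping (column names invented):

    import pandas as pd

    input_df = pd.DataFrame({'compound_id': ['c1', 'c2', 'c3'],
                             'smiles': ['CCO', 'c1ccccc1', 'CC(=O)O']})
    pred_df = pd.DataFrame({'compound_id': ['c1', 'c3'],
                            'pred': [0.12, 0.87]})

    # The outer join keeps compounds with no returned prediction;
    # their 'pred' column is left as NaN.
    result_df = input_df.merge(pred_df, on='compound_id', how='outer')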
(Diffs for the remaining 17 changed files are not shown.)
