Bug transformer fitting #385
Changed file 1:
@@ -6,7 +6,6 @@
 from deepchem.data import NumpyDataset
 import numpy as np
 import pandas as pd
-import deepchem as dc
 import uuid
 from atomsci.ddm.pipeline import featurization as feat
 from atomsci.ddm.pipeline import splitting as split
@@ -379,6 +378,7 @@ def get_featurized_data(self, params=None):
         if params.prediction_type=='classification':
             w = w.astype(np.float32)

+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
         self.log.info("Using prefeaturized data; number of features = " + str(self.n_features))
         return
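For context, the added line keeps an untransformed copy of the responses next to the dataset that will later be transformed in place, so original-scale values stay recoverable. A minimal sketch of the pattern using DeepChem's NumpyDataset directly (the toy features, vals, ids, and w arrays are invented for illustration):

import numpy as np
from deepchem.data import NumpyDataset

features = np.random.rand(5, 3)           # featurized compounds
vals = np.random.rand(5, 1)               # response values, original scale
ids = np.array(["CMPD_%d" % i for i in range(5)])
w = np.ones((5, 1), dtype=np.float32)     # per-sample weights

# The untransformed copy is never passed through the transformers, so the
# original-scale responses remain available after dataset.y is transformed.
untransformed_dataset = NumpyDataset(features, vals, ids=ids)
dataset = NumpyDataset(features, vals, ids=ids, w=w)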
@@ -404,6 +404,7 @@ def get_featurized_data(self, params=None):
         self.log.debug("Number of features: " + str(self.n_features))

         # Create the DeepChem dataset
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids, w=w)
         # Checking for minimum number of rows
         if len(self.dataset) < params.min_compound_number:
@@ -681,7 +682,7 @@ def has_all_feature_columns(self, dset_df):

     # *************************************************************************************

-    def get_subset_responses_and_weights(self, subset, transformers):
+    def get_subset_responses_and_weights(self, subset):
         """Returns a dictionary mapping compound IDs in the given dataset subset to arrays of response values
         and weights. Used by the perf_data module under k-fold CV.
@@ -703,16 +704,33 @@ def get_subset_responses_and_weights(self, subset, transformers):
         else:
             raise ValueError('Unknown dataset subset type "%s"' % subset)

-        y = dc.trans.undo_transforms(dataset.y, transformers)
+        response_vals = dict()
+        dataset_ids = set(dataset.ids)
+        for id, y in zip(self.untransformed_dataset.ids, self.untransformed_dataset.y):
+            if id in dataset_ids:
+                response_vals[id] = y

         w = dataset.w
-        response_vals = dict([(id, y[i,:]) for i, id in enumerate(dataset.ids)])
         weights = dict([(id, w[i,:]) for i, id in enumerate(dataset.ids)])
         self.subset_response_dict[subset] = response_vals
         self.subset_weight_dict[subset] = weights
         return self.subset_response_dict[subset], self.subset_weight_dict[subset]
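The replacement logic no longer calls dc.trans.undo_transforms; it reads original-scale responses straight from the stored untransformed dataset, filtered down to the requested subset by ID. A standalone toy illustration of that set-based filtering (all names and data here are invented):

import numpy as np

untransformed_ids = np.array(["a", "b", "c", "d"])
untransformed_y = np.array([[1.0], [2.0], [3.0], [4.0]])
subset_ids = ["b", "d"]                       # e.g. the validation fold

dataset_ids = set(subset_ids)
response_vals = {id: y for id, y in zip(untransformed_ids, untransformed_y)
                 if id in dataset_ids}
assert set(response_vals) == dataset_ids      # the check the reviewers discuss below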
     # *************************************************************************************

+    def get_untransformed_responses(self, ids):
+        """Returns a numpy array of untransformed response values for the given compound IDs.
+        """
+        response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
+        response_dict = dict([(id, y) for id, y in zip(self.untransformed_dataset.ids, self.untransformed_dataset.y)])
+
+        for i, id in enumerate(ids):
+            response_vals[i] = response_dict[id]
+
+        return response_vals
+

[Review comment] you can call …

[Review comment] Where is this function used? Does it make sense to make response_vals a dict and then turn that dict.keys() into a numpy array elsewhere?
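To see how the new method behaves, here is a runnable sketch with a minimal stand-in class; the mock class and its data are hypothetical, and only the lookup logic mirrors the diff:

import numpy as np

class _MockModelDataset:
    """Stand-in for the real class, just to exercise the lookup logic."""
    def __init__(self, ids, y):
        self.untransformed_dataset = type("D", (), {"ids": ids, "y": y})()

    def get_untransformed_responses(self, ids):
        response_vals = np.zeros((len(ids), self.untransformed_dataset.y.shape[1]))
        response_dict = dict(zip(self.untransformed_dataset.ids, self.untransformed_dataset.y))
        for i, id in enumerate(ids):
            response_vals[i] = response_dict[id]
        return response_vals

md = _MockModelDataset(np.array(["a", "b", "c"]), np.array([[1.0], [2.0], [3.0]]))
print(md.get_untransformed_responses(["c", "a"]))   # rows follow the requested ID order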
     # *************************************************************************************

     def _get_split_key(self):
         """Creates the proper CSV name for a split file
@@ -828,6 +846,8 @@ def get_featurized_data(self, dset_df, is_featurized=False):
                                                     params, self.contains_responses)
         self.log.warning("Done")
         self.n_features = self.featurization.get_feature_count()
+
+        self.untransformed_dataset = NumpyDataset(features, self.vals, ids=ids)
         self.dataset = NumpyDataset(features, self.vals, ids=ids)

     # ****************************************************************************************
Changed file 2:
@@ -301,17 +301,10 @@ def load_featurize_data(self, params=None):
         # is fitted to the training data only. The transformers are then applied to the training,
         # validation and test sets separately.
         if not params.split_only:
-            self.model_wrapper.create_transformers(self.data)
+            self.model_wrapper.create_transformers(trans.get_all_training_datasets(self.data))
         else:
             self.run_mode = ''

-        if self.run_mode == 'training':
-            for i, (train, valid) in enumerate(self.data.train_valid_dsets):
-                train = self.model_wrapper.transform_dataset(train)
-                valid = self.model_wrapper.transform_dataset(valid)
-                self.data.train_valid_dsets[i] = (train, valid)
-            self.data.test_dset = self.model_wrapper.transform_dataset(self.data.test_dset)

     # ****************************************************************************************

     def create_model_metadata(self):

[Review comment] this …

[Review comment] I disagree. The structure created by get_all_training_datasets is very specific to how transformers are implemented, and I would like all that code in one place. Eventually I think there should be a TransformerManager object that can build all needed transformers given a dataset.
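The comment in the diff states the intent: transformers are fitted to the training data only and then applied to the training, validation, and test sets separately, so no statistics leak from held-out data. A generic sketch of that fit-on-train pattern using stock DeepChem's NormalizationTransformer; the repo's own create_transformers/get_all_training_datasets path adds per-fold handling that is not reproduced here:

import numpy as np
import deepchem as dc
from deepchem.data import NumpyDataset

# Toy train/validation folds, invented for illustration
train_dset = NumpyDataset(np.random.rand(8, 3), np.random.rand(8, 1))
valid_dset = NumpyDataset(np.random.rand(4, 3), np.random.rand(4, 1))

# Fit normalization statistics on the training fold only...
transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dset)
# ...then apply the same fitted transformer to every subset.
train_dset = transformer.transform(train_dset)
valid_dset = transformer.transform(valid_dset)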
@@ -864,7 +857,7 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
         # Get features for each compound and construct a DeepChem Dataset from them
         self.data.get_featurized_data(dset_df, is_featurized)
         # Transform the features and responses if needed
-        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

         # Note that at this point, the dataset may contain fewer rows than the input. Typically this happens because
         # of invalid SMILES strings. Remove any rows from the input dataframe corresponding to SMILES strings that were
@@ -995,7 +988,7 @@ def predict_embedding(self, dset_df, dset_params=None):
         self.data = model_datasets.create_minimal_dataset(self.params, self.featurization)
         self.data.get_featurized_data(dset_df, is_featurized=False)
         # Not sure the following is necessary
-        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset)
+        self.data.dataset = self.model_wrapper.transform_dataset(self.data.dataset, fold='final')

         # Get the embeddings as a numpy array
         embeddings = self.model_wrapper.generate_embeddings(self.data.dataset)
@@ -1577,7 +1570,7 @@ def ensemble_predict(model_uuids, collections, dset_df, labels=None, dset_params
             raise Exception("response_cols missing from model params")
         is_featurized = (len(set(pipe.featurization.get_feature_columns()) - set(dset_df.columns.values)) == 0)
         pipe.data.get_featurized_data(dset_df, is_featurized)
-        pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset)
+        pipe.data.dataset = pipe.model_wrapper.transform_dataset(pipe.data.dataset, fold='final')

         # Create a temporary data frame to hold the compound IDs and predictions. The model may not
         # return predictions for all the requested compounds, so we have to outer join the predictions
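The new fold='final' argument in these three call sites suggests transform_dataset now selects among per-fold transformer sets, with 'final' naming the transformers fitted on the full training data for production predictions. Since the wrapper itself is not part of this diff, the following is only a hypothetical sketch of such a dispatch:

class ModelWrapperSketch:
    """Hypothetical sketch only; the real ModelWrapper code is not in this PR."""
    def __init__(self, transformers_by_fold):
        # Assumed shape: {0: [...], 1: [...], 'final': [...]}, mapping a k-fold
        # index or 'final' to the transformers fitted on that fold's training data
        self.transformers = transformers_by_fold

    def transform_dataset(self, dataset, fold='final'):
        # Apply the fold's fitted transformers in order
        for transformer in self.transformers[fold]:
            dataset = transformer.transform(dataset)
        return dataset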
[Review comment] Need to deal with missing values if dataset_ids aren't in untransformed_dataset.ids. This shouldn't happen, but here it is skipped silently, in that you could have len(response_vals) < len(dataset_ids).

[Review comment] I think you should call get_untransformed_responses here instead of this for loop? get_untransformed_responses returns an np.array right now, but I don't know if that makes sense for anything, since it could be arbitrary IDs flattened into an array.

[Review comment] OK, I'll add a check here.
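One possible shape for that check, inserted where the subset responses are collected in get_subset_responses_and_weights; this is an assumption, not code from the PR (missing_ids and the error message are invented):

# Hypothetical form of the promised check, not in this PR as of these commits
missing_ids = dataset_ids - set(self.untransformed_dataset.ids)
if len(missing_ids) > 0:
    raise KeyError("no untransformed responses found for %d compound ids" % len(missing_ids))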