Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix: AD index for Mordred descriptors #390

Merged
6 commits merged on Jan 30, 2025 (source and target branch names were not captured)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion atomsci/ddm/pipeline/model_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,10 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
pred_data = self.predict_embedding(dset_df, dset_params=dset_params)
else:
pred_data = copy.deepcopy(self.data.dataset.X)

if self.featurization.feat_type=="computed_descriptors" and self.featurization.descriptor_type=='mordred_filtered':
pred_data = pred_data[:,~np.isnan(pred_data).all(axis=0)]
pred_data = np.where(np.isnan(pred_data), np.nanmean(pred_data, axis=0), pred_data)

try:
if not hasattr(self, 'featurized_train_data'):
Expand All @@ -926,14 +930,17 @@ def predict_full_dataset(self, dset_df, is_featurized=False, contains_responses=
train_dset = dc.data.NumpyDataset(train_X)
self.featurized_train_data = self.model_wrapper.generate_embeddings(train_dset)
else:
if self.featurization.feat_type=="computed_descriptors" and self.featurization.descriptor_type=='mordred_filtered':
train_X = train_X[:,~np.isnan(train_X).all(axis=0)]
train_X = np.where(np.isnan(train_X), np.nanmean(train_X, axis=0), train_X)
self.featurized_train_data = train_X

if not hasattr(self, "train_pair_dis") or not hasattr(self, "train_pair_dis_metric") or self.train_pair_dis_metric != dist_metric:
self.train_pair_dis = pairwise_distances(X=self.featurized_train_data, metric=dist_metric)
self.train_pair_dis_metric = dist_metric

self.log.debug("Calculating AD index.")

if AD_method == "local_density":
result_df["AD_index"] = calc_AD_kmean_local_density(self.featurized_train_data, pred_data, k, train_dset_pair_distance=self.train_pair_dis, dist_metric=dist_metric)
else:
Expand Down
51 changes: 30 additions & 21 deletions atomsci/ddm/test/integrative/ad_index/test_ad_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def clean():
os.remove("./output/"+f)

def test():
"""Test full model pipeline: Curate data, fit model, and predict property for new compounds"""
"""Test AD index calculation: Curate data, fit model, and predict property for new compounds for each feature set"""

# Clean
# -----
Expand All @@ -33,31 +33,40 @@ def test():
python_path = sys.executable
hp_params["script_dir"] = script_dir
hp_params["python_path"] = python_path

for feat in ['ecfp','mordred_filtered','rdkit_raw','graphconv']:
if feat == 'ecfp':
hp_params['featurizer']=feat
elif feat =='graphconv':
hp_params['model_type']='NN'
hp_params['featurizer']=feat
else:
hp_params['featurizer']='computed_descriptors'
hp_params['descriptor_type']=feat
params = parse.wrapper(hp_params)
if not os.path.isfile(params.dataset_key):
params.dataset_key = os.path.join(params.script_dir, params.dataset_key)

params = parse.wrapper(hp_params)
if not os.path.isfile(params.dataset_key):
params.dataset_key = os.path.join(params.script_dir, params.dataset_key)
train_df = pd.read_csv(params.dataset_key)

train_df = pd.read_csv(params.dataset_key)
print(f"Train an RF models with {feat}")
pl = mp.ModelPipeline(params)
pl.train_model()

print("Train a RF models with ECFP")
pl = mp.ModelPipeline(params)
pl.train_model()
print("Calculate AD index with the just trained model.")
pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True, AD_method="z_score")

print("Calculate AD index with the just trained model.")
pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True, AD_method="z_score")
assert("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'

assert("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'

print("Calculate AD index with the saved model tarball file.")
pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path,
input_df=train_df[:10],
id_col="compound_id",
smiles_col="base_rdkit_smiles",
response_col="pKi_mean",
dont_standardize=True,
AD_method="z_score")
assert("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'
print("Calculate AD index with the saved model tarball file.")
pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path,
input_df=train_df[:10],
id_col="compound_id",
smiles_col="base_rdkit_smiles",
response_col="pKi_mean",
dont_standardize=True,
AD_method="z_score")
assert("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'

if __name__ == '__main__':
test()
Loading