-
Notifications
You must be signed in to change notification settings - Fork 17
/
gphsp.py
157 lines (130 loc) · 5.2 KB
/
gphsp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""Common functions for gphsp."""
import dill
import gpflow as gpf
import matplotlib
# Chem libraries
import mordred
import numpy as np
import pandas as pd
import rdkit
import rdkit.Chem as Chem
import scipy.stats as stats
import seaborn as sns
import sklearn.metrics
import tensorflow as tf
from mordred import descriptors as mordred_descriptors
Y_COLS = ['δd', 'δp', 'δh']
def notebook_context():
gpf.config.set_default_float(np.float64)
gpf.config.set_default_summary_fmt("notebook")
sns.set_context('talk', font_scale=1.25)
matplotlib.rcParams['figure.figsize'] = (12,8)
matplotlib.rcParams['lines.linewidth'] = 2
pd.set_option("display.precision", 3)
def print_modules(mods):
for mod in mods:
print(f'{mod.__name__:10s} = {mod.__version__}')
def peek_df(df):
print(df.columns)
print(df.shape)
display(df.head(1))
def get_isomeric_smiles(s):
return Chem.MolToSmiles(Chem.MolFromSmiles(s))
def calculate_mordred(smis):
"""Calculate mordred features."""
mols = np.array([Chem.MolFromSmiles(s) for s in smis])
calc = mordred.Calculator(
mordred_descriptors, ignore_3D=True, version='1.2.1')
mordred_df = calc.pandas(mols.tolist())
values = mordred_df.to_numpy(np.float32)
return values
def calculate_mask(values):
"""Calculate mask for mordred features."""
nan_mask = np.sum(np.isnan(values), axis=0) == 0
const_mask = np.array([len(np.unique(v)) > 1 for v in values.T])
mask = np.logical_and(nan_mask, const_mask)
return mask
def save_model(model, fname):
assert fname.endswith('.pkl'), f'Check your filename={fname}'
with open(fname, "wb") as f:
dill.dump(model, f)
def save_gp(model, adir, in_dim=6):
model.predict = tf.function(model.predict_f,
input_signature=[tf.TensorSpec(shape=[None, in_dim], dtype=tf.float64)])
tf.saved_model.save(model, adir)
def load_gp(adir):
return tf.saved_model.load(adir)
def make_gp(x, y, use_ard, parts=None, const_mean = True):
kernel = None
parts = parts or {'all':None}
for part in parts.values():
if use_ard:
ard = tf.convert_to_tensor(np.ones_like(parts).astype(np.float64))
kernel_part = gpf.kernels.SquaredExponential(lengthscales=ard, active_dims=part)
else:
kernel_part = gpf.kernels.SquaredExponential(active_dims=part)
kernel = kernel_part if kernel is None else kernel + kernel_part
mean_fn = gpf.mean_functions.Constant() if const_mean else None
model = gpf.models.GPR(data=(x, y),
kernel=kernel,
mean_function=mean_fn)
opt = gpf.optimizers.Scipy()
opt_logs = opt.minimize(model.training_loss,
model.trainable_variables,
options=dict(maxiter=1000))
return model
def predictions_as_features(x, model_dict, pred_fn=None):
default_pred_fn = lambda model, inputs: model.pred_dist(inputs)
pred_fn = pred_fn or default_pred_fn
new_x = np.zeros((len(x), 6), dtype=np.float64)
for index, model in enumerate(model_dict.values()):
y_mol_dist = pred_fn(model, x)
new_x[:,index] = y_mol_dist.mean()
new_x[:,index+3] = y_mol_dist.mean()
return new_x
def cast_1d_array(a):
"""Flatten array or tensor."""
a = a.numpy() if tf.is_tensor(a) else a
assert a.ndim == 1 or a.shape[1]==1, f'Expected 1D array {a.shape}'
return a.ravel()
def predict_from_model_dict(x, model_dict):
return {key: cast_1d_array(model.predict(x)) for key, model in model_dict.items()}
def load_model(fname):
assert fname.endswith('.pkl'), f'Check your filename={fname}'
with open(fname, "rb") as f:
model = dill.load(f)
return model
def evaluate(y_true, y_pred, y_std=None, info=None):
"""Evaluate predictions."""
y_true = cast_1d_array(y_true)
y_pred = cast_1d_array(y_pred)
stat = info or {}
stat['R2'] = sklearn.metrics.r2_score(y_true, y_pred)
stat['MAE'] = sklearn.metrics.mean_absolute_error(y_true, y_pred)
stat['tau'] = stats.kendalltau(y_true, y_pred).correlation
if y_std is not None:
y_error = np.abs(y_true-y_pred)
stat['uncertainty_tau'] = stats.kendalltau(y_error, y_std).correlation
return stat
class SmilesMap:
"""Class to map smiles to values."""
def __init__(self, fname):
data = dict(np.load(fname))
self.values = data['values']
smi = data['smiles']
self.mask = data['mask']
self.index = pd.Series(np.arange(len(smi)), index=smi)
def __call__(self, inputs):
index = self.index.loc[inputs].values
return self.values[index][:, self.mask]
def update(self, new_smi, new_values):
needs_update = np.array([not s in self.index.index for s in new_smi])
if needs_update.sum()!=len(new_smi):
raise ValueError('Provide only new smiles and features')
if len(new_smi) != len(new_values) or new_values.ndim!=2:
raise ValueError('Inconsistent shapes')
n = len(self.index)
new_index = pd.Series(np.arange(n, n+len(new_smi)), index=new_smi)
self.index = self.index.append(new_index)
self.values = np.vstack((self.values, new_values))
return self