-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun.py
135 lines (104 loc) · 4.14 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pathlib
from multiprocessing import freeze_support
import numpy
from matplotlib import pyplot
from yammbs import MoleculeStore
from yammbs.inputs import QCArchiveDataset
from yammbs.outputs import MetricCollection
def main():
force_fields = [
"openff-1.0.0",
"openff-1.1.0",
"openff-1.2.0",
"openff-1.3.0",
"openff-2.0.0",
"openff-2.1.0",
"gaff-1.8",
"gaff-2.11",
"de-force-1.0.1.offxml",
]
# pre-processed data is stored in the repository, just a larger dataset
# filtered by elements, takes values of "ch" (~40) and "cho" (~1600 molecules)
data_key = "cho"
dataset = QCArchiveDataset.model_validate_json(
open(f"yammbs/_tests/data/yammbs/01-processed-qm-{data_key}.json").read(),
)
# This file is also store at f"yammbs/_tests/data/yammbs/01-processed-qm-{data_key}.json"
if pathlib.Path(f"{data_key}.sqlite").exists():
store = MoleculeStore(f"{data_key}.sqlite")
else:
store = MoleculeStore.from_qm_dataset(
dataset,
database_name=f"{data_key}.sqlite",
)
for force_field in force_fields:
# This is called within each analysis method, but short-circuiting within them. It's convenient to call it here
# with the freeze_support setup so that later analysis methods can trust that the MM conformers are there
store.optimize_mm(force_field=force_field, n_processes=10)
with open("minimized.json", "w") as f:
f.write(store.get_outputs().model_dump_json())
with open("metrics.json", "w") as f:
f.write(store.get_metrics().model_dump_json())
plot(force_fields)
def plot(force_fields: list[str]):
metrics = MetricCollection.parse_file("metrics.json")
x_ranges = {
"dde": (-16.0, 16.0),
"rmsd": (-0.3, 3.3),
"tfd": (-0.05, 0.55),
}
# metrics are stored with force field at the top of the hierarchy,
# restructure it so that the type of metric is at the top
# these each keep the qcarchive_id in case it's useful, though they're not
# used in this script
ddes: dict[str, dict[int, float]] = {
force_field: {key: val.dde for key, val in metrics.metrics[force_field].items()}
for force_field in metrics.metrics.keys()
}
rmsds = {
force_field: {key: val.rmsd for key, val in metrics.metrics[force_field].items()}
for force_field in metrics.metrics.keys()
}
tfds = {
force_field: {key: val.tfd for key, val in metrics.metrics[force_field].items()}
for force_field in metrics.metrics.keys()
}
data = {
"dde": ddes,
"rmsd": rmsds,
"tfd": tfds,
}
for key in ["dde", "rmsd", "tfd"]:
figure, axis = pyplot.subplots()
for force_field in force_fields:
if key == "dde":
_data = numpy.array(
[*data[key][force_field].values()],
dtype=float,
)
counts, bins = numpy.histogram(
_data[numpy.isfinite(_data)],
bins=numpy.linspace(-15, 15, 31),
)
axis.stairs(counts, bins, label=force_field)
axis.set_ylabel("Count")
else:
sorted_data = numpy.sort([*data[key][force_field].values()])
axis.plot(
sorted_data,
numpy.arange(1, len(sorted_data) + 1) / len(sorted_data),
"-",
label=force_field,
)
axis.set_xlabel(key)
axis.set_ylabel("CDF")
axis.set_xlim(x_ranges[key])
axis.set_ylim((-0.05, 1.05))
axis.legend(loc=0)
figure.savefig(f"{key}.png", dpi=300)
if __name__ == "__main__":
# This setup is necessary for reasons that confused me - both setting it up in the __main__ block and calling
# freeze_support(). This is probably not necessary after MoleculeStore.optimize_mm() is called, so you can load up
# the same database for later analysis once the MM conformers are stored
freeze_support()
main()