"""
Searches for the best model/hyperparams/training for the Mags/Extra-Features model
"""
# pylint: disable=too-many-arguments, too-many-locals
from pathlib import Path
from contextlib import redirect_stdout
import sys
import json
from io import StringIO
from datetime import timedelta
import traceback
from warnings import filterwarnings
import numpy as np
import tensorflow as tf
import keras
from keras import layers, optimizers, callbacks as cb
from tensorflow.python.framework.errors_impl import OpError # pylint: disable=no-name-in-module
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope
from ebop_maven import deb_example
from traininglib.tee import Tee
from traininglib import model_search_helpers
from traininglib.datasets import create_map_func, create_dataset_pipeline
from traininglib.keras_custom.callbacks import TrainingTimeoutCallback
import make_trained_cnn_model
# Configure the inputs and outputs of the model
CHOSEN_FEATURES = []
MAGS_BINS = deb_example.default_mags_bins
MAGS_WRAP_PHASE = None # None indicates wrap to centre on midpoint between eclipses
CHOSEN_LABELS = ["rA_plus_rB", "k", "J", "ecosw", "esinw", "bP"]
TRAINSET_NAME = "formal-training-dataset-500k"
TRAINSET_GLOB_TERM = "trainset00?.tfrecord" # Just the first 10 files, so 80k train/20k validation
TRAINSET_DIR = Path(".") / "datasets" / TRAINSET_NAME / "training"
VALIDSET_DIR = Path(".") / "datasets" / TRAINSET_NAME / "validation"
TESTSET_DIR = Path(".") / "datasets" / "synthetic-mist-tess-dataset"
FORMAL_TESTSET_DIR = Path(".") / "datasets" / "formal-test-dataset"
MODEL_FILE_NAME = "search-model"
MAX_HYPEROPT_EVALS = 200 # Maximum number of distinct Hyperopt evals to run
HYPEROPT_LOSS_TH = 0.01 # Will stop search in the unlikely event we get below this loss
TRAINING_EPOCHS = 250 # Set high if we're using early stopping
BATCH_FRACTION = 0.001 # larger -> quicker training per epoch but more epochs to converge
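# Worked example (assuming create_dataset_pipeline derives the batch size as
# BATCH_FRACTION * instance count): with the ~80k training instances implied by
# TRAINSET_GLOB_TERM above, 0.001 gives batches of ~80, so ~1,000 steps per epoch.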
MAX_BUFFER_SIZE = 20000000 # Size of Dataset shuffle buffer (in instances)
TRAIN_PATIENCE = 7 # Number of epochs w/o improvement before training is stopped
TRAIN_TIMEOUT = timedelta(hours=1) # Timeout training if not completed within this time
SEED = 42 # Standard random seed ensures repeatable randomization
# Sets the random seed on python, numpy and keras's backend library (in this case tensorflow)
keras.utils.set_random_seed(SEED)
results_dir = Path(".") / "drop" / "model_search_current"
results_dir.mkdir(parents=True, exist_ok=True)
# -----------------------------------------------------------
# Set up the test datasets - we don't need to recreate per trial
# -----------------------------------------------------------
# No added noise or roll as this is already present in the datasets
test_mfunc = create_map_func(mags_bins=MAGS_BINS, mags_wrap_phase=MAGS_WRAP_PHASE,
ext_features=CHOSEN_FEATURES, labels=CHOSEN_LABELS)
test_ds, test_ct = [None] * 2, [None] * 2 # placeholders populated in the loop below
for ti, (tname, tdir) in enumerate(zip(["trial", "real"], [TESTSET_DIR, FORMAL_TESTSET_DIR])):
tfiles = list(tdir.glob("**/*.tfrecord"))
test_ds[ti], test_ct[ti] = create_dataset_pipeline(tfiles, BATCH_FRACTION, test_mfunc)
print(f"Found {test_ct[ti]:,} {tname} test insts in {len(tfiles)} tfrecord files in", tdir)
# -----------------------------------------------------------
# Define the target model and the hyperparameter space
# -----------------------------------------------------------
#pylint: disable=line-too-long
scope.define(layers.ReLU)
scope.define(layers.LeakyReLU)
scope.define(layers.PReLU)
scope.define(optimizers.Adam)
scope.define(optimizers.Nadam)
# Shared choices: define the ranges, choices etc. consistently, but keep the hp.*
# func calls specific to model type so fmin/TPE has free rein across each.
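# For example, "best_cnn_activation" and "free_cnn_activation" draw from the same
# cnn_activation_choices list but are sampled and learned independently by TPE,
# whereas the genuinely shared dnn_kernel_initializer_choice node defined below is
# sampled once per trial and reused wherever it appears.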
cnn_strides_pchoices = [(0.75, None), (0.125, 2), (0.125, 4)]
cnn_strides_fraction_kwargs = { "low": 0.25, "high": 1, "q": 0.25 }
cnn_pooling_type_choices_best = [layers.AvgPool1D, layers.MaxPool1D] # cannot be None
cnn_pooling_type_choices = [layers.AvgPool1D, layers.MaxPool1D, None]
cnn_padding_choices = ["same", "valid"]
cnn_activation_choices = ["relu"]
dnn_initializer_choices = ["he_uniform", "he_normal", "glorot_uniform"]
dnn_activation_choices = ["relu", "leaky_relu", "elu"]
loss_function_choices = [["mae"], ["mse"]] #, ["huber"]]
lr_qlogu_kwargs = { "low": -9, "high": -4.5, "q": 1e-4 } # ~1.1e-2 down to ~1.2e-4
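# hyperopt's qloguniform draws round(exp(uniform(low, high)) / q) * q, hence the
# range above: exp(-9) ~= 1.2e-4 up to exp(-4.5) ~= 1.1e-2, in steps of q = 1e-4.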
sgd_momentum_uniform_kwargs = { "low": 0.0, "high": 1.0 }
lr_exp_decay_rate_kwargs = { "low": 0.90, "high": 0.95 }
lr_cos_warmup_steps_kwargs = { "low": 0, "high": 3000, "q": 1000 }
lr_cos_decay_steps_kwargs = { "low": 50000, "high": 150000, "q": 10000 }
lr_cos_alpha_choices = [0.01, 0.001]
lr_pw_boundary_choices = [[5000, 30000], [20000, 40000]]
lr_pw_value_choices = [[0.001, 0.0001, 0.00001], [0.0005, 0.00005, 0.00001], [0.0001, 0.001, 0.0001]]
# Genuinely shared choice between DNN layers and output layer
dnn_kernel_initializer_choice = hp.choice("dnn_init", dnn_initializer_choices)
# This will augment the model, giving an Estimator context information
metadata = {
"extra_features_and_defaults":
{f: deb_example.extra_features_and_defaults[f] for f in CHOSEN_FEATURES },
"mags_bins": MAGS_BINS,
"mags_wrap_phase": MAGS_WRAP_PHASE,
"labels_and_scales": {l: deb_example.labels_and_scales[l] for l in CHOSEN_LABELS},
"trainset_name": TRAINSET_NAME
}
trials_pspace = hp.choice("train_and_test_model", [
{
"description": "Best: current best model structure with varied dnn, hyperparams and training",
"model": {
"func": make_trained_cnn_model.make_best_model,
"model_name": "Model-Search-Variation-On-Best",
"chosen_features": CHOSEN_FEATURES,
"mags_bins": MAGS_BINS,
"mags_wrap_phase": MAGS_WRAP_PHASE,
"chosen_labels": CHOSEN_LABELS,
"trainset_name": TRAINSET_NAME,
"cnn_activation": hp.choice("best_cnn_activation", cnn_activation_choices),
"cnn_pooling": hp.choice("best_cnn_pooling", cnn_pooling_type_choices_best),
"dnn_num_layers": hp.uniformint("best_dnn_num_layers", low=1, high=4),
"dnn_num_units": hp.quniform("best_dnn_num_units", low=128, high=512, q=64),
"dnn_initializer": hp.choice("best_dnn_initializer", dnn_initializer_choices),
"dnn_activation": hp.choice("best_dnn_activation", dnn_activation_choices),
"dnn_dropout_rate": hp.quniform("best_dnn_dropout", low=0.3, high=0.7, q=0.1),
"dnn_num_taper_units": hp.quniform("best_dnn_taper_units", low=0, high=128, q=32),
"verbose": True
},
"optimizer": hp.choice("best_optimizer", [
{
"class": optimizers.Adam,
"learning_rate": hp.choice("best_adam_lr", [
hp.qloguniform("best_adam_lr_fixed", **lr_qlogu_kwargs),
{
"class": optimizers.schedules.ExponentialDecay,
"initial_learning_rate": hp.qloguniform("best_adam_exp_lr", **lr_qlogu_kwargs),
"decay_steps": 1000,
"decay_rate": hp.uniform("best_adam_exp_dr", **lr_exp_decay_rate_kwargs),
"staircase": hp.choice("best_adam_exp_staircase", [True, False])
},
{
"class": optimizers.schedules.CosineDecay,
"initial_learning_rate": 0.0,
"warmup_steps": hp.quniform("best_adam_cos_warmup_steps", **lr_cos_warmup_steps_kwargs),
"warmup_target": hp.qloguniform("best_adam_cos_warmup_target", **lr_qlogu_kwargs),
"decay_steps": hp.quniform("best_adam_cos_decay_steps", **lr_cos_decay_steps_kwargs),
"alpha": hp.choice("best_adam_cos_alpha", lr_cos_alpha_choices),
},
{
"class": optimizers.schedules.PiecewiseConstantDecay,
"boundaries": hp.choice("best_adam_pw_boundaries", lr_pw_boundary_choices),
"values": hp.choice("best_adam_pw_values", lr_pw_value_choices),
},
])
},
{
"class": optimizers.Nadam,
"learning_rate": hp.choice("best_nadam_lr", [
hp.qloguniform("best_nadam_lr_fixed", **lr_qlogu_kwargs),
{
"class": optimizers.schedules.ExponentialDecay,
"initial_learning_rate": hp.qloguniform("best_nadam_exp_lr", **lr_qlogu_kwargs),
"decay_steps": 1000,
"decay_rate": hp.uniform("best_nadam_exp_dr", **lr_exp_decay_rate_kwargs),
"staircase": hp.choice("best_nadam_exp_staircase", [True, False])
},
{
"class": optimizers.schedules.CosineDecay,
"initial_learning_rate": 0.0,
"warmup_steps": hp.quniform("best_nadam_cos_warmup_steps", **lr_cos_warmup_steps_kwargs),
"warmup_target": hp.qloguniform("best_nadam_cos_warmup_target", **lr_qlogu_kwargs),
"decay_steps": hp.quniform("best_nadam_cos_decay_steps", **lr_cos_decay_steps_kwargs),
"alpha": hp.choice("best_nadam_cos_alpha", lr_cos_alpha_choices),
},
{
"class": optimizers.schedules.PiecewiseConstantDecay,
"boundaries": hp.choice("best_nadam_pw_boundaries", lr_pw_boundary_choices),
"values": hp.choice("best_nadam_pw_values", lr_pw_value_choices),
},
])
},
# { # Covers both vanilla SGD and Nesterov momentum
# "class": optimizers.SGD,
# "learning_rate": hp.qloguniform("best_sgd_lr", **lr_qlogu_kwargs),
# "momentum": hp.uniform("best_sgd_momentum", **sgd_momentum_uniform_kwargs),
# "nesterov": hp.choice("best_sgd_nesterov", [True, False])
# }
]),
"loss_function": hp.choice("best_loss", loss_function_choices),
},
# {
# "description": "Free: explore model structure and hyperparams",
# "model": {
# "func": modelling.build_mags_ext_model,
# "name": "Model-Search-Free-Structure",
# "mags_input": {
# "func": modelling.mags_input_layer,
# "shape": (MAGS_BINS, 1),
# },
# "ext_input": {
# "func": modelling.ext_input_layer,
# "shape": (len(CHOSEN_FEATURES), 1),
# },
# "mags_layers": hp.choice("free_mags_layers", [
# {
# # Pairs of Conv1ds with fixed filters/kernels/strides and optional pooling layers
# "func": model_search_helpers.cnn_fixed_pairs_with_pooling,
# "num_pairs": hp.uniformint("free_cnn_fixed_num_pairs", low=2, high=4),
# "filters": hp.quniform("free_cnn_fixed_filters", low=32, high=96, q=16),
# "kernel_size": hp.quniform("free_cnn_fixed_kernel_size", low=4, high=12, q=4),
# # For strides None defers to strides_fraction
# "strides": hp.pchoice("free_cnn_fixed_strides", cnn_strides_pchoices),
# "strides_fraction": hp.quniform("free_cnn_fixed_strides_fraction", **cnn_strides_fraction_kwargs),
# "padding": hp.choice("free_cnn_fixed_padding", cnn_padding_choices),
# "activation": hp.choice("free_cnn_fixed_activation", cnn_activation_choices),
# "pooling_type": hp.choice("free_cnn_fixed_pooling_type", cnn_pooling_type_choices),
# "trailing_pool": hp.choice("free_cnn_fixed_trailing_pool", [True, False]),
# },
# {
# # Pairs of Conv1ds with doubling filters & halving kernels/strides per pair
# # and optional pooling layers
# "func": model_search_helpers.cnn_scaled_pairs_with_pooling,
# "num_pairs": hp.choice("free_cnn_scaled_num_pairs", [3]),
# "filters": hp.quniform("free_cnn_scaled_filters", low=16, high=32, q=16),
# "kernel_size": hp.quniform("free_cnn_scaled_kernel_size", low=8, high=32, q=8),
# # For strides None defers to strides_fraction
# "strides": hp.pchoice("free_cnn_scaled_strides", cnn_strides_pchoices),
# "strides_fraction": hp.quniform("free_cnn_scaled_strides_fraction", **cnn_strides_fraction_kwargs),
# "scaling_multiplier": 2,
# "padding": "same",
# "activation": hp.choice("free_cnn_scaled_activation", cnn_activation_choices),
# "pooling_type": hp.choice("free_cnn_scaled_pooling_type", cnn_pooling_type_choices),
# "trailing_pool": hp.choice("free_cnn_scaled_trailing_pool", [True, False]),
# },
# {
# # Randomized CNN with/without pooling.
# "func": model_search_helpers.cnn_with_pooling,
# "num_layers": hp.uniformint("free_cnn_num_layers", low=3, high=5),
# "filters": hp.quniform("free_cnn_filters", low=32, high=64, q=16),
# "kernel_size": hp.quniform("free_cnn_kernel_size", low=4, high=16, q=4),
# # For strides None defers to strides_fraction
# "strides": hp.pchoice("free_cnn_strides", cnn_strides_pchoices),
# "strides_fraction": hp.quniform("free_cnn_strides_fraction", **cnn_strides_fraction_kwargs),
# "padding": hp.choice("free_cnn_padding", cnn_padding_choices),
# "activation": hp.choice("free_cnn_activation", cnn_activation_choices),
# "pooling_ixs": hp.choice("free_cnn_pooling_ixs", [None, [2], [2, 5]]),
# "pooling_type": hp.choice("free_cnn_pooling_type", cnn_pooling_type_choices),
# },
# ]),
# "ext_layers": None,
# "dnn_layers": hp.choice("dnn_layers", [
# {
# "func": model_search_helpers.dnn_with_taper,
# "num_layers": hp.uniformint("free_dnn_num_layers", low=1, high=4),
# "units": hp.quniform("free_dnn_units", low=128, high=512, q=64),
# "kernel_initializer": dnn_kernel_initializer_choice,
# "activation": hp.choice("free_dnn_activation", dnn_activation_choices),
# "dropout_rate": hp.quniform("free_dnn_dropout", low=0.3, high=0.7, q=0.1),
# "taper_units": hp.quniform("free_dnn_taper", low=0, high=128, q=32),
# },
# ]),
# "output": {
# "func": modelling.output_layer,
# "metadata": metadata,
# "kernel_initializer": dnn_kernel_initializer_choice,
# "activation": "linear"
# },
# },
# "optimizer": hp.choice("free_optimizer", [
# {
# "class": optimizers.Adam,
# "learning_rate": hp.qloguniform("free_adam_lr", **lr_qlogu_kwargs)
# },
# {
# "class": optimizers.Nadam,
# "learning_rate": hp.qloguniform("free_nadam_lr", **lr_qlogu_kwargs)
# },
# # { # Covers both vanilla SGD and Nesterov momentum
# # "class": optimizers.SGD,
# # "learning_rate": hp.qloguniform("free_sgd_lr", **sgd_lr_qlogu_kwargs),
# # "momentum": hp.uniform("free_sgd_momentum", **sgd_momentum_uniform_kwargs),
# # "nesterov": hp.choice("free_sgd_nesterov", [True, False])
# # }
# ]),
# "loss_function": hp.choice("free_loss", loss_function_choices),
# }
])
# -----------------------------------------------------------
# Trials functions
# -----------------------------------------------------------
def train_and_test_model(trial_kwargs):
"""
Evaluate a single set of hyperparams by building, training and evaluating a model on them.
"""
print("\n" + "-"*80 + "\n",
"Evaluating model and hyperparameters based on the following trial_kwargs:\n",
json.dumps(trial_kwargs, indent=4, sort_keys=False, default=str) + "\n")
weighted_loss = candidate = None
status = STATUS_FAIL
# Reset so shuffling & other tf "random" behaviour is repeated for each trial
tf.random.set_seed(SEED)
# Set up the training and validation dataset pipelines
# Redo this every trial so we can potentially include these pipeline params in the search
tr_mfunc = create_map_func(mags_bins=MAGS_BINS, mags_wrap_phase=MAGS_WRAP_PHASE,
ext_features=CHOSEN_FEATURES, labels=CHOSEN_LABELS,
augmentation_callback=make_trained_cnn_model.augmentation_callback)
    train_ds, train_ct = [None] * 2, [None] * 2 # placeholders populated in the loop below
    for ix, (dn, dd) in enumerate(zip(["training", "validation"], [TRAINSET_DIR, VALIDSET_DIR])):
        files = list(dd.glob(TRAINSET_GLOB_TERM))
        train_ds[ix], train_ct[ix] = create_dataset_pipeline(files, BATCH_FRACTION, tr_mfunc, None,
                                                             True, True, MAX_BUFFER_SIZE, seed=SEED)
print(f"Found {train_ct[ix]:,} {dn} instances over {len(files)}",
f"tfrecord file(s) matching glob '{TRAINSET_GLOB_TERM}' within", dd)
# Set up the training optimizer, loss and metrics
optimizer = model_search_helpers.get_trial_value(trial_kwargs, "optimizer")
loss_function = model_search_helpers.get_trial_value(trial_kwargs, "loss_function")
if not isinstance(loss_function, list):
loss_function = [loss_function]
fixed_metrics = ["mae", "mse", "r2_score"]
try:
# Build and Compile the trial model
# always use the same metrics as we use them for trial evaluation
candidate = model_search_helpers.get_trial_value(trial_kwargs, "model", False)
candidate.compile(optimizer=optimizer, loss=loss_function, metrics=fixed_metrics)
# Capture a summary of the newly built model
with StringIO() as stream:
candidate.summary(line_length=120, show_trainable=True,
print_fn=lambda line: stream.write(line + "\n"))
model_summary = stream.getvalue()
print(model_summary, "\n")
train_callbacks = [
cb.EarlyStopping("val_loss", min_delta=5e-5, restore_best_weights=True,
patience=TRAIN_PATIENCE, start_from_epoch=5, verbose=1),
TrainingTimeoutCallback(TRAIN_TIMEOUT, verbose=1)
]
candidate.fit(x=train_ds[0], epochs=TRAINING_EPOCHS, callbacks=train_callbacks,
validation_data=train_ds[1], verbose=2)
print(f"\nTrial evaluation of model against {test_ct[0]} test dataset instances.")
results = candidate.evaluate(x=test_ds[0], y=None, verbose=2)
print(f"\n'For info' evaluation against {test_ct[1]} formal-test dataset instances.")
candidate.evaluate(x=test_ds[1], y=None, verbose=2)
        # Our final loss is always the MAE from the metrics. This allows us to vary the
        # training loss function while using a consistent metric for trial evaluation.
mae = results[1 + fixed_metrics.index("mae")]
mse = results[1 + fixed_metrics.index("mse")]
# The trial can be evaluated on a "weighted loss"; the loss modified with a penalty
# on model complexity/#params (which is approximated from the number of trainable params).
        weights = int(sum(np.prod(w.shape) for w in candidate.trainable_weights))
params = np.log(weights)
weighted_loss = mse * params
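        # e.g. ~1e6 trainable weights gives params = ln(1e6) ~= 13.8 while ~1e5 gives
        # ~11.5, so for a similar MSE there is a mild penalty on larger models.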
status = STATUS_OK
features = sum(candidate.get_layer(f"{n}-Input").output.shape[1] for n in ["Mags", "Ext"])
aic = features*np.log(mse) + 2*params
bic = features*np.log(mse) + np.log(features)*params
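        # These follow the Gaussian-error forms AIC = n*ln(MSE) + 2k and
        # BIC = n*ln(MSE) + k*ln(n), here taking n as the total input width
        # (mags bins + ext features) and k as params = ln(#trainable weights).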
print(f"""
{'-'*80}
Trial result: MAE = {mae:.6f} and MSE = {mse:.6f}
count(trainable weights) = {weights:,d} yielding params(ln[weights]) = {params:.6f} and:
weighted loss(MSE*params) = {weighted_loss:.6f}
AIC = {aic:,.3f}
BIC = {bic:,.3f}
{'-'*80}
""")
except Exception as exc:
# Log then rethrow any exception
print(f"\n*** Training failed! *** Caught a {type(exc).__name__}")
if isinstance(exc, OpError):
print(f"Details: {exc.op} / {exc.message}")
else:
print(f"Details: {exc}")
print(f"The call stack is: {traceback.print_exc()}\n")
raise
# Make sure anything returned is easily serialized into a pickle so we can save progress.
return { "loss": mae, "status": status, "mae": mae, "mse": mse,
"weighted_loss": weighted_loss, "AIC": aic, "BIC": bic,
"model_summary": model_summary }
def early_stopping_to_report_progress(this_trials, *early_stop_args):
"""
Callback for early stopping. We don't use it for early stopping but it is
useful for reporting on the current status of the trials without the
progress_bar making a mess when capturing the stdout output to a file.
We also use it to save the current best model details & params as we go.
"""
no_no_dont_stop = False # Will stop the trial if set to True
if this_trials and this_trials.best_trial:
# tids seem to be zero based; +1 for iteration
best_iter = this_trials.best_trial.get("tid", None) + 1
br = this_trials.best_trial.get("result", {})
this_iter = len(this_trials._ids or []) # pylint: disable=protected-access
print("\n" + "="*80 + "\n",
f"[{this_iter}/{MAX_HYPEROPT_EVALS}] Best Trial: #{best_iter},",
f"status={br.get('status', None)}, loss={br.get('loss', 0):.6f},",
f"MAE={br.get('mae', 0):.6f}, MSE={br.get('mse', 0):.6f}",
"\n" + "="*80 + "\n")
if this_iter == best_iter:
            # This is the new best: persist its details so we don't lose them
model_summary = br.get("model_summary", None)
summary_file = results_dir / "best_model_summary.txt"
if model_summary:
print(f"Saving model summary to {summary_file}")
with open(summary_file, mode="w", encoding="utf8") as f:
f.write(model_summary)
else:
summary_file.unlink(missing_ok=True)
            # We have to convert the misc vals dict, which has items like "activation": [2],
            # into the form returned by fmin: the equivalent dict where the above is "activation": 2.
            # With that we can use the space_eval() func to resolve the actual values from the vals.
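            # e.g. {"best_dnn_activation": [2], "free_dnn_units": []} becomes
            # {"best_dnn_activation": 2}, which space_eval() then resolves to "elu"
            # via dnn_activation_choices.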
if "misc" in this_trials.best_trial:
vals = this_trials.best_trial["misc"].get("vals", {})
vals = { k: v[0] for (k, v) in vals.items() if v }
if vals:
params = space_eval(trials_pspace, vals)
params_file = results_dir / "best_model_params.json"
print(f"Saving parameters to {params_file}")
with open(params_file, mode="w", encoding="utf8") as f:
json.dump(params, f, indent=4, default=str)
return no_no_dont_stop, early_stop_args
# -----------------------------------------------------------
# Conduct the trials
# -----------------------------------------------------------
if __name__ == "__main__":
trials_log_file = results_dir / "trials.log"
trials_save_file = results_dir / "trials.pkl"
if trials_save_file.exists():
print(f"\nFound an existing trials progress file ({trials_save_file}) from which",
"hyperopt will attempt to resume the trials.\nIf you don't want to resume",
"these trials now is your chance to delete this progress file before continuing.")
input("Press enter to continue or Ctrl+C | Ctrl+Z to exit.")
resume = trials_save_file.exists()
with redirect_stdout(Tee(open(trials_log_file, "a" if resume else "w", encoding="utf8"))):
if resume:
print("\nResuming the previous Hyperopt trials...\n")
else:
print("\nStarting a new Hyperopt trials\n")
# CUDA_VISIBLE_DEVICES is set for the ebop_maven conda env. A value of "-1" suppresses GPUs,
# useful for repeatable results (Keras advises that out of order processing within GPU/CUDA
# can lead to varying results) and also avoiding memory constraints on smaller GPUs (mine!).
print("Runtime environment:", sys.prefix.replace("'", ""))
print("\n".join(f"{lib.__name__} v{lib.__version__}" for lib in [tf, keras]))
print(f"tensorflow can see {len(tf.config.list_physical_devices('GPU'))} physical GPU(s)")
# This appears to be a known false alarm raised from tf 2.16 which can safely be ignored
# See https://github.com/tensorflow/tensorflow/issues/62963
filterwarnings("ignore", "Your input ran out of data; interrupting training", UserWarning)
# The Tree-structured Parzen Estimator (TPE) Hyperparameter Optimization algorithm (HOA)
# see
# https://proceedings.neurips.cc/paper_files/paper/2011/hash/86e8f7ab32cfd12577bc2619bc635690-Abstract.html
# https://proceedings.mlr.press/v28/bergstra13.html
#
# Tree-structured refers to the configuration space, whereby some parameters are only
# well-defined when other parameters have particular values (e.g. #units in DNN layer 3
# is only well defined with #layers >= 3). In this case #units is a leaf & #layers a node.
#
# 1 initial trial steps are a stochastic search over the parameter space
# - models are evaluated against the desired objective/loss function
# - this initializes the trials history (parameter sets & objective value)
# - leads to an approximation of regions of the param space which lead to good/bad models
# 2 split the parameter space based on some quantile threshold \gamma
# - (e.g. \gamma=0.2 will split it 20%/80%)
# - Parzen Estimation (aka Kernel Density Estimation [KDE])
# - density estimation; an example is a histogram - a means of measuring the density
# (# measurements) for each "bin" within a range to estimate their distribution
# - estimators based on gaussian KDEs
# - fit a Gaussian KDE over each of the good and bad distributions
# - l(x) (aka "good" distribution) the ##% of each param values which evaluate the best
# - g(x) (aka "bad" distribution) the remaining param values which evaluate less well
        # 3 determine the next potentially "best" parameter set to test
# - for each param x;
# - draw random samples from l(x) (good dist)
# - evaluate each sample wrt l(x)/g(x) and select value which maximizes this
# 4 derive model from selected set
# - evaluate model against desired objective/loss function
# - result added to trials history
# 5 repeat steps 2 to 4 until total number of trials reached or params exhausted
# 6 select parameter set associated with the "best" objective measurement
#
# In the hyperopt paper (Bergstra+2013hyperopt) this is summarized as
# - on each iteration, for each hyperparameter
# - fits a Gaussian Mixture Model (GMM) - l(x) - to the set of hyperparam values with
# the smallest loss values ("good" dist)
# - fits a GMM - g(x) - to the remaining hyperparam values ("bad" dist)
# - chooses the hyperparam value x which maximizes l(x)/g(x)
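        #
        # A toy sketch of step 3 for a single numeric hyperparameter (illustrative
        # pseudo-code only, not part of this script - hyperopt does the equivalent
        # internally, and good_lr_values/bad_lr_values are hypothetical):
        #
        #   from scipy.stats import gaussian_kde
        #   l_kde = gaussian_kde(good_lr_values)  # KDE over the best gamma-fraction
        #   g_kde = gaussian_kde(bad_lr_values)   # KDE over the remaining values
        #   candidates = l_kde.resample(100)[0]   # draw candidate values from l(x)
        #   next_lr = candidates[np.argmax(l_kde(candidates) / g_kde(candidates))]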
        best = fmin(fn=train_and_test_model,
                    space=trials_pspace,
                    trials=None, # Have fmin create new or deserialize the saved trials
                    algo=tpe.suggest,
                    max_evals=MAX_HYPEROPT_EVALS,
                    loss_threshold=HYPEROPT_LOSS_TH,
                    catch_eval_exceptions=True,
                    rstate=np.random.default_rng(SEED),
                    early_stop_fn=early_stopping_to_report_progress,
                    trials_save_file=f"{trials_save_file}",
                    verbose=True,
                    show_progressbar=False) # Can't use the progressbar as it makes a mess of logging
# Report on the outcome (we saved the best to files as we went along)
best_params = space_eval(trials_pspace, best)
print("\nBest model hyperparameter set is:\n"
+ json.dumps(best_params, indent=4, sort_keys=False, default=str))