# trainer.py
import json
import pathlib

import torch
import torch.optim as optim


class Trainer:
"""Class for training and evaluating a model.
Attributes
----------
model : torch.nn.Module
The model to be trained.
train_data : iterable
A training data generator for a task.
test_data : dict
A batch of test data for a task.
lr : float
The learning rate for the Adam optimizer.
weight_decay : float
The weight decay for the Adam optimizer.
compute_all_metrics : bool
Whether to compute all task metrics.
test_freq : int
The frequency (in epochs) of testing the model.
save_freq : int
The frequency (in epochs) of saving the model.
path : str
The path to save model checkpoints and metrics to.
device : str
The device on which to perform computations.
optimizer : optim.Adam
An instance of the Adam optimizer for model parameters.
Methods
-------
train(n_epochs=2500, start_epoch=0)
Train the model for a specified number of epochs.
"""
    def __init__(
        self,
        model,
        train_data,
        test_data,
        lr=0.001,
        weight_decay=0,
        compute_all_metrics=True,
        test_freq=100,
        save_freq=100,
        path="results",
        device="cuda",
    ):
"""Constructor for the Trainer class.
Parameters
----------
model : torch.nn.Module
The model to be trained.
train_data : iterable
A training data generator for a task.
test_data : dict
A batch of test data for a task.
lr : float, optional (default: 0.001)
The learning rate for the Adam optimizer.
weight_decay : float, optional (default: 0)
The weight decay for the Adam optimizer.
compute_all_metrics : bool, optional (default: True)
If True, compute all task metrics during training and testing.
test_freq : int, optional (default: 100)
The frequency (in epochs) at which to evaluate the model on the test data.
save_freq : int, optional (default: 100)
The frequency (in epochs) at which to save the model.
path : str, optional (default: "results")
The directory to save model checkpoints and metrics to.
device : str, optional (default: 'cuda')
The device to run computations on (e.g., 'cpu', 'cuda').
Returns
-------
None
"""
        self.model = model
        self.train_data = train_data
        self.test_data = test_data
        self.lr = lr
        self.weight_decay = weight_decay
        self.compute_all_metrics = compute_all_metrics
        self.test_freq = test_freq
        self.save_freq = save_freq
        self.path = path
        self.device = device
        # Create the checkpoint/metrics directory up front so that the
        # torch.save and json.dump calls in train() cannot fail on a
        # missing path.
        pathlib.Path(self.path).mkdir(parents=True, exist_ok=True)
        self.model.set_device(self.device)
        self.optimizer = optim.Adam(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

    def train(self, n_epochs=2500, start_epoch=0):
        """Train `self.model` for a specified number of epochs.

        Parameters
        ----------
        n_epochs : int, optional (default: 2500)
            The number of epochs to train the model for.
        start_epoch : int, optional (default: 0)
            The starting epoch number (for checkpoint numbering).

        Returns
        -------
        None
        """
        epoch = start_epoch
        self.train_metrics = dict()
        self.test_metrics = dict()
        aux = None
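        # Each batch yielded by the training generator counts as one
        # "epoch" for logging, testing, and checkpointing purposes.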
        for batch in self.train_data:
            epoch += 1
            if epoch - start_epoch > n_epochs:
                break
            self.model.train()
            self.optimizer.zero_grad()
            data = batch["data"].to(self.device)
            init_state = batch["init_state"].to(self.device)
            targets = batch["targets"].to(self.device)
            _, outputs = self.model(data, init_state=init_state)
            if self.compute_all_metrics:
                aux = batch
            train_loss, train_metric = self.model.task.compute_metrics(
                outputs, targets, aux
            )
            train_loss.backward()
            self.optimizer.step()
            self.train_metrics[epoch] = train_metric.copy()
            print(f"Epoch {epoch} (train):")
            for k, v in train_metric.items():
                print(f" - {k} = {v}.")
            if epoch % self.save_freq == 0:
                model_path = pathlib.Path(self.path).joinpath(f"model_{epoch}.pt")
                torch.save(self.model, model_path)
                print(f"Model saved at epoch {epoch}.")
            if epoch % self.test_freq == 0:
                with torch.no_grad():
                    self.model.eval()
                    data = self.test_data["data"].to(self.device)
                    init_state = self.test_data["init_state"].to(self.device)
                    targets = self.test_data["targets"].to(self.device)
                    if self.compute_all_metrics:
                        aux = self.test_data
                    _, outputs = self.model(data, init_state=init_state)
                    _, test_metric = self.model.task.compute_metrics(
                        outputs, targets, aux
                    )
                    self.test_metrics[epoch] = test_metric.copy()
                    print(f"Epoch {epoch} (test):")
                    for k, v in test_metric.items():
                        print(f" - {k} = {v}.")
        # Persist the collected metrics once training finishes, closing the
        # files properly via context managers.
        train_metrics_path = pathlib.Path(self.path).joinpath("train_metrics.json")
        with open(train_metrics_path, "w") as f:
            json.dump(self.train_metrics, f, indent=4)
        test_metrics_path = pathlib.Path(self.path).joinpath("test_metrics.json")
        with open(test_metrics_path, "w") as f:
            json.dump(self.test_metrics, f, indent=4)
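

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module; ToyTask, ToyModel, and
# toy_batches are hypothetical stand-ins). It only illustrates the interface
# Trainer assumes: a model exposing set_device(device) and
# forward(data, init_state=...) returning a (states, outputs) pair, a
# model.task with compute_metrics(outputs, targets, aux) returning a
# (loss, metrics dict) pair, and batches given as dicts with "data",
# "init_state", and "targets" tensors.
if __name__ == "__main__":
    import torch.nn as nn

    class ToyTask:
        def compute_metrics(self, outputs, targets, aux=None):
            loss = nn.functional.mse_loss(outputs, targets)
            return loss, {"mse": loss.item()}

    class ToyModel(nn.Module):
        def __init__(self, dim=8):
            super().__init__()
            self.rnn = nn.RNN(dim, dim, batch_first=True)
            self.readout = nn.Linear(dim, dim)
            self.task = ToyTask()

        def set_device(self, device):
            self.to(device)

        def forward(self, data, init_state=None):
            states, _ = self.rnn(data, init_state)
            return states, self.readout(states)

    def toy_batches(n_batches, batch_size=4, seq_len=10, dim=8):
        # Yields autoencoding batches: the targets equal the inputs.
        for _ in range(n_batches):
            data = torch.randn(batch_size, seq_len, dim)
            yield {
                "data": data,
                "init_state": torch.zeros(1, batch_size, dim),
                "targets": data,
            }

    trainer = Trainer(
        ToyModel(),
        train_data=toy_batches(20),
        test_data=next(toy_batches(1)),
        test_freq=10,
        save_freq=10,
        device="cpu",
    )
    trainer.train(n_epochs=20)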