From e9bfbd60123cd0a19aaafe1f0ad7f47efb2808a7 Mon Sep 17 00:00:00 2001 From: Saurabh Suresh Powar <66636289+Spnetic-5@users.noreply.github.com> Date: Fri, 14 Jul 2023 10:18:00 -0400 Subject: [PATCH] Added Momentum and Nesterov modifications (#148) * Added Momentum and Nesterov modifications * Resolved dummy argument error * Changes in update formulas * Corrected formulae, velocity allocation changes * Added concrete implementation of RMSProp * Report RMSE every 10% of num_epochs; Fix xtest calculation * Initialize networks with same weights; larger batch; larger test array * Start putting RMS and velocity structures in place; yet to be allocated and initialized * WIP: SGD and RMSprop optimizers plumbing at the network % update level * Added get_gradients() method (draft) * Clean up formatting and docstrings * Flush gradients to zero; code compiles but segfaults * Set default value for batch_size; tests pass in debug mode but segfault in optimized mode * Update learning rates in simple and sine examples because the default changed * Added draft test suite for optimizers * Store the optimizer as a member of the network type * Don't print to stdout; indentation * Added convergence tests * Resolved comments * Clean up * Import RMSProp * Remove old code * Add optimizer support notes --------- Co-authored-by: milancurcic --- README.md | 10 ++- example/quadratic.f90 | 126 +++++++++++++++++++-------- example/simple.f90 | 4 +- example/sine.f90 | 4 +- src/nf.f90 | 2 +- src/nf/nf_conv2d_layer.f90 | 15 +++- src/nf/nf_conv2d_layer_submodule.f90 | 12 +++ src/nf/nf_dense_layer.f90 | 14 ++- src/nf/nf_dense_layer_submodule.f90 | 12 +++ src/nf/nf_layer.f90 | 24 ++--- src/nf/nf_layer_submodule.f90 | 78 +++++------------ src/nf/nf_network.f90 | 11 ++- src/nf/nf_network_submodule.f90 | 88 ++++++++++++++++--- src/nf/nf_optimizers.f90 | 105 ++++++++++++++++++---- test/CMakeLists.txt | 1 + test/test_optimizers.f90 | 115 ++++++++++++++++++++++++ 16 files changed, 470 insertions(+), 151 deletions(-) create mode 100644 test/test_optimizers.f90 diff --git a/README.md b/README.md index 920a3e0..6be5811 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,11 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). * Training and inference of dense (fully connected) and convolutional neural networks +* Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum, + and RMSProp +* More than a dozen activation functions and their derivatives * Loading dense and convolutional models from Keras HDF5 (.h5) files -* Stochastic and mini-batch gradient descent for back-propagation * Data-based parallelism -* Several activation functions and their derivatives ### Available layers @@ -28,11 +29,14 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). 
|------------|------------------|------------------------|----------------------|--------------|---------------| | Input | `input` | n/a | 1, 3 | n/a | n/a | | Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ | -| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | +| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ❌ | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | | Flatten | `flatten` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ | +**Note:** The training of convolutional layers has been discovered to be broken +as of release 0.13.0. This will be fixed in a future (hopefully next) release. + ## Getting started Get the code: diff --git a/example/quadratic.f90 b/example/quadratic.f90 index da06bf3..168a6ae 100644 --- a/example/quadratic.f90 +++ b/example/quadratic.f90 @@ -7,20 +7,19 @@ program quadratic_fit use nf_optimizers, only: sgd implicit none - type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop + type(network) :: net(6) ! Training parameters integer, parameter :: num_epochs = 1000 integer, parameter :: train_size = 1000 - integer, parameter :: test_size = 30 - integer, parameter :: batch_size = 10 + integer, parameter :: test_size = 100 + integer, parameter :: batch_size = 100 real, parameter :: learning_rate = 0.01 real, parameter :: decay_rate = 0.9 ! Input and output data real, allocatable :: x(:), y(:) ! training data real, allocatable :: xtest(:), ytest(:) ! testing data - real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:) integer :: i, n @@ -28,7 +27,7 @@ program quadratic_fit print '(60("="))' allocate(xtest(test_size), ytest(test_size)) - xtest = [((i - 1) * 2 / test_size, i = 1, test_size)] + xtest = [(real(i - 1) * 2 / test_size, i = 1, test_size)] ytest = quadratic(xtest) ! x and y as 1-D arrays @@ -41,38 +40,30 @@ program quadratic_fit end do y = quadratic(x) - ! Instantiate a separate network for each optimization method. - net_sgd = network([input(1), dense(3), dense(1)]) - net_batch_sgd = network([input(1), dense(3), dense(1)]) - net_minibatch_sgd = network([input(1), dense(3), dense(1)]) - net_rms_prop = network([input(1), dense(3), dense(1)]) + ! Instantiate a network and copy an instance to the rest of the array + net(1) = network([input(1), dense(3), dense(1)]) + net(2:) = net(1) ! Print network info to stdout; this will be the same for all three networks. - call net_sgd % print_info() + call net(1) % print_info() - ! SGD optimizer - call sgd_optimizer(net_sgd, x, y, learning_rate, num_epochs) + ! SGD, no momentum + call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs) + + ! SGD, momentum + call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9) + + ! SGD, momentum with Nesterov + call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.) ! Batch SGD optimizer - call batch_gd_optimizer(net_batch_sgd, x, y, learning_rate, num_epochs) + call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs) ! Mini-batch SGD optimizer - call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size) + call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size) ! 
RMSProp optimizer - call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate) - - ! Calculate predictions on the test set - ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)] - ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)] - ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)] - ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)] - - ! Print the mean squared error - print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest) - print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest) - print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest) - print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest) + call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate) contains @@ -82,39 +73,70 @@ real elemental function quadratic(x) result(y) y = (x**2 / 2 + x / 2 + 1) / 2 end function quadratic - subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs) + subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov) ! In the stochastic gradient descent (SGD) optimizer, we run the forward ! and backward passes and update the weights for each training sample, ! one at a time. type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate integer, intent(in) :: num_epochs + real, intent(in), optional :: momentum + logical, intent(in), optional :: nesterov + real, allocatable :: ypred(:) + real :: momentum_value + logical :: nesterov_value integer :: i, n - print *, "Running SGD optimizer..." + print '(a)', 'Stochastic gradient descent' + print '(34("-"))' + + ! Set default values for momentum and nesterov + if (.not. present(momentum)) then + momentum_value = 0.0 + else + momentum_value = momentum + end if + + if (.not. present(nesterov)) then + nesterov_value = .false. + else + nesterov_value = nesterov + end if do n = 1, num_epochs do i = 1, size(x) call net % forward([x(i)]) call net % backward([y(i)]) - call net % update(sgd(learning_rate=learning_rate)) + call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value)) end do + + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + print *, '' + end subroutine sgd_optimizer - subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs) + subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs) ! Like the stochastic gradient descent (SGD) optimizer, except that here we ! accumulate the weight gradients for all training samples and update the ! weights once per epoch. type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate integer, intent(in) :: num_epochs + real, allocatable :: ypred(:) integer :: i, n - print *, "Running batch GD optimizer..." 
+ print '(a)', 'Batch gradient descent' + print '(34("-"))' do n = 1, num_epochs do i = 1, size(x) @@ -122,11 +144,19 @@ subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs) call net % backward([y(i)]) end do call net % update(sgd(learning_rate=learning_rate / size(x))) + + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + print *, '' + end subroutine batch_gd_optimizer - subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_size) + subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size) ! Like the batch SGD optimizer, except that here we accumulate the weight ! over a number of mini batches and update the weights once per mini batch. ! @@ -134,13 +164,16 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si ! this subroutine to converge to a solution. type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate integer, intent(in) :: num_epochs, batch_size integer :: i, j, n, num_samples, num_batches, start_index, end_index real, allocatable :: batch_x(:), batch_y(:) integer, allocatable :: batch_indices(:) + real, allocatable :: ypred(:) - print *, "Running mini-batch GD optimizer..." + print '(a)', 'Minibatch gradient descent' + print '(34("-"))' num_samples = size(x) num_batches = num_samples / batch_size @@ -167,17 +200,28 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si call net % update(sgd(learning_rate=learning_rate / batch_size)) end do + + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + + print *, '' + end subroutine minibatch_gd_optimizer - subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate) + subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate) ! RMSprop optimizer for updating weights using root mean square type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate, decay_rate integer, intent(in) :: num_epochs integer :: i, j, n real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero + real, allocatable :: ypred(:) ! Define a dedicated type to store the RMSprop gradients. ! This is needed because array sizes vary between layers and we need to @@ -191,7 +235,8 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate) type(rms_gradient_dense), allocatable :: rms(:) - print *, "Running RMSprop optimizer..." + print '(a)', 'RMSProp optimizer' + print '(34("-"))' ! Here we allocate the array or RMS gradient derived types. ! 
We need one for each dense layer, however we will allocate it to the @@ -237,8 +282,15 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate) end select end do + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + print *, '' + end subroutine rmsprop_optimizer subroutine shuffle(arr) diff --git a/example/simple.f90 b/example/simple.f90 index 07e5646..32a93dc 100644 --- a/example/simple.f90 +++ b/example/simple.f90 @@ -1,5 +1,5 @@ program simple - use nf, only: dense, input, network + use nf, only: dense, input, network, sgd implicit none type(network) :: net real, allocatable :: x(:), y(:) @@ -24,7 +24,7 @@ program simple call net % forward(x) call net % backward(y) - call net % update() + call net % update(optimizer=sgd(learning_rate=1.)) if (mod(n, 50) == 0) & print '(i4,2(3x,f8.6))', n, net % predict(x) diff --git a/example/sine.f90 b/example/sine.f90 index 0ab5749..b37a243 100644 --- a/example/sine.f90 +++ b/example/sine.f90 @@ -1,5 +1,5 @@ program sine - use nf, only: dense, input, network + use nf, only: dense, input, network, sgd implicit none type(network) :: net real :: x(1), y(1) @@ -31,7 +31,7 @@ program sine call net % forward(x) call net % backward(y) - call net % update() + call net % update(optimizer=sgd(learning_rate=1.)) if (mod(n, 10000) == 0) then ypred = [(net % predict([xtest(i)]), i = 1, test_size)] diff --git a/src/nf.f90 b/src/nf.f90 index e4522aa..82fcb80 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -5,7 +5,7 @@ module nf use nf_layer_constructors, only: & conv2d, dense, flatten, input, maxpool2d, reshape use nf_network, only: network - use nf_optimizers, only: sgd + use nf_optimizers, only: sgd, rmsprop use nf_activation, only: activation_function, elu, exponential, & gaussian, linear, relu, leaky_relu, & sigmoid, softmax, softplus, step, tanhf, & diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 83d6597..7b72980 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -30,11 +30,12 @@ module nf_conv2d_layer contains - procedure :: init procedure :: forward procedure :: backward + procedure :: get_gradients procedure :: get_num_params procedure :: get_params + procedure :: init procedure :: set_params end type conv2d_layer @@ -89,13 +90,23 @@ pure module function get_num_params(self) result(num_params) end function get_num_params pure module function get_params(self) result(params) - !! Get the parameters of the layer. + !! Return the parameters (weights and biases) of this layer. + !! The parameters are ordered as weights first, biases second. class(conv2d_layer), intent(in) :: self !! A `conv2d_layer` instance real, allocatable :: params(:) !! Parameters to get end function get_params + pure module function get_gradients(self) result(gradients) + !! Return the gradients of this layer. + !! The gradients are ordered as weights first, biases second. + class(conv2d_layer), intent(in) :: self + !! A `conv2d_layer` instance + real, allocatable :: gradients(:) + !! Gradients to get + end function get_gradients + module subroutine set_params(self, params) !! Set the parameters of the layer. 
class(conv2d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index a24145f..a2fadfc 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -202,6 +202,18 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(conv2d_layer), intent(in) :: self + real, allocatable :: gradients(:) + + gradients = [ & + pack(self % dw, .true.), & + pack(self % db, .true.) & + ] + + end function get_gradients + + module subroutine set_params(self, params) class(conv2d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index ad0c6e2..ae523cc 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -33,10 +33,11 @@ module nf_dense_layer procedure :: backward procedure :: forward + procedure :: get_gradients procedure :: get_num_params procedure :: get_params - procedure :: set_params procedure :: init + procedure :: set_params end type dense_layer @@ -87,7 +88,7 @@ pure module function get_num_params(self) result(num_params) end function get_num_params pure module function get_params(self) result(params) - !! Return the parameters of this layer. + !! Return the parameters (weights and biases) of this layer. !! The parameters are ordered as weights first, biases second. class(dense_layer), intent(in) :: self !! Dense layer instance @@ -95,6 +96,15 @@ pure module function get_params(self) result(params) !! Parameters of this layer end function get_params + pure module function get_gradients(self) result(gradients) + !! Return the gradients of this layer. + !! The gradients are ordered as weights first, biases second. + class(dense_layer), intent(in) :: self + !! Dense layer instance + real, allocatable :: gradients(:) + !! Gradients of this layer + end function get_gradients + module subroutine set_params(self, params) !! Set the parameters of this layer. !! The parameters are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index fb61060..1caec7c 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -69,6 +69,18 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(dense_layer), intent(in) :: self + real, allocatable :: gradients(:) + + gradients = [ & + pack(self % dw, .true.), & + pack(self % db, .true.) & + ] + + end function get_gradients + + module subroutine set_params(self, params) class(dense_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 28fdaae..e9e90da 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -28,10 +28,10 @@ module nf_layer procedure :: forward procedure :: get_num_params procedure :: get_params + procedure :: get_gradients procedure :: set_params procedure :: init procedure :: print_info - procedure :: update ! Specific subroutines for different array ranks procedure, private :: backward_1d @@ -137,6 +137,14 @@ pure module function get_params(self) result(params) !! Parameters of this layer end function get_params + pure module function get_gradients(self) result(gradients) + !! Returns the gradients of this layer. + class(layer), intent(in) :: self + !! 
Layer instance + real, allocatable :: gradients(:) + !! Gradients of this layer + end function get_gradients + module subroutine set_params(self, params) !! Returns the parameters of this layer. class(layer), intent(in out) :: self @@ -145,20 +153,6 @@ module subroutine set_params(self, params) !! Parameters of this layer end subroutine set_params - impure elemental module subroutine update(self, optimizer, batch_size) - !! Update the weights and biases on the layer using the stored - !! gradients (from backward passes), and flush those same stored - !! gradients to zero. - !! This changes the state of the layer. - !! Typically used only internally from the `network % update` method. - class(layer), intent(in out) :: self - !! Layer instance - class(optimizer_base_type), intent(in) :: optimizer - !! Optimizer instance to use - integer, intent(in), optional :: batch_size - !! Batch size (default 1) - end subroutine update - end interface end module nf_layer diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 2216d24..0746764 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -298,7 +298,6 @@ elemental module function get_num_params(self) result(num_params) end function get_num_params - pure module function get_params(self) result(params) class(layer), intent(in) :: self real, allocatable :: params(:) @@ -324,6 +323,30 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(layer), intent(in) :: self + real, allocatable :: gradients(:) + + select type (this_layer => self % p) + type is (input1d_layer) + ! No gradients to get. + type is (input3d_layer) + ! No gradients to get. + type is (dense_layer) + gradients = this_layer % get_gradients() + type is (conv2d_layer) + gradients = this_layer % get_gradients() + type is (maxpool2d_layer) + ! No gradients to get. + type is (flatten_layer) + ! No gradients to get. + type is (reshape3d_layer) + ! No gradients to get. + class default + error stop 'Unknown layer type.' + end select + + end function get_gradients module subroutine set_params(self, params) class(layer), intent(in out) :: self @@ -382,57 +405,4 @@ module subroutine set_params(self, params) end subroutine set_params - - impure elemental module subroutine update(self, optimizer, batch_size) - class(layer), intent(in out) :: self - class(optimizer_base_type), intent(in) :: optimizer - integer, intent(in), optional :: batch_size - integer :: batch_size_ - - batch_size_ = 1 - if (present(batch_size)) batch_size_ = batch_size - - select type (this_layer => self % p) - type is (dense_layer) - - ! Sum weight and bias gradients across images, if any - call co_sum(this_layer % dw) - call co_sum(this_layer % db) - - call optimizer % minimize( & - this_layer % weights, & - this_layer % dw / batch_size_ & - ) - call optimizer % minimize( & - this_layer % biases, & - this_layer % db / batch_size_ & - ) - - ! Reset gradients. - this_layer % dw = 0 - this_layer % db = 0 - - type is (conv2d_layer) - - ! Sum weight and bias gradients across images, if any - call co_sum(this_layer % dw) - call co_sum(this_layer % db) - - call optimizer % minimize( & - this_layer % kernel, & - this_layer % dw / batch_size_ & - ) - call optimizer % minimize( & - this_layer % biases, & - this_layer % db / batch_size_ & - ) - - ! Reset gradients. 
- this_layer % dw = 0 - this_layer % db = 0 - - end select - - end subroutine update - end submodule nf_layer_submodule diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 38941ab..f12926e 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -13,14 +13,16 @@ module nf_network type :: network type(layer), allocatable :: layers(:) + class(optimizer_base_type), allocatable :: optimizer contains procedure :: backward + procedure :: get_gradients procedure :: get_num_params procedure :: get_params - procedure :: set_params procedure :: print_info + procedure :: set_params procedure :: train procedure :: update @@ -161,6 +163,13 @@ pure module function get_params(self) result(params) !! Network parameters to get end function get_params + pure module function get_gradients(self) result(gradients) + class(network), intent(in) :: self + !! Network instance + real, allocatable :: gradients(:) + !! Network gradients to set + end function get_gradients + module subroutine set_params(self, params) !! Set the network parameters (weights and biases). class(network), intent(in out) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index cdd8cad..5bafb7c 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -498,6 +498,26 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(network), intent(in) :: self + real, allocatable :: gradients(:) + integer :: n, nstart, nend + + allocate(gradients(self % get_num_params())) + + nstart = 1 + do n = 1, size(self % layers) + + if (self % layers(n) % get_num_params() < 1) cycle + + nend = nstart + self % layers(n) % get_num_params() - 1 + gradients(nstart:nend) = self % layers(n) % get_gradients() + nstart = nend + 1 + end do + + end function get_gradients + + module subroutine set_params(self, params) class(network), intent(in out) :: self real, intent(in) :: params(:) @@ -538,11 +558,13 @@ module subroutine train(self, input_data, output_data, batch_size, & ! Passing the optimizer instance is optional. ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then - optimizer_ = optimizer + self % optimizer = optimizer else - optimizer_ = sgd() + self % optimizer = sgd() end if + call self % optimizer % init(self % get_num_params()) + dataset_size = size(output_data, dim=2) epoch_loop: do n = 1, epochs @@ -567,12 +589,7 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - select type (optimizer_) - type is (sgd) - call self % update(optimizer_, batch_size) - class default - error stop 'Unsupported optimizer' - end select + call self % update(batch_size=batch_size) end do batch_loop end do epoch_loop @@ -585,16 +602,59 @@ module subroutine update(self, optimizer, batch_size) class(optimizer_base_type), intent(in), optional :: optimizer integer, intent(in), optional :: batch_size class(optimizer_base_type), allocatable :: optimizer_ + integer :: batch_size_ + real, allocatable :: params(:) + integer :: n - ! Passing the optimizer instance is optional. - ! If not provided, we default to SGD with its default settings. - if (present(optimizer)) then - optimizer_ = optimizer + ! Passing the optimizer instance is optional. If not provided, and if the + ! optimizer has not already been set, we default to the default SGD. The + ! 
instantiation and initialization below of the optimizer is normally done + ! at the beginning of the network % train() method. However, if the user + ! wants to call network % update() directly, for example if they use their + ! own custom mini-batching routine, we initialize the optimizer here as + ! well. If it's initialized already, this step is a cheap no-op. + if (.not. allocated(self % optimizer)) then + if (present(optimizer)) then + self % optimizer = optimizer + else + self % optimizer = sgd() + end if + call self % optimizer % init(self % get_num_params()) + end if + + if (present(batch_size)) then + batch_size_ = batch_size else - optimizer_ = sgd() + batch_size_ = 1 end if - call self % layers % update(optimizer_, batch_size) + ! Sum weight and bias gradients across images, if any + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + type is(conv2d_layer) + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + end select + end do + + params = self % get_params() + call self % optimizer % minimize(params, self % get_gradients() / batch_size_) + call self % set_params(params) + + ! Flush network gradients to zero. + do concurrent(n = 2:size(self % layers)) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + this_layer % dw = 0 + this_layer % db = 0 + type is(conv2d_layer) + this_layer % dw = 0 + this_layer % db = 0 + end select + end do end subroutine update diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 725e561..3b7dc01 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -13,45 +13,114 @@ module nf_optimizers implicit none private - public :: optimizer_base_type, sgd + public :: optimizer_base_type, sgd, rmsprop type, abstract :: optimizer_base_type - real :: learning_rate = 1 + real :: learning_rate = 0.01 contains + procedure(init), deferred :: init procedure(minimize), deferred :: minimize end type optimizer_base_type abstract interface - elemental subroutine minimize(self, param, gradient) + + impure elemental subroutine init(self, num_params) + import :: optimizer_base_type + class(optimizer_base_type), intent(inout) :: self + integer, intent(in) :: num_params + end subroutine init + + pure subroutine minimize(self, param, gradient) import :: optimizer_base_type - class(optimizer_base_type), intent(in) :: self - real, intent(inout) :: param - real, intent(in) :: gradient + class(optimizer_base_type), intent(inout) :: self + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) end subroutine minimize + end interface type, extends(optimizer_base_type) :: sgd !! Stochastic Gradient Descent optimizer - real :: momentum = 0 !TODO - logical :: nesterov = .false. !TODO + real :: momentum = 0 + logical :: nesterov = .false. + real, allocatable :: velocity(:) contains + procedure :: init => init_sgd procedure :: minimize => minimize_sgd end type sgd + type, extends(optimizer_base_type) :: rmsprop + !! RMSProp optimizer + real :: decay_rate = 0.9 + real :: epsilon = 1e-8 + real, allocatable :: rms_gradient(:) + contains + procedure :: init => init_rmsprop + procedure :: minimize => minimize_rmsprop + end type rmsprop + contains - elemental subroutine minimize_sgd(self, param, gradient) + impure elemental subroutine init_sgd(self, num_params) + class(sgd), intent(inout) :: self + integer, intent(in) :: num_params + if (self % momentum > 0 .and. .not. 
allocated(self % velocity)) then + allocate(self % velocity(num_params)) + self % velocity = 0 + end if + end subroutine init_sgd + + + pure subroutine minimize_sgd(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. - class(sgd), intent(in) :: self - !! Optimizer instance - real, intent(inout) :: param - !! Network parameter (i.e. weight or bias) to update - real, intent(in) :: gradient - !! Loss gradient with respect to the parameter (dL/dw or dL/db) - ! TODO Implement momentum and Nesterov options - ! TODO (see https://keras.io/api/optimizers/sgd/) - param = param - self % learning_rate * gradient + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * gradient + if (self % nesterov) then + ! Apply Nesterov update + param = param + self % momentum * self % velocity & + - self % learning_rate * gradient + else + param = param + self % velocity + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + end subroutine minimize_sgd + + impure elemental subroutine init_rmsprop(self, num_params) + class(rmsprop), intent(inout) :: self + integer, intent(in) :: num_params + if (.not. allocated(self % rms_gradient)) then + allocate(self % rms_gradient(num_params)) + self % rms_gradient = 0 + end if + end subroutine init_rmsprop + + + pure subroutine minimize_rmsprop(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * gradient**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(self % rms_gradient + self % epsilon) * gradient + + end subroutine minimize_rmsprop + end module nf_optimizers diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 485dd87..26646ec 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,7 @@ foreach(execid dense_network_from_keras cnn_from_keras conv2d_network + optimizers ) add_executable(test_${execid} test_${execid}.f90) target_link_libraries(test_${execid} PRIVATE neural h5fortran::h5fortran jsonfortran::jsonfortran ${LIBS}) diff --git a/test/test_optimizers.f90 b/test/test_optimizers.f90 new file mode 100644 index 0000000..dc41f91 --- /dev/null +++ b/test/test_optimizers.f90 @@ -0,0 +1,115 @@ +program test_optimizers + + use nf, only: dense, input, network, rmsprop, sgd + use iso_fortran_env, only: stderr => error_unit + + implicit none + type(network) :: net(4) + real, allocatable :: x(:), y(:) + real, allocatable :: ypred(:) + integer, parameter :: num_iterations = 1000 + integer :: n, i + logical :: ok = .true. + logical :: converged = .false. + + ! 
Instantiate a network and copy an instance to the rest of the array + net(1) = network([input(3), dense(5), dense(2)]) + net(2:) = net(1) + + x = [0.2, 0.4, 0.6] + y = [0.123456, 0.246802] + + do n = 0, num_iterations + + call net(1) % forward(x) + call net(1) % backward(y) + call net(1) % update(optimizer=sgd(learning_rate=1.)) + + ypred = net(1) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') 'sgd should converge in simple training.. failed' + ok = .false. + end if + + converged = .false. + + do n = 0, num_iterations + + call net(2) % forward(x) + call net(2) % backward(y) + call net(2) % update(optimizer=sgd(learning_rate=1., momentum=0.9)) + + ypred = net(2) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') & + 'sgd(momentum) should converge in simple training.. failed' + ok = .false. + end if + + converged = .false. + + do n = 0, num_iterations + + call net(3) % forward(x) + call net(3) % backward(y) + call net(3) % update(optimizer=sgd(learning_rate=1., momentum=0.9, nesterov=.true.)) + + ypred = net(3) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') & + 'sgd(nesterov) should converge in simple training.. failed' + ok = .false. + end if + + ! Resetting convergence flag + converged = .false. + + do n = 0, num_iterations + + call net(4) % forward(x) + call net(4) % backward(y) + call net(4) % update(optimizer=rmsprop(learning_rate=0.01, decay_rate=0.9)) + + ypred = net(4) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') 'rmsprop should converge in simple training.. failed' + ok = .false. + end if + + if (ok) then + print '(a)', 'test_optimizers: All tests passed.' + else + write(stderr, '(a)') 'test_optimizers: One or more tests failed.' + stop 1 + end if + + contains + + pure logical function check_convergence(y, ypred) result(converged) + ! Check convergence of ypred to y based on RMSE < tolerance. + real, intent(in) :: y(:), ypred(:) + real, parameter :: tolerance = 1e-3 + converged = sqrt(sum((ypred - y)**2) / size(y)) < tolerance + end function check_convergence + +end program test_optimizers
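Review notes (editorial additions, not part of the patch itself):

For convenience, the update rules exactly as implemented in `minimize_sgd` and `minimize_rmsprop` above, with $g = \partial L / \partial w$, $\eta$ = `learning_rate`, $\mu$ = `momentum`, $\rho$ = `decay_rate`, $\epsilon$ = `epsilon`, $v$ = `velocity`, and $s$ = `rms_gradient` (the velocity and Nesterov branches apply only when $\mu > 0$; otherwise the update is plain $w \gets w - \eta g$):

$$
\begin{aligned}
\text{SGD with momentum:} \quad & v \gets \mu v - \eta g, & w &\gets w + v \\
\text{SGD with Nesterov momentum:} \quad & v \gets \mu v - \eta g, & w &\gets w + \mu v - \eta g \\
\text{RMSProp:} \quad & s \gets \rho s + (1 - \rho)\, g^2, & w &\gets w - \frac{\eta}{\sqrt{s + \epsilon}}\, g
\end{aligned}
$$

Below is a minimal usage sketch of the optimizer interface this patch introduces. The toy data, array shapes, and hyperparameters are illustrative assumptions; the `sgd` and `rmsprop` constructors and the `train` / `forward` / `backward` / `update` calls follow the diff above.

```fortran
program optimizer_usage_sketch
  ! Editorial sketch: exercises the optimizer API added in this patch.
  ! Toy data and hyperparameters are assumptions, not repository code.
  use nf, only: dense, input, network, sgd, rmsprop
  implicit none
  type(network) :: net1, net2
  real :: x(1,100), y(1,100)
  integer :: i

  call random_number(x)
  y = (x**2 / 2 + x / 2 + 1) / 2   ! same quadratic as example/quadratic.f90

  net1 = network([input(1), dense(3), dense(1)])
  net2 = net1

  ! High-level path: pass an optimizer instance to train();
  ! if omitted, train() falls back to sgd() with default settings.
  call net1 % train(x, y, batch_size=10, epochs=100, &
    optimizer=rmsprop(learning_rate=0.01, decay_rate=0.9))

  ! Low-level path: a custom loop with explicit update() calls, as in
  ! example/quadratic.f90 and test/test_optimizers.f90. The optimizer
  ! passed on the first update() call is stored on the network and reused.
  do i = 1, size(x, dim=2)
    call net2 % forward(x(:,i))
    call net2 % backward(y(:,i))
    call net2 % update(optimizer=sgd(learning_rate=0.01, momentum=0.9, nesterov=.true.))
  end do

end program optimizer_usage_sketch
```

Either path ends in `network % update`, which sums the layer gradients across images, applies the stored optimizer to the flattened parameter vector, and flushes the layer gradients to zero.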