From 31fc06176f95fdbd223d365c94dcdda4db168aad Mon Sep 17 00:00:00 2001 From: Milan Curcic Date: Thu, 22 Jun 2023 11:18:15 -0400 Subject: [PATCH] SGD optimizer stub (#139) * Defining the SGD minimization step in the optimizer type * Add note about refactor needed * Pass optimizer instance down to layer % update() * Apply the optimizer update step in layer % update * Changes in tests and examples to account for the API change in network % update() * Make optimizer optional; default to SGD with learning rate of 1 * Apply optimizer to conv2d layer --- example/get_set_network_params.f90 | 3 +- example/quadratic.f90 | 7 ++-- example/simple.f90 | 2 +- example/sine.f90 | 2 +- src/nf/nf_conv2d_layer.f90 | 9 ----- src/nf/nf_conv2d_layer_submodule.f90 | 16 --------- src/nf/nf_dense_layer.f90 | 9 ----- src/nf/nf_dense_layer_submodule.f90 | 15 -------- src/nf/nf_layer.f90 | 9 +++-- src/nf/nf_layer_submodule.f90 | 54 ++++++++++++++++++++++++---- src/nf/nf_network.f90 | 15 ++++---- src/nf/nf_network_submodule.f90 | 33 +++++++++++++---- src/nf/nf_optimizers.f90 | 42 ++++++++++++++++++++-- test/test_dense_network.f90 | 3 +- 14 files changed, 138 insertions(+), 81 deletions(-) diff --git a/example/get_set_network_params.f90 b/example/get_set_network_params.f90 index 6f92401b..c4a5f980 100644 --- a/example/get_set_network_params.f90 +++ b/example/get_set_network_params.f90 @@ -1,5 +1,6 @@ program get_set_network_params use nf, only: dense, input, network + use nf_optimizers, only: sgd implicit none type(network) :: net1, net2 real :: x(1), y(1) @@ -37,7 +38,7 @@ program get_set_network_params call net1 % forward(x) call net1 % backward(y) - call net1 % update(1.) + call net1 % update(sgd(learning_rate=1.)) if (mod(n, 10000) == 0) then ypred1 = [(net1 % predict([xtest(i)]), i=1, test_size)] diff --git a/example/quadratic.f90 b/example/quadratic.f90 index 10d65d49..da06bf36 100644 --- a/example/quadratic.f90 +++ b/example/quadratic.f90 @@ -4,6 +4,7 @@ program quadratic_fit ! descent. use nf, only: dense, input, network use nf_dense_layer, only: dense_layer + use nf_optimizers, only: sgd implicit none type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop @@ -97,7 +98,7 @@ subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs) do i = 1, size(x) call net % forward([x(i)]) call net % backward([y(i)]) - call net % update(learning_rate) + call net % update(sgd(learning_rate=learning_rate)) end do end do @@ -120,7 +121,7 @@ subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs) call net % forward([x(i)]) call net % backward([y(i)]) end do - call net % update(learning_rate / size(x)) + call net % update(sgd(learning_rate=learning_rate / size(x))) end do end subroutine batch_gd_optimizer @@ -164,7 +165,7 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si call net % backward([y(i)]) end do - call net % update(learning_rate / batch_size) + call net % update(sgd(learning_rate=learning_rate / batch_size)) end do end do end subroutine minibatch_gd_optimizer diff --git a/example/simple.f90 b/example/simple.f90 index 0e7d2189..07e5646b 100644 --- a/example/simple.f90 +++ b/example/simple.f90 @@ -24,7 +24,7 @@ program simple call net % forward(x) call net % backward(y) - call net % update(1.) 
+ call net % update() if (mod(n, 50) == 0) & print '(i4,2(3x,f8.6))', n, net % predict(x) diff --git a/example/sine.f90 b/example/sine.f90 index f59b8dca..0ab57494 100644 --- a/example/sine.f90 +++ b/example/sine.f90 @@ -31,7 +31,7 @@ program sine call net % forward(x) call net % backward(y) - call net % update(1.) + call net % update() if (mod(n, 10000) == 0) then ypred = [(net % predict([xtest(i)]), i = 1, test_size)] diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 42734566..83d65977 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -36,7 +36,6 @@ module nf_conv2d_layer procedure :: get_num_params procedure :: get_params procedure :: set_params - procedure :: update end type conv2d_layer @@ -105,14 +104,6 @@ module subroutine set_params(self, params) !! Parameters to set end subroutine set_params - module subroutine update(self, learning_rate) - !! Update the weights and biases. - class(conv2d_layer), intent(in out) :: self - !! Dense layer instance - real, intent(in) :: learning_rate - !! Learning rate (must be > 0) - end subroutine update - end interface end module nf_conv2d_layer diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 66e60697..a24145f9 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -225,20 +225,4 @@ module subroutine set_params(self, params) end subroutine set_params - - module subroutine update(self, learning_rate) - class(conv2d_layer), intent(in out) :: self - real, intent(in) :: learning_rate - - ! Sum weight and bias gradients across images, if any - call co_sum(self % dw) - call co_sum(self % db) - - self % kernel = self % kernel - learning_rate * self % dw - self % biases = self % biases - learning_rate * self % db - self % dw = 0 - self % db = 0 - - end subroutine update - end submodule nf_conv2d_layer_submodule diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 2e20043d..ad0c6e2f 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -37,7 +37,6 @@ module nf_dense_layer procedure :: get_params procedure :: set_params procedure :: init - procedure :: update end type dense_layer @@ -115,14 +114,6 @@ module subroutine init(self, input_shape) !! Shape of the input layer end subroutine init - module subroutine update(self, learning_rate) - !! Update the weights and biases. - class(dense_layer), intent(in out) :: self - !! Dense layer instance - real, intent(in) :: learning_rate - !! Learning rate (must be > 0) - end subroutine update - end interface end module nf_dense_layer diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index 471b4095..fb610606 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -128,19 +128,4 @@ module subroutine init(self, input_shape) end subroutine init - module subroutine update(self, learning_rate) - class(dense_layer), intent(in out) :: self - real, intent(in) :: learning_rate - - ! Sum weight and bias gradients across images, if any - call co_sum(self % dw) - call co_sum(self % db) - - self % weights = self % weights - learning_rate * self % dw - self % biases = self % biases - learning_rate * self % db - self % dw = 0 - self % db = 0 - - end subroutine update - end submodule nf_dense_layer_submodule diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index b13f38f3..28fdaaea 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -4,6 +4,7 @@ module nf_layer !! 
user-facing API. use nf_base_layer, only: base_layer + use nf_optimizers, only: optimizer_base_type implicit none @@ -144,7 +145,7 @@ module subroutine set_params(self, params) !! Parameters of this layer end subroutine set_params - impure elemental module subroutine update(self, learning_rate) + impure elemental module subroutine update(self, optimizer, batch_size) !! Update the weights and biases on the layer using the stored !! gradients (from backward passes), and flush those same stored !! gradients to zero. @@ -152,8 +153,10 @@ impure elemental module subroutine update(self, learning_rate) !! Typically used only internally from the `network % update` method. class(layer), intent(in out) :: self !! Layer instance - real, intent(in) :: learning_rate - !! Learning rate to use; must be > 0. + class(optimizer_base_type), intent(in) :: optimizer + !! Optimizer instance to use + integer, intent(in), optional :: batch_size + !! Batch size (default 1) end subroutine update end interface diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 4d88bd77..2216d241 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -8,6 +8,7 @@ use nf_input3d_layer, only: input3d_layer use nf_maxpool2d_layer, only: maxpool2d_layer use nf_reshape_layer, only: reshape3d_layer + use nf_optimizers, only: optimizer_base_type contains @@ -382,15 +383,54 @@ module subroutine set_params(self, params) end subroutine set_params - impure elemental module subroutine update(self, learning_rate) + impure elemental module subroutine update(self, optimizer, batch_size) class(layer), intent(in out) :: self - real, intent(in) :: learning_rate + class(optimizer_base_type), intent(in) :: optimizer + integer, intent(in), optional :: batch_size + integer :: batch_size_ + + batch_size_ = 1 + if (present(batch_size)) batch_size_ = batch_size + + select type (this_layer => self % p) + type is (dense_layer) + + ! Sum weight and bias gradients across images, if any + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + + call optimizer % minimize( & + this_layer % weights, & + this_layer % dw / batch_size_ & + ) + call optimizer % minimize( & + this_layer % biases, & + this_layer % db / batch_size_ & + ) + + ! Reset gradients. + this_layer % dw = 0 + this_layer % db = 0 + + type is (conv2d_layer) + + ! Sum weight and bias gradients across images, if any + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + + call optimizer % minimize( & + this_layer % kernel, & + this_layer % dw / batch_size_ & + ) + call optimizer % minimize( & + this_layer % biases, & + this_layer % db / batch_size_ & + ) + + ! Reset gradients. + this_layer % dw = 0 + this_layer % db = 0 - select type(this_layer => self % p) - type is(dense_layer) - call this_layer % update(learning_rate) - type is(conv2d_layer) - call this_layer % update(learning_rate) end select end subroutine update diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index c8fb764c..38941ab3 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -193,12 +193,11 @@ module subroutine train(self, input_data, output_data, batch_size, & !! Set to `size(input_data, dim=2)` for a batch gradient descent. integer, intent(in) :: epochs !! Number of epochs to run - class(optimizer_base_type), intent(in) :: optimizer - !! Optimizer instance; currently this is an `sgd` optimizer type - !! and it will be made to be a more general optimizer type. + class(optimizer_base_type), intent(in), optional :: optimizer + !! 
Optimizer instance to use. If not provided, the default is sgd(). end subroutine train - module subroutine update(self, learning_rate) + module subroutine update(self, optimizer, batch_size) !! Update the weights and biases on all layers using the stored !! gradients (from backward passes) on those layers, and flush those !! same stored gradients to zero. @@ -207,8 +206,12 @@ module subroutine update(self, learning_rate) !! but can be invoked by the user when creating custom optimizers. class(network), intent(in out) :: self !! Network instance - real, intent(in) :: learning_rate - !! Learning rate to use; must be > 0. + class(optimizer_base_type), intent(in), optional :: optimizer + !! Optimizer instance to use + integer, intent(in), optional :: batch_size + !! Batch size to use. + !! Set to 1 for a pure stochastic gradient descent (default). + !! Set to `size(input_data, dim=2)` for a batch gradient descent. end subroutine update end interface diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index e71a8e80..6a0156d3 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -520,7 +520,8 @@ module subroutine train(self, input_data, output_data, batch_size, & real, intent(in) :: output_data(:,:) integer, intent(in) :: batch_size integer, intent(in) :: epochs - class(optimizer_base_type), intent(in) :: optimizer + class(optimizer_base_type), intent(in), optional :: optimizer + class(optimizer_base_type), allocatable :: optimizer_ real :: pos integer :: dataset_size @@ -528,6 +529,14 @@ module subroutine train(self, input_data, output_data, batch_size, & integer :: i, j, n integer :: istart, iend, indices(2) + ! Passing the optimizer instance is optional. + ! If not provided, we default to SGD with its default settings. + if (present(optimizer)) then + optimizer_ = optimizer + else + optimizer_ = sgd() + end if + dataset_size = size(output_data, dim=2) epoch_loop: do n = 1, epochs @@ -552,9 +561,9 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - select type (optimizer) + select type (optimizer_) type is (sgd) - call self % update(optimizer % learning_rate / batch_size) + call self % update(optimizer_, batch_size) class default error stop 'Unsupported optimizer' end select @@ -565,10 +574,22 @@ module subroutine train(self, input_data, output_data, batch_size, & end subroutine train - module subroutine update(self, learning_rate) + module subroutine update(self, optimizer, batch_size) class(network), intent(in out) :: self - real, intent(in) :: learning_rate - call self % layers % update(learning_rate) + class(optimizer_base_type), intent(in), optional :: optimizer + integer, intent(in), optional :: batch_size + class(optimizer_base_type), allocatable :: optimizer_ + + ! Passing the optimizer instance is optional. + ! If not provided, we default to SGD with its default settings. + if (present(optimizer)) then + optimizer_ = optimizer + else + optimizer_ = sgd() + end if + + call self % layers % update(optimizer_, batch_size) + end subroutine update end submodule nf_network_submodule diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 7d00a3cf..725e5615 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -1,6 +1,14 @@ module nf_optimizers - !! This module provides optimizer types to pass to the network constructor. + !! This module provides optimizer types to pass to the network train or update + !! methods. 
The implementation is based on an abstract optimizer base type + !! which has a required minimize method. The minimize method is an elemental + !! subroutine to allow operating in-place on arrays of network parameters + !! (weights/kernels and biases) of arbitrary ranks. An implementation of a new + !! optimizer thus requires writing a concrete optimizer type that extends the + !! abstract optimizer base type, and that implements a concrete minimize + !! method that accepts a scalar or array of network parameters to update and + !! the corresponding loss gradients. implicit none @@ -8,14 +16,42 @@ module nf_optimizers public :: optimizer_base_type, sgd type, abstract :: optimizer_base_type - character(:), allocatable :: name + real :: learning_rate = 1 + contains + procedure(minimize), deferred :: minimize end type optimizer_base_type + abstract interface + elemental subroutine minimize(self, param, gradient) + import :: optimizer_base_type + class(optimizer_base_type), intent(in) :: self + real, intent(inout) :: param + real, intent(in) :: gradient + end subroutine minimize + end interface + type, extends(optimizer_base_type) :: sgd !! Stochastic Gradient Descent optimizer - real :: learning_rate real :: momentum = 0 !TODO logical :: nesterov = .false. !TODO + contains + procedure :: minimize => minimize_sgd end type sgd +contains + + elemental subroutine minimize_sgd(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule. + class(sgd), intent(in) :: self + !! Optimizer instance + real, intent(inout) :: param + !! Network parameter (i.e. weight or bias) to update + real, intent(in) :: gradient + !! Loss gradient with respect to the parameter (dL/dw or dL/db) + ! TODO Implement momentum and Nesterov options + ! TODO (see https://keras.io/api/optimizers/sgd/) + param = param - self % learning_rate * gradient + end subroutine minimize_sgd + end module nf_optimizers diff --git a/test/test_dense_network.f90 b/test/test_dense_network.f90 index 260a2ed8..fcfae094 100644 --- a/test/test_dense_network.f90 +++ b/test/test_dense_network.f90 @@ -1,6 +1,7 @@ program test_dense_network use iso_fortran_env, only: stderr => error_unit use nf, only: dense, input, network + use nf_optimizers, only: sgd implicit none type(network) :: net logical :: ok = .true. @@ -34,7 +35,7 @@ program test_dense_network do n = 1, num_iterations call net % forward(x) call net % backward(y) - call net % update(1.) + call net % update(sgd(learning_rate=1.)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do
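The module documentation added to nf_optimizers.f90 above explains that a new optimizer is written by extending the abstract optimizer_base_type and providing a concrete, elemental minimize method. As a concrete illustration of that contract, here is a minimal sketch of such an extension. It is not part of the patch: the module name my_optimizers, the scaled_sgd type, and its scale component are invented for this example.

module my_optimizers

  !! Hypothetical user-defined optimizer, sketched against the abstract
  !! interface introduced in nf_optimizers.f90 above. The type name and
  !! the scale component are illustrative only.

  use nf_optimizers, only: optimizer_base_type

  implicit none

  private
  public :: scaled_sgd

  type, extends(optimizer_base_type) :: scaled_sgd
    !! Plain gradient descent with an extra user-supplied damping factor.
    real :: scale = 0.5
  contains
    procedure :: minimize => minimize_scaled_sgd
  end type scaled_sgd

contains

  elemental subroutine minimize_scaled_sgd(self, param, gradient)
    !! Update a single parameter in place; because the binding is
    !! elemental, layer % update can apply it to weight and bias arrays
    !! of any rank.
    class(scaled_sgd), intent(in) :: self
    real, intent(inout) :: param
    real, intent(in) :: gradient
    param = param - self % scale * self % learning_rate * gradient
  end subroutine minimize_scaled_sgd

end module my_optimizers

Because network % update forwards whatever optimizer_base_type extension it receives down to the layers, a call like net % update(scaled_sgd(learning_rate=0.1)) would apply this rule. Note that network % train, as changed in this patch, still accepts only sgd through its select type guard.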
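A second sketch, also hypothetical, shows how a user-side training loop reads with the updated network % update signature, in particular the optional batch_size argument that the bundled examples do not exercise directly (example/quadratic.f90 still scales the learning rate by the batch size instead). The layer sizes and toy data are made up, and the network([...]) constructor is assumed to be the one used by the existing examples in this repository.

program batch_update_sketch

  !! Illustrative only: exercises the new optional batch_size argument
  !! of network % update. Shapes and data below are invented.

  use nf, only: dense, input, network
  use nf_optimizers, only: sgd

  implicit none

  type(network) :: net
  real :: x(100), y(100)
  integer, parameter :: batch_size = 10
  integer :: i, j, n

  net = network([input(1), dense(3), dense(1)])

  call random_number(x)
  y = x**2 / 2

  do n = 1, 1000
    do i = 1, size(x), batch_size

      ! Accumulate gradients over one mini-batch...
      do j = i, i + batch_size - 1
        call net % forward([x(j)])
        call net % backward([y(j)])
      end do

      ! ...then let update() average them: layer % update divides the
      ! summed gradients by batch_size before applying the optimizer step.
      call net % update(sgd(learning_rate=0.1), batch_size=batch_size)

    end do
  end do

end program batch_update_sketch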