From 31fc06176f95fdbd223d365c94dcdda4db168aad Mon Sep 17 00:00:00 2001 From: Milan Curcic Date: Thu, 22 Jun 2023 11:18:15 -0400 Subject: [PATCH] SGD optimizer stub (#139) * Defining the SGD minimization step in the optimizer type * Add note about refactor needed * Pass optimizer instance down to layer % update() * Apply the optimizer update step in layer % update * Changes in tests and examples to account for the API change in network % update() * Make optimizer optional; default to SGD with learning rate of 1 * Apply optimizer to conv2d layer --- example/get_set_network_params.f90 | 3 +- example/quadratic.f90 | 7 ++-- example/simple.f90 | 2 +- example/sine.f90 | 2 +- src/nf/nf_conv2d_layer.f90 | 9 ----- src/nf/nf_conv2d_layer_submodule.f90 | 16 --------- src/nf/nf_dense_layer.f90 | 9 ----- src/nf/nf_dense_layer_submodule.f90 | 15 -------- src/nf/nf_layer.f90 | 9 +++-- src/nf/nf_layer_submodule.f90 | 54 ++++++++++++++++++++++++---- src/nf/nf_network.f90 | 15 ++++---- src/nf/nf_network_submodule.f90 | 33 +++++++++++++---- src/nf/nf_optimizers.f90 | 42 ++++++++++++++++++++-- test/test_dense_network.f90 | 3 +- 14 files changed, 138 insertions(+), 81 deletions(-) diff --git a/example/get_set_network_params.f90 b/example/get_set_network_params.f90 index 6f92401b..c4a5f980 100644 --- a/example/get_set_network_params.f90 +++ b/example/get_set_network_params.f90 @@ -1,5 +1,6 @@ program get_set_network_params use nf, only: dense, input, network + use nf_optimizers, only: sgd implicit none type(network) :: net1, net2 real :: x(1), y(1) @@ -37,7 +38,7 @@ program get_set_network_params call net1 % forward(x) call net1 % backward(y) - call net1 % update(1.) + call net1 % update(sgd(learning_rate=1.)) if (mod(n, 10000) == 0) then ypred1 = [(net1 % predict([xtest(i)]), i=1, test_size)] diff --git a/example/quadratic.f90 b/example/quadratic.f90 index 10d65d49..da06bf36 100644 --- a/example/quadratic.f90 +++ b/example/quadratic.f90 @@ -4,6 +4,7 @@ program quadratic_fit ! descent. use nf, only: dense, input, network use nf_dense_layer, only: dense_layer + use nf_optimizers, only: sgd implicit none type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop @@ -97,7 +98,7 @@ subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs) do i = 1, size(x) call net % forward([x(i)]) call net % backward([y(i)]) - call net % update(learning_rate) + call net % update(sgd(learning_rate=learning_rate)) end do end do @@ -120,7 +121,7 @@ subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs) call net % forward([x(i)]) call net % backward([y(i)]) end do - call net % update(learning_rate / size(x)) + call net % update(sgd(learning_rate=learning_rate / size(x))) end do end subroutine batch_gd_optimizer @@ -164,7 +165,7 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si call net % backward([y(i)]) end do - call net % update(learning_rate / batch_size) + call net % update(sgd(learning_rate=learning_rate / batch_size)) end do end do end subroutine minibatch_gd_optimizer diff --git a/example/simple.f90 b/example/simple.f90 index 0e7d2189..07e5646b 100644 --- a/example/simple.f90 +++ b/example/simple.f90 @@ -24,7 +24,7 @@ program simple call net % forward(x) call net % backward(y) - call net % update(1.) 
+ call net % update() if (mod(n, 50) == 0) & print '(i4,2(3x,f8.6))', n, net % predict(x) diff --git a/example/sine.f90 b/example/sine.f90 index f59b8dca..0ab57494 100644 --- a/example/sine.f90 +++ b/example/sine.f90 @@ -31,7 +31,7 @@ program sine call net % forward(x) call net % backward(y) - call net % update(1.) + call net % update() if (mod(n, 10000) == 0) then ypred = [(net % predict([xtest(i)]), i = 1, test_size)] diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 42734566..83d65977 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -36,7 +36,6 @@ module nf_conv2d_layer procedure :: get_num_params procedure :: get_params procedure :: set_params - procedure :: update end type conv2d_layer @@ -105,14 +104,6 @@ module subroutine set_params(self, params) !! Parameters to set end subroutine set_params - module subroutine update(self, learning_rate) - !! Update the weights and biases. - class(conv2d_layer), intent(in out) :: self - !! Dense layer instance - real, intent(in) :: learning_rate - !! Learning rate (must be > 0) - end subroutine update - end interface end module nf_conv2d_layer diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 66e60697..a24145f9 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -225,20 +225,4 @@ module subroutine set_params(self, params) end subroutine set_params - - module subroutine update(self, learning_rate) - class(conv2d_layer), intent(in out) :: self - real, intent(in) :: learning_rate - - ! Sum weight and bias gradients across images, if any - call co_sum(self % dw) - call co_sum(self % db) - - self % kernel = self % kernel - learning_rate * self % dw - self % biases = self % biases - learning_rate * self % db - self % dw = 0 - self % db = 0 - - end subroutine update - end submodule nf_conv2d_layer_submodule diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 2e20043d..ad0c6e2f 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -37,7 +37,6 @@ module nf_dense_layer procedure :: get_params procedure :: set_params procedure :: init - procedure :: update end type dense_layer @@ -115,14 +114,6 @@ module subroutine init(self, input_shape) !! Shape of the input layer end subroutine init - module subroutine update(self, learning_rate) - !! Update the weights and biases. - class(dense_layer), intent(in out) :: self - !! Dense layer instance - real, intent(in) :: learning_rate - !! Learning rate (must be > 0) - end subroutine update - end interface end module nf_dense_layer diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index 471b4095..fb610606 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -128,19 +128,4 @@ module subroutine init(self, input_shape) end subroutine init - module subroutine update(self, learning_rate) - class(dense_layer), intent(in out) :: self - real, intent(in) :: learning_rate - - ! Sum weight and bias gradients across images, if any - call co_sum(self % dw) - call co_sum(self % db) - - self % weights = self % weights - learning_rate * self % dw - self % biases = self % biases - learning_rate * self % db - self % dw = 0 - self % db = 0 - - end subroutine update - end submodule nf_dense_layer_submodule diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index b13f38f3..28fdaaea 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -4,6 +4,7 @@ module nf_layer !! 
user-facing API. use nf_base_layer, only: base_layer + use nf_optimizers, only: optimizer_base_type implicit none @@ -144,7 +145,7 @@ module subroutine set_params(self, params) !! Parameters of this layer end subroutine set_params - impure elemental module subroutine update(self, learning_rate) + impure elemental module subroutine update(self, optimizer, batch_size) !! Update the weights and biases on the layer using the stored !! gradients (from backward passes), and flush those same stored !! gradients to zero. @@ -152,8 +153,10 @@ impure elemental module subroutine update(self, learning_rate) !! Typically used only internally from the `network % update` method. class(layer), intent(in out) :: self !! Layer instance - real, intent(in) :: learning_rate - !! Learning rate to use; must be > 0. + class(optimizer_base_type), intent(in) :: optimizer + !! Optimizer instance to use + integer, intent(in), optional :: batch_size + !! Batch size (default 1) end subroutine update end interface diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 4d88bd77..2216d241 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -8,6 +8,7 @@ use nf_input3d_layer, only: input3d_layer use nf_maxpool2d_layer, only: maxpool2d_layer use nf_reshape_layer, only: reshape3d_layer + use nf_optimizers, only: optimizer_base_type contains @@ -382,15 +383,54 @@ module subroutine set_params(self, params) end subroutine set_params - impure elemental module subroutine update(self, learning_rate) + impure elemental module subroutine update(self, optimizer, batch_size) class(layer), intent(in out) :: self - real, intent(in) :: learning_rate + class(optimizer_base_type), intent(in) :: optimizer + integer, intent(in), optional :: batch_size + integer :: batch_size_ + + batch_size_ = 1 + if (present(batch_size)) batch_size_ = batch_size + + select type (this_layer => self % p) + type is (dense_layer) + + ! Sum weight and bias gradients across images, if any + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + + call optimizer % minimize( & + this_layer % weights, & + this_layer % dw / batch_size_ & + ) + call optimizer % minimize( & + this_layer % biases, & + this_layer % db / batch_size_ & + ) + + ! Reset gradients. + this_layer % dw = 0 + this_layer % db = 0 + + type is (conv2d_layer) + + ! Sum weight and bias gradients across images, if any + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + + call optimizer % minimize( & + this_layer % kernel, & + this_layer % dw / batch_size_ & + ) + call optimizer % minimize( & + this_layer % biases, & + this_layer % db / batch_size_ & + ) + + ! Reset gradients. + this_layer % dw = 0 + this_layer % db = 0 - select type(this_layer => self % p) - type is(dense_layer) - call this_layer % update(learning_rate) - type is(conv2d_layer) - call this_layer % update(learning_rate) end select end subroutine update diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index c8fb764c..38941ab3 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -193,12 +193,11 @@ module subroutine train(self, input_data, output_data, batch_size, & !! Set to `size(input_data, dim=2)` for a batch gradient descent. integer, intent(in) :: epochs !! Number of epochs to run - class(optimizer_base_type), intent(in) :: optimizer - !! Optimizer instance; currently this is an `sgd` optimizer type - !! and it will be made to be a more general optimizer type. + class(optimizer_base_type), intent(in), optional :: optimizer + !! 
Optimizer instance to use. If not provided, the default is sgd(). end subroutine train - module subroutine update(self, learning_rate) + module subroutine update(self, optimizer, batch_size) !! Update the weights and biases on all layers using the stored !! gradients (from backward passes) on those layers, and flush those !! same stored gradients to zero. @@ -207,8 +206,12 @@ module subroutine update(self, learning_rate) !! but can be invoked by the user when creating custom optimizers. class(network), intent(in out) :: self !! Network instance - real, intent(in) :: learning_rate - !! Learning rate to use; must be > 0. + class(optimizer_base_type), intent(in), optional :: optimizer + !! Optimizer instance to use + integer, intent(in), optional :: batch_size + !! Batch size to use. + !! Set to 1 for a pure stochastic gradient descent (default). + !! Set to `size(input_data, dim=2)` for a batch gradient descent. end subroutine update end interface diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index e71a8e80..6a0156d3 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -520,7 +520,8 @@ module subroutine train(self, input_data, output_data, batch_size, & real, intent(in) :: output_data(:,:) integer, intent(in) :: batch_size integer, intent(in) :: epochs - class(optimizer_base_type), intent(in) :: optimizer + class(optimizer_base_type), intent(in), optional :: optimizer + class(optimizer_base_type), allocatable :: optimizer_ real :: pos integer :: dataset_size @@ -528,6 +529,14 @@ module subroutine train(self, input_data, output_data, batch_size, & integer :: i, j, n integer :: istart, iend, indices(2) + ! Passing the optimizer instance is optional. + ! If not provided, we default to SGD with its default settings. + if (present(optimizer)) then + optimizer_ = optimizer + else + optimizer_ = sgd() + end if + dataset_size = size(output_data, dim=2) epoch_loop: do n = 1, epochs @@ -552,9 +561,9 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - select type (optimizer) + select type (optimizer_) type is (sgd) - call self % update(optimizer % learning_rate / batch_size) + call self % update(optimizer_, batch_size) class default error stop 'Unsupported optimizer' end select @@ -565,10 +574,22 @@ module subroutine train(self, input_data, output_data, batch_size, & end subroutine train - module subroutine update(self, learning_rate) + module subroutine update(self, optimizer, batch_size) class(network), intent(in out) :: self - real, intent(in) :: learning_rate - call self % layers % update(learning_rate) + class(optimizer_base_type), intent(in), optional :: optimizer + integer, intent(in), optional :: batch_size + class(optimizer_base_type), allocatable :: optimizer_ + + ! Passing the optimizer instance is optional. + ! If not provided, we default to SGD with its default settings. + if (present(optimizer)) then + optimizer_ = optimizer + else + optimizer_ = sgd() + end if + + call self % layers % update(optimizer_, batch_size) + end subroutine update end submodule nf_network_submodule diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 7d00a3cf..725e5615 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -1,6 +1,14 @@ module nf_optimizers - !! This module provides optimizer types to pass to the network constructor. + !! This module provides optimizer types to pass to the network train or update + !! methods. 
The implementation is based on an abstract optimizer base type + !! which has a required minimize method. The minimize method is an elemental + !! subroutine to allow operating in-place on arrays of network parameters + !! (weights/kernels and biases) of arbitrary ranks. An implementation of a new + !! optimizer thus requires writing a concrete optimizer type that extends the + !! abstract optimizer base type, and that implements a concrete minimize + !! method that accepts a scalar or array of network parameters to update and + !! the corresponding loss gradients. implicit none @@ -8,14 +16,42 @@ module nf_optimizers public :: optimizer_base_type, sgd type, abstract :: optimizer_base_type - character(:), allocatable :: name + real :: learning_rate = 1 + contains + procedure(minimize), deferred :: minimize end type optimizer_base_type + abstract interface + elemental subroutine minimize(self, param, gradient) + import :: optimizer_base_type + class(optimizer_base_type), intent(in) :: self + real, intent(inout) :: param + real, intent(in) :: gradient + end subroutine minimize + end interface + type, extends(optimizer_base_type) :: sgd !! Stochastic Gradient Descent optimizer - real :: learning_rate real :: momentum = 0 !TODO logical :: nesterov = .false. !TODO + contains + procedure :: minimize => minimize_sgd end type sgd +contains + + elemental subroutine minimize_sgd(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule. + class(sgd), intent(in) :: self + !! Optimizer instance + real, intent(inout) :: param + !! Network parameter (i.e. weight or bias) to update + real, intent(in) :: gradient + !! Loss gradient with respect to the parameter (dL/dw or dL/db) + ! TODO Implement momentum and Nesterov options + ! TODO (see https://keras.io/api/optimizers/sgd/) + param = param - self % learning_rate * gradient + end subroutine minimize_sgd + end module nf_optimizers diff --git a/test/test_dense_network.f90 b/test/test_dense_network.f90 index 260a2ed8..fcfae094 100644 --- a/test/test_dense_network.f90 +++ b/test/test_dense_network.f90 @@ -1,6 +1,7 @@ program test_dense_network use iso_fortran_env, only: stderr => error_unit use nf, only: dense, input, network + use nf_optimizers, only: sgd implicit none type(network) :: net logical :: ok = .true. @@ -34,7 +35,7 @@ program test_dense_network do n = 1, num_iterations call net % forward(x) call net % backward(y) - call net % update(1.) + call net % update(sgd(learning_rate=1.)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do
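The module documentation added to nf_optimizers.f90 above explains that a new optimizer is written by extending the abstract optimizer_base_type and providing a concrete, elemental minimize method. As a concrete illustration of that contract, here is a minimal sketch of such an extension. It is not part of the patch: the module name my_optimizers, the scaled_sgd type, and its scale component are invented for this example.

module my_optimizers

  !! Hypothetical user-defined optimizer, sketched against the abstract
  !! interface introduced in nf_optimizers.f90 above. The type name and
  !! the scale component are illustrative only.

  use nf_optimizers, only: optimizer_base_type

  implicit none

  private
  public :: scaled_sgd

  type, extends(optimizer_base_type) :: scaled_sgd
    !! Plain gradient descent with an extra user-supplied damping factor.
    real :: scale = 0.5
  contains
    procedure :: minimize => minimize_scaled_sgd
  end type scaled_sgd

contains

  elemental subroutine minimize_scaled_sgd(self, param, gradient)
    !! Update a single parameter in place; because the binding is
    !! elemental, layer % update can apply it to weight and bias arrays
    !! of any rank.
    class(scaled_sgd), intent(in) :: self
    real, intent(inout) :: param
    real, intent(in) :: gradient
    param = param - self % scale * self % learning_rate * gradient
  end subroutine minimize_scaled_sgd

end module my_optimizers

Because network % update forwards whatever optimizer_base_type extension it receives down to the layers, a call like net % update(scaled_sgd(learning_rate=0.1)) would apply this rule. Note that network % train, as changed in this patch, still accepts only sgd through its select type guard.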
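A second sketch, also hypothetical, shows how a user-side training loop reads with the updated network % update signature, in particular the optional batch_size argument that the bundled examples do not exercise directly (example/quadratic.f90 still scales the learning rate by the batch size instead). The layer sizes and toy data are made up, and the network([...]) constructor is assumed to be the one used by the existing examples in this repository.

program batch_update_sketch

  !! Illustrative only: exercises the new optional batch_size argument
  !! of network % update. Shapes and data below are invented.

  use nf, only: dense, input, network
  use nf_optimizers, only: sgd

  implicit none

  type(network) :: net
  real :: x(100), y(100)
  integer, parameter :: batch_size = 10
  integer :: i, j, n

  net = network([input(1), dense(3), dense(1)])

  call random_number(x)
  y = x**2 / 2

  do n = 1, 1000
    do i = 1, size(x), batch_size

      ! Accumulate gradients over one mini-batch...
      do j = i, i + batch_size - 1
        call net % forward([x(j)])
        call net % backward([y(j)])
      end do

      ! ...then let update() average them: layer % update divides the
      ! summed gradients by batch_size before applying the optimizer step.
      call net % update(sgd(learning_rate=0.1), batch_size=batch_size)

    end do
  end do

end program batch_update_sketch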