From e9bfbd60123cd0a19aaafe1f0ad7f47efb2808a7 Mon Sep 17 00:00:00 2001 From: Saurabh Suresh Powar <66636289+Spnetic-5@users.noreply.github.com> Date: Fri, 14 Jul 2023 10:18:00 -0400 Subject: [PATCH] Added Momentum and Nesterov modifications (#148) * Added Momentum and Nesterov modifications * Resolved dummy argument error * Changes in update formulas * Corrected formulae, velocity allocation changes * Added concrete implementation of RMSProp * Report RMSE every 10% of num_epochs; Fix xtest calculation * Initialize networks with same weights; larger batch; larger test array * Start putting RMS and velocity structures in place; yet to be allocated and initialized * WIP: SGD and RMSprop optimizers plumbing at the network % update level * Added get_gradients() method (draft) * Clean up formatting and docstrings * Flush gradients to zero; code compiles but segfaults * Set default value for batch_size; tests pass in debug mode but segfault in optimized mode * Update learning rates in simple and sine examples because the default changed * Added draft test suite for optimizers * Store the optimizer as a member of the network type * Don't print to stdout; indentation * Added convergence tests * Resolved comments * Clean up * Import RMSProp * Remove old code * Add optimizer support notes --------- Co-authored-by: milancurcic --- README.md | 10 ++- example/quadratic.f90 | 126 +++++++++++++++++++-------- example/simple.f90 | 4 +- example/sine.f90 | 4 +- src/nf.f90 | 2 +- src/nf/nf_conv2d_layer.f90 | 15 +++- src/nf/nf_conv2d_layer_submodule.f90 | 12 +++ src/nf/nf_dense_layer.f90 | 14 ++- src/nf/nf_dense_layer_submodule.f90 | 12 +++ src/nf/nf_layer.f90 | 24 ++--- src/nf/nf_layer_submodule.f90 | 78 +++++------------ src/nf/nf_network.f90 | 11 ++- src/nf/nf_network_submodule.f90 | 88 ++++++++++++++++--- src/nf/nf_optimizers.f90 | 105 ++++++++++++++++++---- test/CMakeLists.txt | 1 + test/test_optimizers.f90 | 115 ++++++++++++++++++++++++ 16 files changed, 470 insertions(+), 151 deletions(-) create mode 100644 test/test_optimizers.f90 diff --git a/README.md b/README.md index 920a3e0..6be5811 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,11 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). * Training and inference of dense (fully connected) and convolutional neural networks +* Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum, + and RMSProp +* More than a dozen activation functions and their derivatives * Loading dense and convolutional models from Keras HDF5 (.h5) files -* Stochastic and mini-batch gradient descent for back-propagation * Data-based parallelism -* Several activation functions and their derivatives ### Available layers @@ -28,11 +29,14 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). 
|------------|------------------|------------------------|----------------------|--------------|---------------| | Input | `input` | n/a | 1, 3 | n/a | n/a | | Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ | -| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | +| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ❌ | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | | Flatten | `flatten` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ | +**Note:** The training of convolutional layers has been discovered to be broken +as of release 0.13.0. This will be fixed in a future (hopefully next) release. + ## Getting started Get the code: diff --git a/example/quadratic.f90 b/example/quadratic.f90 index da06bf3..168a6ae 100644 --- a/example/quadratic.f90 +++ b/example/quadratic.f90 @@ -7,20 +7,19 @@ program quadratic_fit use nf_optimizers, only: sgd implicit none - type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop + type(network) :: net(6) ! Training parameters integer, parameter :: num_epochs = 1000 integer, parameter :: train_size = 1000 - integer, parameter :: test_size = 30 - integer, parameter :: batch_size = 10 + integer, parameter :: test_size = 100 + integer, parameter :: batch_size = 100 real, parameter :: learning_rate = 0.01 real, parameter :: decay_rate = 0.9 ! Input and output data real, allocatable :: x(:), y(:) ! training data real, allocatable :: xtest(:), ytest(:) ! testing data - real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:) integer :: i, n @@ -28,7 +27,7 @@ program quadratic_fit print '(60("="))' allocate(xtest(test_size), ytest(test_size)) - xtest = [((i - 1) * 2 / test_size, i = 1, test_size)] + xtest = [(real(i - 1) * 2 / test_size, i = 1, test_size)] ytest = quadratic(xtest) ! x and y as 1-D arrays @@ -41,38 +40,30 @@ program quadratic_fit end do y = quadratic(x) - ! Instantiate a separate network for each optimization method. - net_sgd = network([input(1), dense(3), dense(1)]) - net_batch_sgd = network([input(1), dense(3), dense(1)]) - net_minibatch_sgd = network([input(1), dense(3), dense(1)]) - net_rms_prop = network([input(1), dense(3), dense(1)]) + ! Instantiate a network and copy an instance to the rest of the array + net(1) = network([input(1), dense(3), dense(1)]) + net(2:) = net(1) ! Print network info to stdout; this will be the same for all three networks. - call net_sgd % print_info() + call net(1) % print_info() - ! SGD optimizer - call sgd_optimizer(net_sgd, x, y, learning_rate, num_epochs) + ! SGD, no momentum + call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs) + + ! SGD, momentum + call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9) + + ! SGD, momentum with Nesterov + call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.) ! Batch SGD optimizer - call batch_gd_optimizer(net_batch_sgd, x, y, learning_rate, num_epochs) + call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs) ! Mini-batch SGD optimizer - call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size) + call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size) ! 
RMSProp optimizer - call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate) - - ! Calculate predictions on the test set - ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)] - ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)] - ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)] - ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)] - - ! Print the mean squared error - print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest) - print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest) - print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest) - print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest) + call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate) contains @@ -82,39 +73,70 @@ real elemental function quadratic(x) result(y) y = (x**2 / 2 + x / 2 + 1) / 2 end function quadratic - subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs) + subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov) ! In the stochastic gradient descent (SGD) optimizer, we run the forward ! and backward passes and update the weights for each training sample, ! one at a time. type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate integer, intent(in) :: num_epochs + real, intent(in), optional :: momentum + logical, intent(in), optional :: nesterov + real, allocatable :: ypred(:) + real :: momentum_value + logical :: nesterov_value integer :: i, n - print *, "Running SGD optimizer..." + print '(a)', 'Stochastic gradient descent' + print '(34("-"))' + + ! Set default values for momentum and nesterov + if (.not. present(momentum)) then + momentum_value = 0.0 + else + momentum_value = momentum + end if + + if (.not. present(nesterov)) then + nesterov_value = .false. + else + nesterov_value = nesterov + end if do n = 1, num_epochs do i = 1, size(x) call net % forward([x(i)]) call net % backward([y(i)]) - call net % update(sgd(learning_rate=learning_rate)) + call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value)) end do + + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + print *, '' + end subroutine sgd_optimizer - subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs) + subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs) ! Like the stochastic gradient descent (SGD) optimizer, except that here we ! accumulate the weight gradients for all training samples and update the ! weights once per epoch. type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate integer, intent(in) :: num_epochs + real, allocatable :: ypred(:) integer :: i, n - print *, "Running batch GD optimizer..." 
+ print '(a)', 'Batch gradient descent' + print '(34("-"))' do n = 1, num_epochs do i = 1, size(x) @@ -122,11 +144,19 @@ subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs) call net % backward([y(i)]) end do call net % update(sgd(learning_rate=learning_rate / size(x))) + + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + print *, '' + end subroutine batch_gd_optimizer - subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_size) + subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size) ! Like the batch SGD optimizer, except that here we accumulate the weight ! over a number of mini batches and update the weights once per mini batch. ! @@ -134,13 +164,16 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si ! this subroutine to converge to a solution. type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate integer, intent(in) :: num_epochs, batch_size integer :: i, j, n, num_samples, num_batches, start_index, end_index real, allocatable :: batch_x(:), batch_y(:) integer, allocatable :: batch_indices(:) + real, allocatable :: ypred(:) - print *, "Running mini-batch GD optimizer..." + print '(a)', 'Minibatch gradient descent' + print '(34("-"))' num_samples = size(x) num_batches = num_samples / batch_size @@ -167,17 +200,28 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si call net % update(sgd(learning_rate=learning_rate / batch_size)) end do + + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + + print *, '' + end subroutine minibatch_gd_optimizer - subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate) + subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate) ! RMSprop optimizer for updating weights using root mean square type(network), intent(inout) :: net real, intent(in) :: x(:), y(:) + real, intent(in) :: xtest(:), ytest(:) real, intent(in) :: learning_rate, decay_rate integer, intent(in) :: num_epochs integer :: i, j, n real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero + real, allocatable :: ypred(:) ! Define a dedicated type to store the RMSprop gradients. ! This is needed because array sizes vary between layers and we need to @@ -191,7 +235,8 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate) type(rms_gradient_dense), allocatable :: rms(:) - print *, "Running RMSprop optimizer..." + print '(a)', 'RMSProp optimizer' + print '(34("-"))' ! Here we allocate the array or RMS gradient derived types. ! 
We need one for each dense layer, however we will allocate it to the @@ -237,8 +282,15 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate) end select end do + if (mod(n, num_epochs / 10) == 0) then + ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))] + print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest) + end if + end do + print *, '' + end subroutine rmsprop_optimizer subroutine shuffle(arr) diff --git a/example/simple.f90 b/example/simple.f90 index 07e5646..32a93dc 100644 --- a/example/simple.f90 +++ b/example/simple.f90 @@ -1,5 +1,5 @@ program simple - use nf, only: dense, input, network + use nf, only: dense, input, network, sgd implicit none type(network) :: net real, allocatable :: x(:), y(:) @@ -24,7 +24,7 @@ program simple call net % forward(x) call net % backward(y) - call net % update() + call net % update(optimizer=sgd(learning_rate=1.)) if (mod(n, 50) == 0) & print '(i4,2(3x,f8.6))', n, net % predict(x) diff --git a/example/sine.f90 b/example/sine.f90 index 0ab5749..b37a243 100644 --- a/example/sine.f90 +++ b/example/sine.f90 @@ -1,5 +1,5 @@ program sine - use nf, only: dense, input, network + use nf, only: dense, input, network, sgd implicit none type(network) :: net real :: x(1), y(1) @@ -31,7 +31,7 @@ program sine call net % forward(x) call net % backward(y) - call net % update() + call net % update(optimizer=sgd(learning_rate=1.)) if (mod(n, 10000) == 0) then ypred = [(net % predict([xtest(i)]), i = 1, test_size)] diff --git a/src/nf.f90 b/src/nf.f90 index e4522aa..82fcb80 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -5,7 +5,7 @@ module nf use nf_layer_constructors, only: & conv2d, dense, flatten, input, maxpool2d, reshape use nf_network, only: network - use nf_optimizers, only: sgd + use nf_optimizers, only: sgd, rmsprop use nf_activation, only: activation_function, elu, exponential, & gaussian, linear, relu, leaky_relu, & sigmoid, softmax, softplus, step, tanhf, & diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 83d6597..7b72980 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -30,11 +30,12 @@ module nf_conv2d_layer contains - procedure :: init procedure :: forward procedure :: backward + procedure :: get_gradients procedure :: get_num_params procedure :: get_params + procedure :: init procedure :: set_params end type conv2d_layer @@ -89,13 +90,23 @@ pure module function get_num_params(self) result(num_params) end function get_num_params pure module function get_params(self) result(params) - !! Get the parameters of the layer. + !! Return the parameters (weights and biases) of this layer. + !! The parameters are ordered as weights first, biases second. class(conv2d_layer), intent(in) :: self !! A `conv2d_layer` instance real, allocatable :: params(:) !! Parameters to get end function get_params + pure module function get_gradients(self) result(gradients) + !! Return the gradients of this layer. + !! The gradients are ordered as weights first, biases second. + class(conv2d_layer), intent(in) :: self + !! A `conv2d_layer` instance + real, allocatable :: gradients(:) + !! Gradients to get + end function get_gradients + module subroutine set_params(self, params) !! Set the parameters of the layer. 
class(conv2d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index a24145f..a2fadfc 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -202,6 +202,18 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(conv2d_layer), intent(in) :: self + real, allocatable :: gradients(:) + + gradients = [ & + pack(self % dw, .true.), & + pack(self % db, .true.) & + ] + + end function get_gradients + + module subroutine set_params(self, params) class(conv2d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index ad0c6e2..ae523cc 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -33,10 +33,11 @@ module nf_dense_layer procedure :: backward procedure :: forward + procedure :: get_gradients procedure :: get_num_params procedure :: get_params - procedure :: set_params procedure :: init + procedure :: set_params end type dense_layer @@ -87,7 +88,7 @@ pure module function get_num_params(self) result(num_params) end function get_num_params pure module function get_params(self) result(params) - !! Return the parameters of this layer. + !! Return the parameters (weights and biases) of this layer. !! The parameters are ordered as weights first, biases second. class(dense_layer), intent(in) :: self !! Dense layer instance @@ -95,6 +96,15 @@ pure module function get_params(self) result(params) !! Parameters of this layer end function get_params + pure module function get_gradients(self) result(gradients) + !! Return the gradients of this layer. + !! The gradients are ordered as weights first, biases second. + class(dense_layer), intent(in) :: self + !! Dense layer instance + real, allocatable :: gradients(:) + !! Gradients of this layer + end function get_gradients + module subroutine set_params(self, params) !! Set the parameters of this layer. !! The parameters are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index fb61060..1caec7c 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -69,6 +69,18 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(dense_layer), intent(in) :: self + real, allocatable :: gradients(:) + + gradients = [ & + pack(self % dw, .true.), & + pack(self % db, .true.) & + ] + + end function get_gradients + + module subroutine set_params(self, params) class(dense_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 28fdaae..e9e90da 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -28,10 +28,10 @@ module nf_layer procedure :: forward procedure :: get_num_params procedure :: get_params + procedure :: get_gradients procedure :: set_params procedure :: init procedure :: print_info - procedure :: update ! Specific subroutines for different array ranks procedure, private :: backward_1d @@ -137,6 +137,14 @@ pure module function get_params(self) result(params) !! Parameters of this layer end function get_params + pure module function get_gradients(self) result(gradients) + !! Returns the gradients of this layer. + class(layer), intent(in) :: self + !! 
Layer instance + real, allocatable :: gradients(:) + !! Gradients of this layer + end function get_gradients + module subroutine set_params(self, params) !! Returns the parameters of this layer. class(layer), intent(in out) :: self @@ -145,20 +153,6 @@ module subroutine set_params(self, params) !! Parameters of this layer end subroutine set_params - impure elemental module subroutine update(self, optimizer, batch_size) - !! Update the weights and biases on the layer using the stored - !! gradients (from backward passes), and flush those same stored - !! gradients to zero. - !! This changes the state of the layer. - !! Typically used only internally from the `network % update` method. - class(layer), intent(in out) :: self - !! Layer instance - class(optimizer_base_type), intent(in) :: optimizer - !! Optimizer instance to use - integer, intent(in), optional :: batch_size - !! Batch size (default 1) - end subroutine update - end interface end module nf_layer diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 2216d24..0746764 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -298,7 +298,6 @@ elemental module function get_num_params(self) result(num_params) end function get_num_params - pure module function get_params(self) result(params) class(layer), intent(in) :: self real, allocatable :: params(:) @@ -324,6 +323,30 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(layer), intent(in) :: self + real, allocatable :: gradients(:) + + select type (this_layer => self % p) + type is (input1d_layer) + ! No gradients to get. + type is (input3d_layer) + ! No gradients to get. + type is (dense_layer) + gradients = this_layer % get_gradients() + type is (conv2d_layer) + gradients = this_layer % get_gradients() + type is (maxpool2d_layer) + ! No gradients to get. + type is (flatten_layer) + ! No gradients to get. + type is (reshape3d_layer) + ! No gradients to get. + class default + error stop 'Unknown layer type.' + end select + + end function get_gradients module subroutine set_params(self, params) class(layer), intent(in out) :: self @@ -382,57 +405,4 @@ module subroutine set_params(self, params) end subroutine set_params - - impure elemental module subroutine update(self, optimizer, batch_size) - class(layer), intent(in out) :: self - class(optimizer_base_type), intent(in) :: optimizer - integer, intent(in), optional :: batch_size - integer :: batch_size_ - - batch_size_ = 1 - if (present(batch_size)) batch_size_ = batch_size - - select type (this_layer => self % p) - type is (dense_layer) - - ! Sum weight and bias gradients across images, if any - call co_sum(this_layer % dw) - call co_sum(this_layer % db) - - call optimizer % minimize( & - this_layer % weights, & - this_layer % dw / batch_size_ & - ) - call optimizer % minimize( & - this_layer % biases, & - this_layer % db / batch_size_ & - ) - - ! Reset gradients. - this_layer % dw = 0 - this_layer % db = 0 - - type is (conv2d_layer) - - ! Sum weight and bias gradients across images, if any - call co_sum(this_layer % dw) - call co_sum(this_layer % db) - - call optimizer % minimize( & - this_layer % kernel, & - this_layer % dw / batch_size_ & - ) - call optimizer % minimize( & - this_layer % biases, & - this_layer % db / batch_size_ & - ) - - ! Reset gradients. 
- this_layer % dw = 0 - this_layer % db = 0 - - end select - - end subroutine update - end submodule nf_layer_submodule diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 38941ab..f12926e 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -13,14 +13,16 @@ module nf_network type :: network type(layer), allocatable :: layers(:) + class(optimizer_base_type), allocatable :: optimizer contains procedure :: backward + procedure :: get_gradients procedure :: get_num_params procedure :: get_params - procedure :: set_params procedure :: print_info + procedure :: set_params procedure :: train procedure :: update @@ -161,6 +163,13 @@ pure module function get_params(self) result(params) !! Network parameters to get end function get_params + pure module function get_gradients(self) result(gradients) + class(network), intent(in) :: self + !! Network instance + real, allocatable :: gradients(:) + !! Network gradients to set + end function get_gradients + module subroutine set_params(self, params) !! Set the network parameters (weights and biases). class(network), intent(in out) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index cdd8cad..5bafb7c 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -498,6 +498,26 @@ pure module function get_params(self) result(params) end function get_params + pure module function get_gradients(self) result(gradients) + class(network), intent(in) :: self + real, allocatable :: gradients(:) + integer :: n, nstart, nend + + allocate(gradients(self % get_num_params())) + + nstart = 1 + do n = 1, size(self % layers) + + if (self % layers(n) % get_num_params() < 1) cycle + + nend = nstart + self % layers(n) % get_num_params() - 1 + gradients(nstart:nend) = self % layers(n) % get_gradients() + nstart = nend + 1 + end do + + end function get_gradients + + module subroutine set_params(self, params) class(network), intent(in out) :: self real, intent(in) :: params(:) @@ -538,11 +558,13 @@ module subroutine train(self, input_data, output_data, batch_size, & ! Passing the optimizer instance is optional. ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then - optimizer_ = optimizer + self % optimizer = optimizer else - optimizer_ = sgd() + self % optimizer = sgd() end if + call self % optimizer % init(self % get_num_params()) + dataset_size = size(output_data, dim=2) epoch_loop: do n = 1, epochs @@ -567,12 +589,7 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - select type (optimizer_) - type is (sgd) - call self % update(optimizer_, batch_size) - class default - error stop 'Unsupported optimizer' - end select + call self % update(batch_size=batch_size) end do batch_loop end do epoch_loop @@ -585,16 +602,59 @@ module subroutine update(self, optimizer, batch_size) class(optimizer_base_type), intent(in), optional :: optimizer integer, intent(in), optional :: batch_size class(optimizer_base_type), allocatable :: optimizer_ + integer :: batch_size_ + real, allocatable :: params(:) + integer :: n - ! Passing the optimizer instance is optional. - ! If not provided, we default to SGD with its default settings. - if (present(optimizer)) then - optimizer_ = optimizer + ! Passing the optimizer instance is optional. If not provided, and if the + ! optimizer has not already been set, we default to the default SGD. The + ! 
instantiation and initialization below of the optimizer is normally done + ! at the beginning of the network % train() method. However, if the user + ! wants to call network % update() directly, for example if they use their + ! own custom mini-batching routine, we initialize the optimizer here as + ! well. If it's initialized already, this step is a cheap no-op. + if (.not. allocated(self % optimizer)) then + if (present(optimizer)) then + self % optimizer = optimizer + else + self % optimizer = sgd() + end if + call self % optimizer % init(self % get_num_params()) + end if + + if (present(batch_size)) then + batch_size_ = batch_size else - optimizer_ = sgd() + batch_size_ = 1 end if - call self % layers % update(optimizer_, batch_size) + ! Sum weight and bias gradients across images, if any + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + type is(conv2d_layer) + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + end select + end do + + params = self % get_params() + call self % optimizer % minimize(params, self % get_gradients() / batch_size_) + call self % set_params(params) + + ! Flush network gradients to zero. + do concurrent(n = 2:size(self % layers)) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + this_layer % dw = 0 + this_layer % db = 0 + type is(conv2d_layer) + this_layer % dw = 0 + this_layer % db = 0 + end select + end do end subroutine update diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 725e561..3b7dc01 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -13,45 +13,114 @@ module nf_optimizers implicit none private - public :: optimizer_base_type, sgd + public :: optimizer_base_type, sgd, rmsprop type, abstract :: optimizer_base_type - real :: learning_rate = 1 + real :: learning_rate = 0.01 contains + procedure(init), deferred :: init procedure(minimize), deferred :: minimize end type optimizer_base_type abstract interface - elemental subroutine minimize(self, param, gradient) + + impure elemental subroutine init(self, num_params) + import :: optimizer_base_type + class(optimizer_base_type), intent(inout) :: self + integer, intent(in) :: num_params + end subroutine init + + pure subroutine minimize(self, param, gradient) import :: optimizer_base_type - class(optimizer_base_type), intent(in) :: self - real, intent(inout) :: param - real, intent(in) :: gradient + class(optimizer_base_type), intent(inout) :: self + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) end subroutine minimize + end interface type, extends(optimizer_base_type) :: sgd !! Stochastic Gradient Descent optimizer - real :: momentum = 0 !TODO - logical :: nesterov = .false. !TODO + real :: momentum = 0 + logical :: nesterov = .false. + real, allocatable :: velocity(:) contains + procedure :: init => init_sgd procedure :: minimize => minimize_sgd end type sgd + type, extends(optimizer_base_type) :: rmsprop + !! RMSProp optimizer + real :: decay_rate = 0.9 + real :: epsilon = 1e-8 + real, allocatable :: rms_gradient(:) + contains + procedure :: init => init_rmsprop + procedure :: minimize => minimize_rmsprop + end type rmsprop + contains - elemental subroutine minimize_sgd(self, param, gradient) + impure elemental subroutine init_sgd(self, num_params) + class(sgd), intent(inout) :: self + integer, intent(in) :: num_params + if (self % momentum > 0 .and. .not. 
allocated(self % velocity)) then + allocate(self % velocity(num_params)) + self % velocity = 0 + end if + end subroutine init_sgd + + + pure subroutine minimize_sgd(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. - class(sgd), intent(in) :: self - !! Optimizer instance - real, intent(inout) :: param - !! Network parameter (i.e. weight or bias) to update - real, intent(in) :: gradient - !! Loss gradient with respect to the parameter (dL/dw or dL/db) - ! TODO Implement momentum and Nesterov options - ! TODO (see https://keras.io/api/optimizers/sgd/) - param = param - self % learning_rate * gradient + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * gradient + if (self % nesterov) then + ! Apply Nesterov update + param = param + self % momentum * self % velocity & + - self % learning_rate * gradient + else + param = param + self % velocity + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + end subroutine minimize_sgd + + impure elemental subroutine init_rmsprop(self, num_params) + class(rmsprop), intent(inout) :: self + integer, intent(in) :: num_params + if (.not. allocated(self % rms_gradient)) then + allocate(self % rms_gradient(num_params)) + self % rms_gradient = 0 + end if + end subroutine init_rmsprop + + + pure subroutine minimize_rmsprop(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * gradient**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(self % rms_gradient + self % epsilon) * gradient + + end subroutine minimize_rmsprop + end module nf_optimizers diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 485dd87..26646ec 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,7 @@ foreach(execid dense_network_from_keras cnn_from_keras conv2d_network + optimizers ) add_executable(test_${execid} test_${execid}.f90) target_link_libraries(test_${execid} PRIVATE neural h5fortran::h5fortran jsonfortran::jsonfortran ${LIBS}) diff --git a/test/test_optimizers.f90 b/test/test_optimizers.f90 new file mode 100644 index 0000000..dc41f91 --- /dev/null +++ b/test/test_optimizers.f90 @@ -0,0 +1,115 @@ +program test_optimizers + + use nf, only: dense, input, network, rmsprop, sgd + use iso_fortran_env, only: stderr => error_unit + + implicit none + type(network) :: net(4) + real, allocatable :: x(:), y(:) + real, allocatable :: ypred(:) + integer, parameter :: num_iterations = 1000 + integer :: n, i + logical :: ok = .true. + logical :: converged = .false. + + ! 
Instantiate a network and copy an instance to the rest of the array + net(1) = network([input(3), dense(5), dense(2)]) + net(2:) = net(1) + + x = [0.2, 0.4, 0.6] + y = [0.123456, 0.246802] + + do n = 0, num_iterations + + call net(1) % forward(x) + call net(1) % backward(y) + call net(1) % update(optimizer=sgd(learning_rate=1.)) + + ypred = net(1) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') 'sgd should converge in simple training.. failed' + ok = .false. + end if + + converged = .false. + + do n = 0, num_iterations + + call net(2) % forward(x) + call net(2) % backward(y) + call net(2) % update(optimizer=sgd(learning_rate=1., momentum=0.9)) + + ypred = net(2) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') & + 'sgd(momentum) should converge in simple training.. failed' + ok = .false. + end if + + converged = .false. + + do n = 0, num_iterations + + call net(3) % forward(x) + call net(3) % backward(y) + call net(3) % update(optimizer=sgd(learning_rate=1., momentum=0.9, nesterov=.true.)) + + ypred = net(3) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') & + 'sgd(nesterov) should converge in simple training.. failed' + ok = .false. + end if + + ! Resetting convergence flag + converged = .false. + + do n = 0, num_iterations + + call net(4) % forward(x) + call net(4) % backward(y) + call net(4) % update(optimizer=rmsprop(learning_rate=0.01, decay_rate=0.9)) + + ypred = net(4) % predict(x) + converged = check_convergence(y, ypred) + if (converged) exit + + end do + + if (.not. converged) then + write(stderr, '(a)') 'rmsprop should converge in simple training.. failed' + ok = .false. + end if + + if (ok) then + print '(a)', 'test_optimizers: All tests passed.' + else + write(stderr, '(a)') 'test_optimizers: One or more tests failed.' + stop 1 + end if + + contains + + pure logical function check_convergence(y, ypred) result(converged) + ! Check convergence of ypred to y based on RMSE < tolerance. + real, intent(in) :: y(:), ypred(:) + real, parameter :: tolerance = 1e-3 + converged = sqrt(sum((ypred - y)**2) / size(y)) < tolerance + end function check_convergence + +end program test_optimizers
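Review notes (editorial additions, not part of the patch itself):

For convenience, the update rules exactly as implemented in `minimize_sgd` and `minimize_rmsprop` above, with $g = \partial L / \partial w$, $\eta$ = `learning_rate`, $\mu$ = `momentum`, $\rho$ = `decay_rate`, $\epsilon$ = `epsilon`, $v$ = `velocity`, and $s$ = `rms_gradient` (the velocity and Nesterov branches apply only when $\mu > 0$; otherwise the update is plain $w \gets w - \eta g$):

$$
\begin{aligned}
\text{SGD with momentum:} \quad & v \gets \mu v - \eta g, & w &\gets w + v \\
\text{SGD with Nesterov momentum:} \quad & v \gets \mu v - \eta g, & w &\gets w + \mu v - \eta g \\
\text{RMSProp:} \quad & s \gets \rho s + (1 - \rho)\, g^2, & w &\gets w - \frac{\eta}{\sqrt{s + \epsilon}}\, g
\end{aligned}
$$

Below is a minimal usage sketch of the optimizer interface this patch introduces. The toy data, array shapes, and hyperparameters are illustrative assumptions; the `sgd` and `rmsprop` constructors and the `train` / `forward` / `backward` / `update` calls follow the diff above.

```fortran
program optimizer_usage_sketch
  ! Editorial sketch: exercises the optimizer API added in this patch.
  ! Toy data and hyperparameters are assumptions, not repository code.
  use nf, only: dense, input, network, sgd, rmsprop
  implicit none
  type(network) :: net1, net2
  real :: x(1,100), y(1,100)
  integer :: i

  call random_number(x)
  y = (x**2 / 2 + x / 2 + 1) / 2   ! same quadratic as example/quadratic.f90

  net1 = network([input(1), dense(3), dense(1)])
  net2 = net1

  ! High-level path: pass an optimizer instance to train();
  ! if omitted, train() falls back to sgd() with default settings.
  call net1 % train(x, y, batch_size=10, epochs=100, &
    optimizer=rmsprop(learning_rate=0.01, decay_rate=0.9))

  ! Low-level path: a custom loop with explicit update() calls, as in
  ! example/quadratic.f90 and test/test_optimizers.f90. The optimizer
  ! passed on the first update() call is stored on the network and reused.
  do i = 1, size(x, dim=2)
    call net2 % forward(x(:,i))
    call net2 % backward(y(:,i))
    call net2 % update(optimizer=sgd(learning_rate=0.01, momentum=0.9, nesterov=.true.))
  end do

end program optimizer_usage_sketch
```

Either path ends in `network % update`, which sums the layer gradients across images, applies the stored optimizer to the flattened parameter vector, and flushes the layer gradients to zero.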