Commit b119194
Adagrad Optimizer Implementation (#154)
* Adagrad Implementation
* Resolved comments
* Added test for adagrad
* Comment
* Fix L2 penalty and learning rate decay
* Add Adagrad to the list in README
* Bump minor version

Co-authored-by: milancurcic <[email protected]>
1 parent 6adc1c2 commit b119194

File tree: 6 files changed (+158 -10 lines)


README.md

+1 -1

@@ -18,7 +18,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 * Training and inference of dense (fully connected) and convolutional neural
   networks
 * Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
-  RMSProp, Adam, AdamW
+  RMSProp, Adagrad, Adam, AdamW
 * More than a dozen activation functions and their derivatives
 * Loading dense and convolutional models from Keras HDF5 (.h5) files
 * Data-based parallelism
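For orientation, here is a minimal sketch of how the new optimizer can be driven, following the forward/backward/update pattern and the adagrad constructor keywords that appear in the test and example diffs below. The network shape, training data, iteration count, and the network([...]) construction are illustrative assumptions, not part of this commit.

program adagrad_usage_sketch
  ! Hypothetical example: fit a single input-output pair with the new
  ! Adagrad optimizer, mirroring the update pattern in test/test_optimizers.f90.
  use nf, only: dense, input, network, adagrad
  implicit none
  type(network) :: net
  real :: x(1), y(1)
  integer :: n

  net = network([input(1), dense(3), dense(1)])
  x = [0.5]
  y = [0.25]

  do n = 1, 1000
    call net % forward(x)
    call net % backward(y)
    ! Constructor keywords match the components added in nf_optimizers.f90.
    call net % update(optimizer=adagrad(learning_rate=0.01, &
      weight_decay_l2=1e-4, learning_rate_decay=0.99))
  end do

  print *, 'Prediction after training:', net % predict(x)

end program adagrad_usage_sketch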

example/quadratic.f90

+75 -2

@@ -4,10 +4,10 @@ program quadratic_fit
   ! descent.
   use nf, only: dense, input, network
   use nf_dense_layer, only: dense_layer
-  use nf_optimizers, only: sgd, rmsprop, adam
+  use nf_optimizers, only: sgd, rmsprop, adam, adagrad
 
   implicit none
-  type(network) :: net(9)
+  type(network) :: net(11)
 
   ! Training parameters
   integer, parameter :: num_epochs = 1000
@@ -95,6 +95,17 @@ program quadratic_fit
     beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
   )
 
+  ! Adagrad optimizer
+  call adagrad_optimizer( &
+    net(10), x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
+  )
+
+  ! Adagrad optimizer with L2 regularization and learning rate decay
+  call adagrad_optimizer( &
+    net(11), x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
+    weight_decay_l2=1e-4, learning_rate_decay=0.99 &
+  )
+
 contains
 
   real elemental function quadratic(x) result(y)
@@ -358,6 +369,68 @@ subroutine adam_optimizer( &
 
   end subroutine adam_optimizer
 
+  subroutine adagrad_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
+    weight_decay_l2, learning_rate_decay &
+  )
+    ! Adagrad optimizer for updating weights using adaptive gradient algorithm
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: xtest(:), ytest(:)
+    real, intent(in) :: learning_rate, epsilon
+    real, intent(in), optional :: weight_decay_l2
+    real, intent(in), optional :: learning_rate_decay
+    integer, intent(in) :: num_epochs
+    integer :: i, n
+    real, allocatable :: ypred(:)
+    real :: weight_decay_l2_val
+    real :: learning_rate_decay_val
+
+    ! Set default values for weight_decay_l2
+    if (.not. present(weight_decay_l2)) then
+      weight_decay_l2_val = 0.0
+    else
+      weight_decay_l2_val = weight_decay_l2
+    end if
+
+    ! Set default values for learning_rate_decay
+    if (.not. present(learning_rate_decay)) then
+      learning_rate_decay_val = 0.0
+    else
+      learning_rate_decay_val = learning_rate_decay
+    end if
+
+    print '(a)', 'Adagrad optimizer'
+    print '(34("-"))'
+
+    do n = 1, num_epochs
+
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
+
+      call net % update( &
+        adagrad( &
+          learning_rate=learning_rate, &
+          epsilon=epsilon, &
+          weight_decay_l2=weight_decay_l2_val, &
+          learning_rate_decay=learning_rate_decay_val &
+        ) &
+      )
+
+      if (mod(n, num_epochs / 10) == 0) then
+        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+      end if
+
+    end do
+
+    print *, ''
+
+  end subroutine adagrad_optimizer
+
   subroutine shuffle(arr)
     ! Shuffle an array using the Fisher-Yates algorithm.
     integer, intent(inout) :: arr(:)
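In the two calls added above, net(10) uses plain Adagrad, while net(11) also passes weight_decay_l2=1e-4 and learning_rate_decay=0.99. Given the decay schedule implemented in src/nf/nf_optimizers.f90 further down, the effective learning rate at the optimizer's internal step t is

    \eta_t = \frac{\eta}{1 + (t - 1)\, d}

so with d = 0.99 the step size equals \eta at t = 1, about \eta/2 at t = 2, and exactly \eta/100 at t = 101.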

fpm.toml

+1 -1

@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.14.0"
+version = "0.15.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "[email protected]"

src/nf.f90

+1 -1

@@ -5,7 +5,7 @@ module nf
   use nf_layer_constructors, only: &
     conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_network, only: network
-  use nf_optimizers, only: sgd, rmsprop, adam
+  use nf_optimizers, only: sgd, rmsprop, adam, adagrad
   use nf_activation, only: activation_function, elu, exponential, &
     gaussian, linear, relu, leaky_relu, &
     sigmoid, softmax, softplus, step, tanhf, &

src/nf/nf_optimizers.f90

+58 -3

@@ -13,7 +13,7 @@ module nf_optimizers
   implicit none
 
   private
-  public :: optimizer_base_type, sgd, rmsprop, adam
+  public :: optimizer_base_type, sgd, rmsprop, adam, adagrad
 
   type, abstract :: optimizer_base_type
     real :: learning_rate = 0.01
@@ -87,6 +87,23 @@ end subroutine minimize
     procedure :: minimize => minimize_adam
   end type adam
 
+  type, extends(optimizer_base_type) :: adagrad
+    !! Adagrad optimizer by Duchi et al. (2011)
+    !!
+    !! Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient
+    !! methods for online learning and stochastic optimization. Journal
+    !! of Machine Learning Research, 12(Jul), pp.2121-2159.
+    !! http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
+    real :: epsilon = 1e-8
+    real :: weight_decay_l2 = 0
+    real :: learning_rate_decay = 0
+    real, allocatable, private :: sum_squared_gradient(:)
+    integer, private :: t = 0
+  contains
+    procedure :: init => init_adagrad
+    procedure :: minimize => minimize_adagrad
+  end type adagrad
+
 contains
 
   impure elemental subroutine init_sgd(self, num_params)
@@ -186,11 +203,49 @@ pure subroutine minimize_adam(self, param, gradient)
 
       ! Update parameters.
       param = param &
-        - self % learning_rate * m_hat / (sqrt(v_hat) + self % epsilon) &
-        - self % weight_decay_decoupled * param
+        - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) &
+        + self % weight_decay_decoupled * param)
 
     end associate
 
   end subroutine minimize_adam
 
+
+  impure elemental subroutine init_adagrad(self, num_params)
+    class(adagrad), intent(inout) :: self
+    integer, intent(in) :: num_params
+    if (.not. allocated(self % sum_squared_gradient)) then
+      allocate(self % sum_squared_gradient(num_params))
+      self % sum_squared_gradient = 0
+    end if
+  end subroutine init_adagrad
+
+
+  pure subroutine minimize_adagrad(self, param, gradient)
+    !! Concrete implementation of an Adagrad optimizer update rule.
+    class(adagrad), intent(inout) :: self
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)
+
+    ! Update the current time step
+    self % t = self % t + 1
+
+    associate( &
+      ! If weight_decay_l2 > 0, use L2 regularization;
+      ! otherwise, default to regular Adagrad.
+      g => gradient + self % weight_decay_l2 * param, &
+      ! Amortize the learning rate as function of the current time step.
+      learning_rate => self % learning_rate &
+        / (1 + (self % t - 1) * self % learning_rate_decay) &
+    )
+
+    self % sum_squared_gradient = self % sum_squared_gradient + g**2
+
+    param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+      + self % epsilon)
+
+    end associate
+
+  end subroutine minimize_adagrad
+
 end module nf_optimizers
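Spelled out, the update implemented by minimize_adagrad above is, writing \theta for the parameters, \nabla L for the gradient, \eta for the base learning rate, \lambda for weight_decay_l2, d for learning_rate_decay, and t for the internal step counter:

    g_t      = \nabla L + \lambda\,\theta_{t-1}
    \eta_t   = \frac{\eta}{1 + (t - 1)\, d}
    G_t      = G_{t-1} + g_t^{2} \qquad (G_0 = 0,\ \text{elementwise})
    \theta_t = \theta_{t-1} - \eta_t\,\frac{g_t}{\sqrt{G_t} + \epsilon}

The change to minimize_adam in the same file scales the decoupled weight-decay term by the learning rate, so the AdamW step becomes \theta_t = \theta_{t-1} - \eta\,(\hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon) + \lambda_w\,\theta_{t-1}) rather than subtracting \lambda_w\,\theta_{t-1} without the \eta factor.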

test/test_optimizers.f90

+22 -2

@@ -1,10 +1,10 @@
 program test_optimizers
 
-  use nf, only: dense, input, network, rmsprop, sgd, adam
+  use nf, only: dense, input, network, rmsprop, sgd, adam, adagrad
   use iso_fortran_env, only: stderr => error_unit
 
   implicit none
-  type(network) :: net(5)
+  type(network) :: net(6)
   real, allocatable :: x(:), y(:)
   real, allocatable :: ypred(:)
   integer, parameter :: num_iterations = 1000
@@ -116,6 +116,26 @@ program test_optimizers
     ok = .false.
   end if
 
+  ! Test Adagrad optimizer
+  converged = .false.
+
+  do n = 0, num_iterations
+
+    call net(6) % forward(x)
+    call net(6) % backward(y)
+    call net(6) % update(optimizer=adagrad(learning_rate=0.01, weight_decay_l2=1e-4, learning_rate_decay=0.99))
+
+    ypred = net(6) % predict(x)
+    converged = check_convergence(y, ypred)
+    if (converged) exit
+
+  end do
+
+  if (.not. converged) then
+    write(stderr, '(a)') 'adagrad should converge in simple training.. failed'
+    ok = .false.
+  end if
+
 
   if (ok) then
     print '(a)', 'test_optimizers: All tests passed.'
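The loop above relies on a check_convergence helper that already exists in test_optimizers.f90 and is not part of this diff. Purely as an illustration of what such a check does (the actual helper may use a different criterion and tolerance), a tolerance-based version could look like:

  logical function check_convergence(y, ypred) result(converged)
    ! Hypothetical sketch only: declare convergence once the largest absolute
    ! error between targets and predictions drops below a fixed tolerance.
    ! The tolerance value is an assumption, not taken from the repository.
    real, intent(in) :: y(:), ypred(:)
    real, parameter :: tolerance = 1e-3
    converged = maxval(abs(y - ypred)) < tolerance
  end function check_convergence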
