Commit b119194
Adagrad Optimizer Implementation (#154)
* Adagrad Implementation
* Resolved comments
* Added test for adagrad
* Comment
* Fix L2 penalty and learning rate decay
* Add Adagrad to the list in README
* Bump minor version

Co-authored-by: milancurcic <[email protected]>
1 parent 6adc1c2 commit b119194

File tree: 6 files changed (+158 -10 lines)


README.md

+1 -1

@@ -18,7 +18,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 * Training and inference of dense (fully connected) and convolutional neural
   networks
 * Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
-  RMSProp, Adam, AdamW
+  RMSProp, Adagrad, Adam, AdamW
 * More than a dozen activation functions and their derivatives
 * Loading dense and convolutional models from Keras HDF5 (.h5) files
 * Data-based parallelism
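For orientation, here is a minimal sketch of how the new optimizer can be driven, following the forward/backward/update pattern and the adagrad constructor keywords that appear in the test and example diffs below. The network shape, training data, iteration count, and the network([...]) construction are illustrative assumptions, not part of this commit.

program adagrad_usage_sketch
  ! Hypothetical example: fit a single input-output pair with the new
  ! Adagrad optimizer, mirroring the update pattern in test/test_optimizers.f90.
  use nf, only: dense, input, network, adagrad
  implicit none
  type(network) :: net
  real :: x(1), y(1)
  integer :: n

  net = network([input(1), dense(3), dense(1)])
  x = [0.5]
  y = [0.25]

  do n = 1, 1000
    call net % forward(x)
    call net % backward(y)
    ! Constructor keywords match the components added in nf_optimizers.f90.
    call net % update(optimizer=adagrad(learning_rate=0.01, &
      weight_decay_l2=1e-4, learning_rate_decay=0.99))
  end do

  print *, 'Prediction after training:', net % predict(x)

end program adagrad_usage_sketch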

example/quadratic.f90

+75 -2

@@ -4,10 +4,10 @@ program quadratic_fit
   ! descent.
   use nf, only: dense, input, network
   use nf_dense_layer, only: dense_layer
-  use nf_optimizers, only: sgd, rmsprop, adam
+  use nf_optimizers, only: sgd, rmsprop, adam, adagrad
 
   implicit none
-  type(network) :: net(9)
+  type(network) :: net(11)
 
   ! Training parameters
   integer, parameter :: num_epochs = 1000
@@ -95,6 +95,17 @@ program quadratic_fit
     beta1, beta2, epsilon, weight_decay_decoupled=1e-5 &
   )
 
+  ! Adagrad optimizer
+  call adagrad_optimizer( &
+    net(10), x, y, xtest, ytest, learning_rate, num_epochs, epsilon &
+  )
+
+  ! Adagrad optimizer with L2 regularization and learning rate decay
+  call adagrad_optimizer( &
+    net(11), x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
+    weight_decay_l2=1e-4, learning_rate_decay=0.99 &
+  )
+
 contains
 
   real elemental function quadratic(x) result(y)
@@ -358,6 +369,68 @@ subroutine adam_optimizer( &
 
   end subroutine adam_optimizer
 
+  subroutine adagrad_optimizer( &
+    net, x, y, xtest, ytest, learning_rate, num_epochs, epsilon, &
+    weight_decay_l2, learning_rate_decay &
+  )
+    ! Adagrad optimizer for updating weights using adaptive gradient algorithm
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: xtest(:), ytest(:)
+    real, intent(in) :: learning_rate, epsilon
+    real, intent(in), optional :: weight_decay_l2
+    real, intent(in), optional :: learning_rate_decay
+    integer, intent(in) :: num_epochs
+    integer :: i, n
+    real, allocatable :: ypred(:)
+    real :: weight_decay_l2_val
+    real :: learning_rate_decay_val
+
+    ! Set default values for weight_decay_l2
+    if (.not. present(weight_decay_l2)) then
+      weight_decay_l2_val = 0.0
+    else
+      weight_decay_l2_val = weight_decay_l2
+    end if
+
+    ! Set default values for learning_rate_decay
+    if (.not. present(learning_rate_decay)) then
+      learning_rate_decay_val = 0.0
+    else
+      learning_rate_decay_val = learning_rate_decay
+    end if
+
+    print '(a)', 'Adagrad optimizer'
+    print '(34("-"))'
+
+    do n = 1, num_epochs
+
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
+
+      call net % update( &
+        adagrad( &
+          learning_rate=learning_rate, &
+          epsilon=epsilon, &
+          weight_decay_l2=weight_decay_l2_val, &
+          learning_rate_decay=learning_rate_decay_val &
+        ) &
+      )
+
+      if (mod(n, num_epochs / 10) == 0) then
+        ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
+        print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', &
+          n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
+      end if
+
+    end do
+
+    print *, ''
+
+  end subroutine adagrad_optimizer
+
   subroutine shuffle(arr)
     ! Shuffle an array using the Fisher-Yates algorithm.
     integer, intent(inout) :: arr(:)
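In the two calls added above, net(10) uses plain Adagrad, while net(11) also passes weight_decay_l2=1e-4 and learning_rate_decay=0.99. Given the decay schedule implemented in src/nf/nf_optimizers.f90 further down, the effective learning rate at the optimizer's internal step t is

    \eta_t = \frac{\eta}{1 + (t - 1)\, d}

so with d = 0.99 the step size equals \eta at t = 1, about \eta/2 at t = 2, and exactly \eta/100 at t = 101.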

fpm.toml

+1 -1

@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.14.0"
+version = "0.15.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "[email protected]"

src/nf.f90

+1 -1

@@ -5,7 +5,7 @@ module nf
   use nf_layer_constructors, only: &
     conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_network, only: network
-  use nf_optimizers, only: sgd, rmsprop, adam
+  use nf_optimizers, only: sgd, rmsprop, adam, adagrad
   use nf_activation, only: activation_function, elu, exponential, &
     gaussian, linear, relu, leaky_relu, &
     sigmoid, softmax, softplus, step, tanhf, &

src/nf/nf_optimizers.f90

+58 -3

@@ -13,7 +13,7 @@ module nf_optimizers
   implicit none
 
   private
-  public :: optimizer_base_type, sgd, rmsprop, adam
+  public :: optimizer_base_type, sgd, rmsprop, adam, adagrad
 
   type, abstract :: optimizer_base_type
     real :: learning_rate = 0.01
@@ -87,6 +87,23 @@ end subroutine minimize
     procedure :: minimize => minimize_adam
   end type adam
 
+  type, extends(optimizer_base_type) :: adagrad
+    !! Adagrad optimizer by Duchi et al. (2011)
+    !!
+    !! Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient
+    !! methods for online learning and stochastic optimization. Journal
+    !! of Machine Learning Research, 12(Jul), pp.2121-2159.
+    !! http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
+    real :: epsilon = 1e-8
+    real :: weight_decay_l2 = 0
+    real :: learning_rate_decay = 0
+    real, allocatable, private :: sum_squared_gradient(:)
+    integer, private :: t = 0
+  contains
+    procedure :: init => init_adagrad
+    procedure :: minimize => minimize_adagrad
+  end type adagrad
+
 contains
 
   impure elemental subroutine init_sgd(self, num_params)
@@ -186,11 +203,49 @@ pure subroutine minimize_adam(self, param, gradient)
 
       ! Update parameters.
       param = param &
-        - self % learning_rate * m_hat / (sqrt(v_hat) + self % epsilon) &
-        - self % weight_decay_decoupled * param
+        - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) &
+        + self % weight_decay_decoupled * param)
 
     end associate
 
   end subroutine minimize_adam
 
+
+  impure elemental subroutine init_adagrad(self, num_params)
+    class(adagrad), intent(inout) :: self
+    integer, intent(in) :: num_params
+    if (.not. allocated(self % sum_squared_gradient)) then
+      allocate(self % sum_squared_gradient(num_params))
+      self % sum_squared_gradient = 0
+    end if
+  end subroutine init_adagrad
+
+
+  pure subroutine minimize_adagrad(self, param, gradient)
+    !! Concrete implementation of an Adagrad optimizer update rule.
+    class(adagrad), intent(inout) :: self
+    real, intent(inout) :: param(:)
+    real, intent(in) :: gradient(:)
+
+    ! Update the current time step
+    self % t = self % t + 1
+
+    associate( &
+      ! If weight_decay_l2 > 0, use L2 regularization;
+      ! otherwise, default to regular Adagrad.
+      g => gradient + self % weight_decay_l2 * param, &
+      ! Amortize the learning rate as function of the current time step.
+      learning_rate => self % learning_rate &
+        / (1 + (self % t - 1) * self % learning_rate_decay) &
+    )
+
+    self % sum_squared_gradient = self % sum_squared_gradient + g**2
+
+    param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) &
+      + self % epsilon)
+
+    end associate
+
+  end subroutine minimize_adagrad
+
 end module nf_optimizers
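Spelled out, the update implemented by minimize_adagrad above is, writing \theta for the parameters, \nabla L for the gradient, \eta for the base learning rate, \lambda for weight_decay_l2, d for learning_rate_decay, and t for the internal step counter:

    g_t      = \nabla L + \lambda\,\theta_{t-1}
    \eta_t   = \frac{\eta}{1 + (t - 1)\, d}
    G_t      = G_{t-1} + g_t^{2} \qquad (G_0 = 0,\ \text{elementwise})
    \theta_t = \theta_{t-1} - \eta_t\,\frac{g_t}{\sqrt{G_t} + \epsilon}

The change to minimize_adam in the same file scales the decoupled weight-decay term by the learning rate, so the AdamW step becomes \theta_t = \theta_{t-1} - \eta\,(\hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon) + \lambda_w\,\theta_{t-1}) rather than subtracting \lambda_w\,\theta_{t-1} without the \eta factor.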

test/test_optimizers.f90

+22 -2

@@ -1,10 +1,10 @@
 program test_optimizers
 
-  use nf, only: dense, input, network, rmsprop, sgd, adam
+  use nf, only: dense, input, network, rmsprop, sgd, adam, adagrad
   use iso_fortran_env, only: stderr => error_unit
 
   implicit none
-  type(network) :: net(5)
+  type(network) :: net(6)
   real, allocatable :: x(:), y(:)
   real, allocatable :: ypred(:)
   integer, parameter :: num_iterations = 1000
@@ -116,6 +116,26 @@ program test_optimizers
     ok = .false.
   end if
 
+  ! Test Adagrad optimizer
+  converged = .false.
+
+  do n = 0, num_iterations
+
+    call net(6) % forward(x)
+    call net(6) % backward(y)
+    call net(6) % update(optimizer=adagrad(learning_rate=0.01, weight_decay_l2=1e-4, learning_rate_decay=0.99))
+
+    ypred = net(6) % predict(x)
+    converged = check_convergence(y, ypred)
+    if (converged) exit
+
+  end do
+
+  if (.not. converged) then
+    write(stderr, '(a)') 'adagrad should converge in simple training.. failed'
+    ok = .false.
+  end if
+
 
   if (ok) then
     print '(a)', 'test_optimizers: All tests passed.'
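The loop above relies on a check_convergence helper that already exists in test_optimizers.f90 and is not part of this diff. Purely as an illustration of what such a check does (the actual helper may use a different criterion and tolerance), a tolerance-based version could look like:

  logical function check_convergence(y, ypred) result(converged)
    ! Hypothetical sketch only: declare convergence once the largest absolute
    ! error between targets and predictions drops below a fixed tolerance.
    ! The tolerance value is an assumption, not taken from the repository.
    real, intent(in) :: y(:), ypred(:)
    real, parameter :: tolerance = 1e-3
    converged = maxval(abs(y - ypred)) < tolerance
  end function check_convergence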
