
Commit e628d1e

Embedding Layer (#205)
* embedding_layer: initial forward implementation
* embedding_layer: implementation of embedding layer
* embedding_layer: remove gradient attribute
* embedding_layer: guard against zeros
* embedding_layer: plumbing
* embedding_layer: positional encoding
* embedding_layer: update tests
* embedding_layer: add more comments
* embedding_layer: update cmake
* embedding_layer: pr fixes
* embedding_layer: add absolute positional encoding
* embedding_layer: update constructor and tests
* embedding_layer: make integer input generics
* embedding_layer: update readme

---------

Co-authored-by: milancurcic <[email protected]>
1 parent e68e6c2 commit e628d1e

12 files changed: +496 −8 lines changed

CMakeLists.txt

+2
@@ -43,6 +43,8 @@ add_library(neural-fortran
   src/nf/nf_layer_submodule.f90
   src/nf/nf_linear2d_layer.f90
   src/nf/nf_linear2d_layer_submodule.f90
+  src/nf/nf_embedding_layer.f90
+  src/nf/nf_embedding_layer_submodule.f90
   src/nf/nf_loss.f90
   src/nf/nf_loss_submodule.f90
   src/nf/nf_maxpool2d_layer.f90

README.md

+1
@@ -30,6 +30,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
 |------------|------------------|------------------------|----------------------|--------------|---------------|
 | Input | `input` | n/a | 1, 2, 3 | n/a | n/a |
+| Embedding | `embedding` | n/a | 2 | ✅ | ✅ |
 | Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ |
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
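Not part of the diff, but for orientation: a minimal sketch that drives the new `embedding_layer` type directly (the way the accompanying tests do) and shows the rank-2 output promised by the table row above, one `model_dimension`-long vector per token. The sizes are arbitrary, and `positional=1` assumes the trigonometric code defined in the submodule further down.

```fortran
program embedding_forward_sketch
  ! Illustrative sketch only; sizes are arbitrary.
  use nf_embedding_layer, only: embedding_layer
  implicit none
  type(embedding_layer) :: emb
  integer :: tokens(4) = [2, 5, 1, 5]   ! token indices into a 6-word vocabulary

  emb = embedding_layer(vocab_size=6, model_dimension=8, positional=1)
  call emb % init([size(tokens)])       ! input_shape(1) becomes sequence_length
  call emb % forward(tokens)            ! looks up one weight row per token

  print *, shape(emb % output)          ! 4 8 -> (sequence_length, model_dimension)
end program embedding_forward_sketch
```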

src/nf.f90

+3-2
@@ -6,13 +6,14 @@ module nf
     conv2d, &
     dense, &
     dropout, &
+    embedding, &
     flatten, &
     input, &
+    layernorm, &
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention, &
-    layernorm
+    self_attention
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network

src/nf/nf_embedding_layer.f90

+98
@@ -0,0 +1,98 @@
module nf_embedding_layer

  use nf_activation, only: activation_function
  use nf_base_layer, only: base_layer

  implicit none

  private
  public :: embedding_layer

  type, extends(base_layer) :: embedding_layer
    !! Embedding layer.
    !! Stores inputs as a trainable lookup table. Inputs are
    !! integer indices in a dictionary of `vocab_size`.
    !! This layer converts them into a table of shape
    !! (`sequence_length`, `model_dimension`).
    integer :: sequence_length, vocab_size, model_dimension
    integer :: positional

    real, allocatable :: weights(:, :)
    real, allocatable :: output(:, :)
    real, allocatable :: dw(:, :) ! weight gradients

  contains

    procedure :: backward
    procedure :: forward
    procedure :: positional_trigonometric
    procedure :: positional_absolute
    procedure :: init
    procedure :: get_num_params
    procedure :: get_params
    procedure :: get_gradients
    procedure :: set_params

  end type embedding_layer

  interface embedding_layer
    module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
      integer, intent(in) :: vocab_size, model_dimension
      integer, optional :: positional
      type(embedding_layer) :: res
    end function embedding_layer_cons
  end interface embedding_layer

  interface
    pure module subroutine forward(self, input)
      !! Get vectors by indices in the dictionary
      class(embedding_layer), intent(in out) :: self
      integer, intent(in) :: input(:)
    end subroutine forward

    pure module subroutine backward(self, input, gradient)
      !! Update gradients at `input` indices:
      !! dw_i = dw_i + d_output_i
      class(embedding_layer), intent(in out) :: self
      integer, intent(in) :: input(:)
      real, intent(in) :: gradient(:, :)
    end subroutine backward

    pure module subroutine positional_trigonometric(self, pos)
      !! Sum embedding with positional info (trigonometric, not trainable)
      class(embedding_layer), intent(in out) :: self
      integer, intent(in) :: pos
    end subroutine positional_trigonometric

    pure module subroutine positional_absolute(self, pos)
      !! Sum embedding with absolute position
      class(embedding_layer), intent(in out) :: self
      integer, intent(in) :: pos
    end subroutine positional_absolute

    module subroutine init(self, input_shape)
      class(embedding_layer), intent(in out) :: self
      integer, intent(in) :: input_shape(:)
    end subroutine init

    pure module function get_num_params(self) result(num_params)
      class(embedding_layer), intent(in) :: self
      integer :: num_params
    end function get_num_params

    module function get_params(self) result(params)
      class(embedding_layer), intent(in), target :: self
      real, allocatable :: params(:)
    end function get_params

    module function get_gradients(self) result(gradients)
      class(embedding_layer), intent(in), target :: self
      real, allocatable :: gradients(:)
    end function get_gradients

    module subroutine set_params(self, params)
      class(embedding_layer), intent(in out) :: self
      real, intent(in), target :: params(:)
    end subroutine set_params
  end interface
end module nf_embedding_layer
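The `get_num_params`/`get_params`/`set_params` trio above follows the library's flat-parameter convention: the (`vocab_size`, `model_dimension`) weight table is exposed as a single rank-1 array. A hedged sketch of that round trip, with made-up sizes:

```fortran
program embedding_params_sketch
  ! Illustrative sketch of the flat-parameter contract; sizes are made up.
  use nf_embedding_layer, only: embedding_layer
  implicit none
  type(embedding_layer) :: emb
  real, allocatable :: flat(:)

  emb = embedding_layer(vocab_size=4, model_dimension=3)
  call emb % init([2])                   ! sequence_length = 2

  print *, emb % get_num_params()        ! 12 = vocab_size * model_dimension
  flat = emb % get_params()              ! weights flattened to a rank-1 array
  flat = 0.5 * flat                      ! e.g. an optimizer update would go here
  call emb % set_params(flat)            ! reshaped back to (vocab_size, model_dimension)
end program embedding_params_sketch
```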
src/nf/nf_embedding_layer_submodule.f90

+137
@@ -0,0 +1,137 @@
#define NONE 0
#define TRIGONOMETRIC 1
#define ABSOLUTE 2

submodule(nf_embedding_layer) nf_embedding_layer_submodule
  use nf_base_layer, only: base_layer
  implicit none
contains
  module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
    integer, intent(in) :: vocab_size, model_dimension
    integer, optional :: positional
    type(embedding_layer) :: res

    res % vocab_size = vocab_size
    res % model_dimension = model_dimension
    if (.not. present(positional)) then
      res % positional = NONE
    else
      res % positional = positional
    end if
  end function embedding_layer_cons

  module subroutine init(self, input_shape)
    class(embedding_layer), intent(in out) :: self
    integer, intent(in) :: input_shape(:)

    self % sequence_length = input_shape(1)

    allocate(self % output(self % sequence_length, self % model_dimension))

    allocate(self % weights(self % vocab_size, self % model_dimension))
    self % weights = 0.1

    allocate(self % dw(self % vocab_size, self % model_dimension))
    self % dw = 0.0
  end subroutine init

  pure module subroutine forward(self, input)
    class(embedding_layer), intent(in out) :: self
    integer, intent(in) :: input(:)
    integer :: i, index

    do concurrent(i = 1: self % sequence_length)
      index = input(i)
      ! guard against out-of-range and zero indices by mapping them to 1
      if (index > size(self % weights, 1)) then
        index = 1
      elseif (index == 0) then
        index = 1
      end if

      self % output(i, :) = self % weights(index, :)

      if (self % positional == TRIGONOMETRIC) then
        call self % positional_trigonometric(i)
      elseif (self % positional == ABSOLUTE) then
        call self % positional_absolute(i)
      end if
    end do
  end subroutine forward

  pure module subroutine backward(self, input, gradient)
    class(embedding_layer), intent(in out) :: self
    integer, intent(in) :: input(:)
    real, intent(in) :: gradient(:, :)
    integer :: i

    do concurrent(i = 1: self % sequence_length)
      self % dw(input(i), :) = self % dw(input(i), :) + gradient(i, :)
    end do
  end subroutine backward

  pure module subroutine positional_trigonometric(self, pos)
    class(embedding_layer), intent(in out) :: self
    integer, intent(in) :: pos
    integer :: i
    real :: theta

    do concurrent(i = 1: floor(real(self % model_dimension) / 2))
      theta = (pos - 1) / 10000 ** (real(2 * (i-1)) / self % model_dimension)
      self % output(pos, 2 * i - 1) = self % output(pos, 2 * i - 1) + sin(theta)
      self % output(pos, 2 * i) = self % output(pos, 2 * i) + cos(theta)
    end do
  end subroutine positional_trigonometric

  pure module subroutine positional_absolute(self, pos)
    class(embedding_layer), intent(in out) :: self
    integer, intent(in) :: pos
    integer :: i

    do concurrent(i = 1: self % model_dimension)
      self % output(pos, i) = self % output(pos, i) + pos - 1
    end do
  end subroutine positional_absolute

  pure module function get_num_params(self) result(num_params)
    class(embedding_layer), intent(in) :: self
    integer :: num_params
    num_params = self % vocab_size * self % model_dimension
  end function get_num_params

  module function get_params(self) result(params)
    class(embedding_layer), intent(in), target :: self
    real, allocatable :: params(:)
    real, pointer :: w_(:) => null()

    w_(1: product(shape(self % weights))) => self % weights
    params = w_
  end function get_params

  module function get_gradients(self) result(gradients)
    class(embedding_layer), intent(in), target :: self
    real, allocatable :: gradients(:)
    real, pointer :: dw_(:) => null()

    dw_(1: product(shape(self % dw))) => self % dw
    gradients = dw_
  end function get_gradients

  module subroutine set_params(self, params)
    class(embedding_layer), intent(in out) :: self
    real, intent(in), target :: params(:)

    real, pointer :: p_(:,:) => null()

    ! check if the number of parameters is correct
    if (size(params) /= self % get_num_params()) then
      error stop 'Error: number of parameters does not match'
    end if

    associate(n => self % vocab_size * self % model_dimension)
      ! reshape the weights
      p_(1:self % vocab_size, 1:self % model_dimension) => params(1 : n)
      self % weights = p_
    end associate

  end subroutine set_params
end submodule nf_embedding_layer_submodule
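For reference, the trigonometric branch above is the familiar sinusoidal positional encoding; with the code's 1-based position `pos` and channel index `i`, each forward pass adds

$$
\theta_{pos,i} = \frac{pos - 1}{10000^{\,2(i-1)/d_\text{model}}}, \qquad
\mathrm{output}(pos,\,2i-1) \mathrel{{+}{=}} \sin\theta_{pos,i}, \qquad
\mathrm{output}(pos,\,2i) \mathrel{{+}{=}} \cos\theta_{pos,i},
$$

for $i = 1, \dots, \lfloor d_\text{model}/2 \rfloor$, while the absolute branch simply adds $pos - 1$ to every channel of the embedding at position $pos$.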

src/nf/nf_layer_constructors.f90

+18
@@ -18,6 +18,7 @@ module nf_layer_constructors
     maxpool2d, &
     reshape, &
     self_attention, &
+    embedding, &
     layernorm

   interface input

@@ -233,6 +234,23 @@ module function self_attention(num_heads) result(res)
       !! Resulting layer instance
   end function self_attention

+  module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
+    !! Embedding layer constructor.
+    !!
+    !! This layer is for inputting token indices from the dictionary to the network.
+    !! It works as a trainable lookup table that converts each index into a vector.
+    !! The embedding layer must be the first layer in a network.
+    integer, intent(in) :: sequence_length
+      !! Maximum length of the input sequence
+    integer, intent(in) :: vocab_size
+      !! Length of the token vocabulary
+    integer, intent(in) :: model_dimension
+      !! Size of the target embeddings
+    integer, optional, intent(in) :: positional
+      !! Positional encoding
+    type(layer) :: res
+  end function embedding
+
   module function layernorm() result(res)
     !! Layer Normalization
     !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
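To make the argument list above concrete, here is a short sketch of the three constructor forms. It assumes the `layer` container type is available from `nf_layer`, as for the other constructors, and uses the integer codes 1 (trigonometric) and 2 (absolute) from the submodule's macros; the sizes are arbitrary.

```fortran
program embedding_constructor_sketch
  ! Illustrative sketch; sequence/vocabulary/embedding sizes are arbitrary.
  use nf, only: embedding
  use nf_layer, only: layer
  implicit none
  type(layer) :: plain, trig, absolute

  plain    = embedding(sequence_length=128, vocab_size=10000, model_dimension=512)
  trig     = embedding(128, 10000, 512, positional=1)   ! add sinusoidal encoding in forward
  absolute = embedding(128, 10000, 512, positional=2)   ! add absolute positions in forward

  print '(a)', plain % name              ! embedding
  print '(2i6)', plain % layer_shape     ! 128 512
end program embedding_constructor_sketch
```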

src/nf/nf_layer_constructors_submodule.f90

+20-1
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_embedding_layer, only: embedding_layer
   use nf_layernorm_layer, only: layernorm_layer
   use nf_activation, only: activation_function, relu, sigmoid

@@ -172,6 +173,7 @@ module function linear2d(out_features) result(res)

   end function linear2d

+
   module function self_attention(num_heads) result(res)
     integer, intent(in) :: num_heads
     type(layer) :: res

@@ -180,9 +182,26 @@ module function self_attention(num_heads) result(res)
     allocate(res % p, source=self_attention_layer(num_heads))
   end function self_attention

-  module function layernorm() result(res)
+
+  module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
+    integer, intent(in) :: sequence_length, vocab_size, model_dimension
+    integer, optional, intent(in) :: positional
     type(layer) :: res
+    type(embedding_layer) :: embedding_layer_instance
+
+    embedding_layer_instance = embedding_layer(vocab_size, model_dimension, positional)
+    call embedding_layer_instance % init([sequence_length])
+    res % name = 'embedding'
+    res % layer_shape = [sequence_length, model_dimension]
+    res % input_layer_shape = [integer ::]
+    allocate(res % p, source=embedding_layer_instance)
+    res % initialized = .true.
+
+  end function embedding
+

+  module function layernorm() result(res)
+    type(layer) :: res
     res % name = 'layernorm'
     allocate(res % p, source=layernorm_layer())
   end function layernorm
