Commit 6dfaed0

Authored by jvdp1 (Vandenplas, Jeremie) and milancurcic

Replacement of a matmul + use of merge (#181)

* dense_layer: replace a matmul(reshape) by a do concurrent
* nf_activation: replace some where statements by merge intrinsic
* Set correct size for self%gradient in dense_layer
* remove some unneeded pack()
* Remove notes on -fno-frontend-optimize (no longer necessary)
* Bump patch version

Co-authored-by: Vandenplas, Jeremie <[email protected]>
Co-authored-by: milancurcic <[email protected]>

1 parent c3924b5 · commit 6dfaed0

5 files changed: +23 -45 lines changed

README.md (+5 -7)

````diff
@@ -80,23 +80,21 @@ With gfortran, the following will create an optimized build of neural-fortran:
 ```
 fpm build \
   --profile release \
-  --flag "-fno-frontend-optimize -I$HDF5INC -L$HDF5LIB"
+  --flag "-I$HDF5INC -L$HDF5LIB"
 ```

 HDF5 is now a required dependency, so you have to provide it to fpm.
 The above command assumes that the `HDF5INC` and `HDF5LIB` environment
 variables are set to the include and library paths, respectively, of your
 HDF5 install.
-The `-fno-frontend-optimize` disables some optimizations that may be harmful
-when building neural-fortran.

 If you use Conda, the following instructions work:

 ```
 conda create -n nf hdf5
 conda activate nf
-fpm build --profile release --flag "-fno-frontend-optimize -I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
-fpm test --profile release --flag "-fno-frontend-optimize -I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
+fpm build --profile release --flag "-I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
+fpm test --profile release --flag "-I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
 ```

 #### Building in parallel mode
@@ -110,15 +108,15 @@ in parallel, respectively:
 fpm build \
   --compiler caf \
   --profile release \
-  --flag "-fno-frontend-optimize -I$HDF5INC -L$HDF5LIB"
+  --flag "-I$HDF5INC -L$HDF5LIB"
 ```

 #### Testing with fpm

 ```
 fpm test \
   --profile release \
-  --flag "-fno-frontend-optimize -I$HDF5INC -L$HDF5LIB"
+  --flag "-I$HDF5INC -L$HDF5LIB"
 ```

 For the time being, you need to specify the same compiler flags to `fpm test`
````

fpm.toml (+1 -1)

```diff
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.16.0"
+version = "0.16.1"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "[email protected]"
```

src/nf/nf_activation.f90 (+6 -30)

```diff
@@ -295,11 +295,7 @@ pure function eval_1d_relu_prime(self, x) result(res)
   class(relu), intent(in) :: self
   real, intent(in) :: x(:)
   real :: res(size(x))
-  where (x > 0)
-    res = 1
-  elsewhere
-    res = 0
-  end where
+  res = merge(1., 0., x > 0)
 end function eval_1d_relu_prime

 pure function eval_3d_relu(self, x) result(res)
@@ -315,11 +311,7 @@ pure function eval_3d_relu_prime(self, x) result(res)
   class(relu), intent(in) :: self
   real, intent(in) :: x(:,:,:)
   real :: res(size(x,1),size(x,2),size(x,3))
-  where (x > 0)
-    res = 1
-  elsewhere
-    res = 0
-  end where
+  res = merge(1., 0., x > 0)
 end function eval_3d_relu_prime

 pure function eval_1d_leaky_relu(self, x) result(res)
@@ -335,11 +327,7 @@ pure function eval_1d_leaky_relu_prime(self, x) result(res)
   class(leaky_relu), intent(in) :: self
   real, intent(in) :: x(:)
   real :: res(size(x))
-  where (x > 0)
-    res = 1
-  elsewhere
-    res = self % alpha
-  end where
+  res = merge(1., self%alpha, x > 0)
 end function eval_1d_leaky_relu_prime

 pure function eval_3d_leaky_relu(self, x) result(res)
@@ -355,11 +343,7 @@ pure function eval_3d_leaky_relu_prime(self, x) result(res)
   class(leaky_relu), intent(in) :: self
   real, intent(in) :: x(:,:,:)
   real :: res(size(x,1),size(x,2),size(x,3))
-  where (x > 0)
-    res = 1
-  elsewhere
-    res = self % alpha
-  end where
+  res = merge(1., self%alpha, x > 0)
 end function eval_3d_leaky_relu_prime

 pure function eval_1d_sigmoid(self, x) result(res)
@@ -465,11 +449,7 @@ pure function eval_1d_step(self, x) result(res)
   class(step), intent(in) :: self
   real, intent(in) :: x(:)
   real :: res(size(x))
-  where (x > 0)
-    res = 1
-  elsewhere
-    res = 0
-  end where
+  res = merge(1., 0., x > 0)
 end function eval_1d_step

 pure function eval_1d_step_prime(self, x) result(res)
@@ -485,11 +465,7 @@ pure function eval_3d_step(self, x) result(res)
   class(step), intent(in) :: self
   real, intent(in) :: x(:,:,:)
   real :: res(size(x,1),size(x,2),size(x,3))
-  where (x > 0)
-    res = 1
-  elsewhere
-    res = 0
-  end where
+  res = merge(1., 0., x > 0)
 end function eval_3d_step

 pure function eval_3d_step_prime(self, x) result(res)
```
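The `merge` intrinsic is elemental, so the single assignment `res = merge(1., 0., x > 0)` evaluates to the same array as the replaced `where`/`elsewhere` block. A minimal standalone sketch of the equivalence (the test values are illustrative, not part of the commit):

```fortran
program demo_merge
  implicit none
  real :: x(5) = [-2., -1., 0., 1., 2.]
  real :: res_where(5), res_merge(5)

  ! Old formulation: masked assignment with where/elsewhere
  where (x > 0)
    res_where = 1
  elsewhere
    res_where = 0
  end where

  ! New formulation: one elemental merge expression
  res_merge = merge(1., 0., x > 0)

  print *, all(res_where == res_merge)   ! prints T
end program demo_merge
```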

src/nf/nf_conv2d_layer_submodule.f90 (+2 -2)

```diff
@@ -195,7 +195,7 @@ pure module function get_params(self) result(params)

   params = [ &
     pack(self % kernel, .true.), &
-    pack(self % biases, .true.) &
+    self % biases &
   ]

 end function get_params
@@ -207,7 +207,7 @@ pure module function get_gradients(self) result(gradients)

   gradients = [ &
     pack(self % dw, .true.), &
-    pack(self % db, .true.) &
+    self % db &
   ]

 end function get_gradients
```
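The dropped calls are the "unneeded pack()" mentioned in the commit message: `pack(a, .true.)` applied to a rank-1 array simply returns a copy of it, so the one-dimensional `biases` and `db` can be concatenated directly, while `pack` is kept to flatten the multi-dimensional `kernel` and `dw`. A small sketch of the equivalence, with made-up shapes rather than the layer's real ones:

```fortran
program demo_pack
  implicit none
  real :: biases(3) = [0.1, 0.2, 0.3]
  real :: kernel(2,2) = reshape([1., 2., 3., 4.], [2, 2])
  real, allocatable :: params(:)

  ! pack(biases, .true.) would return biases unchanged (rank-1 input),
  ! so the bias vector is concatenated as-is after the flattened kernel.
  params = [pack(kernel, .true.), biases]

  print *, size(params)   ! prints 7 (4 kernel elements + 3 biases)
end program demo_pack
```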

src/nf/nf_dense_layer_submodule.f90 (+9 -5)

```diff
@@ -27,11 +27,15 @@ pure module subroutine backward(self, input, gradient)
   real, intent(in) :: gradient(:)
   real :: db(self % output_size)
   real :: dw(self % input_size, self % output_size)
+  integer :: i

   db = gradient * self % activation % eval_prime(self % z)
-  dw = matmul(reshape(input, [size(input), 1]), reshape(db, [1, size(db)]))
+  ! dw = matmul(reshape(input, [size(input), 1]), reshape(db, [1, size(db)]))
+  do concurrent (i = 1:size(db))
+    self % dw(:,i) = self % dw(:,i) + input(:) * db(i)
+  enddo
   self % gradient = matmul(self % weights, db)
-  self % dw = self % dw + dw
+  ! self % dw = self % dw + dw
   self % db = self % db + db

 end subroutine backward
```
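The `do concurrent` loop accumulates the same outer product of `input` and `db` that the commented-out `matmul(reshape(...), reshape(...))` computed, but it writes directly into `self % dw` and avoids the temporary `dw` matrix. A standalone sketch of the equivalence (the sizes below are illustrative, not taken from the library):

```fortran
program demo_outer_product
  implicit none
  real :: input(3) = [1., 2., 3.]
  real :: db(2) = [10., 20.]
  real :: dw_matmul(3,2), dw_loop(3,2)
  integer :: i

  ! Old formulation: outer product via matmul of reshaped column and row vectors
  dw_matmul = matmul(reshape(input, [3, 1]), reshape(db, [1, 2]))

  ! New formulation: accumulate column by column with do concurrent
  dw_loop = 0
  do concurrent (i = 1:size(db))
    dw_loop(:,i) = dw_loop(:,i) + input(:) * db(i)
  end do

  print *, all(abs(dw_matmul - dw_loop) < 1e-6)   ! prints T
end program demo_outer_product
```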
```diff
@@ -63,7 +67,7 @@ pure module function get_params(self) result(params)

   params = [ &
     pack(self % weights, .true.), &
-    pack(self % biases, .true.) &
+    self % biases &
   ]

 end function get_params
@@ -75,7 +79,7 @@ pure module function get_gradients(self) result(gradients)

   gradients = [ &
     pack(self % dw, .true.), &
-    pack(self % db, .true.) &
+    self % db &
   ]

 end function get_gradients
@@ -135,7 +139,7 @@ module subroutine init(self, input_shape)
   allocate(self % db(self % output_size))
   self % db = 0

-  allocate(self % gradient(self % output_size))
+  allocate(self % gradient(self % input_size))
   self % gradient = 0

 end subroutine init
```
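The changed allocation follows from the shapes used in `backward` above: `self % gradient` stores the derivative with respect to the layer input, `matmul(self % weights, db)`, and with `self % weights` shaped `(input_size, output_size)` (the same shape as `dw`) and `db` of length `output_size`, that product has length `input_size`, not `output_size`. A tiny shape check, with arbitrary sizes for illustration:

```fortran
program demo_gradient_shape
  implicit none
  integer, parameter :: input_size = 4, output_size = 3
  real :: weights(input_size, output_size), db(output_size)
  real, allocatable :: gradient(:)

  call random_number(weights)
  call random_number(db)

  ! matmul of an (input_size, output_size) matrix with a length-output_size
  ! vector yields a length-input_size vector, hence the corrected allocation.
  gradient = matmul(weights, db)
  print *, size(gradient)   ! prints 4, i.e. input_size
end program demo_gradient_shape
```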
