diff --git a/CMakeLists.txt b/CMakeLists.txt index 667b5d73..da6c1025 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,8 +43,8 @@ add_library(neural-fortran src/nf/nf_layernorm_submodule.f90 src/nf/nf_layer.f90 src/nf/nf_layer_submodule.f90 - src/nf/nf_locally_connected1d_layer_submodule.f90 - src/nf/nf_locally_connected1d_layer.f90 + src/nf/nf_locally_connected2d_layer_submodule.f90 + src/nf/nf_locally_connected2d_layer.f90 src/nf/nf_linear2d_layer.f90 src/nf/nf_linear2d_layer_submodule.f90 src/nf/nf_embedding_layer.f90 diff --git a/README.md b/README.md index 65964786..5dbda06b 100644 --- a/README.md +++ b/README.md @@ -33,11 +33,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). | Embedding | `embedding` | n/a | 2 | ✅ | ✅ | | Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ | | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ | -| Locally connected (1-d) | `locally_connected1d` | `input2d`, `locally_connected1d`, `conv1d`, `maxpool1d`, `reshape2d` | 2 | ✅ | ✅ | -| Convolutional (1-d) | `conv1d` | `input2d`, `conv1d`, `maxpool1d`, `reshape2d` | 2 | ✅ | ✅ | -| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | -| Max-pooling (1-d) | `maxpool1d` | `input2d`, `conv1d`, `maxpool1d`, `reshape2d` | 2 | ✅ | ✅ | -| Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | +| Locally connected (2-d) | `locally_connected` | `input`, `locally_connected`, `conv`, `maxpool`, `reshape` | 2 | ✅ | ✅ | +| Convolutional (1-d and 2-d) | `conv` | `input`, `conv`, `maxpool`, `reshape` | 2, 3 | ✅ | ✅ | +| Max-pooling (1-d and 2-d) | `maxpool` | `input`, `conv`, `maxpool`, `reshape` | 2, 3 | ✅ | ✅ | | Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | | Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | | Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ | diff --git a/example/cnn_mnist.f90 b/example/cnn_mnist.f90 index d2f61723..1ebe081c 100644 --- a/example/cnn_mnist.f90 +++ b/example/cnn_mnist.f90 @@ -1,7 +1,7 @@ program cnn_mnist use nf, only: network, sgd, & - input, conv2d, maxpool2d, flatten, dense, reshape, & + input, conv, maxpool, flatten, dense, reshape, & load_mnist, label_digits, softmax, relu implicit none @@ -21,10 +21,10 @@ program cnn_mnist net = network([ & input(784), & reshape(1, 28, 28), & - conv2d(filters=8, kernel_size=3, activation=relu()), & - maxpool2d(pool_size=2), & - conv2d(filters=16, kernel_size=3, activation=relu()), & - maxpool2d(pool_size=2), & + conv(filters=8, kernel_width=3, kernel_height=3, activation=relu()), & + maxpool(pool_width=2, pool_height=2, stride=2), & + conv(filters=16, kernel_width=3, kernel_height=3, activation=relu()), & + maxpool(pool_width=2, pool_height=2, stride=2), & dense(10, activation=softmax()) & ]) diff --git a/example/cnn_mnist_1d.f90 b/example/cnn_mnist_1d.f90 index b350a2f0..059d09c5 100644 --- a/example/cnn_mnist_1d.f90 +++ b/example/cnn_mnist_1d.f90 @@ -1,7 +1,7 @@ program cnn_mnist_1d use nf, only: network, sgd, & - input, conv1d, maxpool1d, flatten, dense, reshape, locally_connected1d, & + input, maxpool, flatten, dense, reshape, locally_connected, & load_mnist, label_digits, softmax, relu implicit none @@ -21,10 +21,10 @@ program cnn_mnist_1d net = network([ & input(784), & reshape(28, 28), & - locally_connected1d(filters=8, kernel_size=3, activation=relu()), & - 
maxpool1d(pool_size=2), & - locally_connected1d(filters=16, kernel_size=3, activation=relu()), & - maxpool1d(pool_size=2), & + locally_connected(filters=8, kernel_size=3, activation=relu()), & + maxpool(pool_width=2, stride=2), & + locally_connected(filters=16, kernel_size=3, activation=relu()), & + maxpool(pool_width=2, stride=2), & dense(10, activation=softmax()) & ]) diff --git a/fpm.toml b/fpm.toml index 1f2c2ac9..0d85b9dc 100644 --- a/fpm.toml +++ b/fpm.toml @@ -1,5 +1,5 @@ name = "neural-fortran" -version = "0.21.0" +version = "0.22.0" license = "MIT" author = "Milan Curcic" maintainer = "mcurcic@miami.edu" diff --git a/src/nf.f90 b/src/nf.f90 index f644826d..c7b21656 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -3,8 +3,7 @@ module nf use nf_datasets_mnist, only: label_digits, load_mnist use nf_layer, only: layer use nf_layer_constructors, only: & - conv1d, & - conv2d, & + conv, & dense, & dropout, & embedding, & @@ -12,9 +11,8 @@ module nf input, & layernorm, & linear2d, & - locally_connected1d, & - maxpool1d, & - maxpool2d, & + locally_connected, & + maxpool, & reshape, & self_attention use nf_loss, only: mse, quadratic diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index c39b11fc..65f82347 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -31,9 +31,10 @@ module nf_conv1d_layer procedure :: forward procedure :: backward - procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,14 +98,25 @@ module function get_params(self) result(params) !! Parameters to get end function get_params - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. class(conv1d_layer), intent(in), target :: self !! A `conv1d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr module subroutine set_params(self, params) !! Set the parameters of the layer. 
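The examples above switch to the new generic constructor names: `conv`, `maxpool`, and `locally_connected` replace the rank-suffixed `conv1d`/`conv2d`, `maxpool1d`/`maxpool2d`, and `locally_connected1d`, with the rank resolved from the keyword arguments. A minimal sketch of the 2-d path, assuming the `nf` exports introduced in this diff (filter counts and sizes are illustrative):

```fortran
program generic_api_demo
  ! Minimal sketch only: mirrors the updated cnn_mnist example above and
  ! assumes the nf module from this branch. Layer sizes are illustrative.
  use nf, only: network, input, reshape, conv, maxpool, dense, relu, softmax
  implicit none
  type(network) :: net

  net = network([ &
    input(784), &
    reshape(1, 28, 28), &                                                   ! rank-1 -> rank-3
    conv(filters=8, kernel_width=3, kernel_height=3, activation=relu()), &  ! resolves to conv2d
    maxpool(pool_width=2, pool_height=2, stride=2), &                       ! resolves to maxpool2d
    dense(10, activation=softmax()) &                                       ! flatten is inserted automatically
  ])

  call net % print_info()
end program generic_api_demo
```

Supplying `kernel_width` and `kernel_height` (or `pool_width`, `pool_height`, and `stride`) selects the 2-d specific procedures; supplying only the width selects the 1-d ones.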
diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 5404b9c7..98856689 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -152,13 +152,21 @@ module function get_params(self) result(params) params = [ w_, self % biases] end function get_params - module function get_gradients(self) result(gradients) + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(conv1d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() - dw_(1:size(self % dw)) => self % dw - gradients = [ dw_, self % db ] - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) class(conv1d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 4b79376e..d6c92c31 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -32,9 +32,10 @@ module nf_conv2d_layer procedure :: forward procedure :: backward - procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -98,14 +99,25 @@ module function get_params(self) result(params) !! Parameters to get end function get_params - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. class(conv2d_layer), intent(in), target :: self !! A `conv2d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr module subroutine set_params(self, params) !! Set the parameters of the layer. 
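The `get_params_ptr` / `get_gradients_ptr` bodies above use Fortran pointer bounds remapping (`ptr(1:size(array)) => array`) to present a multi-rank array as a flat rank-1 view without copying. A self-contained sketch of the idiom, with illustrative array names:

```fortran
program pointer_remap_demo
  ! Demonstrates the rank-remapping idiom used by get_params_ptr:
  ! a rank-3 kernel is exposed as a flat rank-1 pointer, so writes
  ! through the pointer modify the original array in place (no copy).
  implicit none
  real, target :: kernel(2, 3, 4)   ! stands in for self % kernel
  real, pointer :: w_ptr(:)

  kernel = 1.0
  w_ptr(1:size(kernel)) => kernel   ! flat view over the kernel storage

  w_ptr(1) = -5.0                   ! updates kernel(1,1,1) as well
  print *, kernel(1, 1, 1)          ! prints -5.0
end program pointer_remap_demo
```

Because the pointer aliases the original storage, an optimizer that writes through it updates the layer's kernel directly, which is what removes the gather/scatter of `get_gradients` and `set_params` from the update path.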
diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 45a2c1da..56b398fc 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -204,21 +204,23 @@ module function get_params(self) result(params) end function get_params - - module function get_gradients(self) result(gradients) + + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(conv2d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - - real, pointer :: dw_(:) => null() + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr - dw_(1:size(self % dw)) => self % dw - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 862f4cdf..e93a57ca 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -33,9 +33,10 @@ module nf_dense_layer procedure :: backward procedure :: forward - procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -96,14 +97,17 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - !! Dense layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr module subroutine set_params(self, params) !! Set the parameters of this layer. 
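The same pointer accessors are added to the dense layer here; their purpose is to let the optimizer update each layer's parameters in place rather than concatenating everything into one flat array. A hedged sketch of the per-layer update pattern that `network % update` adopts later in this diff; the subroutine and variable names are illustrative:

```fortran
! Illustrative sketch of the in-place, per-layer update enabled by the
! pointer getters. Assumes a dense_layer and an optimizer that were already
! initialized elsewhere; only the update step is shown.
subroutine update_dense_in_place(this_layer, opt, batch_size)
  use nf_dense_layer, only: dense_layer
  use nf_optimizers, only: optimizer_base_type
  implicit none
  type(dense_layer), intent(in out), target :: this_layer
  class(optimizer_base_type), intent(in out) :: opt
  integer, intent(in) :: batch_size
  real, pointer :: weights(:), biases(:), dw(:), db(:)

  call this_layer % get_params_ptr(weights, biases)
  call this_layer % get_gradients_ptr(dw, db)

  ! minimize() writes the new values through the pointers, so there is no
  ! copy back into the layer via set_params().
  call opt % minimize(weights, dw / real(batch_size))
  call opt % minimize(biases, db / real(batch_size))

  ! Flush gradients for the next batch.
  this_layer % dw = 0
  this_layer % db = 0
end subroutine update_dense_in_place
```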
diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a424cf9c..c2f7e236 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -77,20 +77,22 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) + module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, allocatable :: gradients(:) + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights + b_ptr => self % biases + end subroutine get_params_ptr - real, pointer :: dw_(:) => null() - dw_(1:size(self % dw)) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 517622b0..79569845 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -22,13 +22,13 @@ module nf_layer integer, allocatable :: layer_shape(:) integer, allocatable :: input_layer_shape(:) logical :: initialized = .false. + class(optimizer_base_type), allocatable :: optimizer contains procedure :: forward procedure :: get_num_params procedure :: get_params - procedure :: get_gradients procedure :: set_params procedure :: init procedure :: print_info @@ -160,14 +160,6 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params - module function get_gradients(self) result(gradients) - !! Returns the gradients of this layer. - class(layer), intent(in) :: self - !! Layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients - module subroutine set_params(self, params) !! Returns the parameters of this layer. class(layer), intent(in out) :: self diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index d3f06ca3..12492311 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -9,16 +9,14 @@ module nf_layer_constructors private public :: & - conv1d, & - conv2d, & + conv, & dense, & dropout, & flatten, & input, & linear2d, & - locally_connected1d, & - maxpool1d, & - maxpool2d, & + locally_connected, & + maxpool, & reshape, & self_attention, & embedding, & @@ -94,111 +92,28 @@ end function input3d end interface input - interface reshape - - module function reshape2d(dim1, dim2) result(res) - !! Rank-1 to rank-2 reshape layer constructor. - integer, intent(in) :: dim1, dim2 - !! Shape of the output - type(layer) :: res - !! Resulting layer instance - end function reshape2d - - module function reshape3d(dim1, dim2, dim3) result(res) - !! Rank-1 to rank-3 reshape layer constructor. - integer, intent(in) :: dim1, dim2, dim3 - !! Shape of the output - type(layer) :: res - !! Resulting layer instance - end function reshape3d - - end interface reshape - - - interface - - module function dense(layer_size, activation) result(res) - !! Dense (fully-connected) layer constructor. - !! - !! This layer is a building block for dense, fully-connected networks, - !! or for an output layer of a convolutional network. - !! 
A dense layer must not be the first layer in the network. - !! - !! Example: - !! - !! ``` - !! use nf, only :: dense, layer, relu - !! type(layer) :: dense_layer - !! dense_layer = dense(10) - !! dense_layer = dense(10, activation=relu()) - !! ``` - integer, intent(in) :: layer_size - !! The number of neurons in a dense layer - class(activation_function), intent(in), optional :: activation - !! Activation function instance (default sigmoid) - type(layer) :: res - !! Resulting layer instance - end function dense - - module function dropout(rate) result(res) - !! Create a dropout layer with a given dropout rate. - !! - !! This layer is for randomly disabling neurons during training. - !! - !! Example: - !! - !! ``` - !! use nf, only :: dropout, layer - !! type(layer) :: dropout_layer - !! dropout_layer = dropout(rate=0.5) - !! ``` - real, intent(in) :: rate - !! Dropout rate - fraction of neurons to randomly disable during training - type(layer) :: res - !! Resulting layer instance - end function dropout - - module function flatten() result(res) - !! Flatten (3-d -> 1-d) layer constructor. - !! - !! Use this layer to chain layers with 3-d outputs to layers with 1-d - !! inputs. For example, to chain a `conv2d` or a `maxpool2d` layer - !! with a `dense` layer for a CNN for classification, place a `flatten` - !! layer between them. - !! - !! A flatten layer must not be the first layer in the network. - !! - !! Example: - !! - !! ``` - !! use nf, only :: flatten, layer - !! type(layer) :: flatten_layer - !! flatten_layer = flatten() - !! ``` - type(layer) :: res - !! Resulting layer instance - end function flatten + interface conv - module function conv1d(filters, kernel_size, activation) result(res) + module function conv1d(filters, kernel_width, activation) result(res) !! 1-d convolutional layer constructor. !! !! This layer is for building 1-d convolutional network. !! Although the established convention is to call these layers 1-d, - !! the shape of the data is actually 2-d: image width - !! and the number of channels. + !! the shape of the data is actually 2-d: image width and the number of channels. !! A conv1d layer must not be the first layer in the network. !! + !! This specific function is available under a generic name `conv`. + !! !! Example: !! !! ``` - !! use nf, only :: conv1d, layer + !! use nf, only :: conv, layer !! type(layer) :: conv1d_layer - !! conv1d_layer = dense(filters=32, kernel_size=3) - !! conv1d_layer = dense(filters=32, kernel_size=3, activation='relu') + !! conv1d_layer = conv(filters=32, kernel_size=3) !! ``` integer, intent(in) :: filters !! Number of filters in the output of the layer - integer, intent(in) :: kernel_size + integer, intent(in) :: kernel_width !! Width of the convolution window, commonly 3 or 5 class(activation_function), intent(in), optional :: activation !! Activation function (default sigmoid) @@ -206,49 +121,57 @@ module function conv1d(filters, kernel_size, activation) result(res) !! Resulting layer instance end function conv1d - module function conv2d(filters, kernel_size, activation) result(res) + module function conv2d(filters, kernel_width, kernel_height, activation) result(res) !! 2-d convolutional layer constructor. !! !! This layer is for building 2-d convolutional network. !! Although the established convention is to call these layers 2-d, - !! the shape of the data is actuall 3-d: image width, image height, - !! and the number of channels. + !! the shape of the data is actually 3-d: image width, image height, + !! 
and the number of channels. !! A conv2d layer must not be the first layer in the network. !! + !! This specific function is available under a generic name `conv`. + !! !! Example: !! !! ``` - !! use nf, only :: conv2d, layer - !! type(layer) :: conv2d_layer - !! conv2d_layer = dense(filters=32, kernel_size=3) - !! conv2d_layer = dense(filters=32, kernel_size=3, activation='relu') + !! use nf, only :: conv, layer + !! type(layer) :: conv2d_layer + !! conv2d_layer = conv(filters=32, kernel_width=3, kernel_height=3) !! ``` integer, intent(in) :: filters !! Number of filters in the output of the layer - integer, intent(in) :: kernel_size + integer, intent(in) :: kernel_width !! Width of the convolution window, commonly 3 or 5 + integer, intent(in) :: kernel_height + !! Height of the convolution window, commonly 3 or 5 class(activation_function), intent(in), optional :: activation !! Activation function (default sigmoid) type(layer) :: res !! Resulting layer instance end function conv2d + + end interface conv + + + interface locally_connected - module function locally_connected1d(filters, kernel_size, activation) result(res) + module function locally_connected2d(filters, kernel_size, activation) result(res) !! 1-d locally connected network constructor !! !! This layer is for building 1-d locally connected network. !! Although the established convention is to call these layers 1-d, - !! the shape of the data is actuall 2-d: image width, + !! the shape of the data is actually 2-d: image width, !! and the number of channels. !! A locally connected 1d layer must not be the first layer in the network. !! !! Example: !! !! ``` - !! use nf, only :: locally_connected1d, layer - !! type(layer) :: locally_connected1d_layer - !! locally_connected1d_layer = dense(filters=32, kernel_size=3) - !! locally_connected1d_layer = dense(filters=32, kernel_size=3, activation='relu') + !! use nf, only :: locally_connected2d, layer + !! type(layer) :: locally_connected2d_layer + !! locally_connected2d_layer = dense(filters=32, kernel_size=3) + !! locally_connected2d_layer = dense(filters=32, kernel_size=3, activation='relu') !! ``` integer, intent(in) :: filters !! Number of filters in the output of the layer @@ -258,52 +181,147 @@ module function locally_connected1d(filters, kernel_size, activation) result(res !! Activation function (default sigmoid) type(layer) :: res !! Resulting layer instance - end function locally_connected1d + end function locally_connected2d - module function maxpool1d(pool_size, stride) result(res) + end interface locally_connected + + + interface maxpool + + module function maxpool1d(pool_width, stride) result(res) !! 1-d maxpooling layer constructor. !! !! This layer is for downscaling other layers, typically `conv1d`. !! + !! This specific function is available under a generic name `maxpool`. + !! !! Example: !! !! ``` !! use nf, only :: maxpool1d, layer !! type(layer) :: maxpool1d_layer - !! maxpool1d_layer = maxpool1d(pool_size=2) - !! maxpool1d_layer = maxpool1d(pool_size=2, stride=3) + !! maxpool1d_layer = maxpool1d(pool_width=2, stride=2) !! ``` - integer, intent(in) :: pool_size + integer, intent(in) :: pool_width !! Width of the pooling window, commonly 2 - integer, intent(in), optional :: stride - !! Stride of the pooling window, commonly equal to `pool_size`; - !! Defaults to `pool_size` if omitted. + integer, intent(in) :: stride + !! Stride of the pooling window, commonly equal to `pool_width`; type(layer) :: res !! 
Resulting layer instance end function maxpool1d - module function maxpool2d(pool_size, stride) result(res) + module function maxpool2d(pool_width, pool_height, stride) result(res) !! 2-d maxpooling layer constructor. !! !! This layer is for downscaling other layers, typically `conv2d`. !! + !! This specific function is available under a generic name `maxpool`. + !! !! Example: !! !! ``` !! use nf, only :: maxpool2d, layer !! type(layer) :: maxpool2d_layer - !! maxpool2d_layer = maxpool2d(pool_size=2) - !! maxpool2d_layer = maxpool2d(pool_size=2, stride=3) + !! maxpool2d_layer = maxpool2d(pool_width=2, pool_height=2, stride=2) !! ``` - integer, intent(in) :: pool_size + integer, intent(in) :: pool_width !! Width of the pooling window, commonly 2 - integer, intent(in), optional :: stride - !! Stride of the pooling window, commonly equal to `pool_size`; - !! Defaults to `pool_size` if omitted. + integer, intent(in) :: pool_height + !! Height of the pooling window; currently must be equal to pool_width + integer, intent(in) :: stride + !! Stride of the pooling window, commonly equal to `pool_width`; type(layer) :: res !! Resulting layer instance end function maxpool2d + end interface maxpool + + + interface reshape + + module function reshape2d(dim1, dim2) result(res) + !! Rank-1 to rank-2 reshape layer constructor. + integer, intent(in) :: dim1, dim2 + !! Shape of the output + type(layer) :: res + !! Resulting layer instance + end function reshape2d + + module function reshape3d(dim1, dim2, dim3) result(res) + !! Rank-1 to rank-3 reshape layer constructor. + integer, intent(in) :: dim1, dim2, dim3 + !! Shape of the output + type(layer) :: res + !! Resulting layer instance + end function reshape3d + + end interface reshape + + + interface + + module function dense(layer_size, activation) result(res) + !! Dense (fully-connected) layer constructor. + !! + !! This layer is a building block for dense, fully-connected networks, + !! or for an output layer of a convolutional network. + !! A dense layer must not be the first layer in the network. + !! + !! Example: + !! + !! ``` + !! use nf, only :: dense, layer, relu + !! type(layer) :: dense_layer + !! dense_layer = dense(10) + !! dense_layer = dense(10, activation=relu()) + !! ``` + integer, intent(in) :: layer_size + !! The number of neurons in a dense layer + class(activation_function), intent(in), optional :: activation + !! Activation function instance (default sigmoid) + type(layer) :: res + !! Resulting layer instance + end function dense + + module function dropout(rate) result(res) + !! Create a dropout layer with a given dropout rate. + !! + !! This layer is for randomly disabling neurons during training. + !! + !! Example: + !! + !! ``` + !! use nf, only :: dropout, layer + !! type(layer) :: dropout_layer + !! dropout_layer = dropout(rate=0.5) + !! ``` + real, intent(in) :: rate + !! Dropout rate - fraction of neurons to randomly disable during training + type(layer) :: res + !! Resulting layer instance + end function dropout + + module function flatten() result(res) + !! Flatten (3-d -> 1-d) layer constructor. + !! + !! Use this layer to chain layers with 3-d outputs to layers with 1-d + !! inputs. For example, to chain a `conv2d` or a `maxpool2d` layer + !! with a `dense` layer for a CNN for classification, place a `flatten` + !! layer between them. + !! + !! A flatten layer must not be the first layer in the network. + !! + !! Example: + !! + !! ``` + !! use nf, only :: flatten, layer + !! type(layer) :: flatten_layer + !! 
flatten_layer = flatten() + !! ``` + type(layer) :: res + !! Resulting layer instance + end function flatten + module function linear2d(out_features) result(res) !! Rank-2 (sequence_length, out_features) linear layer constructor. !! sequence_length is determined at layer initialization, based on the diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index 1665d38a..23bb2284 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -9,7 +9,7 @@ use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer use nf_input3d_layer, only: input3d_layer - use nf_locally_connected1d_layer, only: locally_connected1d_layer + use nf_locally_connected2d_layer, only: locally_connected2d_layer use nf_maxpool1d_layer, only: maxpool1d_layer use nf_maxpool2d_layer, only: maxpool2d_layer use nf_reshape2d_layer, only: reshape2d_layer @@ -24,9 +24,9 @@ contains - module function conv1d(filters, kernel_size, activation) result(res) + module function conv1d(filters, kernel_width, activation) result(res) integer, intent(in) :: filters - integer, intent(in) :: kernel_size + integer, intent(in) :: kernel_width class(activation_function), intent(in), optional :: activation type(layer) :: res @@ -44,19 +44,26 @@ module function conv1d(filters, kernel_size, activation) result(res) allocate( & res % p, & - source=conv1d_layer(filters, kernel_size, activation_tmp) & + source=conv1d_layer(filters, kernel_width, activation_tmp) & ) end function conv1d - module function conv2d(filters, kernel_size, activation) result(res) + module function conv2d(filters, kernel_width, kernel_height, activation) result(res) integer, intent(in) :: filters - integer, intent(in) :: kernel_size + integer, intent(in) :: kernel_width + integer, intent(in) :: kernel_height class(activation_function), intent(in), optional :: activation type(layer) :: res class(activation_function), allocatable :: activation_tmp + ! Enforce kernel_width == kernel_height for now; + ! If non-square kernels show to be desired, we'll relax this constraint + ! and refactor conv2d_layer to work with non-square kernels. 
+ if (kernel_width /= kernel_height) & + error stop 'kernel_width must equal kernel_height in a conv2d layer' + res % name = 'conv2d' if (present(activation)) then @@ -69,12 +76,12 @@ module function conv2d(filters, kernel_size, activation) result(res) allocate( & res % p, & - source=conv2d_layer(filters, kernel_size, activation_tmp) & + source=conv2d_layer(filters, kernel_width, activation_tmp) & ) end function conv2d - module function locally_connected1d(filters, kernel_size, activation) result(res) + module function locally_connected2d(filters, kernel_size, activation) result(res) integer, intent(in) :: filters integer, intent(in) :: kernel_size class(activation_function), intent(in), optional :: activation @@ -82,7 +89,7 @@ module function locally_connected1d(filters, kernel_size, activation) result(res class(activation_function), allocatable :: activation_tmp - res % name = 'locally_connected1d' + res % name = 'locally_connected2d' if (present(activation)) then allocate(activation_tmp, source=activation) @@ -94,10 +101,10 @@ module function locally_connected1d(filters, kernel_size, activation) result(res allocate( & res % p, & - source=locally_connected1d_layer(filters, kernel_size, activation_tmp) & + source=locally_connected2d_layer(filters, kernel_size, activation_tmp) & ) - end function locally_connected1d + end function locally_connected2d module function dense(layer_size, activation) result(res) @@ -172,58 +179,49 @@ module function input3d(dim1, dim2, dim3) result(res) res % initialized = .true. end function input3d - module function maxpool1d(pool_size, stride) result(res) - integer, intent(in) :: pool_size - integer, intent(in), optional :: stride - integer :: stride_ + module function maxpool1d(pool_width, stride) result(res) + integer, intent(in) :: pool_width + integer, intent(in) :: stride type(layer) :: res - if (pool_size < 2) & - error stop 'pool_size must be >= 2 in a maxpool1d layer' - - ! Stride defaults to pool_size if not provided - if (present(stride)) then - stride_ = stride - else - stride_ = pool_size - end if + if (pool_width < 2) & + error stop 'pool_width must be >= 2 in a maxpool1d layer' - if (stride_ < 1) & + if (stride < 1) & error stop 'stride must be >= 1 in a maxpool1d layer' res % name = 'maxpool1d' allocate( & res % p, & - source=maxpool1d_layer(pool_size, stride_) & + source=maxpool1d_layer(pool_width, stride) & ) end function maxpool1d - module function maxpool2d(pool_size, stride) result(res) - integer, intent(in) :: pool_size - integer, intent(in), optional :: stride - integer :: stride_ + module function maxpool2d(pool_width, pool_height, stride) result(res) + integer, intent(in) :: pool_width + integer, intent(in) :: pool_height + integer, intent(in) :: stride type(layer) :: res - if (pool_size < 2) & - error stop 'pool_size must be >= 2 in a maxpool2d layer' + if (pool_width < 2) & + error stop 'pool_width must be >= 2 in a maxpool2d layer' - ! Stride defaults to pool_size if not provided - if (present(stride)) then - stride_ = stride - else - stride_ = pool_size - end if + ! Enforce pool_width == pool_height for now; + ! If non-square poolings show to be desired, we'll relax this constraint + ! and refactor maxpool2d_layer to work with non-square kernels. 
+ if (pool_width /= pool_height) & + error stop 'pool_width must equal pool_height in a maxpool2d layer' - if (stride_ < 1) & + if (stride < 1) & error stop 'stride must be >= 1 in a maxpool2d layer' res % name = 'maxpool2d' allocate( & res % p, & - source=maxpool2d_layer(pool_size, stride_) & + source=maxpool2d_layer(pool_width, stride) & ) end function maxpool2d diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index eebedaa9..5b74eb5d 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -9,7 +9,7 @@ use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer use nf_input3d_layer, only: input3d_layer - use nf_locally_connected1d_layer, only: locally_connected1d_layer + use nf_locally_connected2d_layer, only: locally_connected2d_layer use nf_maxpool1d_layer, only: maxpool1d_layer use nf_maxpool2d_layer, only: maxpool2d_layer use nf_reshape2d_layer, only: reshape2d_layer @@ -52,11 +52,11 @@ pure module subroutine backward_1d(self, previous, gradient) type is(flatten_layer) - ! Upstream layers permitted: input2d, input3d, conv1d, conv2d, locally_connected1d, maxpool1d, maxpool2d + ! Upstream layers permitted: input2d, input3d, conv1d, conv2d, locally_connected2d, maxpool1d, maxpool2d select type(prev_layer => previous % p) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(maxpool1d_layer) call this_layer % backward(prev_layer % output, gradient) @@ -145,13 +145,13 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(conv1d_layer) call this_layer % backward(prev_layer % output, gradient) end select - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) select type(prev_layer => previous % p) type is(maxpool1d_layer) @@ -160,7 +160,7 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(conv1d_layer) call this_layer % backward(prev_layer % output, gradient) @@ -173,7 +173,7 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(reshape2d_layer) call this_layer % backward(prev_layer % output, gradient) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) @@ -294,13 +294,13 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) end select - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) - ! Upstream layers permitted: input2d, locally_connected1d, maxpool1d, reshape2d + ! 
Upstream layers permitted: input2d, locally_connected2d, maxpool1d, reshape2d select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % forward(prev_layer % output) type is(maxpool1d_layer) call this_layer % forward(prev_layer % output) @@ -312,11 +312,11 @@ module subroutine forward(self, input) type is(conv1d_layer) - ! Upstream layers permitted: input2d, locally_connected1d, maxpool1d, reshape2d + ! Upstream layers permitted: input2d, locally_connected2d, maxpool1d, reshape2d select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % forward(prev_layer % output) type is(maxpool1d_layer) call this_layer % forward(prev_layer % output) @@ -328,11 +328,11 @@ module subroutine forward(self, input) type is(maxpool1d_layer) - ! Upstream layers permitted: input1d, locally_connected1d, maxpool1d, reshape2d + ! Upstream layers permitted: input1d, locally_connected2d, maxpool1d, reshape2d select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % forward(prev_layer % output) type is(maxpool1d_layer) call this_layer % forward(prev_layer % output) @@ -368,7 +368,7 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(conv2d_layer) call this_layer % forward(prev_layer % output) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call this_layer % forward(prev_layer % output) type is(maxpool1d_layer) call this_layer % forward(prev_layer % output) @@ -481,7 +481,7 @@ pure module subroutine get_output_2d(self, output) allocate(output, source=this_layer % output) type is(maxpool1d_layer) allocate(output, source=this_layer % output) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) allocate(output, source=this_layer % output) type is(conv1d_layer) allocate(output, source=this_layer % output) @@ -497,7 +497,7 @@ pure module subroutine get_output_2d(self, output) allocate(output, source=this_layer % output) class default error stop '2-d output can only be read from a input2d, maxpool1d, ' & - // 'locally_connected1d, conv1d, reshape2d, embedding, linear2d, ' & + // 'locally_connected2d, conv1d, reshape2d, embedding, linear2d, ' & // 'self_attention, or layernorm layer.' 
end select @@ -549,7 +549,7 @@ impure elemental module subroutine init(self, input) self % layer_shape = shape(this_layer % output) type is(dropout_layer) self % layer_shape = shape(this_layer % output) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) self % layer_shape = shape(this_layer % output) type is(maxpool1d_layer) self % layer_shape = shape(this_layer % output) @@ -611,7 +611,7 @@ elemental module function get_num_params(self) result(num_params) num_params = this_layer % get_num_params() type is (conv2d_layer) num_params = this_layer % get_num_params() - type is (locally_connected1d_layer) + type is (locally_connected2d_layer) num_params = this_layer % get_num_params() type is (maxpool1d_layer) num_params = 0 @@ -656,7 +656,7 @@ module function get_params(self) result(params) params = this_layer % get_params() type is (conv2d_layer) params = this_layer % get_params() - type is (locally_connected1d_layer) + type is (locally_connected2d_layer) params = this_layer % get_params() type is (maxpool1d_layer) ! No parameters to get. @@ -682,50 +682,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(layer), intent(in) :: self - real, allocatable :: gradients(:) - - select type (this_layer => self % p) - type is (input1d_layer) - ! No gradients to get. - type is (input2d_layer) - ! No gradients to get. - type is (input3d_layer) - ! No gradients to get. - type is (dense_layer) - gradients = this_layer % get_gradients() - type is (dropout_layer) - ! No gradients to get. - type is (conv1d_layer) - gradients = this_layer % get_gradients() - type is (conv2d_layer) - gradients = this_layer % get_gradients() - type is (locally_connected1d_layer) - gradients = this_layer % get_gradients() - type is (maxpool1d_layer) - ! No gradients to get. - type is (maxpool2d_layer) - ! No gradients to get. - type is (flatten_layer) - ! No gradients to get. - type is (reshape2d_layer) - ! No parameters to get. - type is (reshape3d_layer) - ! No gradients to get. - type is (linear2d_layer) - gradients = this_layer % get_gradients() - type is (self_attention_layer) - gradients = this_layer % get_gradients() - type is (embedding_layer) - gradients = this_layer % get_gradients() - type is (layernorm_layer) - gradients = this_layer % get_gradients() - class default - error stop 'Unknown layer type.' 
- end select - - end function get_gradients module subroutine set_params(self, params) class(layer), intent(in out) :: self @@ -776,7 +732,7 @@ module subroutine set_params(self, params) type is (conv2d_layer) call this_layer % set_params(params) - type is (locally_connected1d_layer) + type is (locally_connected2d_layer) call this_layer % set_params(params) type is (maxpool1d_layer) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index 36ef56f0..7bffc06a 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -38,7 +38,9 @@ module nf_layernorm_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type layernorm_layer @@ -78,12 +80,24 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(layernorm_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 4eaa4382..5e357b33 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -112,25 +112,31 @@ end function get_num_params module function get_params(self) result(params) class(layernorm_layer), intent(in), target :: self real, allocatable :: params(:) + params = [self % gamma, self % beta] + end function get_params - params = [ & - self % gamma, & - self % beta & - ] - end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + g_ptr => self % gamma + b_ptr => self % beta + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) + gradients = [self % d_gamma, self % d_beta] + end function get_gradients - gradients = [ & - self % d_gamma, & - self % d_beta & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + dg_ptr => self % d_gamma + db_ptr => self % d_beta + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_linear2d_layer.f90 b/src/nf/nf_linear2d_layer.f90 index f785a14c..f2c8fd16 100644 --- a/src/nf/nf_linear2d_layer.f90 +++ b/src/nf/nf_linear2d_layer.f90 @@ -25,7 +25,9 @@ module nf_linear2d_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type linear2d_layer @@ -64,11 +66,21 @@ module function get_params(self) result(params) real, allocatable :: params(:) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + 
class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(linear2d_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_linear2d_layer_submodule.f90 b/src/nf/nf_linear2d_layer_submodule.f90 index 0dfe7e27..513527f0 100644 --- a/src/nf/nf_linear2d_layer_submodule.f90 +++ b/src/nf/nf_linear2d_layer_submodule.f90 @@ -82,33 +82,35 @@ end function get_num_params module function get_params(self) result(params) class(linear2d_layer), intent(in), target :: self real, allocatable :: params(:) - real, pointer :: w_(:) => null() + w_(1: size(self % weights)) => self % weights + params = [w_, self % biases] + end function get_params - w_(1: product(shape(self % weights))) => self % weights - - params = [ & - w_, & - self % biases & - ] - end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights + b_ptr => self % biases + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() + dw_(1:size(self % dw)) => self % dw + gradients = [dw_, self % db] + end function get_gradients - dw_(1: product(shape(self % dw))) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected2d_layer.f90 similarity index 59% rename from src/nf/nf_locally_connected1d_layer.f90 rename to src/nf/nf_locally_connected2d_layer.f90 index beca76d5..2478dc0a 100644 --- a/src/nf/nf_locally_connected1d_layer.f90 +++ b/src/nf/nf_locally_connected2d_layer.f90 @@ -1,14 +1,14 @@ -module nf_locally_connected1d_layer - !! This modules provides a 1-d convolutional `locally_connected1d` type. +module nf_locally_connected2d_layer + !! This modules provides a 1-d convolutional `locally_connected2d` type. 
use nf_activation, only: activation_function use nf_base_layer, only: base_layer implicit none private - public :: locally_connected1d_layer + public :: locally_connected2d_layer - type, extends(base_layer) :: locally_connected1d_layer + type, extends(base_layer) :: locally_connected2d_layer integer :: width integer :: height @@ -32,23 +32,25 @@ module nf_locally_connected1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params - end type locally_connected1d_layer + end type locally_connected2d_layer - interface locally_connected1d_layer - module function locally_connected1d_layer_cons(filters, kernel_size, activation) & + interface locally_connected2d_layer + module function locally_connected2d_layer_cons(filters, kernel_size, activation) & result(res) - !! `locally_connected1d_layer` constructor function + !! `locally_connected2d_layer` constructor function integer, intent(in) :: filters integer, intent(in) :: kernel_size class(activation_function), intent(in) :: activation - type(locally_connected1d_layer) :: res - end function locally_connected1d_layer_cons - end interface locally_connected1d_layer + type(locally_connected2d_layer) :: res + end function locally_connected2d_layer_cons + end interface locally_connected2d_layer interface @@ -56,24 +58,24 @@ module subroutine init(self, input_shape) !! Initialize the layer data structures. !! !! This is a deferred procedure from the `base_layer` abstract type. - class(locally_connected1d_layer), intent(in out) :: self - !! A `locally_connected1d_layer` instance + class(locally_connected2d_layer), intent(in out) :: self + !! A `locally_connected2d_layer` instance integer, intent(in) :: input_shape(:) !! Input layer dimensions end subroutine init pure module subroutine forward(self, input) - !! Apply a forward pass on the `locally_connected1d` layer. - class(locally_connected1d_layer), intent(in out) :: self - !! A `locally_connected1d_layer` instance + !! Apply a forward pass on the `locally_connected2d` layer. + class(locally_connected2d_layer), intent(in out) :: self + !! A `locally_connected2d_layer` instance real, intent(in) :: input(:,:) !! Input data end subroutine forward pure module subroutine backward(self, input, gradient) - !! Apply a backward pass on the `locally_connected1d` layer. - class(locally_connected1d_layer), intent(in out) :: self - !! A `locally_connected1d_layer` instance + !! Apply a backward pass on the `locally_connected2d` layer. + class(locally_connected2d_layer), intent(in out) :: self + !! A `locally_connected2d_layer` instance real, intent(in) :: input(:,:) !! Input data (previous layer) real, intent(in) :: gradient(:,:) @@ -82,8 +84,8 @@ end subroutine backward pure module function get_num_params(self) result(num_params) !! Get the number of parameters in the layer. - class(locally_connected1d_layer), intent(in) :: self - !! A `locally_connected1d_layer` instance + class(locally_connected2d_layer), intent(in) :: self + !! A `locally_connected2d_layer` instance integer :: num_params !! Number of parameters end function get_num_params @@ -91,29 +93,41 @@ end function get_num_params module function get_params(self) result(params) !! Return the parameters (weights and biases) of this layer. !! The parameters are ordered as weights first, biases second. - class(locally_connected1d_layer), intent(in), target :: self - !! 
A `locally_connected1d_layer` instance + class(locally_connected2d_layer), intent(in), target :: self + !! A `locally_connected2d_layer` instance real, allocatable :: params(:) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. - class(locally_connected1d_layer), intent(in), target :: self - !! A `locally_connected1d_layer` instance + class(locally_connected2d_layer), intent(in), target :: self + !! A `locally_connected2d_layer` instance real, allocatable :: gradients(:) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. - class(locally_connected1d_layer), intent(in out) :: self - !! A `locally_connected1d_layer` instance + class(locally_connected2d_layer), intent(in out) :: self + !! A `locally_connected2d_layer` instance real, intent(in) :: params(:) !! Parameters to set end subroutine set_params end interface -end module nf_locally_connected1d_layer +end module nf_locally_connected2d_layer diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected2d_layer_submodule.f90 similarity index 74% rename from src/nf/nf_locally_connected1d_layer_submodule.f90 rename to src/nf/nf_locally_connected2d_layer_submodule.f90 index 053c520b..5b2f5f85 100644 --- a/src/nf/nf_locally_connected1d_layer_submodule.f90 +++ b/src/nf/nf_locally_connected2d_layer_submodule.f90 @@ -1,4 +1,4 @@ -submodule(nf_locally_connected1d_layer) nf_locally_connected1d_layer_submodule +submodule(nf_locally_connected2d_layer) nf_locally_connected2d_layer_submodule use nf_activation, only: activation_function use nf_random, only: random_normal @@ -7,22 +7,22 @@ contains - module function locally_connected1d_layer_cons(filters, kernel_size, activation) result(res) + module function locally_connected2d_layer_cons(filters, kernel_size, activation) result(res) implicit none integer, intent(in) :: filters integer, intent(in) :: kernel_size class(activation_function), intent(in) :: activation - type(locally_connected1d_layer) :: res + type(locally_connected2d_layer) :: res res % kernel_size = kernel_size res % filters = filters res % activation_name = activation % get_name() allocate(res % activation, source = activation) - end function locally_connected1d_layer_cons + end function locally_connected2d_layer_cons module subroutine init(self, input_shape) implicit none - class(locally_connected1d_layer), intent(in out) :: self + class(locally_connected2d_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) self % channels = input_shape(1) @@ -53,7 +53,7 @@ end subroutine init pure module subroutine forward(self, input) implicit none - class(locally_connected1d_layer), intent(in out) :: self + class(locally_connected2d_layer), intent(in out) :: self real, intent(in) :: input(:,:) integer :: input_channels, input_width integer :: j, n @@ -74,7 +74,7 @@ end subroutine forward pure module subroutine 
backward(self, input, gradient) implicit none - class(locally_connected1d_layer), intent(in out) :: self + class(locally_connected2d_layer), intent(in out) :: self real, intent(in) :: input(:,:) real, intent(in) :: gradient(:,:) integer :: input_channels, input_width, output_width @@ -117,29 +117,45 @@ pure module subroutine backward(self, input, gradient) end subroutine backward pure module function get_num_params(self) result(num_params) - class(locally_connected1d_layer), intent(in) :: self + class(locally_connected2d_layer), intent(in) :: self integer :: num_params num_params = product(shape(self % kernel)) + product(shape(self % biases)) end function get_num_params module function get_params(self) result(params) - class(locally_connected1d_layer), intent(in), target :: self + class(locally_connected2d_layer), intent(in), target :: self real, allocatable :: params(:) params = [self % kernel, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr(1:size(self % biases)) => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) - class(locally_connected1d_layer), intent(in), target :: self + class(locally_connected2d_layer), intent(in), target :: self real, allocatable :: gradients(:) gradients = [self % dw, self % db] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr(1:size(self % db)) => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) - class(locally_connected1d_layer), intent(in out) :: self + class(locally_connected2d_layer), intent(in out) :: self real, intent(in) :: params(:) if (size(params) /= self % get_num_params()) then - error stop 'locally_connected1d_layer % set_params: Number of parameters does not match' + error stop 'locally_connected2d_layer % set_params: Number of parameters does not match' end if self % kernel = reshape(params(:product(shape(self % kernel))), shape(self % kernel)) @@ -149,4 +165,4 @@ module subroutine set_params(self, params) end subroutine set_params -end submodule nf_locally_connected1d_layer_submodule +end submodule nf_locally_connected2d_layer_submodule diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 2bd7ce8c..2743ff5b 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -16,12 +16,10 @@ module nf_network type(layer), allocatable :: layers(:) class(loss_type), allocatable :: loss - class(optimizer_base_type), allocatable :: optimizer contains procedure :: backward - procedure :: get_gradients procedure :: get_num_params procedure :: get_params procedure :: print_info @@ -216,7 +214,6 @@ module integer function get_num_params(self) !! Network instance end function get_num_params - module function get_params(self) result(params) !! Get the network parameters (weights and biases). class(network), intent(in) :: self @@ -225,13 +222,6 @@ module function get_params(self) result(params) !! Network parameters to get end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - !! 
Network instance - real, allocatable :: gradients(:) - !! Network gradients to set - end function get_gradients - module subroutine set_params(self, params) !! Set the network parameters (weights and biases). class(network), intent(in out) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 449b5a5b..df95963a 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -8,7 +8,7 @@ use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer use nf_input3d_layer, only: input3d_layer - use nf_locally_connected1d_layer, only: locally_connected1d_layer + use nf_locally_connected2d_layer, only: locally_connected2d_layer use nf_maxpool1d_layer, only: maxpool1d_layer use nf_maxpool2d_layer, only: maxpool2d_layer use nf_reshape2d_layer, only: reshape2d_layer @@ -18,7 +18,7 @@ use nf_embedding_layer, only: embedding_layer use nf_layernorm_layer, only: layernorm_layer use nf_layer, only: layer - use nf_layer_constructors, only: conv1d, conv2d, dense, flatten, input, maxpool1d, maxpool2d, reshape + use nf_layer_constructors, only: flatten use nf_loss, only: quadratic use nf_optimizers, only: optimizer_base_type, sgd use nf_parallel, only: tile_indices @@ -79,7 +79,7 @@ module function network_from_layers(layers) result(res) type is(conv2d_layer) res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] n = n + 1 - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] n = n + 1 type is(maxpool2d_layer) @@ -185,7 +185,7 @@ module subroutine backward(self, output, loss) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(conv1d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(layernorm_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) @@ -524,25 +524,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - real, allocatable :: gradients(:) - integer :: n, nstart, nend - - allocate(gradients(self % get_num_params())) - - nstart = 1 - do n = 1, size(self % layers) - - if (self % layers(n) % get_num_params() < 1) cycle - - nend = nstart + self % layers(n) % get_num_params() - 1 - gradients(nstart:nend) = self % layers(n) % get_gradients() - nstart = nend + 1 - end do - - end function get_gradients - module subroutine set_params(self, params) class(network), intent(in out) :: self @@ -593,15 +574,8 @@ module subroutine train(self, input_data, output_data, batch_size, & integer :: i, j, n integer :: istart, iend, indices(2) - ! Passing the optimizer instance is optional. - ! If not provided, we default to SGD with its default settings. - if (present(optimizer)) then - self % optimizer = optimizer - else - self % optimizer = sgd() - end if - - call self % optimizer % init(self % get_num_params()) + ! The optional optimizer instance is passed through to the update() method + ! where it is optional as well. ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). 
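Since `train` no longer instantiates or initializes the optimizer itself, the optimizer travels to `update`, where each layer gets (and initializes) its own copy on first use. For code that drives its own mini-batch loop instead of calling `train`, the pattern looks roughly like the sketch below; it assumes the usual `net % forward` / `net % backward` methods and placeholder data arrays:

```fortran
! Rough sketch of a custom mini-batch step that bypasses train().
! `net` is an already-constructed network; x and y are (features, samples)
! and (outputs, samples) arrays. Slicing and learning rate are illustrative.
subroutine custom_batch_step(net, x, y, jstart, jend)
  use nf, only: network, sgd
  implicit none
  type(network), intent(in out) :: net
  real, intent(in) :: x(:,:), y(:,:)
  integer, intent(in) :: jstart, jend
  integer :: j

  do j = jstart, jend
    call net % forward(x(:,j))
    call net % backward(y(:,j))
  end do

  ! The optimizer is passed here rather than to train(); on the first call
  ! it is copied into each layer and initialized for that layer's size.
  call net % update(optimizer=sgd(learning_rate=0.01), batch_size=jend - jstart + 1)
end subroutine custom_batch_step
```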
@@ -635,7 +609,7 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - call self % update(batch_size=batch_size) + call self % update(optimizer=optimizer, batch_size=batch_size) end do batch_loop end do epoch_loop @@ -649,22 +623,25 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) + real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n - ! Passing the optimizer instance is optional. If not provided, and if the - ! optimizer has not already been set, we default to the default SGD. The - ! instantiation and initialization below of the optimizer is normally done - ! at the beginning of the network % train() method. However, if the user - ! wants to call network % update() directly, for example if they use their - ! own custom mini-batching routine, we initialize the optimizer here as - ! well. If it's initialized already, this step is a cheap no-op. - if (.not. allocated(self % optimizer)) then + ! You can optionally pass an optimizer instance to the update() method. + ! This is necessary if you're not using the train() method, for example if + ! you're using your own custom mini-batching routine and calling the + ! forward(), backward(), and update() methods directly. + if (.not. allocated(self % layers(1) % optimizer)) then if (present(optimizer)) then - self % optimizer = optimizer + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do else - self % optimizer = sgd() + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do end if - call self % optimizer % init(self % get_num_params()) end if if (present(batch_size)) then @@ -686,32 +663,57 @@ module subroutine update(self, optimizer, batch_size) type is(conv1d_layer) call co_sum(this_layer % dw) call co_sum(this_layer % db) - type is(locally_connected1d_layer) + type is(locally_connected2d_layer) call co_sum(this_layer % dw) call co_sum(this_layer % db) end select end do #endif - params = self % get_params() - call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - call self % set_params(params) - - ! Flush network gradients to zero. 
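      ! Illustration only (not part of this patch): the comment above describes
      ! driving updates manually instead of using train(). A minimal sketch of
      ! that usage, assuming x, y, and num_batches are prepared by the caller:
      !
      !   do j = 1, num_batches
      !     call net % forward(x(:, j))
      !     call net % backward(y(:, j))
      !     call net % update(optimizer=sgd(learning_rate=0.01))
      !   end do
      !
      ! The optimizer is copied into the layers only on the first update() call;
      ! later calls reuse the per-layer optimizers already set. Below, each
      ! parameterized layer is updated through flat pointer views returned by
      ! get_params_ptr()/get_gradients_ptr(), with one minimize() call for the
      ! weights and one for the biases, which is why each optimizer tracks a
      ! start_index into its state arrays.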
do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) type is(dense_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % dw = 0 + this_layer % db = 0 + type is(conv1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv1d_layer) + type is(locally_connected2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(locally_connected1d_layer) + type is(linear2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 + type is(layernorm_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % d_gamma = 0 + this_layer % d_beta = 0 end select end do diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index c64cefed..9a6b1e1f 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -44,6 +44,7 @@ end subroutine minimize real :: momentum = 0 logical :: nesterov = .false. real, allocatable, private :: velocity(:) + integer, private :: start_index = 1 contains procedure :: init => init_sgd procedure :: minimize => minimize_sgd @@ -59,6 +60,7 @@ end subroutine minimize real :: decay_rate = 0.9 real :: epsilon = 1e-8 real, allocatable, private :: rms_gradient(:) + integer, private :: start_index = 1 contains procedure :: init => init_rmsprop procedure :: minimize => minimize_rmsprop @@ -82,6 +84,7 @@ end subroutine minimize real :: weight_decay_decoupled = 0 ! decoupled weight decay regularization (AdamW) real, allocatable, private :: m(:), v(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adam procedure :: minimize => minimize_adam @@ -99,6 +102,7 @@ end subroutine minimize real :: learning_rate_decay = 0 real, allocatable, private :: sum_squared_gradient(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adagrad procedure :: minimize => minimize_adagrad @@ -121,19 +125,38 @@ pure subroutine minimize_sgd(self, param, gradient) !! update rule. class(sgd), intent(inout) :: self real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(in) :: gradient(:) ! 
Always the same size as param + integer :: end_index if (self % momentum > 0) then + + ! end_index is part of the bookkeeping for updating velocity because each + ! batch update makes two calls to minimize, one for the weights and one for + ! the biases. + ! We use start_index and end_index to update the appropriate sections + ! of the velocity array. + end_index = self % start_index + size(param) - 1 + ! Apply momentum update - self % velocity = self % momentum * self % velocity & + self % velocity(self % start_index:end_index) = & + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + param = param + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient else - param = param + self % velocity + param = param + self % velocity(self % start_index:end_index) + end if + + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 end if + else ! Apply regular update param = param - self % learning_rate * gradient @@ -157,14 +180,27 @@ pure subroutine minimize_rmsprop(self, param, gradient) class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & + self % rms_gradient(self % start_index:end_index) = & + self % decay_rate * self % rms_gradient(self % start_index:end_index) & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient param = param - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient + / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) & + * gradient + + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if end subroutine minimize_rmsprop @@ -185,20 +221,27 @@ pure subroutine minimize_adam(self, param, gradient) class(adam), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. associate(g => gradient + self % weight_decay_l2 * param) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + self % m(self % start_index:end_index) = & + self % beta1 * self % m(self % start_index:end_index) & + + (1 - self % beta1) * g + self % v(self % start_index:end_index) = & + self % beta2 * self % v(self % start_index:end_index) & + + (1 - self % beta2) * g**2 end associate ! Compute bias-corrected first and second moment estimates. 
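    ! For reference, the bias-corrected estimates computed in the associate
    ! block below feed the standard Adam step (weight-decay terms omitted from
    ! this note):
    !   m_hat = m / (1 - beta1**t),  v_hat = v / (1 - beta2**t)
    !   param = param - learning_rate * m_hat / (sqrt(v_hat) + epsilon)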
associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & + m_hat => self % m(self % start_index:end_index) / (1 - self % beta1**self % t), & + v_hat => self % v(self % start_index:end_index) / (1 - self % beta2**self % t) & ) ! Update parameters. @@ -208,6 +251,14 @@ pure subroutine minimize_adam(self, param, gradient) end associate + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adam @@ -226,6 +277,9 @@ pure subroutine minimize_adagrad(self, param, gradient) class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Update the current time step self % t = self % t + 1 @@ -239,13 +293,23 @@ pure subroutine minimize_adagrad(self, param, gradient) / (1 + (self % t - 1) * self % learning_rate_decay) & ) - self % sum_squared_gradient = self % sum_squared_gradient + g**2 + self % sum_squared_gradient(self % start_index:end_index) = & + self % sum_squared_gradient(self % start_index:end_index) + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g & + / (sqrt(self % sum_squared_gradient(self % start_index:end_index)) & + self % epsilon) end associate + if (end_index < size(param)) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adagrad -end module nf_optimizers +end module nf_optimizers \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ec4e139e..922a2936 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -8,7 +8,7 @@ foreach(execid dense_layer conv1d_layer conv2d_layer - locally_connected1d_layer + locally_connected2d_layer maxpool1d_layer maxpool2d_layer flatten_layer diff --git a/test/test_conv1d_layer.f90 b/test/test_conv1d_layer.f90 index 81d03c1f..b80b520b 100644 --- a/test/test_conv1d_layer.f90 +++ b/test/test_conv1d_layer.f90 @@ -1,7 +1,7 @@ program test_conv1d_layer use iso_fortran_env, only: stderr => error_unit - use nf, only: conv1d, input, layer + use nf, only: conv, input, layer use nf_input2d_layer, only: input2d_layer implicit none @@ -12,7 +12,7 @@ program test_conv1d_layer real, parameter :: tolerance = 1e-7 logical :: ok = .true. - conv1d_layer = conv1d(filters, kernel_size) + conv1d_layer = conv(filters, kernel_size) if (.not. conv1d_layer % name == 'conv1d') then ok = .false. 
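The constructor tests above and below exercise the new generic conv() interface: giving only a kernel width is expected to produce a 1-d convolutional layer, while giving both a width and a height produces a 2-d one (layer names as asserted in the tests in this diff). A small standalone sketch of that assumption:

  program conv_dispatch_sketch
    ! Sketch only: checks the assumed 1-d vs 2-d dispatch of the generic conv().
    use nf, only: conv, layer
    implicit none
    type(layer) :: l1, l2
    l1 = conv(filters=8, kernel_width=3)                   ! expected name: 'conv1d'
    l2 = conv(filters=8, kernel_width=3, kernel_height=3)  ! expected name: 'conv2d'
    print *, trim(l1 % name), ' ', trim(l2 % name)
  end program conv_dispatch_sketch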
@@ -52,7 +52,7 @@ program test_conv1d_layer sample_input = 0 input_layer = input(1, 3) - conv1d_layer = conv1d(filters, kernel_size) + conv1d_layer = conv(filters, kernel_size) call conv1d_layer % init(input_layer) select type(this_layer => input_layer % p); type is(input2d_layer) diff --git a/test/test_conv1d_network.f90 b/test/test_conv1d_network.f90 index 5a353cf9..88289ab4 100644 --- a/test/test_conv1d_network.f90 +++ b/test/test_conv1d_network.f90 @@ -1,7 +1,7 @@ program test_conv1d_network use iso_fortran_env, only: stderr => error_unit - use nf, only: conv1d, input, network, dense, sgd, maxpool1d + use nf, only: conv, input, network, dense, sgd, maxpool implicit none @@ -12,8 +12,8 @@ program test_conv1d_network ! 3-layer convolutional network net = network([ & input(3, 32), & - conv1d(filters=16, kernel_size=3), & - conv1d(filters=32, kernel_size=3) & + conv(filters=16, kernel_width=3), & + conv(filters=32, kernel_width=3) & ]) if (.not. size(net % layers) == 3) then @@ -49,8 +49,8 @@ program test_conv1d_network cnn = network([ & input(1, 5), & - conv1d(filters=1, kernel_size=3), & - conv1d(filters=1, kernel_size=3), & + conv(filters=1, kernel_width=3), & + conv(filters=1, kernel_width=3), & dense(1) & ]) @@ -86,9 +86,9 @@ program test_conv1d_network cnn = network([ & input(1, 8), & - conv1d(filters=1, kernel_size=3), & - maxpool1d(pool_size=2), & - conv1d(filters=1, kernel_size=3), & + conv(filters=1, kernel_width=3), & + maxpool(pool_width=2, stride=2), & + conv(filters=1, kernel_width=3), & dense(1) & ]) @@ -121,9 +121,9 @@ program test_conv1d_network cnn = network([ & input(1, 12), & - conv1d(filters=1, kernel_size=3), & ! 1x12x12 input, 1x10x10 output - maxpool1d(pool_size=2), & ! 1x10x10 input, 1x5x5 output - conv1d(filters=1, kernel_size=3), & ! 1x5x5 input, 1x3x3 output + conv(filters=1, kernel_width=3), & ! 1x12x12 input, 1x10x10 output + maxpool(pool_width=2, stride=2), & ! 1x10x10 input, 1x5x5 output + conv(filters=1, kernel_width=3), & ! 1x5x5 input, 1x3x3 output dense(9) & ! 9 outputs ]) diff --git a/test/test_conv2d_layer.f90 b/test/test_conv2d_layer.f90 index 10a14c5e..2d5868b9 100644 --- a/test/test_conv2d_layer.f90 +++ b/test/test_conv2d_layer.f90 @@ -1,7 +1,7 @@ program test_conv2d_layer use iso_fortran_env, only: stderr => error_unit - use nf, only: conv2d, input, layer + use nf, only: conv, input, layer use nf_input3d_layer, only: input3d_layer implicit none @@ -12,7 +12,7 @@ program test_conv2d_layer real, parameter :: tolerance = 1e-7 logical :: ok = .true. - conv_layer = conv2d(filters, kernel_size) + conv_layer = conv(filters, kernel_size, kernel_size) if (.not. conv_layer % name == 'conv2d') then ok = .false. @@ -52,7 +52,7 @@ program test_conv2d_layer sample_input = 0 input_layer = input(1, 3, 3) - conv_layer = conv2d(filters, kernel_size) + conv_layer = conv(filters, kernel_size, kernel_size) call conv_layer % init(input_layer) select type(this_layer => input_layer % p); type is(input3d_layer) diff --git a/test/test_conv2d_network.f90 b/test/test_conv2d_network.f90 index 73c4595a..c293a1d2 100644 --- a/test/test_conv2d_network.f90 +++ b/test/test_conv2d_network.f90 @@ -1,7 +1,7 @@ program test_conv2d_network use iso_fortran_env, only: stderr => error_unit - use nf, only: conv2d, input, network, dense, sgd, maxpool2d + use nf, only: conv, input, network, dense, sgd, maxpool implicit none @@ -12,8 +12,8 @@ program test_conv2d_network ! 
3-layer convolutional network net = network([ & input(3, 32, 32), & - conv2d(filters=16, kernel_size=3), & - conv2d(filters=32, kernel_size=3) & + conv(filters=16, kernel_width=3, kernel_height=3), & + conv(filters=32, kernel_width=3, kernel_height=3) & ]) if (.not. size(net % layers) == 3) then @@ -49,8 +49,8 @@ program test_conv2d_network cnn = network([ & input(1, 5, 5), & - conv2d(filters=1, kernel_size=3), & - conv2d(filters=1, kernel_size=3), & + conv(filters=1, kernel_width=3, kernel_height=3), & + conv(filters=1, kernel_width=3, kernel_height=3), & dense(1) & ]) @@ -86,9 +86,9 @@ program test_conv2d_network cnn = network([ & input(1, 8, 8), & - conv2d(filters=1, kernel_size=3), & - maxpool2d(pool_size=2), & - conv2d(filters=1, kernel_size=3), & + conv(filters=1, kernel_width=3, kernel_height=3), & + maxpool(pool_width=2, pool_height=2, stride=2), & + conv(filters=1, kernel_width=3, kernel_height=3), & dense(1) & ]) @@ -121,9 +121,9 @@ program test_conv2d_network cnn = network([ & input(1, 12, 12), & - conv2d(filters=1, kernel_size=3), & ! 1x12x12 input, 1x10x10 output - maxpool2d(pool_size=2), & ! 1x10x10 input, 1x5x5 output - conv2d(filters=1, kernel_size=3), & ! 1x5x5 input, 1x3x3 output + conv(filters=1, kernel_width=3, kernel_height=3), & ! 1x12x12 input, 1x10x10 output + maxpool(pool_width=2, pool_height=2, stride=2), & ! 1x10x10 input, 1x5x5 output + conv(filters=1, kernel_width=3, kernel_height=3), & ! 1x5x5 input, 1x3x3 output dense(9) & ! 9 outputs ]) diff --git a/test/test_get_set_network_params.f90 b/test/test_get_set_network_params.f90 index 71963a1c..f2a3b6a8 100644 --- a/test/test_get_set_network_params.f90 +++ b/test/test_get_set_network_params.f90 @@ -1,6 +1,6 @@ program test_get_set_network_params use iso_fortran_env, only: stderr => error_unit - use nf, only: conv2d, dense, flatten, input, maxpool2d, network + use nf, only: conv, dense, flatten, input, network implicit none type(network) :: net logical :: ok = .true. @@ -10,7 +10,7 @@ program test_get_set_network_params ! First test get_num_params() net = network([ & input(3, 5, 5), & ! 5 x 5 image with 3 channels - conv2d(filters=2, kernel_size=3), & ! kernel shape [2, 3, 3, 3], output shape [2, 3, 3], 56 parameters total + conv(filters=2, kernel_width=3, kernel_height=3), & ! kernel shape [2, 3, 3, 3], output shape [2, 3, 3], 56 parameters total flatten(), & dense(4) & ! weights shape [72], biases shape [4], 76 parameters total ]) @@ -46,7 +46,7 @@ program test_get_set_network_params ! 
Finally, test set_params() and get_params() for a conv2d layer net = network([ & input(1, 3, 3), & - conv2d(filters=1, kernel_size=3) & + conv(filters=1, kernel_width=3, kernel_height=3) & ]) call net % set_params(test_params_conv2d) diff --git a/test/test_insert_flatten.f90 b/test/test_insert_flatten.f90 index 18e41b81..3437b746 100644 --- a/test/test_insert_flatten.f90 +++ b/test/test_insert_flatten.f90 @@ -1,7 +1,7 @@ program test_insert_flatten use iso_fortran_env, only: stderr => error_unit - use nf, only: network, input, conv2d, maxpool2d, flatten, dense, reshape + use nf, only: network, input, conv, maxpool, flatten, dense, reshape implicit none @@ -20,7 +20,7 @@ program test_insert_flatten net = network([ & input(3, 32, 32), & - conv2d(filters=1, kernel_size=3), & + conv(filters=1, kernel_width=3, kernel_height=3), & dense(10) & ]) @@ -33,14 +33,14 @@ program test_insert_flatten net = network([ & input(3, 32, 32), & - conv2d(filters=1, kernel_size=3), & - maxpool2d(pool_size=2, stride=2), & + conv(filters=1, kernel_width=3, kernel_height=3), & + maxpool(pool_width=2, stride=2), & dense(10) & ]) if (.not. net % layers(4) % name == 'flatten') then ok = .false. - write(stderr, '(a)') 'flatten layer inserted after maxpool2d.. failed' + write(stderr, '(a)') 'flatten layer inserted after maxpool.. failed' end if net = network([ & diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 6a897575..9e8bfccf 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -27,14 +27,14 @@ program test_layernorm_instance end if contains - function allclose(x, y) result(res) - real, intent(in) :: x(:) - real, intent(in) :: y(:) - logical :: res - res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + logical function allclose(x, y) result(res) + real, intent(in) :: x(:), y(:) + !res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + res = all(abs(x - y) <= 1e-05) end function allclose + subroutine test_layernorm_forward(layernorm_instance, input, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -61,6 +61,7 @@ subroutine test_layernorm_forward(layernorm_instance, input, ok) end if end subroutine test_layernorm_forward + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -103,6 +104,7 @@ subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) end if end subroutine test_layernorm_backward + subroutine test_layernorm_gradients(input, gradient, ok) real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) @@ -152,6 +154,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) end if end subroutine test_layernorm_gradients + subroutine test_layernorm_integration(ok) logical, intent(in out) :: ok @@ -160,13 +163,13 @@ subroutine test_layernorm_integration(ok) real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9] real :: tolerance = 0.1 integer :: epoch - integer :: epochs = 10000 + integer, parameter :: num_epochs = 100000 - net = network([& - input(2, 3),& - linear2d(3),& - layernorm(),& - flatten()& + net = network([ & + input(2, 3), & + linear2d(3), & + layernorm(), & + flatten() & ]) ! 
Kaiming weights to achieve semblance of convergance @@ -177,17 +180,18 @@ subroutine test_layernorm_integration(ok) l % biases = 0.2 end select - do epoch = 1, epochs + do epoch = 1, num_epochs call net % forward(x) call net % backward(y) call net % update(optimizer=sgd(learning_rate=0.001)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do - if (.not. epoch <= epochs) then + if (.not. epoch <= num_epochs) then write(stderr, '(a)') & 'linear2d + layernorm should converge in simple training.. failed' ok = .false. end if end subroutine test_layernorm_integration + end program test_layernorm_instance diff --git a/test/test_locally_connected1d_layer.f90 b/test/test_locally_connected2d_layer.f90 similarity index 69% rename from test/test_locally_connected1d_layer.f90 rename to test/test_locally_connected2d_layer.f90 index e8a30cfc..0157b916 100644 --- a/test/test_locally_connected1d_layer.f90 +++ b/test/test_locally_connected2d_layer.f90 @@ -1,7 +1,7 @@ -program test_locally_connected1d_layer +program test_locally_connected2d_layer use iso_fortran_env, only: stderr => error_unit - use nf, only: locally_connected1d, input, layer + use nf, only: locally_connected, input, layer use nf_input2d_layer, only: input2d_layer implicit none @@ -12,21 +12,21 @@ program test_locally_connected1d_layer real, parameter :: tolerance = 1e-7 logical :: ok = .true. - locally_connected_1d_layer = locally_connected1d(filters, kernel_size) + locally_connected_1d_layer = locally_connected(filters, kernel_size) - if (.not. locally_connected_1d_layer % name == 'locally_connected1d') then + if (.not. locally_connected_1d_layer % name == 'locally_connected2d') then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer has its name set correctly.. failed' + write(stderr, '(a)') 'locally_connected2d layer has its name set correctly.. failed' end if if (locally_connected_1d_layer % initialized) then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer should not be marked as initialized yet.. failed' + write(stderr, '(a)') 'locally_connected2d layer should not be marked as initialized yet.. failed' end if if (.not. locally_connected_1d_layer % activation == 'relu') then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer defaults to relu activation.. failed' + write(stderr, '(a)') 'locally_connected2d layer defaults to relu activation.. failed' end if input_layer = input(3, 32) @@ -34,17 +34,17 @@ program test_locally_connected1d_layer if (.not. locally_connected_1d_layer % initialized) then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer should now be marked as initialized.. failed' + write(stderr, '(a)') 'locally_connected2d layer should now be marked as initialized.. failed' end if if (.not. all(locally_connected_1d_layer % input_layer_shape == [3, 32])) then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer input layer shape should be correct.. failed' + write(stderr, '(a)') 'locally_connected2d layer input layer shape should be correct.. failed' end if if (.not. all(locally_connected_1d_layer % layer_shape == [filters, 30])) then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer input layer shape should be correct.. failed' + write(stderr, '(a)') 'locally_connected2d layer input layer shape should be correct.. failed' end if ! 
Minimal locally_connected_1d layer: 1 channel, 3x3 pixel image; @@ -52,7 +52,7 @@ program test_locally_connected1d_layer sample_input = 0 input_layer = input(1, 3) - locally_connected_1d_layer = locally_connected1d(filters, kernel_size) + locally_connected_1d_layer = locally_connected(filters, kernel_size) call locally_connected_1d_layer % init(input_layer) select type(this_layer => input_layer % p); type is(input2d_layer) @@ -62,17 +62,16 @@ program test_locally_connected1d_layer call locally_connected_1d_layer % forward(input_layer) call locally_connected_1d_layer % get_output(output) - if (.not. all(abs(output) < tolerance)) then ok = .false. - write(stderr, '(a)') 'locally_connected1d layer with zero input and sigmoid function must forward to all 0.5.. failed' + write(stderr, '(a)') 'locally_connected2d layer with zero input and sigmoid function must forward to all 0.5.. failed' end if if (ok) then - print '(a)', 'test_locally_connected1d_layer: All tests passed.' + print '(a)', 'test_locally_connected2d_layer: All tests passed.' else - write(stderr, '(a)') 'test_locally_connected1d_layer: One or more tests failed.' + write(stderr, '(a)') 'test_locally_connected2d_layer: One or more tests failed.' stop 1 end if -end program test_locally_connected1d_layer +end program test_locally_connected2d_layer diff --git a/test/test_maxpool1d_layer.f90 b/test/test_maxpool1d_layer.f90 index 023a2c33..f3765686 100644 --- a/test/test_maxpool1d_layer.f90 +++ b/test/test_maxpool1d_layer.f90 @@ -1,7 +1,7 @@ program test_maxpool1d_layer use iso_fortran_env, only: stderr => error_unit - use nf, only: maxpool1d, input, layer + use nf, only: maxpool, input, layer use nf_input2d_layer, only: input2d_layer use nf_maxpool1d_layer, only: maxpool1d_layer @@ -16,7 +16,7 @@ program test_maxpool1d_layer integer :: i logical :: ok = .true., gradient_ok = .true. - maxpool_layer = maxpool1d(pool_size) + maxpool_layer = maxpool(pool_width=pool_size, stride=stride) if (.not. maxpool_layer % name == 'maxpool1d') then ok = .false. diff --git a/test/test_maxpool2d_layer.f90 b/test/test_maxpool2d_layer.f90 index 5983a217..29a56b57 100644 --- a/test/test_maxpool2d_layer.f90 +++ b/test/test_maxpool2d_layer.f90 @@ -1,7 +1,7 @@ program test_maxpool2d_layer use iso_fortran_env, only: stderr => error_unit - use nf, only: maxpool2d, input, layer + use nf, only: maxpool, input, layer use nf_input3d_layer, only: input3d_layer use nf_maxpool2d_layer, only: maxpool2d_layer @@ -16,7 +16,7 @@ program test_maxpool2d_layer integer :: i, j logical :: ok = .true., gradient_ok = .true. - maxpool_layer = maxpool2d(pool_size) + maxpool_layer = maxpool(pool_width=pool_size, pool_height=pool_size, stride=stride) if (.not. maxpool_layer % name == 'maxpool2d') then ok = .false.
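The two max-pooling tests above make the same assumption for the generic maxpool() constructor: pool_width and stride alone yield a 1-d max-pooling layer, and adding pool_height yields a 2-d one. A matching sketch, again for illustration only:

  program maxpool_dispatch_sketch
    ! Sketch only: checks the assumed 1-d vs 2-d dispatch of the generic maxpool().
    use nf, only: maxpool, layer
    implicit none
    type(layer) :: p1, p2
    p1 = maxpool(pool_width=2, stride=2)                  ! expected name: 'maxpool1d'
    p2 = maxpool(pool_width=2, pool_height=2, stride=2)   ! expected name: 'maxpool2d'
    print *, trim(p1 % name), ' ', trim(p2 % name)
  end program maxpool_dispatch_sketch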