diff --git a/CMakeLists.txt b/CMakeLists.txt index c1bf2231..fcd93915 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,8 @@ add_library(neural-fortran src/nf.f90 src/nf/nf_activation.f90 src/nf/nf_base_layer.f90 + src/nf/nf_conv1d_layer.f90 + src/nf/nf_conv1d_layer_submodule.f90 src/nf/nf_conv2d_layer.f90 src/nf/nf_conv2d_layer_submodule.f90 src/nf/nf_cross_attention_layer.f90 @@ -37,12 +39,20 @@ add_library(neural-fortran src/nf/nf_input3d_layer_submodule.f90 src/nf/nf_layer_constructors.f90 src/nf/nf_layer_constructors_submodule.f90 + src/nf/nf_layernorm.f90 + src/nf/nf_layernorm_submodule.f90 src/nf/nf_layer.f90 src/nf/nf_layer_submodule.f90 + src/nf/nf_locally_connected1d_layer_submodule.f90 + src/nf/nf_locally_connected1d_layer.f90 src/nf/nf_linear2d_layer.f90 src/nf/nf_linear2d_layer_submodule.f90 + src/nf/nf_embedding_layer.f90 + src/nf/nf_embedding_layer_submodule.f90 src/nf/nf_loss.f90 src/nf/nf_loss_submodule.f90 + src/nf/nf_maxpool1d_layer.f90 + src/nf/nf_maxpool1d_layer_submodule.f90 src/nf/nf_maxpool2d_layer.f90 src/nf/nf_maxpool2d_layer_submodule.f90 src/nf/nf_metrics.f90 @@ -56,6 +66,8 @@ add_library(neural-fortran src/nf/nf_random.f90 src/nf/nf_reshape_layer.f90 src/nf/nf_reshape_layer_submodule.f90 + src/nf/nf_reshape2d_layer.f90 + src/nf/nf_reshape2d_layer_submodule.f90 src/nf/nf_self_attention_layer.f90 src/nf/io/nf_io_binary.f90 src/nf/io/nf_io_binary_submodule.f90 diff --git a/README.md b/README.md index a04ac32a..75da6525 100644 --- a/README.md +++ b/README.md @@ -30,17 +30,21 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). | Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass | |------------|------------------|------------------------|----------------------|--------------|---------------| | Input | `input` | n/a | 1, 2, 3 | n/a | n/a | +| Embedding | `embedding` | n/a | 2 | ✅ | ✅ | | Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ | | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ | -| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) | +| Locally connected (1-d) | `locally_connected1d` | `input2d`, `locally_connected1d`, `conv1d`, `maxpool1d`, `reshape2d` | 2 | ✅ | ✅ | +| Convolutional (1-d) | `conv1d` | `input2d`, `conv1d`, `maxpool1d`, `reshape2d` | 2 | ✅ | ✅ | +| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | +| Max-pooling (1-d) | `maxpool1d` | `input2d`, `conv1d`, `maxpool1d`, `reshape2d` | 2 | ✅ | ✅ | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | -| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | -| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ | | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | +| Reshape (1-d to 2-d) | `reshape2d` | `input2d`, `conv1d`, `locally_connected1d`, `maxpool1d` | 2 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ | -(*) See Issue [#145](https://github.com/modern-fortran/neural-fortran/issues/145) regarding non-converging CNN 
training on the MNIST dataset. - ## Getting started Get the code: @@ -265,7 +269,9 @@ Thanks to all open-source contributors to neural-fortran: [jvdp1](https://github.com/jvdp1), [jvo203](https://github.com/jvo203), [milancurcic](https://github.com/milancurcic), +[OneAdder](https://github.com/OneAdder), [pirpyn](https://github.com/pirpyn), +[rico07](https://github.com/ricor07), [rouson](https://github.com/rouson), [rweed](https://github.com/rweed), [Spnetic-5](https://github.com/Spnetic-5), diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index f4b706b8..b131c47c 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -1,5 +1,6 @@ foreach(execid cnn_mnist + cnn_mnist_1d dense_mnist get_set_network_params network_parameters diff --git a/example/cnn_mnist.f90 b/example/cnn_mnist.f90 index bec50b80..ef22f986 100644 --- a/example/cnn_mnist.f90 +++ b/example/cnn_mnist.f90 @@ -12,7 +12,7 @@ program cnn_mnist real, allocatable :: validation_images(:,:), validation_labels(:) real, allocatable :: testing_images(:,:), testing_labels(:) integer :: n - integer, parameter :: num_epochs = 10 + integer, parameter :: num_epochs = 250 call load_mnist(training_images, training_labels, & validation_images, validation_labels, & @@ -35,9 +35,9 @@ program cnn_mnist call net % train( & training_images, & label_digits(training_labels), & - batch_size=128, & + batch_size=16, & epochs=1, & - optimizer=sgd(learning_rate=3.) & + optimizer=sgd(learning_rate=0.001) & ) print '(a,i2,a,f5.2,a)', 'Epoch ', n, ' done, Accuracy: ', accuracy( & diff --git a/example/cnn_mnist_1d.f90 b/example/cnn_mnist_1d.f90 new file mode 100644 index 00000000..7e978034 --- /dev/null +++ b/example/cnn_mnist_1d.f90 @@ -0,0 +1,67 @@ +program cnn_mnist_1d + + use nf, only: network, sgd, & + input, conv1d, maxpool1d, flatten, dense, reshape, reshape2d, locally_connected1d, & + load_mnist, label_digits, softmax, relu + + implicit none + + type(network) :: net + + real, allocatable :: training_images(:,:), training_labels(:) + real, allocatable :: validation_images(:,:), validation_labels(:) + real, allocatable :: testing_images(:,:), testing_labels(:) + integer :: n + integer, parameter :: num_epochs = 250 + + call load_mnist(training_images, training_labels, & + validation_images, validation_labels, & + testing_images, testing_labels) + + net = network([ & + input(784), & + reshape2d([28, 28]), & + locally_connected1d(filters=8, kernel_size=3, activation=relu()), & + maxpool1d(pool_size=2), & + locally_connected1d(filters=16, kernel_size=3, activation=relu()), & + maxpool1d(pool_size=2), & + dense(10, activation=softmax()) & + ]) + + call net % print_info() + + epochs: do n = 1, num_epochs + + call net % train( & + training_images, & + label_digits(training_labels), & + batch_size=16, & + epochs=1, & + optimizer=sgd(learning_rate=0.01) & + ) + + print '(a,i2,a,f5.2,a)', 'Epoch ', n, ' done, Accuracy: ', accuracy( & + net, validation_images, label_digits(validation_labels)) * 100, ' %' + + end do epochs + + print '(a,f5.2,a)', 'Testing accuracy: ', & + accuracy(net, testing_images, label_digits(testing_labels)) * 100, '%' + + contains + + real function accuracy(net, x, y) + type(network), intent(in out) :: net + real, intent(in) :: x(:,:), y(:,:) + integer :: i, good + good = 0 + do i = 1, size(x, dim=2) + if (all(maxloc(net % predict(x(:,i))) == maxloc(y(:,i)))) then + good = good + 1 + end if + end do + accuracy = real(good) / size(x, dim=2) + end function accuracy + + end program cnn_mnist_1d + \ No newline at end of 
file diff --git a/fpm.toml b/fpm.toml index ebcceeb6..15a746e4 100644 --- a/fpm.toml +++ b/fpm.toml @@ -1,5 +1,5 @@ name = "neural-fortran" -version = "0.19.0" +version = "0.20.0" license = "MIT" author = "Milan Curcic" maintainer = "mcurcic@miami.edu" diff --git a/src/nf.f90 b/src/nf.f90 index 39f67ea3..172eafb3 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -3,14 +3,20 @@ module nf use nf_datasets_mnist, only: label_digits, load_mnist use nf_layer, only: layer use nf_layer_constructors, only: & + conv1d, & conv2d, & dense, & dropout, & + embedding, & flatten, & input, & + layernorm, & linear2d, & + locally_connected1d, & + maxpool1d, & maxpool2d, & reshape, & + reshape2d, & self_attention use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs diff --git a/src/nf/nf_activation.f90 b/src/nf/nf_activation.f90 index 309b43d2..caeab138 100644 --- a/src/nf/nf_activation.f90 +++ b/src/nf/nf_activation.f90 @@ -25,12 +25,14 @@ module nf_activation contains procedure(eval_1d_i), deferred :: eval_1d procedure(eval_1d_i), deferred :: eval_1d_prime + procedure(eval_2d_i), deferred :: eval_2d + procedure(eval_2d_i), deferred :: eval_2d_prime procedure(eval_3d_i), deferred :: eval_3d procedure(eval_3d_i), deferred :: eval_3d_prime procedure :: get_name - generic :: eval => eval_1d, eval_3d - generic :: eval_prime => eval_1d_prime, eval_3d_prime + generic :: eval => eval_1d, eval_2d, eval_3d + generic :: eval_prime => eval_1d_prime, eval_2d_prime, eval_3d_prime end type activation_function @@ -43,6 +45,13 @@ pure function eval_1d_i(self, x) result(res) real :: res(size(x)) end function eval_1d_i + pure function eval_2d_i(self, x) result(res) + import :: activation_function + class(activation_function), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + end function eval_2d_i + pure function eval_3d_i(self, x) result(res) import :: activation_function class(activation_function), intent(in) :: self @@ -57,6 +66,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_elu procedure :: eval_1d_prime => eval_1d_elu_prime + procedure :: eval_2d => eval_2d_elu + procedure :: eval_2d_prime => eval_2d_elu_prime procedure :: eval_3d => eval_3d_elu procedure :: eval_3d_prime => eval_3d_elu_prime end type elu @@ -65,6 +76,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_exponential procedure :: eval_1d_prime => eval_1d_exponential + procedure :: eval_2d => eval_2d_exponential + procedure :: eval_2d_prime => eval_2d_exponential procedure :: eval_3d => eval_3d_exponential procedure :: eval_3d_prime => eval_3d_exponential end type exponential @@ -73,6 +86,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_gaussian procedure :: eval_1d_prime => eval_1d_gaussian_prime + procedure :: eval_2d => eval_2d_gaussian + procedure :: eval_2d_prime => eval_2d_gaussian_prime procedure :: eval_3d => eval_3d_gaussian procedure :: eval_3d_prime => eval_3d_gaussian_prime end type gaussian @@ -81,6 +96,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_linear procedure :: eval_1d_prime => eval_1d_linear_prime + procedure :: eval_2d => eval_2d_linear + procedure :: eval_2d_prime => eval_2d_linear_prime procedure :: eval_3d => eval_3d_linear procedure :: eval_3d_prime => eval_3d_linear_prime end type linear @@ -89,6 +106,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_relu procedure :: eval_1d_prime => eval_1d_relu_prime + procedure :: eval_2d => eval_2d_relu + procedure :: eval_2d_prime => 
eval_2d_relu_prime procedure :: eval_3d => eval_3d_relu procedure :: eval_3d_prime => eval_3d_relu_prime end type relu @@ -98,6 +117,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_leaky_relu procedure :: eval_1d_prime => eval_1d_leaky_relu_prime + procedure :: eval_2d => eval_2d_leaky_relu + procedure :: eval_2d_prime => eval_2d_leaky_relu_prime procedure :: eval_3d => eval_3d_leaky_relu procedure :: eval_3d_prime => eval_3d_leaky_relu_prime end type leaky_relu @@ -106,6 +127,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_sigmoid procedure :: eval_1d_prime => eval_1d_sigmoid_prime + procedure :: eval_2d => eval_2d_sigmoid + procedure :: eval_2d_prime => eval_2d_sigmoid_prime procedure :: eval_3d => eval_3d_sigmoid procedure :: eval_3d_prime => eval_3d_sigmoid_prime end type sigmoid @@ -114,6 +137,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_softmax procedure :: eval_1d_prime => eval_1d_softmax_prime + procedure :: eval_2d => eval_2d_softmax + procedure :: eval_2d_prime => eval_2d_softmax_prime procedure :: eval_3d => eval_3d_softmax procedure :: eval_3d_prime => eval_3d_softmax_prime end type softmax @@ -122,6 +147,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_softplus procedure :: eval_1d_prime => eval_1d_softplus_prime + procedure :: eval_2d => eval_2d_softplus + procedure :: eval_2d_prime => eval_2d_softplus_prime procedure :: eval_3d => eval_3d_softplus procedure :: eval_3d_prime => eval_3d_softplus_prime end type softplus @@ -130,6 +157,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_step procedure :: eval_1d_prime => eval_1d_step_prime + procedure :: eval_2d => eval_2d_step + procedure :: eval_2d_prime => eval_2d_step_prime procedure :: eval_3d => eval_3d_step procedure :: eval_3d_prime => eval_3d_step_prime end type step @@ -138,6 +167,8 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_tanh procedure :: eval_1d_prime => eval_1d_tanh_prime + procedure :: eval_2d => eval_2d_tanh + procedure :: eval_2d_prime => eval_2d_tanh_prime procedure :: eval_3d => eval_3d_tanh procedure :: eval_3d_prime => eval_3d_tanh_prime end type tanhf @@ -147,14 +178,16 @@ end function eval_3d_i contains procedure :: eval_1d => eval_1d_celu procedure :: eval_1d_prime => eval_1d_celu_prime + procedure :: eval_2d => eval_2d_celu + procedure :: eval_2d_prime => eval_2d_celu_prime procedure :: eval_3d => eval_3d_celu procedure :: eval_3d_prime => eval_3d_celu_prime end type celu contains + ! ELU Activation Functions pure function eval_1d_elu(self, x) result(res) - ! Exponential Linear Unit (ELU) activation function. class(elu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -166,8 +199,6 @@ pure function eval_1d_elu(self, x) result(res) end function eval_1d_elu pure function eval_1d_elu_prime(self, x) result(res) - ! First derivative of the Exponential Linear Unit (ELU) - ! activation function. 
class(elu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -178,8 +209,29 @@ pure function eval_1d_elu_prime(self, x) result(res) end where end function eval_1d_elu_prime + pure function eval_2d_elu(self, x) result(res) + class(elu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + where (x >= 0) + res = x + elsewhere + res = self % alpha * (exp(x) - 1) + end where + end function eval_2d_elu + + pure function eval_2d_elu_prime(self, x) result(res) + class(elu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + where (x >= 0) + res = 1 + elsewhere + res = self % alpha * exp(x) + end where + end function eval_2d_elu_prime + pure function eval_3d_elu(self, x) result(res) - ! Exponential Linear Unit (ELU) activation function. class(elu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -191,8 +243,6 @@ pure function eval_3d_elu(self, x) result(res) end function eval_3d_elu pure function eval_3d_elu_prime(self, x) result(res) - ! First derivative of the Exponential Linear Unit (ELU) - ! activation function. class(elu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -203,24 +253,30 @@ pure function eval_3d_elu_prime(self, x) result(res) end where end function eval_3d_elu_prime + ! Exponential Activation Functions pure function eval_1d_exponential(self, x) result(res) - ! Exponential activation function. class(exponential), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = exp(x) end function eval_1d_exponential + pure function eval_2d_exponential(self, x) result(res) + class(exponential), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = exp(x) + end function eval_2d_exponential + pure function eval_3d_exponential(self, x) result(res) - ! Exponential activation function. class(exponential), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = exp(x) end function eval_3d_exponential + ! Gaussian Activation Functions pure function eval_1d_gaussian(self, x) result(res) - ! Gaussian activation function. class(gaussian), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -228,15 +284,27 @@ pure function eval_1d_gaussian(self, x) result(res) end function eval_1d_gaussian pure function eval_1d_gaussian_prime(self, x) result(res) - ! First derivative of the Gaussian activation function. class(gaussian), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = -2 * x * self % eval_1d(x) end function eval_1d_gaussian_prime + pure function eval_2d_gaussian(self, x) result(res) + class(gaussian), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = exp(-x**2) + end function eval_2d_gaussian + + pure function eval_2d_gaussian_prime(self, x) result(res) + class(gaussian), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = -2 * x * self % eval_2d(x) + end function eval_2d_gaussian_prime + pure function eval_3d_gaussian(self, x) result(res) - ! Gaussian activation function. class(gaussian), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -244,15 +312,14 @@ pure function eval_3d_gaussian(self, x) result(res) end function eval_3d_gaussian pure function eval_3d_gaussian_prime(self, x) result(res) - ! First derivative of the Gaussian activation function. 
class(gaussian), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = -2 * x * self % eval_3d(x) end function eval_3d_gaussian_prime + ! Linear Activation Functions pure function eval_1d_linear(self, x) result(res) - ! Linear activation function. class(linear), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -260,15 +327,27 @@ pure function eval_1d_linear(self, x) result(res) end function eval_1d_linear pure function eval_1d_linear_prime(self, x) result(res) - ! First derivative of the Linear activation function. class(linear), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = 1 end function eval_1d_linear_prime + pure function eval_2d_linear(self, x) result(res) + class(linear), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = x + end function eval_2d_linear + + pure function eval_2d_linear_prime(self, x) result(res) + class(linear), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = 1 + end function eval_2d_linear_prime + pure function eval_3d_linear(self, x) result(res) - ! Linear activation function. class(linear), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -276,15 +355,14 @@ pure function eval_3d_linear(self, x) result(res) end function eval_3d_linear pure function eval_3d_linear_prime(self, x) result(res) - ! First derivative of the Linear activation function. class(linear), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = 1 end function eval_3d_linear_prime + ! ReLU Activation Functions pure function eval_1d_relu(self, x) result(res) - !! Rectified Linear Unit (ReLU) activation function. class(relu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -292,15 +370,27 @@ pure function eval_1d_relu(self, x) result(res) end function eval_1d_relu pure function eval_1d_relu_prime(self, x) result(res) - ! First derivative of the Rectified Linear Unit (ReLU) activation function. class(relu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = merge(1., 0., x > 0) end function eval_1d_relu_prime + pure function eval_2d_relu(self, x) result(res) + class(relu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = max(0., x) + end function eval_2d_relu + + pure function eval_2d_relu_prime(self, x) result(res) + class(relu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = merge(1., 0., x > 0) + end function eval_2d_relu_prime + pure function eval_3d_relu(self, x) result(res) - !! Rectified Linear Unit (ReLU) activation function. class(relu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -308,15 +398,14 @@ pure function eval_3d_relu(self, x) result(res) end function eval_3d_relu pure function eval_3d_relu_prime(self, x) result(res) - ! First derivative of the Rectified Linear Unit (ReLU) activation function. class(relu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = merge(1., 0., x > 0) end function eval_3d_relu_prime + ! Leaky ReLU Activation Functions pure function eval_1d_leaky_relu(self, x) result(res) - !! Leaky Rectified Linear Unit (Leaky ReLU) activation function. 
class(leaky_relu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -324,15 +413,27 @@ pure function eval_1d_leaky_relu(self, x) result(res) end function eval_1d_leaky_relu pure function eval_1d_leaky_relu_prime(self, x) result(res) - ! First derivative of the Leaky Rectified Linear Unit (Leaky ReLU) activation function. class(leaky_relu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = merge(1., self%alpha, x > 0) end function eval_1d_leaky_relu_prime + pure function eval_2d_leaky_relu(self, x) result(res) + class(leaky_relu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = max(self % alpha * x, x) + end function eval_2d_leaky_relu + + pure function eval_2d_leaky_relu_prime(self, x) result(res) + class(leaky_relu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = merge(1., self%alpha, x > 0) + end function eval_2d_leaky_relu_prime + pure function eval_3d_leaky_relu(self, x) result(res) - !! Leaky Rectified Linear Unit (Leaky ReLU) activation function. class(leaky_relu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -340,47 +441,57 @@ pure function eval_3d_leaky_relu(self, x) result(res) end function eval_3d_leaky_relu pure function eval_3d_leaky_relu_prime(self, x) result(res) - ! First derivative of the Leaky Rectified Linear Unit (Leaky ReLU) activation function. class(leaky_relu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = merge(1., self%alpha, x > 0) end function eval_3d_leaky_relu_prime + ! Sigmoid Activation Functions pure function eval_1d_sigmoid(self, x) result(res) - ! Sigmoid activation function. class(sigmoid), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = 1 / (1 + exp(-x)) - endfunction eval_1d_sigmoid + end function eval_1d_sigmoid pure function eval_1d_sigmoid_prime(self, x) result(res) - ! First derivative of the sigmoid activation function. class(sigmoid), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = self % eval_1d(x) * (1 - self % eval_1d(x)) end function eval_1d_sigmoid_prime + pure function eval_2d_sigmoid(self, x) result(res) + class(sigmoid), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = 1 / (1 + exp(-x)) + end function eval_2d_sigmoid + + pure function eval_2d_sigmoid_prime(self, x) result(res) + class(sigmoid), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = self % eval_2d(x) * (1 - self % eval_2d(x)) + end function eval_2d_sigmoid_prime + pure function eval_3d_sigmoid(self, x) result(res) - ! Sigmoid activation function. class(sigmoid), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = 1 / (1 + exp(-x)) - endfunction eval_3d_sigmoid + end function eval_3d_sigmoid pure function eval_3d_sigmoid_prime(self, x) result(res) - ! First derivative of the sigmoid activation function. class(sigmoid), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = self % eval_3d(x) * (1 - self % eval_3d(x)) end function eval_3d_sigmoid_prime + ! Softmax Activation Functions pure function eval_1d_softmax(self, x) result(res) - !! 
Softmax activation function class(softmax), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -389,15 +500,28 @@ pure function eval_1d_softmax(self, x) result(res) end function eval_1d_softmax pure function eval_1d_softmax_prime(self, x) result(res) - !! Derivative of the softmax activation function. class(softmax), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = self%eval_1d(x) * (1 - self%eval_1d(x)) end function eval_1d_softmax_prime + pure function eval_2d_softmax(self, x) result(res) + class(softmax), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = exp(x - maxval(x)) + res = res / sum(res) + end function eval_2d_softmax + + pure function eval_2d_softmax_prime(self, x) result(res) + class(softmax), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = self % eval_2d(x) * (1 - self % eval_2d(x)) + end function eval_2d_softmax_prime + pure function eval_3d_softmax(self, x) result(res) - !! Softmax activation function class(softmax), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -406,15 +530,14 @@ pure function eval_3d_softmax(self, x) result(res) end function eval_3d_softmax pure function eval_3d_softmax_prime(self, x) result(res) - !! Derivative of the softmax activation function. class(softmax), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = self % eval_3d(x) * (1 - self % eval_3d(x)) end function eval_3d_softmax_prime + ! Softplus Activation Functions pure function eval_1d_softplus(self, x) result(res) - ! Softplus activation function. class(softplus), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -422,15 +545,27 @@ pure function eval_1d_softplus(self, x) result(res) end function eval_1d_softplus pure function eval_1d_softplus_prime(self, x) result(res) - class(softplus), intent(in) :: self - ! First derivative of the softplus activation function. + class(softplus), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = exp(x) / (exp(x) + 1) end function eval_1d_softplus_prime + pure function eval_2d_softplus(self, x) result(res) + class(softplus), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = log(exp(x) + 1) + end function eval_2d_softplus + + pure function eval_2d_softplus_prime(self, x) result(res) + class(softplus), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = exp(x) / (exp(x) + 1) + end function eval_2d_softplus_prime + pure function eval_3d_softplus(self, x) result(res) - ! Softplus activation function. class(softplus), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -438,15 +573,14 @@ pure function eval_3d_softplus(self, x) result(res) end function eval_3d_softplus pure function eval_3d_softplus_prime(self, x) result(res) - class(softplus), intent(in) :: self - ! First derivative of the softplus activation function. + class(softplus), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = exp(x) / (exp(x) + 1) end function eval_3d_softplus_prime + ! Step Activation Functions pure function eval_1d_step(self, x) result(res) - ! Step activation function. 
class(step), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -454,15 +588,27 @@ pure function eval_1d_step(self, x) result(res) end function eval_1d_step pure function eval_1d_step_prime(self, x) result(res) - ! First derivative of the step activation function. class(step), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = 0 end function eval_1d_step_prime + pure function eval_2d_step(self, x) result(res) + class(step), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = merge(1., 0., x > 0) + end function eval_2d_step + + pure function eval_2d_step_prime(self, x) result(res) + class(step), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = 0 + end function eval_2d_step_prime + pure function eval_3d_step(self, x) result(res) - ! Step activation function. class(step), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -470,15 +616,14 @@ pure function eval_3d_step(self, x) result(res) end function eval_3d_step pure function eval_3d_step_prime(self, x) result(res) - ! First derivative of the step activation function. class(step), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = 0 end function eval_3d_step_prime + ! Tanh Activation Functions pure function eval_1d_tanh(self, x) result(res) - ! Tangent hyperbolic activation function. class(tanhf), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -486,15 +631,27 @@ pure function eval_1d_tanh(self, x) result(res) end function eval_1d_tanh pure function eval_1d_tanh_prime(self, x) result(res) - ! First derivative of the tanh activation function. class(tanhf), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) res = 1 - tanh(x)**2 end function eval_1d_tanh_prime + pure function eval_2d_tanh(self, x) result(res) + class(tanhf), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = tanh(x) + end function eval_2d_tanh + + pure function eval_2d_tanh_prime(self, x) result(res) + class(tanhf), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + res = 1 - tanh(x)**2 + end function eval_2d_tanh_prime + pure function eval_3d_tanh(self, x) result(res) - ! Tangent hyperbolic activation function. class(tanhf), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -502,15 +659,14 @@ pure function eval_3d_tanh(self, x) result(res) end function eval_3d_tanh pure function eval_3d_tanh_prime(self, x) result(res) - ! First derivative of the tanh activation function. class(tanhf), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) res = 1 - tanh(x)**2 end function eval_3d_tanh_prime + ! CELU Activation Functions pure function eval_1d_celu(self, x) result(res) - ! Celu activation function. class(celu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -519,10 +675,9 @@ pure function eval_1d_celu(self, x) result(res) else where res = self % alpha * (exp(x / self % alpha) - 1.0) end where - end function + end function eval_1d_celu pure function eval_1d_celu_prime(self, x) result(res) - ! Celu activation function. 
class(celu), intent(in) :: self real, intent(in) :: x(:) real :: res(size(x)) @@ -531,10 +686,31 @@ pure function eval_1d_celu_prime(self, x) result(res) else where res = exp(x / self % alpha) end where - end function + end function eval_1d_celu_prime + + pure function eval_2d_celu(self, x) result(res) + class(celu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + where (x >= 0.0) + res = x + else where + res = self % alpha * (exp(x / self % alpha) - 1.0) + end where + end function eval_2d_celu + + pure function eval_2d_celu_prime(self, x) result(res) + class(celu), intent(in) :: self + real, intent(in) :: x(:,:) + real :: res(size(x,1),size(x,2)) + where (x >= 0.0) + res = 1.0 + else where + res = exp(x / self % alpha) + end where + end function eval_2d_celu_prime pure function eval_3d_celu(self, x) result(res) - ! Celu activation function. class(celu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -543,10 +719,9 @@ pure function eval_3d_celu(self, x) result(res) else where res = self % alpha * (exp(x / self % alpha) - 1.0) end where - end function + end function eval_3d_celu pure function eval_3d_celu_prime(self, x) result(res) - ! Celu activation function. class(celu), intent(in) :: self real, intent(in) :: x(:,:,:) real :: res(size(x,1),size(x,2),size(x,3)) @@ -555,13 +730,10 @@ pure function eval_3d_celu_prime(self, x) result(res) else where res = exp(x / self % alpha) end where - end function + end function eval_3d_celu_prime + ! Utility Functions function get_activation_by_name(activation_name) result(res) - ! Workaround to get activation_function with some - ! hardcoded default parameters by its name. - ! Need this function since we get only activation name - ! from keras files. character(len=*), intent(in) :: activation_name class(activation_function), allocatable :: res @@ -611,16 +783,8 @@ function get_activation_by_name(activation_name) result(res) end function get_activation_by_name pure function get_name(self) result(name) - !! Return the name of the activation function. - !! - !! Normally we would place this in the definition of each type, however - !! accessing the name variable directly from the type would require type - !! guards just like we have here. This at least keeps all the type guards - !! in one place. class(activation_function), intent(in) :: self - !! The activation function instance. character(:), allocatable :: name - !! The name of the activation function. select type (self) class is (elu) name = 'elu' @@ -651,4 +815,4 @@ pure function get_name(self) result(name) end select end function get_name -end module nf_activation +end module nf_activation \ No newline at end of file diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 new file mode 100644 index 00000000..c39b11fc --- /dev/null +++ b/src/nf/nf_conv1d_layer.f90 @@ -0,0 +1,119 @@ +module nf_conv1d_layer + !! This modules provides a 1-d convolutional `conv1d` type. + + use nf_activation, only: activation_function + use nf_base_layer, only: base_layer + implicit none + + private + public :: conv1d_layer + + type, extends(base_layer) :: conv1d_layer + + integer :: width + integer :: height + integer :: channels + integer :: kernel_size + integer :: filters + + real, allocatable :: biases(:) ! size(filters) + real, allocatable :: kernel(:,:,:) ! filters x channels x window + real, allocatable :: output(:,:) ! filters x output_width + real, allocatable :: z(:,:) ! kernel .dot. 
input + bias + + real, allocatable :: dw(:,:,:) ! weight (kernel) gradients + real, allocatable :: db(:) ! bias gradients + real, allocatable :: gradient(:,:) + + class(activation_function), allocatable :: activation + + contains + + procedure :: forward + procedure :: backward + procedure :: get_gradients + procedure :: get_num_params + procedure :: get_params + procedure :: init + procedure :: set_params + + end type conv1d_layer + + interface conv1d_layer + module function conv1d_layer_cons(filters, kernel_size, activation) & + result(res) + !! `conv1d_layer` constructor function + integer, intent(in) :: filters + integer, intent(in) :: kernel_size + class(activation_function), intent(in) :: activation + type(conv1d_layer) :: res + end function conv1d_layer_cons + end interface conv1d_layer + + interface + + module subroutine init(self, input_shape) + !! Initialize the layer data structures. + !! + !! This is a deferred procedure from the `base_layer` abstract type. + class(conv1d_layer), intent(in out) :: self + !! A `conv1d_layer` instance + integer, intent(in) :: input_shape(:) + !! Input layer dimensions + end subroutine init + + pure module subroutine forward(self, input) + !! Apply a forward pass on the `conv1d` layer. + class(conv1d_layer), intent(in out) :: self + !! A `conv1d_layer` instance + real, intent(in) :: input(:,:) + !! Input data + end subroutine forward + + pure module subroutine backward(self, input, gradient) + !! Apply a backward pass on the `conv1d` layer. + class(conv1d_layer), intent(in out) :: self + !! A `conv1d_layer` instance + real, intent(in) :: input(:,:) + !! Input data (previous layer) + real, intent(in) :: gradient(:,:) + !! Gradient (next layer) + end subroutine backward + + pure module function get_num_params(self) result(num_params) + !! Get the number of parameters in the layer. + class(conv1d_layer), intent(in) :: self + !! A `conv1d_layer` instance + integer :: num_params + !! Number of parameters + end function get_num_params + + module function get_params(self) result(params) + !! Return the parameters (weights and biases) of this layer. + !! The parameters are ordered as weights first, biases second. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, allocatable :: params(:) + !! Parameters to get + end function get_params + + module function get_gradients(self) result(gradients) + !! Return the gradients of this layer. + !! The gradients are ordered as weights first, biases second. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, allocatable :: gradients(:) + !! Gradients to get + end function get_gradients + + module subroutine set_params(self, params) + !! Set the parameters of the layer. + class(conv1d_layer), intent(in out) :: self + !! A `conv1d_layer` instance + real, intent(in) :: params(:) + !! 
Parameters to set + end subroutine set_params + + end interface + +end module nf_conv1d_layer diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 new file mode 100644 index 00000000..5404b9c7 --- /dev/null +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -0,0 +1,178 @@ +submodule(nf_conv1d_layer) nf_conv1d_layer_submodule + + use nf_activation, only: activation_function + use nf_random, only: random_normal + + implicit none + +contains + + module function conv1d_layer_cons(filters, kernel_size, activation) result(res) + integer, intent(in) :: filters + integer, intent(in) :: kernel_size + class(activation_function), intent(in) :: activation + type(conv1d_layer) :: res + + res % kernel_size = kernel_size + res % filters = filters + res % activation_name = activation % get_name() + allocate( res % activation, source = activation ) + end function conv1d_layer_cons + + module subroutine init(self, input_shape) + implicit none + class(conv1d_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + self % channels = input_shape(1) + self % width = input_shape(2) - self % kernel_size + 1 + + ! Output of shape: filters x width + allocate(self % output(self % filters, self % width)) + self % output = 0 + + ! Kernel of shape: filters x channels x kernel_size + allocate(self % kernel(self % filters, self % channels, self % kernel_size)) + call random_normal(self % kernel) + self % kernel = self % kernel / self % kernel_size + + allocate(self % biases(self % filters)) + self % biases = 0 + + allocate(self % z, mold=self % output) + self % z = 0 + + allocate(self % gradient(input_shape(1), input_shape(2))) + self % gradient = 0 + + allocate(self % dw, mold=self % kernel) + self % dw = 0 + + allocate(self % db, mold=self % biases) + self % db = 0 + + end subroutine init + + pure module subroutine forward(self, input) + implicit none + class(conv1d_layer), intent(in out) :: self + real, intent(in) :: input(:,:) + integer :: input_channels, input_width + integer :: j, n + integer :: iws, iwe + + input_channels = size(input, dim=1) + input_width = size(input, dim=2) + + ! Loop over output positions. + do j = 1, self % width + ! Compute the input window corresponding to output index j. + ! In forward: center index = j + half_window, so window = indices j to j+kernel_size-1. + iws = j + iwe = j + self % kernel_size - 1 + + ! For each filter, compute the convolution (inner product over channels and kernel width). + do concurrent (n = 1:self % filters) + self % z(n, j) = sum(self % kernel(n,:,:) * input(:,iws:iwe)) + end do + + ! Add the bias for each filter. + self % z(:,j) = self % z(:,j) + self % biases + end do + + ! Apply the activation function. + self % output = self % activation % eval(self % z) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + implicit none + class(conv1d_layer), intent(in out) :: self + ! 'input' has shape: (channels, input_width) + ! 'gradient' (dL/dy) has shape: (filters, output_width) + real, intent(in) :: input(:,:) + real, intent(in) :: gradient(:,:) + + integer :: input_channels, input_width, output_width + integer :: j, n, k + integer :: iws, iwe + + ! Local arrays to accumulate gradients. + real :: gdz(self % filters, self % width) ! local gradient (dL/dz) + real :: db_local(self % filters) + real :: dw_local(self % filters, self % channels, self % kernel_size) + + ! Determine dimensions. 
+ input_channels = size(input, dim=1) + input_width = size(input, dim=2) + output_width = self % width ! Note: output_width = input_width - kernel_size + 1 + + !--- Compute the local gradient gdz = (dL/dy) * sigma'(z) for each output. + gdz = gradient * self % activation % eval_prime(self % z) + + !--- Compute bias gradients: db(n) = sum_j gdz(n, j) + db_local = sum(gdz, dim=2) + + !--- Initialize weight gradient and input gradient accumulators. + dw_local = 0.0 + self % gradient = 0.0 + + !--- Accumulate gradients over each output position. + ! In the forward pass the window for output index j was: + ! iws = j, iwe = j + kernel_size - 1. + do n = 1, self % filters + do j = 1, output_width + iws = j + iwe = j + self % kernel_size - 1 + do k = 1, self % channels + ! Weight gradient: accumulate contribution from the input window. + dw_local(n,k,:) = dw_local(n,k,:) + input(k,iws:iwe) * gdz(n,j) + ! Input gradient: propagate gradient back to the input window. + self % gradient(k,iws:iwe) = self % gradient(k,iws:iwe) + self % kernel(n,k,:) * gdz(n,j) + end do + end do + end do + + !--- Update stored gradients. + self % dw = self % dw + dw_local + self % db = self % db + db_local + + end subroutine backward + + pure module function get_num_params(self) result(num_params) + class(conv1d_layer), intent(in) :: self + integer :: num_params + num_params = product(shape(self % kernel)) + size(self % biases) + end function get_num_params + + module function get_params(self) result(params) + class(conv1d_layer), intent(in), target :: self + real, allocatable :: params(:) + real, pointer :: w_(:) => null() + w_(1:size(self % kernel)) => self % kernel + params = [ w_, self % biases] + end function get_params + + module function get_gradients(self) result(gradients) + class(conv1d_layer), intent(in), target :: self + real, allocatable :: gradients(:) + real, pointer :: dw_(:) => null() + dw_(1:size(self % dw)) => self % dw + gradients = [ dw_, self % db ] + end function get_gradients + + module subroutine set_params(self, params) + class(conv1d_layer), intent(in out) :: self + real, intent(in) :: params(:) + + if (size(params) /= self % get_num_params()) then + error stop 'conv1d_layer % set_params: Number of parameters does not match' + end if + + self % kernel = reshape(params(:product(shape(self % kernel))), shape(self % kernel)) + associate(n => product(shape(self % kernel))) + self % biases = params(n + 1 : n + self % filters) + end associate + + end subroutine set_params + +end submodule nf_conv1d_layer_submodule diff --git a/src/nf/nf_embedding_layer.f90 b/src/nf/nf_embedding_layer.f90 new file mode 100644 index 00000000..94a868a5 --- /dev/null +++ b/src/nf/nf_embedding_layer.f90 @@ -0,0 +1,98 @@ +module nf_embedding_layer + + use nf_activation, only: activation_function + use nf_base_layer, only: base_layer + + implicit none + + private + public :: embedding_layer + + type, extends(base_layer) :: embedding_layer + !! Embedding Layer + !! Stores inputs as a trainable lookup table. Inputs are + !! integer indicies in a dictionary of `vocab_size`. + !! This layer converts them into a table of shape + !! (`sequence_length`, `model_dimension`) + integer :: sequence_length, vocab_size, model_dimension + integer :: positional + + real, allocatable :: weights(:, :) + real, allocatable :: output(:, :) + real, allocatable :: dw(:, :) ! 
weight gradients + + contains + + procedure :: backward + procedure :: forward + procedure :: positional_trigonometric + procedure :: positional_absolute + procedure :: init + procedure :: get_num_params + procedure :: get_params + procedure :: get_gradients + procedure :: set_params + + end type embedding_layer + + interface embedding_layer + module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res) + integer, intent(in) :: vocab_size, model_dimension + integer, optional :: positional + type(embedding_layer) :: res + end function embedding_layer_cons + end interface embedding_layer + + interface + pure module subroutine forward(self, input) + !! Get vectors by indicis in the dictionary + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: input(:) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + !! Update gradient at `input` indices + !! dw_i = W_i + d_output_i + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: input(:) + real, intent(in) :: gradient(:, :) + end subroutine backward + + pure module subroutine positional_trigonometric(self, pos) + !! Sum embedding with positional info (trigonometric, not trianable) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: pos + end subroutine positional_trigonometric + + pure module subroutine positional_absolute(self, pos) + !! Sum embedding with absolute position + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: pos + end subroutine positional_absolute + + module subroutine init(self, input_shape) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + end subroutine init + + pure module function get_num_params(self) result(num_params) + class(embedding_layer), intent(in) :: self + integer :: num_params + end function get_num_params + + module function get_params(self) result(params) + class(embedding_layer), intent(in), target :: self + real, allocatable :: params(:) + end function get_params + + module function get_gradients(self) result(gradients) + class(embedding_layer), intent(in), target :: self + real, allocatable :: gradients(:) + end function get_gradients + + module subroutine set_params(self, params) + class(embedding_layer), intent(in out) :: self + real, intent(in), target :: params(:) + end subroutine set_params + end interface +end module nf_embedding_layer diff --git a/src/nf/nf_embedding_layer_submodule.f90 b/src/nf/nf_embedding_layer_submodule.f90 new file mode 100644 index 00000000..83992b22 --- /dev/null +++ b/src/nf/nf_embedding_layer_submodule.f90 @@ -0,0 +1,137 @@ +#define NONE 0 +#define TRIGONOMETRIC 1 +#define ABSOLUTE 2 + +submodule(nf_embedding_layer) nf_embedding_layer_submodule + use nf_base_layer, only: base_layer + implicit none +contains + module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res) + integer, intent(in) :: vocab_size, model_dimension + integer, optional :: positional + type(embedding_layer) :: res + + res % vocab_size = vocab_size + res % model_dimension = model_dimension + if (.not. 
present(positional)) then + res % positional = NONE + else + res % positional = positional + end if + end function embedding_layer_cons + + module subroutine init(self, input_shape) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + self % sequence_length = input_shape(1) + + allocate(self % output(self % sequence_length, self % model_dimension)) + + allocate(self % weights(self % vocab_size, self % model_dimension)) + self % weights = 0.1 + + allocate(self % dw(self % vocab_size, self % model_dimension)) + self % dw = 0.0 + end subroutine init + + pure module subroutine forward(self, input) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: input(:) + integer :: i, index + + do concurrent(i = 1: self % sequence_length) + index = input(i) + if (index > size(self % weights, 1)) then + index = 1 + elseif (index == 0) then + index = 1 + end if + + self % output(i, :) = self % weights(index, :) + + if (self % positional == TRIGONOMETRIC) then + call self % positional_trigonometric(i) + elseif (self % positional == ABSOLUTE) then + call self % positional_absolute(i) + end if + end do + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: input(:) + real, intent(in) :: gradient(:, :) + integer :: i + + do concurrent(i = 1: self % sequence_length) + self % dw(input(i), :) = self % dw(input(i), :) + gradient(i, :) + end do + end subroutine backward + + pure module subroutine positional_trigonometric(self, pos) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: pos + integer :: i + real :: theta + + do concurrent(i = 1: floor(real(self % model_dimension) / 2)) + theta = (pos - 1) / 10000 ** (real(2 * (i-1)) / self % model_dimension) + self % output(pos, 2 * i - 1) = self % output(pos, 2 * i - 1) + sin(theta) + self % output(pos, 2 * i) = self % output(pos, 2 * i) + cos(theta) + end do + end subroutine positional_trigonometric + + pure module subroutine positional_absolute(self, pos) + class(embedding_layer), intent(in out) :: self + integer, intent(in) :: pos + integer :: i + + do concurrent(i = 1: self % model_dimension) + self % output(pos, i) = self % output(pos, i) + pos - 1 + end do + end subroutine positional_absolute + + pure module function get_num_params(self) result(num_params) + class(embedding_layer), intent(in) :: self + integer :: num_params + num_params = self % vocab_size * self % model_dimension + end function get_num_params + + module function get_params(self) result(params) + class(embedding_layer), intent(in), target :: self + real, allocatable :: params(:) + real, pointer :: w_(:) => null() + + w_(1: product(shape(self % weights))) => self % weights + params = w_ + end function get_params + + module function get_gradients(self) result(gradients) + class(embedding_layer), intent(in), target :: self + real, allocatable :: gradients(:) + real, pointer :: dw_(:) => null() + + dw_(1: product(shape(self % dw))) => self % dw + gradients = dw_ + end function get_gradients + + module subroutine set_params(self, params) + class(embedding_layer), intent(in out) :: self + real, intent(in), target :: params(:) + + real, pointer :: p_(:,:) => null() + + ! check if the number of parameters is correct + if (size(params) /= self % get_num_params()) then + error stop 'Error: number of parameters does not match' + end if + + associate(n => self % vocab_size * self % model_dimension) + ! 
reshape the weights + p_(1:self % vocab_size, 1:self % model_dimension) => params(1 : n) + self % weights = p_ + end associate + + end subroutine set_params +end submodule nf_embedding_layer_submodule diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index db60cf0f..e5f92f64 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -9,15 +9,21 @@ module nf_layer_constructors private public :: & + conv1d, & conv2d, & dense, & dropout, & flatten, & input, & linear2d, & + locally_connected1d, & + maxpool1d, & maxpool2d, & reshape, & - self_attention + reshape2d, & + self_attention, & + embedding, & + layernorm interface input @@ -152,6 +158,33 @@ module function flatten() result(res) !! Resulting layer instance end function flatten + module function conv1d(filters, kernel_size, activation) result(res) + !! 1-d convolutional layer constructor. + !! + !! This layer is for building 1-d convolutional network. + !! Although the established convention is to call these layers 1-d, + !! the shape of the data is actually 2-d: image width + !! and the number of channels. + !! A conv1d layer must not be the first layer in the network. + !! + !! Example: + !! + !! ``` + !! use nf, only :: conv1d, layer + !! type(layer) :: conv1d_layer + !! conv1d_layer = dense(filters=32, kernel_size=3) + !! conv1d_layer = dense(filters=32, kernel_size=3, activation='relu') + !! ``` + integer, intent(in) :: filters + !! Number of filters in the output of the layer + integer, intent(in) :: kernel_size + !! Width of the convolution window, commonly 3 or 5 + class(activation_function), intent(in), optional :: activation + !! Activation function (default sigmoid) + type(layer) :: res + !! Resulting layer instance + end function conv1d + module function conv2d(filters, kernel_size, activation) result(res) !! 2-d convolutional layer constructor. !! @@ -179,6 +212,55 @@ module function conv2d(filters, kernel_size, activation) result(res) !! Resulting layer instance end function conv2d + module function locally_connected1d(filters, kernel_size, activation) result(res) + !! 1-d locally connected network constructor + !! + !! This layer is for building 1-d locally connected network. + !! Although the established convention is to call these layers 1-d, + !! the shape of the data is actuall 2-d: image width, + !! and the number of channels. + !! A locally connected 1d layer must not be the first layer in the network. + !! + !! Example: + !! + !! ``` + !! use nf, only :: locally_connected1d, layer + !! type(layer) :: locally_connected1d_layer + !! locally_connected1d_layer = dense(filters=32, kernel_size=3) + !! locally_connected1d_layer = dense(filters=32, kernel_size=3, activation='relu') + !! ``` + integer, intent(in) :: filters + !! Number of filters in the output of the layer + integer, intent(in) :: kernel_size + !! Width of the convolution window, commonly 3 or 5 + class(activation_function), intent(in), optional :: activation + !! Activation function (default sigmoid) + type(layer) :: res + !! Resulting layer instance + end function locally_connected1d + + module function maxpool1d(pool_size, stride) result(res) + !! 1-d maxpooling layer constructor. + !! + !! This layer is for downscaling other layers, typically `conv1d`. + !! + !! Example: + !! + !! ``` + !! use nf, only :: maxpool1d, layer + !! type(layer) :: maxpool1d_layer + !! maxpool1d_layer = maxpool1d(pool_size=2) + !! maxpool1d_layer = maxpool1d(pool_size=2, stride=3) + !! 
``` + integer, intent(in) :: pool_size + !! Width of the pooling window, commonly 2 + integer, intent(in), optional :: stride + !! Stride of the pooling window, commonly equal to `pool_size`; + !! Defaults to `pool_size` if omitted. + type(layer) :: res + !! Resulting layer instance + end function maxpool1d + module function maxpool2d(pool_size, stride) result(res) !! 2-d maxpooling layer constructor. !! @@ -212,6 +294,17 @@ module function reshape(output_shape) result(res) !! Resulting layer instance end function reshape + module function reshape2d(output_shape) result(res) + !! Rank-1 to rank-any reshape layer constructor. + !! Currently implemented is only rank-2 for the output of the reshape. + !! + !! This layer is for connecting 1-d inputs to conv1d or similar layers. + integer, intent(in) :: output_shape(:) + !! Shape of the output + type(layer) :: res + !! Resulting layer instance + end function reshape2d + module function linear2d(out_features) result(res) !! Rank-2 (sequence_length, out_features) linear layer constructor. !! sequence_length is determined at layer initialization, based on the @@ -222,15 +315,40 @@ module function linear2d(out_features) result(res) !! Resulting layer instance end function linear2d - module function self_attention(num_heads) result(res) - !! Rank-2 (sequence_length, out_features) self attention constructor. - !! sequence_length and model_dimension are determined at layer initialization, based on the - !! output shape of the previous layer. - integer, intent(in) :: num_heads - !! Number of attention heads - type(layer) :: res - !! Resulting layer instance - end function self_attention + module function self_attention(num_heads) result(res) + !! Rank-2 (sequence_length, out_features) self attention constructor. + !! sequence_length and model_dimension are determined at layer initialization, based on the + !! output shape of the previous layer. + integer, intent(in) :: num_heads + !! Number of attention heads + type(layer) :: res + !! Resulting layer instance + end function self_attention + + module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res) + !! Embedding layer constructor. + !! + !! This layer is for inputting token indices from the dictionary to the network. + !! Works as a trainable lookup table that converts each index into a vector. + !! Embedding layer must be the first layer in a network. + integer, intent(in) :: sequence_length + !! max len of input sequence + integer, intent(in) :: vocab_size + !! length of token vocabulary + integer, intent(in) :: model_dimension + !! size of target embeddings + integer, optional, intent(in) :: positional + !! positional encoding + type(layer) :: res + end function embedding + + module function layernorm() result(res) + !! Layer Normalization + !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta + !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: + !! 
https://arxiv.org/abs/1607.06450v1 + type(layer) :: res + end function layernorm end interface diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index 9e5322c1..48fcd8a5 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -1,6 +1,7 @@ submodule(nf_layer_constructors) nf_layer_constructors_submodule use nf_layer, only: layer + use nf_conv1d_layer, only: conv1d_layer use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer use nf_dropout_layer, only: dropout_layer @@ -8,16 +9,46 @@ use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer use nf_input3d_layer, only: input3d_layer + use nf_locally_connected1d_layer, only: locally_connected1d_layer + use nf_maxpool1d_layer, only: maxpool1d_layer use nf_maxpool2d_layer, only: maxpool2d_layer use nf_reshape_layer, only: reshape3d_layer + use nf_reshape2d_layer, only: reshape2d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_embedding_layer, only: embedding_layer + use nf_layernorm_layer, only: layernorm_layer use nf_activation, only: activation_function, relu, sigmoid implicit none contains + module function conv1d(filters, kernel_size, activation) result(res) + integer, intent(in) :: filters + integer, intent(in) :: kernel_size + class(activation_function), intent(in), optional :: activation + type(layer) :: res + + class(activation_function), allocatable :: activation_tmp + + res % name = 'conv1d' + + if (present(activation)) then + allocate(activation_tmp, source=activation) + else + allocate(activation_tmp, source=relu()) + end if + + res % activation = activation_tmp % get_name() + + allocate( & + res % p, & + source=conv1d_layer(filters, kernel_size, activation_tmp) & + ) + + end function conv1d + module function conv2d(filters, kernel_size, activation) result(res) integer, intent(in) :: filters integer, intent(in) :: kernel_size @@ -43,6 +74,31 @@ module function conv2d(filters, kernel_size, activation) result(res) end function conv2d + module function locally_connected1d(filters, kernel_size, activation) result(res) + integer, intent(in) :: filters + integer, intent(in) :: kernel_size + class(activation_function), intent(in), optional :: activation + type(layer) :: res + + class(activation_function), allocatable :: activation_tmp + + res % name = 'locally_connected1d' + + if (present(activation)) then + allocate(activation_tmp, source=activation) + else + allocate(activation_tmp, source=relu()) + end if + + res % activation = activation_tmp % get_name() + + allocate( & + res % p, & + source=locally_connected1d_layer(filters, kernel_size, activation_tmp) & + ) + + end function locally_connected1d + module function dense(layer_size, activation) result(res) integer, intent(in) :: layer_size @@ -116,6 +172,33 @@ module function input3d(dim1, dim2, dim3) result(res) res % initialized = .true. end function input3d + module function maxpool1d(pool_size, stride) result(res) + integer, intent(in) :: pool_size + integer, intent(in), optional :: stride + integer :: stride_ + type(layer) :: res + + if (pool_size < 2) & + error stop 'pool_size must be >= 2 in a maxpool1d layer' + + ! 
Stride defaults to pool_size if not provided + if (present(stride)) then + stride_ = stride + else + stride_ = pool_size + end if + + if (stride_ < 1) & + error stop 'stride must be >= 1 in a maxpool1d layer' + + res % name = 'maxpool1d' + + allocate( & + res % p, & + source=maxpool1d_layer(pool_size, stride_) & + ) + + end function maxpool1d module function maxpool2d(pool_size, stride) result(res) integer, intent(in) :: pool_size @@ -161,6 +244,21 @@ module function reshape(output_shape) result(res) end function reshape + module function reshape2d(output_shape) result(res) + integer, intent(in) :: output_shape(:) + type(layer) :: res + + res % name = 'reshape2d' + res % layer_shape = output_shape + + if (size(output_shape) == 2) then + allocate(res % p, source=reshape2d_layer(output_shape)) + else + error stop 'size(output_shape) of the reshape layer must == 2' + end if + + end function reshape2d + module function linear2d(out_features) result(res) integer, intent(in) :: out_features @@ -171,6 +269,7 @@ module function linear2d(out_features) result(res) end function linear2d + module function self_attention(num_heads) result(res) integer, intent(in) :: num_heads type(layer) :: res @@ -179,4 +278,28 @@ module function self_attention(num_heads) result(res) allocate(res % p, source=self_attention_layer(num_heads)) end function self_attention + + module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res) + integer, intent(in) :: sequence_length, vocab_size, model_dimension + integer, optional, intent(in) :: positional + type(layer) :: res + type(embedding_layer) :: embedding_layer_instance + + embedding_layer_instance = embedding_layer(vocab_size, model_dimension, positional) + call embedding_layer_instance % init([sequence_length]) + res % name = 'embedding' + res % layer_shape = [sequence_length, model_dimension] + res % input_layer_shape = [integer ::] + allocate(res % p, source=embedding_layer_instance) + res % initialized = .true. + + end function embedding + + + module function layernorm() result(res) + type(layer) :: res + res % name = 'layernorm' + allocate(res % p, source=layernorm_layer()) + end function layernorm + end submodule nf_layer_constructors_submodule diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index ecdeb41d..63af7264 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -1,6 +1,7 @@ submodule(nf_layer) nf_layer_submodule use iso_fortran_env, only: stderr => error_unit + use nf_conv1d_layer, only: conv1d_layer use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer use nf_dropout_layer, only: dropout_layer @@ -8,10 +9,15 @@ use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer use nf_input3d_layer, only: input3d_layer + use nf_locally_connected1d_layer, only: locally_connected1d_layer + use nf_maxpool1d_layer, only: maxpool1d_layer use nf_maxpool2d_layer, only: maxpool2d_layer + use nf_reshape2d_layer, only: reshape2d_layer use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_embedding_layer, only: embedding_layer + use nf_layernorm_layer, only: layernorm_layer use nf_optimizers, only: optimizer_base_type contains @@ -23,7 +29,7 @@ pure module subroutine backward_1d(self, previous, gradient) real, intent(in) :: gradient(:) ! Backward pass from a 1-d layer downstream currently implemented - ! only for dense and flatten layers + ! 
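The constructors above guard their arguments with `error stop`; a brief, illustrative sketch of what they accept and reject (mirroring the checks in the code above, not adding any of its own):

```fortran
program pool_and_reshape_args
  ! Sketch of the argument rules enforced by the maxpool1d and reshape2d constructors.
  use nf, only: maxpool1d, reshape2d, layer
  implicit none

  type(layer) :: pool, reshaper

  pool = maxpool1d(pool_size=2)            ! stride defaults to pool_size (2)
  pool = maxpool1d(pool_size=2, stride=2)  ! equivalent to the line above

  reshaper = reshape2d([28, 28])           ! rank-1 input reshaped to 28 x 28

  ! The following would stop the program at run time:
  ! pool = maxpool1d(pool_size=1)          ! pool_size must be >= 2
  ! reshaper = reshape2d([2, 3, 4])        ! output_shape must have exactly 2 elements
end program pool_and_reshape_args
```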
only for dense, dropout and flatten layers select type(this_layer => self % p) type is(dense_layer) @@ -46,12 +52,18 @@ pure module subroutine backward_1d(self, previous, gradient) type is(flatten_layer) - ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d + ! Upstream layers permitted: input2d, input3d, conv1d, conv2d, locally_connected1d, maxpool1d, maxpool2d select type(prev_layer => previous % p) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) + type is(locally_connected1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(maxpool1d_layer) + call this_layer % backward(prev_layer % output, gradient) type is(input3d_layer) call this_layer % backward(prev_layer % output, gradient) + type is(conv1d_layer) + call this_layer % backward(prev_layer % output, gradient) type is(conv2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(maxpool2d_layer) @@ -60,6 +72,10 @@ pure module subroutine backward_1d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(embedding_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select end select @@ -80,10 +96,14 @@ pure module subroutine backward_2d(self, previous, gradient) select type(prev_layer => previous % p) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) + type is(embedding_layer) + call this_layer % backward(prev_layer % output, gradient) type is(linear2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select type is(self_attention_layer) @@ -91,14 +111,84 @@ pure module subroutine backward_2d(self, previous, gradient) select type(prev_layer => previous % p) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) + type is(embedding_layer) + call this_layer % backward(prev_layer % output, gradient) type is(linear2d_layer) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select + type is(layernorm_layer) + + select type(prev_layer => previous % p) + type is(linear2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(self_attention_layer) + call this_layer % backward(prev_layer % output, gradient) + end select end select + ! Backward pass from a 2-d layer downstream currently implemented + ! 
only for dense and flatten layers + + select type(this_layer => self % p) + + type is(conv1d_layer) + + select type(prev_layer => previous % p) + type is(maxpool1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(reshape2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(input2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(locally_connected1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(conv1d_layer) + call this_layer % backward(prev_layer % output, gradient) + end select + + type is(locally_connected1d_layer) + + select type(prev_layer => previous % p) + type is(maxpool1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(reshape2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(input2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(locally_connected1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(conv1d_layer) + call this_layer % backward(prev_layer % output, gradient) + end select + + type is(maxpool1d_layer) + + select type(prev_layer => previous % p) + type is(maxpool1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(reshape2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(locally_connected1d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(input2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(conv1d_layer) + call this_layer % backward(prev_layer % output, gradient) + end select + + type is(reshape2d_layer) + select type(prev_layer => previous % p) + type is(input1d_layer) + call this_layer % backward(prev_layer % output, gradient) + end select + + end select + end subroutine backward_2d @@ -203,6 +293,54 @@ module subroutine forward(self, input) type is(reshape3d_layer) call this_layer % forward(prev_layer % output) end select + + type is(locally_connected1d_layer) + + ! Upstream layers permitted: input2d, locally_connected1d, maxpool1d, reshape2d + select type(prev_layer => input % p) + type is(input2d_layer) + call this_layer % forward(prev_layer % output) + type is(locally_connected1d_layer) + call this_layer % forward(prev_layer % output) + type is(maxpool1d_layer) + call this_layer % forward(prev_layer % output) + type is(reshape2d_layer) + call this_layer % forward(prev_layer % output) + type is(conv1d_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(conv1d_layer) + + ! Upstream layers permitted: input2d, locally_connected1d, maxpool1d, reshape2d + select type(prev_layer => input % p) + type is(input2d_layer) + call this_layer % forward(prev_layer % output) + type is(locally_connected1d_layer) + call this_layer % forward(prev_layer % output) + type is(maxpool1d_layer) + call this_layer % forward(prev_layer % output) + type is(reshape2d_layer) + call this_layer % forward(prev_layer % output) + type is(conv1d_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(maxpool1d_layer) + + ! 
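To make the dispatch rules above concrete, here is an illustrative end-to-end sketch (not part of the patch) of a 1-d pipeline that uses only combinations the forward and backward passes permit. The shape comments follow the `width - kernel_size + 1` and `width / stride` rules in the layer code, and the `flatten` bridge before `dense` is inserted automatically by the network constructor, as in the `cnn_mnist_1d` example.

```fortran
program conv1d_pipeline_demo
  ! Sketch of a 1-d convolutional pipeline using permitted layer combinations.
  use nf, only: network, input, reshape2d, conv1d, maxpool1d, dense, relu, softmax
  implicit none

  type(network) :: net
  real :: x(784)

  net = network([ &
    input(784), &                                          ! rank-1 input
    reshape2d([28, 28]), &                                 ! -> 28 channels x 28 samples
    conv1d(filters=8, kernel_size=3, activation=relu()), & ! -> 8 x 26
    maxpool1d(pool_size=2), &                              ! -> 8 x 13
    dense(10, activation=softmax()) &                      ! flatten is inserted automatically
  ])

  call net % print_info()

  call random_number(x)
  print '(a, 10f8.4)', 'Prediction: ', net % predict(x)
end program conv1d_pipeline_demo
```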
Upstream layers permitted: input1d, locally_connected1d, maxpool1d, reshape2d + select type(prev_layer => input % p) + type is(input2d_layer) + call this_layer % forward(prev_layer % output) + type is(locally_connected1d_layer) + call this_layer % forward(prev_layer % output) + type is(maxpool1d_layer) + call this_layer % forward(prev_layer % output) + type is(reshape2d_layer) + call this_layer % forward(prev_layer % output) + type is(conv1d_layer) + call this_layer % forward(prev_layer % output) + end select type is(maxpool2d_layer) @@ -220,20 +358,30 @@ module subroutine forward(self, input) type is(flatten_layer) - ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d, reshape3d + ! Upstream layers permitted: input2d, input3d, conv2d, maxpool1d, maxpool2d, reshape2d, reshape3d, locally_connected2d select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) type is(input3d_layer) call this_layer % forward(prev_layer % output) + type is(conv1d_layer) + call this_layer % forward(prev_layer % output) type is(conv2d_layer) call this_layer % forward(prev_layer % output) + type is(locally_connected1d_layer) + call this_layer % forward(prev_layer % output) + type is(maxpool1d_layer) + call this_layer % forward(prev_layer % output) type is(maxpool2d_layer) call this_layer % forward(prev_layer % output) + type is(reshape2d_layer) + call this_layer % forward(prev_layer % output) type is(reshape3d_layer) call this_layer % forward(prev_layer % output) type is(linear2d_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) end select type is(reshape3d_layer) @@ -247,25 +395,49 @@ module subroutine forward(self, input) type is(flatten_layer) call this_layer % forward(prev_layer % output) end select + + type is(reshape2d_layer) + select type(prev_layer => input % p) + type is(input1d_layer) + call this_layer % forward(prev_layer % output) + end select type is(linear2d_layer) - ! Upstream layers permitted: input2d, linear2d + ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) + type is(embedding_layer) + call this_layer % forward(prev_layer % output) type is(linear2d_layer) call this_layer % forward(prev_layer % output) type is(self_attention_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) end select type is(self_attention_layer) - ! Upstream layers permitted: input2d, linear2d + ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) + type is(embedding_layer) + call this_layer % forward(prev_layer % output) + type is(linear2d_layer) + call this_layer % forward(prev_layer % output) + type is(self_attention_layer) + call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(layernorm_layer) + + ! 
Upstream layers permitted: linear2d, self_attention + select type(prev_layer => input % p) type is(linear2d_layer) call this_layer % forward(prev_layer % output) type is(self_attention_layer) @@ -307,12 +479,26 @@ pure module subroutine get_output_2d(self, output) type is(input2d_layer) allocate(output, source=this_layer % output) + type is(maxpool1d_layer) + allocate(output, source=this_layer % output) + type is(locally_connected1d_layer) + allocate(output, source=this_layer % output) + type is(conv1d_layer) + allocate(output, source=this_layer % output) + type is(reshape2d_layer) + allocate(output, source=this_layer % output) + type is(embedding_layer) + allocate(output, source=this_layer % output) type is(linear2d_layer) allocate(output, source=this_layer % output) type is(self_attention_layer) allocate(output, source=this_layer % output) + type is(layernorm_layer) + allocate(output, source=this_layer % output) class default - error stop '2-d output can only be read from an input2d or linear2d layer.' + error stop '2-d output can only be read from a input2d, maxpool1d, ' & + // 'locally_connected1d, conv1d, reshape2d, embedding, linear2d, ' & + // 'self_attention, or layernorm layer.' end select @@ -354,19 +540,27 @@ impure elemental module subroutine init(self, input) call this_layer % init(input % layer_shape) end select - ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or - ! self_attention layers is not known until we receive an input layer. + ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, + ! self_attention or layernorm layers is not known until we receive an input layer. select type(this_layer => self % p) + type is(conv1d_layer) + self % layer_shape = shape(this_layer % output) type is(conv2d_layer) self % layer_shape = shape(this_layer % output) type is(dropout_layer) self % layer_shape = shape(this_layer % output) + type is(locally_connected1d_layer) + self % layer_shape = shape(this_layer % output) + type is(maxpool1d_layer) + self % layer_shape = shape(this_layer % output) type is(flatten_layer) self % layer_shape = shape(this_layer % output) type is(linear2d_layer) self % layer_shape = shape(this_layer % output) type is(self_attention_layer) self % layer_shape = shape(this_layer % output) + type is(layernorm_layer) + self % layer_shape = shape(this_layer % output) type is(maxpool2d_layer) self % layer_shape = shape(this_layer % output) end select @@ -413,18 +607,30 @@ elemental module function get_num_params(self) result(num_params) num_params = this_layer % get_num_params() type is (dropout_layer) num_params = 0 + type is (conv1d_layer) + num_params = this_layer % get_num_params() type is (conv2d_layer) num_params = this_layer % get_num_params() + type is (locally_connected1d_layer) + num_params = this_layer % get_num_params() + type is (maxpool1d_layer) + num_params = 0 type is (maxpool2d_layer) num_params = 0 type is (flatten_layer) num_params = 0 + type is (reshape2d_layer) + num_params = 0 type is (reshape3d_layer) num_params = 0 type is (linear2d_layer) num_params = this_layer % get_num_params() type is (self_attention_layer) num_params = this_layer % get_num_params() + type is (embedding_layer) + num_params = this_layer % get_num_params() + type is (layernorm_layer) + num_params = this_layer % get_num_params() class default error stop 'Unknown layer type.' end select @@ -446,18 +652,30 @@ module function get_params(self) result(params) params = this_layer % get_params() type is (dropout_layer) ! No parameters to get. 
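Because `get_output` now covers the new rank-2 layers, intermediate feature maps can be inspected directly after a forward pass. A hedged sketch in the style of the tests; the layer index assumes the automatically inserted `flatten` sits between `maxpool1d` and `dense`:

```fortran
program inspect_2d_output
  ! Sketch: read the rank-2 output of an intermediate layer after a forward pass.
  use nf, only: network, input, conv1d, maxpool1d, dense
  implicit none

  type(network) :: net
  real :: x(3, 32)
  real, allocatable :: feature_map(:,:)

  net = network([ &
    input(3, 32), &
    conv1d(filters=16, kernel_size=3), &
    maxpool1d(pool_size=2), &
    dense(4) &
  ])

  call random_number(x)
  call net % forward(x)

  ! Layer 3 is the maxpool1d layer; its output is (16 channels, 15 samples).
  call net % layers(3) % get_output(feature_map)
  print '(a, 2i4)', 'maxpool1d output shape:', shape(feature_map)
end program inspect_2d_output
```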
+ type is (conv1d_layer) + params = this_layer % get_params() type is (conv2d_layer) params = this_layer % get_params() + type is (locally_connected1d_layer) + params = this_layer % get_params() + type is (maxpool1d_layer) + ! No parameters to get. type is (maxpool2d_layer) ! No parameters to get. type is (flatten_layer) ! No parameters to get. + type is (reshape2d_layer) + ! No parameters to get. type is (reshape3d_layer) ! No parameters to get. type is (linear2d_layer) params = this_layer % get_params() type is (self_attention_layer) params = this_layer % get_params() + type is (embedding_layer) + params = this_layer % get_params() + type is (layernorm_layer) + params = this_layer % get_params() class default error stop 'Unknown layer type.' end select @@ -479,11 +697,19 @@ module function get_gradients(self) result(gradients) gradients = this_layer % get_gradients() type is (dropout_layer) ! No gradients to get. + type is (conv1d_layer) + gradients = this_layer % get_gradients() type is (conv2d_layer) gradients = this_layer % get_gradients() + type is (locally_connected1d_layer) + gradients = this_layer % get_gradients() + type is (maxpool1d_layer) + ! No gradients to get. type is (maxpool2d_layer) ! No gradients to get. type is (flatten_layer) + ! No gradients to get. + type is (reshape2d_layer) ! No parameters to get. type is (reshape3d_layer) ! No gradients to get. @@ -491,6 +717,10 @@ module function get_gradients(self) result(gradients) gradients = this_layer % get_gradients() type is (self_attention_layer) gradients = this_layer % get_gradients() + type is (embedding_layer) + gradients = this_layer % get_gradients() + type is (layernorm_layer) + gradients = this_layer % get_gradients() class default error stop 'Unknown layer type.' end select @@ -539,15 +769,31 @@ module subroutine set_params(self, params) ! No parameters to set. write(stderr, '(a)') 'Warning: calling set_params() ' & // 'on a zero-parameter layer; nothing to do.' + + type is (conv1d_layer) + call this_layer % set_params(params) type is (conv2d_layer) call this_layer % set_params(params) + + type is (locally_connected1d_layer) + call this_layer % set_params(params) + + type is (maxpool1d_layer) + ! No parameters to set. + write(stderr, '(a)') 'Warning: calling set_params() ' & + // 'on a zero-parameter layer; nothing to do.' type is (linear2d_layer) call this_layer % set_params(params) type is (self_attention_layer) call this_layer % set_params(params) + type is (embedding_layer) + call this_layer % set_params(params) + + type is (layernorm_layer) + call this_layer % set_params(params) type is (maxpool2d_layer) ! No parameters to set. @@ -558,6 +804,11 @@ module subroutine set_params(self, params) ! No parameters to set. write(stderr, '(a)') 'Warning: calling set_params() ' & // 'on a zero-parameter layer; nothing to do.' + + type is (reshape2d_layer) + ! No parameters to set. + write(stderr, '(a)') 'Warning: calling set_params() ' & + // 'on a zero-parameter layer; nothing to do.' type is (reshape3d_layer) ! No parameters to set. diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 new file mode 100644 index 00000000..36ef56f0 --- /dev/null +++ b/src/nf/nf_layernorm.f90 @@ -0,0 +1,92 @@ +module nf_layernorm_layer + use nf_activation, only: activation_function + use nf_base_layer, only: base_layer + + implicit none + + private + public :: layernorm_layer + + type, extends(base_layer) :: layernorm_layer + !! Layer Normalization + !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta + !! 
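Spelled out, the transformation this type implements is the usual layer normalization of Ba et al. (2016), applied independently at each sequence position, with the mean and variance taken over the model dimension and, per the constructor and `init` in the submodule, $\varepsilon = 10^{-5}$, $\gamma = 1$, and $\beta = 0$ at initialization:

$$
y \;=\; \gamma \odot \frac{x - \operatorname{mean}(x)}{\sqrt{\operatorname{var}(x) + \varepsilon}} \;+\; \beta
$$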
Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: + !! https://arxiv.org/abs/1607.06450v1 + integer :: sequence_length + integer :: model_dimension + + real :: eps + real, allocatable :: gamma(:) + real, allocatable :: beta(:) + + real, allocatable :: d_gamma(:) + real, allocatable :: d_beta(:) + real, allocatable :: gradient(:, :) + + real, allocatable :: mu(:, :) + real, allocatable :: sigma(:) + + real, allocatable :: output(:, :) + + ! temp storages + real, allocatable, private :: normalized(:, :) + real, allocatable, private :: one_over_sigma(:, :) + real, allocatable, private :: gradient_by_gamma_over_sigma(:, :) + contains + procedure :: forward + procedure :: backward + procedure :: init + procedure :: get_num_params + procedure :: get_params + procedure :: get_gradients + procedure :: set_params + end type layernorm_layer + + interface layernorm_layer + module function layernorm_layer_cons() & + result(res) + type(layernorm_layer) :: res + end function layernorm_layer_cons + end interface layernorm_layer + + interface + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + end subroutine backward + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + end subroutine init + + pure module function get_num_params(self) result(num_params) + class(layernorm_layer), intent(in) :: self + integer :: num_params + end function get_num_params + + + module function get_params(self) result(params) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: params(:) + end function get_params + + + module function get_gradients(self) result(gradients) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: gradients(:) + end function get_gradients + + + module subroutine set_params(self, params) + class(layernorm_layer), intent(in out) :: self + real, intent(in), target :: params(:) + end subroutine set_params + end interface +end module nf_layernorm_layer \ No newline at end of file diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 new file mode 100644 index 00000000..4eaa4382 --- /dev/null +++ b/src/nf/nf_layernorm_submodule.f90 @@ -0,0 +1,149 @@ +submodule(nf_layernorm_layer) nf_layernorm_layer_submodule + implicit none +contains + module function layernorm_layer_cons() & + result(res) + type(layernorm_layer) :: res + + res % eps = 1e-5 + end function layernorm_layer_cons + + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + integer :: i + + ! mu = x - MEAN_last_dim(x) + do concurrent(i = 1: self % model_dimension) + self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) + end do + + ! square root of variance shifted be eps + self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) + + ! normalize mu by variance by first axis + do concurrent(i = 1: self % model_dimension) + self % normalized(:, i) = self % mu(:, i) / self % sigma + end do + + ! 
forward through trainable params gamma and beta + do concurrent(i = 1: self % sequence_length) + self % output(i, :) = self % normalized(i, :) * self % gamma + self % beta + end do + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + + self % one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + self % gradient_by_gamma_over_sigma = & + gradient & + * spread(self % gamma, dim=1, ncopies=self % sequence_length) & + * self % one_over_sigma + + ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) + self % d_gamma = sum(gradient * self % mu * self % one_over_sigma, dim=1) + + ! d_output/d_beta = sum(d_output/d_y) * 1 + self % d_beta = sum(gradient, dim=1) + + ! From this article: + ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ + ! d_output/d_x = d_output/d_y * gamma/sigma + ! - d_output/d_y + ! - sum(d_output/d_y * gamma/sigma) / len + ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len + self % gradient = & + self % gradient_by_gamma_over_sigma & + - spread(& + sum(self % gradient_by_gamma_over_sigma, dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension & + - self % mu * spread(& + sum(self % gradient_by_gamma_over_sigma * self % mu * (self % one_over_sigma ** 2), dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension + end subroutine backward + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + if (size(input_shape) /= 2) then + error stop "LayerNorm Layer accepts 2D input" + end if + self % sequence_length = input_shape(1) + self % model_dimension = input_shape(2) + + ! default initialization from PyTorch + allocate(self % gamma(self % model_dimension)) + self % gamma = 1. + allocate(self % beta(self % model_dimension)) + self % beta = 0. + + allocate(self % d_gamma(self % model_dimension)) + allocate(self % d_beta(self % model_dimension)) + allocate(self % gradient(self % sequence_length, self % model_dimension)) + + allocate(self % mu(self % sequence_length, self % model_dimension)) + allocate(self % sigma(self % sequence_length)) + + allocate(self % output(self % sequence_length, self % model_dimension)) + + allocate(self % normalized, mold=self % mu) + allocate(self % one_over_sigma, mold=self % mu) + allocate(self % gradient_by_gamma_over_sigma, mold=self % mu) + end subroutine init + + pure module function get_num_params(self) result(num_params) + class(layernorm_layer), intent(in) :: self + integer :: num_params + + ! Number of weights times number of biases + num_params = 2 * self % model_dimension + + end function get_num_params + + + module function get_params(self) result(params) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: params(:) + + params = [ & + self % gamma, & + self % beta & + ] + + end function get_params + + + module function get_gradients(self) result(gradients) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: gradients(:) + + gradients = [ & + self % d_gamma, & + self % d_beta & + ] + + end function get_gradients + + + module subroutine set_params(self, params) + class(layernorm_layer), intent(in out) :: self + real, intent(in), target :: params(:) + + ! 
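For reference, the gradients computed by `backward` above are, as I read the code (with $g = \partial L/\partial y$, $\mu = x - \operatorname{mean}(x)$, $\sigma$ as stored by the forward pass, $D$ the model dimension, $i$ indexing sequence positions, and sums over $j$ or $k$ running across the model dimension):

$$
\frac{\partial L}{\partial \gamma_j} = \sum_i \frac{g_{ij}\,\mu_{ij}}{\sigma_i},
\qquad
\frac{\partial L}{\partial \beta_j} = \sum_i g_{ij},
$$

$$
\frac{\partial L}{\partial x_{ij}} =
\frac{g_{ij}\,\gamma_j}{\sigma_i}
\;-\; \frac{1}{D}\sum_{k} \frac{g_{ik}\,\gamma_k}{\sigma_i}
\;-\; \frac{\mu_{ij}}{D}\sum_{k} \frac{g_{ik}\,\gamma_k\,\mu_{ik}}{\sigma_i^{3}}
$$

which matches the derivation linked in the comment.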
check if the number of parameters is correct + if (size(params) /= self % get_num_params()) then + error stop 'Error: number of parameters does not match' + end if + + self % gamma = params(1: self % model_dimension) + self % beta = params(self % model_dimension + 1: 2 * self % model_dimension) + + end subroutine set_params +end submodule nf_layernorm_layer_submodule diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected1d_layer.f90 new file mode 100644 index 00000000..beca76d5 --- /dev/null +++ b/src/nf/nf_locally_connected1d_layer.f90 @@ -0,0 +1,119 @@ +module nf_locally_connected1d_layer + !! This modules provides a 1-d convolutional `locally_connected1d` type. + + use nf_activation, only: activation_function + use nf_base_layer, only: base_layer + implicit none + + private + public :: locally_connected1d_layer + + type, extends(base_layer) :: locally_connected1d_layer + + integer :: width + integer :: height + integer :: channels + integer :: kernel_size + integer :: filters + + real, allocatable :: biases(:,:) ! size(filters) + real, allocatable :: kernel(:,:,:,:) ! filters x channels x window x window + real, allocatable :: output(:,:) ! filters x output_width * output_height + real, allocatable :: z(:,:) ! kernel .dot. input + bias + + real, allocatable :: dw(:,:,:,:) ! weight (kernel) gradients + real, allocatable :: db(:,:) ! bias gradients + real, allocatable :: gradient(:,:) + + class(activation_function), allocatable :: activation + + contains + + procedure :: forward + procedure :: backward + procedure :: get_gradients + procedure :: get_num_params + procedure :: get_params + procedure :: init + procedure :: set_params + + end type locally_connected1d_layer + + interface locally_connected1d_layer + module function locally_connected1d_layer_cons(filters, kernel_size, activation) & + result(res) + !! `locally_connected1d_layer` constructor function + integer, intent(in) :: filters + integer, intent(in) :: kernel_size + class(activation_function), intent(in) :: activation + type(locally_connected1d_layer) :: res + end function locally_connected1d_layer_cons + end interface locally_connected1d_layer + + interface + + module subroutine init(self, input_shape) + !! Initialize the layer data structures. + !! + !! This is a deferred procedure from the `base_layer` abstract type. + class(locally_connected1d_layer), intent(in out) :: self + !! A `locally_connected1d_layer` instance + integer, intent(in) :: input_shape(:) + !! Input layer dimensions + end subroutine init + + pure module subroutine forward(self, input) + !! Apply a forward pass on the `locally_connected1d` layer. + class(locally_connected1d_layer), intent(in out) :: self + !! A `locally_connected1d_layer` instance + real, intent(in) :: input(:,:) + !! Input data + end subroutine forward + + pure module subroutine backward(self, input, gradient) + !! Apply a backward pass on the `locally_connected1d` layer. + class(locally_connected1d_layer), intent(in out) :: self + !! A `locally_connected1d_layer` instance + real, intent(in) :: input(:,:) + !! Input data (previous layer) + real, intent(in) :: gradient(:,:) + !! Gradient (next layer) + end subroutine backward + + pure module function get_num_params(self) result(num_params) + !! Get the number of parameters in the layer. + class(locally_connected1d_layer), intent(in) :: self + !! A `locally_connected1d_layer` instance + integer :: num_params + !! 
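Unlike `conv1d`, the kernel declared above carries a separate set of weights for every output position, so the parameter count grows with the output width. A small illustrative sketch of the difference; the `locally_connected1d` count uses the array shapes allocated in this module, while the `conv1d` count assumes the usual shared-kernel layout, which is not shown in this hunk:

```fortran
program parameter_count_sketch
  ! Compare trainable-parameter counts for a shared-kernel conv1d layer and a
  ! locally connected 1-d layer with the same filters, kernel size, and input.
  implicit none

  integer, parameter :: channels = 28, input_width = 28
  integer, parameter :: filters = 8, kernel_size = 3
  integer, parameter :: out_width = input_width - kernel_size + 1
  integer :: n_conv1d, n_locally_connected1d

  ! conv1d (assumed shared kernel): kernel(filters, channels, kernel_size) + biases(filters)
  n_conv1d = filters * channels * kernel_size + filters

  ! locally_connected1d: kernel(filters, out_width, channels, kernel_size) + biases(filters, out_width)
  n_locally_connected1d = filters * out_width * channels * kernel_size + filters * out_width

  print '(a, i0)', 'conv1d parameters:              ', n_conv1d
  print '(a, i0)', 'locally_connected1d parameters: ', n_locally_connected1d
end program parameter_count_sketch
```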
Number of parameters + end function get_num_params + + module function get_params(self) result(params) + !! Return the parameters (weights and biases) of this layer. + !! The parameters are ordered as weights first, biases second. + class(locally_connected1d_layer), intent(in), target :: self + !! A `locally_connected1d_layer` instance + real, allocatable :: params(:) + !! Parameters to get + end function get_params + + module function get_gradients(self) result(gradients) + !! Return the gradients of this layer. + !! The gradients are ordered as weights first, biases second. + class(locally_connected1d_layer), intent(in), target :: self + !! A `locally_connected1d_layer` instance + real, allocatable :: gradients(:) + !! Gradients to get + end function get_gradients + + module subroutine set_params(self, params) + !! Set the parameters of the layer. + class(locally_connected1d_layer), intent(in out) :: self + !! A `locally_connected1d_layer` instance + real, intent(in) :: params(:) + !! Parameters to set + end subroutine set_params + + end interface + +end module nf_locally_connected1d_layer diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected1d_layer_submodule.f90 new file mode 100644 index 00000000..053c520b --- /dev/null +++ b/src/nf/nf_locally_connected1d_layer_submodule.f90 @@ -0,0 +1,152 @@ +submodule(nf_locally_connected1d_layer) nf_locally_connected1d_layer_submodule + + use nf_activation, only: activation_function + use nf_random, only: random_normal + + implicit none + +contains + + module function locally_connected1d_layer_cons(filters, kernel_size, activation) result(res) + implicit none + integer, intent(in) :: filters + integer, intent(in) :: kernel_size + class(activation_function), intent(in) :: activation + type(locally_connected1d_layer) :: res + + res % kernel_size = kernel_size + res % filters = filters + res % activation_name = activation % get_name() + allocate(res % activation, source = activation) + end function locally_connected1d_layer_cons + + module subroutine init(self, input_shape) + implicit none + class(locally_connected1d_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + self % channels = input_shape(1) + self % width = input_shape(2) - self % kernel_size + 1 + + allocate(self % output(self % filters, self % width)) + self % output = 0 + + allocate(self % kernel(self % filters, self % width, self % channels, self % kernel_size)) + call random_normal(self % kernel) + self % kernel = self % kernel / real(self % kernel_size**2) + + allocate(self % biases(self % filters, self % width)) + self % biases = 0 + + allocate(self % z, mold=self % output) + self % z = 0 + + allocate(self % gradient(input_shape(1), input_shape(2))) + self % gradient = 0 + + allocate(self % dw, mold=self % kernel) + self % dw = 0 + + allocate(self % db, mold=self % biases) + self % db = 0 + end subroutine init + + pure module subroutine forward(self, input) + implicit none + class(locally_connected1d_layer), intent(in out) :: self + real, intent(in) :: input(:,:) + integer :: input_channels, input_width + integer :: j, n + integer :: iws, iwe + + input_channels = size(input, dim=1) + input_width = size(input, dim=2) + + do j = 1, self % width + iws = j + iwe = j + self % kernel_size - 1 + do n = 1, self % filters + self % z(n, j) = sum(self % kernel(n, j, :, :) * input(:, iws:iwe)) + self % biases(n, j) + end do + end do + self % output = self % activation % eval(self % z) + end subroutine forward + + pure module subroutine 
backward(self, input, gradient) + implicit none + class(locally_connected1d_layer), intent(in out) :: self + real, intent(in) :: input(:,:) + real, intent(in) :: gradient(:,:) + integer :: input_channels, input_width, output_width + integer :: j, n, k + integer :: iws, iwe + real :: gdz(self % filters, self % width) + real :: db_local(self % filters, self % width) + real :: dw_local(self % filters, self % width, self % channels, self % kernel_size) + + input_channels = size(input, dim=1) + input_width = size(input, dim=2) + output_width = self % width + + do j = 1, output_width + gdz(:, j) = gradient(:, j) * self % activation % eval_prime(self % z(:, j)) + end do + + do n = 1, self % filters + do j = 1, output_width + db_local(n, j) = gdz(n, j) + end do + end do + + dw_local = 0.0 + self % gradient = 0.0 + + do n = 1, self % filters + do j = 1, output_width + iws = j + iwe = j + self % kernel_size - 1 + do k = 1, self % channels + dw_local(n, j, k, :) = dw_local(n, j, k, :) + input(k, iws:iwe) * gdz(n, j) + self % gradient(k, iws:iwe) = self % gradient(k, iws:iwe) + self % kernel(n, j, k, :) * gdz(n, j) + end do + end do + end do + + self % dw = self % dw + dw_local + self % db = self % db + db_local + end subroutine backward + + pure module function get_num_params(self) result(num_params) + class(locally_connected1d_layer), intent(in) :: self + integer :: num_params + num_params = product(shape(self % kernel)) + product(shape(self % biases)) + end function get_num_params + + module function get_params(self) result(params) + class(locally_connected1d_layer), intent(in), target :: self + real, allocatable :: params(:) + params = [self % kernel, self % biases] + end function get_params + + module function get_gradients(self) result(gradients) + class(locally_connected1d_layer), intent(in), target :: self + real, allocatable :: gradients(:) + gradients = [self % dw, self % db] + end function get_gradients + + module subroutine set_params(self, params) + class(locally_connected1d_layer), intent(in out) :: self + real, intent(in) :: params(:) + + if (size(params) /= self % get_num_params()) then + error stop 'locally_connected1d_layer % set_params: Number of parameters does not match' + end if + + self % kernel = reshape(params(:product(shape(self % kernel))), shape(self % kernel)) + associate(n => product(shape(self % kernel))) + self % biases = reshape(params(n + 1 :), shape(self % biases)) + end associate + + end subroutine set_params + +end submodule nf_locally_connected1d_layer_submodule diff --git a/src/nf/nf_maxpool1d_layer.f90 b/src/nf/nf_maxpool1d_layer.f90 new file mode 100644 index 00000000..b9a14d07 --- /dev/null +++ b/src/nf/nf_maxpool1d_layer.f90 @@ -0,0 +1,69 @@ +module nf_maxpool1d_layer + !! This module provides the 1-d maxpooling layer. + + use nf_base_layer, only: base_layer + implicit none + + private + public :: maxpool1d_layer + + type, extends(base_layer) :: maxpool1d_layer + integer :: channels + integer :: width ! Length of the input along the pooling dimension + integer :: pool_size + integer :: stride + + ! Location (as input matrix indices) of the maximum value within each pooling region. + ! Dimensions: (channels, new_width) + integer, allocatable :: maxloc(:,:) + + ! Gradient for the input (same shape as the input). + real, allocatable :: gradient(:,:) + ! Output after pooling (dimensions: (channels, new_width)). 
+ real, allocatable :: output(:,:) + contains + procedure :: init + procedure :: forward + procedure :: backward + end type maxpool1d_layer + + interface maxpool1d_layer + pure module function maxpool1d_layer_cons(pool_size, stride) result(res) + !! `maxpool1d` constructor function. + integer, intent(in) :: pool_size + !! Width of the pooling window. + integer, intent(in) :: stride + !! Stride of the pooling window. + type(maxpool1d_layer) :: res + end function maxpool1d_layer_cons + end interface maxpool1d_layer + + interface + module subroutine init(self, input_shape) + !! Initialize the `maxpool1d` layer instance with an input shape. + class(maxpool1d_layer), intent(in out) :: self + !! `maxpool1d_layer` instance. + integer, intent(in) :: input_shape(:) + !! Array shape of the input layer, expected as (channels, width). + end subroutine init + + pure module subroutine forward(self, input) + !! Run a forward pass of the `maxpool1d` layer. + class(maxpool1d_layer), intent(in out) :: self + !! `maxpool1d_layer` instance. + real, intent(in) :: input(:,:) + !! Input data (output of the previous layer), with shape (channels, width). + end subroutine forward + + pure module subroutine backward(self, input, gradient) + !! Run a backward pass of the `maxpool1d` layer. + class(maxpool1d_layer), intent(in out) :: self + !! `maxpool1d_layer` instance. + real, intent(in) :: input(:,:) + !! Input data (output of the previous layer). + real, intent(in) :: gradient(:,:) + !! Gradient from the downstream layer, with shape (channels, pooled width). + end subroutine backward + end interface + +end module nf_maxpool1d_layer \ No newline at end of file diff --git a/src/nf/nf_maxpool1d_layer_submodule.f90 b/src/nf/nf_maxpool1d_layer_submodule.f90 new file mode 100644 index 00000000..336264f7 --- /dev/null +++ b/src/nf/nf_maxpool1d_layer_submodule.f90 @@ -0,0 +1,93 @@ +submodule(nf_maxpool1d_layer) nf_maxpool1d_layer_submodule + + implicit none + +contains + + pure module function maxpool1d_layer_cons(pool_size, stride) result(res) + implicit none + integer, intent(in) :: pool_size + integer, intent(in) :: stride + type(maxpool1d_layer) :: res + res % pool_size = pool_size + res % stride = stride + end function maxpool1d_layer_cons + + + module subroutine init(self, input_shape) + implicit none + class(maxpool1d_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + self % channels = input_shape(1) + self % width = input_shape(2) / self % stride + + allocate(self % maxloc(self % channels, self % width)) + self % maxloc = 0 + + allocate(self % gradient(input_shape(1),input_shape(2))) + self % gradient = 0 + + allocate(self % output(self % channels, self % width)) + self % output = 0 + + end subroutine init + + pure module subroutine forward(self, input) + implicit none + class(maxpool1d_layer), intent(in out) :: self + real, intent(in) :: input(:,:) + integer :: input_width + integer :: i, n + integer :: ii + integer :: iend + integer :: iextent + integer :: maxloc_x + + input_width = size(input, dim=2) + + iextent = input_width - mod(input_width, self % stride) + + ! Stride along the width of the input + stride_over_input: do concurrent(i = 1:iextent:self % stride) + + ! Index of the pooling layer + ii = i / self % stride + 1 + iend = i + self % pool_size - 1 + + maxpool_for_each_channel: do concurrent(n = 1:self % channels) + + ! 
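The pooled width comes straight from the `input_shape(2) / stride` integer division in `init` above, so a trailing partial window is dropped. A hedged sketch in the style of the layer tests, checking the inferred output shape:

```fortran
program maxpool1d_shape_sketch
  ! Sketch: initialize a maxpool1d layer from an input2d layer and inspect
  ! the inferred shape (channels stay, width is divided by the stride).
  use nf, only: input, maxpool1d, layer
  implicit none

  type(layer) :: input_layer, pool_layer

  input_layer = input(3, 31)        ! 3 channels, width 31
  pool_layer = maxpool1d(pool_size=2)
  call pool_layer % init(input_layer)

  ! Expected [3, 15]: 31 / 2 = 15 with integer division; the last sample is dropped.
  print '(a, 2i4)', 'maxpool1d layer shape:', pool_layer % layer_shape
end program maxpool1d_shape_sketch
```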
Get and store the location of the maximum value + maxloc_x = maxloc(input(n, i:iend), dim=1) + self % maxloc(n,ii) = maxloc_x + i - 1 + + self % output(n,ii) = input(n, self % maxloc(n,ii)) + + end do maxpool_for_each_channel + + end do stride_over_input + + end subroutine forward + + pure module subroutine backward(self, input, gradient) + implicit none + class(maxpool1d_layer), intent(in out) :: self + real, intent(in) :: input(:,:) + real, intent(in) :: gradient(:,:) + integer :: gradient_shape(2) + integer :: channels, width + integer :: i, n + + gradient_shape = shape(gradient) + channels = gradient_shape(1) + width = gradient_shape(2) + + ! The gradient of a max-pooling layer is assigned to the stored max locations + do concurrent(n = 1:channels, i = 1:width) + self % gradient(n, self % maxloc(n,i)) = gradient(n,i) + end do + + end subroutine backward + + +end submodule nf_maxpool1d_layer_submodule diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 5916924e..2bd7ce8c 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -32,17 +32,19 @@ module nf_network procedure, private :: evaluate_batch_1d procedure, private :: forward_1d + procedure, private :: forward_1d_int procedure, private :: forward_2d procedure, private :: forward_3d procedure, private :: predict_1d + procedure, private :: predict_1d_int procedure, private :: predict_2d procedure, private :: predict_3d procedure, private :: predict_batch_1d procedure, private :: predict_batch_3d generic :: evaluate => evaluate_batch_1d - generic :: forward => forward_1d, forward_2d, forward_3d - generic :: predict => predict_1d, predict_2d, predict_3d + generic :: forward => forward_1d, forward_1d_int, forward_2d, forward_3d + generic :: predict => predict_1d, predict_1d_int, predict_2d, predict_3d generic :: predict_batch => predict_batch_1d, predict_batch_3d end type network @@ -95,6 +97,12 @@ module subroutine forward_1d(self, input) !! 1-d input data end subroutine forward_1d + module subroutine forward_1d_int(self, input) + !! Same as `forward_1d` except `integer` + class(network), intent(in out) :: self + integer, intent(in) :: input(:) + end subroutine forward_1d_int + module subroutine forward_2d(self, input) !! Apply a forward pass through the network. !! @@ -137,6 +145,13 @@ module function predict_1d(self, input) result(res) !! Output of the network end function predict_1d + module function predict_1d_int(self, input) result(res) + !! Same as `predict_1d` except `integer` + class(network), intent(in out) :: self + integer, intent(in) :: input(:) + real, allocatable :: res(:) + end function predict_1d_int + module function predict_2d(self, input) result(res) !! Return the output of the network given the input 1-d array. class(network), intent(in out) :: self @@ -201,6 +216,7 @@ module integer function get_num_params(self) !! Network instance end function get_num_params + module function get_params(self) result(params) !! Get the network parameters (weights and biases). 
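With the `forward_1d_int` and `predict_1d_int` specifics added to the generics above, integer token indices can be fed to `forward` and `predict` directly when the first layer is an `embedding`. An illustrative sketch (it assumes `embedding` and `linear2d` are re-exported from `nf`, and it ends in `flatten` plus `dense` so that `predict` has a rank-1 output to return):

```fortran
program integer_predict_sketch
  ! Sketch: pass integer token indices straight to predict() when the network
  ! starts with an embedding layer (uses the new predict_1d_int specific).
  use nf, only: network, embedding, linear2d, flatten, dense, softmax
  implicit none

  type(network) :: net
  real, allocatable :: probs(:)
  integer, parameter :: sequence_length = 4, vocab_size = 50, model_dimension = 8

  net = network([ &
    embedding(sequence_length, vocab_size, model_dimension), &
    linear2d(model_dimension), &
    flatten(), &
    dense(3, activation=softmax()) &
  ])

  ! Token indices are integers in [1, vocab_size]; no real-valued encoding needed.
  probs = net % predict([7, 2, 41, 5])
  print '(a, 3f8.4)', 'class probabilities: ', probs
end program integer_predict_sketch
```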
class(network), intent(in) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index f344c5c5..1752f1f8 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -1,5 +1,6 @@ submodule(nf_network) nf_network_submodule + use nf_conv1d_layer, only: conv1d_layer use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer use nf_dropout_layer, only: dropout_layer @@ -7,12 +8,17 @@ use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer use nf_input3d_layer, only: input3d_layer + use nf_locally_connected1d_layer, only: locally_connected1d_layer + use nf_maxpool1d_layer, only: maxpool1d_layer use nf_maxpool2d_layer, only: maxpool2d_layer + use nf_reshape2d_layer, only: reshape2d_layer use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_embedding_layer, only: embedding_layer + use nf_layernorm_layer, only: layernorm_layer use nf_layer, only: layer - use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape + use nf_layer_constructors, only: conv1d, conv2d, dense, flatten, input, maxpool1d, maxpool2d, reshape, reshape2d use nf_loss, only: quadratic use nf_optimizers, only: optimizer_base_type, sgd use nf_parallel, only: tile_indices @@ -46,7 +52,7 @@ module function network_from_layers(layers) result(res) error stop 'Error: A network must have at least 2 layers.' ! The first layer must be an input layer - if (.not. layers(1) % name == 'input') & + if (.not. layers(1) % name == 'input' .and. .not. layers(1) % name == 'embedding') & error stop 'Error: First layer in the network must be an input layer.' !TODO Ensure that the layers are in allowed sequence: @@ -73,12 +79,24 @@ module function network_from_layers(layers) result(res) type is(conv2d_layer) res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] n = n + 1 + type is(locally_connected1d_layer) + res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] + n = n + 1 type is(maxpool2d_layer) res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] n = n + 1 type is(reshape3d_layer) res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] n = n + 1 + type is(maxpool1d_layer) + res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] + n = n + 1 + type is(conv1d_layer) + res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] + n = n + 1 + type is(reshape2d_layer) + res % layers = [res % layers(:n-1), flatten(), res % layers(n:)] + n = n + 1 class default n = n + 1 end select @@ -147,7 +165,6 @@ module subroutine backward(self, output, loss) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(conv2d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) - type is(flatten_layer) if (size(self % layers(n) % layer_shape) == 2) then call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient_2d) @@ -156,13 +173,22 @@ module subroutine backward(self, output, loss) end if type is(maxpool2d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) - type is(reshape3d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(linear2d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(self_attention_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type 
is(maxpool1d_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type is(reshape2d_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type is(conv1d_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type is(locally_connected1d_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type is(layernorm_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) end select end if @@ -207,8 +233,9 @@ module subroutine forward_1d(self, input) integer :: n ! Set the input array into the input layer - select type(input_layer => self % layers(1) % p); type is(input1d_layer) - call input_layer % set(input) + select type(input_layer => self % layers(1) % p) + type is(input1d_layer) + call input_layer % set(input) end select do n = 2, size(self % layers) @@ -217,6 +244,21 @@ module subroutine forward_1d(self, input) end subroutine forward_1d + module subroutine forward_1d_int(self, input) + class(network), intent(in out) :: self + integer, intent(in) :: input(:) + integer :: n + + select type(input_layer => self % layers(1) % p) + type is(embedding_layer) + call input_layer % forward(input) + end select + + do n = 2, size(self % layers) + call self % layers(n) % forward(self % layers(n - 1)) + end do + + end subroutine forward_1d_int module subroutine forward_2d(self, input) class(network), intent(in out) :: self @@ -281,6 +323,31 @@ module function predict_1d(self, input) result(res) end function predict_1d + module function predict_1d_int(self, input) result(res) + class(network), intent(in out) :: self + integer, intent(in) :: input(:) + real, allocatable :: res(:) + integer :: n, num_layers + + num_layers = size(self % layers) + + call self % set_training_mode(.false.) + call self % forward(input) + call self % set_training_mode(.true.) 
+ + select type(output_layer => self % layers(num_layers) % p) + type is(dense_layer) + res = output_layer % output + type is(dropout_layer) + res = output_layer % output + type is(flatten_layer) + res = output_layer % output + class default + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' + end select + + end function predict_1d_int module function predict_2d(self, input) result(res) class(network), intent(in out) :: self @@ -438,7 +505,6 @@ module function get_num_params(self) end function get_num_params - module function get_params(self) result(params) class(network), intent(in) :: self real, allocatable :: params(:) @@ -458,7 +524,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) class(network), intent(in) :: self real, allocatable :: gradients(:) @@ -618,6 +683,12 @@ module subroutine update(self, optimizer, batch_size) type is(conv2d_layer) call co_sum(this_layer % dw) call co_sum(this_layer % db) + type is(conv1d_layer) + call co_sum(this_layer % dw) + call co_sum(this_layer % db) + type is(locally_connected1d_layer) + call co_sum(this_layer % dw) + call co_sum(this_layer % db) end select end do #endif @@ -635,6 +706,12 @@ module subroutine update(self, optimizer, batch_size) type is(conv2d_layer) this_layer % dw = 0 this_layer % db = 0 + type is(conv1d_layer) + this_layer % dw = 0 + this_layer % db = 0 + type is(locally_connected1d_layer) + this_layer % dw = 0 + this_layer % db = 0 end select end do diff --git a/src/nf/nf_reshape2d_layer.f90 b/src/nf/nf_reshape2d_layer.f90 new file mode 100644 index 00000000..6b99729b --- /dev/null +++ b/src/nf/nf_reshape2d_layer.f90 @@ -0,0 +1,77 @@ +module nf_reshape2d_layer + + !! This module provides the concrete reshape layer type. + !! It is used internally by the layer type. + !! It is not intended to be used directly by the user. + + use nf_base_layer, only: base_layer + + implicit none + + private + public :: reshape2d_layer + + type, extends(base_layer) :: reshape2d_layer + + !! Concrete implementation of a reshape layer type + !! It implements only rank-1 to rank-2 reshaping. + + integer :: input_shape(1) + integer :: output_shape(2) + real, allocatable :: gradient(:) + real, allocatable :: output(:,:) + + contains + + procedure :: backward + procedure :: forward + procedure :: init + + end type reshape2d_layer + + interface reshape2d_layer + pure module function reshape2d_layer_cons(output_shape) result(res) + !! This function returns the `reshape_layer` instance. + integer, intent(in) :: output_shape(2) + !! The shape of the output + type(reshape2d_layer) :: res + !! reshape_layer instance + end function reshape2d_layer_cons + end interface reshape2d_layer + + interface + + pure module subroutine backward(self, input, gradient) + !! Apply the backward pass for the reshape2d layer. + !! This is just flattening to a rank-1 array. + class(reshape2d_layer), intent(in out) :: self + !! Dense layer instance + real, intent(in) :: input(:) + !! Input from the previous layer + real, intent(in) :: gradient(:,:) + !! Gradient from the next layer + end subroutine backward + + pure module subroutine forward(self, input) + !! Apply the forward pass for the reshape2d layer. + !! This is just a reshape from rank-1 to rank-2 array. + class(reshape2d_layer), intent(in out) :: self + !! Dense layer instance + real, intent(in) :: input(:) + !! 
Input from the previous layer + end subroutine forward + + module subroutine init(self, input_shape) + !! Initialize the layer data structures. + !! + !! This is a deferred procedure from the `base_layer` abstract type. + class(reshape2d_layer), intent(in out) :: self + !! Dense layer instance + integer, intent(in) :: input_shape(:) + !! Shape of the input layer + end subroutine init + + end interface + + end module nf_reshape2d_layer + \ No newline at end of file diff --git a/src/nf/nf_reshape2d_layer_submodule.f90 b/src/nf/nf_reshape2d_layer_submodule.f90 new file mode 100644 index 00000000..487d5cb8 --- /dev/null +++ b/src/nf/nf_reshape2d_layer_submodule.f90 @@ -0,0 +1,50 @@ +submodule(nf_reshape2d_layer) nf_reshape2d_layer_submodule + + use nf_base_layer, only: base_layer + + implicit none + +contains + + pure module function reshape2d_layer_cons(output_shape) result(res) + integer, intent(in) :: output_shape(2) + type(reshape2d_layer) :: res + res % output_shape = output_shape + end function reshape2d_layer_cons + + + pure module subroutine backward(self, input, gradient) + class(reshape2d_layer), intent(in out) :: self + real, intent(in) :: input(:) + real, intent(in) :: gradient(:,:) + ! The `input` dummy argument is not used but nevertheless declared + ! because the abstract type requires it. + self % gradient = pack(gradient, .true.) + end subroutine backward + + + pure module subroutine forward(self, input) + class(reshape2d_layer), intent(in out) :: self + real, intent(in) :: input(:) + self % output = reshape(input, self % output_shape) + end subroutine forward + + + module subroutine init(self, input_shape) + class(reshape2d_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + self % input_shape = input_shape + + allocate(self % gradient(input_shape(1))) + self % gradient = 0 + + allocate(self % output( & + self % output_shape(1), & + self % output_shape(2) & + )) + self % output = 0 + + end subroutine init + +end submodule nf_reshape2d_layer_submodule diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 741e9930..ec4e139e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,14 +6,21 @@ foreach(execid linear2d_layer parametric_activation dense_layer + conv1d_layer conv2d_layer + locally_connected1d_layer + maxpool1d_layer maxpool2d_layer flatten_layer insert_flatten reshape_layer + reshape2d_layer multihead_attention_layer + embedding_layer + layernorm dense_network get_set_network_params + conv1d_network conv2d_network optimizers loss diff --git a/test/test_conv1d_layer.f90 b/test/test_conv1d_layer.f90 new file mode 100644 index 00000000..81d03c1f --- /dev/null +++ b/test/test_conv1d_layer.f90 @@ -0,0 +1,77 @@ +program test_conv1d_layer + + use iso_fortran_env, only: stderr => error_unit + use nf, only: conv1d, input, layer + use nf_input2d_layer, only: input2d_layer + + implicit none + + type(layer) :: conv1d_layer, input_layer + integer, parameter :: filters = 32, kernel_size=3 + real, allocatable :: sample_input(:,:), output(:,:) + real, parameter :: tolerance = 1e-7 + logical :: ok = .true. + + conv1d_layer = conv1d(filters, kernel_size) + + if (.not. conv1d_layer % name == 'conv1d') then + ok = .false. + write(stderr, '(a)') 'conv1d layer has its name set correctly.. failed' + end if + + if (conv1d_layer % initialized) then + ok = .false. + write(stderr, '(a)') 'conv1d layer should not be marked as initialized yet.. failed' + end if + + if (.not. conv1d_layer % activation == 'relu') then + ok = .false. 
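The `forward` and `backward` implementations above are exact inverses: `reshape` lays the rank-1 input out in column-major order, and `pack(gradient, .true.)` flattens it back in the same order. A tiny standalone sketch of that round trip using only the intrinsics:

```fortran
program reshape2d_roundtrip
  ! Demonstrates the column-major round trip used by the reshape2d layer:
  ! forward  : output   = reshape(input, output_shape)
  ! backward : gradient = pack(gradient_2d, .true.)
  implicit none

  real :: x(6), x_back(6)
  real :: y(2, 3)
  integer :: i

  x = [(real(i), i = 1, 6)]

  y = reshape(x, [2, 3])    ! column-major: y(:,1) = [1, 2], y(:,2) = [3, 4], ...
  x_back = pack(y, .true.)  ! flattens in the same column-major order

  print '(a, 6f5.1)', 'original :', x
  print '(a, 6f5.1)', 'restored :', x_back
end program reshape2d_roundtrip
```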
+    write(stderr, '(a)') 'conv1d layer defaults to relu activation.. failed'
+  end if
+
+  input_layer = input(3, 32)
+  call conv1d_layer % init(input_layer)
+
+  if (.not. conv1d_layer % initialized) then
+    ok = .false.
+    write(stderr, '(a)') 'conv1d layer should now be marked as initialized.. failed'
+  end if
+
+  if (.not. all(conv1d_layer % input_layer_shape == [3, 32])) then
+    ok = .false.
+    write(stderr, '(a)') 'conv1d layer input layer shape should be correct.. failed'
+  end if
+
+  if (.not. all(conv1d_layer % layer_shape == [filters, 30])) then
+    ok = .false.
+    write(stderr, '(a)') 'conv1d layer output layer shape should be correct.. failed'
+  end if
+
+  ! Minimal conv1d layer: 1 channel, length-3 input
+  allocate(sample_input(1, 3))
+  sample_input = 0
+
+  input_layer = input(1, 3)
+  conv1d_layer = conv1d(filters, kernel_size)
+  call conv1d_layer % init(input_layer)
+
+  select type(this_layer => input_layer % p); type is(input2d_layer)
+    call this_layer % set(sample_input)
+  end select
+
+  call conv1d_layer % forward(input_layer)
+  call conv1d_layer % get_output(output)
+
+  if (.not. all(abs(output) < tolerance)) then
+    ok = .false.
+    write(stderr, '(a)') 'conv1d layer with zero input must forward to all zeros.. failed'
+  end if
+
+  if (ok) then
+    print '(a)', 'test_conv1d_layer: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_conv1d_layer: One or more tests failed.'
+    stop 1
+  end if
+
+end program test_conv1d_layer
diff --git a/test/test_conv1d_network.f90 b/test/test_conv1d_network.f90
new file mode 100644
index 00000000..5a353cf9
--- /dev/null
+++ b/test/test_conv1d_network.f90
@@ -0,0 +1,154 @@
+program test_conv1d_network
+
+  use iso_fortran_env, only: stderr => error_unit
+  use nf, only: conv1d, input, network, dense, sgd, maxpool1d
+
+  implicit none
+
+  type(network) :: net
+  real, allocatable :: sample_input(:,:), output(:,:)
+  logical :: ok = .true.
+
+  ! 3-layer convolutional network
+  net = network([ &
+    input(3, 32), &
+    conv1d(filters=16, kernel_size=3), &
+    conv1d(filters=32, kernel_size=3) &
+  ])
+
+  if (.not. size(net % layers) == 3) then
+    write(stderr, '(a)') 'conv1d network should have 3 layers.. failed'
+    ok = .false.
+  end if
+
+  ! Test for output shape
+  allocate(sample_input(3, 32))
+  sample_input = 0
+
+  call net % forward(sample_input)
+  call net % layers(3) % get_output(output)
+
+  if (.not. all(shape(output) == [32, 28])) then
+    write(stderr, '(a)') 'conv1d network output should have correct shape.. failed'
+    ok = .false.
+  end if
+
+  deallocate(sample_input, output)
+
+  training1: block
+
+    type(network) :: cnn
+    real :: y(1)
+    real :: tolerance = 1e-4
+    integer :: n
+    integer, parameter :: num_iterations = 1000
+
+    ! Test training of a minimal constant mapping
+    allocate(sample_input(1, 5))
+    call random_number(sample_input)
+
+    cnn = network([ &
+      input(1, 5), &
+      conv1d(filters=1, kernel_size=3), &
+      conv1d(filters=1, kernel_size=3), &
+      dense(1) &
+    ])
+
+    y = [0.1234567]
+
+    do n = 1, num_iterations
+      call cnn % forward(sample_input)
+      call cnn % backward(y)
+      call cnn % update(optimizer=sgd(learning_rate=1.))
+
+      if (all(abs(cnn % predict(sample_input) - y) < tolerance)) exit
+    end do
+
+    if (.not. n <= num_iterations) then
+      write(stderr, '(a)') &
+        'convolutional network 1 should converge in simple training.. failed'
+      ok = .false.
+    end if
+
+  end block training1
+
+  training2: block
+
+    type(network) :: cnn
+    real :: x(1, 8)
+    real :: y(1)
+    real :: tolerance = 1e-4
+    integer :: n
+    integer, parameter :: num_iterations = 1000
+
+    call random_number(x)
+    y = [0.1234567]
+
+    cnn = network([ &
+      input(1, 8), &
+      conv1d(filters=1, kernel_size=3), &
+      maxpool1d(pool_size=2), &
+      conv1d(filters=1, kernel_size=3), &
+      dense(1) &
+    ])
+
+    do n = 1, num_iterations
+      call cnn % forward(x)
+      call cnn % backward(y)
+      call cnn % update(optimizer=sgd(learning_rate=1.))
+      if (all(abs(cnn % predict(x) - y) < tolerance)) exit
+    end do
+
+    if (.not. n <= num_iterations) then
+      write(stderr, '(a)') &
+        'convolutional network 2 should converge in simple training.. failed'
+      ok = .false.
+    end if
+
+  end block training2
+
+  training3: block
+
+    type(network) :: cnn
+    real :: x(1, 12)
+    real :: y(9)
+    real :: tolerance = 1e-4
+    integer :: n
+    integer, parameter :: num_iterations = 5000
+
+    call random_number(x)
+    y = [0.12345, 0.23456, 0.34567, 0.45678, 0.56789, 0.67890, 0.78901, 0.89012, 0.90123]
+
+    cnn = network([ &
+      input(1, 12), &
+      conv1d(filters=1, kernel_size=3), & ! 1x12 input, 1x10 output
+      maxpool1d(pool_size=2), &           ! 1x10 input, 1x5 output
+      conv1d(filters=1, kernel_size=3), & ! 1x5 input, 1x3 output
+      dense(9) &                          ! 9 outputs
+    ])
+
+    do n = 1, num_iterations
+      call cnn % forward(x)
+      call cnn % backward(y)
+      call cnn % update(optimizer=sgd(learning_rate=1.))
+      if (all(abs(cnn % predict(x) - y) < tolerance)) exit
+    end do
+
+    if (.not. n <= num_iterations) then
+      write(stderr, '(a)') &
+        'convolutional network 3 should converge in simple training.. failed'
+      ok = .false.
+    end if
+
+  end block training3
+
+
+  if (ok) then
+    print '(a)', 'test_conv1d_network: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_conv1d_network: One or more tests failed.'
+    stop 1
+  end if
+
+end program test_conv1d_network
\ No newline at end of file
diff --git a/test/test_conv2d_network.f90 b/test/test_conv2d_network.f90
index 1bdfc677..73c4595a 100644
--- a/test/test_conv2d_network.f90
+++ b/test/test_conv2d_network.f90
@@ -60,6 +60,7 @@ program test_conv2d_network
       call cnn % forward(sample_input)
       call cnn % backward(y)
       call cnn % update(optimizer=sgd(learning_rate=1.))
+      if (all(abs(cnn % predict(sample_input) - y) < tolerance)) exit
     end do
diff --git a/test/test_embedding_layer.f90 b/test/test_embedding_layer.f90
new file mode 100644
index 00000000..99b7fca6
--- /dev/null
+++ b/test/test_embedding_layer.f90
@@ -0,0 +1,133 @@
+program test_embedding_layer
+  use iso_fortran_env, only: stderr => error_unit
+  use nf_embedding_layer, only: embedding_layer
+  use nf_layer, only: layer
+  use nf_layer_constructors, only: embedding_constructor => embedding
+  implicit none
+
+  logical :: ok = .true.
+  integer :: sample_input(3) = [2, 1, 3]
+
+  call test_simple(ok, sample_input)
+  call test_positional_trigonometric(ok, sample_input)
+  call test_positional_absolute(ok, sample_input)
+
+  if (ok) then
+    print '(a)', 'test_embedding_layer: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_embedding_layer: One or more tests failed.'
+ error stop 1 + end if + +contains + subroutine test_simple(ok, sample_input) + logical, intent(in out) :: ok + integer, intent(in) :: sample_input(:) + + real :: sample_gradient(3, 2) = reshape([0.1, 0.2, 0.3, 0.4, 0.6, 0.6], [3, 2]) + real :: output_flat(6) + real :: expected_output_flat(6) = reshape([0.3, 0.1, 0.5, 0.4, 0.2, 0.6], [6]) + real :: dw_flat(8) + real :: expected_dw_flat(8) = reshape([0.2, 0.1, 0.3, 0., 0.6, 0.4, 0.6, 0.], [8]) + type(embedding_layer) :: embedding + + embedding = embedding_layer(vocab_size=4, model_dimension=2) + call embedding % init([3]) + embedding % weights = reshape([0.1, 0.3, 0.5, 0.7, 0.2, 0.4, 0.6, 0.8], [4, 2]) + + call embedding % forward(sample_input) + + output_flat = reshape(embedding % output, [6]) + if (.not. all(output_flat.eq.expected_output_flat)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect values.. failed' + end if + + call embedding % backward(sample_input, sample_gradient) + dw_flat = reshape(embedding % dw, shape(dw_flat)) + if (.not. all(dw_flat.eq.expected_dw_flat)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect dw values.. failed' + end if + end subroutine test_simple + + subroutine test_positional_trigonometric(ok, sample_input) + logical, intent(in out) :: ok + integer, intent(in) :: sample_input(:) + + real :: output_flat(12) + real :: expected_output_flat(12) = reshape([& + 0.3, 0.941471, 1.4092975,& + 1.3, 0.64030236, 0.08385316,& + 0.3, 0.10999984, 0.51999867,& + 1.3, 1.09995, 1.4998& + ], [12]) + type(embedding_layer) :: embedding + + real :: theta + integer :: i, pos + + embedding = embedding_layer(vocab_size=5, model_dimension=4, positional=1) + call embedding % init([3]) + embedding % weights = reshape([& + 0.1, 0.3, 0.5, 0.7, 0.2,& + 0.1, 0.3, 0.5, 0.7, 0.2,& + 0.1, 0.3, 0.5, 0.7, 0.2,& + 0.1, 0.3, 0.5, 0.7, 0.2& + ], [5, 4]) + + call embedding % forward(sample_input) + + output_flat = reshape(embedding % output, [12]) + if (.not. all(abs(output_flat - expected_output_flat) <= (1e-06 + 1e-05 * abs(expected_output_flat)))) then + ok = .false. + write(stderr, '(a)') 'trigonometric positional encoding returned incorrect values.. failed' + end if + end subroutine test_positional_trigonometric + + subroutine test_positional_absolute(ok, sample_input) + logical, intent(in out) :: ok + integer, intent(in) :: sample_input(:) + + real :: output_flat(12) + real :: expected_output_flat(12) = reshape([& + 0.3, 1.1, 2.5,& + 0.3, 1.1, 2.5,& + 0.3, 1.1, 2.5,& + 0.3, 1.1, 2.5& + ], [12]) + type(embedding_layer) :: embedding + + real :: theta + integer :: i, pos + + embedding = embedding_layer(vocab_size=5, model_dimension=4, positional=2) + call embedding % init([3]) + embedding % weights = reshape([& + 0.1, 0.3, 0.5, 0.7, 0.2,& + 0.1, 0.3, 0.5, 0.7, 0.2,& + 0.1, 0.3, 0.5, 0.7, 0.2,& + 0.1, 0.3, 0.5, 0.7, 0.2& + ], [5, 4]) + + call embedding % forward(sample_input) + + output_flat = reshape(embedding % output, [12]) + if (.not. all(abs(output_flat - expected_output_flat) <= (1e-06 + 1e-05 * abs(expected_output_flat)))) then + ok = .false. + write(stderr, '(a)') 'absolute positional encoding returned incorrect values.. 
failed' + end if + end subroutine test_positional_absolute + + subroutine test_embedding_constructor(ok, sample_input) + logical, intent(in out) :: ok + integer, intent(in) :: sample_input(:) + + type(layer) :: embedding_constructed + + embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4) + embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4, positional=0) + embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4, positional=1) + embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4, positional=2) + end subroutine test_embedding_constructor +end program test_embedding_layer diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 new file mode 100644 index 00000000..6a897575 --- /dev/null +++ b/test/test_layernorm.f90 @@ -0,0 +1,193 @@ +program test_layernorm_instance + use iso_fortran_env, only: stderr => error_unit + use nf_layernorm_layer, only: layernorm_layer + use nf_linear2d_layer, only: linear2d_layer + use nf_layer, only: layer + use nf, only: sgd, layernorm, network, input, flatten, linear2d + implicit none + + logical :: ok = .true. + type(layernorm_layer) :: layernorm_instance + real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) + real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) + + layernorm_instance = layernorm_layer() + call layernorm_instance % init([3, 4]) + + call test_layernorm_forward(layernorm_instance, sample_input, ok) + call test_layernorm_backward(layernorm_instance, sample_input, sample_gradient, ok) + call test_layernorm_gradients(sample_input, sample_gradient, ok) + call test_layernorm_integration(ok) + + if (ok) then + print '(a)', 'test_layernorm_layer: All tests passed.' + else + write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.' + error stop 1 + end if + +contains + function allclose(x, y) result(res) + real, intent(in) :: x(:) + real, intent(in) :: y(:) + logical :: res + + res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + end function allclose + + subroutine test_layernorm_forward(layernorm_instance, input, ok) + type(layernorm_layer), intent(in out) :: layernorm_instance + real, intent(in out) :: input(:, :) + logical, intent(in out) :: ok + real :: output_shape(2) + real :: output_flat(12) + real :: expected_shape(2) = [3, 4] + real :: expected_output_flat(12) = [& + -0.693158746, 0.939844191, -0.992156327, 1.72702277, -0.970368207, 0.971188426,& + -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564& + ] + + call layernorm_instance % forward(input) + + output_shape = shape(layernorm_instance % output) + if (.not. all(output_shape.eq.expected_shape)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect shape.. failed' + end if + output_flat = reshape(layernorm_instance % output, shape(output_flat)) + if (.not. allclose(output_flat, expected_output_flat)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect values.. 
failed' + end if + end subroutine test_layernorm_forward + + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) + type(layernorm_layer), intent(in out) :: layernorm_instance + real, intent(in out) :: input(:, :) + real, intent(in out) :: gradient(:, :) + logical, intent(in out) :: ok + + real :: gradient_shape(2) + real :: gradient_flat(12) + real :: expected_gradient_shape(2) = [3, 4] + real :: expected_gradient_flat(12) = [& + -0.227230772, 0.103088334, -9.88590196E-02, -2.86390483E-02, 0.283811331, 0.277955681,& + -0.215662330, -0.105019525, -0.269407451, 0.471532196, -0.281880081, 9.03107598E-02& + ] + + real :: d_gamma(4) + real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449] + real :: d_beta(4) + real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1] + + call layernorm_instance % backward(input, gradient) + + gradient_shape = shape(layernorm_instance % gradient) + if (.not. all(gradient_shape.eq.expected_gradient_shape)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' + end if + gradient_flat = reshape(layernorm_instance % gradient, shape(gradient_flat)) + if (.not. allclose(gradient_flat, expected_gradient_flat)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' + end if + + if (.not. allclose(layernorm_instance % d_gamma, expected_d_gamma)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' + end if + if (.not. allclose(layernorm_instance % d_beta, expected_d_beta)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect d_beta values.. failed' + end if + end subroutine test_layernorm_backward + + subroutine test_layernorm_gradients(input, gradient, ok) + real, intent(in out) :: input(:, :) + real, intent(in out) :: gradient(:, :) + logical, intent(in out) :: ok + type(layernorm_layer) :: layernorm_instance + type(sgd) :: optim + + real :: parameters(8) + real :: expected_parameters(8) + real :: updated_output(12) + real :: expected_updated_output(12) = [& + -0.738849819, 0.881645918, -1.03555739,& + 1.66299772, -1.02966857, 0.908487320,& + -0.562230229, 1.01311040, 0.984123051,& + -0.564699769, -1.13543355, -1.11444426& + ] + + layernorm_instance = layernorm_layer() + call layernorm_instance % init([3, 4]) + + call layernorm_instance % forward(input) + call layernorm_instance % backward(input, gradient) + + if (layernorm_instance % get_num_params() /= 8) then + ok = .false. + write(stderr, '(a)') 'incorrect number of parameters.. failed' + end if + + expected_parameters(1: 4) = 1. + expected_parameters(5: 8) = 0. + parameters = layernorm_instance % get_params() + if (.not. all(parameters.eq.expected_parameters)) then + ok = .false. + write(stderr, '(a)') 'incorrect parameters.. failed' + end if + + optim = SGD(learning_rate=0.01) + call optim % minimize(parameters, layernorm_instance % get_gradients()) + call layernorm_instance % set_params(parameters) + + call layernorm_instance % forward(input) + + updated_output = reshape(layernorm_instance % output, [12]) + if (.not. allclose(updated_output, expected_updated_output)) then + ok = .false. + write(stderr, '(a)') 'incorrect output after parameters update.. 
failed'
+    end if
+  end subroutine test_layernorm_gradients
+
+  subroutine test_layernorm_integration(ok)
+    logical, intent(in out) :: ok
+
+    type(network) :: net
+    real :: x(2, 3) = reshape([0.1, 2., 0.3, 4., 0.5, 6.], [2, 3])
+    real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]
+    real :: tolerance = 0.1
+    integer :: epoch
+    integer :: epochs = 10000
+
+    net = network([&
+      input(2, 3),&
+      linear2d(3),&
+      layernorm(),&
+      flatten()&
+    ])
+
+    ! Kaiming weights to achieve a semblance of convergence
+    select type(l => net % layers(2) % p)
+      type is(linear2d_layer)
+        call random_number(l % weights)
+        l % weights = l % weights * sqrt(2. / 6.)
+        l % biases = 0.2
+    end select
+
+    do epoch = 1, epochs
+      call net % forward(x)
+      call net % backward(y)
+      call net % update(optimizer=sgd(learning_rate=0.001))
+      if (all(abs(net % predict(x) - y) < tolerance)) exit
+    end do
+
+    if (.not. epoch <= epochs) then
+      write(stderr, '(a)') &
+        'linear2d + layernorm should converge in simple training.. failed'
+      ok = .false.
+    end if
+  end subroutine test_layernorm_integration
+end program test_layernorm_instance
diff --git a/test/test_locally_connected1d_layer.f90 b/test/test_locally_connected1d_layer.f90
new file mode 100644
index 00000000..e8a30cfc
--- /dev/null
+++ b/test/test_locally_connected1d_layer.f90
@@ -0,0 +1,78 @@
+program test_locally_connected1d_layer
+
+  use iso_fortran_env, only: stderr => error_unit
+  use nf, only: locally_connected1d, input, layer
+  use nf_input2d_layer, only: input2d_layer
+
+  implicit none
+
+  type(layer) :: locally_connected_1d_layer, input_layer
+  integer, parameter :: filters = 32, kernel_size=3
+  real, allocatable :: sample_input(:,:), output(:,:)
+  real, parameter :: tolerance = 1e-7
+  logical :: ok = .true.
+
+  locally_connected_1d_layer = locally_connected1d(filters, kernel_size)
+
+  if (.not. locally_connected_1d_layer % name == 'locally_connected1d') then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer has its name set correctly.. failed'
+  end if
+
+  if (locally_connected_1d_layer % initialized) then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer should not be marked as initialized yet.. failed'
+  end if
+
+  if (.not. locally_connected_1d_layer % activation == 'relu') then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer defaults to relu activation.. failed'
+  end if
+
+  input_layer = input(3, 32)
+  call locally_connected_1d_layer % init(input_layer)
+
+  if (.not. locally_connected_1d_layer % initialized) then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer should now be marked as initialized.. failed'
+  end if
+
+  if (.not. all(locally_connected_1d_layer % input_layer_shape == [3, 32])) then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer input layer shape should be correct.. failed'
+  end if
+
+  if (.not. all(locally_connected_1d_layer % layer_shape == [filters, 30])) then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer output layer shape should be correct.. failed'
+  end if
+
+  ! Minimal locally_connected1d layer: 1 channel, length-3 input
+  allocate(sample_input(1, 3))
+  sample_input = 0
+
+  input_layer = input(1, 3)
+  locally_connected_1d_layer = locally_connected1d(filters, kernel_size)
+  call locally_connected_1d_layer % init(input_layer)
+
+  select type(this_layer => input_layer % p); type is(input2d_layer)
+    call this_layer % set(sample_input)
+  end select
+
+  call locally_connected_1d_layer % forward(input_layer)
+  call locally_connected_1d_layer % get_output(output)
+
+  if (.not. all(abs(output) < tolerance)) then
+    ok = .false.
+    write(stderr, '(a)') 'locally_connected1d layer with zero input must forward to all zeros.. failed'
+  end if
+
+  if (ok) then
+    print '(a)', 'test_locally_connected1d_layer: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_locally_connected1d_layer: One or more tests failed.'
+    stop 1
+  end if
+
+end program test_locally_connected1d_layer
diff --git a/test/test_maxpool1d_layer.f90 b/test/test_maxpool1d_layer.f90
new file mode 100644
index 00000000..023a2c33
--- /dev/null
+++ b/test/test_maxpool1d_layer.f90
@@ -0,0 +1,95 @@
+program test_maxpool1d_layer
+
+  use iso_fortran_env, only: stderr => error_unit
+  use nf, only: maxpool1d, input, layer
+  use nf_input2d_layer, only: input2d_layer
+  use nf_maxpool1d_layer, only: maxpool1d_layer
+
+  implicit none
+
+  type(layer) :: maxpool_layer, input_layer
+  integer, parameter :: pool_size = 2, stride = 2
+  integer, parameter :: channels = 3, length = 32
+  integer, parameter :: input_shape(2) = [channels, length]
+  integer, parameter :: output_shape(2) = [channels, length / 2]
+  real, allocatable :: sample_input(:,:), output(:,:), gradient(:,:)
+  integer :: i
+  logical :: ok = .true., gradient_ok = .true.
+
+  maxpool_layer = maxpool1d(pool_size)
+
+  if (.not. maxpool_layer % name == 'maxpool1d') then
+    ok = .false.
+    write(stderr, '(a)') 'maxpool1d layer has its name set correctly.. failed'
+  end if
+
+  if (maxpool_layer % initialized) then
+    ok = .false.
+    write(stderr, '(a)') 'maxpool1d layer should not be marked as initialized yet.. failed'
+  end if
+
+  input_layer = input(channels, length)
+  call maxpool_layer % init(input_layer)
+
+  if (.not. maxpool_layer % initialized) then
+    ok = .false.
+    write(stderr, '(a)') 'maxpool1d layer should now be marked as initialized.. failed'
+  end if
+
+  if (.not. all(maxpool_layer % input_layer_shape == input_shape)) then
+    ok = .false.
+    write(stderr, '(a)') 'maxpool1d layer input layer shape should be correct.. failed'
+  end if
+
+  if (.not. all(maxpool_layer % layer_shape == output_shape)) then
+    ok = .false.
+    write(stderr, '(a)') 'maxpool1d layer output layer shape should be correct.. failed'
+  end if
+
+  ! Allocate and initialize sample input data
+  allocate(sample_input(channels, length))
+  do concurrent(i = 1:length)
+    sample_input(:,i) = i
+  end do
+
+  select type(this_layer => input_layer % p); type is(input2d_layer)
+    call this_layer % set(sample_input)
+  end select
+
+  call maxpool_layer % forward(input_layer)
+  call maxpool_layer % get_output(output)
+
+  do i = 1, length / 2
+    if (.not. all(output(:,i) == stride * i)) then
+      ok = .false.
+      write(stderr, '(a)') 'maxpool1d layer forward pass correctly propagates the max value.. failed'
+    end if
+  end do
+
+  ! Test the backward pass
+  allocate(gradient, source=output)
+  call maxpool_layer % backward(input_layer, gradient)
+
+  select type(this_layer => maxpool_layer % p); type is(maxpool1d_layer)
+    do i = 1, length
+      if (mod(i,2) == 0) then
+        if (.not. 
all(sample_input(:,i) == this_layer % gradient(:,i))) gradient_ok = .false. + else + if (.not. all(this_layer % gradient(:,i) == 0)) gradient_ok = .false. + end if + end do + end select + + if (.not. gradient_ok) then + ok = .false. + write(stderr, '(a)') 'maxpool1d layer backward pass produces the correct dL/dx.. failed' + end if + + if (ok) then + print '(a)', 'test_maxpool1d_layer: All tests passed.' + else + write(stderr, '(a)') 'test_maxpool1d_layer: One or more tests failed.' + stop 1 + end if + +end program test_maxpool1d_layer diff --git a/test/test_reshape2d_layer.f90 b/test/test_reshape2d_layer.f90 new file mode 100644 index 00000000..52817eac --- /dev/null +++ b/test/test_reshape2d_layer.f90 @@ -0,0 +1,55 @@ +program test_reshape2d_layer + + use iso_fortran_env, only: stderr => error_unit + use nf, only: input, network, reshape2d_layer => reshape2d + use nf_datasets, only: download_and_unpack, keras_reshape_url + + implicit none + + type(network) :: net + real, allocatable :: sample_input(:), output(:,:) + integer, parameter :: output_shape(2) = [4,4] + integer, parameter :: input_size = product(output_shape) + character(*), parameter :: keras_reshape_path = 'keras_reshape.h5' + logical :: file_exists + logical :: ok = .true. + + ! Create the network + net = network([ & + input(input_size), & + reshape2d_layer(output_shape) & + ]) + + if (.not. size(net % layers) == 2) then + write(stderr, '(a)') 'the network should have 2 layers.. failed' + ok = .false. + end if + + ! Initialize test data + allocate(sample_input(input_size)) + call random_number(sample_input) + + ! Propagate forward and get the output + call net % forward(sample_input) + call net % layers(2) % get_output(output) + + ! Check shape of the output + if (.not. all(shape(output) == output_shape)) then + write(stderr, '(a)') 'the reshape layer produces expected output shape.. failed' + ok = .false. + end if + + ! Check if reshaped input matches output + if (.not. all(reshape(sample_input, output_shape) == output)) then + write(stderr, '(a)') 'the reshape layer produces expected output values.. failed' + ok = .false. + end if + + if (ok) then + print '(a)', 'test_reshape2d_layer: All tests passed.' + else + write(stderr, '(a)') 'test_reshape2d_layer: One or more tests failed.' + stop 1 + end if + +end program test_reshape2d_layer
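
The new reshape2d layer's forward pass is a plain rank-1 to rank-2 `reshape`, and its backward pass flattens the incoming gradient with `pack(gradient, .true.)`. Because both intrinsics traverse array elements in column-major order, the two operations are exact inverses of each other. The standalone sketch below is not part of the patch; the program name and the 28x28 shape are chosen only for illustration of that round trip.

program reshape2d_roundtrip_demo
  ! Standalone sketch: verify that a rank-1 -> rank-2 reshape followed by
  ! pack(..., .true.) recovers the original rank-1 array, which is the
  ! identity that reshape2d_layer's forward and backward passes rely on.
  implicit none
  integer, parameter :: output_shape(2) = [28, 28]
  real :: x(product(output_shape))              ! rank-1 input to the layer
  real :: y(output_shape(1), output_shape(2))   ! rank-2 result of the forward pass
  real :: g(product(output_shape))              ! flattened gradient from the backward pass

  call random_number(x)
  y = reshape(x, output_shape)   ! what forward() does
  g = pack(y, .true.)            ! what backward() does to the incoming gradient
  print '(a,l1)', 'round trip exact: ', all(g == x)
end program reshape2d_roundtrip_demo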