CUDA backend for the DNN module #14827
Merged. 129 commits, all authored by YashasSamaga.

Commits
64716ab  stub cuda4dnn design
20f4f2b  minor fixes for tests and doxygen
d8f49fd  add csl public api directory to module headers
b9edc00  add low-level CSL components
2f9afc8  add high-level CSL components
adad256  integrate csl::Tensor into backbone code
8635b5e  switch to CPU iff unsupported; otherwise, fail on error
6615b7c  add fully connected layer
cd0234f  add softmax layer
e3e5cc4  add activation layers
eb69bf7  support arbitrary rank TensorDescriptor
f24ad2c  pass input wrappers to `initCUDA()`
a5ae407  add 1d/2d/3d-convolution
bb984df  add pooling layer
16db28b  reorganize and refactor code
883968e  fixes for gcc, clang and doxygen; remove cxx14/17 code
99fe393  add blank_layer
35a1d8f  add LRN layer
84067f0  add rounding modes for pooling layer
e203703  split tensor.hpp into tensor.hpp and tensor_ops.hpp
4c8d23b  add concat layer
2ab9bdd  add scale layer
b12e4fc  add batch normalization layer
4ae2d35  split math.cu into activations.cu and math.hpp
cf34c65  add eltwise layer
ed87d45  add flatten layer
9261242  add tensor transform api
7db9e6e  add asymmetric padding support for convolution layer
205c191  fix rebase issues
e04e463  add reshape layer
f120bd0  add permute layer
bf114d7  add padding support for concat layer
0ab06a9  refactor and reorganize code
5d2d336  add normalize layer
1619f0b  optimize bias addition in scale layer
ed16c7e  add prior box layer
76eaf7b  fix and optimize normalize layer
ebf5cfb  add asymmetric padding support for pooling layer
6fc4ce0  add event API
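The event API belongs to the PR's CUDA Support Library (CSL). As a rough illustration of what such a wrapper involves, here is a minimal RAII sketch over `cudaEvent_t`; the class and member names are hypothetical, not the PR's actual CSL types:

```cpp
#include <cuda_runtime.h>

// Minimal RAII wrapper over cudaEvent_t, sketching the idea behind a
// CSL-style event API (names here are hypothetical, not the PR's).
class Event {
public:
    Event()  { cudaEventCreate(&event_); }
    ~Event() { cudaEventDestroy(event_); }
    Event(const Event&) = delete;
    Event& operator=(const Event&) = delete;

    // Record the event on a stream; block the host until it fires.
    void record(cudaStream_t stream) { cudaEventRecord(event_, stream); }
    void synchronize() const { cudaEventSynchronize(event_); }

private:
    cudaEvent_t event_;
};
```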
699867e  improve pooling performance for some padding scenarios
4791852  avoid over-allocation of compute resources to kernels
170fc3e  improve prior box performance
8f664f6  enable layer fusion
00557bd  add const layer
0f21706  add resize layer
c850cb5  add slice layer
085e632  add padding layer
1dfc409  add deconvolution layer
39cc3a7  fix channelwise ReLU initialization
fd1acaf  add vector traits
ad0e4c6  add vectorized versions of relu, clipped_relu, power
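The vectorization work in this stretch of commits loads and stores several elements per thread through CUDA's built-in vector types. A minimal sketch of the idea for ReLU (illustrative only, not the PR's kernel; it assumes 16-byte-aligned buffers and an element count divisible by four, which is what the alignment checks elsewhere in the series guard):

```cpp
// Each thread handles four floats at once via float4, cutting the
// number of memory transactions on aligned data. `n4` is the element
// count divided by four; `slope` gives leaky-ReLU behavior (0 = ReLU).
__global__ void relu_vec4(float4* output, const float4* input,
                          unsigned int n4, float slope)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n4)
        return;

    float4 v = input[i];
    v.x = v.x >= 0.f ? v.x : slope * v.x;
    v.y = v.y >= 0.f ? v.y : slope * v.y;
    v.z = v.z >= 0.f ? v.z : slope * v.z;
    v.w = v.w >= 0.f ? v.w : slope * v.w;
    output[i] = v;
}
```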
9414e0b  add vectorized concat kernels
1357a9f  improve concat_with_offsets performance
c8eee86  vectorize scale and bias kernels
3e78b21  add support for multi-billion element tensors
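Supporting tensors beyond 2^31 elements is mostly a matter of index arithmetic: a 32-bit `int` overflows once a tensor crosses roughly 2.1 billion elements. A sketch of the usual remedy, a grid-stride loop with 64-bit indices (illustrative, not the PR's code):

```cpp
#include <cstddef>

// 64-bit grid-stride loop: `i` and `stride` must be size_t, since an
// int index would overflow for element counts beyond INT_MAX.
__global__ void fill(float* data, std::size_t n, float value)
{
    std::size_t stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        data[i] = value;
}
```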
ca95f5c  vectorize prior box kernels
b678bff  fix address alignment check
986b466  improve bias addition performance of conv/deconv/fc layers
b0799b1  restructure code for supporting multiple targets
ab0b196  add DNN_TARGET_CUDA_FP64
6df05bf  add DNN_TARGET_FP16
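The targets are renamed later in the series (see the "drop DNN_TARGET_CUDA_FP64" and "rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA" commits below); as merged, selection goes through the usual Net configuration calls:

```cpp
#include <opencv2/dnn.hpp>

int main()
{
    // "model.onnx" is a placeholder for any format readNet accepts.
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);

    // Full-precision target (DNN_TARGET_CUDA as merged), or the
    // half-precision target this commit introduces (requires a GPU
    // with usable FP16 arithmetic):
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA_FP16);

    // net.forward(...) now runs on the CUDA backend.
    return 0;
}
```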
3957460  improve vectorization
052b8e7  add region layer
1abec63  improve tensor API, add dynamic ranks
977cac2  fix parametric relu activation
35a8e89  add squeeze/unsqueeze tensor API
b30ab12  add reorg layer
bbfb5c3  optimize permute and enable 2d permute
b6da715  enable 1d and 2d slice
9d52163  add split layer
00f55dc  add shuffle channel layer
800d2a9  allow tensors of different ranks in reshape primitive
badd916  patch SliceOp to allow Crop Layer
00a4242  allow extra shape inputs in reshape layer
085fd05  use `std::move_backward` instead of `std::move` for insert in resizab…
93ca2bc  improve workspace management
399c83c  add spatial LRN
3ff54f1  add nms (cpu) to region layer
052e25f  add max pooling with argmax (and a fix to limits.hpp)
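Max unpooling (the next commit) needs to know where each maximum came from, so the pooling kernel also records the index of the winning input element. A simplified single-channel 2D sketch of the idea; the PR's kernel is generic over rank and data type, and the running maximum must start at the type's lowest value, the kind of constant a device-side limits.hpp supplies:

```cpp
#include <cfloat>

// One thread per output element: along with each pooled maximum,
// store the flattened input offset of the winning element so a later
// max-unpooling pass can scatter values back to the right location.
__global__ void max_pool_argmax(float* output, int* indices, const float* input,
                                int in_h, int in_w, int out_h, int out_w,
                                int window, int stride, int pad)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= out_h * out_w)
        return;

    int oy = idx / out_w, ox = idx % out_w;
    float best = -FLT_MAX;  // running maximum starts at the lowest float
    int best_at = -1;

    for (int ky = 0; ky < window; ky++)
        for (int kx = 0; kx < window; kx++)
        {
            int iy = oy * stride - pad + ky;
            int ix = ox * stride - pad + kx;
            if (iy < 0 || iy >= in_h || ix < 0 || ix >= in_w)
                continue;  // padded region does not participate
            float v = input[iy * in_w + ix];
            if (v > best) { best = v; best_at = iy * in_w + ix; }
        }

    output[idx] = best;
    indices[idx] = best_at;
}
```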
2803b91  add max unpooling layer
db733fc  refactoring, fixes and many optimizations
7ee7025  drop DNN_TARGET_CUDA_FP64
f8249ee  rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA
f501821  update supportBackend to be more rigorous
1c52f4e  remove stray include from preventing non-cuda build
757245b  include op_cuda.hpp outside condition #if
ba49b27  fix gcc errors
52d7740  increase max. tensor rank limit to six
bde84d9  add Interp layer
b02811e  drop custom layers; use BackendNode
1ee54e8  vectorize activation kernels
3f3d0af  fixes for gcc
14a79c5  remove wrong assertion
37c7026  fix broken assertion in unpooling primitive
afa7c2b  fix build errors in non-CUDA build
c44aefd  completely remove workspace from public API
db3f4f7  fix permute layer
9725413  enable accuracy and perf. tests for DNN_TARGET_CUDA
47bbd14  add asynchronous forward
780eeaf  vectorize eltwise ops
0ffc1fa  vectorize fill kernel
f93435f  fixes for gcc
6a23810  remove CSL headers from public API
d66f72b  remove csl header source group from cmake
4ed600c  update min. cudnn version in cmake
91da82f  add numerically stable FP32 log1pexp
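log1pexp(x) = log(1 + e^x) shows up in activations such as softplus and mish. Evaluating it naively overflows expf once x passes roughly 88 and loses precision elsewhere; the standard remedy is a piecewise evaluation. A sketch with illustrative FP32 cutoffs (the PR's exact thresholds may differ):

```cpp
#include <cmath>

// Piecewise-stable log(1 + exp(x)) for FP32. The branch thresholds are
// illustrative; the technique is standard (cf. Maechler's log1pexp note).
__device__ float log1pexp(float x)
{
    if (x <= -20.0f)
        return expf(x);          // log1p(e) == e to FP32 precision for tiny e
    if (x <= 9.0f)
        return log1pf(expf(x));  // direct evaluation is safe here
    if (x <= 15.0f)
        return x + expf(-x);     // log(1+e^x) = x + log1p(e^-x) == x + e^-x
    return x;                    // e^-x underflows; the answer is x itself
}
```

The half-precision variant added later in the series ("add numerically stable half precision log1pexp") can follow the same shape, typically by widening to float for the intermediate math.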
027c0d6  refactor code
ec5342d  add FP16 specialization to cudnn based tensor addition
66de1f2  vectorize scale1 and bias1 + minor refactoring
bd8a84e  fix doxygen build
94f7bad  fix invalid alignment assertion
c681da6  clear backend wrappers before allocateLayers
cdb53f6  ignore memory lock failures
8732d4f  do not allocate internal blobs
ca81308  integrate NVTX
c22d1e8  add numerically stable half precision log1pexp
0d780d7  fix indentation, following coding style, improve docs
9578fc2  remove accidental modification of IE code
8b8f780  Revert "add asynchronous forward"
9c75b0b  [cmake] throw error for unsupported CC versions
4752d7b  fix rebase issues
71829dc  add more docs, refactor code, fix bugs
7cf6874  minor refactoring and fixes
7fc76a4  resolve warnings/errors from clang
2818f1c  remove haveCUDA() checks from supportBackend()
a97c6c5  remove NVTX integration
886b01c  changes based on review comments
4536219  avoid exception when no CUDA device is present
5eb7fa5  add color code for CUDA in Net::dump
Changes shown are from 1 commit: 6df05bf6145092c3bf846e4d54f3abcac7b3ef75 (add DNN_TARGET_FP16).