From 64716ab37bbd8d3acb20d6fab61f4c1f718f2765 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 31 May 2019 10:47:46 +0530 Subject: [PATCH 001/129] stub cuda4dnn design --- modules/dnn/include/opencv2/dnn/dnn.hpp | 55 +++++++-- modules/dnn/src/dnn.cpp | 152 +++++++++++++++++++++--- modules/dnn/src/op_cuda.cpp | 12 ++ modules/dnn/src/op_cuda.hpp | 65 ++++++++++ modules/dnn/src/precomp.hpp | 1 + 5 files changed, 257 insertions(+), 28 deletions(-) create mode 100644 modules/dnn/src/op_cuda.cpp create mode 100644 modules/dnn/src/op_cuda.hpp diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 8b69b76c1d3f..43c2972e4207 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -71,7 +71,8 @@ CV__DNN_INLINE_NS_BEGIN DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE, //!< Intel's Inference Engine computational backend. DNN_BACKEND_OPENCV, - DNN_BACKEND_VKCOM + DNN_BACKEND_VKCOM, + DNN_BACKEND_CUDA }; /** @@ -85,7 +86,8 @@ CV__DNN_INLINE_NS_BEGIN DNN_TARGET_OPENCL_FP16, DNN_TARGET_MYRIAD, DNN_TARGET_VULKAN, - DNN_TARGET_FPGA //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin. + DNN_TARGET_FPGA, //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin. + DNN_TARGET_CUDA_FP32 }; CV_EXPORTS std::vector< std::pair > getAvailableBackends(); @@ -170,6 +172,8 @@ CV__DNN_INLINE_NS_BEGIN * * Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs. * Also before using the new layer into networks you must register your layer by using one of @ref dnnLayerFactory "LayerFactory" macros. + * + * If a layer intends to provide a CUDA implementation, it must implement initCUDA() and forwardCUDA() methods. */ class CV_EXPORTS_W Layer : public Algorithm { @@ -221,6 +225,20 @@ CV__DNN_INLINE_NS_BEGIN */ void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals); + /** @brief forward the @p inputs through the layer + * + * @param[in] inputs input tensors + * @param[out] outputs output tensors + * @param[out] workspace scratchpad memory that can be used for anything + * + * This method needs to be implemented iff the layer supports computation on a CUDA device. If not implemented, + * the forward pass is computed using the CPU. + */ + virtual void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs + /* cuda4dnn::csl::workspace& workspace */); + /** @brief * @overload * @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead @@ -274,6 +292,24 @@ CV__DNN_INLINE_NS_BEGIN virtual Ptr initInfEngine(const std::vector > &inputs); virtual Ptr initVkCom(const std::vector > &inputs); + + /** + * @brief Initializes the layer to perform forward pass on CUDA capable devices. + * + * @params[in] stream stream to use for operations + * @params[in] cublas_handle cuBLAS handle to use for cuBLAS operations + * @params[in] cudnn_handle cuDNN handle to use for cuDNN operations + * @params[out] scratch_mem_in_bytes request extra device memory in bytes for internals + * + * This method needs to be implemented iff the layer supports computation on a CUDA device. + */ + virtual void initCUDA(/* + cuda4dnn::csl::Stream stream, + cuda4dnn::csl::cublas::Handle cublas_handle, + cuda4dnn::csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes + */); + /** * @brief Automatic Halide scheduling based on layer hyper-parameters. 
* @param[in] node Backend node with Halide functions. @@ -515,13 +551,14 @@ CV__DNN_INLINE_NS_BEGIN * @see Target * * List of supported combinations backend / target: - * | | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE | - * |------------------------|--------------------|------------------------------|--------------------| - * | DNN_TARGET_CPU | + | + | + | - * | DNN_TARGET_OPENCL | + | + | + | - * | DNN_TARGET_OPENCL_FP16 | + | + | | - * | DNN_TARGET_MYRIAD | | + | | - * | DNN_TARGET_FPGA | | + | | + * | | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE | DNN_BACKEND_CUDA | + * |------------------------|--------------------|------------------------------|--------------------|-------------------| + * | DNN_TARGET_CPU | + | + | + | | + * | DNN_TARGET_OPENCL | + | + | + | | + * | DNN_TARGET_OPENCL_FP16 | + | + | | | + * | DNN_TARGET_MYRIAD | | + | | | + * | DNN_TARGET_FPGA | | + | | | + * | DNN_TARGET_CUDA_FP32 | | | | + | */ CV_WRAP void setPreferableTarget(int targetId); diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 480cf96fe746..4a58f3b58861 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -43,6 +43,7 @@ #include "op_halide.hpp" #include "op_inf_engine.hpp" #include "op_vkcom.hpp" +#include "op_cuda.hpp" #include "halide_scheduler.hpp" #include #include @@ -141,6 +142,11 @@ class BackendRegistry if (haveVulkan()) backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)); #endif + +#ifdef HAVE_CUDA + if(haveCUDA()) + backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP32)); +#endif } static inline bool checkIETarget(int target) { @@ -540,6 +546,7 @@ struct DataLayer : public Layer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_CUDA || (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1); } @@ -674,6 +681,15 @@ struct DataLayer : public Layer } #endif +#ifdef HAVE_CUDA + void forwardCUDA(std::vector>& inputs, std::vector>& outputs) CV_OVERRIDE + { + /* standardize/normalize on device */ + // use CPU for now + Layer::forwardCUDA(inputs, outputs); + } +#endif + int outputNameToIndex(const String& tgtName) CV_OVERRIDE { int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin()); @@ -745,6 +761,10 @@ struct DataLayer : public Layer return Ptr(); } +#ifdef HAVE_CUDA + void initCUDA() CV_OVERRIDE { } +#endif + std::vector outNames; // Preprocessing parameters for each network's input. 
std::vector scaleFactors; @@ -1009,6 +1029,15 @@ static Ptr wrapMat(int backendId, int targetId, cv::Mat& m) #ifdef HAVE_VULKAN return Ptr(new VkComBackendWrapper(m)); #endif // HAVE_VULKAN + } + else if (backendId == DNN_BACKEND_CUDA) + { + CV_Assert(haveCUDA()); + CV_Assert(IS_DNN_CUDA_TARGET(targetId)); + +#ifdef HAVE_CUDA + return CUDABackendWrapperFP32::create(m); +#endif } else CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); @@ -1094,6 +1123,14 @@ struct Net::Impl #ifdef HAVE_VULKAN return Ptr(new VkComBackendWrapper(baseBuffer, host)); #endif + } + else if (preferableBackend == DNN_BACKEND_CUDA) + { + CV_Assert(haveCUDA()); + CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget)); +#ifdef HAVE_CUDA + return CUDABackendWrapperFP32::create(baseBuffer, shape); +#endif } else CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); @@ -1200,6 +1237,9 @@ struct Net::Impl preferableTarget == DNN_TARGET_FPGA); CV_Assert(preferableBackend != DNN_BACKEND_VKCOM || preferableTarget == DNN_TARGET_VULKAN); + CV_Assert(preferableBackend != DNN_BACKEND_CUDA || + preferableTarget == DNN_TARGET_CUDA_FP32); + if (!netWasAllocated || this->blobsToKeep != blobsToKeep_) { if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)) @@ -1235,6 +1275,13 @@ struct Net::Impl preferableTarget = DNN_TARGET_CPU; } + if (preferableBackend == DNN_BACKEND_CUDA && !haveCUDA()) + { + CV_LOG_WARNING(NULL, "DNN module was not built with CUDA backend; switching to CPU"); + preferableBackend = DNN_BACKEND_OPENCV; + preferableTarget = DNN_TARGET_CPU; + } + clear(); allocateLayers(blobsToKeep_); @@ -1245,7 +1292,7 @@ struct Net::Impl initBackend(); - if (!netWasAllocated ) + if (!netWasAllocated) { #ifdef HAVE_HALIDE if (preferableBackend == DNN_BACKEND_HALIDE) @@ -1389,6 +1436,8 @@ struct Net::Impl initInfEngineBackend(); else if (preferableBackend == DNN_BACKEND_VKCOM) initVkComBackend(); + else if (preferableBackend == DNN_BACKEND_CUDA) + initCUDABackend(); else CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); } @@ -1777,6 +1826,18 @@ struct Net::Impl #endif // HAVE_INF_ENGINE } + void initCUDABackend() { + CV_Assert(haveCUDA()); + +#ifdef HAVE_CUDA + for (auto& layer : layers) + { + auto& ld = layer.second; + ld.layerInstance->initCUDA(); + } +#endif + } + void allocateLayer(int lid, const LayersShapesMap& layersShapes) { CV_TRACE_FUNCTION(); @@ -2279,7 +2340,8 @@ struct Net::Impl if( !ld.skip ) { std::map >::iterator it = ld.backendNodes.find(preferableBackend); - if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty()) + if (preferableBackend == DNN_BACKEND_OPENCV || + (preferableBackend != DNN_BACKEND_CUDA && (it == ld.backendNodes.end() || it->second.empty()))) { if (isAsync) CV_Error(Error::StsNotImplemented, "Default implementation fallbacks in asynchronous mode"); @@ -2435,32 +2497,61 @@ struct Net::Impl } else { - Ptr node = it->second; - CV_Assert(!node.empty()); - if (preferableBackend == DNN_BACKEND_HALIDE) - { - forwardHalide(ld.outputBlobsWrappers, node); - } - else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE) - { - forwardInfEngine(ld.outputBlobsWrappers, node, isAsync); - } - else if (preferableBackend == DNN_BACKEND_VKCOM) + if (preferableBackend == DNN_BACKEND_CUDA) { try { - forwardVkCom(ld.outputBlobsWrappers, node); + layer->forwardCUDA(ld.inputBlobsWrappers, ld.outputBlobsWrappers); } - catch (const cv::Exception& e) + catch (const cv::Exception&) { - CV_LOG_ERROR(NULL, "forwardVkCom 
failed, fallback to CPU implementation. " << e.what()); - it->second = Ptr(); - forwardLayer(ld); + CV_LOG_WARNING(NULL, "Layer does not support CUDA. Switching to CPU implementation. "); + auto actual_target = preferableTarget; + preferableBackend = DNN_BACKEND_OPENCV; + preferableTarget = DNN_TARGET_CPU; + try + { + forwardLayer(ld); + } + catch (...) + { + preferableTarget = actual_target; + preferableBackend = DNN_BACKEND_CUDA; + throw; + } + preferableTarget = actual_target; + preferableBackend = DNN_BACKEND_CUDA; } } else { - CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); + Ptr node = it->second; + CV_Assert(!node.empty()); + if (preferableBackend == DNN_BACKEND_HALIDE) + { + forwardHalide(ld.outputBlobsWrappers, node); + } + else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE) + { + forwardInfEngine(ld.outputBlobsWrappers, node, isAsync); + } + else if (preferableBackend == DNN_BACKEND_VKCOM) + { + try + { + forwardVkCom(ld.outputBlobsWrappers, node); + } + catch (const cv::Exception& e) + { + CV_LOG_ERROR(NULL, "forwardVkCom failed, fallback to CPU implementation. " << e.what()); + it->second = Ptr(); + forwardLayer(ld); + } + } + else + { + CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); + } } } } @@ -3632,6 +3723,29 @@ bool Layer::supportBackend(int backendId) return backendId == DNN_BACKEND_OPENCV; } +void Layer::initCUDA() +{ + /* + ** Implementing initCUDA is required iff the layer supports forward pass on CUDA devices. + ** Otherwise, the forward pass will fallback to CPU automatically. + ** + ** Hence, if the derived class did not implement initCUDA, we do nothing here. + */ +} + +void Layer::forwardCUDA(std::vector>& inputs, std::vector>& outputs) +{ + /* + ** Implementing forwardCUDA is required iff the layer supports forward pass on CUDA devices. + ** Otherwise, the forward pass will fallback to CPU automatically. + ** + ** Hence, if the derived class did not implement forwardCUDA, we throw to let the network know that + ** the layer does not support forward pass on CUDA devices. This will inform the network to use the + ** CPU version. + */ + CV_Error(Error::StsNotImplemented, "Layer does not have a CUDA implementation"); +} + Ptr Layer::initVkCom(const std::vector > &) { CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type + diff --git a/modules/dnn/src/op_cuda.cpp b/modules/dnn/src/op_cuda.cpp new file mode 100644 index 000000000000..60c5fa5badf4 --- /dev/null +++ b/modules/dnn/src/op_cuda.cpp @@ -0,0 +1,12 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" +#include "op_cuda.hpp" + +namespace cv { + namespace dnn { + + } /* namespace dnn */ +} /* namespace cv */ diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp new file mode 100644 index 000000000000..925d7c834c8e --- /dev/null +++ b/modules/dnn/src/op_cuda.hpp @@ -0,0 +1,65 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_DNN_SRC_OP_CUDA_HPP +#define OPENCV_DNN_SRC_OP_CUDA_HPP + +namespace cv { + namespace dnn { + inline bool haveCUDA() { +#ifdef HAVE_CUDA + return true; +#else + return false; +#endif + } + +#ifdef HAVE_CUDA + /* CUDA Tensors are represented by csl::Tensor + ** CUDABackendWrapperFP32 wraps a csl::TensorSpan + ** It also maintains a reference to the csl::Tensor. + */ + class CUDABackendWrapperFP32 : public BackendWrapper { + public: + CUDABackendWrapperFP32(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) { + /* TODO: + ** 1. store a reference to cv::Mat + ** 2. create a csl::Tensor + ** 3. create a csl::TensorSpan (or store shape) + */ + } + + CUDABackendWrapperFP32(const Ptr& base, const MatShape& shape) + : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) { + /* TODO: + ** 1. copy reference to csl::Tensor of base + ** 2. set TensorSpan to mimic `shape` (or store shape) + */ + } + + static Ptr create(Mat& m) + { + return Ptr(new CUDABackendWrapperFP32(m)); + } + + static Ptr create(const Ptr& base, const MatShape& shape) + { + return Ptr(new CUDABackendWrapperFP32(base, shape)); + } + + virtual void copyToHost() CV_OVERRIDE { } + virtual void setHostDirty() CV_OVERRIDE { } + + //TensorSpan getSpan(); + //TensorView getView(); + + /* TensorSpan member vs create in getSpan() + ** member tensor span can save shape changes + */ + }; +#endif + } /* namespace dnn */ +} /* namespace cv */ + +#endif /* OPENCV_DNN_SRC_OP_CUDA_HPP */ diff --git a/modules/dnn/src/precomp.hpp b/modules/dnn/src/precomp.hpp index 00e5522ab5a7..20d95bd8e86c 100644 --- a/modules/dnn/src/precomp.hpp +++ b/modules/dnn/src/precomp.hpp @@ -77,6 +77,7 @@ namespace cv { namespace dnn { CV__DNN_INLINE_NS_BEGIN #define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16) +#define IS_DNN_CUDA_TARGET(id) (id == DNN_TARGET_CUDA_FP32) Mutex& getInitializationMutex(); void initializeLayerFactory(); CV__DNN_INLINE_NS_END From 20f4f2b0c9ff61a11661c1c3899407755cb5d2ab Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 31 May 2019 20:29:50 +0530 Subject: [PATCH 002/129] minor fixes for tests and doxygen --- modules/dnn/include/opencv2/dnn/dnn.hpp | 10 +++++----- modules/dnn/test/test_common.impl.hpp | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 43c2972e4207..3ff59fafb166 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -229,7 +229,7 @@ CV__DNN_INLINE_NS_BEGIN * * @param[in] inputs input tensors * @param[out] outputs output tensors - * @param[out] workspace scratchpad memory that can be used for anything + * param[out] workspace scratchpad memory that can be used for anything * * This method needs to be implemented iff the layer supports computation on a CUDA device. If not implemented, * the forward pass is computed using the CPU. @@ -296,10 +296,10 @@ CV__DNN_INLINE_NS_BEGIN /** * @brief Initializes the layer to perform forward pass on CUDA capable devices. 
* - * @params[in] stream stream to use for operations - * @params[in] cublas_handle cuBLAS handle to use for cuBLAS operations - * @params[in] cudnn_handle cuDNN handle to use for cuDNN operations - * @params[out] scratch_mem_in_bytes request extra device memory in bytes for internals + * param[in] stream stream to use for operations + * param[in] cublas_handle cuBLAS handle to use for cuBLAS operations + * param[in] cudnn_handle cuDNN handle to use for cuDNN operations + * param[out] scratch_mem_in_bytes request extra device memory in bytes for internals * * This method needs to be implemented iff the layer supports computation on a CUDA device. */ diff --git a/modules/dnn/test/test_common.impl.hpp b/modules/dnn/test/test_common.impl.hpp index d64923cb33a9..9c3a8ebb593d 100644 --- a/modules/dnn/test/test_common.impl.hpp +++ b/modules/dnn/test/test_common.impl.hpp @@ -26,6 +26,7 @@ void PrintTo(const cv::dnn::Backend& v, std::ostream* os) case DNN_BACKEND_INFERENCE_ENGINE: *os << "DLIE"; return; case DNN_BACKEND_VKCOM: *os << "VKCOM"; return; case DNN_BACKEND_OPENCV: *os << "OCV"; return; + case DNN_BACKEND_CUDA: *os << "CUDA"; return; } // don't use "default:" to emit compiler warnings *os << "DNN_BACKEND_UNKNOWN(" << (int)v << ")"; } @@ -39,6 +40,7 @@ void PrintTo(const cv::dnn::Target& v, std::ostream* os) case DNN_TARGET_MYRIAD: *os << "MYRIAD"; return; case DNN_TARGET_VULKAN: *os << "VULKAN"; return; case DNN_TARGET_FPGA: *os << "FPGA"; return; + case DNN_TARGET_CUDA_FP32: *os << "CUDA_FP32"; return; } // don't use "default:" to emit compiler warnings *os << "DNN_TARGET_UNKNOWN(" << (int)v << ")"; } From d8f49fd396df167d29ea0a576d1f68a9e1819206 Mon Sep 17 00:00:00 2001 From: Yashas Date: Sun, 16 Jun 2019 20:00:19 +0530 Subject: [PATCH 003/129] add csl public api directory to module headers --- modules/dnn/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index fa6eadfb8d64..2729b20b26c9 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -100,7 +100,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override") # Clang endif() -ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs}) + +file(GLOB csl_hdrs "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/csl/*.hpp") +source_group("Include\\opencv2\\csl" FILES ${csl_hdrs}) + +ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs} HEADERS ${csl_hdrs}) ocv_create_module(${libs} ${INF_ENGINE_TARGET}) ocv_add_samples() ocv_add_accuracy_tests(${INF_ENGINE_TARGET}) From b9edc003e8ee468ffc801bf8713130b973b119ed Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 18 Jun 2019 13:29:19 +0530 Subject: [PATCH 004/129] add low-level CSL components --- .../dnn/include/opencv2/dnn/csl/cublas.hpp | 52 ++++ modules/dnn/include/opencv2/dnn/csl/cudnn.hpp | 52 ++++ modules/dnn/include/opencv2/dnn/csl/error.hpp | 20 ++ .../dnn/include/opencv2/dnn/csl/stream.hpp | 52 ++++ .../dnn/include/opencv2/dnn/csl/workspace.hpp | 38 +++ modules/dnn/include/opencv2/dnn/dnn.hpp | 25 +- modules/dnn/src/cuda4dnn/csl/cublas.cpp | 110 +++++++ modules/dnn/src/cuda4dnn/csl/cublas.hpp | 10 + modules/dnn/src/cuda4dnn/csl/cudnn.cpp | 85 +++++ modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 26 ++ modules/dnn/src/cuda4dnn/csl/error.hpp | 24 ++ modules/dnn/src/cuda4dnn/csl/memory.hpp | 284 +++++++++++++++++ 
modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp | 20 ++ modules/dnn/src/cuda4dnn/csl/pointer.hpp | 290 ++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/span.hpp | 59 ++++ modules/dnn/src/cuda4dnn/csl/stream.cpp | 91 ++++++ modules/dnn/src/cuda4dnn/csl/stream.hpp | 22 ++ modules/dnn/src/cuda4dnn/csl/workspace.cpp | 31 ++ modules/dnn/src/cuda4dnn/csl/workspace.hpp | 22 ++ modules/dnn/src/cuda4dnn/test.cpp | 18 -- modules/dnn/src/dnn.cpp | 58 +++- modules/dnn/src/proxy_cuda.cpp | 63 ++++ 22 files changed, 1407 insertions(+), 45 deletions(-) create mode 100644 modules/dnn/include/opencv2/dnn/csl/cublas.hpp create mode 100644 modules/dnn/include/opencv2/dnn/csl/cudnn.hpp create mode 100644 modules/dnn/include/opencv2/dnn/csl/error.hpp create mode 100644 modules/dnn/include/opencv2/dnn/csl/stream.hpp create mode 100644 modules/dnn/include/opencv2/dnn/csl/workspace.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cublas.cpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cublas.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn.cpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/error.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/memory.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/pointer.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/span.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/stream.cpp create mode 100644 modules/dnn/src/cuda4dnn/csl/stream.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/workspace.cpp create mode 100644 modules/dnn/src/cuda4dnn/csl/workspace.hpp delete mode 100644 modules/dnn/src/cuda4dnn/test.cpp create mode 100644 modules/dnn/src/proxy_cuda.cpp diff --git a/modules/dnn/include/opencv2/dnn/csl/cublas.hpp b/modules/dnn/include/opencv2/dnn/csl/cublas.hpp new file mode 100644 index 000000000000..5ed347ff8bfc --- /dev/null +++ b/modules/dnn/include/opencv2/dnn/csl/cublas.hpp @@ -0,0 +1,52 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CSL_CUBLAS_HPP +#define OPENCV_DNN_CSL_CUBLAS_HPP + +#include "error.hpp" +#include "stream.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas { + + //! exception class for errors thrown by the cuBLAS API + class cuBLASException : public CUDAException { + public: + using CUDAException::CUDAException; + }; + + /** @brief sharable cuBLAS smart handle + * + * Handle is a smart sharable wrapper for cuBLAS handle which ensures that the handle + * is destroyed after all references to the handle are destroyed. The handle can be + * associated with a CUDA stream by specifying the stream during construction. By default, + * the handle is associated with the default stream. 
+ * + * @note Moving a Handle object to another invalidates the former + */ + class Handle { + public: + Handle(); + Handle(const Handle&) noexcept; + Handle(Handle&&) noexcept; + Handle(Stream strm); + + Handle& operator=(const Handle&) noexcept; + Handle& operator=(Handle&&) noexcept; + + //!< returns true if the handle is valid + explicit operator bool() const noexcept; + + private: + friend class HandleAccessor; + + class UniqueHandle; + std::shared_ptr handle; + }; + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ + +#endif /* OPENCV_DNN_CSL_CUBLAS_HPP */ diff --git a/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp b/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp new file mode 100644 index 000000000000..873e4f869a5c --- /dev/null +++ b/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp @@ -0,0 +1,52 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CSL_CUDNN_HPP +#define OPENCV_DNN_CSL_CUDNN_HPP + +#include "error.hpp" +#include "stream.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + //! exception class for errors thrown by the cuDNN API + class cuDNNException : public CUDAException { + public: + using CUDAException::CUDAException; + }; + + /** @brief sharable cuDNN smart handle + * + * Handle is a smart sharable wrapper for cuDNN handle which ensures that the handle + * is destroyed after all references to the handle are destroyed. The handle can be + * associated with a CUDA stream by specifying the stream during construction. By default, + * the handle is associated with the default stream. + * + * @note Moving a Handle object to another invalidates the former + */ + class Handle { + public: + Handle(); + Handle(const Handle&) noexcept; + Handle(Handle&&) noexcept; + Handle(Stream strm); + + Handle& operator=(const Handle&) noexcept; + Handle& operator=(Handle&&) noexcept; + + //!< returns true if the handle is valid + explicit operator bool() const noexcept; + + private: + friend class HandleAccessor; + + class UniqueHandle; + std::shared_ptr handle; + }; + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/include/opencv2/dnn/csl/error.hpp b/modules/dnn/include/opencv2/dnn/csl/error.hpp new file mode 100644 index 000000000000..2210c748de64 --- /dev/null +++ b/modules/dnn/include/opencv2/dnn/csl/error.hpp @@ -0,0 +1,20 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CSL_ERROR_HPP +#define OPENCV_DNN_CSL_ERROR_HPP + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + //! exception class for errors thrown by the CUDA APIs + class CUDAException : public cv::Exception { + public: + using cv::Exception::Exception; + }; + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CSL_ERROR_HPP */ diff --git a/modules/dnn/include/opencv2/dnn/csl/stream.hpp b/modules/dnn/include/opencv2/dnn/csl/stream.hpp new file mode 100644 index 000000000000..70fb616723b2 --- /dev/null +++ b/modules/dnn/include/opencv2/dnn/csl/stream.hpp @@ -0,0 +1,52 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CSL_STREAM_HPP +#define OPENCV_DNN_CSL_STREAM_HPP + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** @brief sharable smart CUDA stream + * + * Stream is a smart sharable wrapper for CUDA stream handle which ensures that + * the handle is destroyed after use. Unless explicitly specified by a constructor argument, + * the stream object represents the default stream. + * + * @note Moving a Stream object to another invalidates the former + */ + class Stream { + public: + Stream(); + Stream(const Stream&) noexcept; + Stream(Stream&&) noexcept; + + //!< if \p create is `true`, a new stream will be created instead of the otherwise default stream + Stream(bool create); + + Stream& operator=(const Stream&) noexcept; + Stream& operator=(Stream&&) noexcept; + + //!< blocks the caller thread until all operations in the stream complete + void synchronize() const; + + //!< returns true if there are operations pending in the stream + bool busy() const; + + //!< returns true if the stream is valid + explicit operator bool() const noexcept; + + private: + friend class StreamAccessor; + + class UniqueStream; + std::shared_ptr stream; + }; + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CSL_STREAM_HPP */ diff --git a/modules/dnn/include/opencv2/dnn/csl/workspace.hpp b/modules/dnn/include/opencv2/dnn/csl/workspace.hpp new file mode 100644 index 000000000000..59ca92dc532f --- /dev/null +++ b/modules/dnn/include/opencv2/dnn/csl/workspace.hpp @@ -0,0 +1,38 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CSL_WORKSPACE_HPP +#define OPENCV_DNN_CSL_WORKSPACE_HPP + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** @brief maintains a single block of reusable device memory + * + * Each Workspace object is intended to be used by a single entity at a time but by + * different entities at different times. It maintains a single reusable block of memory which + * is sufficient for the largest consumer. + */ + class Workspace { + public: + Workspace(); + + /** @brief reserve \p bytes of memory */ + void require(std::size_t bytes); + + /** @brief number of bytes reserved by the largest consumer */ + std::size_t size() const noexcept; + + private: + friend class WorkspaceAccessor; + + class Impl; + std::shared_ptr impl; + }; + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CSL_WORKSPACE_HPP */ diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 3ff59fafb166..c8ad545ced17 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -42,6 +42,11 @@ #ifndef OPENCV_DNN_DNN_HPP #define OPENCV_DNN_DNN_HPP +#include +#include +#include +#include + #include #include #include "opencv2/core/async.hpp" @@ -229,15 +234,15 @@ CV__DNN_INLINE_NS_BEGIN * * @param[in] inputs input tensors * @param[out] outputs output tensors - * param[out] workspace scratchpad memory that can be used for anything + * @param[out] workspace scratchpad memory that can be used for anything * * This method needs to be implemented iff the layer supports computation on a CUDA device. 
If not implemented, * the forward pass is computed using the CPU. */ virtual void forwardCUDA( std::vector>& inputs, - std::vector>& outputs - /* cuda4dnn::csl::workspace& workspace */); + std::vector>& outputs, + cuda4dnn::csl::Workspace& workspace); /** @brief * @overload @@ -296,19 +301,19 @@ CV__DNN_INLINE_NS_BEGIN /** * @brief Initializes the layer to perform forward pass on CUDA capable devices. * - * param[in] stream stream to use for operations - * param[in] cublas_handle cuBLAS handle to use for cuBLAS operations - * param[in] cudnn_handle cuDNN handle to use for cuDNN operations - * param[out] scratch_mem_in_bytes request extra device memory in bytes for internals + * @param[in] stream stream to use for operations + * @param[in] cublas_handle cuBLAS handle to use for cuBLAS operations + * @param[in] cudnn_handle cuDNN handle to use for cuDNN operations + * @param[out] scratch_mem_in_bytes request extra device memory in bytes for internals; defaults to zero * - * This method needs to be implemented iff the layer supports computation on a CUDA device. + * This method needs to be implemented iff the layer supports forward pass compuatation on CUDA devices. */ - virtual void initCUDA(/* + virtual void initCUDA( cuda4dnn::csl::Stream stream, cuda4dnn::csl::cublas::Handle cublas_handle, cuda4dnn::csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes - */); + ); /** * @brief Automatic Halide scheduling based on layer hyper-parameters. diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.cpp b/modules/dnn/src/cuda4dnn/csl/cublas.cpp new file mode 100644 index 000000000000..9fd8fe097977 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cublas.cpp @@ -0,0 +1,110 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "cublas.hpp" +#include "stream.hpp" + +#include + +#include + +#include +#include +#include + +#define CUDA4DNN_CHECK_CUBLAS(call) \ + ::cv::dnn::cuda4dnn::csl::cublas::check((call), CV_Func, __FILE__, __LINE__) + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas { + + static void check(cublasStatus_t status, const char* func, const char* file, int line) { + auto cublasGetErrorString = [](cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return "UNKNOWN_CUBLAS_ERROR"; + }; + + if (status != CUBLAS_STATUS_SUCCESS) + throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line); + } + + /** noncopyable cuBLAS smart handle + * + * UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle + * is destroyed after use. 
The handle can be associated with a CUDA stream by specifying the + * stream during construction. By default, the handle is associated with the default stream. + */ + class Handle::UniqueHandle { + public: + UniqueHandle() { CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle)); } + UniqueHandle(UniqueHandle&) = delete; + UniqueHandle(UniqueHandle&& other) noexcept + : stream(std::move(other.stream)), handle{ other.handle } { + other.handle = nullptr; + } + + UniqueHandle(Stream strm) : stream(std::move(strm)) { + CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle)); + try { + CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, StreamAccessor::get(stream))); + } catch (...) { + /* cublasDestroy won't throw if a valid handle is passed */ + CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle)); + throw; + } + } + + ~UniqueHandle() noexcept { + if (handle != nullptr) { + /* cublasDestroy won't throw if a valid handle is passed */ + CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle)); + } + } + + UniqueHandle& operator=(const UniqueHandle&) = delete; + UniqueHandle& operator=(UniqueHandle&& other) noexcept { + stream = std::move(other.stream); + handle = other.handle; + other.handle = nullptr; + return *this; + } + + //!< returns the raw cuBLAS handle + cublasHandle_t get() const noexcept { return handle; } + + private: + Stream stream; + cublasHandle_t handle; + }; + + /* used to access the raw cuBLAS handle held by Handle */ + class HandleAccessor { + public: + static cublasHandle_t get(Handle& handle) { + CV_Assert(handle); + return handle.handle->get(); + } + }; + + Handle::Handle() : handle(std::make_shared()) { } + Handle::Handle(const Handle&) noexcept = default; + Handle::Handle(Handle&&) noexcept = default; + Handle::Handle(Stream strm) : handle(std::make_shared(std::move(strm))) { } + + Handle& Handle::operator=(const Handle&) noexcept = default; + Handle& Handle::operator=(Handle&&) noexcept = default; + + Handle::operator bool() const noexcept { return static_cast(handle); } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.hpp b/modules/dnn/src/cuda4dnn/csl/cublas.hpp new file mode 100644 index 000000000000..b9ddcca7ec63 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cublas.hpp @@ -0,0 +1,10 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUBLAS_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUBLAS_HPP + +#include + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUBLAS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp new file mode 100644 index 000000000000..d461fac4d58c --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp @@ -0,0 +1,85 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "cudnn.hpp" +#include "stream.hpp" + +#include + +#include + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + /** @brief noncopyable cuDNN smart handle + * + * UniqueHandle is a smart non-sharable wrapper for cuDNN handle which ensures that the handle + * is destroyed after use. The handle can be associated with a CUDA stream by specifying the + * stream during construction. By default, the handle is associated with the default stream. 
+ */ + class Handle::UniqueHandle { + public: + UniqueHandle() { CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle)); } + UniqueHandle(UniqueHandle&) = delete; + UniqueHandle(UniqueHandle&& other) noexcept + : stream(std::move(other.stream)), handle{ other.handle } { + other.handle = nullptr; + } + + UniqueHandle(Stream strm) : stream(std::move(strm)) { + CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle)); + try { + CUDA4DNN_CHECK_CUDNN(cudnnSetStream(handle, StreamAccessor::get(stream))); + } catch (...) { + /* cudnnDestroy won't throw if a valid handle is passed */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle)); + throw; + } + } + + ~UniqueHandle() noexcept { + if (handle != nullptr) { + /* cudnnDestroy won't throw if a valid handle is passed */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle)); + } + } + + UniqueHandle& operator=(const UniqueHandle&) = delete; + UniqueHandle& operator=(UniqueHandle&& other) noexcept { + stream = std::move(other.stream); + handle = other.handle; + other.handle = nullptr; + return *this; + } + + //!< returns the raw cuDNN handle + cudnnHandle_t get() const noexcept { return handle; } + + private: + Stream stream; + cudnnHandle_t handle; + }; + + /** used to access the raw cuDNN handle held by Handle */ + class HandleAccessor { + public: + static cudnnHandle_t get(const Handle& handle) { + CV_Assert(handle); + return handle.handle->get(); + } + }; + + Handle::Handle() : handle(std::make_shared()) { } + Handle::Handle(const Handle&) noexcept = default; + Handle::Handle(Handle&&) noexcept = default; + Handle::Handle(Stream strm) : handle(std::make_shared(std::move(strm))) { } + + Handle& Handle::operator=(const Handle&) noexcept = default; + Handle& Handle::operator=(Handle&&) noexcept = default; + + Handle::operator bool() const noexcept { return static_cast(handle); } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp new file mode 100644 index 000000000000..a50eed098e9f --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -0,0 +1,26 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP + +#include + +#include + +#define CUDA4DNN_CHECK_CUDNN(call) \ + ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__) + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + namespace detail { + inline void check(cudnnStatus_t status, const char* func, const char* file, int line) { + if (status != CUDNN_STATUS_SUCCESS) + throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line); + } + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/error.hpp b/modules/dnn/src/cuda4dnn/csl/error.hpp new file mode 100644 index 000000000000..05b7b25f9001 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/error.hpp @@ -0,0 +1,24 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_ERROR_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_ERROR_HPP
+
+#include
+
+#include
+
+#define CUDA4DNN_CHECK_CUDA(call) \
+    ::cv::dnn::cuda4dnn::csl::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+    namespace detail {
+        inline void check(cudaError_t err, const char* func, const char* file, int line) {
+            if (err != cudaSuccess)
+                throw CUDAException(Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
+        }
+    }
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_ERROR_HPP */
diff --git a/modules/dnn/src/cuda4dnn/csl/memory.hpp b/modules/dnn/src/cuda4dnn/csl/memory.hpp
new file mode 100644
index 000000000000..81190a47fb73
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/csl/memory.hpp
@@ -0,0 +1,284 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_MEMORY_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_MEMORY_HPP
+
+#include "error.hpp"
+#include "pointer.hpp"
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /* @brief smart device pointer with allocation/deallocation methods
+     *
+     * ManagedPtr is a smart shared device pointer which also handles memory allocation.
+     */
+    template <class T>
+    class ManagedPtr {
+        static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
+        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+    public:
+        using element_type = T;
+
+        using pointer = DevicePtr<element_type>;
+        using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;
+
+        using size_type = std::size_t;
+
+        ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
+        ManagedPtr(const ManagedPtr&) noexcept = default;
+        ManagedPtr(ManagedPtr&& other) noexcept
+            : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity{ other.capacity }
+        {
+            other.reset();
+        }
+
+        /** allocates device memory for \p count number of elements */
+        ManagedPtr(size_type count) {
+            if (count <= 0) {
+                CV_Error(Error::StsBadArg, "number of elements is zero or negative");
+            }
+
+            void* temp = nullptr;
+            CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));
+
+            auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
+            wrapped.reset(ptr, [](element_type* ptr) {
+                if (ptr != nullptr) {
+                    /* contract violation for std::shared_ptr if cudaFree throws */
+                    CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
+                }
+            });
+            /* std::shared_ptr::reset invokes the deleter if an exception occurs; hence, we don't
+             * need to have a try-catch block to free the allocated device memory
+             */
+
+            n = capacity = count;
+        }
+
+        ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+            wrapped = std::move(other.wrapped);
+            n = other.n;
+            capacity = other.capacity;
+
+            other.reset();
+            return *this;
+        }
+
+        size_type size() const noexcept { return n; }
+
+        void reset() noexcept { wrapped.reset(); n = capacity = 0; }
+
+        /**
+         * deallocates any previously allocated memory and allocates device memory
+         * for \p count number of elements
+         *
+         * @note might optimize to avoid unnecessary reallocation
+         *
+         * Exception Guarantee: Strong
+         */
+        void reset(size_type count) {
+            /* we need to fully own the memory to perform optimizations */
+            if (wrapped.use_count() == 1) {
+                /* avoid reallocation if the existing 
capacity is sufficient */ + if (count <= capacity) { + n = count; + return; + } + } + + /* no optimization performed; allocate memory */ + ManagedPtr tmp(count); + swap(tmp, *this); + } + + pointer get() const noexcept { return pointer(wrapped.get()); } + + explicit operator bool() const noexcept { return wrapped; } + + friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; } + friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; } + + friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept { + using std::swap; + swap(lhs.wrapped, rhs.wrapped); + swap(lhs.n, rhs.n); + swap(lhs.capacity, rhs.capacity); + } + + private: + std::shared_ptr wrapped; + size_type n, capacity; + }; + + /** copies entire memory block pointed by \p src to \p dest + * + * \param[in] src device pointer + * \param[out] dest host pointer + * + * Pre-conditions: + * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src + * + * Exception Guarantee: Basic + */ + template + void memcpy(T *dest, const ManagedPtr& src) { + memcpy(dest, src.get(), src.size()); + } + + /** copies data from memory pointed by \p src to fully fill \p dest + * + * \param[in] src host pointer + * \param[out] dest device pointer + * + * Pre-conditions: + * - memory pointed by \p src must be at least as big as the memory block held by \p dest + * + * Exception Guarantee: Basic + */ + template + void memcpy(const ManagedPtr& dest, const T* src) { + memcpy(dest.get(), src, dest.size()); + } + + /** copies data from memory pointed by \p src to \p dest + * + * if the two \p src and \p dest have different sizes, the number of elements copied is + * equal to the size of the smaller memory block + * + * \param[in] src device pointer + * \param[out] dest device pointer + * + * Exception Guarantee: Basic + */ + template + void memcpy(const ManagedPtr& dest, const ManagedPtr& src) { + memcpy(dest.get(), src.get(), std::min(dest.size(), src.size())); + } + + /** sets device memory block to a specific 8-bit value + * + * \param[in] src device pointer + * \param[out] ch 8-bit value to fill the device memory with + * + * Exception Guarantee: Basic + */ + template + void memset(const ManagedPtr& dest, std::int8_t ch) { + memset(dest.get(), ch, dest.size()); + } + + /** copies entire memory block pointed by \p src to \p dest asynchronously + * + * \param[in] src device pointer + * \param[out] dest host pointer + * \param stream CUDA stream that has to be used for the memory transfer + * + * Pre-conditions: + * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src + * - \p dest points to page-locked memory + * + * Exception Guarantee: Basic + */ + template + void memcpy(T *dest, const ManagedPtr& src, const Stream& stream) { + CV_Assert(stream); + memcpy(dest, src.get(), src.size(), stream); + } + + /** copies data from memory pointed by \p src to \p dest asynchronously + * + * \param[in] src host pointer + * \param[out] dest device pointer + * \param stream CUDA stream that has to be used for the memory transfer + * + * Pre-conditions: + * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src + * - \p src points to page-locked memory + * + * Exception Guarantee: Basic + */ + template + void memcpy(const ManagedPtr& dest, const T* src, const Stream& stream) { + CV_Assert(stream); + memcpy(dest.get(), src, 
dest.size(), stream); + } + + /** copies data from memory pointed by \p src to \p dest asynchronously + * + * \param[in] src device pointer + * \param[out] dest device pointer + * \param stream CUDA stream that has to be used for the memory transfer + * + * if the two \p src and \p dest have different sizes, the number of elements copied is + * equal to the size of the smaller memory block + * + * Exception Guarantee: Basic + */ + template + void memcpy(ManagedPtr& dest, const ManagedPtr& src, const Stream& stream) { + CV_Assert(stream); + memcpy(dest.get(), src.get(), std::min(dest.size(), src.size()), stream); + } + + /** sets device memory block to a specific 8-bit value asynchronously + * + * \param[in] src device pointer + * \param[out] ch 8-bit value to fill the device memory with + * \param stream CUDA stream that has to be used for the memory operation + * + * Exception Guarantee: Basic + */ + template + void memset(const ManagedPtr& dest, int ch, const Stream& stream) { + CV_Assert(stream); + memset(dest.get(), ch, dest.size(), stream); + } + + /** @brief registers host memory as page-locked and unregisters on destruction */ + class MemoryLockGuard { + public: + MemoryLockGuard() noexcept : ptr { nullptr } { } + MemoryLockGuard(const MemoryLockGuard&) = delete; + MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } { + other.ptr = nullptr; + } + + /** + * Pre-conditons: + * - host memory should be unregistered + */ + MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) : ptr{ ptr_ } { + CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable)); + } + + MemoryLockGuard& operator=(const MemoryLockGuard&) = delete; + MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept { + ptr = other.ptr; + other.ptr = nullptr; + return *this; + } + + ~MemoryLockGuard() { + if(ptr != nullptr) + CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr)); + } + + private: + void *ptr; + }; + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_MEMORY_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp b/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp new file mode 100644 index 000000000000..44a2ac0fa0fd --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp @@ -0,0 +1,20 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_NVCC_DEFS_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_NVCC_DEFS_HPP + +#include + +#ifdef __CUDACC__ +# define CUDA4DNN_HOST __host__ +# define CUDA4DNN_DEVICE __device__ +# define CUDA4DNN_HOST_DEVICE CUDA4DNN_HOST CUDA4DNN_DEVICE +#else +# define CUDA4DNN_HOST +# define CUDA4DNN_DEVICE +# define CUDA4DNN_HOST_DEVICE +#endif + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_NVCC_DEFS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/pointer.hpp b/modules/dnn/src/cuda4dnn/csl/pointer.hpp new file mode 100644 index 000000000000..aeb298c97e25 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/pointer.hpp @@ -0,0 +1,290 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_DNN_CUDA4DNN_CSL_POINTER_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_POINTER_HPP + +#include "nvcc_defs.hpp" +#include "error.hpp" +#include "stream.hpp" + +#include + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** @brief provides a type-safe device pointer + * + * DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert + * to a raw pointer. This ensures that accidental mixing of host and device pointers do not happen. + * + * It is meant to point to locations in device memory. Hence, it provides dereferencing or + * array subscript capability for device code only. + * + * A `const DevicePtr` represents an immutable pointer to a mutable memory. + * A `DevicePtr` represents a mutable pointer to an immutable memory. + * A `const DevicePtr` represents an immutable pointer to an immutable memory. + * + * A `DevicePtr` can implicitly convert to `DevicePtr`. + */ + template + class DevicePtr { + static_assert(std::is_standard_layout::value, "T must satisfy StandardLayoutType"); + + public: + using element_type = T; + using difference_type = std::ptrdiff_t; + using pointer = typename std::add_pointer::type; + using reference = typename std::add_lvalue_reference::type; + + DevicePtr() = default; + CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } + + CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } + + CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; + CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; } + CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); } + CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); } + + template::type, + typename std::enable_if::value, bool>::type = true> + CUDA4DNN_HOST_DEVICE operator DevicePtr() const noexcept { return DevicePtr{ptr}; } + + CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } + + CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept { + ++ptr; + return *this; + } + + CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept { + auto tmp = DevicePtr(*this); + ptr++; + return tmp; + } + + CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept { + --ptr; + return *this; + } + + CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept { + auto tmp = DevicePtr(*this); + ptr--; + return tmp; + } + + CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept { + ptr += offset; + return *this; + } + + CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept { + ptr -= offset; + return *this; + } + + CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept { + return lhs += offset; + } + + CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept { + return lhs -= offset; + } + + CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept { + return lhs.ptr - rhs.ptr; + } + + CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } + CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } + CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } + CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } + 
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } + CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } + + CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } + + CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { + using std::swap; + swap(lhs.ptr, rhs.ptr); + } + + template + CUDA4DNN_HOST friend std::basic_ostream& operator<<(std::basic_ostream& os, DevicePtr other) { + os << other.get() << " (device)"; + return os; + } + + private: + pointer ptr; + }; + + /** copies \p n elements from \p src to \p dest4 + * + * \param[in] src device pointer + * \param[out] dest host pointer + * + * Pre-conditions: + * - memory pointed by \p dest and \p src must be large enough to hold \p n elements + * + * Exception Guarantee: Basic + */ + template + void memcpy(T *dest, DevicePtr src, std::size_t n) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault)); + } + + /** copies \p n elements from \p src to \p dest + * + * \param[in] src host pointer + * \param[out] dest device pointer + * + * Pre-conditions: + * - memory pointed by \p dest and \p src must be large enough to hold \p n elements + * + * Exception Guarantee: Basic + */ + template + void memcpy(DevicePtr dest, const T* src, std::size_t n) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault)); + } + + /** copies \p n elements from \p src to \p dest + * + * \param[in] src device pointer + * \param[out] dest device pointer + * + * Pre-conditions: + * - memory pointed by \p dest and \p src must be large enough to hold \p n elements + * + * Exception Guarantee: Basic + */ + template + void memcpy(DevicePtr dest, DevicePtr src, std::size_t n) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault)); + } + + /** sets \p n elements to \p ch in \p dest + * + * \param[in] src device pointer + * \param[out] ch 8-bit value to fill the device memory with + * + * Pre-conditions: + * - memory pointed by \p dest must be large enough to hold \p n elements + * + * Exception Guarantee: Basic + */ + template + void memset(DevicePtr dest, std::int8_t ch, std::size_t n) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T))); + } + + /** copies \p n elements from \p src to \p dest asynchronously + * + * \param[in] src device pointer + * \param[out] dest host pointer + * \param stream CUDA stream that has to be used for the memory transfer + * + * Pre-conditions: + * - memory pointed by \p dest and \p src must be large enough to hold \p n elements + * - \p dest points to page-locked memory + * + * Exception Guarantee: Basic + */ + template + void memcpy(T *dest, DevicePtr src, std::size_t n, const Stream& stream) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, StreamAccessor::get(stream))); + } + + /** copies data from memory pointed by \p src to \p 
dest asynchronously + * + * \param[in] src host pointer + * \param[out] dest device pointer + * \param stream CUDA stream that has to be used for the memory transfer + * + * Pre-conditions: + * - memory pointed by \p dest and \p src must be large enough to hold \p n elements + * - \p src points to page-locked memory + * + * Exception Guarantee: Basic + */ + template + void memcpy(DevicePtr dest, const T *src, std::size_t n, const Stream& stream) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, StreamAccessor::get(stream))); + } + + /** copies \p n elements from \p src to \p dest asynchronously + * + * \param[in] src device pointer + * \param[out] dest device pointer + * \param stream CUDA stream that has to be used for the memory transfer + * + * Pre-conditions: + * - memory pointed by \p dest and \p src must be large enough to hold \p n elements + * + * Exception Guarantee: Basic + */ + template + void memcpy(DevicePtr dest, DevicePtr src, std::size_t n, const Stream& stream) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, StreamAccessor::get(stream))); + } + + /** sets \p n elements to \p ch in \p dest asynchronously + * + * \param[in] src device pointer + * \param[out] ch 8-bit value to fill the device memory with + * \param stream CUDA stream that has to be used for the memory operation + * + * Pre-conditions: + * - memory pointed by \p dest must be large enough to hold \p n elements + * + * Exception Guarantee: Basic + */ + template + void memset(DevicePtr dest, std::int8_t ch, std::size_t n, const Stream& stream) { + if (n <= 0) { + CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive"); + } + + CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), StreamAccessor::get(stream))); + } + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_POINTER_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/span.hpp b/modules/dnn/src/cuda4dnn/csl/span.hpp new file mode 100644 index 000000000000..c18e7b96cc7d --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/span.hpp @@ -0,0 +1,59 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
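// A rough usage sketch for the DevicePtr and memcpy/memset helpers introduced in pointer.hpp
// above. The buffer size and the standalone function are illustrative assumptions; only the
// CSL calls themselves come from the header.
#include <vector>
#include <cuda_runtime_api.h>
#include "pointer.hpp"
#include "error.hpp"

void device_pointer_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    float* raw = nullptr;
    CUDA4DNN_CHECK_CUDA(cudaMalloc(&raw, 128 * sizeof(float)));

    DevicePtr<float> dptr{raw};              // explicit construction: no silent host/device mixing
    DevicePtr<const float> cdptr = dptr;     // the only implicit conversion allowed is T -> const T

    std::vector<float> host(128, 1.0f);
    memcpy(dptr, host.data(), host.size());  // host -> device, synchronous
    memcpy(host.data(), cdptr, host.size()); // device -> host, synchronous
    memset(dptr, 0, host.size());            // fill the device buffer with zero bytes

    CUDA4DNN_CHECK_CUDA(cudaFree(raw));
}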
+ +#ifndef OPENCV_DNN_CUDA4DNN_CSL_SPAN_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_SPAN_HPP + +#include "pointer.hpp" +#include "nvcc_defs.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** @brief provides non-owning mutable view for device arrays + * + * const span/span provides mutable access to the elements unless T is const qualified + * const span makes the span immutable but not the elements + */ + template + class span { + static_assert(std::is_standard_layout::value, "T must satisfy StandardLayoutType"); + + public: + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using pointer = DevicePtr; + using const_pointer = DevicePtr::type>; + using reference = typename std::add_lvalue_reference::type; + using const_reference = typename std::add_lvalue_reference::type>; + + using iterator = pointer; + using const_iterator = const_pointer; + + span() noexcept : ptr{ nullptr }, sz{ 0 } { } + CUDA4DNN_HOST_DEVICE span(pointer first, pointer last) noexcept : ptr{ first }, sz{ last - first } { } + CUDA4DNN_HOST_DEVICE span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { } + + CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; } + CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; } + + CUDA4DNN_DEVICE reference operator[](difference_type index) const { return ptr[index]; } + CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; } + + private: + pointer ptr; + size_type sz; + }; + + /** @brief provides non-owning immutable view for device arrays */ + template + using view = span; + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_SPAN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/stream.cpp b/modules/dnn/src/cuda4dnn/csl/stream.cpp new file mode 100644 index 000000000000..3d3bc87ac999 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/stream.cpp @@ -0,0 +1,91 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "error.hpp" +#include "stream.hpp" + +#include +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** @brief noncopyable smart CUDA stream + * + * UniqueStream is a smart non-sharable wrapper for CUDA stream handle which ensures that + * the handle is destroyed after use. Unless explicitly specified by a constructor argument, + * the stream object represents the default stream. 
+ */ + class Stream::UniqueStream { + public: + UniqueStream() noexcept : stream{ 0 } { } + UniqueStream(UniqueStream&) = delete; + UniqueStream(UniqueStream&& other) noexcept { + stream = other.stream; + other.stream = 0; + } + + UniqueStream(bool create) : stream{ 0 } { + if (create) { + CUDA4DNN_CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + } + } + + ~UniqueStream() { + try { + if (stream != 0) + CUDA4DNN_CHECK_CUDA(cudaStreamDestroy(stream)); + } catch (const CUDAException& ex) { + std::ostringstream os; + os << "Asynchronous exception caught during CUDA stream destruction.\n"; + os << ex.what(); + os << "Exception will be ignored.\n"; + CV_LOG_WARNING(0, os.str().c_str()); + } + } + + UniqueStream& operator=(const UniqueStream&) = delete; + UniqueStream& operator=(UniqueStream&& other) noexcept { + stream = other.stream; + other.stream = 0; + return *this; + } + + //!< returns the raw CUDA stream handle + cudaStream_t get() const noexcept { return stream; } + + void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream)); } + bool busy() const { + auto status = cudaStreamQuery(stream); + if (status == cudaErrorNotReady) + return true; + CUDA4DNN_CHECK_CUDA(status); + return false; + } + + private: + cudaStream_t stream; + }; + + Stream::Stream() : stream(std::make_shared()) { } + Stream::Stream(const Stream&) noexcept = default; + Stream::Stream(Stream&&) noexcept = default; + Stream::Stream(bool create) : stream(std::make_shared(create)) { } + + Stream& Stream::operator=(const Stream&) noexcept = default; + Stream& Stream::operator=(Stream&&) noexcept = default; + + void Stream::synchronize() const { stream->synchronize(); } + bool Stream::busy() const { return stream->busy(); } + Stream::operator bool() const noexcept { return static_cast(stream); } + + cudaStream_t StreamAccessor::get(const Stream& stream) { + CV_Assert(stream); + return stream.stream->get(); + } + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ diff --git a/modules/dnn/src/cuda4dnn/csl/stream.hpp b/modules/dnn/src/cuda4dnn/csl/stream.hpp new file mode 100644 index 000000000000..a1abf3304e01 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/stream.hpp @@ -0,0 +1,22 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_STREAM_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_STREAM_HPP + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** used to access the raw CUDA stream handle held by Handle */ + class StreamAccessor { + public: + static cudaStream_t get(const Stream& stream); + }; + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_STREAM_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/workspace.cpp b/modules/dnn/src/cuda4dnn/csl/workspace.cpp new file mode 100644 index 000000000000..fa62848532cc --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/workspace.cpp @@ -0,0 +1,31 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
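// A short sketch of how the Stream wrapper above combines with the asynchronous copies from
// pointer.hpp. The pinned-memory allocation and sizes are assumptions made for illustration;
// the async overloads require page-locked host memory as documented earlier.
#include <cuda_runtime_api.h>
#include "stream.hpp"
#include "pointer.hpp"

void stream_usage_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    Stream stream(true);                     // create and own a new non-blocking stream;
                                             // Stream() would wrap the default stream instead

    float* pinned = nullptr;
    CUDA4DNN_CHECK_CUDA(cudaMallocHost(&pinned, 256 * sizeof(float)));

    float* raw = nullptr;
    CUDA4DNN_CHECK_CUDA(cudaMalloc(&raw, 256 * sizeof(float)));
    DevicePtr<float> dptr{raw};

    memcpy(dptr, pinned, 256, stream);       // enqueued on `stream`; returns immediately
    if (stream.busy())                       // work may still be in flight
        stream.synchronize();                // block until the queue drains

    CUDA4DNN_CHECK_CUDA(cudaFree(raw));
    CUDA4DNN_CHECK_CUDA(cudaFreeHost(pinned));
}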
+ +#include "workspace.hpp" +#include "memory.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + class Workspace::Impl { + public: + ManagedPtr ptr; + }; + + Workspace::Workspace() : impl(std::make_shared()) { } + + void Workspace::require(std::size_t bytes) { + if (bytes > impl->ptr.size()) + impl->ptr.reset(bytes); + } + + std::size_t Workspace::size() const noexcept { return impl->ptr.size(); } + + DevicePtr WorkspaceAccessor::get(const Workspace& workspace) { + return DevicePtr(workspace.impl->ptr.get()); + } + +}}}} /* cv::dnn::cuda4dnn::csl */ diff --git a/modules/dnn/src/cuda4dnn/csl/workspace.hpp b/modules/dnn/src/cuda4dnn/csl/workspace.hpp new file mode 100644 index 000000000000..967d234d2d93 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/workspace.hpp @@ -0,0 +1,22 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_WORKSPACE_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_WORKSPACE_HPP + +#include "pointer.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** used to access the raw CUDA stream handle held by Handle */ + class WorkspaceAccessor { + public: + static DevicePtr get(const Workspace& workspace); + }; + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_WORKSPACE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/test.cpp b/modules/dnn/src/cuda4dnn/test.cpp deleted file mode 100644 index 066d91974dee..000000000000 --- a/modules/dnn/src/cuda4dnn/test.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// this file is a stub and will be removed once actual code is added - -#include "../precomp.hpp" - -#ifndef HAVE_CUDA -# error "CUDA4DNN should be enabled iff CUDA and cuDNN were found" -#endif - -#include - -void cuda4dnn_build_test_func() { - auto ver = cudnnGetVersion(); - CV_UNUSED(ver); -} diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 4a58f3b58861..83decb5e4800 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -45,6 +45,12 @@ #include "op_vkcom.hpp" #include "op_cuda.hpp" #include "halide_scheduler.hpp" + +#include +#include +#include +#include + #include #include #include @@ -681,15 +687,6 @@ struct DataLayer : public Layer } #endif -#ifdef HAVE_CUDA - void forwardCUDA(std::vector>& inputs, std::vector>& outputs) CV_OVERRIDE - { - /* standardize/normalize on device */ - // use CPU for now - Layer::forwardCUDA(inputs, outputs); - } -#endif - int outputNameToIndex(const String& tgtName) CV_OVERRIDE { int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin()); @@ -761,10 +758,6 @@ struct DataLayer : public Layer return Ptr(); } -#ifdef HAVE_CUDA - void initCUDA() CV_OVERRIDE { } -#endif - std::vector outNames; // Preprocessing parameters for each network's input. 
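// How the Workspace introduced above is meant to behave: require() only ever grows the shared
// scratch allocation, so a single buffer can serve every layer that asks for scratch space.
// The byte counts are arbitrary assumptions; WorkspaceAccessor hands the raw device pointer to kernels.
#include "workspace.hpp"

void workspace_usage_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    Workspace scratch;
    scratch.require(1 << 20);       // one layer asks for 1 MiB
    scratch.require(1 << 18);       // a smaller request later does not shrink or reallocate
    // scratch.size() now reflects the largest request seen so far

    auto ptr = WorkspaceAccessor::get(scratch);   // raw device pointer for kernel arguments
    (void)ptr;
}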
std::vector scaleFactors; @@ -1067,6 +1060,13 @@ struct Net::Impl preferableBackend = DNN_BACKEND_DEFAULT; preferableTarget = DNN_TARGET_CPU; skipInfEngineInit = false; + +#ifdef HAVE_CUDA + /* we do not use the member initializer list to decouple the evaluation order from the declaration order */ + stream = cuda4dnn::csl::Stream(true); + cublasHandle = cuda4dnn::csl::cublas::Handle(stream); + cudnnHandle = cuda4dnn::csl::cudnn::Handle(stream); +#endif } Ptr netInputLayer; @@ -1089,6 +1089,13 @@ struct Net::Impl std::vector layersTimings; Mat output_blob; +#ifdef HAVE_CUDA + cuda4dnn::csl::Stream stream; + cuda4dnn::csl::cublas::Handle cublasHandle; + cuda4dnn::csl::cudnn::Handle cudnnHandle; +#endif + cuda4dnn::csl::Workspace workspace; + Ptr wrap(Mat& host) { if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU) @@ -1833,7 +1840,9 @@ struct Net::Impl for (auto& layer : layers) { auto& ld = layer.second; - ld.layerInstance->initCUDA(); + std::size_t workspace_size_required = 0; + ld.layerInstance->initCUDA(stream, cublasHandle, cudnnHandle, workspace_size_required); + workspace.require(workspace_size_required); } #endif } @@ -2501,7 +2510,10 @@ struct Net::Impl { try { - layer->forwardCUDA(ld.inputBlobsWrappers, ld.outputBlobsWrappers); + CV_Assert(haveCUDA()); +#ifdef HAVE_CUDA + layer->forwardCUDA(ld.inputBlobsWrappers, ld.outputBlobsWrappers, workspace); +#endif } catch (const cv::Exception&) { @@ -2591,6 +2603,11 @@ struct Net::Impl //forward itself forwardLayer(ld); + +#ifdef HAVE_CUDA + if (preferableBackend == DNN_BACKEND_CUDA) + stream.synchronize(); +#endif } void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) @@ -3723,7 +3740,11 @@ bool Layer::supportBackend(int backendId) return backendId == DNN_BACKEND_OPENCV; } -void Layer::initCUDA() +void Layer::initCUDA( + cuda4dnn::csl::Stream stream, + cuda4dnn::csl::cublas::Handle cublas_handle, + cuda4dnn::csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes) { /* ** Implementing initCUDA is required iff the layer supports forward pass on CUDA devices. @@ -3733,7 +3754,10 @@ void Layer::initCUDA() */ } -void Layer::forwardCUDA(std::vector>& inputs, std::vector>& outputs) +void Layer::forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + cuda4dnn::csl::Workspace& workspace) { /* ** Implementing forwardCUDA is required iff the layer supports forward pass on CUDA devices. diff --git a/modules/dnn/src/proxy_cuda.cpp b/modules/dnn/src/proxy_cuda.cpp new file mode 100644 index 000000000000..9e1a5a49eeeb --- /dev/null +++ b/modules/dnn/src/proxy_cuda.cpp @@ -0,0 +1,63 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
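// Sketch of a layer hooking into the plumbing added above: Net::Impl owns one Stream plus the
// cuBLAS/cuDNN handles, initCUDA() reports a scratch requirement that the net folds into its
// shared Workspace via require(), and forwardCUDA() enqueues work on that stream (the net
// synchronizes once per forward pass). The class name and the 1 MiB figure are illustrative;
// it assumes <opencv2/dnn.hpp> and the module-internal cuda4dnn/csl headers are reachable.
class ExampleCUDALayer : public cv::dnn::Layer
{
public:
    bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == cv::dnn::DNN_BACKEND_OPENCV ||
               backendId == cv::dnn::DNN_BACKEND_CUDA;
    }

    void initCUDA(cv::dnn::cuda4dnn::csl::Stream stream_,
                  cv::dnn::cuda4dnn::csl::cublas::Handle cublas,
                  cv::dnn::cuda4dnn::csl::cudnn::Handle cudnn,
                  std::size_t& scratch_mem_in_bytes) CV_OVERRIDE
    {
        stream = std::move(stream_);
        scratch_mem_in_bytes = 1024 * 1024;    // folded into Net::Impl's workspace via require()
    }

    void forwardCUDA(std::vector<cv::Ptr<cv::dnn::BackendWrapper>>& inputs,
                     std::vector<cv::Ptr<cv::dnn::BackendWrapper>>& outputs,
                     cv::dnn::cuda4dnn::csl::Workspace& workspace) CV_OVERRIDE
    {
        // launch kernels on `stream`; leaving the base Layer::forwardCUDA() in place instead
        // is treated as "not implemented", and the net switches this layer to the CPU path
    }

private:
    cv::dnn::cuda4dnn::csl::Stream stream;
};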
+ +#include "precomp.hpp" +#include "op_cuda.hpp" + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /* when the dnn module is compiled without CUDA support, the CSL components are not compiled + ** however, the headers which are exposed require proxy implementations to compile + */ + +#ifndef HAVE_CUDA + class StreamAccessor { }; + class Stream::UniqueStream { }; + + Stream::Stream() { } + Stream::Stream(const Stream&) noexcept = default; + Stream::Stream(Stream&&) noexcept = default; + Stream::Stream(bool create) { } + Stream& Stream::operator=(const Stream&) noexcept = default; + Stream& Stream::operator=(Stream&&) noexcept = default; + void Stream::synchronize() const { } + bool Stream::busy() const { return false; } + Stream::operator bool() const noexcept { return false; } + + namespace cublas { + class HandleAccessor { }; + class Handle::UniqueHandle { }; + + Handle::Handle() { } + Handle::Handle(const Handle&) noexcept = default; + Handle::Handle(Handle&&) noexcept = default; + Handle::Handle(Stream strm) { } + Handle& Handle::operator=(const Handle&) noexcept = default; + Handle& Handle::operator=(Handle&&) noexcept = default; + Handle::operator bool() const noexcept { return false; } + } + + namespace cudnn { + class HandleAccessor { }; + class Handle::UniqueHandle { }; + + Handle::Handle() { } + Handle::Handle(const Handle&) noexcept = default; + Handle::Handle(Handle&&) noexcept = default; + Handle::Handle(Stream strm) { } + Handle& Handle::operator=(const Handle&) noexcept = default; + Handle& Handle::operator=(Handle&&) noexcept = default; + Handle::operator bool() const noexcept { return false; } + } + + class WorkspaceAccessor { }; + class Workspace::Impl { }; + + Workspace::Workspace() { } + void Workspace::require(std::size_t bytes) { } + std::size_t Workspace::size() const noexcept { return 0; } + +#endif + +}}}} /* cv::dnn::cuda4dnn::csl */ From 2f9afc805aac81df237fae1ddb7de37eabdf002c Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 21 Jun 2019 05:24:21 +0530 Subject: [PATCH 005/129] add high-level CSL components --- modules/dnn/src/cuda4dnn/csl/cublas.cpp | 2 + modules/dnn/src/cuda4dnn/csl/cudnn.cpp | 2 + modules/dnn/src/cuda4dnn/csl/stream.cpp | 2 + modules/dnn/src/cuda4dnn/csl/tensor.hpp | 781 +++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/workspace.cpp | 2 + 5 files changed, 789 insertions(+) create mode 100644 modules/dnn/src/cuda4dnn/csl/tensor.hpp diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.cpp b/modules/dnn/src/cuda4dnn/csl/cublas.cpp index 9fd8fe097977..8f7550dbceb9 100644 --- a/modules/dnn/src/cuda4dnn/csl/cublas.cpp +++ b/modules/dnn/src/cuda4dnn/csl/cublas.cpp @@ -2,6 +2,8 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +#include "../../precomp.hpp" + #include "cublas.hpp" #include "stream.hpp" diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp index d461fac4d58c..e0966de4ea4f 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp @@ -2,6 +2,8 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
+#include "../../precomp.hpp" + #include "cudnn.hpp" #include "stream.hpp" diff --git a/modules/dnn/src/cuda4dnn/csl/stream.cpp b/modules/dnn/src/cuda4dnn/csl/stream.cpp index 3d3bc87ac999..18e76c5dfbfe 100644 --- a/modules/dnn/src/cuda4dnn/csl/stream.cpp +++ b/modules/dnn/src/cuda4dnn/csl/stream.cpp @@ -2,6 +2,8 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +#include "../../precomp.hpp" + #include "error.hpp" #include "stream.hpp" diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp new file mode 100644 index 000000000000..ffe67825d2a7 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -0,0 +1,781 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP + +#include "nvcc_defs.hpp" +#include "memory.hpp" +#include "cublas.hpp" +#include "cudnn.hpp" +#include "span.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CSL_DEFAULT_TENSOR_RANK + #define CSL_DEFAULT_TENSOR_RANK 4 +#endif + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /** \file tensor.hpp + * + * The tensor library contains three kinds of tensor objects which are summarized + * in the table below: + * + * TYPE | OWNERSHIP | MUTABLE | PASS TO KERNELS + * ------------ + --------- + ------- + --------------- + * Tensor | Yes | Yes | No + * TensorSpan | No | Yes | Yes + * TensorView | No | No | Yes + * + * Tensor is implicitly convertible to TensorSpan and TensorView + * TensorSpan is implicitly convertible to TensorView + * + * "TensorType", frequently used as a template parameter, can refer to Tensor, TensorSpan or TensorView. + */ + + template + class TensorSpan; + + template + class TensorView; + + /** @brief multi-dimensional contiguous GPU tensor containing elements of a single type + * + * \tparam T type of data stored by the tensor + * \tparam rank_ rank of the tensor + */ + template + class Tensor { + static_assert(rank_ > 0, "Scalars are not supported"); + static_assert(std::is_standard_layout::value, "T must staisfy StandardLayoutType"); + + public: + using value_type = typename ManagedPtr::element_type; + using pointer = typename ManagedPtr::pointer; + using const_pointer = typename ManagedPtr::const_pointer; + using size_type = std::size_t; + + static constexpr auto rank = rank_; + + Tensor() noexcept { std::fill(std::begin(sizes), std::end(sizes), 0); } + Tensor(const Tensor&) = delete; + Tensor(Tensor&& other) noexcept { + data = std::move(other.data); + sizes = other.sizes; + std::fill(std::begin(other.sizes), std::end(other.sizes), 0); + } + + /** @brief constructs a tensor of specific size + * + * Whatever arguments are accepted by the resize methods are accepted here. + */ + template + Tensor(Args... 
sizes) { resize(std::forward(sizes)...); } + + Tensor& operator=(const Tensor&) = delete; + Tensor& operator=(Tensor&& other) noexcept { + data = std::move(other.data); + sizes = other.sizes; + std::fill(std::begin(other.sizes), std::end(other.sizes), 0); + return *this; + } + + /** returns the total number of elements in the tensor */ + size_type size() const noexcept { + return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); + } + + /** returns true if the tensor is empty */ + bool empty() const noexcept { return !size(); } + + /** @brief returns the length of the axis + * + * Every axis is assigned a zero-based index which can be used to select an axis. + * Negative index can be used to select an axis from the end. + * + * Examples: + * > -1 represents the last axis + * > 0 represents the first axis + * > 1 represents the second axis + * + * Pre-conditions: + * - the axis must be in the range [-rank, rank) + */ + size_type get_axis_size(int axis) const noexcept { + axis = axis < 0 ? rank + axis : axis; + CV_Assert(axis >= 0 && axis < rank); + return sizes[axis]; + } + + /** returns a device pointer to mutable device memory */ + pointer get() noexcept { return data.get(); } + + /** returns a device pointer to immutable device memory */ + const_pointer get() const noexcept { return data.get(); } + + /** @brief resizes the tensor + * + * Pre-conditions: + * - [start, end) represents a range containing length of the axes in order starting from axis zero + * - number of sizes provided must be less than or equal to the tensor rank + * - the sizes must be positive integers + * + * The length of unspecified axes will be assumed to be one. + * + * Exception Guarantee: Strong + */ + template + typename std::enable_if::value, void> // TODO is_iterator + ::type resize(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= rank); + + using ItrValueType = typename std::iterator_traits::value_type; + auto total = std::accumulate(start, end, 1, std::multiplies()); + data.reset(total); + + /* length of the unspecified axes are assumed to be one */ + auto fill_sizes = rank - std::distance(start, end); + std::fill_n(std::begin(sizes), fill_sizes, 1); + std::copy(start, end, std::begin(sizes) + fill_sizes); + } + + /** @brief resizes the tensor + * constructs a range out of the arguments and invokes range-based resize method + */ + template + void resize(Sizes... new_sizes_) { + static_assert(sizeof...(Sizes) <= rank, "number of axes exceeds the tensor rank"); + std::array new_sizes = { static_cast(new_sizes_)... }; + resize(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief resizes the tensor + * + * Pre-conditions: + * - the reference tensor must be a non-empty tensor + * - the reference tensor's rank must be lesser than or equal to the rank of target tensor + * + * Exception Guarantee: Strong + */ + template + void resize_as(const TensorType& tensor) { + static_assert(TensorType::rank <= rank, "cannot resize a tensor of lower rank to a tensor of higher rank"); + std::array new_sizes; + for (int i = 0; i < TensorType::rank; i++) + new_sizes[i] = tensor.get_axis_size(i); + resize(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief reshapes the tensor + * + * Length deduction: + * The length of at most one axis can be deduced using the total size constraint. The axis can + * be marked for deduction by specifying the size as -1. 
+ * + * The axes for which no size was provided (excluding -1) will be assumed to be one. + * + * Pre-conditions: + * - [start, end) represents a range containing length of the axes in order starting from axis zero + * - the number of lengths provided must be less than or equal to the tensor rank + * - at most one axis length is allowed for length deduction + * - the lengths provided must ensure that the total number of elements remains unchanged + * + * Exception Guarantee: Strong + */ + template + typename std::enable_if::value, void> // TODO is_iterator + ::type reshape(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= rank); + + using ItrValueType = typename std::iterator_traits::value_type; + + /* the user may leave at most one axis size for deduction by specifying -1 */ + auto sizes_to_deduce = std::count(start, end, -1); + if (sizes_to_deduce > 1) { CV_Error(Error::StsBadArg, "only one axis size can be deduced"); } + + /* sizes must be positive numbers with the exception of -1 */ + auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) { + return !(x > 0 || x == -1); + }); + if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); } + + /* compute the total number of elements in the new tensor */ + size_type unknown_size = 0; + auto total = std::accumulate(start, end, 1, std::multiplies()); + if (total < 0) { + /* there is an unknown size */ + if (std::abs(total) <= size()) { + unknown_size = size() / std::abs(total); + total = size(); + } + /* Edge case: if `total` is already more than size(), skip the deduction as it's impossible + ** Since `total` is negative, the size check which follows will fail and throw an error + */ + } + + /* the number of elements before and after reshape must be exactly same */ + if (total != size()) { + CV_Error(Error::StsBadArg, "new axes do not preserve the tensor element count"); + } + + /* we assume the size of the unspecified axes to be one */ + auto fill_sizes = rank - std::distance(start, end); + std::fill_n(std::begin(sizes), fill_sizes, 1); + std::copy(start, end, std::begin(sizes) + fill_sizes); + + /* replace the unknown axis with the correct value */ + std::replace(std::begin(sizes), std::end(sizes), size_type(-1), unknown_size); + } + + /** @brief reshapes the tensor + * constructs a range out of the arguments and invokes range-based reshape method + */ + template + void reshape(Sizes... new_sizes_) { + static_assert(sizeof...(Sizes) <= rank, "number of axes exceeds the tensor rank"); + std::array new_sizes = { static_cast(new_sizes_)... 
}; + reshape(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief reshapes the tensor + * + * Pre-conditions: + * - the reference tensor must be a non-empty tensor + * - the reference tensor's rank must be lesser than or equal to the rank of target tensor + * + * Exception Guarantee: Strong + */ + template + void reshape_as(const TensorType& tensor) { + static_assert(TensorType::rank <= rank, "cannot reshape a tensor of lower rank to a tensor of higher rank"); + std::array new_sizes; + for (int i = 0; i < TensorType::rank; i++) + new_sizes[i] = tensor.get_axis_size(i); + reshape(std::begin(new_sizes), std::end(new_sizes)); + } + + operator TensorSpan() noexcept; /* defined later */ + operator TensorView() const noexcept; /* defined later */ + + friend void swap(Tensor& lhs, Tensor& rhs) noexcept { + using std::swap; + swap(lhs.data, rhs.data); + swap(lhs.sizes, rhs.sizes); + } + + private: + std::array sizes; + ManagedPtr data; + }; + + /** @brief provides a non-owning mutable span of a Tensor + * + * \tparam T type of data stored by the tensor + * \tparam rank rank of the tensor + * + * A span is valid if and only if the following hold true: + * - parent tensor is still alive + * - parent tensor holds a valid memory block + * - parent tensor hasn't performed any resizing operation since the span was created + * + * A span may be used if and only if it is valid. + */ + template + class TensorSpan { + public: + using tensor_type = Tensor; + using value_type = typename tensor_type::value_type; + using pointer = typename tensor_type::pointer; + using const_pointer = typename tensor_type::const_pointer; + using size_type = typename tensor_type::size_type; + + static constexpr auto rank = rank_; + + TensorSpan() noexcept : ptr{ nullptr } { std::fill(std::begin(sizes), std::end(sizes), 0); } + TensorSpan(const TensorSpan&) noexcept = default; + TensorSpan(tensor_type& parent) noexcept : ptr{ parent.get() } { + for (std::size_t i = 0; i < rank; i++) + sizes[i] = parent.get_axis_size(i); + } + + /* returns the total number of elements in the span */ + CUDA4DNN_HOST/*_DEVICE*/ size_type size() const noexcept { + return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); + } + + /** returns true if the tensor is empty */ + CUDA4DNN_HOST/*_DEVICE*/ bool empty() const noexcept { return !size(); } + + /** @brief returns the length of the axis + * + * Negative axis numbers can be used to select axis from the lower order. + * Examples: + * > -1 represents the last axis + * > 0 represents the first axis + * > 1 represents the second axis + * + * Pre-conditions: + * - the axis must be in the range [-rank, rank) + */ + CUDA4DNN_HOST_DEVICE size_type get_axis_size(int axis) const noexcept { + axis = axis < 0 ? rank + axis : axis; + CV_Assert(axis >= 0 && axis < rank); + return sizes[axis]; + } + + /** returns a device pointer to mutable device memory */ + CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; } + + /** @brief reshapes the span + * + * Length deduction: + * The length of at most one axis can be deduced using the total size constraint. The axis can + * be marked for deduction by specifying the corresponding size as -1. + * + * The axes for which no size was provided (excluding -1) will be assumed to be one. 
+ * + * Pre-conditions: + * - [start, end) represents a range containing length of the axes in order + * - the number of axis lengths provided must be less than or equal to the tensor rank + * - at most one axis length is allowed for length deduction + * - the lengths provided must ensure that the total number of elements remains unchnged + * + * Exception Guarantee: Strong + */ + template CUDA4DNN_HOST + typename std::enable_if::value, void> // TODO is_iterator + ::type reshape(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= rank); + + using ItrValueType = typename std::iterator_traits::value_type; + + /* the user may leave at most one axis size for deduction by specifying -1 */ + auto sizes_to_deduce = std::count(start, end, -1); + if (sizes_to_deduce > 1) { CV_Error(Error::StsBadArg, "only one axis size can be deduced"); } + + /* sizes must be positive numbers with the exception of -1 */ + auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) { + return !(x > 0 || x == -1); + }); + if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); } + + /* compute the total number of elements in the new tensor */ + size_type unknown_size = 0; + auto total = std::accumulate(start, end, 1, std::multiplies()); + if (total < 0) { + /* there is an unknown size */ + if (std::abs(total) <= size()) { + unknown_size = size() / std::abs(total); + total = size(); + } + /* Edge case: if `total` is already more than size(), skip the deduction as it's impossible + ** Since `total` is negative, the size check which follows will fail and throw an error + */ + } + + /* the number of elements before and after reshape must be exactly same */ + if (total != size()) { + CV_Error(Error::StsBadArg, "new axes do not preserve the tensor element count"); + } + + /* we assume the size of the unspecified axes to be one */ + auto fill_sizes = rank - std::distance(start, end); + std::fill_n(std::begin(sizes), fill_sizes, 1); + std::copy(start, end, std::begin(sizes) + fill_sizes); + + /* replace the unknown axis with the correct value */ + std::replace(std::begin(sizes), std::end(sizes), size_type(-1), unknown_size); + } + + /** @brief reshapes the span + * constructs a range out of the arguments and invokes range-based reshape method + */ + template + CUDA4DNN_HOST void reshape(Sizes... new_sizes_) { + static_assert(sizeof...(Sizes) <= rank, "number of axes exceeds the tensor rank"); + std::array new_sizes = { static_cast(new_sizes_)... }; + reshape(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief reshapes the span + * + * Pre-conditions: + * - the reference tensor must be a non-empty tensor + * - the reference tensor's rank must be lesser than or equal to the rank of target tensor + * + * Exception Guarantee: Strong + */ + template + void reshape_as(const TensorType& tensor) { + static_assert(TensorType::rank <= rank, "cannot reshape a tensor of lower rank to a tensor of higher rank"); + std::array new_sizes; + for (int i = 0; i < TensorType::rank; i++) + new_sizes[i] = tensor.get_axis_size(i); + reshape(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief obtains a subspan of the span + * + * The axes for which no size was provided will be assumed to be one. 
+ * + * Pre-conditions: + * - the `offset` must be less than the size of the span + * - [start, end) represents a range containing length of the subspan axes in order + * - the number of axis lengths provided must be less than or equal to the tensor rank + * - the lengths provided must ensure that the number of elements does not exceed (old size - offset) + * + * Exception Guarantee: Strong + */ + template CUDA4DNN_HOST + typename std::enable_if::value, TensorSpan> // TODO is_iterator + ::type subspan(size_type offset, ForwardItr start, ForwardItr end) const { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= rank); + + auto cur_size = size(); + CV_Assert(offset < cur_size); + + using ItrValueType = typename std::iterator_traits::value_type; + + /* sizes must be positive numbers */ + auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) { + return !(x > 0); + }); + if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); } + + /* the number of elements must be equal to the new size */ + auto max_size = (cur_size - offset); + auto total = std::accumulate(start, end, 1, std::multiplies()); + if (total > max_size) { + CV_Error(Error::StsBadArg, "axis lengths lead to OOB accesses"); + } + + TensorSpan temp; + + /* we assume the size of the unspecified axes to be one */ + auto fill_sizes = rank - std::distance(start, end); + std::fill_n(std::begin(temp.sizes), fill_sizes, 1); + std::copy(start, end, std::begin(temp.sizes) + fill_sizes); + + temp.ptr = ptr + offset; + return temp; + } + + /** @brief obtains a subspan of the span + * constructs a range out of the size arguments and invokes the range-based subspan method + */ + template + CUDA4DNN_HOST TensorSpan subspan(size_type offset, Sizes... new_sizes_) const { + static_assert(sizeof...(Sizes) <= rank, "number of axes exceeds the tensor rank"); + std::array new_sizes = { static_cast(new_sizes_)... 
}; + return subspan(offset, std::begin(new_sizes), std::end(new_sizes)); + } + + operator TensorView() const noexcept; /* defined later */ + + friend void swap(TensorSpan& lhs, TensorSpan& rhs) noexcept { + using std::swap; + swap(lhs.ptr, rhs.ptr); + swap(lhs.sizes, rhs.sizes); + } + + private: + size_type sizes[rank]; + pointer ptr; + }; + + template + Tensor::operator TensorSpan() noexcept { + return TensorSpan(*this); + } + + /** @brief view of a tensor + * + * \tparam T type of data stored by the tensor + * \tparam rank rank of the tensor + * + * A view is valid if and only if the following hold true: + * - parent tensor is still alive + * - parent tensor holds a valid memory block + * - parent tensor hasn't performed any resizing operation since the view was created + */ + template + class TensorView { + public: + using tensor_type = Tensor; + using value_type = typename tensor_type::value_type; + using pointer = typename tensor_type::pointer; + using const_pointer = typename tensor_type::const_pointer; + using size_type = typename tensor_type::size_type; + + static constexpr auto rank = rank_; + + TensorView() noexcept : ptr{ nullptr } { std::fill_n(sizes, rank, 0); } + TensorView(const TensorView&) noexcept = default; + TensorView(const TensorSpan& other) noexcept : ptr{ other.get() } { + for (int i = 0; i < rank; i++) + sizes[i] = other.get_axis_size(i); + } + TensorView(const tensor_type& parent) noexcept : ptr{ parent.get() } { + for (std::size_t i = 0; i < rank; i++) + sizes[i] = parent.get_axis_size(i); + } + + TensorView& operator=(const TensorView&) = default; + TensorView& operator=(const TensorSpan& other) noexcept { + TensorView tmp(other); + swap(*this, tmp); + return *this; + } + + /* returns the total number of elements in the view */ + CUDA4DNN_HOST/*_DEVICE*/ size_type size() const noexcept { + return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); + } + + /** returns true if the tensor is empty */ + CUDA4DNN_HOST/*_DEVICE*/ bool empty() const noexcept { return !size(); } + + /** @brief returns the length of the axis + * + * Negative axis numbers can be used to select axis from the lower order. + * Examples: + * > -1 represents the last axis + * > 0 represents the first axis + * > 1 represents the second axis + * + * Pre-conditions: + * - the axis must be in the range [-rank, rank) + */ + CUDA4DNN_HOST_DEVICE size_type get_axis_size(int axis) const noexcept { + axis = axis < 0 ? rank + axis : axis; + CV_Assert(axis >= 0 && axis < rank); + return sizes[axis]; + } + + /** returns a device pointer to immutable device memory */ + CUDA4DNN_HOST_DEVICE const_pointer get() const noexcept { return ptr; } + + /** @brief reshapes the view + * + * Length deduction: + * The length of at most one axis can be deduced using the total size constraint. The axis can + * be marked for deduction by specifying the size as -1. + * + * The axes for which no size was provided (excluding -1) will be assumed to be one. 
+ * + * Pre-conditions: + * - [start, end) represents a range containing length of the axes in order + * - the number of axis lengths provided must be less than or equal to the tensor rank + * - at most one axis length is allowed for length deduction + * - the lengths provided must ensure that the total number of elements remains unchnged + * + * Exception Guarantee: Strong + */ + template CUDA4DNN_HOST + typename std::enable_if::value, void> + ::type reshape(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= rank); + + using ItrValueType = typename std::iterator_traits::value_type; + + /* the user may leave at most one axis size for deduction by specifying -1 */ + auto sizes_to_deduce = std::count(start, end, -1); + if (sizes_to_deduce > 1) { CV_Error(Error::StsBadArg, "only one axis size can be deduced"); } + + /* sizes must be positive numbers with the exception of -1 */ + auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) { + return !(x > 0 || x == -1); + }); + if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); } + + /* compute the total number of elements in the new tensor */ + size_type unknown_size = 0; + auto total = std::accumulate(start, end, 1, std::multiplies()); + if (total < 0) { + /* there is an unknown size */ + if (std::abs(total) <= size()) { + unknown_size = size() / std::abs(total); + total = size(); + } + /* Edge case: if `total` is already more than size(), skip the deduction as it's impossible + ** Since `total` is negative, the size check which follows will fail and throw an error + */ + } + + /* the number of elements before and after reshape must be exactly same */ + if (total != size()) { + CV_Error(Error::StsBadArg, "new axes do not preserve the tensor element count"); + } + + /* we assume the size of the unspecified axes to be one */ + auto fill_sizes = rank - std::distance(start, end); + std::fill_n(std::begin(sizes), fill_sizes, 1); + std::copy(start, end, std::begin(sizes) + fill_sizes); + + /* replace the unknown axis with the correct value */ + std::replace(std::begin(sizes), std::end(sizes), size_type(-1), unknown_size); + } + + /** @brief reshapes the view + * constructs a range out of the arguments and invokes range-based reshape method + */ + template + CUDA4DNN_HOST void reshape(Sizes... new_sizes_) { + static_assert(sizeof...(Sizes) <= rank, "number of axes exceeds the tensor rank"); + std::array new_sizes = { static_cast(new_sizes_)... }; + reshape(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief reshapes the view + * + * Pre-conditions: + * - the reference tensor must be a non-empty tensor + * - the reference tensor's rank must be lesser than or equal to the rank of target tensor + * + * Exception Guarantee: Strong + */ + template + void reshape_as(const TensorType& tensor) { + static_assert(TensorType::rank <= rank, "cannot reshape a tensor of lower rank to a tensor of higher rank"); + std::array new_sizes; + for (int i = 0; i < TensorType::rank; i++) + new_sizes[i] = tensor.get_axis_size(i); + reshape(std::begin(new_sizes), std::end(new_sizes)); + } + + /** @brief obtains a subview of the view + * + * The axes for which no size was provided will be assumed to be one. 
+ * + * Pre-conditions: + * - the `offset` must be less than the size of the view + * - [start, end) represents a range containing length of the subview axes in order + * - the number of axis lengths provided must be less than or equal to the tensor rank + * - the lengths provided must ensure that the number of elements does not exceed (old size - offset) + * + * Exception Guarantee: Strong + */ + template CUDA4DNN_HOST + typename std::enable_if::value, TensorView> // TODO is_iterator + ::type subview(size_type offset, ForwardItr start, ForwardItr end) const { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= rank); + + auto cur_size = size(); + CV_Assert(offset < cur_size); + + using ItrValueType = typename std::iterator_traits::value_type; + + /* sizes must be positive numbers */ + auto invalid_sizes = std::count_if(start, end, [](ItrValueType x) { + return !(x > 0); + }); + if (invalid_sizes) { CV_Error(Error::StsBadArg, "invalid axis size"); } + + /* the number of elements must be equal to the new size */ + auto max_size = (cur_size - offset); + auto total = std::accumulate(start, end, 1, std::multiplies()); + if (total > max_size) { + CV_Error(Error::StsBadArg, "axes lengths lead to OOB accesses"); + } + + TensorView temp; + + /* we assume the size of the unspecified axes to be one */ + auto fill_sizes = rank - std::distance(start, end); + std::fill_n(std::begin(temp.sizes), fill_sizes, 1); + std::copy(start, end, std::begin(temp.sizes) + fill_sizes); + + temp.ptr = ptr + offset; + return temp; + } + + /** @brief obtains a subview of the view + * constructs a range out of the size arguments and invokes the range-based subview method + */ + template + CUDA4DNN_HOST TensorView subview(size_type offset, Sizes... new_sizes_) const { + static_assert(sizeof...(Sizes) <= rank, "number of axes exceeds the tensor rank"); + std::array new_sizes = { static_cast(new_sizes_)... 
}; + return subview(offset, std::begin(new_sizes), std::end(new_sizes)); + } + + friend void swap(TensorView& lhs, TensorView& rhs) noexcept { + using std::swap; + swap(lhs.ptr, rhs.ptr); + swap(lhs.sizes, rhs.sizes); + } + + private: + size_type sizes[rank]; + const_pointer ptr; + }; + + template + Tensor::operator TensorView() const noexcept { + return TensorView(*this); + } + + template + TensorSpan::operator TensorView() const noexcept { + return TensorView(*this); + } + + /** returns true if the two Tensor/TensorSpan/TensorView objects have the same shape */ + template inline + bool is_same_shape(const TensorType1& x, const TensorType2& y) noexcept { + constexpr auto rank1 = TensorType1::rank; + constexpr auto rank2 = TensorType2::rank; + + if (rank1 != rank2) + return false; + + for (int i = 0; i < rank1; i++) + if (x.get_axis_size(i) != y.get_axis_size(i)) + return false; + return true; + } + + /** returns the rank to which the given tensor can be squeezed to */ + template inline + std::size_t get_effective_rank(const TensorType& x) noexcept { + constexpr auto rank = TensorType::rank; + std::size_t effective_rank = rank; + for (int i = 0; i < rank; i++, effective_rank--) + if (x.get_axis_size(i) != 1) + break; + return effective_rank; + } + + /** returns the length of the axes of a TensorType object in a std::vector */ + template inline + std::vector get_shape_vector(const TensorType& x) { + constexpr auto rank = TensorType::rank; + std::vector shape(rank); + for (int i = 0; i < rank; i++) + shape[i] = x.get_axis_size(i); + return shape; + } + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP*/ diff --git a/modules/dnn/src/cuda4dnn/csl/workspace.cpp b/modules/dnn/src/cuda4dnn/csl/workspace.cpp index fa62848532cc..a8ae44b5ed5e 100644 --- a/modules/dnn/src/cuda4dnn/csl/workspace.cpp +++ b/modules/dnn/src/cuda4dnn/csl/workspace.cpp @@ -2,6 +2,8 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
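// A compact sketch of the tensor.hpp API above: an owning Tensor, the non-owning TensorSpan and
// TensorView that kernels receive, axis queries, reshape and subspan. Shapes are arbitrary
// assumptions chosen for illustration.
#include "tensor.hpp"

void tensor_usage_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    Tensor<float> blob(2, 3, 224, 224);            // owning, rank CSL_DEFAULT_TENSOR_RANK (4)
    CV_Assert(blob.get_axis_size(-1) == 224);      // negative indices count from the last axis

    TensorSpan<float> span(blob);                  // non-owning and mutable
    TensorView<float> view(span);                  // non-owning and read-only
    CV_Assert(is_same_shape(span, view));
    CV_Assert(get_effective_rank(view) == 4);      // no leading unit axes to squeeze away

    // the second item of the batch, without copying anything
    auto sample = span.subspan(3 * 224 * 224, 3, 224, 224);
    CV_Assert(sample.size() == 3 * 224 * 224);

    // flatten it; a single -1 could be used instead to deduce the last axis length
    sample.reshape(1, 3 * 224 * 224);
}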
+#include "../../precomp.hpp" + #include "workspace.hpp" #include "memory.hpp" From adad256226f0e05f8092d245315a2b4b026fc7d2 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 21 Jun 2019 05:37:22 +0530 Subject: [PATCH 006/129] integrate csl::Tensor into backbone code --- modules/dnn/src/op_cuda.cpp | 106 +++++++++++++++++++++++ modules/dnn/src/op_cuda.hpp | 165 +++++++++++++++++++++++++++--------- 2 files changed, 231 insertions(+), 40 deletions(-) diff --git a/modules/dnn/src/op_cuda.cpp b/modules/dnn/src/op_cuda.cpp index 60c5fa5badf4..214b2263e1ee 100644 --- a/modules/dnn/src/op_cuda.cpp +++ b/modules/dnn/src/op_cuda.cpp @@ -5,8 +5,114 @@ #include "precomp.hpp" #include "op_cuda.hpp" +#ifdef HAVE_CUDA +#include "cuda4dnn/csl/stream.hpp" +#include "cuda4dnn/csl/tensor.hpp" +#include "cuda4dnn/csl/pointer.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + +#include +#include + +#include + namespace cv { namespace dnn { +#ifdef HAVE_CUDA + CUDABackendWrapperFP32::CUDABackendWrapperFP32(Mat& m) + : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) + { + CV_Assert(m.isContinuous()); + CV_Assert(m.type() == CV_32F); + CV_Assert(m.size.dims() <= tensor_type::rank); + + shape = cv::dnn::shape(m); + + shared_block = std::make_shared(); + shared_block->host_dirty = true; + shared_block->device_dirty = false; + shared_block->host = m; + shared_block->memGuard = csl::MemoryLockGuard(m.data, m.total() * sizeof(float)); + shared_block->parent = createTensorHeaderFromMat(m); + } + + CUDABackendWrapperFP32::CUDABackendWrapperFP32(const Ptr& base_, const MatShape& shape_) + : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) + { + const Ptr base = base_.dynamicCast(); + + shape = shape_; + shared_block = base->shared_block; + } + + Ptr CUDABackendWrapperFP32::create(Mat& m) + { + return Ptr(new CUDABackendWrapperFP32(m)); + } + + Ptr CUDABackendWrapperFP32::create(const Ptr& base, const MatShape& shape) + { + return Ptr(new CUDABackendWrapperFP32(base, shape)); + } + + /* blocking */ + void CUDABackendWrapperFP32::copyToHost() { + if(shared_block->device_dirty) { + shared_block->host_dirty = false; + shared_block->device_dirty = false; + + /* If the wrapper is being reused, the device tensor might be larger in size. + * Using the device tensor does not give incorrect code, but it leads to unused regions + * of memory being copied. + * + * We use a view to ensure that only the required region of memory is copied. 
+ */ + auto view = tensor_view_type(shared_block->parent).subview(0, std::begin(shape), std::end(shape)); + copyTensorToMat(shared_block->host, view, shared_block->stream); + + shared_block->stream.synchronize(); + } + } + + void CUDABackendWrapperFP32::setHostDirty() { + shared_block->device_dirty = false; + shared_block->host_dirty = true; + } + + /* non-blocking + * we don't have to block for copying to device because all operations are put into a stream which + * ensures that the operations added to the stream are performed in order + */ + void CUDABackendWrapperFP32::copyToDevice() { + if(shared_block->host_dirty) { + shared_block->host_dirty = false; + shared_block->device_dirty = false; + + auto span = tensor_span_type(shared_block->parent).subspan(0, std::begin(shape), std::end(shape)); + copyMatToTensor(span, shared_block->host, shared_block->stream); + } + } + + void CUDABackendWrapperFP32::setDeviceDirty() noexcept { + shared_block->device_dirty = true; + shared_block->host_dirty = false; + } + + void CUDABackendWrapperFP32::setStream(csl::Stream stream) noexcept { + shared_block->stream = std::move(stream); + } + + CUDABackendWrapperFP32::tensor_span_type CUDABackendWrapperFP32::getSpan() noexcept { + setDeviceDirty(); + return tensor_span_type(shared_block->parent).subspan(0, std::begin(shape), std::end(shape)); + } + + CUDABackendWrapperFP32::tensor_view_type CUDABackendWrapperFP32::getView() noexcept { + copyToDevice(); + return tensor_view_type(shared_block->parent).subview(0, std::begin(shape), std::end(shape)); + } +#endif /* ifdef HAVE_CUDA */ } /* namespace dnn */ } /* namespace cv */ diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp index 925d7c834c8e..edde2e9b2c84 100644 --- a/modules/dnn/src/op_cuda.hpp +++ b/modules/dnn/src/op_cuda.hpp @@ -5,6 +5,17 @@ #ifndef OPENCV_DNN_SRC_OP_CUDA_HPP #define OPENCV_DNN_SRC_OP_CUDA_HPP +#ifdef HAVE_CUDA +#include "cuda4dnn/csl/stream.hpp" +#include "cuda4dnn/csl/tensor.hpp" +#include "cuda4dnn/csl/pointer.hpp" +#endif + +#include +#include + +#include + namespace cv { namespace dnn { inline bool haveCUDA() { @@ -16,47 +27,121 @@ namespace cv { } #ifdef HAVE_CUDA - /* CUDA Tensors are represented by csl::Tensor - ** CUDABackendWrapperFP32 wraps a csl::TensorSpan - ** It also maintains a reference to the csl::Tensor. - */ - class CUDABackendWrapperFP32 : public BackendWrapper { + /** @brief creates csl::Tensor object from cv::Mat */ + template > inline + TensorT createTensorHeaderFromMat(const cv::Mat& mat) { + auto is_matrix_type_same_as_tensor_type = [&mat]() { + switch (mat.type()) { + case CV_32F: return std::is_same::value; + default: return false; + } + }; + CV_Assert(is_matrix_type_same_as_tensor_type()); + + auto sizes = shape(mat); + return TensorT(std::begin(sizes), std::end(sizes)); + } + + /** @brief copies data from a cv::Mat and fills a TensorType + * + * Pre-conditions: + * - \p mat must be larger or equal to the tensor in size + * + * @note performance is best for continuous and page-locked cv::Mat + */ + template inline + void copyMatToTensor(const TensorSpanT& tensor, const cv::Mat& mat, const cuda4dnn::csl::Stream& stream) { + CV_Assert(mat.total() >= tensor.size()); + + cv::Mat source = mat.isContinuous() ? 
mat : mat.clone(); + CV_Assert(source.isContinuous()); + + using T = typename TensorSpanT::value_type; + cuda4dnn::csl::memcpy(tensor.get(), reinterpret_cast(source.data), tensor.size(), stream); + } + + /** @brief copies data from a TensorType to a cv::Mat + * + * Pre-conditions: + * - \p mat must be larger or equal to the tensor in size + * + * @note performance is best for continuous and page-locked cv::Mat + */ + template > inline + void copyTensorToMat(cv::Mat& mat, TensorType& tensor, const cuda4dnn::csl::Stream& stream) { + CV_Assert(mat.total() >= tensor.size()); + + cv::Mat source = mat.isContinuous() ? mat : mat.clone(); + CV_Assert(source.isContinuous()); + + using T = typename TensorType::value_type; + cuda4dnn::csl::memcpy(reinterpret_cast(source.data), tensor.get(), tensor.size(), stream); + + if(source.data != mat.data) + source.copyTo(mat); + } + + class CUDABackendWrapperFP32 final : public BackendWrapper { public: - CUDABackendWrapperFP32(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) { - /* TODO: - ** 1. store a reference to cv::Mat - ** 2. create a csl::Tensor - ** 3. create a csl::TensorSpan (or store shape) - */ - } - - CUDABackendWrapperFP32(const Ptr& base, const MatShape& shape) - : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) { - /* TODO: - ** 1. copy reference to csl::Tensor of base - ** 2. set TensorSpan to mimic `shape` (or store shape) - */ - } - - static Ptr create(Mat& m) - { - return Ptr(new CUDABackendWrapperFP32(m)); - } - - static Ptr create(const Ptr& base, const MatShape& shape) - { - return Ptr(new CUDABackendWrapperFP32(base, shape)); - } - - virtual void copyToHost() CV_OVERRIDE { } - virtual void setHostDirty() CV_OVERRIDE { } - - //TensorSpan getSpan(); - //TensorView getView(); - - /* TensorSpan member vs create in getSpan() - ** member tensor span can save shape changes - */ + using value_type = float; + using tensor_type = cuda4dnn::csl::Tensor; + using tensor_span_type = cuda4dnn::csl::TensorSpan; + using tensor_view_type = cuda4dnn::csl::TensorView; + + CUDABackendWrapperFP32(Mat&); + CUDABackendWrapperFP32(const Ptr& base, const MatShape& shape); + + static Ptr create(Mat&); + static Ptr create(const Ptr& base, const MatShape& shape); + + void copyToHost() override; + void setHostDirty() override; + + void copyToDevice(); + void setDeviceDirty() noexcept; + + MatShape getShape() const noexcept { return shape; } + + /** @note setting the stream updates the stream for all wrappers which use the same buffer */ + void setStream(cuda4dnn::csl::Stream stream) noexcept; + + /* Optimization Note: use getSpan() and getView() judiciously + * + * getSpan() is meant to be used when the memory is going to be modified + * getView() is meant to be used when the memory is only going to be read + * + * getSpan() marks the device memory as dirty but getView() does not + * + * getView() implicitly performs host to device memory transfer if required + * getSpan() does not perform any synchronization (use copyToDevice if sync. is required) + */ + tensor_span_type getSpan() noexcept; + tensor_view_type getView() noexcept; + + private: + /* The same device memory can be reused by different layers whenever possible. + * Hence, it is possible for different backend warppers to point to the same device memory. + * However, it may use only a part of the total device memory and have a different shape. + * + * We store the common information such as device tensor and its corresponding host memory in + * a shared block. 
The shared block is shared by all backend wrappers which use the same device memory. + * The shape, which can be different for different wrappers, is stored as a member object. + */ + + MatShape shape; + + struct shared_block_type { + bool host_dirty; + bool device_dirty; + + cv::Mat host; + cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked */ + + tensor_type parent; + cuda4dnn::csl::Stream stream; + }; + + std::shared_ptr shared_block; }; #endif } /* namespace dnn */ From 8635b5e8fd47cac5676050e261a98c056b5f1409 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 21 Jun 2019 11:54:19 +0530 Subject: [PATCH 007/129] switch to CPU iff unsupported; otherwise, fail on error --- modules/dnn/src/dnn.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 83decb5e4800..19e25205cf8c 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -2515,9 +2515,19 @@ struct Net::Impl layer->forwardCUDA(ld.inputBlobsWrappers, ld.outputBlobsWrappers, workspace); #endif } - catch (const cv::Exception&) + catch (const cv::Exception& ex) { - CV_LOG_WARNING(NULL, "Layer does not support CUDA. Switching to CPU implementation. "); + /* if the layer failed because of an error, rethrow */ + if (ex.code != Error::StsNotImplemented) + throw; + + /* the layer does not have a CUDA implementation; use CPU for this layer */ + std::ostringstream os; + os << ld.name << " >> " << ex.what(); + if (ex.code == Error::StsNotImplemented) + os << "Switching to CPU for this layer.\n"; + CV_LOG_WARNING(NULL, os.str().c_str()); + auto actual_target = preferableTarget; preferableBackend = DNN_BACKEND_OPENCV; preferableTarget = DNN_TARGET_CPU; From 6615b7c5fb7feeab330ac4e532471ceb43fd17e1 Mon Sep 17 00:00:00 2001 From: Yashas Date: Sat, 22 Jun 2019 13:29:38 +0530 Subject: [PATCH 008/129] add fully connected layer --- modules/dnn/src/cuda4dnn/csl/cublas.cpp | 65 ++++++++- modules/dnn/src/cuda4dnn/csl/cublas.hpp | 38 ++++++ modules/dnn/src/cuda4dnn/csl/cudnn.cpp | 15 +- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 89 ++++++++++++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 129 ++++++++++++++---- modules/dnn/src/dnn.cpp | 15 ++ .../dnn/src/layers/fully_connected_layer.cpp | 88 ++++++++++++ modules/dnn/src/op_cuda.hpp | 12 +- 8 files changed, 407 insertions(+), 44 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.cpp b/modules/dnn/src/cuda4dnn/csl/cublas.cpp index 8f7550dbceb9..79756e26565c 100644 --- a/modules/dnn/src/cuda4dnn/csl/cublas.cpp +++ b/modules/dnn/src/cuda4dnn/csl/cublas.cpp @@ -6,6 +6,7 @@ #include "cublas.hpp" #include "stream.hpp" +#include "pointer.hpp" #include @@ -93,7 +94,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu /* used to access the raw cuBLAS handle held by Handle */ class HandleAccessor { public: - static cublasHandle_t get(Handle& handle) { + static cublasHandle_t get(const Handle& handle) { CV_Assert(handle); return handle.handle->get(); } @@ -109,4 +110,66 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu Handle::operator bool() const noexcept { return static_cast(handle); } + template <> + void gemm(const Handle& handle, + bool transa, bool transb, + std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, + float alpha, const DevicePtr A, std::size_t lda, + const DevicePtr B, std::size_t ldb, + float beta, const DevicePtr C, std::size_t ldc) + { + CV_Assert(handle); + + auto opa = transa ? 
CUBLAS_OP_T : CUBLAS_OP_N, + opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N; + int irows_c = static_cast(rows_c), + icols_c = static_cast(cols_c), + icommon_dim = static_cast(common_dim), + ilda = static_cast(lda), + ildb = static_cast(ldb), + ildc = static_cast(ldc); + + CUDA4DNN_CHECK_CUBLAS( + cublasSgemm( + HandleAccessor::get(handle), + opa, opb, + irows_c, icols_c, icommon_dim, + &alpha, A.get(), ilda, + B.get(), ildb, + &beta, C.get(), ildc + ) + ); + } + + template <> + void gemm(const Handle& handle, + bool transa, bool transb, + std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, + double alpha, const DevicePtr A, std::size_t lda, + const DevicePtr B, std::size_t ldb, + double beta, const DevicePtr C, std::size_t ldc) + { + CV_Assert(handle); + + auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, + opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N; + int irows_c = static_cast(rows_c), + icols_c = static_cast(cols_c), + icommon_dim = static_cast(common_dim), + ilda = static_cast(lda), + ildb = static_cast(ldb), + ildc = static_cast(ldc); + + CUDA4DNN_CHECK_CUBLAS( + cublasDgemm( + HandleAccessor::get(handle), + opa, opb, + irows_c, icols_c, icommon_dim, + &alpha, A.get(), ilda, + B.get(), ildb, + &beta, C.get(), ildc + ) + ); + } + }}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.hpp b/modules/dnn/src/cuda4dnn/csl/cublas.hpp index b9ddcca7ec63..78416354985f 100644 --- a/modules/dnn/src/cuda4dnn/csl/cublas.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cublas.hpp @@ -5,6 +5,44 @@ #ifndef OPENCV_DNN_CUDA4DNN_CSL_CUBLAS_HPP #define OPENCV_DNN_CUDA4DNN_CSL_CUBLAS_HPP +#include "pointer.hpp" + #include +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas { + + /** @brief GEMM for colummn-major matrices + * + * \f$ C = \alpha AB + \beta C \f$ + * + * @tparam T matrix element type (must be `float` or `double`) + * + * @param handle valid cuBLAS Handle + * @param transa use transposed matrix of A for computation + * @param transb use transposed matrix of B for computation + * @param rows_c number of rows in C + * @param cols_c number of columns in C + * @param common_dim common dimension of A (or trans A) and B (or trans B) + * @param alpha scale factor for AB + * @param[in] A pointer to column-major matrix A in device memory + * @param lda leading dimension of matrix A + * @param[in] B pointer to column-major matrix B in device memory + * @param ldb leading dimension of matrix B + * @param beta scale factor for C + * @param[in,out] C pointer to column-major matrix C in device memory + * @param ldc leading dimension of matrix C + * + * Exception Guarantee: Basic + */ + template + typename std::enable_if::value || std::is_same::value, void> + ::type gemm(const Handle& handle, + bool transa, bool transb, + std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, + T alpha, const DevicePtr A, std::size_t lda, + const DevicePtr B, std::size_t ldb, + T beta, const DevicePtr C, std::size_t ldc); + +}}}}} /* cv::dnn::cuda4dnn::csl::cublas */ + #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUBLAS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp index e0966de4ea4f..ec7ce5c11266 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp @@ -6,6 +6,7 @@ #include "cudnn.hpp" #include "stream.hpp" +#include "pointer.hpp" #include @@ -65,15 +66,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu cudnnHandle_t handle; 
}; - /** used to access the raw cuDNN handle held by Handle */ - class HandleAccessor { - public: - static cudnnHandle_t get(const Handle& handle) { - CV_Assert(handle); - return handle.handle->get(); - } - }; - Handle::Handle() : handle(std::make_shared()) { } Handle::Handle(const Handle&) noexcept = default; Handle::Handle(Handle&&) noexcept = default; @@ -84,4 +76,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu Handle::operator bool() const noexcept { return static_cast(handle); } + cudnnHandle_t HandleAccessor::get(const Handle& handle) { + CV_Assert(handle); + return handle.handle->get(); + } + }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index a50eed098e9f..63d403649b0c 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -7,6 +7,8 @@ #include +#include "pointer.hpp" + #include #define CUDA4DNN_CHECK_CUDNN(call) \ @@ -19,6 +21,93 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu if (status != CUDNN_STATUS_SUCCESS) throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line); } + + /** get_data_type returns the equivalent cudnn enumeration constant for type T */ + template auto get_data_type()->decltype(CUDNN_DATA_FLOAT); + template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_FLOAT; } + template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_DOUBLE; } + } + + + /** used to access the raw cuDNN handle held by Handle */ + class HandleAccessor { + public: + static cudnnHandle_t get(const Handle& handle); + }; + + template + class TensorDescriptor { + public: + TensorDescriptor() noexcept : descriptor{ nullptr } { } + TensorDescriptor(const TensorDescriptor&) = delete; + TensorDescriptor(TensorDescriptor&& other) + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + TensorDescriptor(std::size_t N, std::size_t chans, std::size_t height, std::size_t width) { + CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor)); + try { + CUDA4DNN_CHECK_CUDNN(cudnnSetTensor4dDescriptor(descriptor, + CUDNN_TENSOR_NCHW, detail::get_data_type(), + static_cast(N), static_cast(chans), + static_cast(height), static_cast(width))); + } + catch (...) 
{ + /* cudnnDestroyTensorDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); + throw; + } + } + + ~TensorDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyTensorDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); + } + } + + TensorDescriptor& operator=(const TensorDescriptor&) = delete; + TensorDescriptor& operator=(TensorDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnTensorDescriptor_t get() const noexcept { return descriptor; } + + private: + cudnnTensorDescriptor_t descriptor; + }; + + /** @brief element-wise addition with broadcasting + * + * \f$ C = \alpha A + \beta C \f$ + * + * @tparam T matrix element type (must be `float` or `double`) + * + * @param handle valid cuDNN handle + * @param alpha scale factor for A + * @param aDesc tensor descriptor for A + * @param[in] A pointer to tensor in device memory + * @param beta scale factor for C + * @param cDesc tensor descriptor for C + * @param[in] C pointer to tensor in device memory + * + * Exception Guarantee: Basic + */ + template + typename std::enable_if::value || std::is_same::value, void> + ::type add(const Handle& handle, + T alpha, const TensorDescriptor& aDesc, DevicePtr A, + T beta, const TensorDescriptor& cDesc, DevicePtr C) + { + CUDA4DNN_CHECK_CUDNN( + cudnnAddTensor(HandleAccessor::get(handle), + &alpha, aDesc.get(), A.get(), + &beta, cDesc.get(), C.get() + ) + ); } }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index ffe67825d2a7..dce22577444d 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -47,12 +47,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * "TensorType", frequently used as a template parameter, can refer to Tensor, TensorSpan or TensorView. 
*/ - template - class TensorSpan; - - template - class TensorView; - /** @brief multi-dimensional contiguous GPU tensor containing elements of a single type * * \tparam T type of data stored by the tensor @@ -270,9 +264,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { reshape(std::begin(new_sizes), std::end(new_sizes)); } - operator TensorSpan() noexcept; /* defined later */ - operator TensorView() const noexcept; /* defined later */ - friend void swap(Tensor& lhs, Tensor& rhs) noexcept { using std::swap; swap(lhs.data, rhs.data); @@ -488,8 +479,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return subspan(offset, std::begin(new_sizes), std::end(new_sizes)); } - operator TensorView() const noexcept; /* defined later */ - friend void swap(TensorSpan& lhs, TensorSpan& rhs) noexcept { using std::swap; swap(lhs.ptr, rhs.ptr); @@ -501,11 +490,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { pointer ptr; }; - template - Tensor::operator TensorSpan() noexcept { - return TensorSpan(*this); - } - /** @brief view of a tensor * * \tparam T type of data stored by the tensor @@ -730,19 +714,24 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { const_pointer ptr; }; - template - Tensor::operator TensorView() const noexcept { - return TensorView(*this); - } + /** returns true if the two TensorType objects have the same shape */ + template inline + bool is_same_shape(const TensorType1& x, const TensorType2& y) noexcept { + constexpr auto rank1 = TensorType1::rank; + constexpr auto rank2 = TensorType2::rank; + + if (rank1 != rank2) + return false; - template - TensorSpan::operator TensorView() const noexcept { - return TensorView(*this); + for (int i = 0; i < rank1; i++) + if (x.get_axis_size(i) != y.get_axis_size(i)) + return false; + return true; } - /** returns true if the two Tensor/TensorSpan/TensorView objects have the same shape */ + /** returns true if the two TensorType objects are compatible */ template inline - bool is_same_shape(const TensorType1& x, const TensorType2& y) noexcept { + bool is_shape_compatible(const TensorType1& x, const TensorType2& y) noexcept { constexpr auto rank1 = TensorType1::rank; constexpr auto rank2 = TensorType2::rank; @@ -750,7 +739,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return false; for (int i = 0; i < rank1; i++) - if (x.get_axis_size(i) != y.get_axis_size(i)) + if (x.get_axis_size(i) != y.get_axis_size(i) && + x.get_axis_size(i) != 1 && y.get_axis_size(i) != 1) return false; return true; } @@ -776,6 +766,91 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return shape; } + namespace tensor_ops { + + /** @brief performs generalized matrix-multiplication + * + * Pre-conditions: + * - \p A and \p B must meet the mathematical requirements for matrix multiplication + * - \p result must be large enough to hold the result + * + * Exception Gaurantee: Basic + */ + template inline + void gemm(const cublas::Handle& handle, T beta, TensorSpan result, T alpha, bool transa, TensorView A, bool transb, TensorView B) { + /* matrix operations can be performed only on rank two or less tensors */ + CV_Assert(get_effective_rank(A) <= 2 && + get_effective_rank(B) <= 2 && + get_effective_rank(result) <= 2); + + /* check dimension requirements for matrix multiplication */ + if (!transa && !transb) { + CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2)); + 
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); + } else if (!transa && transb) { + CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1)); + CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); + } else if (transa && !transb) { + CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2)); + CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); + } else { + CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1)); + CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); + } + + const auto result_nr = result.get_axis_size(-2); + const auto result_nc = result.get_axis_size(-1); + const auto common_dim = A.get_axis_size(transa ? -2 : -1); + const auto A_nc = A.get_axis_size(-1); + const auto B_nc = B.get_axis_size(-1); + + cublas::gemm(handle, + transb, transa, + result_nc, result_nr, common_dim, + alpha, B.get(), B_nc, + A.get(), A_nc, + beta, result.get(), result_nc); + } + + /** @brief performs element-wise addition with broadcasting + * + * Pre-conditions: + * - \p A and \p result must be compatible tensors + * + * Exception Gaurantee: Basic + */ + template inline + void add(const cudnn::Handle& handle, T beta, TensorSpan result, T alpha, TensorView A) { + /* mathematical requirements */ + CV_Assert(is_shape_compatible(result, A)); + + /* technical requirements */ + CV_Assert(get_effective_rank(result) <= 4); + CV_Assert(get_effective_rank(A) <= 4); + + using cudnn::TensorDescriptor; + auto aDesc = TensorDescriptor( + A.get_axis_size(-4), + A.get_axis_size(-3), + A.get_axis_size(-2), + A.get_axis_size(-1) + ); + + auto cDesc = TensorDescriptor( + result.get_axis_size(-4), + result.get_axis_size(-3), + result.get_axis_size(-2), + result.get_axis_size(-1) + ); + + cudnn::add(handle, alpha, aDesc, A.get(), beta, cDesc, result.get()); + } + } + }}}} /* namespace cv::dnn::cuda4dnn::csl */ -#endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP*/ +#endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP */ diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 19e25205cf8c..ff6b4eeb3865 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1892,6 +1892,13 @@ struct Net::Impl for (size_t i = 0; i < ninputs; i++) { ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]); +#ifdef HAVE_CUDA + if (IS_DNN_CUDA_TARGET(preferableTarget)) + { + auto wrapper = ld.inputBlobsWrappers[i].dynamicCast(); + wrapper->setStream(stream); + } +#endif } } else @@ -1920,11 +1927,19 @@ struct Net::Impl for (int i = 0; i < ld.outputBlobs.size(); ++i) { ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]); +#ifdef HAVE_CUDA + if (IS_DNN_CUDA_TARGET(preferableTarget)) + { + auto wrapper = ld.outputBlobsWrappers[i].dynamicCast(); + wrapper->setStream(stream); + } +#endif } ld.internalBlobsWrappers.resize(ld.internals.size()); for (int i = 0; i < ld.internals.size(); ++i) { ld.internalBlobsWrappers[i] = wrap(ld.internals[i]); + /* we don't set stream for CUDA backend wrappers for internals as they are not used */ } Ptr layerPtr = ld.getLayerInstance(); diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index c9baf79d0035..4daa21208f63 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include 
"../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include @@ -51,6 +52,11 @@ using namespace cv::dnn::ocl4dnn; #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -123,6 +129,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1); } @@ -415,6 +422,87 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) + { + CV_UNUSED(workspace); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto actual_dims = input_wrapper->getShape().size(); + CV_Assert(get_effective_rank(input) <= actual_dims); + + auto extra_dims = input.rank - actual_dims; + auto flatten_start_axis = clamp(axis, actual_dims) + extra_dims; + + std::size_t batch_size = 1; + for (int j = 0; j < flatten_start_axis; j++) + batch_size *= input.get_axis_size(j); + + auto input_size = input.size() / batch_size; + CV_Assert(input_size == weightsTensor.get_axis_size(-1)); + + auto output_size = output.size() / batch_size; + CV_Assert(output_size == weightsTensor.get_axis_size(-2)); + + /* we treat the input and output as a matrix with dimensions (batch_size, input_size) + * and (batch_size, output_size) respectively + * + * weight matrix dimensions: (output_size, input_size) + * + * I(W^T) = O + * (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size) + */ + input.reshape(batch_size, input_size); + output.reshape(batch_size, output_size); + csl::tensor_ops::gemm(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor); + + if (bias) + { + output.reshape(batch_size, 1, output_size, 1); + csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); + } + } + } + + void initCUDA( + csl::Stream stream, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes + ) + { + cublasHandle = std::move(cublas_handle); + cudnnHandle = std::move(cudnn_handle); + + weightsTensor = createTensorHeaderFromMat(weightsMat); + CV_Assert(get_effective_rank(weightsTensor) == 2); + copyMatToTensor(weightsTensor, weightsMat, stream); + + if (bias) + { + biasTensor = createTensorHeaderFromMat(biasMat); + copyMatToTensor(biasTensor, biasMat, stream); + biasTensor.reshape(-1, 1); + CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.get_axis_size(-2)); + } + } + + csl::Tensor weightsTensor, biasTensor; + csl::cublas::Handle cublasHandle; + csl::cudnn::Handle cudnnHandle; +#endif + virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_HALIDE diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp index edde2e9b2c84..6ec1b893e0c5 100644 --- a/modules/dnn/src/op_cuda.hpp +++ b/modules/dnn/src/op_cuda.hpp @@ -29,7 +29,7 @@ namespace cv { #ifdef HAVE_CUDA /** @brief creates csl::Tensor object from cv::Mat */ template > inline - TensorT createTensorHeaderFromMat(const cv::Mat& mat) { + TensorT 
createTensorHeaderFromMat(const cv::Mat& mat) { auto is_matrix_type_same_as_tensor_type = [&mat]() { switch (mat.type()) { case CV_32F: return std::is_same::value; @@ -49,14 +49,13 @@ namespace cv { * * @note performance is best for continuous and page-locked cv::Mat */ - template inline - void copyMatToTensor(const TensorSpanT& tensor, const cv::Mat& mat, const cuda4dnn::csl::Stream& stream) { + template inline + void copyMatToTensor(const cuda4dnn::csl::TensorSpan tensor, const cv::Mat& mat, const cuda4dnn::csl::Stream& stream) { CV_Assert(mat.total() >= tensor.size()); cv::Mat source = mat.isContinuous() ? mat : mat.clone(); CV_Assert(source.isContinuous()); - using T = typename TensorSpanT::value_type; cuda4dnn::csl::memcpy(tensor.get(), reinterpret_cast(source.data), tensor.size(), stream); } @@ -67,14 +66,13 @@ namespace cv { * * @note performance is best for continuous and page-locked cv::Mat */ - template > inline - void copyTensorToMat(cv::Mat& mat, TensorType& tensor, const cuda4dnn::csl::Stream& stream) { + template inline + void copyTensorToMat(cv::Mat& mat, cuda4dnn::csl::TensorView tensor, const cuda4dnn::csl::Stream& stream) { CV_Assert(mat.total() >= tensor.size()); cv::Mat source = mat.isContinuous() ? mat : mat.clone(); CV_Assert(source.isContinuous()); - using T = typename TensorType::value_type; cuda4dnn::csl::memcpy(reinterpret_cast(source.data), tensor.get(), tensor.size(), stream); if(source.data != mat.data) From cd0234f80c0c1037df547dd16c7f39d9484fd814 Mon Sep 17 00:00:00 2001 From: Yashas Date: Sat, 22 Jun 2019 19:46:51 +0530 Subject: [PATCH 009/129] add softmax layer --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 32 ++++++++++++++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 36 +++++++++++++++- modules/dnn/src/layers/softmax_layer.cpp | 55 ++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 63d403649b0c..4a8a7019e6ae 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -110,6 +110,38 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu ); } + /** @brief computes softmax (or log softmax) + * + * @tparam T matrix element type (must be `float` or `double`) + * + * @param handle valid cuDNN handle + * @param outputDesc tensor descriptor for A + * @param[out] output pointer to tensor in device memory + * @param inputDesc tensor descriptor for C + * @param[in] input pointer to tensor in device memory + * @param log apply log on probabilities + * + * Exception Guarantee: Basic + */ + template + typename std::enable_if::value || std::is_same::value, void> + ::type softmax(const cudnn::Handle& handle, + const TensorDescriptor& outputDesc, DevicePtr output, + const TensorDescriptor& inputDesc, DevicePtr input, + bool log) + { + T alpha = 1.0, beta = 0.0; + cudnnSoftmaxAlgorithm_t algo = log ? 
CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; + CUDA4DNN_CHECK_CUDNN( + cudnnSoftmaxForward( + HandleAccessor::get(handle), + algo, CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, inputDesc.get(), input.get(), + &beta, outputDesc.get(), output.get() + ) + ); + } + }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index dce22577444d..a122467d9067 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -716,7 +716,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { /** returns true if the two TensorType objects have the same shape */ template inline - bool is_same_shape(const TensorType1& x, const TensorType2& y) noexcept { + bool is_shape_same(const TensorType1& x, const TensorType2& y) noexcept { constexpr auto rank1 = TensorType1::rank; constexpr auto rank2 = TensorType2::rank; @@ -849,6 +849,40 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { cudnn::add(handle, alpha, aDesc, A.get(), beta, cDesc, result.get()); } + + /** @brief performs element-wise addition with broadcasting + * + * Pre-conditions: + * - \p A and \p result must be compatible tensors + * + * Exception Gaurantee: Basic + */ + template inline + void softmax(const cudnn::Handle& handle, TensorSpan output, TensorView input, bool log) { + /* mathematical requirements */ + CV_Assert(is_shape_same(output, input)); + + /* technical requirements */ + CV_Assert(get_effective_rank(output) <= 4); + CV_Assert(get_effective_rank(input) <= 4); + + using cudnn::TensorDescriptor; + auto inputDesc = TensorDescriptor( + input.get_axis_size(-4), + input.get_axis_size(-3), + input.get_axis_size(-2), + input.get_axis_size(-1) + ); + + auto outputDesc = TensorDescriptor( + output.get_axis_size(-4), + output.get_axis_size(-3), + output.get_axis_size(-2), + output.get_axis_size(-1) + ); + + cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log); + } } }}}} /* namespace cv::dnn::cuda4dnn::csl */ diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 59c816349219..ec2212efae49 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -54,6 +55,11 @@ using std::max; using namespace cv::dnn::ocl4dnn; #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -90,6 +96,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !logSoftMax) || (backendId == DNN_BACKEND_VKCOM && haveVulkan()); @@ -286,6 +293,54 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) + { + CV_UNUSED(workspace); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto 
output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto actual_dims = input_wrapper->getShape().size(); + CV_Assert(get_effective_rank(input) <= actual_dims); + + auto extra_dims = input.rank - actual_dims; + auto channel_axis = clamp(axisRaw, actual_dims) + extra_dims; + + std::size_t batch_size = 1; + for (int j = 0; j < channel_axis; j++) + batch_size *= input.get_axis_size(j); + + auto channel_size = input.get_axis_size(channel_axis); + input.reshape(batch_size, channel_size, 1, -1); + output.reshape(batch_size, channel_size, 1, -1); + + csl::tensor_ops::softmax(cudnnHandle, output, input, logSoftMax); + } + } + + void initCUDA( + csl::Stream stream, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes + ) + { + cudnnHandle = std::move(cudnn_handle); + } + + csl::cudnn::Handle cudnnHandle; +#endif + virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_VULKAN From e3e5cc411f5862f8918deba09110e1a4c991e92d Mon Sep 17 00:00:00 2001 From: Yashas Date: Sun, 23 Jun 2019 12:46:01 +0530 Subject: [PATCH 010/129] add activation layers --- modules/dnn/src/cuda/math.cu | 243 ++++++++++++++++ modules/dnn/src/cuda/test.cu | 18 -- modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp | 135 +++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 45 +++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 77 +++++ modules/dnn/src/layers/elementwise_layers.cpp | 266 +++++++++++++++++- 6 files changed, 757 insertions(+), 27 deletions(-) create mode 100644 modules/dnn/src/cuda/math.cu delete mode 100644 modules/dnn/src/cuda/test.cu create mode 100644 modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/kernels.hpp diff --git a/modules/dnn/src/cuda/math.cu b/modules/dnn/src/cuda/math.cu new file mode 100644 index 000000000000..0ef18142f566 --- /dev/null +++ b/modules/dnn/src/cuda/math.cu @@ -0,0 +1,243 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/span.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace detail { + template __device__ T abs(T val); + template <> __device__ float abs(float val) { return fabsf(val); } + template <> __device__ double abs(double val) { return fabs(val); } + + template __device__ T exp(T val); + template <> __device__ float exp(float val) { return expf(val); } + template <> __device__ double exp(double val) { return ::exp(val); } + + template __device__ T max(T x, T y); + template <> __device__ float max(float x, float y) { return fmaxf(x, y); } + template <> __device__ double max(double x, double y) { return fmax(x, y); } + + template __device__ T min(T x, T y); + template <> __device__ float min(float x, float y) { return fminf(x, y); } + template <> __device__ double min(double x, double y) { return fmin(x, y); } + + template __device__ T log1p(T val); + template <> __device__ float log1p(float val) { return log1pf(val); } + template <> __device__ double log1p(double val) { return ::log1p(val); } + + template __device__ T log1pexp(T val); + template <> __device__ double log1pexp(double val) { + if (val <= -37) + return exp(val); + else if (-37 < val && val <= 18) + return log1p(exp(val)); + else if (18 < val && val <= 33.3) + return val + exp(-val); + else + return val; + } + template <> __device__ float log1pexp(float val) { return log1pexp(val); } + + template __device__ T tanh(T val); + template <> __device__ float tanh(float val) { return tanhf(val); } + template <> __device__ double tanh(double val) { return ::tanh(val); } + + template __device__ T pow(T val, T exp); + template <> __device__ float pow(float val, float exp) { return powf(val, exp); } + template <> __device__ double pow(double val, double exp) { return ::pow(val, exp); } + + template + __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } + } + + namespace raw { + template + __global__ void abs(span dest, view src) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + using detail::abs; + dest[i] = abs(src[i]); + } + } + + template + __global__ void tanh(span dest, view src) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + using detail::tanh; + dest[i] = tanh(src[i]); + } + } + + template + __global__ void sigmoid(span dest, view src) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + using detail::sigmoid; + dest[i] = sigmoid(src[i]); + } + } + + template + __global__ void bnll(span dest, view src) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + using detail::log1pexp; + dest[i] = src[i] > 0 ? src[i] + log1pexp(-src[i]) : log1pexp(src[i]); + } + } + + template + __global__ void elu(span dest, view src) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + using detail::exp; + dest[i] = src[i] >= 0 ? src[i] : (exp(src[i]) - 1); + } + } + + template + __global__ void relu(span dest, view src, T slope) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) + dest[i] = src[i] >= 0.0 ? 
src[i] : slope * src[i]; + } + + template + __global__ void clipped_relu(span dest, view src, T floor, T ceiling) { + assert(src.size() >= dest.size()); + assert(floor <= ceiling); + for (auto i : grid_stride_range(dest.size())) { + using detail::max; + using detail::min; + dest[i] = min(max(src[i], floor), ceiling); + } + } + + template + __global__ void axiswise_relu(span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + const auto c = (i % inner_size) / channel_size; + dest[i] = src[i] < 0 ? src[i] * slope[c] : src[i]; + } + } + + template + __global__ void power(span dest, view src, T exp, T scale, T shift) { + assert(src.size() >= dest.size()); + for (auto i : grid_stride_range(dest.size())) { + using detail::pow; + dest[i] = pow(shift + scale * src[i], exp); + } + } + } + + template + void abs(const Stream& stream, span dest, view src) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::abs, 0, stream); + launch_kernel(raw::abs, policy, dest, src); + } + + template void abs(const Stream& stream, span dest, view src); + template void abs(const Stream& stream, span dest, view src); + + template + void tanh(const Stream& stream, span dest, view src) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::tanh, 0, stream); + launch_kernel(raw::tanh, policy, dest, src); + } + + template void tanh(const Stream& stream, span dest, view src); + template void tanh(const Stream& stream, span dest, view src); + + template + void sigmoid(const Stream& stream, span dest, view src) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::sigmoid, 0, stream); + launch_kernel(raw::sigmoid, policy, dest, src); + } + + template void sigmoid(const Stream& stream, span dest, view src); + template void sigmoid(const Stream& stream, span dest, view src); + + template + void bnll(const Stream& stream, span dest, view src) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::bnll, 0, stream); + launch_kernel(raw::bnll, policy, dest, src); + } + + template void bnll(const Stream& stream, span dest, view src); + template void bnll(const Stream& stream, span dest, view src); + + template + void elu(const Stream& stream, span dest, view src) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::elu, 0, stream); + launch_kernel(raw::elu, policy, dest, src); + } + + template void elu(const Stream& stream, span dest, view src); + template void elu(const Stream& stream, span dest, view src); + + template + void relu(const Stream& stream, span dest, view src, T slope) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::relu, 0, stream); + launch_kernel(raw::relu, policy, dest, src, slope); + } + + template void relu(const Stream& stream, span dest, view src, float slope); + template void relu(const Stream& stream, span dest, view src, double slope); + + template + void clipped_relu(const Stream& stream, span dest, view src, T floor, T ceiling) { + CV_Assert(src.size() >= dest.size()); + CV_Assert(floor <= ceiling); + + auto policy = make_policy(raw::clipped_relu, 0, stream); + launch_kernel(raw::clipped_relu, policy, dest, src, floor, ceiling); + } + + template void clipped_relu(const Stream& stream, span dest, view src, float floor, float ceiling); + template void clipped_relu(const Stream& stream, span dest, view src, double floor, double ceiling); + + template 
+ void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::axiswise_relu, 0, stream); + launch_kernel(raw::axiswise_relu, policy, dest, src, slope, inner_size, channel_size); + } + + template void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size); + template void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size); + + template + void power(const Stream& stream, span dest, view src, T exp, T scale, T shift) { + CV_Assert(src.size() >= dest.size()); + + auto policy = make_policy(raw::power, 0, stream); + launch_kernel(raw::power, policy, dest, src, exp, scale, shift); + } + + template void power(const Stream& stream, span dest, view src, float exp, float scale, float shift); + template void power(const Stream& stream, span dest, view src, double exp, double scale, double shift); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/test.cu b/modules/dnn/src/cuda/test.cu deleted file mode 100644 index 1a50e97cbbbb..000000000000 --- a/modules/dnn/src/cuda/test.cu +++ /dev/null @@ -1,18 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// this file is a stub and will be removed once actual code is added - -#include "../precomp.hpp" - -#include - -#ifndef HAVE_CUDA -# error "CUDA files should not be compiled if CUDA was not enabled" -#endif - -__global__ void cuda4dnn_build_test_kernel(float* addr) { - int idx = threadIdx.x; - addr[idx] = 0.0; -} diff --git a/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp b/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp new file mode 100644 index 000000000000..c7c3570e6f3b --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp @@ -0,0 +1,135 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_DNN_CUDA4DNN_CSL_KERNEL_UTILS_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_KERNEL_UTILS_HPP + +#include "error.hpp" +#include "stream.hpp" +#include "nvcc_defs.hpp" + +#include + +#ifdef __CUDACC__ +#include +#endif + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + +#ifdef __CUDACC__ + struct execution_policy { + execution_policy(dim3 grid_size, dim3 block_size) + : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { } + + execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem) + : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { } + + execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm) + : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ StreamAccessor::get(strm) } { } + + execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm) + : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ StreamAccessor::get(strm) } { } + + dim3 grid; + dim3 block; + std::size_t sharedMem; + cudaStream_t stream; + }; + + template inline + execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) { + int grid_size, block_size; + CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); + return execution_policy(grid_size, block_size, sharedMem, stream); + } + + template inline + void launch_kernel(Kernel kernel, Args ...args) { + auto policy = make_policy(kernel); + kernel <<>> (std::forward(args)...); + } + + template inline + void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) { + kernel <<>> (std::forward(args)...); + } + + template inline + void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) { + kernel <<>> (std::forward(args)...); + } + + template CUDA4DNN_DEVICE auto getGridDim()->decltype(dim3::x); + template <> inline CUDA4DNN_DEVICE auto getGridDim<0>()->decltype(dim3::x) { return gridDim.x; } + template <> inline CUDA4DNN_DEVICE auto getGridDim<1>()->decltype(dim3::x) { return gridDim.y; } + template <> inline CUDA4DNN_DEVICE auto getGridDim<2>()->decltype(dim3::x) { return gridDim.z; } + + template CUDA4DNN_DEVICE auto getBlockDim()->decltype(dim3::x); + template <> inline CUDA4DNN_DEVICE auto getBlockDim<0>()->decltype(dim3::x) { return blockDim.x; } + template <> inline CUDA4DNN_DEVICE auto getBlockDim<1>()->decltype(dim3::x) { return blockDim.y; } + template <> inline CUDA4DNN_DEVICE auto getBlockDim<2>()->decltype(dim3::x) { return blockDim.z; } + + template CUDA4DNN_DEVICE auto getBlockIdx()->decltype(uint3::x); + template <> inline CUDA4DNN_DEVICE auto getBlockIdx<0>()->decltype(uint3::x) { return blockIdx.x; } + template <> inline CUDA4DNN_DEVICE auto getBlockIdx<1>()->decltype(uint3::x) { return blockIdx.y; } + template <> inline CUDA4DNN_DEVICE auto getBlockIdx<2>()->decltype(uint3::x) { return blockIdx.z; } + + template CUDA4DNN_DEVICE auto getThreadIdx()->decltype(uint3::x); + template <> inline CUDA4DNN_DEVICE auto getThreadIdx<0>()->decltype(uint3::x) { return threadIdx.x; } + template <> inline CUDA4DNN_DEVICE auto getThreadIdx<1>()->decltype(uint3::x) { return threadIdx.y; } + template <> inline CUDA4DNN_DEVICE auto getThreadIdx<2>()->decltype(uint3::x) { return threadIdx.z; } + + template + class grid_stride_range_generic { + public: + CUDA4DNN_DEVICE grid_stride_range_generic(std::size_t to_) : from(0), to(to_) { } + CUDA4DNN_DEVICE grid_stride_range_generic(std::size_t 
from_, std::size_t to_) : from(from_), to(to_) { } + + class iterator + { + public: + CUDA4DNN_DEVICE iterator(std::size_t pos_) : pos(pos_) {} + + CUDA4DNN_DEVICE size_t operator*() const { return pos; } + + CUDA4DNN_DEVICE iterator& operator++() { + pos += getGridDim() * getBlockDim(); + return *this; + } + + CUDA4DNN_DEVICE bool operator!=(const iterator& other) const { + /* NOTE HACK + ** 'pos' can move in large steps (see operator++) + ** expansion of range for loop uses != as the loop conditioion + ** => operator!= must return false if 'pos' crosses the end + */ + return pos < other.pos; + } + + private: + std::size_t pos; + }; + + CUDA4DNN_DEVICE iterator begin() const { + return iterator(from + getBlockDim() * getBlockIdx() + getThreadIdx()); + } + + CUDA4DNN_DEVICE iterator end() const { + return iterator(to); + } + + private: + std::size_t from, to; + }; + + using grid_stride_range_x = grid_stride_range_generic<0>; + using grid_stride_range_y = grid_stride_range_generic<1>; + using grid_stride_range_z = grid_stride_range_generic<2>; + using grid_stride_range = grid_stride_range_x; + +#endif /* __CUDACC__ */ + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_KERNEL_UTILS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp new file mode 100644 index 000000000000..993914424b29 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -0,0 +1,45 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_KERNELS_HPP +#define OPENCV_DNN_CUDA4DNN_KERNELS_HPP + +#include "cuda4dnn/csl/stream.hpp" +#include "cuda4dnn/csl/memory.hpp" +#include "cuda4dnn/csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + template + void abs(const Stream& stream, span dest, view src); + + template + void tanh(const Stream& stream, span dest, view src); + + template + void sigmoid(const Stream& stream, span dest, view src); + + template + void bnll(const Stream& stream, span dest, view src); + + template + void elu(const Stream& stream, span dest, view src); + + template + void relu(const Stream& stream, span dest, view src, T slope); + + template + void clipped_relu(const Stream& stream, span dest, view src, T floor, T ceiling); + + template + void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size); + + template + void power(const Stream& stream, span dest, view src, T exp, T scale, T shift); + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ + +#endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index a122467d9067..0fa696944ac7 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -10,6 +10,7 @@ #include "cublas.hpp" #include "cudnn.hpp" #include "span.hpp" +#include "kernels.hpp" #include @@ -264,6 +265,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { reshape(std::begin(new_sizes), std::end(new_sizes)); } + operator span() noexcept { return span(data.get(), size()); } + operator view() const noexcept { return view(data.get(), size()); } + friend void swap(Tensor& lhs, Tensor& rhs) noexcept { using std::swap; swap(lhs.data, rhs.data); @@ -479,6 +483,9 @@ 
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return subspan(offset, std::begin(new_sizes), std::end(new_sizes)); } + operator span() noexcept { return span(ptr, size()); } + operator view() const noexcept { return view(ptr, size()); } + friend void swap(TensorSpan& lhs, TensorSpan& rhs) noexcept { using std::swap; swap(lhs.ptr, rhs.ptr); @@ -703,6 +710,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return subview(offset, std::begin(new_sizes), std::end(new_sizes)); } + operator view() const noexcept { return view(ptr, size()); } + friend void swap(TensorView& lhs, TensorView& rhs) noexcept { using std::swap; swap(lhs.ptr, rhs.ptr); @@ -808,6 +817,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { const auto A_nc = A.get_axis_size(-1); const auto B_nc = B.get_axis_size(-1); + /* tensors are stored in row-major but cublas::gemm operates on column-major matrices + * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix + * + * Required: C = AB + * what cuBLAS sees: C^T = A^TB^T = (BA)^T + * + * By reversing operands, we effectively perform: + * C^T = B^TA^T = (AB)^T + * + * which gives C = AB + */ cublas::gemm(handle, transb, transa, result_nc, result_nr, common_dim, @@ -883,6 +903,63 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log); } + + template inline + void abs(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::abs(stream, dest, src); + } + + template inline + void bnll(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::bnll(stream, dest, src); + } + + template inline + void relu(const Stream& stream, TensorSpan dest, TensorView src, T slope = 0) { + CV_Assert(is_shape_same(dest, src)); + kernels::relu(stream, dest, src, slope); + } + + template inline + void clipped_relu(const Stream& stream, TensorSpan dest, TensorView src, T min, T max) { + CV_Assert(is_shape_same(dest, src)); + kernels::clipped_relu(stream, dest, src, min, max); + } + + template inline + void channelwise_relu(const Stream& stream, TensorSpan dest, TensorView src, TensorView slope) { + CV_Assert(is_shape_same(dest, src)); + CV_Assert(src.get_axis_size(1) == slope.size()); + std::size_t inner_size = src.size() / src.get_axis_size(0); + std::size_t channel_size = inner_size / src.get_axis_size(1); + kernels::axiswise_relu(stream, dest, src, slope, inner_size, channel_size); + } + + template inline + void elu(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::elu(stream, dest, src); + } + + template inline + void power(const Stream& stream, TensorSpan dest, TensorView src, T exp = 1, T scale = 1, T shift = 0) { + CV_Assert(is_shape_same(dest, src)); + kernels::power(stream, dest, src, exp, scale, shift); + } + + template inline + void sigmoid(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::sigmoid(stream, dest, src); + } + + template inline + void tanh(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::tanh(stream, dest, src); + } } }}}} /* namespace cv::dnn::cuda4dnn::csl */ diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 96dffced4b55..9066c2005b2c 100644 --- 
a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -52,6 +53,12 @@ #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -221,6 +228,30 @@ class ElementWiseLayer : public Func::Layer func.apply(src, dst, len, planeSize, cn0, cn1); } + +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) + { + func.applyCUDA(inputs, outputs, workspace, stream); + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes + ) + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { @@ -261,7 +292,9 @@ struct ReLUFunctor if (backendId == DNN_BACKEND_INFERENCE_ENGINE) return slope >= 0 || !INF_ENGINE_VER_MAJOR_EQ(INF_ENGINE_RELEASE_2019R1); #endif - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_VKCOM; } @@ -297,6 +330,27 @@ struct ReLUFunctor } } +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::relu(stream, output, input, slope); + } + } +#endif + #ifdef HAVE_OPENCL bool initKernel(ocl::Kernel &ker, const UMat &src) const { @@ -392,7 +446,9 @@ struct ReLU6Functor bool supportBackend(int backendId, int) { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_INFERENCE_ENGINE; } @@ -460,6 +516,27 @@ struct ReLU6Functor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::clipped_relu(stream, output, input, minValue, maxValue); + } + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -496,7 +573,9 @@ struct TanHFunctor bool supportBackend(int backendId, int) { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_INFERENCE_ENGINE; } @@ -540,6 +619,27 @@ struct TanHFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + 
std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::tanh(stream, output, input); + } + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -576,7 +676,9 @@ struct SigmoidFunctor bool supportBackend(int backendId, int) { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_INFERENCE_ENGINE; } @@ -620,6 +722,27 @@ struct SigmoidFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::sigmoid(stream, output, input); + } + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -658,7 +781,9 @@ struct ELUFunctor bool supportBackend(int backendId, int) { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_INFERENCE_ENGINE; } @@ -702,6 +827,27 @@ struct ELUFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::elu(stream, output, input); + } + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -742,7 +888,9 @@ struct AbsValFunctor if (backendId == DNN_BACKEND_INFERENCE_ENGINE) return !INF_ENGINE_VER_MAJOR_EQ(INF_ENGINE_RELEASE_2019R1); #endif - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE; + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE; } void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const @@ -785,6 +933,27 @@ struct AbsValFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::abs(stream, output, input); + } + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -821,7 +990,9 @@ struct BNLLFunctor bool supportBackend(int backendId, int) { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE; + 
return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE; } void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const @@ -865,6 +1036,27 @@ struct BNLLFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::bnll(stream, output, input); + } + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -912,7 +1104,9 @@ struct PowerFunctor if (backendId == DNN_BACKEND_INFERENCE_ENGINE) return (targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16) || power == 1.0 || power == 0.5; else - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE; + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE; } void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const @@ -973,6 +1167,27 @@ struct PowerFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::power(stream, output, input, power, scale, shift); +} + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -1051,7 +1266,9 @@ struct ChannelsPReLUFunctor bool supportBackend(int backendId, int) { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_INFERENCE_ENGINE; } @@ -1126,6 +1343,37 @@ struct ChannelsPReLUFunctor } #endif +#ifdef HAVE_CUDA + void applyCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace, + const csl::Stream& stream + ) + { + if(slopeTensor) + { + slopeTensor = std::make_shared>(); + *slopeTensor = createTensorHeaderFromMat(scale); + copyMatToTensor(*slopeTensor, scale, stream); + } + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::channelwise_relu(stream, output, input, *slopeTensor); + } + } + + /* we have a shared_ptr here because csl::Tensor is non-copyable and these functors need to be copyable */ + std::shared_ptr> slopeTensor; +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { From eb69bf747723a11731ff6b4c9eaedc03a7c4e7ce Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 24 Jun 2019 14:57:31 +0530 Subject: [PATCH 011/129] support arbitary rank TensorDescriptor --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 111 ++++++++++++++++--- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 135 +++++++++++------------ 
modules/dnn/src/layers/softmax_layer.cpp | 11 +- 3 files changed, 162 insertions(+), 95 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 4a8a7019e6ae..11206057a015 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -11,6 +11,13 @@ #include +#include +#include +#include +#include +#include +#include + #define CUDA4DNN_CHECK_CUDNN(call) \ ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__) @@ -28,13 +35,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_DOUBLE; } } - /** used to access the raw cuDNN handle held by Handle */ class HandleAccessor { public: static cudnnHandle_t get(const Handle& handle); }; + /** creates a cuDNN tensor descriptor for a given shape */ template class TensorDescriptor { public: @@ -45,19 +52,24 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu other.descriptor = nullptr; } - TensorDescriptor(std::size_t N, std::size_t chans, std::size_t height, std::size_t width) { - CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor)); - try { - CUDA4DNN_CHECK_CUDNN(cudnnSetTensor4dDescriptor(descriptor, - CUDNN_TENSOR_NCHW, detail::get_data_type(), - static_cast(N), static_cast(chans), - static_cast(height), static_cast(width))); - } - catch (...) { - /* cudnnDestroyTensorDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); - throw; - } + /** constructs a tensor descriptor from the axis lengths provided in \p shape */ + template ()))> + TensorDescriptor(const SequenceContainer& shape) { + constructor(shape.begin(), shape.end()); + } + + /** constructs a tensor descriptor from the axis lengths provided in [begin, end) */ + template ::value, void>::type> // TODO is_iterator + TensorDescriptor(ForwardItr begin, ForwardItr end) { + constructor(begin, end); + } + + /** constructs a tensor descriptor from the axis lengths provided as arguments */ + template + TensorDescriptor(Sizes ...sizes) { + static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); + std::array dims = { static_cast(sizes)... 
}; + constructor(std::begin(dims), std::end(dims)); } ~TensorDescriptor() noexcept { @@ -77,6 +89,77 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu cudnnTensorDescriptor_t get() const noexcept { return descriptor; } private: + template + void constructor(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor)); + try { + const auto rank = std::distance(start, end); + if (rank <= 4) { + std::array dims; + std::fill(std::begin(dims), std::end(dims), 1); + + /* suppose we have a 3d tensor, the first axis is the batch axis and + * the second axis is the channel axis (generally) + * + * cuDNN frequently assumes that the first axis is the batch axis and the + * second axis is the channel axis; hence, we copy the shape of a lower rank + * tensor to the begining of `dims` + */ + std::copy(start, end, std::begin(dims)); + + CUDA4DNN_CHECK_CUDNN( + cudnnSetTensor4dDescriptor(descriptor, + CUDNN_TENSOR_NCHW, detail::get_data_type(), + dims[0], dims[1], dims[2], dims[3] + ) + ); + } else { + std::vector stride(rank); + stride.back() = 1; + /* WHAT WE HAVE NOW: + * stride[-1] = 1 + * stride[-2] = garbage + * stride[-3] = garbage + * stride[-4] = garbage + * ... + */ + + std::copy(start + 1, end, stride.begin()); + /* WHAT WE HAVE NOW: + * stride[-1] = 1 + * stride[-2] = dim[-1] + * stride[-3] = dim[-2] + * stride[-4] = dim[-3] + * ... + */ + + std::partial_sum(std::rbegin(stride), std::rend(stride), std::rbegin(stride), std::multiplies()); + /* WHAT WE HAVE NOW: + * stride[-1] = 1 + * stride[-2] = stride[-1] * dim[-1] + * stride[-3] = stride[-2] * dim[-2] + * stride[-4] = stride[-3] * dim[-3] + * ... + */ + + std::vector dims(start, end); + CUDA4DNN_CHECK_CUDNN( + cudnnSetTensorNdDescriptor(descriptor, + detail::get_data_type(), rank, + dims.data(), stride.data() + ) + ); + } + } catch (...) { + /* cudnnDestroyTensorDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); + throw; + } + } + cudnnTensorDescriptor_t descriptor; }; diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 0fa696944ac7..e7dfb5913b4c 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -94,6 +94,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); } + /** returns a shape array consisting of axis lengths in order starting from zero */ + std::array shape() const noexcept { return sizes; } + /** returns true if the tensor is empty */ bool empty() const noexcept { return !size(); } @@ -111,7 +114,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * - the axis must be in the range [-rank, rank) */ size_type get_axis_size(int axis) const noexcept { - axis = axis < 0 ? 
rank + axis : axis; + axis = clamp_axis(axis, rank); CV_Assert(axis >= 0 && axis < rank); return sizes[axis]; } @@ -144,9 +147,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { data.reset(total); /* length of the unspecified axes are assumed to be one */ - auto fill_sizes = rank - std::distance(start, end); - std::fill_n(std::begin(sizes), fill_sizes, 1); - std::copy(start, end, std::begin(sizes) + fill_sizes); + std::fill(std::begin(sizes), std::end(sizes), 1); + std::copy_backward(start, end, std::end(sizes)); } /** @brief resizes the tensor @@ -230,9 +232,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { } /* we assume the size of the unspecified axes to be one */ - auto fill_sizes = rank - std::distance(start, end); - std::fill_n(std::begin(sizes), fill_sizes, 1); - std::copy(start, end, std::begin(sizes) + fill_sizes); + std::fill(std::begin(sizes), std::end(sizes), 1); + std::copy_backward(start, end, std::end(sizes)); /* replace the unknown axis with the correct value */ std::replace(std::begin(sizes), std::end(sizes), size_type(-1), unknown_size); @@ -314,6 +315,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); } + /** returns a shape array consisting of axis lengths in order starting from zero */ + CUDA4DNN_HOST std::array shape() const noexcept { + std::array temp; + std::copy(std::begin(sizes), std::end(sizes), std::begin(temp)); + return temp; + } + /** returns true if the tensor is empty */ CUDA4DNN_HOST/*_DEVICE*/ bool empty() const noexcept { return !size(); } @@ -329,7 +337,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * - the axis must be in the range [-rank, rank) */ CUDA4DNN_HOST_DEVICE size_type get_axis_size(int axis) const noexcept { - axis = axis < 0 ? 
rank + axis : axis; + axis = clamp_axis(axis, rank); CV_Assert(axis >= 0 && axis < rank); return sizes[axis]; } @@ -391,9 +399,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { } /* we assume the size of the unspecified axes to be one */ - auto fill_sizes = rank - std::distance(start, end); - std::fill_n(std::begin(sizes), fill_sizes, 1); - std::copy(start, end, std::begin(sizes) + fill_sizes); + std::fill(std::begin(sizes), std::end(sizes), 1); + std::copy_backward(start, end, std::end(sizes)); /* replace the unknown axis with the correct value */ std::replace(std::begin(sizes), std::end(sizes), size_type(-1), unknown_size); @@ -465,9 +472,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { TensorSpan temp; /* we assume the size of the unspecified axes to be one */ - auto fill_sizes = rank - std::distance(start, end); - std::fill_n(std::begin(temp.sizes), fill_sizes, 1); - std::copy(start, end, std::begin(temp.sizes) + fill_sizes); + std::fill(std::begin(temp.sizes), std::end(temp.sizes), 1); + std::copy_backward(start, end, std::end(temp.sizes)); temp.ptr = ptr + offset; return temp; @@ -541,6 +547,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); } + /** returns a shape array consisting of axis lengths in order starting from zero */ + CUDA4DNN_HOST std::array shape() const noexcept { + std::array temp; + std::copy(std::begin(sizes), std::end(sizes), std::begin(temp)); + return temp; + } + /** returns true if the tensor is empty */ CUDA4DNN_HOST/*_DEVICE*/ bool empty() const noexcept { return !size(); } @@ -556,7 +569,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * - the axis must be in the range [-rank, rank) */ CUDA4DNN_HOST_DEVICE size_type get_axis_size(int axis) const noexcept { - axis = axis < 0 ? rank + axis : axis; + axis = clamp_axis(axis, rank); CV_Assert(axis >= 0 && axis < rank); return sizes[axis]; } @@ -618,9 +631,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { } /* we assume the size of the unspecified axes to be one */ - auto fill_sizes = rank - std::distance(start, end); - std::fill_n(std::begin(sizes), fill_sizes, 1); - std::copy(start, end, std::begin(sizes) + fill_sizes); + std::fill(std::begin(sizes), std::end(sizes), 1); + std::copy_backward(start, end, std::end(sizes)); /* replace the unknown axis with the correct value */ std::replace(std::begin(sizes), std::end(sizes), size_type(-1), unknown_size); @@ -692,9 +704,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { TensorView temp; /* we assume the size of the unspecified axes to be one */ - auto fill_sizes = rank - std::distance(start, end); - std::fill_n(std::begin(temp.sizes), fill_sizes, 1); - std::copy(start, end, std::begin(temp.sizes) + fill_sizes); + std::fill(std::begin(temp.sizes), std::end(temp.sizes), 1); + std::copy_backward(start, end, std::end(temp.sizes)); temp.ptr = ptr + offset; return temp; @@ -723,6 +734,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { const_pointer ptr; }; + /** if the \p axis is a negative index, the equivalent postive index is returned; otherwise, returns \p axis */ + template + CUDA4DNN_HOST_DEVICE constexpr T clamp_axis(T axis, std::size_t rank) { + return axis < 0 ? 
axis + rank : axis; + } + /** returns true if the two TensorType objects have the same shape */ template inline bool is_shape_same(const TensorType1& x, const TensorType2& y) noexcept { @@ -765,14 +782,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return effective_rank; } - /** returns the length of the axes of a TensorType object in a std::vector */ - template inline - std::vector get_shape_vector(const TensorType& x) { - constexpr auto rank = TensorType::rank; - std::vector shape(rank); - for (int i = 0; i < rank; i++) - shape[i] = x.get_axis_size(i); - return shape; + template inline + std::vector squeeze_shape(const Container& shape, std::size_t upto_rank = 1) { + auto start = std::find_if(std::begin(shape), std::end(shape) - upto_rank + 1, [] (T x) { return x != 1; }); + return { start, std::end(shape) }; } namespace tensor_ops { @@ -839,35 +852,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { /** @brief performs element-wise addition with broadcasting * * Pre-conditions: - * - \p A and \p result must be compatible tensors + * - \p A and \p C must be compatible tensors * * Exception Gaurantee: Basic */ template inline - void add(const cudnn::Handle& handle, T beta, TensorSpan result, T alpha, TensorView A) { - /* mathematical requirements */ - CV_Assert(is_shape_compatible(result, A)); - - /* technical requirements */ - CV_Assert(get_effective_rank(result) <= 4); - CV_Assert(get_effective_rank(A) <= 4); + void add(const cudnn::Handle& handle, T beta, TensorSpan C, T alpha, TensorView A) { + CV_Assert(is_shape_compatible(A, C)); using cudnn::TensorDescriptor; - auto aDesc = TensorDescriptor( - A.get_axis_size(-4), - A.get_axis_size(-3), - A.get_axis_size(-2), - A.get_axis_size(-1) - ); - - auto cDesc = TensorDescriptor( - result.get_axis_size(-4), - result.get_axis_size(-3), - result.get_axis_size(-2), - result.get_axis_size(-1) - ); - - cudnn::add(handle, alpha, aDesc, A.get(), beta, cDesc, result.get()); + auto aDesc = TensorDescriptor(A.shape()); + auto cDesc = TensorDescriptor(C.shape()); + cudnn::add(handle, alpha, aDesc, A.get(), beta, cDesc, C.get()); } /** @brief performs element-wise addition with broadcasting @@ -878,29 +874,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Gaurantee: Basic */ template inline - void softmax(const cudnn::Handle& handle, TensorSpan output, TensorView input, bool log) { - /* mathematical requirements */ + void softmax(const cudnn::Handle& handle, TensorSpan output, TensorView input, int channel_axis, bool log) { CV_Assert(is_shape_same(output, input)); - /* technical requirements */ - CV_Assert(get_effective_rank(output) <= 4); - CV_Assert(get_effective_rank(input) <= 4); + channel_axis = clamp_axis(channel_axis, input.rank); - using cudnn::TensorDescriptor; - auto inputDesc = TensorDescriptor( - input.get_axis_size(-4), - input.get_axis_size(-3), - input.get_axis_size(-2), - input.get_axis_size(-1) - ); - - auto outputDesc = TensorDescriptor( - output.get_axis_size(-4), - output.get_axis_size(-3), - output.get_axis_size(-2), - output.get_axis_size(-1) - ); + std::size_t outer_size = 1; + for (int j = 0; j < channel_axis; j++) + outer_size *= input.get_axis_size(j); + + auto channel_size = input.get_axis_size(channel_axis); + + std::size_t inner_size = 1; + for (int j = channel_axis + 1; j < input.rank; j++) + inner_size *= input.get_axis_size(j); + std::array shape = { outer_size, channel_size, 1 , inner_size }; + + using cudnn::TensorDescriptor; + auto 
inputDesc = TensorDescriptor(shape); + auto outputDesc = TensorDescriptor(shape); cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log); } diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index ec2212efae49..04ba47d49305 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -315,16 +315,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer auto extra_dims = input.rank - actual_dims; auto channel_axis = clamp(axisRaw, actual_dims) + extra_dims; - - std::size_t batch_size = 1; - for (int j = 0; j < channel_axis; j++) - batch_size *= input.get_axis_size(j); - - auto channel_size = input.get_axis_size(channel_axis); - input.reshape(batch_size, channel_size, 1, -1); - output.reshape(batch_size, channel_size, 1, -1); - - csl::tensor_ops::softmax(cudnnHandle, output, input, logSoftMax); + csl::tensor_ops::softmax(cudnnHandle, output, input, channel_axis, logSoftMax); } } From f24ad2c79b4d2f5615f0e94618fd077c07605299 Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 24 Jun 2019 18:17:34 +0530 Subject: [PATCH 012/129] pass input wrappers to `initCUDA()` --- modules/dnn/include/opencv2/dnn/dnn.hpp | 4 +++- modules/dnn/src/dnn.cpp | 5 +++-- modules/dnn/src/layers/elementwise_layers.cpp | 3 ++- modules/dnn/src/layers/fully_connected_layer.cpp | 3 ++- modules/dnn/src/layers/softmax_layer.cpp | 3 ++- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index c8ad545ced17..53d4c110387e 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -305,6 +305,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] cublas_handle cuBLAS handle to use for cuBLAS operations * @param[in] cudnn_handle cuDNN handle to use for cuDNN operations * @param[out] scratch_mem_in_bytes request extra device memory in bytes for internals; defaults to zero + * @param inputs layer inputs * * This method needs to be implemented iff the layer supports forward pass compuatation on CUDA devices. */ @@ -312,7 +313,8 @@ CV__DNN_INLINE_NS_BEGIN cuda4dnn::csl::Stream stream, cuda4dnn::csl::cublas::Handle cublas_handle, cuda4dnn::csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs ); /** diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index ff6b4eeb3865..2bf18a1b63b4 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1841,7 +1841,7 @@ struct Net::Impl { auto& ld = layer.second; std::size_t workspace_size_required = 0; - ld.layerInstance->initCUDA(stream, cublasHandle, cudnnHandle, workspace_size_required); + ld.layerInstance->initCUDA(stream, cublasHandle, cudnnHandle, workspace_size_required, ld.inputBlobsWrappers); workspace.require(workspace_size_required); } #endif @@ -3769,7 +3769,8 @@ void Layer::initCUDA( cuda4dnn::csl::Stream stream, cuda4dnn::csl::cublas::Handle cublas_handle, cuda4dnn::csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes) + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs) { /* ** Implementing initCUDA is required iff the layer supports forward pass on CUDA devices. 
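    ** As an illustration only, a minimal sketch of a derived layer picking up the new `inputs`
    ** argument (the concrete wrapper type name is assumed here; the shape accessor follows the
    ** pattern used by the layers updated in this patch):
    **
    **     void initCUDA(csl::Stream stream, csl::cublas::Handle cublas_handle,
    **                   csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes,
    **                   const std::vector<Ptr<BackendWrapper>>& inputs)
    **     {
    **         auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>(); /* assumed wrapper type */
    **         auto input_shape = input_wrapper->getShape();
    **         /* build descriptors from input_shape; bump scratch_mem_in_bytes if extra workspace is needed */
    **     }
    */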
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 9066c2005b2c..0f6b1568efff 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -243,7 +243,8 @@ class ElementWiseLayer : public Func::Layer csl::Stream stream_, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs ) { stream = std::move(stream_); diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 4daa21208f63..06cd56768375 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -479,7 +479,8 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs ) { cublasHandle = std::move(cublas_handle); diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 04ba47d49305..820e2c3b8e5b 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -323,7 +323,8 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs ) { cudnnHandle = std::move(cudnn_handle); From a5ae40735dabb0298d5c7dcc4dd3f46c78b45c08 Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 24 Jun 2019 23:41:19 +0530 Subject: [PATCH 013/129] add 1d/2d/3d-convolution --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 279 ++++++++++++++++++- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 65 +++++ modules/dnn/src/layers/convolution_layer.cpp | 104 ++++++- 3 files changed, 445 insertions(+), 3 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 11206057a015..cf82fa3dd315 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -47,7 +47,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu public: TensorDescriptor() noexcept : descriptor{ nullptr } { } TensorDescriptor(const TensorDescriptor&) = delete; - TensorDescriptor(TensorDescriptor&& other) + TensorDescriptor(TensorDescriptor&& other) noexcept : descriptor{ other.descriptor } { other.descriptor = nullptr; } @@ -163,6 +163,283 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu cudnnTensorDescriptor_t descriptor; }; + /** creates a cuDNN filter descriptor for the given filter shape + * + * Dimension Ordering: + * 0: number of output feature maps + * 1: number of input feature maps + * 2..n: kernel dimensions + */ + template + class FilterDescriptor { + public: + FilterDescriptor() noexcept : descriptor{ nullptr } { } + FilterDescriptor(const FilterDescriptor&) = delete; + FilterDescriptor(FilterDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + /** constructs a filter descriptor from the filter dimensions provided in \p shape */ + template ()))> + FilterDescriptor(const SequenceContainer& shape) { + constructor(shape.begin(), shape.end()); + } + + /** constructs a filter descriptor from the filter 
dimensions provided in [begin, end) */ + template ::value, void>::type> // TODO is_iterator + FilterDescriptor(ForwardItr begin, ForwardItr end) { + constructor(begin, end); + } + + /** constructs a filter descriptor from the filter dimensions provided as arguments */ + template + FilterDescriptor(Sizes ...sizes) { + static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions"); + static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); + std::array dims = { static_cast(sizes)... }; + constructor(std::begin(dims), std::end(dims)); + } + + ~FilterDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyFilterDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); + } + } + + FilterDescriptor& operator=(const FilterDescriptor&) = delete; + FilterDescriptor& operator=(FilterDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnFilterDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) >= 3); + CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor)); + try { + const auto rank = std::distance(start, end); + if (rank == 4) { + std::array dims; + std::copy(start, end, std::begin(dims)); + + CUDA4DNN_CHECK_CUDNN( + cudnnSetFilter4dDescriptor(descriptor, + detail::get_data_type(), CUDNN_TENSOR_NCHW, + dims[0], dims[1], dims[2], dims[3] + ) + ); + } else { + std::vector dims(start, end); + CUDA4DNN_CHECK_CUDNN( + cudnnSetFilterNdDescriptor(descriptor, + detail::get_data_type(), CUDNN_TENSOR_NCHW, + dims.size(), dims.data() + ) + ); + } + } catch (...) 
{ + /* cudnnDestroyFilterDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); + throw; + } + } + + cudnnFilterDescriptor_t descriptor; + }; + + /** creates a cuDNN convolution descriptor */ + template + class ConvolutionDescriptor { + public: + ConvolutionDescriptor() noexcept : descriptor{ nullptr } { } + ConvolutionDescriptor(const ConvolutionDescriptor&) = delete; + ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + template ()))> + ConvolutionDescriptor( + const SequenceContainer& zero_padding, + const SequenceContainer& stride, + const SequenceContainer& dialation, + std::size_t group_count) + { + constructor(zero_padding, stride, dialation, group_count); + } + + ~ConvolutionDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyConvolutionDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); + } + } + + ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete; + ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor( + const SequenceContainer& zero_padding, + const SequenceContainer& stride, + const SequenceContainer& dialation, + std::size_t group_count) + { + CV_Assert(std::size(zero_padding) == std::size(stride)); + CV_Assert(std::size(zero_padding) == std::size(dialation)); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor)); + try { + const auto rank = std::size(zero_padding); + if (rank == 2) { + CUDA4DNN_CHECK_CUDNN( + cudnnSetConvolution2dDescriptor(descriptor, + zero_padding[0], zero_padding[1], + stride[0], stride[1], + dialation[0], dialation[1], + CUDNN_CROSS_CORRELATION, + detail::get_data_type() + ) + ); + } else { + std::vector ipadding(std::begin(zero_padding), std::end(zero_padding)); + std::vector istride(std::begin(stride), std::end(stride)); + std::vector idialation(std::begin(dialation), std::end(dialation)); + CUDA4DNN_CHECK_CUDNN( + cudnnSetConvolutionNdDescriptor(descriptor, rank, + ipadding.data(), + istride.data(), + idialation.data(), + CUDNN_CROSS_CORRELATION, + detail::get_data_type() + ) + ); + } + CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count)); + } catch (...) 
{ + /* cudnnDestroyConvolutionDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); + throw; + } + } + + cudnnConvolutionDescriptor_t descriptor; + }; + + template + class ConvolutionAlgorithm { + public: + ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } + ConvolutionAlgorithm(ConvolutionAlgorithm&) = default; + ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default; + + ConvolutionAlgorithm( + Handle& handle, + ConvolutionDescriptor& conv, + FilterDescriptor& filter, + TensorDescriptor& input, + TensorDescriptor& output) + { + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionForwardAlgorithm(HandleAccessor::get(handle), + input.get(), filter.get(), conv.get(), output.get(), + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + 0, &algo + ) + ); + + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionForwardWorkspaceSize(HandleAccessor::get(handle), + input.get(), filter.get(), conv.get(), output.get(), + algo, &workspace_size + ) + ); + } + + ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default; + ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default; + + auto get() const noexcept { return algo; } + auto get_workspace_size() const noexcept { return workspace_size; } + + private: + cudnnConvolutionFwdAlgo_t algo; + std::size_t workspace_size; + }; + + template inline + void getConvolutionForwardOutputDim( + ConvolutionDescriptor& conv, + FilterDescriptor& filter, + TensorDescriptor& input, + std::vector& output) + { + output.clear(); + output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */ + + std::vector temp(CUDNN_DIM_MAX); + cudnnDataType_t tempDataType; + CUDA4DNN_CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + input.get(), + CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ + &tempDataType, + output.data(), + temp.data(), + temp.data() + ) + ); + const auto rank = output[0]; + output.resize(rank); + + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionNdForwardOutputDim(conv.get(), input.get(), filter.get(), + rank, + output.data() + ) + ); + } + + template inline + void convolve(const Handle& handle, + const FilterDescriptor& filter_desc, + DevicePtr filter_data, + const ConvolutionDescriptor& conv_desc, + const ConvolutionAlgorithm& algo, + DevicePtr workspace, + const TensorDescriptor& input_desc, + DevicePtr input_data, + T alpha, + T beta, + const TensorDescriptor& output_desc, + DevicePtr output_data) + { + CUDA4DNN_CHECK_CUDNN( + cudnnConvolutionForward( + HandleAccessor::get(handle), + &alpha, input_desc.get(), input_data.get(), + filter_desc.get(), filter_data.get(), conv_desc.get(), algo.get(), workspace.get(), + algo.get_workspace_size(), &beta, output_desc.get(), output_data.get() + ) + ); + } + /** @brief element-wise addition with broadcasting * * \f$ C = \alpha A + \beta C \f$ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index e7dfb5913b4c..1c6ed335d7c1 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -11,6 +11,7 @@ #include "cudnn.hpp" #include "span.hpp" #include "kernels.hpp" +#include "workspace.hpp" #include @@ -955,6 +956,70 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { } } + template + class Convolution { + public: + struct params_type { + std::vector input_shape; + std::vector filter_shape; + + std::vector padding; + std::vector stride; + std::vector dialation; + + std::size_t groups; + }; + + Convolution() = default; + Convolution(const 
Convolution&) = delete; + Convolution(Convolution&&) = default; + Convolution(cudnn::Handle handle, const params_type& params) { + cudnnHandle = std::move(handle); + + inputTensorDesc = TensorDescriptor(params.input_shape); + filterDesc = FilterDescriptor(params.filter_shape); + convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dialation, params.groups); + + std::vector output_dims; + getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); + + outputTensorDesc = TensorDescriptor(output_dims); + + algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc); + } + + Convolution& operator=(const Convolution&) = delete; + Convolution& operator=(Convolution&&) = default; + + std::size_t get_workspace_size() const noexcept { + return algo.get_workspace_size(); + } + + void convolve(TensorSpan output, TensorView input, TensorView filters, Workspace& scratchpad) { + cudnn::convolve(cudnnHandle, + filterDesc, filters.get(), + convDesc, algo, WorkspaceAccessor::get(scratchpad), + inputTensorDesc, input.get(), 1.0, + 0.0, outputTensorDesc, output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + + using TensorDescriptor = cudnn::TensorDescriptor; + TensorDescriptor inputTensorDesc, outputTensorDesc; + + using FilterDescriptor = cudnn::FilterDescriptor; + FilterDescriptor filterDesc; + + using ConvolutionDescriptor = cudnn::ConvolutionDescriptor; + ConvolutionDescriptor convDesc; + + using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm; + ConvolutionAlgorithm algo; + }; + }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP */ diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index c8744fadac38..30e078ce60a2 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -55,6 +56,11 @@ using namespace cv::dnn::ocl4dnn; #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -253,6 +259,9 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl virtual bool supportBackend(int backendId) CV_OVERRIDE { + if (backendId == DNN_BACKEND_CUDA) + return true; + #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { @@ -491,8 +500,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl return Ptr(); } - - virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_HALIDE @@ -1281,6 +1288,99 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes); } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) + { + CV_Assert(!activ); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto start = std::chrono::steady_clock::now(); + + convoluter.convolve(output, input, filtersTensor, workspace); + if (hasBias() || fusedBias) + csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); + 
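            /* note: tensor_ops::add computes C = alpha*A + beta*C, so with beta = 1.0 the broadcast
             * bias is accumulated into the convolution output produced above instead of overwriting it */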
} + } + + void initCUDA( + csl::Stream stream, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) + { + cudnnHandle = std::move(cudnn_handle); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input_shape = input_wrapper->getShape(); + + /* we support 1-6d convolution */ + CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 8); + + CV_Assert(blobs.size() >= 1); + const auto& filtersMat = blobs[0]; + + const auto output_feature_maps = filtersMat.size[0]; + const auto input_feature_maps = input_shape[1]; + const auto input_feature_maps_per_group = filtersMat.size[1]; + const auto groups = input_feature_maps / input_feature_maps_per_group; + CV_Assert(input_feature_maps % input_feature_maps_per_group == 0); + + const Mat& filterWeightsSource = newWeightAndBias ? weightsMat : filtersMat; + filtersTensor = createTensorHeaderFromMat(filterWeightsSource); + copyMatToTensor(filtersTensor, filterWeightsSource, stream); + + if (hasBias() || fusedBias) + { + std::vector biasShape(input_shape.size(), 1); + biasShape[1] = output_feature_maps; + Mat biasMat(input_shape.size(), biasShape.data(), CV_32F, &biasvec[0]); + biasTensor = createTensorHeaderFromMat(biasMat); + copyMatToTensor(biasTensor, biasMat, stream); + } + + if(pads_begin != pads_end) + CV_Error(Error::StsNotImplemented, "Asymmetric padding for convolution layer is not supported by CUDA backend"); + + csl::Convolution::params_type params; + params.padding = pads_begin; + params.stride = strides; + params.dialation = dilations; + params.groups = groups; + + auto& ishape = params.input_shape; + ishape.resize(input_shape.size()); + std::copy(std::begin(input_shape), std::end(input_shape), std::begin(ishape)); + + auto& fshape = params.filter_shape; + fshape.resize(ishape.size()); + fshape[0] = output_feature_maps; + fshape[1] = input_feature_maps_per_group; + + std::copy_backward(std::begin(kernel_size), std::end(kernel_size), std::end(fshape)); + CV_Assert(fshape.size() == kernel_size.size() + 2); + + convoluter = csl::Convolution(cudnnHandle, params); + scratch_mem_in_bytes = convoluter.get_workspace_size(); + } + + csl::cudnn::Handle cudnnHandle; + csl::Tensor filtersTensor, biasTensor; + csl::Convolution convoluter; +#endif + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { From bb984df34826a1918e6859ccc9bdf0badda270d6 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 25 Jun 2019 12:23:56 +0530 Subject: [PATCH 014/129] add pooling layer --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 152 ++++++++++++++++++++++- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 51 ++++++++ modules/dnn/src/layers/pooling_layer.cpp | 76 ++++++++++++ 3 files changed, 276 insertions(+), 3 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index cf82fa3dd315..8a684bfad042 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -385,9 +385,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu template inline void getConvolutionForwardOutputDim( - ConvolutionDescriptor& conv, - FilterDescriptor& filter, - TensorDescriptor& input, + const ConvolutionDescriptor& conv, + const FilterDescriptor& filter, + const TensorDescriptor& input, std::vector& output) { output.clear(); @@ -440,6 +440,152 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu ); } + class 
PoolingDescriptor { + public: + enum class pooling_type { + max, + average_exclude_padding, + average_include_padding + }; + + PoolingDescriptor() noexcept : descriptor{ nullptr } { } + PoolingDescriptor(const PoolingDescriptor&) = delete; + PoolingDescriptor(PoolingDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + template ()))> + PoolingDescriptor( + const SequenceContainer& window_size, + const SequenceContainer& padding, + const SequenceContainer& stride, + pooling_type type) + { + constructor(window_size, padding, stride, type); + } + + ~PoolingDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyPoolingDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); + } + } + + PoolingDescriptor& operator=(const PoolingDescriptor&) = delete; + PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnPoolingDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor( + const SequenceContainer& window_size, + const SequenceContainer& padding, + const SequenceContainer& stride, + pooling_type type) + { + CV_Assert(std::size(window_size) == std::size(padding)); + CV_Assert(std::size(window_size) == std::size(stride)); + + auto get_pooling_type = [] (pooling_type type) { + switch (type) { + case pooling_type::max: + return CUDNN_POOLING_MAX; + case pooling_type::average_exclude_padding: + return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + case pooling_type::average_include_padding: + return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + return CUDNN_POOLING_MAX; + }; + + CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor)); + try { + const auto rank = std::size(window_size); + if (rank == 2) { + CUDA4DNN_CHECK_CUDNN( + cudnnSetPooling2dDescriptor(descriptor, + get_pooling_type(type), CUDNN_PROPAGATE_NAN, + window_size[0], window_size[1], + padding[0], padding[1], + stride[0], stride[1] + ) + ); + } + else { + std::vector iwindow_size(std::begin(window_size), std::end(window_size)); + std::vector ipadding(std::begin(padding), std::end(padding)); + std::vector istride(std::begin(stride), std::end(stride)); + CUDA4DNN_CHECK_CUDNN( + cudnnSetPoolingNdDescriptor(descriptor, + get_pooling_type(type), CUDNN_PROPAGATE_NAN, + rank, + iwindow_size.data(), + ipadding.data(), + istride.data() + ) + ); + } + } catch (...) 
{ + /* cudnnDestroyPoolingDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); + throw; + } + } + + cudnnPoolingDescriptor_t descriptor; + }; + + template inline + void getPoolingForwardOutputDim(const PoolingDescriptor& pooling_desc, + const TensorDescriptor& input, + std::vector& output) { + output.clear(); + output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */ + + std::vector temp(CUDNN_DIM_MAX); + cudnnDataType_t tempDataType; + CUDA4DNN_CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + input.get(), + CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ + &tempDataType, + output.data(), + temp.data(), + temp.data() + ) + ); + const auto rank = output[0]; + output.resize(rank); + + CUDA4DNN_CHECK_CUDNN( + cudnnGetPoolingNdForwardOutputDim(pooling_desc.get(), input.get(), rank, output.data()) + ); + } + + template inline + void pool(Handle& handle, + PoolingDescriptor& pooling_desc, + TensorDescriptor& input_desc, + DevicePtr input_data, + T alpha, T beta, + TensorDescriptor& output_desc, + DevicePtr output_data) + { + CUDA4DNN_CHECK_CUDNN( + cudnnPoolingForward(HandleAccessor::get(handle), + pooling_desc.get(), + &alpha, input_desc.get(), input_data.get(), + &beta, output_desc.get(), output_data.get() + ) + ); + } + /** @brief element-wise addition with broadcasting * * \f$ C = \alpha A + \beta C \f$ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 1c6ed335d7c1..6c9d826dba90 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -995,6 +995,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return algo.get_workspace_size(); } + /* plain convolution */ void convolve(TensorSpan output, TensorView input, TensorView filters, Workspace& scratchpad) { cudnn::convolve(cudnnHandle, filterDesc, filters.get(), @@ -1020,6 +1021,56 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { ConvolutionAlgorithm algo; }; + template + class Pooling { + using TensorDescriptor = cudnn::TensorDescriptor; + using PoolingDescriptor = cudnn::PoolingDescriptor; + + public: + using pooling_type = PoolingDescriptor::pooling_type; + + struct params_type { + std::vector input_shape; + + std::vector window_size; + std::vector padding; + std::vector stride; + + pooling_type type; + }; + + Pooling() = default; + Pooling(const Pooling&) = delete; + Pooling(Pooling&&) = default; + Pooling(cudnn::Handle handle, const params_type& params) { + + cudnnHandle = std::move(handle); + + inputTensorDesc = TensorDescriptor(params.input_shape); + + poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); + + std::vector output_dim; + getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); + + outputTensorDesc = TensorDescriptor(output_dim); + } + + Pooling& operator=(const Pooling&) = delete; + Pooling& operator=(Pooling&&) = default; + + void pool(TensorView input, TensorSpan& output) { + cudnn::pool(cudnnHandle, poolingDesc, inputTensorDesc, input.get(), 1.0, 0.0, outputTensorDesc, output.get()); + } + + private: + cudnn::Handle cudnnHandle; + + TensorDescriptor inputTensorDesc, outputTensorDesc; + + PoolingDescriptor poolingDesc; + }; + }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP */ diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 8143b93c1c59..7709ece0995a 100644 
--- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -43,6 +43,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "opencv2/core/hal/intrin.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -57,6 +58,11 @@ using std::min; using namespace cv::dnn::ocl4dnn; #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -184,6 +190,9 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer } else { + if (backendId == DNN_BACKEND_CUDA) + return (type == MAX || type == AVE); + if (kernel_size.size() == 3) return (backendId == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU); if (kernel_size.empty() || kernel_size.size() == 2) @@ -283,6 +292,73 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) + { + CV_UNUSED(workspace); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + pooler.pool(input, output); + } + + void initCUDA( + csl::Stream stream, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) + { + cudnnHandle = std::move(cudnn_handle); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input_shape = input_wrapper->getShape(); + + if (pads_begin != pads_end) + CV_Error(Error::StsNotImplemented, "Asymmetric padding for pooling layer is not supported by CUDA backend"); + + csl::Pooling::params_type params; + + auto& ishape = params.input_shape; + ishape.resize(input_shape.size()); + std::copy(std::begin(input_shape), std::end(input_shape), std::begin(ishape)); + + params.window_size = kernel_size; + params.padding = pads_begin; + params.stride = strides; + + if (type == MAX) + { + params.type = csl::Pooling::pooling_type::max; + } + else if (type == AVE) + { + if(padMode == "SAME") + params.type = csl::Pooling::pooling_type::average_exclude_padding; + else + params.type = csl::Pooling::pooling_type::average_include_padding; + } + else + CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); + + /* TODO ceilMode */ + + pooler = csl::Pooling(cudnnHandle, params); + } + + csl::cudnn::Handle cudnnHandle; + csl::Pooling pooler; +#endif + virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_VULKAN From 16db28b38bc76736a8613960100cd0efbac178c1 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 25 Jun 2019 14:32:28 +0530 Subject: [PATCH 015/129] reorganize and refactor code --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 646 +----------------- .../src/cuda4dnn/csl/cudnn/convolution.hpp | 307 +++++++++ modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp | 198 ++++++ .../dnn/src/cuda4dnn/csl/cudnn/pooling.hpp | 178 +++++ .../dnn/src/cuda4dnn/csl/cudnn/softmax.hpp | 49 ++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 36 +- modules/dnn/src/layers/pooling_layer.cpp | 6 +- 7 files changed, 755 insertions(+), 665 deletions(-) create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp diff --git 
a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 8a684bfad042..b0671a86807c 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -7,647 +7,9 @@ #include -#include "pointer.hpp" - -#include - -#include -#include -#include -#include -#include -#include - -#define CUDA4DNN_CHECK_CUDNN(call) \ - ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__) - -namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { - - namespace detail { - inline void check(cudnnStatus_t status, const char* func, const char* file, int line) { - if (status != CUDNN_STATUS_SUCCESS) - throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line); - } - - /** get_data_type returns the equivalent cudnn enumeration constant for type T */ - template auto get_data_type()->decltype(CUDNN_DATA_FLOAT); - template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_FLOAT; } - template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_DOUBLE; } - } - - /** used to access the raw cuDNN handle held by Handle */ - class HandleAccessor { - public: - static cudnnHandle_t get(const Handle& handle); - }; - - /** creates a cuDNN tensor descriptor for a given shape */ - template - class TensorDescriptor { - public: - TensorDescriptor() noexcept : descriptor{ nullptr } { } - TensorDescriptor(const TensorDescriptor&) = delete; - TensorDescriptor(TensorDescriptor&& other) noexcept - : descriptor{ other.descriptor } { - other.descriptor = nullptr; - } - - /** constructs a tensor descriptor from the axis lengths provided in \p shape */ - template ()))> - TensorDescriptor(const SequenceContainer& shape) { - constructor(shape.begin(), shape.end()); - } - - /** constructs a tensor descriptor from the axis lengths provided in [begin, end) */ - template ::value, void>::type> // TODO is_iterator - TensorDescriptor(ForwardItr begin, ForwardItr end) { - constructor(begin, end); - } - - /** constructs a tensor descriptor from the axis lengths provided as arguments */ - template - TensorDescriptor(Sizes ...sizes) { - static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); - std::array dims = { static_cast(sizes)... 
}; - constructor(std::begin(dims), std::end(dims)); - } - - ~TensorDescriptor() noexcept { - if (descriptor != nullptr) { - /* cudnnDestroyTensorDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); - } - } - - TensorDescriptor& operator=(const TensorDescriptor&) = delete; - TensorDescriptor& operator=(TensorDescriptor&& other) noexcept { - descriptor = other.descriptor; - other.descriptor = nullptr; - return *this; - }; - - cudnnTensorDescriptor_t get() const noexcept { return descriptor; } - - private: - template - void constructor(ForwardItr start, ForwardItr end) { - CV_Assert(start != end); - CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); - - CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor)); - try { - const auto rank = std::distance(start, end); - if (rank <= 4) { - std::array dims; - std::fill(std::begin(dims), std::end(dims), 1); - - /* suppose we have a 3d tensor, the first axis is the batch axis and - * the second axis is the channel axis (generally) - * - * cuDNN frequently assumes that the first axis is the batch axis and the - * second axis is the channel axis; hence, we copy the shape of a lower rank - * tensor to the begining of `dims` - */ - std::copy(start, end, std::begin(dims)); - - CUDA4DNN_CHECK_CUDNN( - cudnnSetTensor4dDescriptor(descriptor, - CUDNN_TENSOR_NCHW, detail::get_data_type(), - dims[0], dims[1], dims[2], dims[3] - ) - ); - } else { - std::vector stride(rank); - stride.back() = 1; - /* WHAT WE HAVE NOW: - * stride[-1] = 1 - * stride[-2] = garbage - * stride[-3] = garbage - * stride[-4] = garbage - * ... - */ - - std::copy(start + 1, end, stride.begin()); - /* WHAT WE HAVE NOW: - * stride[-1] = 1 - * stride[-2] = dim[-1] - * stride[-3] = dim[-2] - * stride[-4] = dim[-3] - * ... - */ - - std::partial_sum(std::rbegin(stride), std::rend(stride), std::rbegin(stride), std::multiplies()); - /* WHAT WE HAVE NOW: - * stride[-1] = 1 - * stride[-2] = stride[-1] * dim[-1] - * stride[-3] = stride[-2] * dim[-2] - * stride[-4] = stride[-3] * dim[-3] - * ... - */ - - std::vector dims(start, end); - CUDA4DNN_CHECK_CUDNN( - cudnnSetTensorNdDescriptor(descriptor, - detail::get_data_type(), rank, - dims.data(), stride.data() - ) - ); - } - } catch (...) 
{ - /* cudnnDestroyTensorDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); - throw; - } - } - - cudnnTensorDescriptor_t descriptor; - }; - - /** creates a cuDNN filter descriptor for the given filter shape - * - * Dimension Ordering: - * 0: number of output feature maps - * 1: number of input feature maps - * 2..n: kernel dimensions - */ - template - class FilterDescriptor { - public: - FilterDescriptor() noexcept : descriptor{ nullptr } { } - FilterDescriptor(const FilterDescriptor&) = delete; - FilterDescriptor(FilterDescriptor&& other) noexcept - : descriptor{ other.descriptor } { - other.descriptor = nullptr; - } - - /** constructs a filter descriptor from the filter dimensions provided in \p shape */ - template ()))> - FilterDescriptor(const SequenceContainer& shape) { - constructor(shape.begin(), shape.end()); - } - - /** constructs a filter descriptor from the filter dimensions provided in [begin, end) */ - template ::value, void>::type> // TODO is_iterator - FilterDescriptor(ForwardItr begin, ForwardItr end) { - constructor(begin, end); - } - - /** constructs a filter descriptor from the filter dimensions provided as arguments */ - template - FilterDescriptor(Sizes ...sizes) { - static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions"); - static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); - std::array dims = { static_cast(sizes)... }; - constructor(std::begin(dims), std::end(dims)); - } - - ~FilterDescriptor() noexcept { - if (descriptor != nullptr) { - /* cudnnDestroyFilterDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); - } - } - - FilterDescriptor& operator=(const FilterDescriptor&) = delete; - FilterDescriptor& operator=(FilterDescriptor&& other) noexcept { - descriptor = other.descriptor; - other.descriptor = nullptr; - return *this; - }; - - cudnnFilterDescriptor_t get() const noexcept { return descriptor; } - - private: - template - void constructor(ForwardItr start, ForwardItr end) { - CV_Assert(start != end); - CV_Assert(std::distance(start, end) >= 3); - CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); - - CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor)); - try { - const auto rank = std::distance(start, end); - if (rank == 4) { - std::array dims; - std::copy(start, end, std::begin(dims)); - - CUDA4DNN_CHECK_CUDNN( - cudnnSetFilter4dDescriptor(descriptor, - detail::get_data_type(), CUDNN_TENSOR_NCHW, - dims[0], dims[1], dims[2], dims[3] - ) - ); - } else { - std::vector dims(start, end); - CUDA4DNN_CHECK_CUDNN( - cudnnSetFilterNdDescriptor(descriptor, - detail::get_data_type(), CUDNN_TENSOR_NCHW, - dims.size(), dims.data() - ) - ); - } - } catch (...) 
{ - /* cudnnDestroyFilterDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); - throw; - } - } - - cudnnFilterDescriptor_t descriptor; - }; - - /** creates a cuDNN convolution descriptor */ - template - class ConvolutionDescriptor { - public: - ConvolutionDescriptor() noexcept : descriptor{ nullptr } { } - ConvolutionDescriptor(const ConvolutionDescriptor&) = delete; - ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept - : descriptor{ other.descriptor } { - other.descriptor = nullptr; - } - - template ()))> - ConvolutionDescriptor( - const SequenceContainer& zero_padding, - const SequenceContainer& stride, - const SequenceContainer& dialation, - std::size_t group_count) - { - constructor(zero_padding, stride, dialation, group_count); - } - - ~ConvolutionDescriptor() noexcept { - if (descriptor != nullptr) { - /* cudnnDestroyConvolutionDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); - } - } - - ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete; - ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept { - descriptor = other.descriptor; - other.descriptor = nullptr; - return *this; - }; - - cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; } - - private: - template - void constructor( - const SequenceContainer& zero_padding, - const SequenceContainer& stride, - const SequenceContainer& dialation, - std::size_t group_count) - { - CV_Assert(std::size(zero_padding) == std::size(stride)); - CV_Assert(std::size(zero_padding) == std::size(dialation)); - - CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor)); - try { - const auto rank = std::size(zero_padding); - if (rank == 2) { - CUDA4DNN_CHECK_CUDNN( - cudnnSetConvolution2dDescriptor(descriptor, - zero_padding[0], zero_padding[1], - stride[0], stride[1], - dialation[0], dialation[1], - CUDNN_CROSS_CORRELATION, - detail::get_data_type() - ) - ); - } else { - std::vector ipadding(std::begin(zero_padding), std::end(zero_padding)); - std::vector istride(std::begin(stride), std::end(stride)); - std::vector idialation(std::begin(dialation), std::end(dialation)); - CUDA4DNN_CHECK_CUDNN( - cudnnSetConvolutionNdDescriptor(descriptor, rank, - ipadding.data(), - istride.data(), - idialation.data(), - CUDNN_CROSS_CORRELATION, - detail::get_data_type() - ) - ); - } - CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count)); - } catch (...) 
{ - /* cudnnDestroyConvolutionDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); - throw; - } - } - - cudnnConvolutionDescriptor_t descriptor; - }; - - template - class ConvolutionAlgorithm { - public: - ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } - ConvolutionAlgorithm(ConvolutionAlgorithm&) = default; - ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default; - - ConvolutionAlgorithm( - Handle& handle, - ConvolutionDescriptor& conv, - FilterDescriptor& filter, - TensorDescriptor& input, - TensorDescriptor& output) - { - CUDA4DNN_CHECK_CUDNN( - cudnnGetConvolutionForwardAlgorithm(HandleAccessor::get(handle), - input.get(), filter.get(), conv.get(), output.get(), - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - 0, &algo - ) - ); - - CUDA4DNN_CHECK_CUDNN( - cudnnGetConvolutionForwardWorkspaceSize(HandleAccessor::get(handle), - input.get(), filter.get(), conv.get(), output.get(), - algo, &workspace_size - ) - ); - } - - ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default; - ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default; - - auto get() const noexcept { return algo; } - auto get_workspace_size() const noexcept { return workspace_size; } - - private: - cudnnConvolutionFwdAlgo_t algo; - std::size_t workspace_size; - }; - - template inline - void getConvolutionForwardOutputDim( - const ConvolutionDescriptor& conv, - const FilterDescriptor& filter, - const TensorDescriptor& input, - std::vector& output) - { - output.clear(); - output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */ - - std::vector temp(CUDNN_DIM_MAX); - cudnnDataType_t tempDataType; - CUDA4DNN_CHECK_CUDNN( - cudnnGetTensorNdDescriptor( - input.get(), - CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ - &tempDataType, - output.data(), - temp.data(), - temp.data() - ) - ); - const auto rank = output[0]; - output.resize(rank); - - CUDA4DNN_CHECK_CUDNN( - cudnnGetConvolutionNdForwardOutputDim(conv.get(), input.get(), filter.get(), - rank, - output.data() - ) - ); - } - - template inline - void convolve(const Handle& handle, - const FilterDescriptor& filter_desc, - DevicePtr filter_data, - const ConvolutionDescriptor& conv_desc, - const ConvolutionAlgorithm& algo, - DevicePtr workspace, - const TensorDescriptor& input_desc, - DevicePtr input_data, - T alpha, - T beta, - const TensorDescriptor& output_desc, - DevicePtr output_data) - { - CUDA4DNN_CHECK_CUDNN( - cudnnConvolutionForward( - HandleAccessor::get(handle), - &alpha, input_desc.get(), input_data.get(), - filter_desc.get(), filter_data.get(), conv_desc.get(), algo.get(), workspace.get(), - algo.get_workspace_size(), &beta, output_desc.get(), output_data.get() - ) - ); - } - - class PoolingDescriptor { - public: - enum class pooling_type { - max, - average_exclude_padding, - average_include_padding - }; - - PoolingDescriptor() noexcept : descriptor{ nullptr } { } - PoolingDescriptor(const PoolingDescriptor&) = delete; - PoolingDescriptor(PoolingDescriptor&& other) noexcept - : descriptor{ other.descriptor } { - other.descriptor = nullptr; - } - - template ()))> - PoolingDescriptor( - const SequenceContainer& window_size, - const SequenceContainer& padding, - const SequenceContainer& stride, - pooling_type type) - { - constructor(window_size, padding, stride, type); - } - - ~PoolingDescriptor() noexcept { - if (descriptor != nullptr) { - /* cudnnDestroyPoolingDescriptor will not fail */ - 
CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); - } - } - - PoolingDescriptor& operator=(const PoolingDescriptor&) = delete; - PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept { - descriptor = other.descriptor; - other.descriptor = nullptr; - return *this; - }; - - cudnnPoolingDescriptor_t get() const noexcept { return descriptor; } - - private: - template - void constructor( - const SequenceContainer& window_size, - const SequenceContainer& padding, - const SequenceContainer& stride, - pooling_type type) - { - CV_Assert(std::size(window_size) == std::size(padding)); - CV_Assert(std::size(window_size) == std::size(stride)); - - auto get_pooling_type = [] (pooling_type type) { - switch (type) { - case pooling_type::max: - return CUDNN_POOLING_MAX; - case pooling_type::average_exclude_padding: - return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - case pooling_type::average_include_padding: - return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - } - return CUDNN_POOLING_MAX; - }; - - CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor)); - try { - const auto rank = std::size(window_size); - if (rank == 2) { - CUDA4DNN_CHECK_CUDNN( - cudnnSetPooling2dDescriptor(descriptor, - get_pooling_type(type), CUDNN_PROPAGATE_NAN, - window_size[0], window_size[1], - padding[0], padding[1], - stride[0], stride[1] - ) - ); - } - else { - std::vector iwindow_size(std::begin(window_size), std::end(window_size)); - std::vector ipadding(std::begin(padding), std::end(padding)); - std::vector istride(std::begin(stride), std::end(stride)); - CUDA4DNN_CHECK_CUDNN( - cudnnSetPoolingNdDescriptor(descriptor, - get_pooling_type(type), CUDNN_PROPAGATE_NAN, - rank, - iwindow_size.data(), - ipadding.data(), - istride.data() - ) - ); - } - } catch (...) 
{ - /* cudnnDestroyPoolingDescriptor will not fail */ - CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); - throw; - } - } - - cudnnPoolingDescriptor_t descriptor; - }; - - template inline - void getPoolingForwardOutputDim(const PoolingDescriptor& pooling_desc, - const TensorDescriptor& input, - std::vector& output) { - output.clear(); - output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */ - - std::vector temp(CUDNN_DIM_MAX); - cudnnDataType_t tempDataType; - CUDA4DNN_CHECK_CUDNN( - cudnnGetTensorNdDescriptor( - input.get(), - CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ - &tempDataType, - output.data(), - temp.data(), - temp.data() - ) - ); - const auto rank = output[0]; - output.resize(rank); - - CUDA4DNN_CHECK_CUDNN( - cudnnGetPoolingNdForwardOutputDim(pooling_desc.get(), input.get(), rank, output.data()) - ); - } - - template inline - void pool(Handle& handle, - PoolingDescriptor& pooling_desc, - TensorDescriptor& input_desc, - DevicePtr input_data, - T alpha, T beta, - TensorDescriptor& output_desc, - DevicePtr output_data) - { - CUDA4DNN_CHECK_CUDNN( - cudnnPoolingForward(HandleAccessor::get(handle), - pooling_desc.get(), - &alpha, input_desc.get(), input_data.get(), - &beta, output_desc.get(), output_data.get() - ) - ); - } - - /** @brief element-wise addition with broadcasting - * - * \f$ C = \alpha A + \beta C \f$ - * - * @tparam T matrix element type (must be `float` or `double`) - * - * @param handle valid cuDNN handle - * @param alpha scale factor for A - * @param aDesc tensor descriptor for A - * @param[in] A pointer to tensor in device memory - * @param beta scale factor for C - * @param cDesc tensor descriptor for C - * @param[in] C pointer to tensor in device memory - * - * Exception Guarantee: Basic - */ - template - typename std::enable_if::value || std::is_same::value, void> - ::type add(const Handle& handle, - T alpha, const TensorDescriptor& aDesc, DevicePtr A, - T beta, const TensorDescriptor& cDesc, DevicePtr C) - { - CUDA4DNN_CHECK_CUDNN( - cudnnAddTensor(HandleAccessor::get(handle), - &alpha, aDesc.get(), A.get(), - &beta, cDesc.get(), C.get() - ) - ); - } - - /** @brief computes softmax (or log softmax) - * - * @tparam T matrix element type (must be `float` or `double`) - * - * @param handle valid cuDNN handle - * @param outputDesc tensor descriptor for A - * @param[out] output pointer to tensor in device memory - * @param inputDesc tensor descriptor for C - * @param[in] input pointer to tensor in device memory - * @param log apply log on probabilities - * - * Exception Guarantee: Basic - */ - template - typename std::enable_if::value || std::is_same::value, void> - ::type softmax(const cudnn::Handle& handle, - const TensorDescriptor& outputDesc, DevicePtr output, - const TensorDescriptor& inputDesc, DevicePtr input, - bool log) - { - T alpha = 1.0, beta = 0.0; - cudnnSoftmaxAlgorithm_t algo = log ? 
CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; - CUDA4DNN_CHECK_CUDNN( - cudnnSoftmaxForward( - HandleAccessor::get(handle), - algo, CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, inputDesc.get(), input.get(), - &beta, outputDesc.get(), output.get() - ) - ); - } - -}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ +#include "cudnn/cudnn.hpp" +#include "cudnn/convolution.hpp" +#include "cudnn/pooling.hpp" +#include "cudnn/softmax.hpp" #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp new file mode 100644 index 000000000000..4fe94b8cdbb2 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp @@ -0,0 +1,307 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP + +#include "cudnn.h" +#include "../pointer.hpp" +#include "../workspace.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + /** creates a cuDNN filter descriptor for the given filter shape + * + * Dimension Ordering: + * 0: number of output feature maps + * 1: number of input feature maps + * 2..n: kernel dimensions + */ + template + class FilterDescriptor { + public: + FilterDescriptor() noexcept : descriptor{ nullptr } { } + FilterDescriptor(const FilterDescriptor&) = delete; + FilterDescriptor(FilterDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + /** constructs a filter descriptor from the filter dimensions provided in \p shape */ + template ()))> + FilterDescriptor(const SequenceContainer& shape) { + constructor(shape.begin(), shape.end()); + } + + /** constructs a filter descriptor from the filter dimensions provided in [begin, end) */ + template ::value, void>::type> // TODO is_iterator + FilterDescriptor(ForwardItr begin, ForwardItr end) { + constructor(begin, end); + } + + /** constructs a filter descriptor from the filter dimensions provided as arguments */ + template + FilterDescriptor(Sizes ...sizes) { + static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions"); + static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); + std::array dims = { static_cast(sizes)... 
}; + constructor(std::begin(dims), std::end(dims)); + } + + ~FilterDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyFilterDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); + } + } + + FilterDescriptor& operator=(const FilterDescriptor&) = delete; + FilterDescriptor& operator=(FilterDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnFilterDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) >= 3); + CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor)); + try { + const auto rank = std::distance(start, end); + if (rank == 4) { + std::array dims; + std::copy(start, end, std::begin(dims)); + CUDA4DNN_CHECK_CUDNN( + cudnnSetFilter4dDescriptor( + descriptor, + detail::get_data_type(), CUDNN_TENSOR_NCHW, + dims[0], dims[1], dims[2], dims[3] + ) + ); + } else { + std::vector dims(start, end); + CUDA4DNN_CHECK_CUDNN( + cudnnSetFilterNdDescriptor( + descriptor, + detail::get_data_type(), CUDNN_TENSOR_NCHW, + dims.size(), dims.data() + ) + ); + } + } catch (...) { + /* cudnnDestroyFilterDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); + throw; + } + } + + cudnnFilterDescriptor_t descriptor; + }; + + /** creates a cuDNN convolution descriptor */ + template + class ConvolutionDescriptor { + public: + ConvolutionDescriptor() noexcept : descriptor{ nullptr } { } + ConvolutionDescriptor(const ConvolutionDescriptor&) = delete; + ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + template ()))> + ConvolutionDescriptor( + const SequenceContainer& zero_padding, + const SequenceContainer& stride, + const SequenceContainer& dialation, + std::size_t group_count) + { + constructor(zero_padding, stride, dialation, group_count); + } + + ~ConvolutionDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyConvolutionDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); + } + } + + ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete; + ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor( + const SequenceContainer& zero_padding, + const SequenceContainer& stride, + const SequenceContainer& dialation, + std::size_t group_count) + { + CV_Assert(std::size(zero_padding) == std::size(stride)); + CV_Assert(std::size(zero_padding) == std::size(dialation)); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor)); + try { + const auto rank = std::size(zero_padding); + if (rank == 2) { + CUDA4DNN_CHECK_CUDNN( + cudnnSetConvolution2dDescriptor( + descriptor, + zero_padding[0], zero_padding[1], + stride[0], stride[1], + dialation[0], dialation[1], + CUDNN_CROSS_CORRELATION, + detail::get_data_type() + ) + ); + } else { + std::vector ipadding(std::begin(zero_padding), std::end(zero_padding)); + std::vector istride(std::begin(stride), std::end(stride)); + std::vector idialation(std::begin(dialation), std::end(dialation)); + 
CUDA4DNN_CHECK_CUDNN( + cudnnSetConvolutionNdDescriptor( + descriptor, + rank, ipadding.data(), istride.data(), idialation.data(), + CUDNN_CROSS_CORRELATION, + detail::get_data_type() + ) + ); + } + CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count)); + } catch (...) { + /* cudnnDestroyConvolutionDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); + throw; + } + } + + cudnnConvolutionDescriptor_t descriptor; + }; + + template + class ConvolutionAlgorithm { + public: + ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } + ConvolutionAlgorithm(ConvolutionAlgorithm&) = default; + ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default; + + ConvolutionAlgorithm( + const Handle& handle, + const ConvolutionDescriptor& conv, + const FilterDescriptor& filter, + const TensorDescriptor& input, + const TensorDescriptor& output) + { + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionForwardAlgorithm( + HandleAccessor::get(handle), + input.get(), filter.get(), conv.get(), output.get(), + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + 0, /* no memory limit */ + &algo + ) + ); + + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionForwardWorkspaceSize( + HandleAccessor::get(handle), + input.get(), filter.get(), conv.get(), output.get(), + algo, &workspace_size + ) + ); + } + + ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default; + ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default; + + auto get() const noexcept { return algo; } + auto get_workspace_size() const noexcept { return workspace_size; } + + private: + cudnnConvolutionFwdAlgo_t algo; + std::size_t workspace_size; + }; + + template inline + void getConvolutionForwardOutputDim( + const ConvolutionDescriptor& convDesc, + const FilterDescriptor& filterDesc, + const TensorDescriptor& inputDesc, + std::vector& output) + { + output.clear(); + output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */ + + std::vector temp(CUDNN_DIM_MAX); + cudnnDataType_t tempDataType; + CUDA4DNN_CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + inputDesc.get(), + CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ + &tempDataType, + output.data(), + temp.data(), + temp.data() + ) + ); + + const auto rank = output[0]; + output.resize(rank); + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionNdForwardOutputDim( + convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data() + ) + ); + } + + template inline + void convolve( + const Handle& handle, + const ConvolutionDescriptor& convDesc, + const ConvolutionAlgorithm& convAlgo, + const Workspace& workspace, + const FilterDescriptor& filterDesc, + DevicePtr filterPtr, + const TensorDescriptor& inputDesc, + DevicePtr inputPtr, + T alpha, T beta, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + CUDA4DNN_CHECK_CUDNN( + cudnnConvolutionForward( + HandleAccessor::get(handle), + &alpha, inputDesc.get(), inputPtr.get(), + filterDesc.get(), filterPtr.get(), + convDesc.get(), convAlgo.get(), + WorkspaceAccessor::get(workspace).get(), workspace.size(), + &beta, outputDesc.get(), outputPtr.get() + ) + ); + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp new file mode 100644 index 000000000000..70060e284202 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp @@ -0,0 +1,198 @@ +// This file is part of 
OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP + +#include + +#include "../pointer.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#define CUDA4DNN_CHECK_CUDNN(call) \ + ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__) + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + namespace detail { + inline void check(cudnnStatus_t status, const char* func, const char* file, int line) { + if (status != CUDNN_STATUS_SUCCESS) + throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line); + } + + /** get_data_type returns the equivalent cudnn enumeration constant for type T */ + template auto get_data_type()->decltype(CUDNN_DATA_FLOAT); + template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_FLOAT; } + template <> inline auto get_data_type()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_DOUBLE; } + } + + /** used to access the raw cuDNN handle held by Handle */ + class HandleAccessor { + public: + static cudnnHandle_t get(const Handle& handle); + }; + + /** creates a cuDNN tensor descriptor for a given shape */ + template + class TensorDescriptor { + public: + TensorDescriptor() noexcept : descriptor{ nullptr } { } + TensorDescriptor(const TensorDescriptor&) = delete; + TensorDescriptor(TensorDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + /** constructs a tensor descriptor from the axis lengths provided in \p shape */ + template ()))> + TensorDescriptor(const SequenceContainer& shape) { + constructor(shape.begin(), shape.end()); + } + + /** constructs a tensor descriptor from the axis lengths provided in [begin, end) */ + template ::value, void>::type> // TODO is_iterator + TensorDescriptor(ForwardItr begin, ForwardItr end) { + constructor(begin, end); + } + + /** constructs a tensor descriptor from the axis lengths provided as arguments */ + template + TensorDescriptor(Sizes ...sizes) { + static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); + std::array dims = { static_cast(sizes)... 
}; + constructor(std::begin(dims), std::end(dims)); + } + + ~TensorDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyTensorDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); + } + } + + TensorDescriptor& operator=(const TensorDescriptor&) = delete; + TensorDescriptor& operator=(TensorDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnTensorDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor(ForwardItr start, ForwardItr end) { + CV_Assert(start != end); + CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor)); + try { + const auto rank = std::distance(start, end); + if (rank <= 4) { + std::array dims; + std::fill(std::begin(dims), std::end(dims), 1); + + /* suppose we have a 3d tensor, the first axis is the batch axis and + * the second axis is the channel axis (generally) + * + * cuDNN frequently assumes that the first axis is the batch axis and the + * second axis is the channel axis; hence, we copy the shape of a lower rank + * tensor to the begining of `dims` + */ + std::copy(start, end, std::begin(dims)); + + CUDA4DNN_CHECK_CUDNN( + cudnnSetTensor4dDescriptor(descriptor, + CUDNN_TENSOR_NCHW, detail::get_data_type(), + dims[0], dims[1], dims[2], dims[3] + ) + ); + } else { + std::vector stride(rank); + stride.back() = 1; + /* WHAT WE HAVE NOW: + * stride[-1] = 1 + * stride[-2] = garbage + * stride[-3] = garbage + * stride[-4] = garbage + * ... + */ + + std::copy(start + 1, end, stride.begin()); + /* WHAT WE HAVE NOW: + * stride[-1] = 1 + * stride[-2] = dim[-1] + * stride[-3] = dim[-2] + * stride[-4] = dim[-3] + * ... + */ + + std::partial_sum(std::rbegin(stride), std::rend(stride), std::rbegin(stride), std::multiplies()); + /* WHAT WE HAVE NOW: + * stride[-1] = 1 + * stride[-2] = stride[-1] * dim[-1] + * stride[-3] = stride[-2] * dim[-2] + * stride[-4] = stride[-3] * dim[-3] + * ... + */ + + std::vector dims(start, end); + CUDA4DNN_CHECK_CUDNN( + cudnnSetTensorNdDescriptor(descriptor, + detail::get_data_type(), rank, + dims.data(), stride.data() + ) + ); + } + } catch (...) 
{ + /* cudnnDestroyTensorDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); + throw; + } + } + + cudnnTensorDescriptor_t descriptor; + }; + + /** @brief element-wise addition with broadcasting + * + * \f$ C = \alpha A + \beta C \f$ + * + * @tparam T matrix element type (must be `float` or `double`) + * + * @param handle valid cuDNN handle + * @param alpha scale factor for A + * @param aDesc tensor descriptor for A + * @param[in] A pointer to tensor in device memory + * @param beta scale factor for C + * @param cDesc tensor descriptor for C + * @param[in] C pointer to tensor in device memory + * + * Exception Guarantee: Basic + */ + template + typename std::enable_if::value || std::is_same::value, void> + ::type add(const Handle& handle, + T alpha, const TensorDescriptor& aDesc, DevicePtr A, + T beta, const TensorDescriptor& cDesc, DevicePtr C) + { + CUDA4DNN_CHECK_CUDNN( + cudnnAddTensor(HandleAccessor::get(handle), + &alpha, aDesc.get(), A.get(), + &beta, cDesc.get(), C.get() + ) + ); + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp new file mode 100644 index 000000000000..06e59d810df9 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp @@ -0,0 +1,178 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP + +#include "cudnn.h" +#include "../pointer.hpp" + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + class PoolingDescriptor { + public: + enum class pooling_type { + MAX, + MAX_DETERMINISTIC, + AVERAGE_EXCLUDE_PADDING, + AVERAGE_INCLUDE_PADDING + }; + + PoolingDescriptor() noexcept : descriptor{ nullptr } { } + PoolingDescriptor(const PoolingDescriptor&) = delete; + PoolingDescriptor(PoolingDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + template ()))> + PoolingDescriptor( + const SequenceContainer& window_size, + const SequenceContainer& padding, + const SequenceContainer& stride, + pooling_type type) + { + constructor(window_size, padding, stride, type); + } + + ~PoolingDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyPoolingDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); + } + } + + PoolingDescriptor& operator=(const PoolingDescriptor&) = delete; + PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnPoolingDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor( + const SequenceContainer& window_size, + const SequenceContainer& padding, + const SequenceContainer& stride, + pooling_type type) + { + CV_Assert(std::size(window_size) == std::size(padding)); + CV_Assert(std::size(window_size) == std::size(stride)); + + auto get_pooling_type = [] (pooling_type type) { + switch (type) { + case pooling_type::MAX: + return CUDNN_POOLING_MAX; + case pooling_type::MAX_DETERMINISTIC: + return CUDNN_POOLING_MAX_DETERMINISTIC; + case 
pooling_type::AVERAGE_EXCLUDE_PADDING: + return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + case pooling_type::AVERAGE_INCLUDE_PADDING: + return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + CV_Error(Error::StsBadArg, "unknown pooling type"); + }; + + CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor)); + try { + const auto rank = std::size(window_size); + if (rank == 2) { + CUDA4DNN_CHECK_CUDNN( + cudnnSetPooling2dDescriptor( + descriptor, + get_pooling_type(type), CUDNN_PROPAGATE_NAN, + window_size[0], window_size[1], + padding[0], padding[1], + stride[0], stride[1] + ) + ); + } + else { + std::vector iwindow_size(std::begin(window_size), std::end(window_size)); + std::vector ipadding(std::begin(padding), std::end(padding)); + std::vector istride(std::begin(stride), std::end(stride)); + CUDA4DNN_CHECK_CUDNN( + cudnnSetPoolingNdDescriptor( + descriptor, + get_pooling_type(type), CUDNN_PROPAGATE_NAN, + rank, iwindow_size.data(), ipadding.data(), istride.data() + ) + ); + } + } catch (...) { + /* cudnnDestroyPoolingDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); + throw; + } + } + + cudnnPoolingDescriptor_t descriptor; + }; + + template inline + void getPoolingForwardOutputDim( + const PoolingDescriptor& poolingDesc, + const TensorDescriptor& inputDesc, + std::vector& output_dim) + { + output_dim.clear(); + output_dim.resize(CUDNN_DIM_MAX); /* we use `output_dim` to hold temporaries */ + + std::vector temp(CUDNN_DIM_MAX); + cudnnDataType_t tempDataType; + CUDA4DNN_CHECK_CUDNN( + cudnnGetTensorNdDescriptor( + inputDesc.get(), + CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ + &tempDataType, + output_dim.data(), + temp.data(), + temp.data() + ) + ); + + const auto rank = output_dim[0]; + output_dim.resize(rank); + CUDA4DNN_CHECK_CUDNN( + cudnnGetPoolingNdForwardOutputDim(poolingDesc.get(), inputDesc.get(), rank, output_dim.data()) + ); + } + + template inline + void pool( + const Handle& handle, + const PoolingDescriptor& poolingDesc, + const TensorDescriptor& inputDesc, + const DevicePtr inputPtr, + T alpha, T beta, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + CUDA4DNN_CHECK_CUDNN( + cudnnPoolingForward( + HandleAccessor::get(handle), + poolingDesc.get(), + &alpha, inputDesc.get(), inputPtr.get(), + &beta, outputDesc.get(), outputPtr.get() + ) + ); + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp new file mode 100644 index 000000000000..f684c1509d8d --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp @@ -0,0 +1,49 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
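A rough usage sketch for the softmax wrapper introduced by this new file (illustrative only, not part of the patch): it assumes a valid csl::cudnn::Handle named handle and float device buffers already wrapped in DevicePtr, with scores_in and scores_out as placeholder names, e.g. the flattened output of a fully connected layer holding one sample with 1000 class scores.

    // softmax is computed over the channel axis (CUDNN_SOFTMAX_MODE_CHANNEL)
    csl::cudnn::TensorDescriptor<float> desc(1, 1000, 1, 1);
    csl::cudnn::softmax<float>(handle, desc, scores_out, desc, scores_in, false); // probabilities
    csl::cudnn::softmax<float>(handle, desc, scores_out, desc, scores_in, true);  // log probabilities
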
+ +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP + +#include "cudnn.h" +#include "../pointer.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + /** @brief computes softmax (or log softmax) + * + * @tparam T matrix element type (must be `float` or `double`) + * + * @param handle valid cuDNN handle + * @param outputDesc tensor descriptor for A + * @param[out] output pointer to tensor in device memory + * @param inputDesc tensor descriptor for C + * @param[in] input pointer to tensor in device memory + * @param log apply log on probabilities + * + * Exception Guarantee: Basic + */ + template + typename std::enable_if::value || std::is_same::value, void> + ::type softmax(const cudnn::Handle& handle, + const TensorDescriptor& outputDesc, DevicePtr output, + const TensorDescriptor& inputDesc, DevicePtr input, + bool log) + { + T alpha = 1.0, beta = 0.0; + cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; + CUDA4DNN_CHECK_CUDNN( + cudnnSoftmaxForward( + HandleAccessor::get(handle), + algo, CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, inputDesc.get(), input.get(), + &beta, outputDesc.get(), output.get() + ) + ); + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 6c9d826dba90..8be2f7d86f6b 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -958,6 +958,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { template class Convolution { + using TensorDescriptor = cudnn::TensorDescriptor; + using FilterDescriptor = cudnn::FilterDescriptor; + using ConvolutionDescriptor = cudnn::ConvolutionDescriptor; + using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm; + public: struct params_type { std::vector input_shape; @@ -982,7 +987,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { std::vector output_dims; getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); - outputTensorDesc = TensorDescriptor(output_dims); algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc); @@ -995,29 +999,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return algo.get_workspace_size(); } - /* plain convolution */ void convolve(TensorSpan output, TensorView input, TensorView filters, Workspace& scratchpad) { - cudnn::convolve(cudnnHandle, + cudnn::convolve( + cudnnHandle, + convDesc, algo, scratchpad, filterDesc, filters.get(), - convDesc, algo, WorkspaceAccessor::get(scratchpad), - inputTensorDesc, input.get(), 1.0, - 0.0, outputTensorDesc, output.get() + inputTensorDesc, input.get(), + 1.0, 0.0, outputTensorDesc, output.get() ); } private: cudnn::Handle cudnnHandle; - - using TensorDescriptor = cudnn::TensorDescriptor; TensorDescriptor inputTensorDesc, outputTensorDesc; - - using FilterDescriptor = cudnn::FilterDescriptor; FilterDescriptor filterDesc; - - using ConvolutionDescriptor = cudnn::ConvolutionDescriptor; ConvolutionDescriptor convDesc; - - using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm; ConvolutionAlgorithm algo; }; @@ -1043,16 +1039,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { Pooling(const Pooling&) = delete; Pooling(Pooling&&) = default; Pooling(cudnn::Handle handle, const params_type& 
params) { - cudnnHandle = std::move(handle); inputTensorDesc = TensorDescriptor(params.input_shape); - poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); std::vector output_dim; getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); - outputTensorDesc = TensorDescriptor(output_dim); } @@ -1060,14 +1053,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { Pooling& operator=(Pooling&&) = default; void pool(TensorView input, TensorSpan& output) { - cudnn::pool(cudnnHandle, poolingDesc, inputTensorDesc, input.get(), 1.0, 0.0, outputTensorDesc, output.get()); + cudnn::pool( + cudnnHandle, + poolingDesc, + inputTensorDesc, input.get(), + 1.0, 0.0, outputTensorDesc, output.get() + ); } private: cudnn::Handle cudnnHandle; - TensorDescriptor inputTensorDesc, outputTensorDesc; - PoolingDescriptor poolingDesc; }; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 7709ece0995a..39a4ec27be81 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -338,14 +338,14 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer if (type == MAX) { - params.type = csl::Pooling::pooling_type::max; + params.type = csl::Pooling::pooling_type::MAX; } else if (type == AVE) { if(padMode == "SAME") - params.type = csl::Pooling::pooling_type::average_exclude_padding; + params.type = csl::Pooling::pooling_type::AVERAGE_EXCLUDE_PADDING; else - params.type = csl::Pooling::pooling_type::average_include_padding; + params.type = csl::Pooling::pooling_type::AVERAGE_INCLUDE_PADDING; } else CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); From 883968e7c9972a5fc29e7674aefc4f6d70508fff Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 25 Jun 2019 14:47:30 +0530 Subject: [PATCH 016/129] fixes for gcc, clang and doxygen; remove cxx14/17 code --- modules/dnn/include/opencv2/dnn/csl/cublas.hpp | 6 ++++-- modules/dnn/include/opencv2/dnn/csl/cudnn.hpp | 6 ++++-- modules/dnn/include/opencv2/dnn/csl/error.hpp | 2 +- modules/dnn/include/opencv2/dnn/csl/stream.hpp | 10 ++++++---- modules/dnn/include/opencv2/dnn/dnn.hpp | 2 +- modules/dnn/src/cuda4dnn/csl/cublas.cpp | 2 +- modules/dnn/src/cuda4dnn/csl/cudnn.cpp | 2 +- modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp | 10 +++++----- modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp | 4 +++- modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp | 6 +++--- modules/dnn/src/cuda4dnn/csl/kernels.hpp | 6 +++--- modules/dnn/src/cuda4dnn/csl/stream.cpp | 2 +- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 12 ++++++------ modules/dnn/src/layers/convolution_layer.cpp | 6 ++---- modules/dnn/src/layers/elementwise_layers.cpp | 4 ++-- modules/dnn/src/layers/fully_connected_layer.cpp | 4 ++-- modules/dnn/src/layers/pooling_layer.cpp | 4 ++-- modules/dnn/src/layers/softmax_layer.cpp | 4 ++-- modules/dnn/src/op_cuda.hpp | 2 +- 19 files changed, 50 insertions(+), 44 deletions(-) diff --git a/modules/dnn/include/opencv2/dnn/csl/cublas.hpp b/modules/dnn/include/opencv2/dnn/csl/cublas.hpp index 5ed347ff8bfc..fc0f03b9bc64 100644 --- a/modules/dnn/include/opencv2/dnn/csl/cublas.hpp +++ b/modules/dnn/include/opencv2/dnn/csl/cublas.hpp @@ -12,7 +12,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas { - //! 
exception class for errors thrown by the cuBLAS API + /** @brief exception class for errors thrown by the cuBLAS API */ class cuBLASException : public CUDAException { public: using CUDAException::CUDAException; @@ -37,11 +37,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu Handle& operator=(const Handle&) noexcept; Handle& operator=(Handle&&) noexcept; - //!< returns true if the handle is valid + /** returns true if the handle is valid */ explicit operator bool() const noexcept; private: + /*! \cond PRIVATE */ friend class HandleAccessor; + /*! \endcond */ class UniqueHandle; std::shared_ptr handle; diff --git a/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp b/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp index 873e4f869a5c..03a768a72f6f 100644 --- a/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp +++ b/modules/dnn/include/opencv2/dnn/csl/cudnn.hpp @@ -12,7 +12,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { - //! exception class for errors thrown by the cuDNN API + /** @brief exception class for errors thrown by the cuDNN API */ class cuDNNException : public CUDAException { public: using CUDAException::CUDAException; @@ -37,11 +37,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu Handle& operator=(const Handle&) noexcept; Handle& operator=(Handle&&) noexcept; - //!< returns true if the handle is valid + /** returns true if the handle is valid */ explicit operator bool() const noexcept; private: + /*! \cond PRIVATE */ friend class HandleAccessor; + /*! \endcond */ class UniqueHandle; std::shared_ptr handle; diff --git a/modules/dnn/include/opencv2/dnn/csl/error.hpp b/modules/dnn/include/opencv2/dnn/csl/error.hpp index 2210c748de64..6e68a854783e 100644 --- a/modules/dnn/include/opencv2/dnn/csl/error.hpp +++ b/modules/dnn/include/opencv2/dnn/csl/error.hpp @@ -9,7 +9,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { - //! exception class for errors thrown by the CUDA APIs + /** @brief exception class for errors thrown by the CUDA APIs */ class CUDAException : public cv::Exception { public: using cv::Exception::Exception; diff --git a/modules/dnn/include/opencv2/dnn/csl/stream.hpp b/modules/dnn/include/opencv2/dnn/csl/stream.hpp index 70fb616723b2..4681817ef42c 100644 --- a/modules/dnn/include/opencv2/dnn/csl/stream.hpp +++ b/modules/dnn/include/opencv2/dnn/csl/stream.hpp @@ -25,23 +25,25 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { Stream(const Stream&) noexcept; Stream(Stream&&) noexcept; - //!< if \p create is `true`, a new stream will be created instead of the otherwise default stream + /** if \p create is `true`, a new stream will be created instead of the otherwise default stream */ Stream(bool create); Stream& operator=(const Stream&) noexcept; Stream& operator=(Stream&&) noexcept; - //!< blocks the caller thread until all operations in the stream complete + /** blocks the caller thread until all operations in the stream are complete */ void synchronize() const; - //!< returns true if there are operations pending in the stream + /** returns true if there are operations pending in the stream */ bool busy() const; - //!< returns true if the stream is valid + /** returns true if the stream is valid */ explicit operator bool() const noexcept; private: + /*! \cond PRIVATE */ friend class StreamAccessor; + /*! 
\endcond */ class UniqueStream; std::shared_ptr stream; diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 53d4c110387e..605e03384148 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -230,7 +230,7 @@ CV__DNN_INLINE_NS_BEGIN */ void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals); - /** @brief forward the @p inputs through the layer + /** @brief Forward the @p inputs through the layer. * * @param[in] inputs input tensors * @param[out] outputs output tensors diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.cpp b/modules/dnn/src/cuda4dnn/csl/cublas.cpp index 79756e26565c..d0f6050385a4 100644 --- a/modules/dnn/src/cuda4dnn/csl/cublas.cpp +++ b/modules/dnn/src/cuda4dnn/csl/cublas.cpp @@ -83,7 +83,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu return *this; } - //!< returns the raw cuBLAS handle + /** @brief returns the raw cuBLAS handle */ cublasHandle_t get() const noexcept { return handle; } private: diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp index ec7ce5c11266..4c8f87f95938 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.cpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.cpp @@ -58,7 +58,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu return *this; } - //!< returns the raw cuDNN handle + /** returns the raw cuDNN handle */ cudnnHandle_t get() const noexcept { return handle; } private: diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp index 4fe94b8cdbb2..3069acbc711b 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp @@ -159,12 +159,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu const SequenceContainer& dialation, std::size_t group_count) { - CV_Assert(std::size(zero_padding) == std::size(stride)); - CV_Assert(std::size(zero_padding) == std::size(dialation)); + CV_Assert(zero_padding.size() == stride.size()); + CV_Assert(zero_padding.size() == dialation.size()); CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor)); try { - const auto rank = std::size(zero_padding); + const auto rank = zero_padding.size(); if (rank == 2) { CUDA4DNN_CHECK_CUDNN( cudnnSetConvolution2dDescriptor( @@ -236,8 +236,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default; ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default; - auto get() const noexcept { return algo; } - auto get_workspace_size() const noexcept { return workspace_size; } + cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; } + std::size_t get_workspace_size() const noexcept { return workspace_size; } private: cudnnConvolutionFwdAlgo_t algo; diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp index 70060e284202..542e606d3a73 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -136,7 +138,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu * ... 
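 * (worked example, for illustration: with dims = {2, 3, 4, 5, 6} the copy above
 *  leaves stride = {3, 4, 5, 6, 1}; the partial_sum below then produces the
 *  packed row-major strides {360, 120, 30, 6, 1})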
*/ - std::partial_sum(std::rbegin(stride), std::rend(stride), std::rbegin(stride), std::multiplies()); + std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies()); /* WHAT WE HAVE NOW: * stride[-1] = 1 * stride[-2] = stride[-1] * dim[-1] diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp index 06e59d810df9..958434b8e56e 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp @@ -71,8 +71,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu const SequenceContainer& stride, pooling_type type) { - CV_Assert(std::size(window_size) == std::size(padding)); - CV_Assert(std::size(window_size) == std::size(stride)); + CV_Assert(window_size.size() == padding.size()); + CV_Assert(window_size.size() == stride.size()); auto get_pooling_type = [] (pooling_type type) { switch (type) { @@ -90,7 +90,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor)); try { - const auto rank = std::size(window_size); + const auto rank = window_size.size(); if (rank == 2) { CUDA4DNN_CHECK_CUDNN( cudnnSetPooling2dDescriptor( diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 993914424b29..6477f47bfbaf 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -5,9 +5,9 @@ #ifndef OPENCV_DNN_CUDA4DNN_KERNELS_HPP #define OPENCV_DNN_CUDA4DNN_KERNELS_HPP -#include "cuda4dnn/csl/stream.hpp" -#include "cuda4dnn/csl/memory.hpp" -#include "cuda4dnn/csl/span.hpp" +#include "stream.hpp" +#include "memory.hpp" +#include "span.hpp" #include diff --git a/modules/dnn/src/cuda4dnn/csl/stream.cpp b/modules/dnn/src/cuda4dnn/csl/stream.cpp index 18e76c5dfbfe..8f91c632f424 100644 --- a/modules/dnn/src/cuda4dnn/csl/stream.cpp +++ b/modules/dnn/src/cuda4dnn/csl/stream.cpp @@ -57,7 +57,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return *this; } - //!< returns the raw CUDA stream handle + /** returns the raw CUDA stream handle */ cudaStream_t get() const noexcept { return stream; } void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream)); } diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 8be2f7d86f6b..8612eee63c7f 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -32,6 +32,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + /** if the \p axis is a negative index, the equivalent postive index is returned; otherwise, returns \p axis */ + template + CUDA4DNN_HOST_DEVICE constexpr T clamp_axis(T axis, std::size_t rank) { + return axis < 0 ? axis + rank : axis; + } + /** \file tensor.hpp * * The tensor library contains three kinds of tensor objects which are summarized @@ -735,12 +741,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { const_pointer ptr; }; - /** if the \p axis is a negative index, the equivalent postive index is returned; otherwise, returns \p axis */ - template - CUDA4DNN_HOST_DEVICE constexpr T clamp_axis(T axis, std::size_t rank) { - return axis < 0 ? 
axis + rank : axis; - } - /** returns true if the two TensorType objects have the same shape */ template inline bool is_shape_same(const TensorType1& x, const TensorType2& y) noexcept { diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 30e078ce60a2..7a537f392401 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1293,7 +1293,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl std::vector>& inputs, std::vector>& outputs, csl::Workspace& workspace - ) + ) override { CV_Assert(!activ); @@ -1305,8 +1305,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl auto output_wrapper = outputs[i].dynamicCast(); auto output = output_wrapper->getSpan(); - auto start = std::chrono::steady_clock::now(); - convoluter.convolve(output, input, filtersTensor, workspace); if (hasBias() || fusedBias) csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); @@ -1319,7 +1317,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs - ) + ) override { cudnnHandle = std::move(cudnn_handle); diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 0f6b1568efff..c23aceae1b03 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -234,7 +234,7 @@ class ElementWiseLayer : public Func::Layer std::vector>& inputs, std::vector>& outputs, csl::Workspace& workspace - ) + ) override { func.applyCUDA(inputs, outputs, workspace, stream); } @@ -245,7 +245,7 @@ class ElementWiseLayer : public Func::Layer csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs - ) + ) override { stream = std::move(stream_); } diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 06cd56768375..b78fbc007aa1 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -427,7 +427,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer std::vector>& inputs, std::vector>& outputs, csl::Workspace& workspace - ) + ) override { CV_UNUSED(workspace); @@ -481,7 +481,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs - ) + ) override { cublasHandle = std::move(cublas_handle); cudnnHandle = std::move(cudnn_handle); diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 39a4ec27be81..13a45fb3352b 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -297,7 +297,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer std::vector>& inputs, std::vector>& outputs, csl::Workspace& workspace - ) + ) override { CV_UNUSED(workspace); @@ -316,7 +316,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs - ) + ) override { cudnnHandle = std::move(cudnn_handle); diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 820e2c3b8e5b..987b2aa4ab13 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ 
-298,7 +298,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer std::vector>& inputs, std::vector>& outputs, csl::Workspace& workspace - ) + ) override { CV_UNUSED(workspace); @@ -325,7 +325,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs - ) + ) override { cudnnHandle = std::move(cudnn_handle); } diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp index 6ec1b893e0c5..f0487a806163 100644 --- a/modules/dnn/src/op_cuda.hpp +++ b/modules/dnn/src/op_cuda.hpp @@ -32,7 +32,7 @@ namespace cv { TensorT createTensorHeaderFromMat(const cv::Mat& mat) { auto is_matrix_type_same_as_tensor_type = [&mat]() { switch (mat.type()) { - case CV_32F: return std::is_same::value; + case CV_32F: return std::is_same::value; default: return false; } }; From 99fe39354f3c13422fb8a9b2a862f4f7461e4869 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 25 Jun 2019 15:16:08 +0530 Subject: [PATCH 017/129] add blank_layer --- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 14 ++++++++ modules/dnn/src/dnn.cpp | 3 +- modules/dnn/src/layers/blank_layer.cpp | 43 +++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 8612eee63c7f..3c85cca5b9ff 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -791,6 +791,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace tensor_ops { + /** @brief copies data between tensors + * + * Pre-conditions: + * - \p dest and \p src must have the same shape + * + * Exception Gaurantee: Basic + */ + template inline + void copy(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + if (dest.get() != src.get()) + memcpy(dest.get(), src.get(), dest.size()); + } + /** @brief performs generalized matrix-multiplication * * Pre-conditions: diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 2bf18a1b63b4..96ee320cf81f 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -552,7 +552,6 @@ struct DataLayer : public Layer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || - backendId == DNN_BACKEND_CUDA || (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1); } @@ -2538,7 +2537,7 @@ struct Net::Impl /* the layer does not have a CUDA implementation; use CPU for this layer */ std::ostringstream os; - os << ld.name << " >> " << ex.what(); + os << ld.name << " [" << ld.type << "]" << " >> " << ex.what(); if (ex.code == Error::StsNotImplemented) os << "Switching to CPU for this layer.\n"; CV_LOG_WARNING(NULL, os.str().c_str()); diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index ef44ed79c4f5..92bf25647f0a 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -40,8 +40,14 @@ // //M*/ #include "../precomp.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -57,6 +63,7 @@ class BlankLayerImpl CV_FINAL : public BlankLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && 
haveInfEngine()); } @@ -107,6 +114,42 @@ class BlankLayerImpl CV_FINAL : public BlankLayer inputs[i].copyTo(outputs[i]); } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_UNUSED(workspace); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (input.get() != output.get()) + csl::tensor_ops::copy(stream, output, input); + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + #ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { From 35a1d8fa03ea767657bc6b55a83dd29c49826cfb Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 25 Jun 2019 23:15:01 +0530 Subject: [PATCH 018/129] add LRN layer --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 1 + modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp | 108 +++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 37 ++++++- modules/dnn/src/layers/lrn_layer.cpp | 47 +++++++++ 4 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index b0671a86807c..2a3185cb3183 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -9,6 +9,7 @@ #include "cudnn/cudnn.hpp" #include "cudnn/convolution.hpp" +#include "cudnn/lrn.hpp" #include "cudnn/pooling.hpp" #include "cudnn/softmax.hpp" diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp new file mode 100644 index 000000000000..aa8bf2a42c64 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp @@ -0,0 +1,108 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
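A rough usage sketch for the LRN wrapper added by this new file (illustrative only, not part of the patch): it assumes a valid csl::cudnn::Handle named handle and DevicePtr-wrapped float buffers input and output for a 1x64x56x56 NCHW tensor; the parameter values are typical LRN settings, not values taken from the patch.

    // cross-channel LRN, the only mode the LRN layer maps to this backend
    csl::cudnn::LRNDescriptor lrnDesc(5, 0.0001, 0.75, 1.0,
                                      csl::cudnn::LRNDescriptor::lrn_type::ACROSS_CHANNELS);
    csl::cudnn::TensorDescriptor<float> inputDesc(1, 64, 56, 56), outputDesc(1, 64, 56, 56);
    csl::cudnn::LRNForward<float>(handle, lrnDesc, inputDesc, input, 1.0f, 0.0f, outputDesc, output);
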
+ +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP + +#include "cudnn.h" +#include "../pointer.hpp" + +#include + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + class LRNDescriptor { + public: + enum class lrn_type { + ACROSS_CHANNELS + }; + + LRNDescriptor() noexcept : descriptor{ nullptr } { } + LRNDescriptor(const LRNDescriptor&) = delete; + LRNDescriptor(LRNDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, lrn_type type) + { + constructor(local_size, alpha, beta, k, type); + } + + ~LRNDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyLRNDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor)); + } + } + + LRNDescriptor& operator=(const LRNDescriptor&) = delete; + LRNDescriptor& operator=(LRNDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnLRNDescriptor_t get() const noexcept { return descriptor; } + + private: + void constructor(std::size_t local_size, double alpha, double beta, double k, lrn_type type) { + auto get_lrn_type = [] (lrn_type type) { + switch (type) { + case lrn_type::ACROSS_CHANNELS: + return CUDNN_LRN_CROSS_CHANNEL_DIM1; + } + CV_Error(Error::StsBadArg, "unknown LRN type"); + }; + mode = get_lrn_type(type); + + CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor)); + try { + CUDA4DNN_CHECK_CUDNN( + cudnnSetLRNDescriptor( + descriptor, + local_size, + alpha, + beta, + k + ) + ); + } catch (...) { + /* cudnnDestroyLRNDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor)); + throw; + } + } + + cudnnLRNDescriptor_t descriptor; + cudnnLRNMode_t mode; + }; + + template inline + void LRNForward( + const Handle& handle, + const LRNDescriptor& lrnDesc, + const TensorDescriptor& inputDesc, + DevicePtr inputPtr, + T alpha, T beta, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + CUDA4DNN_CHECK_CUDNN( + cudnnLRNCrossChannelForward( + HandleAccessor::get(handle), + lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1, + &alpha, inputDesc.get(), inputPtr.get(), + &beta, outputDesc.get(), outputPtr.get() + ) + ); + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 3c85cca5b9ff..a1f23d588e58 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -1013,7 +1013,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return algo.get_workspace_size(); } - void convolve(TensorSpan output, TensorView input, TensorView filters, Workspace& scratchpad) { + void convolve(TensorSpan output, TensorView input, TensorView filters, const Workspace& scratchpad) { cudnn::convolve( cudnnHandle, convDesc, algo, scratchpad, @@ -1066,7 +1066,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { Pooling& operator=(const Pooling&) = delete; Pooling& operator=(Pooling&&) = default; - void pool(TensorView input, TensorSpan& output) { + void pool(TensorView input, TensorSpan output) { cudnn::pool( cudnnHandle, poolingDesc, @@ -1081,6 +1081,39 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { PoolingDescriptor poolingDesc; }; + template 
+ class LRN { + using LRNDescriptor = cudnn::LRNDescriptor; + using TensorDescriptor = cudnn::TensorDescriptor; + + public: + using lrn_type = LRNDescriptor::lrn_type; + + LRN() = default; + LRN(const LRN&) = delete; + LRN(LRN&&) = default; + LRN(csl::cudnn::Handle handle, std::size_t local_size, double alpha, double beta, double k, lrn_type type) { + cudnnHandle = std::move(handle); + lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type); + } + + LRN& operator=(const LRN&) = delete; + LRN& operator=(LRN&&) = default; + + void normalize(TensorView input, TensorSpan output) { + cudnn::LRNForward( + cudnnHandle, + lrnDesc, + TensorDescriptor(input.shape()), input.get(), + 1.0, 0.0, TensorDescriptor(output.shape()), output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + LRNDescriptor lrnDesc; + }; + }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP */ diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index b9e38769f079..423b8e8f9822 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -55,6 +56,11 @@ using namespace cv::dnn::ocl4dnn; #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -94,6 +100,7 @@ class LRNLayerImpl CV_FINAL : public LRNLayer if (backendId == DNN_BACKEND_INFERENCE_ENGINE) return bias == (int)bias; return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA() && type == CHANNEL_NRM) || backendId == DNN_BACKEND_HALIDE || (backendId == DNN_BACKEND_VKCOM && haveVulkan() && (size % 2 == 1) && (type == CHANNEL_NRM)); } @@ -309,6 +316,46 @@ class LRNLayerImpl CV_FINAL : public LRNLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_UNUSED(workspace); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + lrn.normalize(input, output); + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs) override + { + cudnnHandle = std::move(cudnn_handle); + + if (type != CHANNEL_NRM) + CV_Error(CV_StsNotImplemented, "Only LRN across channels is supported by the CUDA backend"); + + float alphaSize = normBySize ? 
alpha : alpha * size; + lrn = csl::LRN(cudnnHandle, size, alphaSize, beta, bias, csl::LRN::lrn_type::ACROSS_CHANNELS); + } + + csl::cudnn::Handle cudnnHandle; + csl::LRN lrn; +#endif + virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_VULKAN From 84067f05c909fab0025c8277e03c5788c3f1ea68 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 15:45:19 +0530 Subject: [PATCH 019/129] add rounding modes for pooling layer --- .../dnn/src/cuda4dnn/csl/cudnn/pooling.hpp | 3 +- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 39 +++++++++++++++++-- modules/dnn/src/layers/pooling_layer.cpp | 20 +++++----- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp index 958434b8e56e..6476b7fd4173 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp @@ -101,8 +101,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu stride[0], stride[1] ) ); - } - else { + } else { std::vector iwindow_size(std::begin(window_size), std::end(window_size)); std::vector ipadding(std::begin(padding), std::end(padding)); std::vector istride(std::begin(stride), std::end(stride)); diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index a1f23d588e58..4d65d5124bc1 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -1037,6 +1037,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { using PoolingDescriptor = cudnn::PoolingDescriptor; public: + enum class rounding_type { + FLOOR, + CEILING + }; + using pooling_type = PoolingDescriptor::pooling_type; struct params_type { @@ -1046,6 +1051,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { std::vector padding; std::vector stride; + rounding_type rounding_mode; pooling_type type; }; @@ -1058,9 +1064,36 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { inputTensorDesc = TensorDescriptor(params.input_shape); poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); - std::vector output_dim; - getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); - outputTensorDesc = TensorDescriptor(output_dim); + const auto& input_shape = params.input_shape; + std::vector output_shape; + output_shape.assign(std::begin(input_shape), std::end(input_shape)); + + const auto& window_size = params.window_size; + const auto& padding = params.padding; + const auto& stride = params.stride; + + bool ceil = params.rounding_mode == rounding_type::CEILING; + for (int i = 0; i < window_size.size(); i++) { + double axis_sz = (input_shape[i + 2] + 2 * padding[i] - window_size[i]) / double(stride[i]) + 1; + output_shape[i + 2] = ceil ? 
std::ceil(axis_sz) : std::floor(axis_sz); + + /* check if the last pooling window starts in the valid region */ + if (padding[i]) { + if ((output_shape[i + 2] - 1) * stride[i] >= input_shape[i + 2] + padding[i]) + output_shape[i + 2]--; + } + } + + if (!ceil) + { + /* we must agree with cuDNN if we used floor */ + std::vector output_dim; + getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); + CV_Assert(std::equal(std::begin(output_dim), std::end(output_dim), std::begin(output_shape))); + CV_UNUSED(output_dim); + } + + outputTensorDesc = TensorDescriptor(output_shape); } Pooling& operator=(const Pooling&) = delete; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 13a45fb3352b..ad19e9171363 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -301,6 +301,9 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer { CV_UNUSED(workspace); + if (computeMaxIdx) + CV_Error(Error::StsNotImplemented, "Pooling layer does not support caching max indicies"); + auto input_wrapper = inputs[0].dynamicCast(); auto input = input_wrapper->getView(); @@ -327,11 +330,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer CV_Error(Error::StsNotImplemented, "Asymmetric padding for pooling layer is not supported by CUDA backend"); csl::Pooling::params_type params; - - auto& ishape = params.input_shape; - ishape.resize(input_shape.size()); - std::copy(std::begin(input_shape), std::end(input_shape), std::begin(ishape)); - + params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); params.window_size = kernel_size; params.padding = pads_begin; params.stride = strides; @@ -342,15 +341,18 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer } else if (type == AVE) { - if(padMode == "SAME") - params.type = csl::Pooling::pooling_type::AVERAGE_EXCLUDE_PADDING; - else + if(avePoolPaddedArea) params.type = csl::Pooling::pooling_type::AVERAGE_INCLUDE_PADDING; + else + params.type = csl::Pooling::pooling_type::AVERAGE_EXCLUDE_PADDING; } else CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); - /* TODO ceilMode */ + if(ceilMode) + params.rounding_mode = csl::Pooling::rounding_type::CEILING; + else + params.rounding_mode = csl::Pooling::rounding_type::FLOOR; pooler = csl::Pooling(cudnnHandle, params); } From e2037033358aec17c6c17a2ada5fb1056b9f04b8 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 17:28:01 +0530 Subject: [PATCH 020/129] split tensor.hpp into tensor.hpp and tensor_ops.hpp --- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 358 ---------------- modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 384 ++++++++++++++++++ modules/dnn/src/layers/blank_layer.cpp | 1 + modules/dnn/src/layers/convolution_layer.cpp | 1 + modules/dnn/src/layers/elementwise_layers.cpp | 2 +- .../dnn/src/layers/fully_connected_layer.cpp | 1 + modules/dnn/src/layers/lrn_layer.cpp | 1 + modules/dnn/src/layers/pooling_layer.cpp | 1 + modules/dnn/src/layers/softmax_layer.cpp | 1 + 9 files changed, 391 insertions(+), 359 deletions(-) create mode 100644 modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 4d65d5124bc1..377236f0a79a 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -789,364 +789,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return { start, std::end(shape) }; } - namespace tensor_ops { - - /** @brief copies 
data between tensors - * - * Pre-conditions: - * - \p dest and \p src must have the same shape - * - * Exception Gaurantee: Basic - */ - template inline - void copy(const Stream& stream, TensorSpan dest, TensorView src) { - CV_Assert(is_shape_same(dest, src)); - if (dest.get() != src.get()) - memcpy(dest.get(), src.get(), dest.size()); - } - - /** @brief performs generalized matrix-multiplication - * - * Pre-conditions: - * - \p A and \p B must meet the mathematical requirements for matrix multiplication - * - \p result must be large enough to hold the result - * - * Exception Gaurantee: Basic - */ - template inline - void gemm(const cublas::Handle& handle, T beta, TensorSpan result, T alpha, bool transa, TensorView A, bool transb, TensorView B) { - /* matrix operations can be performed only on rank two or less tensors */ - CV_Assert(get_effective_rank(A) <= 2 && - get_effective_rank(B) <= 2 && - get_effective_rank(result) <= 2); - - /* check dimension requirements for matrix multiplication */ - if (!transa && !transb) { - CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); - CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2)); - CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); - } else if (!transa && transb) { - CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); - CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1)); - CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); - } else if (transa && !transb) { - CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); - CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2)); - CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); - } else { - CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); - CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1)); - CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); - } - - const auto result_nr = result.get_axis_size(-2); - const auto result_nc = result.get_axis_size(-1); - const auto common_dim = A.get_axis_size(transa ? 
-2 : -1); - const auto A_nc = A.get_axis_size(-1); - const auto B_nc = B.get_axis_size(-1); - - /* tensors are stored in row-major but cublas::gemm operates on column-major matrices - * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix - * - * Required: C = AB - * what cuBLAS sees: C^T = A^TB^T = (BA)^T - * - * By reversing operands, we effectively perform: - * C^T = B^TA^T = (AB)^T - * - * which gives C = AB - */ - cublas::gemm(handle, - transb, transa, - result_nc, result_nr, common_dim, - alpha, B.get(), B_nc, - A.get(), A_nc, - beta, result.get(), result_nc); - } - - /** @brief performs element-wise addition with broadcasting - * - * Pre-conditions: - * - \p A and \p C must be compatible tensors - * - * Exception Gaurantee: Basic - */ - template inline - void add(const cudnn::Handle& handle, T beta, TensorSpan C, T alpha, TensorView A) { - CV_Assert(is_shape_compatible(A, C)); - - using cudnn::TensorDescriptor; - auto aDesc = TensorDescriptor(A.shape()); - auto cDesc = TensorDescriptor(C.shape()); - cudnn::add(handle, alpha, aDesc, A.get(), beta, cDesc, C.get()); - } - - /** @brief performs element-wise addition with broadcasting - * - * Pre-conditions: - * - \p A and \p result must be compatible tensors - * - * Exception Gaurantee: Basic - */ - template inline - void softmax(const cudnn::Handle& handle, TensorSpan output, TensorView input, int channel_axis, bool log) { - CV_Assert(is_shape_same(output, input)); - - channel_axis = clamp_axis(channel_axis, input.rank); - - std::size_t outer_size = 1; - for (int j = 0; j < channel_axis; j++) - outer_size *= input.get_axis_size(j); - - auto channel_size = input.get_axis_size(channel_axis); - - std::size_t inner_size = 1; - for (int j = channel_axis + 1; j < input.rank; j++) - inner_size *= input.get_axis_size(j); - - std::array shape = { outer_size, channel_size, 1 , inner_size }; - - using cudnn::TensorDescriptor; - auto inputDesc = TensorDescriptor(shape); - auto outputDesc = TensorDescriptor(shape); - cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log); - } - - template inline - void abs(const Stream& stream, TensorSpan dest, TensorView src) { - CV_Assert(is_shape_same(dest, src)); - kernels::abs(stream, dest, src); - } - - template inline - void bnll(const Stream& stream, TensorSpan dest, TensorView src) { - CV_Assert(is_shape_same(dest, src)); - kernels::bnll(stream, dest, src); - } - - template inline - void relu(const Stream& stream, TensorSpan dest, TensorView src, T slope = 0) { - CV_Assert(is_shape_same(dest, src)); - kernels::relu(stream, dest, src, slope); - } - - template inline - void clipped_relu(const Stream& stream, TensorSpan dest, TensorView src, T min, T max) { - CV_Assert(is_shape_same(dest, src)); - kernels::clipped_relu(stream, dest, src, min, max); - } - - template inline - void channelwise_relu(const Stream& stream, TensorSpan dest, TensorView src, TensorView slope) { - CV_Assert(is_shape_same(dest, src)); - CV_Assert(src.get_axis_size(1) == slope.size()); - std::size_t inner_size = src.size() / src.get_axis_size(0); - std::size_t channel_size = inner_size / src.get_axis_size(1); - kernels::axiswise_relu(stream, dest, src, slope, inner_size, channel_size); - } - - template inline - void elu(const Stream& stream, TensorSpan dest, TensorView src) { - CV_Assert(is_shape_same(dest, src)); - kernels::elu(stream, dest, src); - } - - template inline - void power(const Stream& stream, TensorSpan dest, TensorView src, T exp = 1, T scale = 1, T shift = 
0) { - CV_Assert(is_shape_same(dest, src)); - kernels::power(stream, dest, src, exp, scale, shift); - } - - template inline - void sigmoid(const Stream& stream, TensorSpan dest, TensorView src) { - CV_Assert(is_shape_same(dest, src)); - kernels::sigmoid(stream, dest, src); - } - - template inline - void tanh(const Stream& stream, TensorSpan dest, TensorView src) { - CV_Assert(is_shape_same(dest, src)); - kernels::tanh(stream, dest, src); - } - } - - template - class Convolution { - using TensorDescriptor = cudnn::TensorDescriptor; - using FilterDescriptor = cudnn::FilterDescriptor; - using ConvolutionDescriptor = cudnn::ConvolutionDescriptor; - using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm; - - public: - struct params_type { - std::vector input_shape; - std::vector filter_shape; - - std::vector padding; - std::vector stride; - std::vector dialation; - - std::size_t groups; - }; - - Convolution() = default; - Convolution(const Convolution&) = delete; - Convolution(Convolution&&) = default; - Convolution(cudnn::Handle handle, const params_type& params) { - cudnnHandle = std::move(handle); - - inputTensorDesc = TensorDescriptor(params.input_shape); - filterDesc = FilterDescriptor(params.filter_shape); - convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dialation, params.groups); - - std::vector output_dims; - getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); - outputTensorDesc = TensorDescriptor(output_dims); - - algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc); - } - - Convolution& operator=(const Convolution&) = delete; - Convolution& operator=(Convolution&&) = default; - - std::size_t get_workspace_size() const noexcept { - return algo.get_workspace_size(); - } - - void convolve(TensorSpan output, TensorView input, TensorView filters, const Workspace& scratchpad) { - cudnn::convolve( - cudnnHandle, - convDesc, algo, scratchpad, - filterDesc, filters.get(), - inputTensorDesc, input.get(), - 1.0, 0.0, outputTensorDesc, output.get() - ); - } - - private: - cudnn::Handle cudnnHandle; - TensorDescriptor inputTensorDesc, outputTensorDesc; - FilterDescriptor filterDesc; - ConvolutionDescriptor convDesc; - ConvolutionAlgorithm algo; - }; - - template - class Pooling { - using TensorDescriptor = cudnn::TensorDescriptor; - using PoolingDescriptor = cudnn::PoolingDescriptor; - - public: - enum class rounding_type { - FLOOR, - CEILING - }; - - using pooling_type = PoolingDescriptor::pooling_type; - - struct params_type { - std::vector input_shape; - - std::vector window_size; - std::vector padding; - std::vector stride; - - rounding_type rounding_mode; - pooling_type type; - }; - - Pooling() = default; - Pooling(const Pooling&) = delete; - Pooling(Pooling&&) = default; - Pooling(cudnn::Handle handle, const params_type& params) { - cudnnHandle = std::move(handle); - - inputTensorDesc = TensorDescriptor(params.input_shape); - poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); - - const auto& input_shape = params.input_shape; - std::vector output_shape; - output_shape.assign(std::begin(input_shape), std::end(input_shape)); - - const auto& window_size = params.window_size; - const auto& padding = params.padding; - const auto& stride = params.stride; - - bool ceil = params.rounding_mode == rounding_type::CEILING; - for (int i = 0; i < window_size.size(); i++) { - double axis_sz = (input_shape[i + 2] + 2 * padding[i] - window_size[i]) / 
double(stride[i]) + 1; - output_shape[i + 2] = ceil ? std::ceil(axis_sz) : std::floor(axis_sz); - - /* check if the last pooling window starts in the valid region */ - if (padding[i]) { - if ((output_shape[i + 2] - 1) * stride[i] >= input_shape[i + 2] + padding[i]) - output_shape[i + 2]--; - } - } - - if (!ceil) - { - /* we must agree with cuDNN if we used floor */ - std::vector output_dim; - getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); - CV_Assert(std::equal(std::begin(output_dim), std::end(output_dim), std::begin(output_shape))); - CV_UNUSED(output_dim); - } - - outputTensorDesc = TensorDescriptor(output_shape); - } - - Pooling& operator=(const Pooling&) = delete; - Pooling& operator=(Pooling&&) = default; - - void pool(TensorView input, TensorSpan output) { - cudnn::pool( - cudnnHandle, - poolingDesc, - inputTensorDesc, input.get(), - 1.0, 0.0, outputTensorDesc, output.get() - ); - } - - private: - cudnn::Handle cudnnHandle; - TensorDescriptor inputTensorDesc, outputTensorDesc; - PoolingDescriptor poolingDesc; - }; - - template - class LRN { - using LRNDescriptor = cudnn::LRNDescriptor; - using TensorDescriptor = cudnn::TensorDescriptor; - - public: - using lrn_type = LRNDescriptor::lrn_type; - - LRN() = default; - LRN(const LRN&) = delete; - LRN(LRN&&) = default; - LRN(csl::cudnn::Handle handle, std::size_t local_size, double alpha, double beta, double k, lrn_type type) { - cudnnHandle = std::move(handle); - lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type); - } - - LRN& operator=(const LRN&) = delete; - LRN& operator=(LRN&&) = default; - - void normalize(TensorView input, TensorSpan output) { - cudnn::LRNForward( - cudnnHandle, - lrnDesc, - TensorDescriptor(input.shape()), input.get(), - 1.0, 0.0, TensorDescriptor(output.shape()), output.get() - ); - } - - private: - cudnn::Handle cudnnHandle; - LRNDescriptor lrnDesc; - }; - }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp new file mode 100644 index 000000000000..8c5a77bf5d9c --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -0,0 +1,384 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
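As a quick sanity check of the ceil/floor rounding added to the pooling wrapper (carried over into tensor_ops.hpp below), here is the arithmetic for two hypothetical configurations; the numbers are illustrative only:

    // input extent 6, window 3, stride 2, padding 0:
    //   axis_sz = (6 + 2*0 - 3) / 2.0 + 1 = 2.5  ->  FLOOR gives 2, CEILING gives 3
    // input extent 5, window 2, stride 2, padding 1, CEILING:
    //   axis_sz = (5 + 2*1 - 2) / 2.0 + 1 = 3.5  ->  4 before the adjustment
    //   the last window would start at (4 - 1) * 2 = 6 >= 5 + 1, i.e. past the valid
    //   region, so the extent is reduced to 3; FLOOR already yields 3 here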
+ +#ifndef OPENCV_DNN_CUDA4DNN_CSL_TENSOR_OPS_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_TENSOR_OPS_HPP + +#include "stream.hpp" +#include "tensor.hpp" +#include "kernels.hpp" +#include "pointer.hpp" +#include "cublas.hpp" +#include "cudnn.hpp" + +#include + +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + namespace tensor_ops { + + /** @brief copies data between tensors + * + * Pre-conditions: + * - \p dest and \p src must have the same shape + * + * Exception Gaurantee: Basic + */ + template inline + void copy(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + if (dest.get() != src.get()) + memcpy(dest.get(), src.get(), dest.size()); + } + + /** @brief performs generalized matrix-multiplication + * + * Pre-conditions: + * - \p A and \p B must meet the mathematical requirements for matrix multiplication + * - \p result must be large enough to hold the result + * + * Exception Gaurantee: Basic + */ + template inline + void gemm(const cublas::Handle& handle, T beta, TensorSpan result, T alpha, bool transa, TensorView A, bool transb, TensorView B) { + /* matrix operations can be performed only on rank two or less tensors */ + CV_Assert(get_effective_rank(A) <= 2 && + get_effective_rank(B) <= 2 && + get_effective_rank(result) <= 2); + + /* check dimension requirements for matrix multiplication */ + if (!transa && !transb) { + CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2)); + CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); + } else if (!transa && transb) { + CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1)); + CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); + } else if (transa && !transb) { + CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2)); + CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); + } else { + CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); + CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1)); + CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); + } + + const auto result_nr = result.get_axis_size(-2); + const auto result_nc = result.get_axis_size(-1); + const auto common_dim = A.get_axis_size(transa ? 
-2 : -1); + const auto A_nc = A.get_axis_size(-1); + const auto B_nc = B.get_axis_size(-1); + + /* tensors are stored in row-major but cublas::gemm operates on column-major matrices + * a row-major matrix when read as column-major matrix gives the transpose of the intended matrix + * + * Required: C = AB + * what cuBLAS sees: C^T = A^TB^T = (BA)^T + * + * By reversing operands, we effectively perform: + * C^T = B^TA^T = (AB)^T + * + * which gives C = AB + */ + cublas::gemm(handle, + transb, transa, + result_nc, result_nr, common_dim, + alpha, B.get(), B_nc, + A.get(), A_nc, + beta, result.get(), result_nc); + } + + /** @brief performs element-wise addition with broadcasting + * + * Pre-conditions: + * - \p A and \p C must be compatible tensors + * + * Exception Gaurantee: Basic + */ + template inline + void add(const cudnn::Handle& handle, T beta, TensorSpan C, T alpha, TensorView A) { + CV_Assert(is_shape_compatible(A, C)); + + using cudnn::TensorDescriptor; + auto aDesc = TensorDescriptor(A.shape()); + auto cDesc = TensorDescriptor(C.shape()); + cudnn::add(handle, alpha, aDesc, A.get(), beta, cDesc, C.get()); + } + + /** @brief performs element-wise addition with broadcasting + * + * Pre-conditions: + * - \p A and \p result must be compatible tensors + * + * Exception Gaurantee: Basic + */ + template inline + void softmax(const cudnn::Handle& handle, TensorSpan output, TensorView input, int channel_axis, bool log) { + CV_Assert(is_shape_same(output, input)); + + channel_axis = clamp_axis(channel_axis, input.rank); + + std::size_t outer_size = 1; + for (int j = 0; j < channel_axis; j++) + outer_size *= input.get_axis_size(j); + + auto channel_size = input.get_axis_size(channel_axis); + + std::size_t inner_size = 1; + for (int j = channel_axis + 1; j < input.rank; j++) + inner_size *= input.get_axis_size(j); + + std::array shape = { outer_size, channel_size, 1 , inner_size }; + + using cudnn::TensorDescriptor; + auto inputDesc = TensorDescriptor(shape); + auto outputDesc = TensorDescriptor(shape); + cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log); + } + + template inline + void abs(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::abs(stream, dest, src); + } + + template inline + void bnll(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::bnll(stream, dest, src); + } + + template inline + void relu(const Stream& stream, TensorSpan dest, TensorView src, T slope = 0) { + CV_Assert(is_shape_same(dest, src)); + kernels::relu(stream, dest, src, slope); + } + + template inline + void clipped_relu(const Stream& stream, TensorSpan dest, TensorView src, T min, T max) { + CV_Assert(is_shape_same(dest, src)); + kernels::clipped_relu(stream, dest, src, min, max); + } + + template inline + void channelwise_relu(const Stream& stream, TensorSpan dest, TensorView src, TensorView slope) { + CV_Assert(is_shape_same(dest, src)); + CV_Assert(src.get_axis_size(1) == slope.size()); + std::size_t inner_size = src.size() / src.get_axis_size(0); + std::size_t channel_size = inner_size / src.get_axis_size(1); + kernels::axiswise_relu(stream, dest, src, slope, inner_size, channel_size); + } + + template inline + void elu(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::elu(stream, dest, src); + } + + template inline + void power(const Stream& stream, TensorSpan dest, TensorView src, T exp = 1, T scale = 1, T shift = 
0) { + CV_Assert(is_shape_same(dest, src)); + kernels::power(stream, dest, src, exp, scale, shift); + } + + template inline + void sigmoid(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::sigmoid(stream, dest, src); + } + + template inline + void tanh(const Stream& stream, TensorSpan dest, TensorView src) { + CV_Assert(is_shape_same(dest, src)); + kernels::tanh(stream, dest, src); + } + } + + template + class Convolution { + using TensorDescriptor = cudnn::TensorDescriptor; + using FilterDescriptor = cudnn::FilterDescriptor; + using ConvolutionDescriptor = cudnn::ConvolutionDescriptor; + using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm; + + public: + struct params_type { + std::vector input_shape; + std::vector filter_shape; + + std::vector padding; + std::vector stride; + std::vector dialation; + + std::size_t groups; + }; + + Convolution() = default; + Convolution(const Convolution&) = delete; + Convolution(Convolution&&) = default; + Convolution(cudnn::Handle handle, const params_type& params) { + cudnnHandle = std::move(handle); + + inputTensorDesc = TensorDescriptor(params.input_shape); + filterDesc = FilterDescriptor(params.filter_shape); + convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dialation, params.groups); + + std::vector output_dims; + getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); + outputTensorDesc = TensorDescriptor(output_dims); + + algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc); + } + + Convolution& operator=(const Convolution&) = delete; + Convolution& operator=(Convolution&&) = default; + + std::size_t get_workspace_size() const noexcept { + return algo.get_workspace_size(); + } + + void convolve(TensorSpan output, TensorView input, TensorView filters, const Workspace& scratchpad) { + cudnn::convolve( + cudnnHandle, + convDesc, algo, scratchpad, + filterDesc, filters.get(), + inputTensorDesc, input.get(), + 1.0, 0.0, outputTensorDesc, output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + TensorDescriptor inputTensorDesc, outputTensorDesc; + FilterDescriptor filterDesc; + ConvolutionDescriptor convDesc; + ConvolutionAlgorithm algo; + }; + + template + class Pooling { + using TensorDescriptor = cudnn::TensorDescriptor; + using PoolingDescriptor = cudnn::PoolingDescriptor; + + public: + enum class rounding_type { + FLOOR, + CEILING + }; + + using pooling_type = PoolingDescriptor::pooling_type; + + struct params_type { + std::vector input_shape; + + std::vector window_size; + std::vector padding; + std::vector stride; + + rounding_type rounding_mode; + pooling_type type; + }; + + Pooling() = default; + Pooling(const Pooling&) = delete; + Pooling(Pooling&&) = default; + Pooling(cudnn::Handle handle, const params_type& params) { + cudnnHandle = std::move(handle); + + inputTensorDesc = TensorDescriptor(params.input_shape); + poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); + + const auto& input_shape = params.input_shape; + std::vector output_shape; + output_shape.assign(std::begin(input_shape), std::end(input_shape)); + + const auto& window_size = params.window_size; + const auto& padding = params.padding; + const auto& stride = params.stride; + + bool ceil = params.rounding_mode == rounding_type::CEILING; + for (int i = 0; i < window_size.size(); i++) { + double axis_sz = (input_shape[i + 2] + 2 * padding[i] - window_size[i]) / 
double(stride[i]) + 1; + output_shape[i + 2] = ceil ? std::ceil(axis_sz) : std::floor(axis_sz); + + /* check if the last pooling window starts in the valid region */ + if (padding[i]) { + if ((output_shape[i + 2] - 1) * stride[i] >= input_shape[i + 2] + padding[i]) + output_shape[i + 2]--; + } + } + + if (!ceil) + { + /* we must agree with cuDNN if we used floor */ + std::vector output_dim; + getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); + CV_Assert(std::equal(std::begin(output_dim), std::end(output_dim), std::begin(output_shape))); + CV_UNUSED(output_dim); + } + + outputTensorDesc = TensorDescriptor(output_shape); + } + + Pooling& operator=(const Pooling&) = delete; + Pooling& operator=(Pooling&&) = default; + + void pool(TensorView input, TensorSpan output) { + cudnn::pool( + cudnnHandle, + poolingDesc, + inputTensorDesc, input.get(), + 1.0, 0.0, outputTensorDesc, output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + TensorDescriptor inputTensorDesc, outputTensorDesc; + PoolingDescriptor poolingDesc; + }; + + template + class LRN { + using LRNDescriptor = cudnn::LRNDescriptor; + using TensorDescriptor = cudnn::TensorDescriptor; + + public: + using lrn_type = LRNDescriptor::lrn_type; + + LRN() = default; + LRN(const LRN&) = delete; + LRN(LRN&&) = default; + LRN(csl::cudnn::Handle handle, std::size_t local_size, double alpha, double beta, double k, lrn_type type) { + cudnnHandle = std::move(handle); + lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type); + } + + LRN& operator=(const LRN&) = delete; + LRN& operator=(LRN&&) = default; + + void normalize(TensorView input, TensorSpan output) { + cudnn::LRNForward( + cudnnHandle, + lrnDesc, + TensorDescriptor(input.shape()), input.get(), + 1.0, 0.0, TensorDescriptor(output.shape()), output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + LRNDescriptor lrnDesc; + }; + +}}}} /* namespace cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_OPS_HPP */ diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 92bf25647f0a..730c45ff1598 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -45,6 +45,7 @@ #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 7a537f392401..d1818a49510e 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -58,6 +58,7 @@ using namespace cv::dnn::ocl4dnn; #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index c23aceae1b03..757894b80f9f 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -55,7 +55,7 @@ #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index b78fbc007aa1..8a21b67bc2d9 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -54,6 +54,7 @@ using 
namespace cv::dnn::ocl4dnn; #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index 423b8e8f9822..4e2e99489d52 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -58,6 +58,7 @@ using namespace cv::dnn::ocl4dnn; #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index ad19e9171363..9725cb6095a1 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -60,6 +60,7 @@ using namespace cv::dnn::ocl4dnn; #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 987b2aa4ab13..43bb6a578cc2 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -57,6 +57,7 @@ using namespace cv::dnn::ocl4dnn; #ifdef HAVE_CUDA #include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" using namespace cv::dnn::cuda4dnn; #endif From 4c8d23b46439b119ee0bb00477852413057f6255 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 17:38:22 +0530 Subject: [PATCH 021/129] add concat layer --- modules/dnn/src/cuda/concat.cu | 63 +++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 8 +++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 1 - modules/dnn/src/layers/concat_layer.cpp | 64 ++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 modules/dnn/src/cuda/concat.cu diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu new file mode 100644 index 000000000000..6c9ff17921b8 --- /dev/null +++ b/modules/dnn/src/cuda/concat.cu @@ -0,0 +1,63 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
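For reference, a minimal sketch of how a layer is expected to drive the wrappers that now live in tensor_ops.hpp. The shapes are hypothetical, cudnnHandle stands for the csl::cudnn::Handle received in initCUDA(), and input/output stand for the TensorView/TensorSpan obtained from the CUDA backend wrappers:

    csl::Pooling<float>::params_type params;
    params.input_shape   = {1, 64, 56, 56};   // NCHW, hypothetical
    params.window_size   = {2, 2};
    params.padding       = {0, 0};
    params.stride        = {2, 2};
    params.rounding_mode = csl::Pooling<float>::rounding_type::FLOOR;
    params.type          = csl::Pooling<float>::pooling_type::AVERAGE_EXCLUDE_PADDING;

    csl::Pooling<float> pooler(cudnnHandle, params);  // builds the input/output descriptors
    pooler.pool(input, output);                       // forward pass with alpha = 1, beta = 0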
+ +#include + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/pointer.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + /* Reference: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu */ + template + __global__ void concat( + DevicePtr output, DevicePtr input, + std::size_t concat_size, std::size_t input_concat_axis_size, + std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis, + std::size_t n) + { + for (auto idx : grid_stride_range(n)) { + const auto total_concat_size = concat_size * input_concat_axis_size; + const auto concat_num = idx / total_concat_size; + const auto concat_index = idx % total_concat_size; + const auto top_index = concat_index + + (concat_num * output_concat_axis_size + output_offset_concat_axis) * concat_size; + + output[top_index] = input[idx]; + } + } + } + + template + void concat( + const Stream& stream, + TensorSpan output, TensorView input, + std::size_t concat_size, std::size_t input_concat_axis_size, + std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis) + { + auto policy = make_policy(raw::concat, 0, stream); + launch_kernel(raw::concat, policy, + output.get(), input.get(), + concat_size, input_concat_axis_size, + output_concat_axis_size, output_offset_concat_axis, + input.size()); + } + + template void concat( + const Stream& stream, + TensorSpan output, TensorView input, + std::size_t concat_size, std::size_t input_concat_axis_size, + std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + + template void concat( + const Stream& stream, + TensorSpan output, TensorView input, + std::size_t concat_size, std::size_t input_concat_axis_size, + std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 6477f47bfbaf..a9583b34ff65 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -8,6 +8,7 @@ #include "stream.hpp" #include "memory.hpp" #include "span.hpp" +#include "tensor.hpp" #include @@ -40,6 +41,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void power(const Stream& stream, span dest, view src, T exp, T scale, T shift); + template + void concat( + const Stream& stream, + TensorSpan output, TensorView input, + std::size_t concat_size, std::size_t input_concat_axis_size, + std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 377236f0a79a..fa7a105e0476 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -10,7 +10,6 @@ #include "cublas.hpp" #include "cudnn.hpp" #include "span.hpp" -#include "kernels.hpp" #include "workspace.hpp" #include diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index aae9bdea1a85..2ada0aec3d3a 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" 
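To make the index arithmetic in the concat kernel above concrete, consider concatenating two NCHW tensors along the channel axis (hypothetical shapes): input A of shape 1x2x2x2 and input B of shape 1x3x2x2 into an output of shape 1x5x2x2.

    // for input B: concat_size = 2*2 = 4, input_concat_axis_size = 3,
    //              output_concat_axis_size = 5, output_offset_concat_axis = 2 (A copied first)
    // element idx = 9 of B lies in channel 9 / 4 = 2, spatial offset 9 % 4 = 1
    //   total_concat_size = 4 * 3 = 12, concat_num = 9 / 12 = 0, concat_index = 9 % 12 = 9
    //   top_index = 9 + (0 * 5 + 2) * 4 = 17
    // i.e. output channel 2 + 2 = 4, spatial offset 1: 4 * 4 + 1 = 17, as expected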
+#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -50,6 +51,12 @@ #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -105,6 +112,7 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA() && !padding) || (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) || // By channels (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !padding) || (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding); @@ -234,6 +242,62 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer } #endif +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_UNUSED(workspace); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto output_concat_axis = [&] { + auto actual_dims = output_wrapper->getShape().size(); + auto extra_dims = output.rank - actual_dims; + return clamp(axis, actual_dims) + extra_dims; + }(); + + std::size_t concat_size = 1; + for (auto i = output_concat_axis + 1; i < output.rank; i++) + concat_size *= output.get_axis_size(i); + + std::size_t output_concat_axis_offset = 0; + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto input_concat_axis = [&] { + auto actual_dims = input_wrapper->getShape().size(); + auto extra_dims = input.rank - actual_dims; + return clamp(axis, actual_dims) + extra_dims; + }(); + + csl::kernels::concat(stream, output, input, + concat_size, input.get_axis_size(input_concat_axis), + output.get_axis_size(output_concat_axis), output_concat_axis_offset); + + output_concat_axis_offset += input.get_axis_size(input_concat_axis); + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); From 2ab9bddb8717f8d48e2fd05d5a1c961080b62db3 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 20:30:36 +0530 Subject: [PATCH 022/129] add scale layer --- modules/dnn/src/cuda/scale.cu | 102 ++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 14 +++ modules/dnn/src/cuda4dnn/csl/tensor.hpp | 4 +- modules/dnn/src/layers/scale_layer.cpp | 105 ++++++++++++++++++++++- 4 files changed, 222 insertions(+), 3 deletions(-) create mode 100644 modules/dnn/src/cuda/scale.cu diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu new file mode 100644 index 000000000000..85eb18c71d22 --- /dev/null +++ b/modules/dnn/src/cuda/scale.cu @@ -0,0 +1,102 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/pointer.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void scale( + std::size_t n, + DevicePtr output, + DevicePtr input, std::size_t inner_size, + DevicePtr weights, std::size_t scale_size) + { + for (auto i : grid_stride_range(n)) { + const auto scale_idx = (i / inner_size) % scale_size; + output[i] = input[i] * weights[scale_idx]; + } + } + + template + __global__ void scale_with_bias( + std::size_t n, + DevicePtr output, + DevicePtr input, std::size_t inner_size, + DevicePtr weights, DevicePtr bias, std::size_t scale_bias_size) + { + for (auto i : grid_stride_range(n)) { + const auto scale_idx = (i / inner_size) % scale_bias_size; + output[i] = input[i] * weights[scale_idx] + bias[scale_idx]; + } + } + } + + template + void scale( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights) + { + CV_Assert(is_shape_same(input, output)); + + auto policy = make_policy(raw::scale, 0, stream); + launch_kernel(raw::scale, policy, + output.size(), + output.get(), + input.get(), inner_size, + weights.get(), weights.size()); + } + + template void scale( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights); + + template void scale( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights); + + template + void scale_with_bias( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights, TensorView bias) + { + CV_Assert(is_shape_same(input, output)); + CV_Assert(weights.size() == bias.size()); + + auto policy = make_policy(raw::scale_with_bias, 0, stream); + launch_kernel(raw::scale_with_bias, policy, + output.size(), + output.get(), + input.get(), inner_size, + weights.get(), bias.get(), weights.size()); + } + + template void scale_with_bias( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights, TensorView bias); + + template void scale_with_bias( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights, TensorView bias); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index a9583b34ff65..59caca36cc8d 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -48,6 +48,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke std::size_t concat_size, std::size_t input_concat_axis_size, std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + template + void scale( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights); + + template + void scale_with_bias( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView weights, TensorView bias); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index fa7a105e0476..67a41f6704cb 100644 --- 
a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -344,7 +344,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { */ CUDA4DNN_HOST_DEVICE size_type get_axis_size(int axis) const noexcept { axis = clamp_axis(axis, rank); - CV_Assert(axis >= 0 && axis < rank); + assert(axis >= 0 && axis < rank); return sizes[axis]; } @@ -576,7 +576,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { */ CUDA4DNN_HOST_DEVICE size_type get_axis_size(int axis) const noexcept { axis = clamp_axis(axis, rank); - CV_Assert(axis >= 0 && axis < rank); + assert(axis >= 0 && axis < rank); /* CV_Assert isn't allowed in device code */ return sizes[axis]; } diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 4486a0f6de07..16d0ec80791f 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -11,10 +11,18 @@ Implementation of Scale layer. #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -50,7 +58,9 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { - return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + backendId == DNN_BACKEND_HALIDE || (backendId == DNN_BACKEND_INFERENCE_ENGINE && axis == 1); } @@ -138,6 +148,99 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_UNUSED(workspace); + CV_Assert(outputs.size() == 1); + CV_Assert(!blobs.empty() || inputs.size() == 2); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::TensorView weights; + if (blobs.empty()) + { + auto wrapper = inputs[1].dynamicCast(); + weights = wrapper->getView(); + } + else if (hasWeights) + { + weights = csl::TensorSpan(weightsTensor); + } + + csl::TensorView bias; + if (hasBias) + bias = csl::TensorSpan(biasTensor); + + const auto numParams = !weights.empty() ? 
weights.size() : bias.size(); + CV_Assert(numParams != 0); + if (hasWeights && hasBias) + { + CV_CheckEQ(weights.size(), bias.size(), "Incompatible weights/bias blobs"); + } + + auto input_shape = input_wrapper->getShape(); + + /* the weights might require broadcasting to scale */ + int end_axis = [&] { + for (int endAxis = axis + 1; endAxis <= input_shape.size(); ++endAxis) + if (total(input_shape, axis, endAxis) == numParams) + return endAxis; + CV_Assert(0 /* invalid weights matrix */); + }(); + + std::size_t inner_size = total(input_shape, end_axis, -1); + + CV_Assert(hasWeights || hasBias); + if (hasWeights && hasBias) + csl::kernels::scale_with_bias(stream, output, input, inner_size, weights, bias); + else if (hasWeights) + csl::kernels::scale(stream, output, input, inner_size, weights); + else + { + /* rarely used codepath; hence, not optimized TODO */ + csl::tensor_ops::copy(stream, output, input); + csl::tensor_ops::add(stream, 1.0, output, 1.0, bias); + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + if (hasWeights) + { + weightsTensor = createTensorHeaderFromMat(blobs[0]); + copyMatToTensor(weightsTensor, blobs[0], stream); + } + + if (hasBias) + { + /* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0] + * in either case, it is at the end of the blobs vector => bias = blobs.back() + */ + biasTensor = createTensorHeaderFromMat(blobs[1]); + copyMatToTensor(biasTensor, blobs.back(), stream); + } + } + + csl::Tensor weightsTensor, biasTensor; + csl::Stream stream; +#endif + virtual Ptr tryAttach(const Ptr& node) CV_OVERRIDE { switch (node->backendId) From b12e4fc624e3e1b7c0813702b1957cb483d1c4ca Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 20:44:41 +0530 Subject: [PATCH 023/129] add batch normalization layer --- modules/dnn/src/layers/batch_norm_layer.cpp | 50 +++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 791d8f1e0bf1..2e74bc5217ac 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -11,6 +11,7 @@ Implementation of Batch Normalization layer. #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include @@ -19,6 +20,12 @@ Implementation of Batch Normalization layer. 
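The broadcasting logic in ScaleLayerImpl::forwardCUDA above can be read off a small example with hypothetical shapes: for an NCHW input of 2x3x4x4 with axis = 1 and a weights blob of 3 entries, end_axis resolves to 2 because total(shape, 1, 2) = 3 matches numParams, and inner_size = total(shape, 2, -1) = 16.

    // in kernels::scale, scale_idx = (i / inner_size) % weights.size() = (i / 16) % 3,
    // so each channel's 16 spatial values are multiplied by that channel's weight and
    // the pattern repeats over the batch dimension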
#include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -155,6 +162,7 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return (backendId == DNN_BACKEND_OPENCV) || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_HALIDE && haveHalide()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4)); } @@ -306,6 +314,48 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto input_shape = input_wrapper->getShape(); + std::size_t inner_size = total(input_shape, 2, -1); + + csl::kernels::scale_with_bias(stream, output, input, inner_size, weightsTensor, biasTensor); + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + + weightsTensor = createTensorHeaderFromMat(weights_); + copyMatToTensor(weightsTensor, weights_, stream); + + biasTensor = createTensorHeaderFromMat(bias_); + copyMatToTensor(biasTensor, bias_, stream); + } + + csl::Tensor weightsTensor, biasTensor; + csl::Stream stream; +#endif + virtual Ptr tryAttach(const Ptr& node) CV_OVERRIDE { switch (node->backendId) From 4ae2d35fc39dcc54d98a2bf58f04e5c503602b67 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 21:50:35 +0530 Subject: [PATCH 024/129] split math.cu into activations.cu and math.hpp --- .../dnn/src/cuda/{math.cu => activations.cu} | 64 +++---------------- modules/dnn/src/cuda/math.hpp | 60 +++++++++++++++++ 2 files changed, 70 insertions(+), 54 deletions(-) rename modules/dnn/src/cuda/{math.cu => activations.cu} (76%) create mode 100644 modules/dnn/src/cuda/math.hpp diff --git a/modules/dnn/src/cuda/math.cu b/modules/dnn/src/cuda/activations.cu similarity index 76% rename from modules/dnn/src/cuda/math.cu rename to modules/dnn/src/cuda/activations.cu index 0ef18142f566..a993fa8fd095 100644 --- a/modules/dnn/src/cuda/math.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -4,6 +4,8 @@ #include +#include "math.hpp" + #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" #include "../cuda4dnn/csl/span.hpp" @@ -11,58 +13,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { - namespace detail { - template __device__ T abs(T val); - template <> __device__ float abs(float val) { return fabsf(val); } - template <> __device__ double abs(double val) { return fabs(val); } - - template __device__ T exp(T val); - template <> __device__ float exp(float val) { return expf(val); } - template <> __device__ double exp(double val) { return ::exp(val); } - - template __device__ T max(T x, T y); - template <> __device__ float max(float x, float y) { return fmaxf(x, y); } - template <> __device__ double max(double x, double y) { return fmax(x, y); } - - template __device__ T min(T x, T y); - template 
<> __device__ float min(float x, float y) { return fminf(x, y); } - template <> __device__ double min(double x, double y) { return fmin(x, y); } - - template __device__ T log1p(T val); - template <> __device__ float log1p(float val) { return log1pf(val); } - template <> __device__ double log1p(double val) { return ::log1p(val); } - - template __device__ T log1pexp(T val); - template <> __device__ double log1pexp(double val) { - if (val <= -37) - return exp(val); - else if (-37 < val && val <= 18) - return log1p(exp(val)); - else if (18 < val && val <= 33.3) - return val + exp(-val); - else - return val; - } - template <> __device__ float log1pexp(float val) { return log1pexp(val); } - - template __device__ T tanh(T val); - template <> __device__ float tanh(float val) { return tanhf(val); } - template <> __device__ double tanh(double val) { return ::tanh(val); } - - template __device__ T pow(T val, T exp); - template <> __device__ float pow(float val, float exp) { return powf(val, exp); } - template <> __device__ double pow(double val, double exp) { return ::pow(val, exp); } - - template - __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } - } - namespace raw { template __global__ void abs(span dest, view src) { assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { - using detail::abs; + using utils::abs; dest[i] = abs(src[i]); } } @@ -71,7 +27,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k __global__ void tanh(span dest, view src) { assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { - using detail::tanh; + using utils::tanh; dest[i] = tanh(src[i]); } } @@ -80,7 +36,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k __global__ void sigmoid(span dest, view src) { assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { - using detail::sigmoid; + using utils::sigmoid; dest[i] = sigmoid(src[i]); } } @@ -89,7 +45,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k __global__ void bnll(span dest, view src) { assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { - using detail::log1pexp; + using utils::log1pexp; dest[i] = src[i] > 0 ? src[i] + log1pexp(-src[i]) : log1pexp(src[i]); } } @@ -98,7 +54,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k __global__ void elu(span dest, view src) { assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { - using detail::exp; + using utils::exp; dest[i] = src[i] >= 0 ? 
src[i] : (exp(src[i]) - 1); } } @@ -115,8 +71,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k assert(src.size() >= dest.size()); assert(floor <= ceiling); for (auto i : grid_stride_range(dest.size())) { - using detail::max; - using detail::min; + using utils::max; + using utils::min; dest[i] = min(max(src[i], floor), ceiling); } } @@ -134,7 +90,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k __global__ void power(span dest, view src, T exp, T scale, T shift) { assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { - using detail::pow; + using utils::pow; dest[i] = pow(shift + scale * src[i], exp); } } diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp new file mode 100644 index 000000000000..e0773be0f969 --- /dev/null +++ b/modules/dnn/src/cuda/math.hpp @@ -0,0 +1,60 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP +#define OPENCV_DNN_SRC_CUDA_MATH_HPP + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace utils { + template __device__ T abs(T val); + template <> inline __device__ float abs(float val) { return fabsf(val); } + template <> inline __device__ double abs(double val) { return fabs(val); } + + template __device__ T exp(T val); + template <> inline __device__ float exp(float val) { return expf(val); } + template <> inline __device__ double exp(double val) { return ::exp(val); } + + template __device__ T max(T x, T y); + template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); } + template <> inline __device__ double max(double x, double y) { return fmax(x, y); } + + template __device__ T min(T x, T y); + template <> inline __device__ float min(float x, float y) { return fminf(x, y); } + template <> inline __device__ double min(double x, double y) { return fmin(x, y); } + + template __device__ T log1p(T val); + template <> inline __device__ float log1p(float val) { return log1pf(val); } + template <> inline __device__ double log1p(double val) { return ::log1p(val); } + + template __device__ T log1pexp(T val); + template <> inline __device__ double log1pexp(double val) { + if (val <= -37) + return exp(val); + else if (-37 < val && val <= 18) + return log1p(exp(val)); + else if (18 < val && val <= 33.3) + return val + exp(-val); + else + return val; + } + template <> inline __device__ float log1pexp(float val) { return log1pexp(val); } + + template __device__ T tanh(T val); + template <> inline __device__ float tanh(float val) { return tanhf(val); } + template <> inline __device__ double tanh(double val) { return ::tanh(val); } + + template __device__ T pow(T val, T exp); + template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); } + template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); } + + template + __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } + } + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */ From cf34c6514534fcd7edd1e6b335b68e2b6d02256b Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 22:53:38 +0530 Subject: [PATCH 025/129] add eltwise layer --- modules/dnn/src/cuda/eltwise_ops.cu | 92 ++++++++++++++++++++++++ 
modules/dnn/src/cuda4dnn/csl/kernels.hpp | 12 ++++ modules/dnn/src/cuda4dnn/csl/span.hpp | 4 ++ modules/dnn/src/layers/eltwise_layer.cpp | 92 ++++++++++++++++++++++++ 4 files changed, 200 insertions(+) create mode 100644 modules/dnn/src/cuda/eltwise_ops.cu diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu new file mode 100644 index 000000000000..d7f615d4b152 --- /dev/null +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -0,0 +1,92 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "math.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void eltwise_max_2(span output, view x, view y) { + assert(x.size() == y.size()); + assert(output.size() >= y.size()); + + for (auto i : grid_stride_range(output.size())) { + using utils::max; + output[i] = max(x[i], y[i]); + } + } + + template + __global__ void eltwise_sum_2(span output, view x, view y) { + assert(x.size() == y.size()); + assert(output.size() >= y.size()); + + for (auto i : grid_stride_range(output.size())) + output[i] = x[i] + y[i]; + } + + template + __global__ void eltwise_sum_coeff_2(span output, T coeff_x, view x, T coeff_y, view y) { + assert(x.size() == y.size()); + assert(output.size() >= y.size()); + + for (auto i : grid_stride_range(output.size())) + output[i] = coeff_x * x[i] + coeff_y * y[i]; + } + + template + __global__ void eltwise_prod_2(span output, view x, view y) { + assert(x.size() == y.size()); + assert(output.size() >= y.size()); + + for (auto i : grid_stride_range(output.size())) + output[i] = x[i] * y[i]; + } + } + + template + void eltwise_max_2(const Stream& stream, span output, view x, view y) { + auto policy = make_policy(raw::eltwise_max_2, 0, stream); + launch_kernel(raw::eltwise_max_2, policy, output, x, y); + } + + template void eltwise_max_2(const Stream& stream, span output, view x, view y); + template void eltwise_max_2(const Stream& stream, span output, view x, view y); + + template + void eltwise_sum_2(const Stream& stream, span output, view x, view y) { + auto policy = make_policy(raw::eltwise_sum_2, 0, stream); + launch_kernel(raw::eltwise_sum_2, policy, output, x, y); + } + + template void eltwise_sum_2(const Stream& stream, span output, view x, view y); + template void eltwise_sum_2(const Stream& stream, span output, view x, view y); + + template + void eltwise_sum_coeff_2(const Stream& stream, span output, T coeff_x, view x, T coeff_y, view y) { + auto policy = make_policy(raw::eltwise_sum_coeff_2, 0, stream); + launch_kernel(raw::eltwise_sum_coeff_2, policy, output, coeff_x, x, coeff_y, y); + } + + template void eltwise_sum_coeff_2(const Stream&, span, float, view, float, view); + template void eltwise_sum_coeff_2(const Stream&, span, double, view, double, view); + + template + void eltwise_prod_2(const Stream& stream, span output, view x, view y) { + auto policy = make_policy(raw::eltwise_prod_2, 0, stream); + launch_kernel(raw::eltwise_prod_2, policy, output, x, y); + } + + template void eltwise_prod_2(const Stream& stream, span output, view x, view y); + template void eltwise_prod_2(const Stream& stream, span output, view x, view y); + +}}}}} /* 
cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 59caca36cc8d..88640ef8324a 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -62,6 +62,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke TensorView input, std::size_t inner_size, TensorView weights, TensorView bias); + template + void eltwise_max_2(const Stream& stream, span output, view x, view y); + + template + void eltwise_sum_2(const Stream& stream, span output, view x, view y); + + template + void eltwise_sum_coeff_2(const Stream& stream, span output, T coeff_x, view x, T coeff_y, view y); + + template + void eltwise_prod_2(const Stream& stream, span output, view x, view y); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/span.hpp b/modules/dnn/src/cuda4dnn/csl/span.hpp index c18e7b96cc7d..f6771c761a8c 100644 --- a/modules/dnn/src/cuda4dnn/csl/span.hpp +++ b/modules/dnn/src/cuda4dnn/csl/span.hpp @@ -45,6 +45,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { CUDA4DNN_DEVICE reference operator[](difference_type index) const { return ptr[index]; } CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; } + template::type, + typename std::enable_if::value, bool>::type = true> + CUDA4DNN_HOST_DEVICE operator span() const noexcept { return span{ptr, sz}; } + private: pointer ptr; size_type sz; diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 47430680f32d..640908633ed4 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" @@ -49,6 +50,13 @@ #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -97,6 +105,7 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || backendId == DNN_BACKEND_HALIDE || (backendId == DNN_BACKEND_INFERENCE_ENGINE && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())); @@ -374,6 +383,89 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer coeffs, op, activ.get(), nstripes); } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(outputs.size() == 1); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + /* optimized path for common case */ + if (inputs.size() == 2) + { + auto input_wrapper_x = inputs[0].dynamicCast(); + auto input_x = input_wrapper_x->getView(); + + auto input_wrapper_y = inputs[1].dynamicCast(); + auto input_y = input_wrapper_y->getView(); + + switch(op) + { + case MAX: csl::kernels::eltwise_max_2(stream, output, input_x, input_y); break; + case PROD: csl::kernels::eltwise_prod_2(stream, output, input_x, input_y); break; + case SUM: + if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1)) + csl::kernels::eltwise_sum_2(stream, output, input_x, 
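/* both coefficients are 1 (or unspecified) here, so a plain elementwise sum suffices */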
input_y); + else + csl::kernels::eltwise_sum_coeff_2(stream, output, coeffs[0], input_x, coeffs[1], input_y); + break; + } + } + else + { + auto input_wrapper_0 = inputs[0].dynamicCast(); + auto input_0 = input_wrapper_0->getView(); + + /* we first make a copy and then apply EltwiseOp cumulatively */ + csl::tensor_ops::copy(stream, output, input_0); + + for (std::size_t i = 1; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + switch (op) + { + case MAX: csl::kernels::eltwise_max_2(stream, output, output, input); break; + case PROD: csl::kernels::eltwise_prod_2(stream, output, output, input); break; + case SUM: + if (coeffs.empty() || coeffs[i] == 1) + csl::kernels::eltwise_sum_2(stream, output, output, input); + else + { + /* if this is the first op, we must scale output too */ + auto coeff_x = (i == 1) ? coeffs[0] : 1.0; + csl::kernels::eltwise_sum_coeff_2(stream, output, coeff_x, output, coeffs[i], input); + } + break; + } + } + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + CV_Assert(inputs.size() >= 2); + CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size()); + CV_Assert(coeffs.size() == 0 || op == SUM); + + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + virtual Ptr initHalide(const std::vector > &input) CV_OVERRIDE { #ifdef HAVE_HALIDE From ed87d45bbe64628a98d74d15be8ac8cc5b12ed99 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 26 Jun 2019 23:31:32 +0530 Subject: [PATCH 026/129] add flatten layer --- modules/dnn/src/layers/flatten_layer.cpp | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index f1250e7e3e0a..1071f4d0da90 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -42,11 +42,18 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_cuda.hpp" #include "../op_inf_engine.hpp" #include #include #include +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -65,6 +72,7 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine()); } @@ -162,6 +170,45 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) + { + CV_UNUSED(workspace); + CV_Assert(outputs.size() == 1); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (input.get() != output.get()) + { + input.reshape_as(output); + csl::tensor_ops::copy(stream, output, input); + } + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes + ) + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + #ifdef HAVE_INF_ENGINE virtual Ptr 
initInfEngine(const std::vector >& inputs) CV_OVERRIDE { From 9261242a777ca10271e67b26eb2428c4dc158346 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 28 Jun 2019 13:11:49 +0530 Subject: [PATCH 027/129] add tensor transform api --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 1 + .../dnn/src/cuda4dnn/csl/cudnn/transform.hpp | 105 ++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 35 +++++- 3 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 2a3185cb3183..142766f5f31c 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -12,5 +12,6 @@ #include "cudnn/lrn.hpp" #include "cudnn/pooling.hpp" #include "cudnn/softmax.hpp" +#include "cudnn/transform.hpp" #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp new file mode 100644 index 000000000000..e2a3bfb6583b --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp @@ -0,0 +1,105 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP + +#include "../pointer.hpp" + +#include "cudnn.hpp" + +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + class TensorTransformDescriptor { + public: + TensorTransformDescriptor() noexcept : descriptor{ nullptr } { } + TensorTransformDescriptor(const TensorTransformDescriptor&) = delete; + TensorTransformDescriptor(TensorTransformDescriptor&& other) noexcept + : descriptor{ other.descriptor } { + other.descriptor = nullptr; + } + + template ()))> + TensorTransformDescriptor( + const SequenceContainer& padding_left, + const SequenceContainer& padding_right) + { + constructor(padding_left, padding_right); + } + + ~TensorTransformDescriptor() noexcept { + if (descriptor != nullptr) { + /* cudnnDestroyTensorTransformDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor)); + } + } + + TensorTransformDescriptor& operator=(const TensorTransformDescriptor&) = delete; + TensorTransformDescriptor& operator=(TensorTransformDescriptor&& other) noexcept { + descriptor = other.descriptor; + other.descriptor = nullptr; + return *this; + }; + + cudnnTensorTransformDescriptor_t get() const noexcept { return descriptor; } + + private: + template + void constructor( + const SequenceContainer& padding_left, + const SequenceContainer& padding_right + ) + { + CV_Assert(padding_left.size() == padding_right.size()); + + auto ipadding_left = std::vector(std::begin(padding_left), std::end(padding_left)); + auto ipadding_right = std::vector(std::begin(padding_right), std::end(padding_right)); + CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorTransformDescriptor(&descriptor)); + try { + CUDA4DNN_CHECK_CUDNN( + cudnnSetTensorTransformDescriptor( + descriptor, + ipadding_left.size(), CUDNN_TENSOR_NCHW, + ipadding_left.data(), ipadding_right.data(), + NULL, CUDNN_TRANSFORM_FOLD + ) + ); + } catch (...) 
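+ /* the transform descriptor was created above; release it before rethrowing so it does not leak */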
{ + /* cudnnDestroyTensorTransformDescriptor will not fail */ + CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor)); + throw; + } + } + + cudnnTensorTransformDescriptor_t descriptor; + }; + + template + void transform( + const Handle& handle, + const TensorTransformDescriptor& transDesc, + const TensorDescriptor& inputDesc, + DevicePtr inputPtr, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + T alpha = 1, beta = 0; + CUDA4DNN_CHECK_CUDNN( + cudnnTransformTensorEx( + HandleAccessor::get(handle), + transDesc.get(), + &alpha, inputDesc.get(), inputPtr.get(), + &beta, outputDesc.get(), outputPtr.get() + ) + ); + } + +}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index 8c5a77bf5d9c..bed909c225e9 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -357,7 +357,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { LRN() = default; LRN(const LRN&) = delete; LRN(LRN&&) = default; - LRN(csl::cudnn::Handle handle, std::size_t local_size, double alpha, double beta, double k, lrn_type type) { + LRN(cudnn::Handle handle, std::size_t local_size, double alpha, double beta, double k, lrn_type type) { cudnnHandle = std::move(handle); lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type); } @@ -379,6 +379,39 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { LRNDescriptor lrnDesc; }; + template + class TensorTransform { + using TensorTransformDescriptor = cudnn::TensorTransformDescriptor; + using TensorDescriptor = cudnn::TensorDescriptor; + + public: + TensorTransform() = default; + TensorTransform(const TensorTransform&) = delete; + TensorTransform(TensorTransform&&) = default; + + template + TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) { + cudnnHandle = std::move(handle); + transDesc = TensorTransformDescriptor(paddingLeft, paddingRight); + } + + TensorTransform& operator=(const TensorTransform&) = delete; + TensorTransform& operator=(TensorTransform&&) = default; + + void transform(TensorView input, TensorSpan output) { + cudnn::transform( + cudnnHandle, + transDesc, + TensorDescriptor(input.shape()), input.get(), + TensorDescriptor(output.shape()), output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + TensorTransformDescriptor transDesc; + }; + }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_TENSOR_OPS_HPP */ From 7db9e6e0183d2788708ef2bb60bca9f7609b710c Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 28 Jun 2019 15:26:54 +0530 Subject: [PATCH 028/129] add asymmetric padding support for convolution layer --- modules/dnn/src/layers/convolution_layer.cpp | 107 +++++++++++++++---- 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index d1818a49510e..4151289306aa 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1297,19 +1297,23 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl ) override { CV_Assert(!activ); + CV_Assert(inputs.size() == 1 && outputs.size() == 1); - for (std::size_t i = 0; i < inputs.size(); i++) + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + if 
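+ /* a non-empty transformedInput means asymmetric padding was requested; the input is first copied into this larger, padded tensor */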
(!transformedInput.empty()) { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); + inputTransformer.transform(input, transformedInput); + input = csl::TensorView(transformedInput); + } - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); - convoluter.convolve(output, input, filtersTensor, workspace); - if (hasBias() || fusedBias) - csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); - } + convoluter.convolve(output, input, filtersTensor, workspace); + if (hasBias() || fusedBias) + csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); } void initCUDA( @@ -1325,12 +1329,13 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); - /* we support 1-6d convolution */ - CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 8); + /* 1d, 2d, 3d convolutions are supported */ + CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 5); CV_Assert(blobs.size() >= 1); const auto& filtersMat = blobs[0]; + const auto rank = input_shape.size(); const auto output_feature_maps = filtersMat.size[0]; const auto input_feature_maps = input_shape[1]; const auto input_feature_maps_per_group = filtersMat.size[1]; @@ -1343,34 +1348,86 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl if (hasBias() || fusedBias) { - std::vector biasShape(input_shape.size(), 1); + std::vector biasShape(rank, 1); biasShape[1] = output_feature_maps; - Mat biasMat(input_shape.size(), biasShape.data(), CV_32F, &biasvec[0]); + Mat biasMat(rank, biasShape.data(), CV_32F, &biasvec[0]); biasTensor = createTensorHeaderFromMat(biasMat); copyMatToTensor(biasTensor, biasMat, stream); } - if(pads_begin != pads_end) - CV_Error(Error::StsNotImplemented, "Asymmetric padding for convolution layer is not supported by CUDA backend"); + /* left and right are misleading as the padding is applicable for any number of dimensions + * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` + */ + std::vector common_padding(rank, 0); + std::vector padding_left(rank, 0), padding_right(rank, 0); + if (padMode.empty()) + { + for (int i = 2; i < common_padding.size(); i++) + { + common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); + padding_left[i] = pads_begin[i - 2] - common_padding[i]; + padding_right[i] = pads_end[i - 2] - common_padding[i]; + } + } + else if (padMode == "VALID") { /* nothing to do as the paddings are already preset to zero */ } + else if (padMode == "SAME") + { + /* TensorFlow Logic: + * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] + * + * if total padding is odd, the input is padded towards the end + */ + std::vector inShape(std::begin(input_shape) + 2, std::end(input_shape)), outShape; + getConvPoolOutParams(inShape, kernel_size, strides, padMode, dilations, outShape); + + for (int i = 2; i < rank; i++) + { + const auto j = i - 2; /* filter index */ + const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; + const auto required_total_padding = + std::max(0, (outShape[j] - 1) * strides[j] + effective_kernel_size - inShape[j]); + + common_padding[i] = required_total_padding / 2; + padding_left[i] = 0; + padding_right[i] = required_total_padding % 2; + } + } + else + { + CV_Error(Error::StsNotImplemented, "Specified padding mode not 
supported by ConvolutionLayer"); + } + + /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by + * copying the input to a bigger tensor and pad the sides manually + */ + for (int i = 0; i < rank; i++) + input_shape[i] += padding_left[i] + padding_right[i]; + + /* if the actual input shape and the new input shape do not match; we need to transform the input */ + transform_required = input_shape != input_wrapper->getShape(); + if (transform_required) + { + transformedInput.resize(std::begin(input_shape), std::end(input_shape)); + inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right); + } csl::Convolution::params_type params; - params.padding = pads_begin; - params.stride = strides; - params.dialation = dilations; - params.groups = groups; auto& ishape = params.input_shape; - ishape.resize(input_shape.size()); - std::copy(std::begin(input_shape), std::end(input_shape), std::begin(ishape)); + ishape.assign(std::begin(input_shape), std::end(input_shape)); auto& fshape = params.filter_shape; fshape.resize(ishape.size()); fshape[0] = output_feature_maps; fshape[1] = input_feature_maps_per_group; - - std::copy_backward(std::begin(kernel_size), std::end(kernel_size), std::end(fshape)); + std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); CV_Assert(fshape.size() == kernel_size.size() + 2); + params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); + params.stride = strides; + params.dialation = dilations; + params.groups = groups; + convoluter = csl::Convolution(cudnnHandle, params); scratch_mem_in_bytes = convoluter.get_workspace_size(); } @@ -1378,6 +1435,10 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl csl::cudnn::Handle cudnnHandle; csl::Tensor filtersTensor, biasTensor; csl::Convolution convoluter; + + bool transform_required; + csl::Tensor transformedInput; + csl::TensorTransform inputTransformer; #endif virtual int64 getFLOPS(const std::vector &inputs, From 205c1915447dda7e9ce044d941c16ac4d5b3f093 Mon Sep 17 00:00:00 2001 From: YashasSamaga Date: Fri, 28 Jun 2019 22:58:23 +0530 Subject: [PATCH 029/129] fix rebase issues --- modules/dnn/src/layers/convolution_layer.cpp | 2 +- modules/dnn/src/layers/flatten_layer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 4151289306aa..9350bdfe2452 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1342,7 +1342,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl const auto groups = input_feature_maps / input_feature_maps_per_group; CV_Assert(input_feature_maps % input_feature_maps_per_group == 0); - const Mat& filterWeightsSource = newWeightAndBias ? weightsMat : filtersMat; + const Mat& filterWeightsSource = fusedWeights ? 
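+ /* weightsMat holds the weights updated by fusion; filtersMat (blobs[0]) holds the original filter weights */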
weightsMat : filtersMat; filtersTensor = createTensorHeaderFromMat(filterWeightsSource); copyMatToTensor(filtersTensor, filterWeightsSource, stream); diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 1071f4d0da90..7b2f3fe42a25 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -42,7 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "op_cuda.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include #include From e04e463eb80e1d5e0e31959c97e90f43e0f21818 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 28 Jun 2019 22:25:40 +0530 Subject: [PATCH 030/129] add reshape layer --- modules/dnn/src/layers/flatten_layer.cpp | 7 ++-- modules/dnn/src/layers/reshape_layer.cpp | 45 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 7b2f3fe42a25..16f942880f14 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -174,8 +174,7 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace - ) + csl::Workspace& workspace) override { CV_UNUSED(workspace); CV_Assert(outputs.size() == 1); @@ -200,8 +199,8 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer csl::Stream stream_, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes - ) + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs) override { stream = std::move(stream_); } diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 5cbfc03e5960..9dc26f4affed 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -42,9 +42,16 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -179,6 +186,7 @@ class ReshapeLayerImpl CV_FINAL : public ReshapeLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine()); } @@ -258,6 +266,43 @@ class ReshapeLayerImpl CV_FINAL : public ReshapeLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_UNUSED(workspace); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (input.get() != output.get()) + { + input.reshape_as(output); + csl::tensor_ops::copy(stream, output, input); + } + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + #ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { From f120bd070e909c33fac2e8d194e7e58a400e3420 Mon Sep 17 
00:00:00 2001 From: Yashas Date: Sat, 29 Jun 2019 14:57:00 +0530 Subject: [PATCH 031/129] add permute layer --- modules/dnn/src/cuda/array.hpp | 73 +++++++++++++ modules/dnn/src/cuda/permute.cu | 110 ++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 3 + modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 15 +++ modules/dnn/src/layers/permute_layer.cpp | 71 ++++++++++++- 5 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 modules/dnn/src/cuda/array.hpp create mode 100644 modules/dnn/src/cuda/permute.cu diff --git a/modules/dnn/src/cuda/array.hpp b/modules/dnn/src/cuda/array.hpp new file mode 100644 index 000000000000..5717cb11e615 --- /dev/null +++ b/modules/dnn/src/cuda/array.hpp @@ -0,0 +1,73 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP +#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace utils { + template + struct array { + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = typename std::add_lvalue_reference::type; + using const_reference = typename std::add_lvalue_reference::type>::type; + using pointer = typename std::add_pointer::type; + using const_pointer = typename std::add_pointer::type>::type; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + bool empty() const noexcept { return N == 0; } + size_type size() const noexcept { return N; } + + __host__ __device__ iterator begin() noexcept { return ptr; } + __host__ __device__ iterator end() noexcept { return ptr + N; } + __host__ __device__ const_iterator begin() const noexcept { return ptr; } + __host__ __device__ const_iterator end() const noexcept { return ptr + N; } + + __host__ __device__ const_iterator cbegin() const noexcept { return ptr; } + __host__ __device__ const_iterator cend() const noexcept { return ptr + N; } + + __host__ __device__ iterator rbegin() noexcept { return ptr + N; } + __host__ __device__ iterator rend() noexcept { return ptr; } + __host__ __device__ const_iterator rbegin() const noexcept { return ptr + N; } + __host__ __device__ const_iterator rend() const noexcept { return ptr; } + + __host__ __device__ const_iterator crbegin() const noexcept { return ptr + N; } + __host__ __device__ const_iterator crend() const noexcept { return ptr; } + + template + __host__ void assign(InputItr first, InputItr last) { + std::copy(first, last, std::begin(ptr)); + } + + __host__ __device__ reference operator[](int idx) { return ptr[idx]; } + __host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; } + + __host__ __device__ reference front() { return ptr[0]; } + __host__ __device__ const_reference front() const { return ptr[0]; } + + __host__ __device__ reference back() { return ptr[N - 1]; } + __host__ __device__ const_reference back() const { return ptr[N - 1]; } + + __host__ __device__ pointer data() noexcept { return ptr; } + __host__ __device__ const_pointer data() const noexcept { return ptr; } + + T ptr[N]; + }; + } + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */ diff --git 
a/modules/dnn/src/cuda/permute.cu b/modules/dnn/src/cuda/permute.cu new file mode 100644 index 000000000000..5c0ae99acbbe --- /dev/null +++ b/modules/dnn/src/cuda/permute.cu @@ -0,0 +1,110 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include + +#include "array.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/pointer.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void permute( + std::size_t n, utils::array axis_order, + DevicePtr output, utils::array outStrides, + DevicePtr input, utils::array inStrides) + { + for (auto i : grid_stride_range(n)) { + int oldPosition = 0; + int newPosition = i; + + for (int j = 0; j < N; j++) + { + int order = axis_order[j]; + oldPosition += (newPosition / outStrides[j]) * inStrides[order]; + newPosition %= outStrides[j]; + } + + output[i] = input[oldPosition]; + } + } + } + + template static + void launch_permute_kernel( + const Stream& stream, + std::size_t n, const std::vector& order, + DevicePtr output, const std::vector& outStride, + DevicePtr input, const std::vector& inStride) + { + CV_Assert(order.size() == N); + CV_Assert(outStride.size() == N); + CV_Assert(inStride.size() == N); + + utils::array order_k, outStride_k, inStride_k; + order_k.assign(std::begin(order), std::end(order)); + outStride_k.assign(std::begin(outStride), std::end(outStride)); + inStride_k.assign(std::begin(inStride), std::end(inStride)); + + auto kernel = raw::permute; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, n, order_k, output, outStride_k, input, inStride_k); + } + + template + void permute( + const Stream& stream, + TensorSpan output, TensorView input, + const std::vector& order) + { + CV_Assert(output.rank == input.rank); + CV_Assert(input.size() == output.size()); + CV_Assert(order.size() >= 3 && order.size() <= 5); + CV_Assert(input.rank >= order.size()); + CV_Assert(output.rank >= order.size()); + CV_Assert(get_effective_rank(input) <= order.size()); + CV_Assert(get_effective_rank(output) <= order.size()); + + int rank = output.rank; + auto inShape = input.shape(); + auto outShape = output.shape(); + + std::vector inStride(rank), outStride(rank); + inStride.back() = 1; + outStride.back() = 1; + /* garbage, ..., garbage, 1 */ + + std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride)); + std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride)); + /* dim[0], dim[1], ..., dim[-1], 1 */ + + std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies()); + std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies()); + /* stride[0], stride[1], ..., stride[-2], 1 */ + + if (order.size() != rank) { + auto diff = rank - order.size(); + outStride.erase(outStride.begin(), outStride.begin() + diff); + inStride.erase(inStride.begin(), inStride.begin() + diff); + } + + if (rank == 5) { + launch_permute_kernel(stream, input.size(), order, output.get(), outStride, input.get(), inStride); + } else if (rank == 4) { + launch_permute_kernel(stream, input.size(), order, output.get(), outStride, input.get(), inStride); + } else if (rank == 3) { + 
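+ /* the rank is a template parameter so the kernel's index arrays have a fixed, compile-time size */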
launch_permute_kernel(stream, input.size(), order, output.get(), outStride, input.get(), inStride); + } + } + + template void permute(const Stream&, TensorSpan, TensorView, const std::vector&); + template void permute(const Stream&, TensorSpan, TensorView, const std::vector&); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 88640ef8324a..eed197dfc455 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -74,6 +74,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void eltwise_prod_2(const Stream& stream, span output, view x, view y); + template + void permute(const Stream& stream, TensorSpan output, TensorView input, const std::vector& order); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index bed909c225e9..b0cf6e39be67 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -37,6 +37,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { memcpy(dest.get(), src.get(), dest.size()); } + /** @brief permutes the dimensions of a tensor + * + * Pre-conditions: + * - \p dest and \p src must have the same number of elements + * + * Exception Gaurantee: Basic + */ + template inline + void permute(const Stream& stream, TensorSpan dest, TensorView src, const std::vector& order_) { + std::vector order; + for (const auto& sz : order_) + order.push_back(clamp_axis(sz, src.rank)); + csl::kernels::permute(stream, dest, src, order); + } + /** @brief performs generalized matrix-multiplication * * Pre-conditions: diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 6c0b53ffe926..0f5985ae2328 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" #include @@ -51,6 +52,12 @@ #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -106,6 +113,7 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine()) || (backendId == DNN_BACKEND_VKCOM && haveVulkan()); } @@ -372,6 +380,49 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_UNUSED(workspace); + CV_Assert(outputs.size() == 1); + + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (!_needsPermute) + { + if (input.get() != output.get()) + csl::tensor_ops::copy(stream, output, input); + } + else + { + std::vector order(std::begin(_order), std::end(_order)); + csl::tensor_ops::permute(stream, 
output, input, order); + } + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + virtual Ptr initVkCom(const std::vector > &input) CV_OVERRIDE { #ifdef HAVE_VULKAN @@ -382,14 +433,30 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer return Ptr(); } -#ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { +#ifdef HAVE_INF_ENGINE +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) InferenceEngine::Builder::PermuteLayer ieLayer(name); ieLayer.setOrder(_order); return Ptr(new InfEngineBackendNode(ieLayer)); - } +#else + InferenceEngine::LayerParams lp; + lp.name = name; + lp.type = "Permute"; + lp.precision = InferenceEngine::Precision::FP32; + std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); + + CV_Assert(!_order.empty()); + ieLayer->params["order"] = format("%zu", _order[0]); + for (int i = 1; i < _order.size(); ++i) + ieLayer->params["order"] += format(",%zu", _order[i]); + + return Ptr(new InfEngineBackendNode(ieLayer)); +#endif #endif // HAVE_INF_ENGINE + return Ptr(); + } size_t _count; std::vector _order; From bf114d7e1b376616938f1690aaf558011b65fcac Mon Sep 17 00:00:00 2001 From: Yashas Date: Sat, 29 Jun 2019 21:27:37 +0530 Subject: [PATCH 032/129] add padding support for concat layer --- modules/dnn/src/cuda/concat.cu | 110 +++++++++++++++++++++-- modules/dnn/src/cuda4dnn/csl/kernels.hpp | 3 + modules/dnn/src/layers/concat_layer.cpp | 65 ++++++++------ 3 files changed, 145 insertions(+), 33 deletions(-) diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index 6c9ff17921b8..f3176007a0a7 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -4,6 +4,8 @@ #include +#include "array.hpp" + #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" #include "../cuda4dnn/csl/tensor.hpp" @@ -16,10 +18,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k /* Reference: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu */ template __global__ void concat( - DevicePtr output, DevicePtr input, - std::size_t concat_size, std::size_t input_concat_axis_size, - std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis, - std::size_t n) + std::size_t n, + DevicePtr output, std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis, + DevicePtr input, std::size_t concat_size, std::size_t input_concat_axis_size) { for (auto idx : grid_stride_range(n)) { const auto total_concat_size = concat_size * input_concat_axis_size; @@ -31,6 +32,35 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k output[top_index] = input[idx]; } } + + template + __global__ void concat_with_axis_offset( + std::size_t n, + DevicePtr output, utils::array outStrides, utils::array outOffset, + DevicePtr input, utils::array inStrides) + { + using utils::array; + + for (auto i : grid_stride_range(n)) { + /* compute input indices corresponding to element 'i' */ + array in_index; + in_index[0] = i / inStrides[0]; + for (int j = 1; j < N; j++) + in_index[j] = (i % inStrides[j - 1]) / inStrides[j]; + + /* compute output indices corresponding to element 'i' */ + array out_index; + for (int j = 0; j < N; j++) + out_index[j] = outOffset[j] + in_index[j]; + + /* compute output element 
index from output indices */ + int oidx = 0; + for (int j = 0; j < N; j++) + oidx += out_index[j] * outStrides[j]; + + output[oidx] = input[i]; + } + } } template @@ -42,10 +72,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k { auto policy = make_policy(raw::concat, 0, stream); launch_kernel(raw::concat, policy, - output.get(), input.get(), - concat_size, input_concat_axis_size, - output_concat_axis_size, output_offset_concat_axis, - input.size()); + input.size(), + output.get(), output_concat_axis_size, output_offset_concat_axis, + input.get(), concat_size, input_concat_axis_size); } template void concat( @@ -60,4 +89,69 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k std::size_t concat_size, std::size_t input_concat_axis_size, std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + template static + void launch_concat_with_axis_offset_kernel( + const Stream& stream, + std::size_t n, + DevicePtr output, const std::vector& outStride, const std::vector& outOffset, + DevicePtr input, const std::vector& inStride) + { + CV_Assert(outStride.size() == N); + CV_Assert(outOffset.size() == N); + CV_Assert(inStride.size() == N); + + utils::array outStride_k, outOffset_k, inStride_k; + outStride_k.assign(std::begin(outStride), std::end(outStride)); + outOffset_k.assign(std::begin(outOffset), std::end(outOffset)); + inStride_k.assign(std::begin(inStride), std::end(inStride)); + + auto kernel = raw::concat_with_axis_offset; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, n, output, outStride_k, outOffset_k, input, inStride_k); + } + + template + void concat_with_axis_offset( + const Stream& stream, + TensorSpan output, TensorView input, + const std::vector& offset) + { + CV_Assert(output.rank == input.rank); + CV_Assert(output.rank >= 3 && output.rank <= 5); + + int rank = output.rank; + auto inShape = input.shape(); + auto outShape = output.shape(); + + std::vector inStride(rank), outStride(rank); + inStride.back() = 1; + outStride.back() = 1; + /* garbage, ..., garbage, 1 */ + + std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride)); + std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride)); + /* dim[0], dim[1], ..., dim[-1], 1 */ + + std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies()); + std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies()); + /* stride[0], stride[1], ..., stride[-2], 1 */ + + if (offset.size() != rank) { + auto diff = rank - offset.size(); + outStride.erase(outStride.begin(), outStride.begin() + diff); + inStride.erase(inStride.begin(), inStride.begin() + diff); + } + + if (rank == 5) { + launch_concat_with_axis_offset_kernel(stream, input.size(), output.get(), outStride, offset, input.get(), inStride); + } else if (rank == 4) { + launch_concat_with_axis_offset_kernel(stream, input.size(), output.get(), outStride, offset, input.get(), inStride); + } else if (rank == 3) { + launch_concat_with_axis_offset_kernel(stream, input.size(), output.get(), outStride, offset, input.get(), inStride); + } + } + + template void concat_with_axis_offset(const Stream&, TensorSpan, TensorView, const std::vector&); + template void concat_with_axis_offset(const Stream&, TensorSpan, TensorView, const std::vector&); + }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp 
index eed197dfc455..d2cf4d59f1b8 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -77,6 +77,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void permute(const Stream& stream, TensorSpan output, TensorView input, const std::vector& order); + template + void concat_with_axis_offset(const Stream& stream, TensorSpan output, TensorView input, const std::vector& axis_offset); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 2ada0aec3d3a..4e99cf5abebd 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -112,7 +112,7 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_CUDA && haveCUDA() && !padding) || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) || // By channels (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && !padding) || (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding); @@ -246,41 +246,57 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace - ) override + csl::Workspace& workspace) override { - CV_UNUSED(workspace); - auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); + auto outShape = output_wrapper->getShape(); - auto output_concat_axis = [&] { - auto actual_dims = output_wrapper->getShape().size(); + auto concat_axis = [&] { + auto actual_dims = outShape.size(); auto extra_dims = output.rank - actual_dims; return clamp(axis, actual_dims) + extra_dims; }(); - std::size_t concat_size = 1; - for (auto i = output_concat_axis + 1; i < output.rank; i++) - concat_size *= output.get_axis_size(i); + if (!padding) + { + std::size_t concat_size = 1; + for (auto i = concat_axis + 1; i < output.rank; i++) + concat_size *= output.get_axis_size(i); + + std::size_t output_concat_axis_offset = 0; + for (std::size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + csl::kernels::concat(stream, output, input, + concat_size, input.get_axis_size(concat_axis), + output.get_axis_size(concat_axis), output_concat_axis_offset); - std::size_t output_concat_axis_offset = 0; - for (std::size_t i = 0; i < inputs.size(); i++) + output_concat_axis_offset += input.get_axis_size(concat_axis); + } + } + else /* if(padding) */ { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); + csl::memset(output.get(), 0, output.size(), stream); - auto input_concat_axis = [&] { - auto actual_dims = input_wrapper->getShape().size(); - auto extra_dims = input.rank - actual_dims; - return clamp(axis, actual_dims) + extra_dims; - }(); + std::size_t output_concat_axis_offset = 0; + for (size_t i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + auto inShape = input_wrapper->getShape(); - csl::kernels::concat(stream, output, input, - concat_size, input.get_axis_size(input_concat_axis), - output.get_axis_size(output_concat_axis), output_concat_axis_offset); + std::vector offsets(inShape.size()); + 
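+ /* center each input inside the padded output; the concatenation axis instead uses the running offset set below */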
for (int j = 0; j < offsets.size(); j++) + offsets[j] = (outShape[j] - inShape[j]) / 2; + offsets[concat_axis] = output_concat_axis_offset; - output_concat_axis_offset += input.get_axis_size(input_concat_axis); + csl::kernels::concat_with_axis_offset(stream, output, input, offsets); + + output_concat_axis_offset += input.get_axis_size(concat_axis); + } } } @@ -289,8 +305,7 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs - ) override + const std::vector>& inputs) override { stream = std::move(stream_); } From 0ab06a903b9c66111ef02205f36a33cf0ea5b1a2 Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 1 Jul 2019 18:25:23 +0530 Subject: [PATCH 033/129] refactor and reorganize code --- modules/dnn/src/cuda/activations.cu | 100 +++++++++--------- modules/dnn/src/cuda/concat.cu | 105 +++++++++---------- modules/dnn/src/cuda/eltwise_ops.cu | 51 ++++++---- modules/dnn/src/cuda/permute.cu | 33 +++--- modules/dnn/src/cuda/scale.cu | 107 +++++++++++--------- modules/dnn/src/cuda4dnn/csl/kernels.hpp | 24 +++-- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 12 +-- modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 31 +++--- modules/dnn/src/layers/batch_norm_layer.cpp | 2 +- modules/dnn/src/layers/concat_layer.cpp | 26 ++--- modules/dnn/src/layers/scale_layer.cpp | 4 +- 11 files changed, 257 insertions(+), 238 deletions(-) diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index a993fa8fd095..871e4ce9d3ad 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -2,8 +2,6 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. -#include - #include "math.hpp" #include "../cuda4dnn/csl/kernels.hpp" @@ -11,12 +9,16 @@ #include "../cuda4dnn/csl/span.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include + +#include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { template __global__ void abs(span dest, view src) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { using utils::abs; dest[i] = abs(src[i]); @@ -25,7 +27,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void tanh(span dest, view src) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { using utils::tanh; dest[i] = tanh(src[i]); @@ -34,7 +35,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void sigmoid(span dest, view src) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { using utils::sigmoid; dest[i] = sigmoid(src[i]); @@ -43,7 +43,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void bnll(span dest, view src) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { using utils::log1pexp; dest[i] = src[i] > 0 ? src[i] + log1pexp(-src[i]) : log1pexp(src[i]); @@ -52,7 +51,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void elu(span dest, view src) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { using utils::exp; dest[i] = src[i] >= 0 ? 
src[i] : (exp(src[i]) - 1); @@ -61,15 +59,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void relu(span dest, view src, T slope) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) dest[i] = src[i] >= 0.0 ? src[i] : slope * src[i]; } template __global__ void clipped_relu(span dest, view src, T floor, T ceiling) { - assert(src.size() >= dest.size()); - assert(floor <= ceiling); for (auto i : grid_stride_range(dest.size())) { using utils::max; using utils::min; @@ -78,17 +73,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void axiswise_relu(span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size) { - assert(src.size() >= dest.size()); + __global__ void axiswise_relu(span dest, view src, std::size_t inner_size, view slope) { for (auto i : grid_stride_range(dest.size())) { - const auto c = (i % inner_size) / channel_size; + const auto c = (i % inner_size) / slope.size(); dest[i] = src[i] < 0 ? src[i] * slope[c] : src[i]; } } template __global__ void power(span dest, view src, T exp, T scale, T shift) { - assert(src.size() >= dest.size()); for (auto i : grid_stride_range(dest.size())) { using utils::pow; dest[i] = pow(shift + scale * src[i], exp); @@ -100,8 +93,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k void abs(const Stream& stream, span dest, view src) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::abs, 0, stream); - launch_kernel(raw::abs, policy, dest, src); + auto kernel = raw::abs; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src); } template void abs(const Stream& stream, span dest, view src); @@ -111,89 +105,97 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k void tanh(const Stream& stream, span dest, view src) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::tanh, 0, stream); - launch_kernel(raw::tanh, policy, dest, src); + auto kernel = raw::tanh; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src); } - template void tanh(const Stream& stream, span dest, view src); - template void tanh(const Stream& stream, span dest, view src); + template void tanh(const Stream&, span, view); + template void tanh(const Stream&, span, view); template void sigmoid(const Stream& stream, span dest, view src) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::sigmoid, 0, stream); - launch_kernel(raw::sigmoid, policy, dest, src); + auto kernel = raw::sigmoid; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src); } - template void sigmoid(const Stream& stream, span dest, view src); - template void sigmoid(const Stream& stream, span dest, view src); + template void sigmoid(const Stream&, span, view); + template void sigmoid(const Stream&, span, view); template void bnll(const Stream& stream, span dest, view src) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::bnll, 0, stream); - launch_kernel(raw::bnll, policy, dest, src); + auto kernel = raw::bnll; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src); } - template void bnll(const Stream& stream, span dest, view src); - template void bnll(const Stream& stream, span dest, view src); + template void bnll(const Stream&, span, view); + template void bnll(const 
Stream&, span, view); template void elu(const Stream& stream, span dest, view src) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::elu, 0, stream); - launch_kernel(raw::elu, policy, dest, src); + auto kernel = raw::elu; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src); } - template void elu(const Stream& stream, span dest, view src); - template void elu(const Stream& stream, span dest, view src); + template void elu(const Stream&, span, view); + template void elu(const Stream&, span, view); template void relu(const Stream& stream, span dest, view src, T slope) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::relu, 0, stream); - launch_kernel(raw::relu, policy, dest, src, slope); + auto kernel = raw::relu; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src, slope); } - template void relu(const Stream& stream, span dest, view src, float slope); - template void relu(const Stream& stream, span dest, view src, double slope); + template void relu(const Stream&, span, view, float); + template void relu(const Stream&, span, view, double); template void clipped_relu(const Stream& stream, span dest, view src, T floor, T ceiling) { CV_Assert(src.size() >= dest.size()); CV_Assert(floor <= ceiling); - auto policy = make_policy(raw::clipped_relu, 0, stream); - launch_kernel(raw::clipped_relu, policy, dest, src, floor, ceiling); + auto kernel = raw::clipped_relu; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src, floor, ceiling); } - template void clipped_relu(const Stream& stream, span dest, view src, float floor, float ceiling); - template void clipped_relu(const Stream& stream, span dest, view src, double floor, double ceiling); + template void clipped_relu(const Stream&, span, view, float, float); + template void clipped_relu(const Stream&, span, view, double, double); template - void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size) { + void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::axiswise_relu, 0, stream); - launch_kernel(raw::axiswise_relu, policy, dest, src, slope, inner_size, channel_size); + auto kernel = raw::axiswise_relu; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src, inner_size, slope); } - template void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size); - template void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size); + template void axiswise_relu(const Stream&, span, view, view, std::size_t); + template void axiswise_relu(const Stream&, span, view, view, std::size_t); template void power(const Stream& stream, span dest, view src, T exp, T scale, T shift) { CV_Assert(src.size() >= dest.size()); - auto policy = make_policy(raw::power, 0, stream); - launch_kernel(raw::power, policy, dest, src, exp, scale, shift); + auto kernel = raw::power; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, dest, src, exp, scale, shift); } - template void power(const Stream& stream, span dest, view src, float exp, float scale, float shift); - template void power(const Stream& stream, span dest, view src, double exp, double scale, double 
shift); + template void power(const Stream&, span, view, float, float, float); + template void power(const Stream&, span, view, double, double, double); }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index f3176007a0a7..4b54fe5defd0 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -18,45 +18,45 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k /* Reference: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu */ template __global__ void concat( - std::size_t n, - DevicePtr output, std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis, - DevicePtr input, std::size_t concat_size, std::size_t input_concat_axis_size) + span output, std::size_t output_axis_size, std::size_t output_axis_offset, + view input, std::size_t input_axis_size, std::size_t concat_size) { - for (auto idx : grid_stride_range(n)) { - const auto total_concat_size = concat_size * input_concat_axis_size; + /* we need to copy all the elements of input to some location in the output */ + for (auto idx : grid_stride_range(input.size())) { + const auto total_concat_size = concat_size * input_axis_size; const auto concat_num = idx / total_concat_size; const auto concat_index = idx % total_concat_size; const auto top_index = concat_index + - (concat_num * output_concat_axis_size + output_offset_concat_axis) * concat_size; + (concat_num * output_axis_size + output_axis_offset) * concat_size; output[top_index] = input[idx]; } } template - __global__ void concat_with_axis_offset( - std::size_t n, - DevicePtr output, utils::array outStrides, utils::array outOffset, - DevicePtr input, utils::array inStrides) - { - using utils::array; + using array = utils::array; - for (auto i : grid_stride_range(n)) { - /* compute input indices corresponding to element 'i' */ - array in_index; - in_index[0] = i / inStrides[0]; + template + __global__ void concat_with_offsets( + span output, array out_strides, array out_offset, + view input, array in_strides) + { + for (auto i : grid_stride_range(input.size())) { + /* compute input axis indices corresponding to element 'i' */ + array in_index; + in_index[0] = i / in_strides[0]; for (int j = 1; j < N; j++) - in_index[j] = (i % inStrides[j - 1]) / inStrides[j]; + in_index[j] = (i % in_strides[j - 1]) / in_strides[j]; - /* compute output indices corresponding to element 'i' */ - array out_index; + /* compute output axis indices corresponding to element 'i' */ + array out_index; for (int j = 0; j < N; j++) - out_index[j] = outOffset[j] + in_index[j]; + out_index[j] = out_offset[j] + in_index[j]; - /* compute output element index from output indices */ - int oidx = 0; + /* compute output element number from output axis indices */ + std::size_t oidx = 0; for (int j = 0; j < N; j++) - oidx += out_index[j] * outStrides[j]; + oidx += out_index[j] * out_strides[j]; output[oidx] = input[i]; } @@ -66,35 +66,30 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void concat( const Stream& stream, - TensorSpan output, TensorView input, - std::size_t concat_size, std::size_t input_concat_axis_size, - std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis) + TensorSpan output, std::size_t output_axis_offset, + TensorView input, std::size_t axis) { + std::size_t concat_size = 1; + for (int i = axis + 1; i < output.rank; i++) + concat_size *= output.get_axis_size(i); + + 
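+        /* For intuition, assuming an NCHW tensor concatenated along the channel axis:
+         * concat_size works out to H * W, the number of elements per channel. A flat
+         * input index then splits as
+         *     concat_num   = idx / (C_in * H * W)   // sample index
+         *     concat_index = idx % (C_in * H * W)   // offset within that sample
+         * and raw::concat places the element at
+         *     (concat_num * C_out + output_axis_offset) * H * W + concat_index
+         * in the output, i.e. the whole input block lands at the requested channel offset. */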
std::size_t input_axis_size = input.get_axis_size(axis); + std::size_t output_axis_size = output.get_axis_size(axis); + auto policy = make_policy(raw::concat, 0, stream); launch_kernel(raw::concat, policy, - input.size(), - output.get(), output_concat_axis_size, output_offset_concat_axis, - input.get(), concat_size, input_concat_axis_size); + output, output_axis_size, output_axis_offset, + input, input_axis_size, concat_size); } - template void concat( - const Stream& stream, - TensorSpan output, TensorView input, - std::size_t concat_size, std::size_t input_concat_axis_size, - std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); - - template void concat( - const Stream& stream, - TensorSpan output, TensorView input, - std::size_t concat_size, std::size_t input_concat_axis_size, - std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + template void concat(const Stream&, TensorSpan, std::size_t, TensorView, std::size_t); + template void concat(const Stream&, TensorSpan, std::size_t, TensorView, std::size_t); template static - void launch_concat_with_axis_offset_kernel( + void launch_concat_with_offsets_kernel( const Stream& stream, - std::size_t n, - DevicePtr output, const std::vector& outStride, const std::vector& outOffset, - DevicePtr input, const std::vector& inStride) + span output, const std::vector& outStride, const std::vector& outOffset, + view input, const std::vector& inStride) { CV_Assert(outStride.size() == N); CV_Assert(outOffset.size() == N); @@ -105,16 +100,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k outOffset_k.assign(std::begin(outOffset), std::end(outOffset)); inStride_k.assign(std::begin(inStride), std::end(inStride)); - auto kernel = raw::concat_with_axis_offset; + auto kernel = raw::concat_with_offsets; auto policy = make_policy(kernel, 0, stream); - launch_kernel(kernel, policy, n, output, outStride_k, outOffset_k, input, inStride_k); + launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k); } template - void concat_with_axis_offset( + void concat_with_offsets( const Stream& stream, TensorSpan output, TensorView input, - const std::vector& offset) + const std::vector& offsets) { CV_Assert(output.rank == input.rank); CV_Assert(output.rank >= 3 && output.rank <= 5); @@ -136,22 +131,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies()); /* stride[0], stride[1], ..., stride[-2], 1 */ - if (offset.size() != rank) { - auto diff = rank - offset.size(); + if (offsets.size() != rank) { + auto diff = rank - offsets.size(); outStride.erase(outStride.begin(), outStride.begin() + diff); inStride.erase(inStride.begin(), inStride.begin() + diff); } if (rank == 5) { - launch_concat_with_axis_offset_kernel(stream, input.size(), output.get(), outStride, offset, input.get(), inStride); + launch_concat_with_offsets_kernel(stream, output, outStride, offsets, input, inStride); } else if (rank == 4) { - launch_concat_with_axis_offset_kernel(stream, input.size(), output.get(), outStride, offset, input.get(), inStride); + launch_concat_with_offsets_kernel(stream, output, outStride, offsets, input, inStride); } else if (rank == 3) { - launch_concat_with_axis_offset_kernel(stream, input.size(), output.get(), outStride, offset, input.get(), inStride); + launch_concat_with_offsets_kernel(stream, output, outStride, offsets, input, inStride); } } - template 
void concat_with_axis_offset(const Stream&, TensorSpan, TensorView, const std::vector&); - template void concat_with_axis_offset(const Stream&, TensorSpan, TensorView, const std::vector&); + template void concat_with_offsets(const Stream&, TensorSpan, TensorView, const std::vector&); + template void concat_with_offsets(const Stream&, TensorSpan, TensorView, const std::vector&); }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index d7f615d4b152..98d3eaab3916 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -9,6 +9,8 @@ #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" +#include + #include namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { @@ -16,9 +18,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k namespace raw { template __global__ void eltwise_max_2(span output, view x, view y) { - assert(x.size() == y.size()); - assert(output.size() >= y.size()); - for (auto i : grid_stride_range(output.size())) { using utils::max; output[i] = max(x[i], y[i]); @@ -27,27 +26,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void eltwise_sum_2(span output, view x, view y) { - assert(x.size() == y.size()); - assert(output.size() >= y.size()); - for (auto i : grid_stride_range(output.size())) output[i] = x[i] + y[i]; } template __global__ void eltwise_sum_coeff_2(span output, T coeff_x, view x, T coeff_y, view y) { - assert(x.size() == y.size()); - assert(output.size() >= y.size()); - for (auto i : grid_stride_range(output.size())) output[i] = coeff_x * x[i] + coeff_y * y[i]; } template __global__ void eltwise_prod_2(span output, view x, view y) { - assert(x.size() == y.size()); - assert(output.size() >= y.size()); - for (auto i : grid_stride_range(output.size())) output[i] = x[i] * y[i]; } @@ -55,8 +45,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void eltwise_max_2(const Stream& stream, span output, view x, view y) { - auto policy = make_policy(raw::eltwise_max_2, 0, stream); - launch_kernel(raw::eltwise_max_2, policy, output, x, y); + CV_Assert(x.size() == y.size()); + CV_Assert(output.size() == x.size()); + + auto kernel = raw::eltwise_max_2; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, x, y); } template void eltwise_max_2(const Stream& stream, span output, view x, view y); @@ -64,8 +58,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void eltwise_sum_2(const Stream& stream, span output, view x, view y) { - auto policy = make_policy(raw::eltwise_sum_2, 0, stream); - launch_kernel(raw::eltwise_sum_2, policy, output, x, y); + CV_Assert(x.size() == y.size()); + CV_Assert(output.size() == x.size()); + + auto kernel = raw::eltwise_sum_2; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, x, y); } template void eltwise_sum_2(const Stream& stream, span output, view x, view y); @@ -73,8 +71,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void eltwise_sum_coeff_2(const Stream& stream, span output, T coeff_x, view x, T coeff_y, view y) { - auto policy = make_policy(raw::eltwise_sum_coeff_2, 0, stream); - launch_kernel(raw::eltwise_sum_coeff_2, policy, output, coeff_x, x, coeff_y, y); + CV_Assert(x.size() == y.size()); 
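+        /* The branch below is a small dispatch shortcut: when both coefficients are exactly
+         * one, coeff_x * x[i] + coeff_y * y[i] reduces to x[i] + y[i], so the plain
+         * eltwise_sum_2 kernel is launched and the two per-element multiplies are avoided. */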
+ CV_Assert(output.size() == x.size()); + + if (coeff_x == 1.0 && coeff_y == 1.0) { + eltwise_sum_2(stream, output, x, y); + return; + } + + auto kernel = raw::eltwise_sum_coeff_2; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, coeff_x, x, coeff_y, y); } template void eltwise_sum_coeff_2(const Stream&, span, float, view, float, view); @@ -82,8 +89,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void eltwise_prod_2(const Stream& stream, span output, view x, view y) { - auto policy = make_policy(raw::eltwise_prod_2, 0, stream); - launch_kernel(raw::eltwise_prod_2, policy, output, x, y); + CV_Assert(x.size() == y.size()); + CV_Assert(output.size() == x.size()); + + auto kernel = raw::eltwise_prod_2; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, x, y); } template void eltwise_prod_2(const Stream& stream, span output, view x, view y); diff --git a/modules/dnn/src/cuda/permute.cu b/modules/dnn/src/cuda/permute.cu index 5c0ae99acbbe..efd2c047f56a 100644 --- a/modules/dnn/src/cuda/permute.cu +++ b/modules/dnn/src/cuda/permute.cu @@ -2,26 +2,31 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. -#include - #include "array.hpp" #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" #include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/pointer.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include + +#include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + template + using array = utils::array; + template __global__ void permute( - std::size_t n, utils::array axis_order, - DevicePtr output, utils::array outStrides, - DevicePtr input, utils::array inStrides) + array axis_order, + span output, array outStrides, + view input, array inStrides) { - for (auto i : grid_stride_range(n)) { + for (auto i : grid_stride_range(input.size())) { int oldPosition = 0; int newPosition = i; @@ -40,9 +45,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template static void launch_permute_kernel( const Stream& stream, - std::size_t n, const std::vector& order, - DevicePtr output, const std::vector& outStride, - DevicePtr input, const std::vector& inStride) + const std::vector& order, + span output, const std::vector& outStride, + view input, const std::vector& inStride) { CV_Assert(order.size() == N); CV_Assert(outStride.size() == N); @@ -55,7 +60,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k auto kernel = raw::permute; auto policy = make_policy(kernel, 0, stream); - launch_kernel(kernel, policy, n, order_k, output, outStride_k, input, inStride_k); + launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k); } template @@ -96,11 +101,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } if (rank == 5) { - launch_permute_kernel(stream, input.size(), order, output.get(), outStride, input.get(), inStride); + launch_permute_kernel(stream, order, output, outStride, input, inStride); } else if (rank == 4) { - launch_permute_kernel(stream, input.size(), order, output.get(), outStride, input.get(), inStride); + launch_permute_kernel(stream, order, output, outStride, input, inStride); } else if (rank == 3) { - launch_permute_kernel(stream, input.size(), order, 
output.get(), outStride, input.get(), inStride); + launch_permute_kernel(stream, order, output, outStride, input, inStride); } } diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu index 85eb18c71d22..898c05354ab2 100644 --- a/modules/dnn/src/cuda/scale.cu +++ b/modules/dnn/src/cuda/scale.cu @@ -8,40 +8,61 @@ #include "../cuda4dnn/csl/pointer.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include + +#include #include namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { template - __global__ void scale( - std::size_t n, - DevicePtr output, - DevicePtr input, std::size_t inner_size, - DevicePtr weights, std::size_t scale_size) + __global__ void scale1(span output, view input, T alpha) { - for (auto i : grid_stride_range(n)) { - const auto scale_idx = (i / inner_size) % scale_size; + for (auto i : grid_stride_range(output.size())) + output[i] = alpha * input[i]; + } + + template + __global__ void scaleN(span output, view input, std::size_t inner_size, view weights) + { + for (auto i : grid_stride_range(output.size())) { + const auto scale_idx = (i / inner_size) % weights.size(); output[i] = input[i] * weights[scale_idx]; } } template - __global__ void scale_with_bias( - std::size_t n, - DevicePtr output, - DevicePtr input, std::size_t inner_size, - DevicePtr weights, DevicePtr bias, std::size_t scale_bias_size) + __global__ void scale1_with_bias1(span output, view input, T alpha, T beta) { - for (auto i : grid_stride_range(n)) { - const auto scale_idx = (i / inner_size) % scale_bias_size; + for (auto i : grid_stride_range(output.size())) + output[i] = alpha * input[i] + beta; + } + + template + __global__ void scaleN_with_biasN(span output, view input, std::size_t inner_size, view weights, view bias) + { + for (auto i : grid_stride_range(output.size())) { + const auto scale_idx = (i / inner_size) % weights.size(); output[i] = input[i] * weights[scale_idx] + bias[scale_idx]; } } } template - void scale( + void scale1(const Stream& stream, TensorSpan output, TensorView input, T alpha) { + CV_Assert(is_shape_same(input, output)); + + auto kernel = raw::scale1; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, input, alpha); + } + + template void scale1(const Stream&, TensorSpan, TensorView, float); + template void scale1(const Stream&, TensorSpan, TensorView, double); + + template + void scaleN( const Stream& stream, TensorSpan output, TensorView input, std::size_t inner_size, @@ -49,28 +70,28 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k { CV_Assert(is_shape_same(input, output)); - auto policy = make_policy(raw::scale, 0, stream); - launch_kernel(raw::scale, policy, - output.size(), - output.get(), - input.get(), inner_size, - weights.get(), weights.size()); + auto kernel = raw::scaleN; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights); } - template void scale( - const Stream& stream, - TensorSpan output, - TensorView input, std::size_t inner_size, - TensorView weights); + template void scaleN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView); + template void scaleN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView); - template void scale( - const Stream& stream, - TensorSpan output, - TensorView input, std::size_t inner_size, - TensorView weights); + template + void scale1_with_bias1(const Stream& stream, TensorSpan output, TensorView input, T alpha, T beta) 
{ + CV_Assert(is_shape_same(input, output)); + + auto kernel = raw::scale1_with_bias1; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, input, alpha, beta); + } + + template void scale1_with_bias1(const Stream&, TensorSpan, TensorView, float, float); + template void scale1_with_bias1(const Stream&, TensorSpan, TensorView, double, double); template - void scale_with_bias( + void scaleN_with_biasN( const Stream& stream, TensorSpan output, TensorView input, std::size_t inner_size, @@ -79,24 +100,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(is_shape_same(input, output)); CV_Assert(weights.size() == bias.size()); - auto policy = make_policy(raw::scale_with_bias, 0, stream); - launch_kernel(raw::scale_with_bias, policy, - output.size(), - output.get(), - input.get(), inner_size, - weights.get(), bias.get(), weights.size()); + auto kernel = raw::scaleN_with_biasN; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights, bias); } - template void scale_with_bias( - const Stream& stream, - TensorSpan output, - TensorView input, std::size_t inner_size, - TensorView weights, TensorView bias); - - template void scale_with_bias( - const Stream& stream, - TensorSpan output, - TensorView input, std::size_t inner_size, - TensorView weights, TensorView bias); + template void scaleN_with_biasN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView, TensorView); + template void scaleN_with_biasN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView, TensorView); }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index d2cf4d59f1b8..cf54f53cb899 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -36,7 +36,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke void clipped_relu(const Stream& stream, span dest, view src, T floor, T ceiling); template - void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size, std::size_t channel_size); + void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size); template void power(const Stream& stream, span dest, view src, T exp, T scale, T shift); @@ -44,19 +44,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void concat( const Stream& stream, - TensorSpan output, TensorView input, - std::size_t concat_size, std::size_t input_concat_axis_size, - std::size_t output_concat_axis_size, std::size_t output_offset_concat_axis); + TensorSpan output, std::size_t output_axis_offset, + TensorView input, std::size_t axis); template - void scale( - const Stream& stream, + void concat_with_offsets(const Stream& stream, TensorSpan output, TensorView input, const std::vector& axis_offsets); + + template + void scale1(const Stream& stream, TensorSpan output, TensorView input, T alpha); + + template + void scaleN(const Stream& stream, TensorSpan output, TensorView input, std::size_t inner_size, TensorView weights); template - void scale_with_bias( + void scale1_with_bias1(const Stream& stream, TensorSpan output, TensorView input, T alpha, T beta); + + template + void scaleN_with_biasN( const Stream& stream, TensorSpan output, TensorView input, std::size_t inner_size, @@ -77,9 +84,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { 
namespace csl { namespace ke template void permute(const Stream& stream, TensorSpan output, TensorView input, const std::vector& order); - template - void concat_with_axis_offset(const Stream& stream, TensorSpan output, TensorView input, const std::vector& axis_offset); - }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index 67a41f6704cb..d3f952aa0911 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -31,12 +31,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { - /** if the \p axis is a negative index, the equivalent postive index is returned; otherwise, returns \p axis */ - template - CUDA4DNN_HOST_DEVICE constexpr T clamp_axis(T axis, std::size_t rank) { - return axis < 0 ? axis + rank : axis; - } - /** \file tensor.hpp * * The tensor library contains three kinds of tensor objects which are summarized @@ -54,6 +48,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * "TensorType", frequently used as a template parameter, can refer to Tensor, TensorSpan or TensorView. */ + /** if the \p axis is a negative index, the equivalent postive index is returned; otherwise, returns \p axis */ + template + CUDA4DNN_HOST_DEVICE constexpr T clamp_axis(T axis, std::size_t rank) { + return axis < 0 ? axis + rank : axis; + } + /** @brief multi-dimensional contiguous GPU tensor containing elements of a single type * * \tparam T type of data stored by the tensor diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index b0cf6e39be67..680eeba8207c 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -31,10 +31,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Gaurantee: Basic */ template inline - void copy(const Stream& stream, TensorSpan dest, TensorView src) { + void copy(const Stream& stream, TensorSpan dest, TensorView src) { CV_Assert(is_shape_same(dest, src)); if (dest.get() != src.get()) - memcpy(dest.get(), src.get(), dest.size()); + memcpy(dest.get(), src.get(), dest.size(), stream); } /** @brief permutes the dimensions of a tensor @@ -112,12 +112,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { } /** @brief performs element-wise addition with broadcasting - * - * Pre-conditions: - * - \p A and \p C must be compatible tensors - * - * Exception Gaurantee: Basic - */ + * + * Pre-conditions: + * - \p A and \p C must be compatible tensors + * + * Exception Gaurantee: Basic + */ template inline void add(const cudnn::Handle& handle, T beta, TensorSpan C, T alpha, TensorView A) { CV_Assert(is_shape_compatible(A, C)); @@ -129,12 +129,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { } /** @brief performs element-wise addition with broadcasting - * - * Pre-conditions: - * - \p A and \p result must be compatible tensors - * - * Exception Gaurantee: Basic - */ + * + * Pre-conditions: + * - \p A and \p result must be compatible tensors + * + * Exception Gaurantee: Basic + */ template inline void softmax(const cudnn::Handle& handle, TensorSpan output, TensorView input, int channel_axis, bool log) { CV_Assert(is_shape_same(output, input)); @@ -188,8 +188,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { CV_Assert(is_shape_same(dest, src)); 
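    /* slope is expected to hold one coefficient per channel (PReLU-style broadcasting),
     * which is what the axis-size check below enforces; inner_size passed to the kernel
     * is the number of elements in one sample, i.e. C * H * W for an NCHW tensor. */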
CV_Assert(src.get_axis_size(1) == slope.size()); std::size_t inner_size = src.size() / src.get_axis_size(0); - std::size_t channel_size = inner_size / src.get_axis_size(1); - kernels::axiswise_relu(stream, dest, src, slope, inner_size, channel_size); + kernels::axiswise_relu(stream, dest, src, slope, inner_size); } template inline diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 2e74bc5217ac..e186dff2d74e 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -332,7 +332,7 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer auto input_shape = input_wrapper->getShape(); std::size_t inner_size = total(input_shape, 2, -1); - csl::kernels::scale_with_bias(stream, output, input, inner_size, weightsTensor, biasTensor); + csl::kernels::scaleN_with_biasN(stream, output, input, inner_size, weightsTensor, biasTensor); } void initCUDA( diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 4e99cf5abebd..efb52073ecbb 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -250,34 +250,28 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer { auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); - auto outShape = output_wrapper->getShape(); + auto output_shape = output_wrapper->getShape(); auto concat_axis = [&] { - auto actual_dims = outShape.size(); + auto actual_dims = output_shape.size(); auto extra_dims = output.rank - actual_dims; return clamp(axis, actual_dims) + extra_dims; }(); if (!padding) { - std::size_t concat_size = 1; - for (auto i = concat_axis + 1; i < output.rank; i++) - concat_size *= output.get_axis_size(i); - - std::size_t output_concat_axis_offset = 0; + std::size_t output_axis_offset = 0; for (std::size_t i = 0; i < inputs.size(); i++) { auto input_wrapper = inputs[i].dynamicCast(); auto input = input_wrapper->getView(); - csl::kernels::concat(stream, output, input, - concat_size, input.get_axis_size(concat_axis), - output.get_axis_size(concat_axis), output_concat_axis_offset); + csl::kernels::concat(stream, output, output_axis_offset, input, concat_axis); - output_concat_axis_offset += input.get_axis_size(concat_axis); + output_axis_offset += input.get_axis_size(concat_axis); } } - else /* if(padding) */ + else /* if(padding) */ { csl::memset(output.get(), 0, output.size(), stream); @@ -286,14 +280,14 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer { auto input_wrapper = inputs[i].dynamicCast(); auto input = input_wrapper->getView(); - auto inShape = input_wrapper->getShape(); + auto input_shape = input_wrapper->getShape(); - std::vector offsets(inShape.size()); + std::vector offsets(input_shape.size()); for (int j = 0; j < offsets.size(); j++) - offsets[j] = (outShape[j] - inShape[j]) / 2; + offsets[j] = (output_shape[j] - input_shape[j]) / 2; offsets[concat_axis] = output_concat_axis_offset; - csl::kernels::concat_with_axis_offset(stream, output, input, offsets); + csl::kernels::concat_with_offsets(stream, output, input, offsets); output_concat_axis_offset += input.get_axis_size(concat_axis); } diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 16d0ec80791f..15f904a41c60 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -201,9 +201,9 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer CV_Assert(hasWeights || hasBias); if 
(hasWeights && hasBias) - csl::kernels::scale_with_bias(stream, output, input, inner_size, weights, bias); + csl::kernels::scaleN_with_biasN(stream, output, input, inner_size, weights, bias); else if (hasWeights) - csl::kernels::scale(stream, output, input, inner_size, weights); + csl::kernels::scaleN(stream, output, input, inner_size, weights); else { /* rarely used codepath; hence, not optimized TODO */ From 5d2d336822311fd565bf4b9a5b29a03abbc056d1 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 2 Jul 2019 11:46:57 +0530 Subject: [PATCH 034/129] add normalize layer --- modules/dnn/src/cuda/math.hpp | 4 + modules/dnn/src/cuda/normalize.cu | 209 ++++++++++++++++++ modules/dnn/src/cuda/reduce.hpp | 23 ++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 7 + .../dnn/src/layers/normalize_bbox_layer.cpp | 73 +++++- 5 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 modules/dnn/src/cuda/normalize.cu create mode 100644 modules/dnn/src/cuda/reduce.hpp diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index e0773be0f969..6c49e4346b9d 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -51,6 +51,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); } template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); } + template __device__ T sqrt(T val); + template <> inline __device__ float sqrt(float val) { return sqrtf(val); } + template <> inline __device__ double sqrt(double val) { return ::sqrt(val); } + template __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } } diff --git a/modules/dnn/src/cuda/normalize.cu b/modules/dnn/src/cuda/normalize.cu new file mode 100644 index 000000000000..c916c8b9188a --- /dev/null +++ b/modules/dnn/src/cuda/normalize.cu @@ -0,0 +1,209 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
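+
+// The kernels below implement Lp normalization in two passes over a tensor viewed as
+// [outer_size, mid_size, inner_size]: a reduction pass accumulates sum(|x|^p) for each
+// slice into a caller-provided workspace, and a scaling pass multiplies the input by
+// 1 / (sum + epsilon)^(1/p). Specializations for inner_size == 1 with p equal to one or
+// two avoid the generic pow() path. A rough sketch of how a caller picks the sizes for
+// an NCHW tensor normalized across channels (start axis == end axis == 1):
+//
+//     outer_size = N;        // total(shape, 0, start_axis)
+//     mid_size   = C;        // total(shape, start_axis, end_axis + 1)
+//     inner_size = H * W;    // total(shape, end_axis + 1, -1)
+//     kernels::normalize(stream, output, input, outer_size, mid_size, inner_size,
+//                        pnorm, epsilon, workspace);  // workspace sized by the layer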
+ +#include "array.hpp" +#include "math.hpp" +#include "reduce.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void reduce_sum_powN(span output, + view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm) + { + for (int i = 0; i < outer_size; i++) { + for (int j = 0; j < mid_size; j++) { + const auto outer_offset = i * mid_size * inner_size; + const auto mid_offset = j * mid_size; + const auto total_offset = outer_offset + mid_offset; + + T thread_sum = 0; + for (auto idx : grid_stride_range(inner_size)) { + const auto full_idx = total_offset + idx; + thread_sum += utils::pow(utils::abs(input[full_idx]), norm); + } + + auto warp_sum = utils::warpReduceSum(thread_sum); + if ((threadIdx.x & (warpSize - 1)) == 0) + atomicAdd(&output[total_offset], warp_sum); + } + } + } + + template + __global__ void scale_inverse_powN(span output, + view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T episilon, T norm, + view sums) + { + for (int i = 0; i < outer_size; i++) { + for (int j = 0; j < mid_size; j++) { + const auto outer_offset = i * mid_size * inner_size; + const auto mid_offset = j * mid_size; + const auto total_offset = outer_offset + mid_offset; + + const auto scale = 1 / utils::pow(sums[total_offset] + episilon, 1 / norm); + for (auto idx : grid_stride_range(inner_size)) { + const auto full_idx = total_offset + idx; + output[full_idx] = input[full_idx] * scale; + } + } + } + } + + template + __global__ void reduce_sum_powN_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T pnorm) + { + for (int i = 0; i < outer_size; i++) { + const auto outer_offset = i * mid_size; + + T thread_sum = 0; + for (auto idx : grid_stride_range(mid_size)) { + const auto full_idx = outer_offset + idx; + thread_sum += utils::pow(input[full_idx], pnorm); + } + + auto warp_sum = utils::warpReduceSum(thread_sum); + if ((threadIdx.x & (warpSize - 1)) == 0) + atomicAdd(&output[i], warp_sum); + } + } + + template + __global__ void scale_inverse_powN_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T epsilon, T pnorm, + view sums) + { + for (int i = 0; i < outer_size; i++) { + const auto outer_offset = i * mid_size; + const auto scale = 1 / utils::pow(sums[i] + epsilon, 1/pnorm); + for (auto idx : grid_stride_range(mid_size)) { + const auto full_idx = outer_offset + idx; + output[full_idx] = input[full_idx] * scale; + } + } + } + + template + __global__ void reduce_sum_pow2_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size) + { + for (int i = 0; i < outer_size; i++) { + const auto outer_offset = i * mid_size; + + T thread_sum = 0; + for (auto idx : grid_stride_range(mid_size)) { + const auto full_idx = outer_offset + idx; + thread_sum += input[full_idx] * input[full_idx]; + } + + auto warp_sum = utils::warpReduceSum(thread_sum); + if ((threadIdx.x & (warpSize - 1)) == 0) + atomicAdd(&output[i], warp_sum); + } + } + + template + __global__ void scale_inverse_pow2_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T epsilon, + view sums) + { + for (int i = 0; i < outer_size; i++) { + const auto outer_offset = i * mid_size; + const auto scale = 1 / utils::sqrt(sums[i] + epsilon); 
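+                    /* sums[i] holds the per-slice sum of squares, so this is the usual L2
+                       normalization factor 1 / sqrt(||x||^2 + epsilon); a reciprocal square
+                       root (e.g. CUDA's rsqrtf) would express the same without the division. */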
+ for (auto idx : grid_stride_range(mid_size)) { + const auto full_idx = outer_offset + idx; + output[full_idx] = input[full_idx] * scale; + } + } + } + + template + __global__ void reduce_sum_pow1_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size) + { + for (int i = 0; i < outer_size; i++) { + const auto outer_offset = i * mid_size; + + T thread_sum = 0; + for (auto idx : grid_stride_range(mid_size)) { + const auto full_idx = outer_offset + idx; + thread_sum += utils::abs(input[full_idx]); + } + + auto warp_sum = utils::warpReduceSum(thread_sum); + if ((threadIdx.x & (warpSize - 1)) == 0) + atomicAdd(&output[i], warp_sum); + } + } + + template + __global__ void scale_inverse_pow1_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T epsilon, + view sums) + { + for (int i = 0; i < outer_size; i++) { + const auto outer_offset = i * mid_size; + const auto scale = 1/(sums[i] + epsilon); + for (auto idx : grid_stride_range(mid_size)) { + const auto full_idx = outer_offset + idx; + output[full_idx] = input[full_idx] * scale; + } + } + } + } + + template + void normalize( + const Stream& stream, + span output, + view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm, T epsilon, + span workspace) + { + if (inner_size == 1) { + CV_Assert(workspace.size() >= outer_size); + if (norm == 1) { + auto reduce_kernel = raw::reduce_sum_pow1_inner1; + auto policy = make_policy(reduce_kernel, 0, stream); + launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size); + + auto scale_kernel = raw::scale_inverse_pow1_inner1; + policy = make_policy(scale_kernel, 0, stream); + launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, epsilon, workspace); + } else if (norm == 2) { + auto reduce_kernel = raw::reduce_sum_pow2_inner1; + auto policy = make_policy(reduce_kernel, 0, stream); + launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size); + + auto scale_kernel = raw::scale_inverse_pow2_inner1; + policy = make_policy(scale_kernel, 0, stream); + launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, epsilon, workspace); + } else { + auto reduce_kernel = raw::reduce_sum_powN_inner1; + auto policy = make_policy(reduce_kernel, 0, stream); + launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size, norm); + + auto scale_kernel = raw::scale_inverse_powN_inner1; + policy = make_policy(scale_kernel, 0, stream); + launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, epsilon, norm, workspace); + } + } else { + auto reduce_kernel = raw::reduce_sum_powN; + auto policy = make_policy(reduce_kernel, 0, stream); + launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size, inner_size, norm); + + auto scale_kernel = raw::scale_inverse_powN; + policy = make_policy(scale_kernel, 0, stream); + launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, inner_size, epsilon, norm, workspace); + } + } + + template void normalize(const Stream&, span, view, std::size_t, std::size_t, std::size_t, float, float, span); + /* double variant not available due to efficient atomicAdd implementation */ + //template void normalize(const Stream&, span, view, std::size_t, std::size_t, std::size_t, unsigned, span); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/reduce.hpp b/modules/dnn/src/cuda/reduce.hpp new file mode 100644 index 000000000000..0ed5919ec988 --- /dev/null +++ 
b/modules/dnn/src/cuda/reduce.hpp @@ -0,0 +1,23 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA_REDUCE_HPP +#define OPENCV_DNN_SRC_CUDA_REDUCE_HPP + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace utils { + template + __device__ T warpReduceSum(T val) { + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(0xFFFFFFFF, val, offset); + return val; + } + } + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA_REDUCE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index cf54f53cb899..f93593adeb45 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -84,6 +84,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void permute(const Stream& stream, TensorSpan output, TensorView input, const std::vector& order); + template + void normalize( + const Stream& stream, + span output, view input, + std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm, T epsilon, + span workspace); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index b6b973d22628..15899fa6cf76 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -42,8 +42,15 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn { class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer @@ -70,7 +77,8 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer return preferableTarget == DNN_TARGET_MYRIAD ? 
!acrossSpatial : startAxis == 1; } - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()); } bool getMemoryShapes(const std::vector &inputs, @@ -257,6 +265,69 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + auto input_shape = input_wrapper->getShape(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto start_axis = clamp(startAxis, input_shape.size()); + auto end_axis = clamp(endAxis, input_shape.size()); + + auto outer_size = total(input_shape, 0, start_axis); + auto mid_size = total(input_shape, start_axis, end_axis + 1); + auto inner_size = total(input_shape, end_axis + 1, -1); + + auto scratch_ptr = reinterpret_cast(csl::WorkspaceAccessor::get(workspace).get()); + auto scratch = csl::span(csl::DevicePtr(scratch_ptr), workspace.size()); + csl::kernels::normalize(stream, output, input, outer_size, mid_size, inner_size, pnorm, epsilon, scratch); + + if (!blobs.empty()) { + Mat weightsMat = blobs[0]; + if (weightsMat.total() == 1) + { + csl::kernels::scale1(stream, output, input, weightsMat.at(0, 0)); + } + else + { + CV_Assert(weightsTensor.size() == mid_size); + csl::kernels::scaleN(stream, output, input, inner_size, weightsTensor); + } + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + + if (!blobs.empty() && blobs[0].total() != 1) + { + const auto& weightsMat = blobs[0]; + weightsTensor = createTensorHeaderFromMat(weightsMat); + copyMatToTensor(weightsTensor, weightsMat, stream); + } + } + + csl::Tensor weightsTensor; + csl::Stream stream; +#endif + #ifdef HAVE_INF_ENGINE virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { From 1619f0b020775639f4ca649584e5edce1646b145 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 2 Jul 2019 11:47:20 +0530 Subject: [PATCH 035/129] optimize bias addition in scale layer --- modules/dnn/src/cuda/scale.cu | 45 ++++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 9 +++++ modules/dnn/src/layers/scale_layer.cpp | 12 +++---- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu index 898c05354ab2..c5ec7055fe59 100644 --- a/modules/dnn/src/cuda/scale.cu +++ b/modules/dnn/src/cuda/scale.cu @@ -16,6 +16,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + template + __global__ void bias1(span output, view input, T beta) + { + for (auto i : grid_stride_range(output.size())) + output[i] = input[i] + beta; + } + + template + __global__ void biasN(span output, view input, std::size_t inner_size, view bias) + { + for (auto i : grid_stride_range(output.size())) { + const auto bias_idx = (i / inner_size) % bias.size(); + output[i] = input[i] + bias[bias_idx]; + } + } + template __global__ void scale1(span output, view input, T alpha) { @@ -49,6 +65,35 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } } + template + void bias1(const Stream& stream, TensorSpan 
output, TensorView input, T beta) { + CV_Assert(is_shape_same(input, output)); + + auto kernel = raw::scale1; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, input, beta); + } + + template void bias1(const Stream&, TensorSpan, TensorView, float); + template void bias1(const Stream&, TensorSpan, TensorView, double); + + template + void biasN( + const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView bias) + { + CV_Assert(is_shape_same(input, output)); + + auto kernel = raw::biasN; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, bias); + } + + template void biasN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView); + template void biasN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView); + template void scale1(const Stream& stream, TensorSpan output, TensorView input, T alpha) { CV_Assert(is_shape_same(input, output)); diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index f93593adeb45..bc3ec0881a16 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -50,6 +50,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void concat_with_offsets(const Stream& stream, TensorSpan output, TensorView input, const std::vector& axis_offsets); + template + void bias1(const Stream& stream, TensorSpan output, TensorView input, T alpha); + + template + void biasN(const Stream& stream, + TensorSpan output, + TensorView input, std::size_t inner_size, + TensorView bias); + template void scale1(const Stream& stream, TensorSpan output, TensorView input, T alpha); diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 15f904a41c60..12daa889e0b4 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -189,7 +189,7 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer auto input_shape = input_wrapper->getShape(); - /* the weights might require broadcasting to scale */ + /* the weights/bias might require broadcasting to scale/shift */ int end_axis = [&] { for (int endAxis = axis + 1; endAxis <= input_shape.size(); ++endAxis) if (total(input_shape, axis, endAxis) == numParams) @@ -205,11 +205,7 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer else if (hasWeights) csl::kernels::scaleN(stream, output, input, inner_size, weights); else - { - /* rarely used codepath; hence, not optimized TODO */ - csl::tensor_ops::copy(stream, output, input); - csl::tensor_ops::add(stream, 1.0, output, 1.0, bias); - } + csl::kernels::biasN(stream, output, input, inner_size, bias); } void initCUDA( @@ -232,7 +228,7 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer /* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0] * in either case, it is at the end of the blobs vector => bias = blobs.back() */ - biasTensor = createTensorHeaderFromMat(blobs[1]); + biasTensor = createTensorHeaderFromMat(blobs.back()); copyMatToTensor(biasTensor, blobs.back(), stream); } } @@ -360,7 +356,7 @@ Ptr ShiftLayer::create(const LayerParams& params) scaleParams.type = "Scale"; scaleParams.blobs = params.blobs; scaleParams.set("bias_term", true); - scaleParams.set("axis", 0); + scaleParams.set("axis", 1); return Ptr(new ScaleLayerImpl(scaleParams)); } From ed16c7e71016aaf2d4e03cea838dbbe002388d41 Mon Sep 17 00:00:00 2001 From: 
Yashas Date: Tue, 2 Jul 2019 21:26:26 +0530 Subject: [PATCH 036/129] add prior box layer --- modules/dnn/src/cuda/array.hpp | 4 +- modules/dnn/src/cuda/math.hpp | 5 +- modules/dnn/src/cuda/prior_box.cu | 156 +++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 12 ++ modules/dnn/src/layers/prior_box_layer.cpp | 69 +++++++++ 5 files changed, 242 insertions(+), 4 deletions(-) create mode 100644 modules/dnn/src/cuda/prior_box.cu diff --git a/modules/dnn/src/cuda/array.hpp b/modules/dnn/src/cuda/array.hpp index 5717cb11e615..a28589d1a11b 100644 --- a/modules/dnn/src/cuda/array.hpp +++ b/modules/dnn/src/cuda/array.hpp @@ -28,8 +28,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; - bool empty() const noexcept { return N == 0; } - size_type size() const noexcept { return N; } + __host__ __device__ bool empty() const noexcept { return N == 0; } + __host__ __device__ size_type size() const noexcept { return N; } __host__ __device__ iterator begin() noexcept { return ptr; } __host__ __device__ iterator end() noexcept { return ptr + N; } diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index 6c49e4346b9d..df4e9d40e4a7 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -55,8 +55,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template <> inline __device__ float sqrt(float val) { return sqrtf(val); } template <> inline __device__ double sqrt(double val) { return ::sqrt(val); } - template - __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } + template __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } + + template __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); } } }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/prior_box.cu b/modules/dnn/src/cuda/prior_box.cu new file mode 100644 index 000000000000..3a5da326dcf5 --- /dev/null +++ b/modules/dnn/src/cuda/prior_box.cu @@ -0,0 +1,156 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
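+
+// The kernels below generate SSD-style prior boxes. The output is laid out as two equally
+// sized halves: the first half holds the box coordinates, four values per prior
+// (x_min, y_min, x_max, y_max), written per feature-map point with the box size as the
+// outer loop and the offset as the inner loop; the second half holds the variances
+// repeated for every box. When Normalize is true the coordinates are divided by the image
+// size, and the optional clip pass clamps the first half to [0, 1]. In terms of sizes:
+//
+//     channel_size = layerWidth * layerHeight * numPriors * 4;  // one box = 4 coordinates
+//     output.size() == 2 * channel_size;                        // boxes, then variances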
+ +#include "array.hpp" +#include "math.hpp" +#include "reduce.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void prior_box( + span output, + view boxWidth, view boxHeight, view offsetX, view offsetY, + std::size_t layerWidth, std::size_t layerHeight, + std::size_t imageWidth, std::size_t imageHeight, + T stepX, T stepY) + { + /* num_points contains the number of points in the feature map of interest + * each iteration of the stride loop selects a point and generates prior boxes for it + */ + std::size_t num_points = layerWidth * layerHeight; + for (auto idx : grid_stride_range(num_points)) { + auto x = idx % layerWidth, + y = idx / layerWidth; + + DevicePtr output_ptr = output.data() + idx * 4 * offsetX.size() * boxWidth.size(); + + for (int i = 0; i < boxWidth.size(); i++) { + for (int j = 0; j < offsetX.size(); j++) { + float center_x = (x + offsetX[j]) * stepX; + float center_y = (y + offsetY[j]) * stepY; + + if(Normalize) { + output_ptr[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth; + output_ptr[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight; + output_ptr[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth; + output_ptr[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight; + } else { + output_ptr[0] = center_x - boxWidth[i] * 0.5f; + output_ptr[1] = center_y - boxHeight[i] * 0.5f; + output_ptr[2] = center_x + boxWidth[i] * 0.5f - 1.0f; + output_ptr[3] = center_y + boxHeight[i] * 0.5f - 1.0f; + } + + output_ptr += 4; + } + } + } + } + + template + __global__ void prior_box_clip(span output) { + for (auto i : grid_stride_range(output.size())) { + using utils::clamp; + output[i] = clamp(output[i], 0.0, 1.0); + } + } + + template + __global__ void prior_box_set_variance1(span output, T variance) { + for (auto i : grid_stride_range(output.size())) + output[i] = variance; + } + + template + using array = utils::array; + + template + __global__ void prior_box_set_variance4(span output, array variance) { + for (auto i : grid_stride_range(output.size()/4)) { + for (int j = 0; j < variance.size(); j++) + output[i * 4 + j] = variance[j]; + } + } + } + + template static + void launch_prior_box_kernel( + const Stream& stream, + span output, view boxWidth, view boxHeight, view offsetX, view offsetY, + std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight, + T stepX, T stepY) + { + auto kernel = raw::prior_box; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, + output, boxWidth, boxHeight, offsetX, offsetY, + layerWidth, layerHeight, imageWidth, imageHeight, + stepX, stepY); + } + + template + void generate_prior_boxes( + const Stream& stream, + span output, + view boxWidth, view boxHeight, view offsetX, view offsetY, + std::vector variance, + std::size_t numPriors, + std::size_t layerWidth, std::size_t layerHeight, + std::size_t imageWidth, std::size_t imageHeight, + T stepX, T stepY, + bool normalize, bool clip) + { + if (normalize) { + launch_prior_box_kernel( + stream, output, boxWidth, boxHeight, offsetX, offsetY, + layerWidth, layerHeight, imageWidth, imageHeight, stepX, stepY + ); + } else { + launch_prior_box_kernel( + stream, output, boxWidth, boxHeight, offsetX, offsetY, + layerWidth, layerHeight, imageWidth, imageHeight, stepX, stepY 
+ ); + } + + std::size_t channel_size = layerHeight * layerWidth * numPriors * 4; + CV_Assert(channel_size * 2 == output.size()); + + if (clip) { + auto output_span_c1 = span(output.data(), channel_size); + auto kernel = raw::prior_box_clip; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output_span_c1); + } + + auto output_span_c2 = span(output.data() + channel_size, channel_size); + if (variance.size() == 1) { + auto kernel = raw::prior_box_set_variance1; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output_span_c2, variance[0]); + } else { + utils::array variance_k; + variance_k.assign(std::begin(variance), std::end(variance)); + auto kernel = raw::prior_box_set_variance4; + auto policy = make_policy(kernel, 0, stream); + launch_kernel(kernel, policy, output_span_c2, variance_k); + } + } + + template void generate_prior_boxes(const Stream&, span, view, view, view, view, + std::vector, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, float, float, bool, bool); + + template void generate_prior_boxes(const Stream&, span, view, view, view, view, + std::vector, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, double, double, bool, bool); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index bc3ec0881a16..9a4654a9b5c3 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -100,6 +100,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm, T epsilon, span workspace); + template + void generate_prior_boxes( + const Stream& stream, + span output, + view boxWidth, view boxHeight, view offsetX, view offsetY, + std::vector variance, + std::size_t numPriors, + std::size_t layerWidth, std::size_t layerHeight, + std::size_t imageWidth, std::size_t imageHeight, + T stepX, T stepY, + bool normalize, bool clip); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index f5d9bf7b1395..0f51a666c521 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" #include @@ -52,6 +53,12 @@ #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -274,6 +281,7 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && ( _explicitSizes || (_minSize.size() == 1 && _maxSize.size() <= 1))) || (backendId == DNN_BACKEND_VKCOM && haveVulkan()); @@ -485,6 +493,67 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 2 && outputs.size() == 1); + + auto layer_input_wrapper = 
inputs[0].dynamicCast(); + auto layer_input = layer_input_wrapper->getView(); /* useless synchronization */ + + auto data_input_wrapper = inputs[1].dynamicCast(); + auto data_input = data_input_wrapper->getView(); /* useless synchronization */ + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto layerWidth = layer_input.get_axis_size(-1); + auto layerHeight = layer_input.get_axis_size(-2); + + auto imageWidth = data_input.get_axis_size(-1); + auto imageHeight = data_input.get_axis_size(-2); + + auto boxSize = _boxWidths.size(), offsetSize = _offsetsX.size(); + auto boxWidth = csl::view(paramsTensor.get(), boxSize); + auto boxHeight = csl::view(paramsTensor.get() + boxSize, boxSize); + auto offsetX = csl::view(paramsTensor.get() + 2 * boxSize, offsetSize); + auto offsetY = csl::view(paramsTensor.get() + 2 * boxSize + offsetSize, offsetSize); + + csl::kernels::generate_prior_boxes(stream, output, + boxWidth, boxHeight, offsetX, offsetY, + _variance, _numPriors, layerWidth, layerHeight, imageWidth, imageHeight, _stepX, _stepY, _bboxesNormalized, _clip); + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs) override + { + stream = std::move(stream_); + + CV_Assert(_boxWidths.size() == _boxHeights.size()); + CV_Assert(_offsetsX.size() == _offsetsY.size()); + + auto total = _boxWidths.size() * 2 + _offsetsX.size() * 2; + std::vector paramsVec; + paramsVec.insert(std::end(paramsVec), std::begin(_boxWidths), std::end(_boxWidths)); + paramsVec.insert(std::end(paramsVec), std::begin(_boxHeights), std::end(_boxHeights)); + paramsVec.insert(std::end(paramsVec), std::begin(_offsetsX), std::end(_offsetsX)); + paramsVec.insert(std::end(paramsVec), std::begin(_offsetsY), std::end(_offsetsY)); + + paramsTensor.resize(total); + csl::memcpy(paramsTensor.get(), paramsVec.data(), total, stream); /* synchronous copy */ + } + + csl::Tensor paramsTensor; /* widths, heights, offsetsX, offsetsY */ + csl::Stream stream; +#endif + virtual Ptr initVkCom(const std::vector > &input) CV_OVERRIDE { #ifdef HAVE_VULKAN From 76eaf7b16af4eb003930bb268f1369b45b0d658a Mon Sep 17 00:00:00 2001 From: Yashas Date: Thu, 4 Jul 2019 12:54:35 +0530 Subject: [PATCH 037/129] fix and optimize normalize layer --- modules/dnn/src/cuda/atomics.hpp | 25 ++ modules/dnn/src/cuda/math.hpp | 6 + modules/dnn/src/cuda/normalize.cu | 221 ++++++------------ .../dnn/src/layers/normalize_bbox_layer.cpp | 15 +- 4 files changed, 108 insertions(+), 159 deletions(-) create mode 100644 modules/dnn/src/cuda/atomics.hpp diff --git a/modules/dnn/src/cuda/atomics.hpp b/modules/dnn/src/cuda/atomics.hpp new file mode 100644 index 000000000000..d075ca449de4 --- /dev/null +++ b/modules/dnn/src/cuda/atomics.hpp @@ -0,0 +1,25 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
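+
+// Native atomicAdd for double is available only on devices of compute capability 6.0 and
+// above; for older architectures this header supplies the standard emulation built on
+// atomicCAS: read the current value, add the operand, and publish the result with a
+// compare-and-swap, retrying until the location was not modified in between.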
+ +#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP +#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP + +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#else +__device__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */ diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index df4e9d40e4a7..79ef5431954f 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -55,11 +55,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template <> inline __device__ float sqrt(float val) { return sqrtf(val); } template <> inline __device__ double sqrt(double val) { return ::sqrt(val); } + template __device__ T rsqrt(T val); + template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); } + template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); } + template __device__ T sigmoid(T val) { return T(1) / (1 + exp(-val)); } template __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); } } + + }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */ diff --git a/modules/dnn/src/cuda/normalize.cu b/modules/dnn/src/cuda/normalize.cu index c916c8b9188a..121d83a779aa 100644 --- a/modules/dnn/src/cuda/normalize.cu +++ b/modules/dnn/src/cuda/normalize.cu @@ -5,6 +5,7 @@ #include "array.hpp" #include "math.hpp" #include "reduce.hpp" +#include "atomics.hpp" #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" @@ -17,142 +18,57 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { - template - __global__ void reduce_sum_powN(span output, - view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm) - { - for (int i = 0; i < outer_size; i++) { - for (int j = 0; j < mid_size; j++) { - const auto outer_offset = i * mid_size * inner_size; - const auto mid_offset = j * mid_size; - const auto total_offset = outer_offset + mid_offset; - - T thread_sum = 0; - for (auto idx : grid_stride_range(inner_size)) { - const auto full_idx = total_offset + idx; - thread_sum += utils::pow(utils::abs(input[full_idx]), norm); - } - - auto warp_sum = utils::warpReduceSum(thread_sum); - if ((threadIdx.x & (warpSize - 1)) == 0) - atomicAdd(&output[total_offset], warp_sum); - } - } + template static + __global__ void zero(span output) { + for (auto idx : grid_stride_range(output.size())) + output[idx] = 0; } - template - __global__ void scale_inverse_powN(span output, - view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T episilon, T norm, - view sums) + template static + __global__ void reduce_sum_abs(span output, view input, std::size_t outer_stride, std::size_t mid_stride) { - for (int i = 0; i < outer_size; i++) { - for (int j = 0; j < mid_size; j++) { - const auto outer_offset = i * mid_size * inner_size; - const auto mid_offset = j * mid_size; - const auto total_offset = outer_offset + mid_offset; - - const auto scale = 1 / utils::pow(sums[total_offset] + episilon, 1 / norm); - for (auto idx : grid_stride_range(inner_size)) { - const auto 
full_idx = total_offset + idx; - output[full_idx] = input[full_idx] * scale; - } - } - } - } + for (auto idx : grid_stride_range(input.size())) { + const auto outer_idx = idx / outer_stride; + const auto inner_idx = idx % mid_stride; - template - __global__ void reduce_sum_powN_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T pnorm) - { - for (int i = 0; i < outer_size; i++) { - const auto outer_offset = i * mid_size; - - T thread_sum = 0; - for (auto idx : grid_stride_range(mid_size)) { - const auto full_idx = outer_offset + idx; - thread_sum += utils::pow(input[full_idx], pnorm); - } - - auto warp_sum = utils::warpReduceSum(thread_sum); - if ((threadIdx.x & (warpSize - 1)) == 0) - atomicAdd(&output[i], warp_sum); + auto sum_idx = outer_idx * mid_stride + inner_idx; + atomicAdd(&output[sum_idx], utils::abs(input[idx])); } } - template - __global__ void scale_inverse_powN_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T epsilon, T pnorm, - view sums) - { - for (int i = 0; i < outer_size; i++) { - const auto outer_offset = i * mid_size; - const auto scale = 1 / utils::pow(sums[i] + epsilon, 1/pnorm); - for (auto idx : grid_stride_range(mid_size)) { - const auto full_idx = outer_offset + idx; - output[full_idx] = input[full_idx] * scale; - } - } + template static + __global__ void reciprocal(span output, T epsilon) { + for (auto idx : grid_stride_range(output.size())) + output[idx] = 1 / (output[idx] + epsilon); } - template - __global__ void reduce_sum_pow2_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size) + template static + __global__ void reduce_sum_squared(span output, view input, std::size_t outer_stride, std::size_t mid_stride) { - for (int i = 0; i < outer_size; i++) { - const auto outer_offset = i * mid_size; - - T thread_sum = 0; - for (auto idx : grid_stride_range(mid_size)) { - const auto full_idx = outer_offset + idx; - thread_sum += input[full_idx] * input[full_idx]; - } - - auto warp_sum = utils::warpReduceSum(thread_sum); - if ((threadIdx.x & (warpSize - 1)) == 0) - atomicAdd(&output[i], warp_sum); - } - } + for (auto idx : grid_stride_range(input.size())) { + const auto outer_idx = idx / outer_stride; + const auto inner_idx = idx % mid_stride; - template - __global__ void scale_inverse_pow2_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T epsilon, - view sums) - { - for (int i = 0; i < outer_size; i++) { - const auto outer_offset = i * mid_size; - const auto scale = 1 / utils::sqrt(sums[i] + epsilon); - for (auto idx : grid_stride_range(mid_size)) { - const auto full_idx = outer_offset + idx; - output[full_idx] = input[full_idx] * scale; - } - } + auto sum_idx = outer_idx * mid_stride + inner_idx; + atomicAdd(&output[sum_idx], input[idx] * input[idx]); + } } - template - __global__ void reduce_sum_pow1_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size) - { - for (int i = 0; i < outer_size; i++) { - const auto outer_offset = i * mid_size; - - T thread_sum = 0; - for (auto idx : grid_stride_range(mid_size)) { - const auto full_idx = outer_offset + idx; - thread_sum += utils::abs(input[full_idx]); - } - - auto warp_sum = utils::warpReduceSum(thread_sum); - if ((threadIdx.x & (warpSize - 1)) == 0) - atomicAdd(&output[i], warp_sum); - } + template static + __global__ void rsqrt(span output, T epsilon) { + for (auto idx : grid_stride_range(output.size())) + output[idx] = utils::rsqrt(output[idx] + epsilon); } - template - __global__ 
void scale_inverse_pow1_inner1(span output, view input, std::size_t outer_size, std::size_t mid_size, T epsilon, - view sums) + template static + __global__ void apply_norm(span output, view input, std::size_t outer_stride, std::size_t mid_stride, view sums) { - for (int i = 0; i < outer_size; i++) { - const auto outer_offset = i * mid_size; - const auto scale = 1/(sums[i] + epsilon); - for (auto idx : grid_stride_range(mid_size)) { - const auto full_idx = outer_offset + idx; - output[full_idx] = input[full_idx] * scale; - } + for (auto idx : grid_stride_range(output.size())) { + const auto outer_idx = idx / outer_stride; + const auto inner_idx = idx % mid_stride; + + auto sum_idx = outer_idx * mid_stride + inner_idx; + output[idx] = input[idx] * sums[sum_idx]; } } } @@ -162,48 +78,41 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k const Stream& stream, span output, view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm, T epsilon, - span workspace) + span workspace_) { - if (inner_size == 1) { - CV_Assert(workspace.size() >= outer_size); - if (norm == 1) { - auto reduce_kernel = raw::reduce_sum_pow1_inner1; - auto policy = make_policy(reduce_kernel, 0, stream); - launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size); - - auto scale_kernel = raw::scale_inverse_pow1_inner1; - policy = make_policy(scale_kernel, 0, stream); - launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, epsilon, workspace); - } else if (norm == 2) { - auto reduce_kernel = raw::reduce_sum_pow2_inner1; - auto policy = make_policy(reduce_kernel, 0, stream); - launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size); - - auto scale_kernel = raw::scale_inverse_pow2_inner1; - policy = make_policy(scale_kernel, 0, stream); - launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, epsilon, workspace); - } else { - auto reduce_kernel = raw::reduce_sum_powN_inner1; - auto policy = make_policy(reduce_kernel, 0, stream); - launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size, norm); - - auto scale_kernel = raw::scale_inverse_powN_inner1; - policy = make_policy(scale_kernel, 0, stream); - launch_kernel(scale_kernel, policy, output, input, outer_size, mid_size, epsilon, norm, workspace); - } + CV_Assert(norm == 1 || norm == 2); + CV_Assert(workspace_.size() >= outer_size * inner_size); + + auto sums = span(workspace_.data(), outer_size * inner_size); + + auto zero_kernel = raw::zero; + auto policy = make_policy(zero_kernel, 0, stream); + launch_kernel(zero_kernel, policy, sums); + + if (norm == 1) { + auto reduce_kernel = raw::reduce_sum_abs; + policy = make_policy(reduce_kernel, 0, stream); + launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size); + + auto reciprocal_kernel = raw::reciprocal; + policy = make_policy(reciprocal_kernel, 0, stream); + launch_kernel(reciprocal_kernel, policy, sums, epsilon); } else { - auto reduce_kernel = raw::reduce_sum_powN; - auto policy = make_policy(reduce_kernel, 0, stream); - launch_kernel(reduce_kernel, policy, workspace, input, outer_size, mid_size, inner_size, norm); + auto reduce_kernel = raw::reduce_sum_squared; + policy = make_policy(reduce_kernel, 0, stream); + launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size); - auto scale_kernel = raw::scale_inverse_powN; - policy = make_policy(scale_kernel, 0, stream); - launch_kernel(scale_kernel, policy, output, 
input, outer_size, mid_size, inner_size, epsilon, norm, workspace); + auto rsqrt_kernel = raw::rsqrt; + policy = make_policy(rsqrt_kernel, 0, stream); + launch_kernel(rsqrt_kernel, policy, sums, epsilon); } + + auto scale_kernel = raw::apply_norm; + policy = make_policy(scale_kernel, 0, stream); + launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums); } template void normalize(const Stream&, span, view, std::size_t, std::size_t, std::size_t, float, float, span); - /* double variant not available due to efficient atomicAdd implementation */ - //template void normalize(const Stream&, span, view, std::size_t, std::size_t, std::size_t, unsigned, span); + template void normalize(const Stream&, span, view, std::size_t, std::size_t, std::size_t, double, double, span); }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index 15899fa6cf76..e9857ab7fa2c 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -78,7 +78,7 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer return preferableTarget == DNN_TARGET_MYRIAD ? !acrossSpatial : startAxis == 1; } return backendId == DNN_BACKEND_OPENCV || - (backendId == DNN_BACKEND_CUDA && haveCUDA()); + (backendId == DNN_BACKEND_CUDA && haveCUDA() && (pnorm == 1.0 || pnorm == 2.0)); } bool getMemoryShapes(const std::vector &inputs, @@ -269,8 +269,7 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace - ) override + csl::Workspace& workspace) override { CV_Assert(inputs.size() == 1 && outputs.size() == 1); @@ -322,6 +321,16 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer weightsTensor = createTensorHeaderFromMat(weightsMat); copyMatToTensor(weightsTensor, weightsMat, stream); } + + auto input_wrapper = inputs[0].dynamicCast(); + auto input_shape = input_wrapper->getShape(); + + auto start_axis = clamp(startAxis, input_shape.size()); + auto end_axis = clamp(endAxis, input_shape.size()); + + auto outer_size = total(input_shape, 0, start_axis); + auto inner_size = total(input_shape, end_axis + 1, -1); + scratch_mem_in_bytes = outer_size * inner_size * sizeof(float); } csl::Tensor weightsTensor; From ebf5cfbd8991c906ef428e76f9a5a008c2b6b182 Mon Sep 17 00:00:00 2001 From: Yashas Date: Thu, 4 Jul 2019 16:08:36 +0530 Subject: [PATCH 038/129] add asymmetric padding support for pooling layer --- modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 39 +------- modules/dnn/src/layers/convolution_layer.cpp | 2 +- modules/dnn/src/layers/pooling_layer.cpp | 98 +++++++++++++++++--- 3 files changed, 90 insertions(+), 49 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index 680eeba8207c..ef28e4ff5448 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -283,11 +283,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { using PoolingDescriptor = cudnn::PoolingDescriptor; public: - enum class rounding_type { - FLOOR, - CEILING - }; - using pooling_type = PoolingDescriptor::pooling_type; struct params_type { @@ -297,7 +292,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { std::vector padding; std::vector stride; - rounding_type rounding_mode; pooling_type type; }; @@ -310,36 +304,9 
@@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { inputTensorDesc = TensorDescriptor(params.input_shape); poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); - const auto& input_shape = params.input_shape; - std::vector output_shape; - output_shape.assign(std::begin(input_shape), std::end(input_shape)); - - const auto& window_size = params.window_size; - const auto& padding = params.padding; - const auto& stride = params.stride; - - bool ceil = params.rounding_mode == rounding_type::CEILING; - for (int i = 0; i < window_size.size(); i++) { - double axis_sz = (input_shape[i + 2] + 2 * padding[i] - window_size[i]) / double(stride[i]) + 1; - output_shape[i + 2] = ceil ? std::ceil(axis_sz) : std::floor(axis_sz); - - /* check if the last pooling window starts in the valid region */ - if (padding[i]) { - if ((output_shape[i + 2] - 1) * stride[i] >= input_shape[i + 2] + padding[i]) - output_shape[i + 2]--; - } - } - - if (!ceil) - { - /* we must agree with cuDNN if we used floor */ - std::vector output_dim; - getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); - CV_Assert(std::equal(std::begin(output_dim), std::end(output_dim), std::begin(output_shape))); - CV_UNUSED(output_dim); - } - - outputTensorDesc = TensorDescriptor(output_shape); + std::vector output_dim; + getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); + outputTensorDesc = TensorDescriptor(output_dim); } Pooling& operator=(const Pooling&) = delete; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 9350bdfe2452..61a9d27c6387 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1398,7 +1398,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl } /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by - * copying the input to a bigger tensor and pad the sides manually + * copying the input to a bigger tensor and padding the ends manually */ for (int i = 0; i < rank; i++) input_shape[i] += padding_left[i] + padding_right[i]; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 9725cb6095a1..14c7b5ff57e5 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -300,14 +300,18 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer csl::Workspace& workspace ) override { - CV_UNUSED(workspace); - if (computeMaxIdx) - CV_Error(Error::StsNotImplemented, "Pooling layer does not support caching max indicies"); + CV_Error(Error::StsNotImplemented, "Pooling layer does not support caching max indices"); auto input_wrapper = inputs[0].dynamicCast(); auto input = input_wrapper->getView(); + if (!transformedInput.empty()) + { + inputTransformer.transform(input, transformedInput); + input = csl::TensorView(transformedInput); + } + auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); @@ -327,13 +331,84 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); - if (pads_begin != pads_end) - CV_Error(Error::StsNotImplemented, "Asymmetric padding for pooling layer is not supported by CUDA backend"); + /* 1d, 2d, 3d pooling are supported */ + CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 5); + + const auto rank = input_shape.size(); + + /* 
left and right are misleading as the padding is applicable for any number of dimensions
+         * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+         */
+        std::vector common_padding(rank, 0);
+        std::vector padding_left(rank, 0), padding_right(rank, 0);
+        if (padMode.empty())
+        {
+            /* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing
+             * otherwise, we add extra padding towards the end so that the convolution arithmetic yields
+             * the correct output size without having to deal with fancy fractional sizes
+             */
+            auto pads_end_modified = pads_end;
+            if (ceilMode)
+            {
+                for (int i = 0; i < kernel_size.size(); i++) {
+                    auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - kernel_size[i]) % strides[i];
+                    if(rem)
+                        pads_end_modified[i] += strides[i] - rem;
+                }
+            }
+
+            for (int i = 2; i < common_padding.size(); i++)
+            {
+                common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]);
+                padding_left[i] = pads_begin[i - 2] - common_padding[i];
+                padding_right[i] = pads_end_modified[i - 2] - common_padding[i];
+            }
+        }
+        else if(padMode == "VALID") { /* nothing to do as the paddings are already preset to zero */ }
+        else if (padMode == "SAME")
+        {
+            /* TensorFlow Logic:
+             * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+             *
+             * if total padding is odd, the input is padded towards the end
+             */
+            std::vector inShape(std::begin(input_shape) + 2, std::end(input_shape)), outShape;
+            getConvPoolOutParams(inShape, kernel_size, strides, padMode, std::vector(kernel_size.size(), 1), outShape);
+
+            for (int i = 2; i < rank; i++)
+            {
+                const auto j = i - 2; /* window idx */
+                const auto required_total_padding =
+                    std::max(0, (outShape[j] - 1) * strides[j] + kernel_size[j] - inShape[j]);
+
+                common_padding[i] = required_total_padding / 2;
+                padding_left[i] = 0;
+                padding_right[i] = required_total_padding % 2;
+            }
+        }
+        else
+        {
+            CV_Error(Error::StsNotImplemented, "Specified padding mode not supported by PoolingLayer");
+        }
+
+        /* csl::Pooling supports symmetric padding only; hence, we deal with asymmetric padding by
+         * copying the input to a bigger tensor and padding the sides manually
+         */
+        for (int i = 0; i < rank; i++)
+            input_shape[i] += padding_left[i] + padding_right[i];
+
+        /* if the actual input shape and the new input shape do not match; we need to transform the input */
+        transform_required = input_shape != input_wrapper->getShape();
+        if (transform_required)
+        {
+            transformedInput.resize(std::begin(input_shape), std::end(input_shape));
+            inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right);
+        }
         csl::Pooling::params_type params;
         params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
         params.window_size = kernel_size;
-        params.padding = pads_begin;
+        params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
         params.stride = strides;
         if (type == MAX)
@@ -342,7 +417,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
         }
         else if (type == AVE)
         {
-            if(avePoolPaddedArea)
+            if (avePoolPaddedArea)
                 params.type = csl::Pooling::pooling_type::AVERAGE_INCLUDE_PADDING;
             else
                 params.type = csl::Pooling::pooling_type::AVERAGE_EXCLUDE_PADDING;
@@ -350,16 +425,15 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
         else
             CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
-        if(ceilMode)
-            params.rounding_mode = csl::Pooling::rounding_type::CEILING;
-        else
-            params.rounding_mode = csl::Pooling::rounding_type::FLOOR;
-
         pooler = csl::Pooling(cudnnHandle,
params);
        }

        csl::cudnn::Handle cudnnHandle;
        csl::Pooling pooler;
+
+       bool transform_required;
+       csl::Tensor transformedInput;
+       csl::TensorTransform inputTransformer;
 #endif

    virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE

From 6fc4ce0b953e3f9628975f2210ee9147375da8b1 Mon Sep 17 00:00:00 2001
From: Yashas
Date: Thu, 4 Jul 2019 19:47:03 +0530
Subject: [PATCH 039/129] add event API
---
 modules/dnn/src/cuda4dnn/csl/event.hpp | 102 +++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 modules/dnn/src/cuda4dnn/csl/event.hpp

diff --git a/modules/dnn/src/cuda4dnn/csl/event.hpp b/modules/dnn/src/cuda4dnn/csl/event.hpp
new file mode 100644
index 000000000000..dd536319e3f3
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/csl/event.hpp
@@ -0,0 +1,102 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_EVENT_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_EVENT_HPP
+
+#include "error.hpp"
+#include "stream.hpp"
+
+#include
+
+#include
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief sharable smart CUDA event
+     *
+     * Event is a smart sharable wrapper for a CUDA event handle which ensures that
+     * the handle is destroyed after use. A default constructed Event object is empty;
+     * an event is created only when requested through a constructor argument.
+     *
+     * @note Moving an Event object to another invalidates the former
+     */
+    class Event {
+    public:
+        Event() noexcept : event{ nullptr } { }
+        Event(const Event&) = delete;
+        Event(Event&& other) noexcept
+            : event{ other.event } {
+            other.event = nullptr;
+        }
+
+        /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
+        Event(bool create, bool timing_event = false) : event{nullptr} {
+            if (create) {
+                unsigned int flags = cudaEventBlockingSync | (timing_event ?
0 : cudaEventDisableTiming); + CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags)); + } + } + + ~Event() { + try { + if (event != nullptr) + CUDA4DNN_CHECK_CUDA(cudaEventDestroy(event)); + } catch (const CUDAException& ex) { + std::ostringstream os; + os << "Asynchronous exception caught during CUDA event destruction.\n"; + os << ex.what(); + os << "Exception will be ignored.\n"; + CV_LOG_WARNING(0, os.str().c_str()); + } + } + + Event& operator=(const Event&) noexcept = delete; + Event& operator=(Event&& other) noexcept { + event = other.event; + other.event = nullptr; + return *this; + } + + /** mark a point in a stream */ + void record(const Stream& stream) { + CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, StreamAccessor::get(stream))); + } + + /** blocks the caller thread until all operations before the event finish */ + void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaEventSynchronize(event)); } + + /** returns true if there are operations pending before the event completes */ + bool busy() const { + auto status = cudaEventQuery(event); + if (status == cudaErrorNotReady) + return true; + CUDA4DNN_CHECK_CUDA(status); + return false; + } + + cudaEvent_t get() const noexcept { return event; } + + /** returns true if the event is valid */ + explicit operator bool() const noexcept { return event; } + + private: + cudaEvent_t event; + }; + + /** makes a stream wait on an event */ + void StreamWaitOnEvent(const Stream& stream, const Event& event) { + CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(StreamAccessor::get(stream), event.get(), 0)); + } + + /** returns the time elapsed between two events in milliseconds */ + float TimeElapsedBetweenEvents(const Event& start, const Event& end) { + float temp; + CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get())); + return temp; + } + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_EVENT_HPP */ From 699867ea21fca680854aaeae4797affac3f65c5e Mon Sep 17 00:00:00 2001 From: Yashas Date: Thu, 4 Jul 2019 21:51:08 +0530 Subject: [PATCH 040/129] improve pooling performance for some padding scenarios --- modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 7 +++--- modules/dnn/src/layers/pooling_layer.cpp | 28 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index ef28e4ff5448..7c0f4c0d491f 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -287,6 +287,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { struct params_type { std::vector input_shape; + std::vector output_shape; std::vector window_size; std::vector padding; @@ -304,9 +305,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { inputTensorDesc = TensorDescriptor(params.input_shape); poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); - std::vector output_dim; - getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); - outputTensorDesc = TensorDescriptor(output_dim); + //std::vector output_dim; + //getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim); + outputTensorDesc = TensorDescriptor(params.output_shape); } Pooling& operator=(const Pooling&) = delete; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 14c7b5ff57e5..3d6431f6f120 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ 
b/modules/dnn/src/layers/pooling_layer.cpp @@ -391,12 +391,39 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer CV_Error(Error::StsNotImplemented, "Specified padding mode not supported by PoolingLayer"); } + /* in some scenarios, the extra padding may not change the output at all */ + for (int i = 2; i < rank; i++) { + auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; + auto rem = (input_shape[i] + total_padding - kernel_size[i - 2]) % strides[i - 2]; + if (rem && padding_right[i] > 0) + padding_right[i]--; + } + /* csl::Pooling supports symmetric padding only; hence, we deal with asymmetric padding by * copying the input to a bigger tensor and padding the sides manually */ for (int i = 0; i < rank; i++) input_shape[i] += padding_left[i] + padding_right[i]; + std::vector output_shape(rank); + output_shape[0] = input_shape[0]; + output_shape[1] = input_shape[1]; + for (int i = 2; i < rank; i++) + { + auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; + output_shape[i] = (input_shape[i] + total_padding - kernel_size[i - 2]) / strides[i - 2] + 1; + } + + /* try to avoid input transformation using cuDNN's flexibility */ + if (input_shape != input_wrapper->getShape() && + std::all_of(std::begin(padding_left), std::end(padding_left), [](std::size_t i) {return i == 0; })) + { + /* we don't need a transformation since cuDNN allows smaller or bigger output dimensions for + * from the dimensions calculated from the arithmetic + */ + input_shape = input_wrapper->getShape(); + } + /* if the actual input shape and the new input shape do not match; we need to transform the input */ transform_required = input_shape != input_wrapper->getShape(); if (transform_required) @@ -407,6 +434,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer csl::Pooling::params_type params; params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + params.output_shape.assign(std::begin(output_shape), std::end(output_shape)); params.window_size = kernel_size; params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); params.stride = strides; From 47918529271b0ca7da7f91a38e4e98cba0c34607 Mon Sep 17 00:00:00 2001 From: Yashas Date: Thu, 4 Jul 2019 22:48:34 +0530 Subject: [PATCH 041/129] avoid over-allocation of compute resources to kernels --- modules/dnn/src/cuda/activations.cu | 18 +++++++++--------- modules/dnn/src/cuda/concat.cu | 4 ++-- modules/dnn/src/cuda/eltwise_ops.cu | 8 ++++---- modules/dnn/src/cuda/normalize.cu | 12 ++++++------ modules/dnn/src/cuda/permute.cu | 2 +- modules/dnn/src/cuda/prior_box.cu | 9 +++++---- modules/dnn/src/cuda/scale.cu | 12 ++++++------ modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp | 18 ++++++++++++++++++ 8 files changed, 51 insertions(+), 32 deletions(-) diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index 871e4ce9d3ad..695e0ed4f673 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -94,7 +94,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::abs; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src); } @@ -106,7 +106,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::tanh; - auto policy = make_policy(kernel, 0, stream); + 
auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src); } @@ -118,7 +118,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::sigmoid; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src); } @@ -130,7 +130,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::bnll; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src); } @@ -142,7 +142,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::elu; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src); } @@ -154,7 +154,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::relu; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src, slope); } @@ -167,7 +167,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(floor <= ceiling); auto kernel = raw::clipped_relu; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src, floor, ceiling); } @@ -179,7 +179,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::axiswise_relu; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src, inner_size, slope); } @@ -191,7 +191,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); auto kernel = raw::power; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, dest.size(), 0, stream); launch_kernel(kernel, policy, dest, src, exp, scale, shift); } diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index 4b54fe5defd0..b4a1a1667cee 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -76,7 +76,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k std::size_t input_axis_size = input.get_axis_size(axis); std::size_t output_axis_size = output.get_axis_size(axis); - auto policy = make_policy(raw::concat, 0, stream); + auto policy = make_policy(raw::concat, input.size(), 0, stream); launch_kernel(raw::concat, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size); @@ -101,7 +101,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inStride_k.assign(std::begin(inStride), std::end(inStride)); auto kernel = raw::concat_with_offsets; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, input.size(), 0, stream); launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k); } diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index 98d3eaab3916..3bccc780efe5 100644 
--- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -49,7 +49,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(output.size() == x.size()); auto kernel = raw::eltwise_max_2; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, x, y); } @@ -62,7 +62,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(output.size() == x.size()); auto kernel = raw::eltwise_sum_2; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, x, y); } @@ -80,7 +80,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } auto kernel = raw::eltwise_sum_coeff_2; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, coeff_x, x, coeff_y, y); } @@ -93,7 +93,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(output.size() == x.size()); auto kernel = raw::eltwise_prod_2; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, x, y); } diff --git a/modules/dnn/src/cuda/normalize.cu b/modules/dnn/src/cuda/normalize.cu index 121d83a779aa..6c4c336be31a 100644 --- a/modules/dnn/src/cuda/normalize.cu +++ b/modules/dnn/src/cuda/normalize.cu @@ -86,29 +86,29 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k auto sums = span(workspace_.data(), outer_size * inner_size); auto zero_kernel = raw::zero; - auto policy = make_policy(zero_kernel, 0, stream); + auto policy = make_policy(zero_kernel, sums.size(), 0, stream); launch_kernel(zero_kernel, policy, sums); if (norm == 1) { auto reduce_kernel = raw::reduce_sum_abs; - policy = make_policy(reduce_kernel, 0, stream); + policy = make_policy(reduce_kernel, input.size(), 0, stream); launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size); auto reciprocal_kernel = raw::reciprocal; - policy = make_policy(reciprocal_kernel, 0, stream); + policy = make_policy(reciprocal_kernel, sums.size(), 0, stream); launch_kernel(reciprocal_kernel, policy, sums, epsilon); } else { auto reduce_kernel = raw::reduce_sum_squared; - policy = make_policy(reduce_kernel, 0, stream); + policy = make_policy(reduce_kernel, input.size(), 0, stream); launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size); auto rsqrt_kernel = raw::rsqrt; - policy = make_policy(rsqrt_kernel, 0, stream); + policy = make_policy(rsqrt_kernel, sums.size(), 0, stream); launch_kernel(rsqrt_kernel, policy, sums, epsilon); } auto scale_kernel = raw::apply_norm; - policy = make_policy(scale_kernel, 0, stream); + policy = make_policy(scale_kernel, output.size(), 0, stream); launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums); } diff --git a/modules/dnn/src/cuda/permute.cu b/modules/dnn/src/cuda/permute.cu index efd2c047f56a..5a590970376a 100644 --- a/modules/dnn/src/cuda/permute.cu +++ b/modules/dnn/src/cuda/permute.cu @@ -59,7 +59,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inStride_k.assign(std::begin(inStride), std::end(inStride)); auto kernel = raw::permute; - auto policy = make_policy(kernel, 0, stream); + auto 
policy = make_policy(kernel, input.size(), 0, stream); launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k); } diff --git a/modules/dnn/src/cuda/prior_box.cu b/modules/dnn/src/cuda/prior_box.cu index 3a5da326dcf5..7e9ea59f10d4 100644 --- a/modules/dnn/src/cuda/prior_box.cu +++ b/modules/dnn/src/cuda/prior_box.cu @@ -91,8 +91,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight, T stepX, T stepY) { + auto num_points = layerWidth * layerHeight; auto kernel = raw::prior_box; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, num_points, 0, stream); launch_kernel(kernel, policy, output, boxWidth, boxHeight, offsetX, offsetY, layerWidth, layerHeight, imageWidth, imageHeight, @@ -129,20 +130,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k if (clip) { auto output_span_c1 = span(output.data(), channel_size); auto kernel = raw::prior_box_clip; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output_span_c1.size(), 0, stream); launch_kernel(kernel, policy, output_span_c1); } auto output_span_c2 = span(output.data() + channel_size, channel_size); if (variance.size() == 1) { auto kernel = raw::prior_box_set_variance1; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output_span_c2.size(), 0, stream); launch_kernel(kernel, policy, output_span_c2, variance[0]); } else { utils::array variance_k; variance_k.assign(std::begin(variance), std::end(variance)); auto kernel = raw::prior_box_set_variance4; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output_span_c2.size()/4, 0, stream); launch_kernel(kernel, policy, output_span_c2, variance_k); } } diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu index c5ec7055fe59..99978bcb239e 100644 --- a/modules/dnn/src/cuda/scale.cu +++ b/modules/dnn/src/cuda/scale.cu @@ -70,7 +70,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(is_shape_same(input, output)); auto kernel = raw::scale1; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, input, beta); } @@ -87,7 +87,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(is_shape_same(input, output)); auto kernel = raw::biasN; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, input, inner_size, bias); } @@ -99,7 +99,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(is_shape_same(input, output)); auto kernel = raw::scale1; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, input, alpha); } @@ -116,7 +116,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(is_shape_same(input, output)); auto kernel = raw::scaleN; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, input, inner_size, weights); } @@ -128,7 +128,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k 
CV_Assert(is_shape_same(input, output)); auto kernel = raw::scale1_with_bias1; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, input, alpha, beta); } @@ -146,7 +146,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(weights.size() == bias.size()); auto kernel = raw::scaleN_with_biasN; - auto policy = make_policy(kernel, 0, stream); + auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, input, inner_size, weights, bias); } diff --git a/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp b/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp index c7c3570e6f3b..68e168e952cc 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp @@ -37,11 +37,29 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { cudaStream_t stream; }; + /* this overload shouldn't be necessary; we should always try to provide a bound on the number of threads */ + /* template inline execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) { int grid_size, block_size; CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); return execution_policy(grid_size, block_size, sharedMem, stream); + }*/ + + template inline + execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) { + CV_Assert(max_threads > 0); + + int grid_size, block_size; + CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); + if (grid_size * block_size > max_threads) { + grid_size = (max_threads + block_size - 1) / block_size; + if (block_size > max_threads) + block_size = std::max(64, max_threads); + } + + CV_Assert(grid_size >= 1 && block_size >= 1); + return execution_policy(grid_size, block_size, sharedMem, stream); } template inline From 170fc3e9a20e0fd651ae5decd2a5da98154b41f6 Mon Sep 17 00:00:00 2001 From: Yashas Date: Thu, 4 Jul 2019 22:50:25 +0530 Subject: [PATCH 042/129] improve prior box performance --- modules/dnn/src/cuda/prior_box.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/dnn/src/cuda/prior_box.cu b/modules/dnn/src/cuda/prior_box.cu index 7e9ea59f10d4..e00511a47ce2 100644 --- a/modules/dnn/src/cuda/prior_box.cu +++ b/modules/dnn/src/cuda/prior_box.cu @@ -77,9 +77,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void prior_box_set_variance4(span output, array variance) { - for (auto i : grid_stride_range(output.size()/4)) { - for (int j = 0; j < variance.size(); j++) - output[i * 4 + j] = variance[j]; + for (auto i : grid_stride_range(output.size())) { + const auto vidx = i % variance.size(); + output[i] = variance[vidx]; } } } @@ -143,7 +143,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k utils::array variance_k; variance_k.assign(std::begin(variance), std::end(variance)); auto kernel = raw::prior_box_set_variance4; - auto policy = make_policy(kernel, output_span_c2.size()/4, 0, stream); + auto policy = make_policy(kernel, output_span_c2.size(), 0, stream); launch_kernel(kernel, policy, output_span_c2, variance_k); } } From 8f664f66a82646f8c381ed889816c2191a488e05 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 5 Jul 2019 12:17:23 +0530 Subject: [PATCH 043/129] enable 
layer fusion --- modules/dnn/src/dnn.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 96ee320cf81f..c5be9d066073 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1977,6 +1977,7 @@ struct Net::Impl void fuseLayers(const std::vector& blobsToKeep_) { if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV && + preferableBackend != DNN_BACKEND_CUDA && preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)) return; From 00557bd1e04a6de125a149bf499f6c6b68a68d71 Mon Sep 17 00:00:00 2001 From: Yashas Date: Fri, 5 Jul 2019 21:04:31 +0530 Subject: [PATCH 044/129] add const layer --- modules/dnn/src/layers/const_layer.cpp | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp index 7a33d6ec84ec..841fc92c114a 100644 --- a/modules/dnn/src/layers/const_layer.cpp +++ b/modules/dnn/src/layers/const_layer.cpp @@ -7,12 +7,19 @@ #include "../precomp.hpp" #include "../op_inf_engine.hpp" +#include "../op_cuda.hpp" #include "layers_common.hpp" #ifdef HAVE_OPENCL #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/tensor_ops.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn { class ConstLayerImpl CV_FINAL : public ConstLayer @@ -39,6 +46,12 @@ class ConstLayerImpl CV_FINAL : public ConstLayer return false; } + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()); + } + #ifdef HAVE_OPENCL bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { @@ -73,6 +86,38 @@ class ConstLayerImpl CV_FINAL : public ConstLayer return Ptr(new InfEngineBackendNode(ieLayer)); } #endif // HAVE_INF_ENGINE + +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + auto output_wrapper = outputs[0].dynamicCast(); + csl::tensor_ops::copy(stream, output_wrapper->getSpan(), constTensor); + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + /* host to device copy is more expensive than device to device copy; hence, we keep a copy + * of the blob in device memory and use it as the source for copy + */ + stream = std::move(stream_); + constTensor = createTensorHeaderFromMat(blobs[0]); + copyMatToTensor(constTensor, blobs[0], stream); + } + + csl::Stream stream; + csl::Tensor constTensor; +#endif + }; Ptr ConstLayer::create(const LayerParams& params) From 0f217062b17f6e72af1ac453641c0eed5e687574 Mon Sep 17 00:00:00 2001 From: Yashas Date: Sat, 6 Jul 2019 00:41:05 +0530 Subject: [PATCH 045/129] add resize layer --- modules/dnn/src/cuda/math.hpp | 6 +- modules/dnn/src/cuda/resize.cu | 126 +++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 6 ++ modules/dnn/src/layers/resize_layer.cpp | 47 ++++++++- 4 files changed, 181 insertions(+), 4 deletions(-) create mode 100644 modules/dnn/src/cuda/resize.cu diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index 79ef5431954f..d715f41c0432 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -10,7 +10,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { 
namespace utils { - template __device__ T abs(T val); + template __device__ T abs(T val) { return (val < 0 ? -val : val); } template <> inline __device__ float abs(float val) { return fabsf(val); } template <> inline __device__ double abs(double val) { return fabs(val); } @@ -18,11 +18,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template <> inline __device__ float exp(float val) { return expf(val); } template <> inline __device__ double exp(double val) { return ::exp(val); } - template __device__ T max(T x, T y); + template __device__ T max(T x, T y) { return ::max(x, y); } template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); } template <> inline __device__ double max(double x, double y) { return fmax(x, y); } - template __device__ T min(T x, T y); + template __device__ T min(T x, T y) { return ::min(x, y); } template <> inline __device__ float min(float x, float y) { return fminf(x, y); } template <> inline __device__ double min(double x, double y) { return fmin(x, y); } diff --git a/modules/dnn/src/cuda/resize.cu b/modules/dnn/src/cuda/resize.cu new file mode 100644 index 000000000000..4ce1d5d4cd2a --- /dev/null +++ b/modules/dnn/src/cuda/resize.cu @@ -0,0 +1,126 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "math.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void resize_nn( + span output, std::size_t out_height, std::size_t out_width, + view input, std::size_t in_height, std::size_t in_width) + { + auto in_image_size = in_height * in_width; + auto out_image_size = out_height * out_width; + + /* o2i = output to input */ + auto o2i_fx = float(in_width) / out_width; + auto o2i_fy = float(in_height) / out_height; + + /* think of the output and input as a collection of 2d images with the last axis + * representing the width and the last but one axis representing the height + * + * the remaining axis together form a collection of these images + */ + for (auto idx : grid_stride_range(output.size())) { + auto n = idx / out_image_size; + auto x = (idx % out_image_size) % out_width; + auto y = (idx % out_image_size) / out_width; + + auto in_x = __float2int_rz(x * o2i_fx); + auto in_y = __float2int_rz(y * o2i_fy); + + auto in_idx = n * in_image_size + in_y * in_width + in_x; + output[idx] = input[in_idx]; + } + } + + template + __global__ void resize_bilinear( + span output, std::size_t out_height, std::size_t out_width, + view input, std::size_t in_height, std::size_t in_width, + float o2i_fy, float o2i_fx) + { + auto in_image_size = in_height * in_width; + auto out_image_size = out_height * out_width; + + /* think of the output and input as a collection of 2d images with the last axis + * representing the width and the last but one axis representing the height + * + * the remaining axis together form a collection of these images + */ + for (auto idx : grid_stride_range(output.size())) { + auto n = idx / out_image_size; + auto x = (idx % out_image_size) % out_width; + auto y = (idx % out_image_size) / out_width; + + auto in_x = x * o2i_fx; + auto in_y = y * o2i_fy; + + int in_x0 = 
__float2int_rz(in_x); + int in_y0 = __float2int_rz(in_y); + + using utils::min; + int in_x1 = min(in_x0 + 1, in_width - 1); + int in_y1 = min(in_y0 + 1, in_height - 1); + + int in_offset_r0 = n * in_image_size + in_y0 * in_width; + int in_offset_r1 = n * in_image_size + in_y1 * in_width; + + auto v_00 = input[in_offset_r0 + in_x0], + v_01 = input[in_offset_r0 + in_x1], + v_10 = input[in_offset_r1 + in_x0], + v_11 = input[in_offset_r1 + in_x1]; + + output[idx] = + v_00 + + (in_y - in_y0) * (v_10 - v_00) + + (in_x - in_x0) * (v_01 - v_00) + + (in_y - in_y0) * (in_x - in_x0) * (v_11 - v_01 - v_10 + v_00); + } + } + } + + template + void resize_nn(const Stream& stream, TensorSpan output, TensorView input) { + auto in_height = input.get_axis_size(-2); + auto in_width = input.get_axis_size(-1); + + auto out_height = output.get_axis_size(-2); + auto out_width = output.get_axis_size(-1); + + auto kernel = raw::resize_nn; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width); + } + + template void resize_nn(const Stream&, TensorSpan, TensorView); + template void resize_nn(const Stream&, TensorSpan, TensorView); + + template + void resize_bilinear(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x) { + auto in_height = input.get_axis_size(-2); + auto in_width = input.get_axis_size(-1); + + auto out_height = output.get_axis_size(-2); + auto out_width = output.get_axis_size(-1); + + auto kernel = raw::resize_bilinear; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); + } + + template void resize_bilinear(const Stream&, TensorSpan, TensorView, float, float); + template void resize_bilinear(const Stream&, TensorSpan, TensorView, float, float); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 9a4654a9b5c3..eb24c47002ee 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -112,6 +112,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke T stepX, T stepY, bool normalize, bool clip); + template + void resize_nn(const Stream& stream, TensorSpan output, TensorView input); + + template + void resize_bilinear(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index 339f2b7932eb..d1c71b438c9a 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -6,9 +6,16 @@ // Third party copyrights are property of their respective owners. 
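For reference, the bilinear kernel above computes each output pixel from its four neighbours using the incremental form v00 + dy*(v10 - v00) + dx*(v01 - v00) + dy*dx*(v11 - v01 - v10 + v00), which is algebraically the standard bilinear blend. A scalar sketch of the same arithmetic for one output pixel (the helper name and signature are illustrative, not part of the patch):

    #include <algorithm>

    // (in_x, in_y) are the fractional source coordinates, i.e. x * o2i_fx and y * o2i_fy
    static float bilinear_at(const float* img, int width, int height, float in_x, float in_y)
    {
        const int x0 = static_cast<int>(in_x), y0 = static_cast<int>(in_y);   // truncate, as __float2int_rz does
        const int x1 = std::min(x0 + 1, width - 1), y1 = std::min(y0 + 1, height - 1);

        const float v00 = img[y0 * width + x0], v01 = img[y0 * width + x1];
        const float v10 = img[y1 * width + x0], v11 = img[y1 * width + x1];

        const float dy = in_y - y0, dx = in_x - x0;
        return v00 + dy * (v10 - v00) + dx * (v01 - v00) + dy * dx * (v11 - v01 - v10 + v00);
    }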
#include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn { class ResizeLayerImpl : public ResizeLayer @@ -58,7 +65,8 @@ class ResizeLayerImpl : public ResizeLayer (interpolation == "bilinear"); } #endif - return backendId == DNN_BACKEND_OPENCV; + return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()); } virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE @@ -159,6 +167,43 @@ class ResizeLayerImpl : public ResizeLayer CV_Error(Error::StsNotImplemented, "Unknown interpolation: " + interpolation); } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (interpolation == "nearest") + csl::kernels::resize_nn(stream, output, input); + else if (interpolation == "bilinear") + csl::kernels::resize_bilinear(stream, output, input, scaleHeight, scaleWidth); + else + CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer."); + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE From c850cb55b499ac36cd291bb4bba7c9c620af31a0 Mon Sep 17 00:00:00 2001 From: Yashas Date: Sat, 6 Jul 2019 17:05:52 +0530 Subject: [PATCH 046/129] add slice layer --- modules/dnn/src/cuda/slice.cu | 107 +++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 6 ++ modules/dnn/src/layers/slice_layer.cpp | 46 ++++++++++ 3 files changed, 159 insertions(+) create mode 100644 modules/dnn/src/cuda/slice.cu diff --git a/modules/dnn/src/cuda/slice.cu b/modules/dnn/src/cuda/slice.cu new file mode 100644 index 000000000000..3020a56bbc27 --- /dev/null +++ b/modules/dnn/src/cuda/slice.cu @@ -0,0 +1,107 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
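The slice kernel that follows maps each flat output index to a flat input index using row-major strides and per-axis start offsets: the index along axis 0 is i / out_stride[0], the index along axis j is (i % out_stride[j-1]) / out_stride[j], and each index is shifted by its axis offset before being multiplied by the input stride. A host-side sketch of the same arithmetic (names and the std::vector-based signature are illustrative, not part of the patch):

    #include <cstddef>
    #include <vector>

    // returns the flat input index corresponding to flat output index i
    static std::size_t slice_input_index(std::size_t i,
                                         const std::vector<std::size_t>& out_stride,
                                         const std::vector<std::size_t>& in_stride,
                                         const std::vector<std::size_t>& offset)
    {
        std::size_t in_idx = (offset[0] + i / out_stride[0]) * in_stride[0];
        for (std::size_t j = 1; j < out_stride.size(); j++)
            in_idx += (offset[j] + (i % out_stride[j - 1]) / out_stride[j]) * in_stride[j];
        return in_idx;
    }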
+ +#include "array.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + using array = utils::array; + + template + __global__ void slice( + span output, array out_strides, + view input, array in_strides, array in_offset) + { + for (auto i : grid_stride_range(output.size())) { + /* compute output axis indices corresponding to element 'i' */ + int out_index = i / out_strides[0]; + int in_index = in_offset[0] + out_index; + int iidx = in_index * in_strides[0]; + for (int j = 1; j < N; j++) { + out_index = (i % out_strides[j - 1]) / out_strides[j]; + in_index = in_offset[j] + out_index; + iidx += in_index * in_strides[j]; + } + + output[i] = input[iidx]; + } + } + } + + template static + void launch_slice_kernel( + const Stream& stream, + span output, const std::vector& outStride, + view input, const std::vector& inStride, const std::vector& inOffset) + { + CV_Assert(outStride.size() == N); + CV_Assert(inStride.size() == N); + CV_Assert(inOffset.size() == N); + + utils::array outStride_k, inStride_k, inOffset_k; + outStride_k.assign(std::begin(outStride), std::end(outStride)); + inStride_k.assign(std::begin(inStride), std::end(inStride)); + inOffset_k.assign(std::begin(inOffset), std::end(inOffset)); + + auto kernel = raw::slice; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k); + } + + template + void slice(const Stream& stream, + TensorSpan output, TensorView input, + const std::vector& offsets) + { + CV_Assert(output.rank == input.rank); + CV_Assert(output.rank >= 3 && output.rank <= 5); + CV_Assert(output.size() % 2 == 0); + + int rank = output.rank; + auto inShape = input.shape(); + auto outShape = output.shape(); + + std::vector inStride(rank), outStride(rank); + inStride.back() = 1; + outStride.back() = 1; + /* garbage, ..., garbage, 1 */ + + std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride)); + std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride)); + /* dim[0], dim[1], ..., dim[-1], 1 */ + + std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies()); + std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies()); + /* stride[0], stride[1], ..., stride[-2], 1 */ + + if (offsets.size() != rank) { + auto diff = rank - offsets.size(); + outStride.erase(outStride.begin(), outStride.begin() + diff); + inStride.erase(inStride.begin(), inStride.begin() + diff); + } + + if (rank == 5) { + launch_slice_kernel(stream, output, outStride, input, inStride, offsets); + } else if (rank == 4) { + launch_slice_kernel(stream, output, outStride, input, inStride, offsets); + } else if (rank == 3) { + launch_slice_kernel(stream, output, outStride, input, inStride, offsets); + } + } + + template void slice(const Stream&, TensorSpan, TensorView, const std::vector&); + template void slice(const Stream&, TensorSpan, TensorView, const std::vector&); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index eb24c47002ee..90c298488935 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -118,6 
+118,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void resize_bilinear(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x); + + template + void slice(const Stream& stream, + TensorSpan output, TensorView input, + const std::vector& offsets); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 430555161b5a..8647f82bf325 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -41,6 +41,7 @@ //M*/ #include "../precomp.hpp" +#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include "layers_common.hpp" #include @@ -49,6 +50,12 @@ #include "opencl_kernels_dnn.hpp" #endif +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -112,6 +119,7 @@ class SliceLayerImpl : public SliceLayer virtual bool supportBackend(int backendId) CV_OVERRIDE { return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_INFERENCE_ENGINE && #ifdef HAVE_INF_ENGINE INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) && @@ -260,6 +268,44 @@ class SliceLayerImpl : public SliceLayer } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(inputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + for (int i = 0; i < outputs.size(); ++i) + { + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + std::vector offsets; + for (const auto& range : sliceRanges[i]) + offsets.push_back(range.start); + csl::kernels::slice(stream, output, input, offsets); + } + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + #ifdef HAVE_INF_ENGINE #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE From 085e632452457bdcc80778529499f20f9225873c Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 8 Jul 2019 14:26:20 +0530 Subject: [PATCH 047/129] add padding layer --- modules/dnn/src/cuda/fill.cu | 38 +++++++ modules/dnn/src/cuda/padding.cu | 134 +++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/kernels.hpp | 10 +- modules/dnn/src/layers/padding_layer.cpp | 86 +++++++++++++++ 4 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 modules/dnn/src/cuda/fill.cu create mode 100644 modules/dnn/src/cuda/padding.cu diff --git a/modules/dnn/src/cuda/fill.cu b/modules/dnn/src/cuda/fill.cu new file mode 100644 index 000000000000..a28fea18468c --- /dev/null +++ b/modules/dnn/src/cuda/fill.cu @@ -0,0 +1,38 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
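/* [Editor's illustration, not part of the patch]
 * The slice kernel above turns a flat output index into per-axis indices using the output
 * strides, shifts each axis index by its slice offset, and re-linearizes with the input
 * strides. A host-side sketch of that index arithmetic, assuming plain std::vector shapes:
 */
#include <cstddef>
#include <vector>

std::size_t slice_source_index(std::size_t out_idx,
                               const std::vector<std::size_t>& out_strides,
                               const std::vector<std::size_t>& in_strides,
                               const std::vector<std::size_t>& offsets)
{
    std::size_t axis_idx = out_idx / out_strides[0];
    std::size_t in_idx = (offsets[0] + axis_idx) * in_strides[0];
    for (std::size_t j = 1; j < out_strides.size(); j++) {
        axis_idx = (out_idx % out_strides[j - 1]) / out_strides[j];
        in_idx += (offsets[j] + axis_idx) * in_strides[j];
    }
    return in_idx;
}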
+ +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/pointer.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + __global__ void fill(span output, T value) + { + for (auto i : grid_stride_range(output.size())) + output[i] = value; + } + } + + template + void fill(const Stream& stream, span output, T value) + { + auto kernel = raw::fill; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, value); + } + + template void fill(const Stream&, span, float); + template void fill(const Stream&, span, double); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/padding.cu b/modules/dnn/src/cuda/padding.cu new file mode 100644 index 000000000000..6954bfb0c6c3 --- /dev/null +++ b/modules/dnn/src/cuda/padding.cu @@ -0,0 +1,134 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "array.hpp" +#include "math.hpp" + +#include "../cuda4dnn/csl/kernels.hpp" +#include "../cuda4dnn/csl/kernel_utils.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + + namespace raw { + template + using array = utils::array; + + template + __global__ void copy_with_reflection101( + span output, array out_strides, array start, array end, + view input, array in_strides) + { + for (auto i : grid_stride_range(output.size())) { + /* compute output axis indices corresponding to element 'i' */ + array out_index; + out_index[0] = i / out_strides[0]; + for (int j = 1; j < N; j++) + out_index[j] = (i % out_strides[j - 1]) / out_strides[j]; + + /* compute input axis indices corresponding to output axis indices */ + using utils::abs; + array in_index; + for (int j = 0; j < N; j++) { + /* if out_index < start, the point is in the left reflection region + * the reflected value's index is the absolute value of the difference + * + * otherwise, if the value is in the copy region, out_index - start gives the input index + */ + in_index[j] = abs(out_index[j] - start[j]); + + /* if out_index >= end, it's in the right reflection region */ + if (out_index[j] >= end[j]) + in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2; + } + + /* compute input element number from input axis indices */ + int iidx = 0; + for (int j = 0; j < N; j++) + iidx += in_index[j] * in_strides[j]; + + output[i] = input[iidx]; + } + } + } + + template static + void launch_copy_with_reflection101_kernel( + const Stream& stream, + span output, const std::vector& outStride, + view input, const std::vector& inStride, + const std::vector>& ranges) + { + CV_Assert(outStride.size() == N); + CV_Assert(inStride.size() == N); + CV_Assert(ranges.size() == N); + + utils::array outStride_k, start_k, end_k, inStride_k; + outStride_k.assign(std::begin(outStride), std::end(outStride)); + inStride_k.assign(std::begin(inStride), std::end(inStride)); + + for (int i = 0; i < N; i++) { + start_k[i] = ranges[i].first; + end_k[i] = ranges[i].second; + } + + auto kernel = raw::copy_with_reflection101; + auto policy = make_policy(kernel, output.size(), 
0, stream); + launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k); + } + + template + void copy_with_reflection101( + const Stream& stream, + TensorSpan output, TensorView input, + const std::vector>& ranges) + { + CV_Assert(output.rank == input.rank); + CV_Assert(output.rank >= 3 && output.rank <= 5); + CV_Assert(ranges.size() > 0 && ranges.size() < 5); + + int rank = output.rank; + auto inShape = input.shape(); + auto outShape = output.shape(); + + std::vector inStride(rank), outStride(rank); + inStride.back() = 1; + outStride.back() = 1; + /* garbage, ..., garbage, 1 */ + + std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride)); + std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride)); + /* dim[0], dim[1], ..., dim[-1], 1 */ + + std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies()); + std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies()); + /* stride[0], stride[1], ..., stride[-2], 1 */ + + if (ranges.size() != rank) { + auto diff = rank - ranges.size(); + outStride.erase(outStride.begin(), outStride.begin() + diff); + inStride.erase(inStride.begin(), inStride.begin() + diff); + } + + if (ranges.size() == 4) { + launch_copy_with_reflection101_kernel(stream, output, outStride, input, inStride, ranges); + } else if (ranges.size() == 3) { + launch_copy_with_reflection101_kernel(stream, output, outStride, input, inStride, ranges); + } else if (ranges.size() == 2) { + launch_copy_with_reflection101_kernel(stream, output, outStride, input, inStride, ranges); + } else if (ranges.size() == 1) { + launch_copy_with_reflection101_kernel(stream, output, outStride, input, inStride, ranges); + } + } + + template void copy_with_reflection101(const Stream&, TensorSpan, TensorView, const std::vector>& ranges); + template void copy_with_reflection101(const Stream&, TensorSpan, TensorView, const std::vector>& ranges); + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 90c298488935..914ba34606b1 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -118,12 +118,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke template void resize_bilinear(const Stream& stream, TensorSpan output, TensorView input, float scale_y, float scale_x); - template void slice(const Stream& stream, TensorSpan output, TensorView input, const std::vector& offsets); + template + void fill(const Stream& stream, span output, T value); + + template + void copy_with_reflection101( + const Stream& stream, + TensorSpan output, TensorView input, + const std::vector>& ranges); + }}}}} /* namespace cv::dnn::cuda4dnn::csl::kernels */ #endif /* OPENCV_DNN_CUDA4DNN_KERNELS_HPP */ diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp index cffb84d692c4..ad714a9c0450 100644 --- a/modules/dnn/src/layers/padding_layer.cpp +++ b/modules/dnn/src/layers/padding_layer.cpp @@ -11,10 +11,17 @@ Implementation of padding layer, which adds paddings to input blob. 
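/* [Editor's illustration, not part of the patch]
 * For a single axis, the mapping used in copy_with_reflection101 above sends an output
 * coordinate o, with copy region [start, end), to |o - start| on the left border and to
 * (end - start) - (o - end) - 2 on the right border. A one-axis sketch:
 */
#include <cstdlib>

int reflect101_index(int o, int start, int end)
{
    int i = std::abs(o - start);            /* left border and the plain copy region */
    if (o >= end)
        i = (end - start) - (o - end) - 2;  /* right border, edge sample not repeated */
    return i;
}
/* e.g. with an input of size 4 padded by one on each side (start = 1, end = 5):
 * output 0 maps to input 1 and output 5 maps to input 2, i.e. BORDER_REFLECT_101 behaviour. */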
#include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include +#ifdef HAVE_CUDA +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/kernels.hpp" +using namespace cv::dnn::cuda4dnn; +#endif + namespace cv { namespace dnn @@ -100,6 +107,7 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer (dstRanges.size() == 4 && paddings[0].first == 0 && paddings[0].second == 0)); #endif return backendId == DNN_BACKEND_OPENCV || + (backendId == DNN_BACKEND_CUDA && haveCUDA()) || (backendId == DNN_BACKEND_HALIDE && haveHalide() && dstRanges.size() == 4); } @@ -161,6 +169,84 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer CV_Error(Error::StsNotImplemented, "Unknown padding type: " + paddingType); } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto effective_rank = get_effective_rank(input); + CV_Assert(get_effective_rank(input) == get_effective_rank(output)); + + /* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW) + * + * there could be a case where the batch axis, channel axis, and the first spatial axis are all one + * this would result in effective rank being less than the number of axes requiring padding + */ + effective_rank = std::max(effective_rank, dstRanges.size()); + + for (int i = effective_rank - dstRanges.size(); i < effective_rank; i++) + { + if (dstRanges[i] == Range::all()) + CV_Assert(input.get_axis_size(i) == output.get_axis_size(i)); + else + CV_Assert(input.get_axis_size(i) == dstRanges[i].size()); + } + + if (paddingType == "constant") + { + csl::kernels::fill(stream, output, paddingValue); + + std::vector offsets(effective_rank, 0); + for (int i = 0; i < dstRanges.size(); i++) + { + const auto delta = effective_rank - dstRanges.size(); + if (dstRanges[i] != Range::all()) + offsets[delta + i] = dstRanges[i].start; + } + + csl::kernels::concat_with_offsets(stream, output, input, offsets); + } + else if (paddingType == "reflect") + { + std::vector> ranges(effective_rank); + for (int i = 0; i < effective_rank; i++) + { + const auto delta = effective_rank - dstRanges.size(); + if (i < delta || dstRanges[i - delta] == Range::all()) + ranges[i] = { 0, input.get_axis_size(i) }; + else + ranges[i] = { dstRanges[i].start, dstRanges[i].end }; + } + csl::kernels::copy_with_reflection101(stream, output, input, ranges); + } + else + CV_Error(Error::StsNotImplemented, "Requested padding mode is not supported by padding layer."); + } + + void initCUDA( + csl::Stream stream_, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + stream = std::move(stream_); + } + + csl::Stream stream; +#endif + virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_HALIDE From 1dfc4098636a93122656630564d2fe16da6c27ac Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 8 Jul 2019 15:20:12 +0530 Subject: [PATCH 048/129] add deconvolution layer --- modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 1 + .../csl/cudnn/transpose_convolution.hpp | 94 ++++++++++ modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 71 ++++++++ 
modules/dnn/src/layers/convolution_layer.cpp | 163 +++++++++++++++++- 4 files changed, 328 insertions(+), 1 deletion(-) create mode 100644 modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 142766f5f31c..62286bec700c 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -9,6 +9,7 @@ #include "cudnn/cudnn.hpp" #include "cudnn/convolution.hpp" +#include "cudnn/transpose_convolution.hpp" #include "cudnn/lrn.hpp" #include "cudnn/pooling.hpp" #include "cudnn/softmax.hpp" diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp new file mode 100644 index 000000000000..2abbd61b1cdd --- /dev/null +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp @@ -0,0 +1,94 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP +#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP + +#include "cudnn.hpp" +#include "convolution.hpp" + +#include "../pointer.hpp" +#include "../workspace.hpp" + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { + + template + class TransposeConvolutionAlgorithm { + public: + TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } + TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&) = default; + TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default; + + TransposeConvolutionAlgorithm( + const Handle& handle, + const ConvolutionDescriptor& conv, + const FilterDescriptor& filter, + const TensorDescriptor& input, + const TensorDescriptor& output) + { + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionBackwardDataAlgorithm( + HandleAccessor::get(handle), + filter.get(), input.get(), conv.get(), output.get(), + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, + 0, /* no memory limit */ + &dalgo + ) + ); + + CUDA4DNN_CHECK_CUDNN( + cudnnGetConvolutionBackwardDataWorkspaceSize( + HandleAccessor::get(handle), + filter.get(), input.get(), conv.get(), output.get(), + dalgo, &workspace_size + ) + ); + } + + TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default; + TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default; + + cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; } + + std::size_t get_workspace_size() const noexcept { return workspace_size; } + + private: + cudnnConvolutionBwdDataAlgo_t dalgo; + std::size_t workspace_size; + }; + + template inline + void transpose_convolve( + const Handle& handle, + const ConvolutionDescriptor& convDesc, + const TransposeConvolutionAlgorithm& convAlgo, + const Workspace& workspace, + const FilterDescriptor& filterDesc, + DevicePtr filterPtr, + const TensorDescriptor& inputDesc, + DevicePtr inputPtr, + T alpha, T beta, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + CUDA4DNN_CHECK_CUDNN( + cudnnConvolutionBackwardData( + HandleAccessor::get(handle), + &alpha, + filterDesc.get(), filterPtr.get(), + inputDesc.get(), inputPtr.get(), + convDesc.get(), convAlgo.get(), + WorkspaceAccessor::get(workspace).get(), workspace.size(), + &beta, outputDesc.get(), outputPtr.get() + ) + ); + } + +}}}}} /* 
namespace cv::dnn::cuda4dnn::csl::cudnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index 7c0f4c0d491f..490618d75233 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -277,6 +277,77 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { ConvolutionAlgorithm algo; }; + template + class TransposeConvolution { + using TensorDescriptor = cudnn::TensorDescriptor; + using FilterDescriptor = cudnn::FilterDescriptor; + using ConvolutionDescriptor = cudnn::ConvolutionDescriptor; + using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm; + + public: + struct params_type { + std::vector input_shape; + std::vector output_shape; + + std::vector filter_shape; + + std::vector padding; + std::vector stride; + std::vector dialation; + + std::size_t groups; + }; + + TransposeConvolution() = default; + TransposeConvolution(const TransposeConvolution&) = delete; + TransposeConvolution(TransposeConvolution&&) = default; + TransposeConvolution(cudnn::Handle handle, const params_type& params) { + cudnnHandle = std::move(handle); + + filterDesc = FilterDescriptor(params.filter_shape); + convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dialation, params.groups); + + /* input_shape is the output shape for convolution + * output_shape is the input shape for convolution + */ + convInputTensorDesc = TensorDescriptor(params.output_shape); + + std::vector conv_output_dims; + getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims); + + /* the convolution output must be identical to what cuDNN expects */ + CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape))); + + convOutputTensorDesc = TensorDescriptor(params.input_shape); + + algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc); + } + + TransposeConvolution& operator=(const TransposeConvolution&) = delete; + TransposeConvolution& operator=(TransposeConvolution&&) = default; + + std::size_t get_workspace_size() const noexcept { + return algo.get_workspace_size(); + } + + void transpose_convolve(TensorSpan output, TensorView input, TensorView filters, const Workspace& scratchpad) { + cudnn::transpose_convolve( + cudnnHandle, + convDesc, algo, scratchpad, + filterDesc, filters.get(), + convOutputTensorDesc, input.get(), + 1.0, 0.0, convInputTensorDesc, output.get() + ); + } + + private: + cudnn::Handle cudnnHandle; + TensorDescriptor convInputTensorDesc, convOutputTensorDesc; + FilterDescriptor filterDesc; + ConvolutionDescriptor convDesc; + TransposeConvolutionAlgorithm algo; + }; + template class Pooling { using TensorDescriptor = cudnn::TensorDescriptor; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 61a9d27c6387..abeb6dbe862d 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1532,7 +1532,8 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl } else #endif // HAVE_INF_ENGINE - return kernel_size.size() == 2 && (backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE); + return (backendId == DNN_BACKEND_CUDA && haveCUDA()) || + (kernel_size.size() == 2 && (backendId == DNN_BACKEND_OPENCV || backendId == 
DNN_BACKEND_HALIDE)); } bool getMemoryShapes(const std::vector &inputs, @@ -2058,6 +2059,166 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl } } +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace + ) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + convoluter.transpose_convolve(output, input, filtersTensor, workspace); + if (hasBias() || fusedBias) + csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); + } + + void initCUDA( + csl::Stream stream, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + cudnnHandle = std::move(cudnn_handle); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input_shape = input_wrapper->getShape(); + + /* 1d, 2d, 3d deconvolutions are supported */ + CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 5); + + CV_Assert(blobs.size() >= 1); + const auto& filtersMat = blobs[0]; + + const auto rank = input_shape.size(); + const auto input_feature_maps = input_shape[1]; + const auto output_feature_maps = numOutput; + const auto output_feature_maps_per_group = filtersMat.size[1]; + const auto groups = output_feature_maps / output_feature_maps_per_group; + CV_Assert(output_feature_maps % output_feature_maps_per_group == 0); + + auto output_shape = input_shape; + output_shape[1] = output_feature_maps; + if (padMode.empty()) + { + for (int i = 0; i < kernel_size.size(); i++) + output_shape[i + 2] = + (strides[i] * (input_shape[2 + i] - 1) + kernel_size[i] - pads_begin[i] - pads_end[i] + adjust_pads[i]); + } + else if (padMode == "VALID") + { + for (int i = 0; i < kernel_size.size(); i++) + output_shape[i + 2] = + (strides[i] * (input_shape[2 + i] - 1) + kernel_size[i] + adjust_pads[i]); + } + else if (padMode == "SAME") + { + for (int i = 0; i < kernel_size.size(); i++) + output_shape[i + 2] = (strides[i] * (input_shape[2 + i] - 1) + 1 + adjust_pads[i]); + } + else + CV_Error(Error::StsNotImplemented, "[0] Specified padding mode not supported by DeconvolutionLayer"); + + Mat filterWeightsSource = filtersMat; + if (fusedWeights) + { + filterWeightsSource = weightsMat.clone(); + transpose(weightsMat, filterWeightsSource); + } + + filtersTensor = createTensorHeaderFromMat(filterWeightsSource); + copyMatToTensor(filtersTensor, filterWeightsSource, stream); + + if (hasBias() || fusedBias) + { + biasTensor = createTensorHeaderFromMat(biasesMat); + copyMatToTensor(biasTensor, biasesMat, stream); + + std::vector biasShape(rank, 1); + biasShape[1] = output_feature_maps; + biasTensor.reshape(std::begin(biasShape), std::end(biasShape)); + } + + /* left and right are misleading as the padding is applicable for any number of dimensions + * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` + */ + std::vector common_padding(rank, 0); + std::vector padding_left(rank, 0), padding_right(rank, 0); + if (padMode.empty()) + { + for (int i = 2; i < common_padding.size(); i++) + { + common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); + padding_left[i] = pads_begin[i - 2] - common_padding[i]; + padding_right[i] = pads_end[i - 2] - common_padding[i]; + } + } + else if (padMode == "VALID") { /* nothing to do as the paddings are 
already preset to zero */ } + else if (padMode == "SAME") + { + /* TensorFlow Logic (for convolution): + * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] + * + * if total padding is odd, the input is padded towards the end + */ + for (int i = 2; i < rank; i++) + { + const auto j = i - 2; /* filter index */ + const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; + const auto required_total_padding = + std::max(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - output_shape[i]); + + common_padding[i] = required_total_padding / 2; + padding_left[i] = 0; + padding_right[i] = required_total_padding % 2; + } + } + else + { + CV_Error(Error::StsNotImplemented, "[1] Specified padding mode not supported by DeconvolutionLayer"); + } + + for (int i = 0; i < rank; i++) + if (padding_left[i] != 0 || padding_right[i] != 0) + CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported."); + + csl::TransposeConvolution::params_type params; + + auto& ishape = params.input_shape; + ishape.assign(std::begin(input_shape), std::end(input_shape)); + + auto& oshape = params.output_shape; + oshape.assign(std::begin(output_shape), std::end(output_shape)); + + auto& fshape = params.filter_shape; + fshape.resize(ishape.size()); + fshape[0] = input_feature_maps; + fshape[1] = output_feature_maps_per_group; + std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); + CV_Assert(fshape.size() == kernel_size.size() + 2); + + params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); + params.stride = strides; + params.dialation = dilations; + params.groups = groups; + + convoluter = csl::TransposeConvolution(cudnnHandle, params); + scratch_mem_in_bytes = convoluter.get_workspace_size(); + } + + csl::cudnn::Handle cudnnHandle; + csl::Tensor filtersTensor, biasTensor; + csl::TransposeConvolution convoluter; +#endif + virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE { #ifdef HAVE_HALIDE From 39cc3a78166ed6067353f4ecd456f8afc17a2831 Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 8 Jul 2019 15:33:35 +0530 Subject: [PATCH 049/129] fix channelwise ReLU initialization --- modules/dnn/src/layers/elementwise_layers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 757894b80f9f..cad95cfa5f1d 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -1352,7 +1352,7 @@ struct ChannelsPReLUFunctor const csl::Stream& stream ) { - if(slopeTensor) + if(!slopeTensor) { slopeTensor = std::make_shared>(); *slopeTensor = createTensorHeaderFromMat(scale); From fd1acaf6ed561d2d57d2b9f9ad53b85bf83db37f Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 8 Jul 2019 23:19:42 +0530 Subject: [PATCH 050/129] add vector traits --- modules/dnn/src/cuda/vector_traits.hpp | 157 +++++++++++++++++++++++ modules/dnn/src/cuda4dnn/csl/pointer.hpp | 6 + modules/dnn/src/cuda4dnn/csl/span.hpp | 16 +++ 3 files changed, 179 insertions(+) create mode 100644 modules/dnn/src/cuda/vector_traits.hpp diff --git a/modules/dnn/src/cuda/vector_traits.hpp b/modules/dnn/src/cuda/vector_traits.hpp new file mode 100644 index 000000000000..34adb74fd3cf --- /dev/null +++ b/modules/dnn/src/cuda/vector_traits.hpp @@ -0,0 +1,157 @@ +// This file is part of OpenCV project. 
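/* [Editor's illustration, not part of the patch]
 * A worked example, with hypothetical numbers, of the SAME-padding branch in the
 * deconvolution initCUDA() above: with input size 10, stride 2, kernel 3, dilation 1 and
 * adjust_pads 0, the output size is 2 * (10 - 1) + 1 = 19; then
 * effective_kernel_size = 1 * (3 - 1) + 1 = 3 and
 * required_total_padding = max(0, (10 - 1) * 2 + 3 - 19) = 2, so common_padding = 1,
 * padding_left = 0 and padding_right = 2 % 2 = 0; the padding is symmetric and supported.
 */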
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TYPE_TRAITS_HPP +#define OPENCV_DNN_SRC_CUDA_VECTOR_TYPE_TRAITS_HPP + +#include + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + /* HOW TO ADD A NEW VECTOR TYPE? + * - specialize `get_vector_type` + * - specialize `detail::get_element_type` + */ + + /** returns a vector type in the 'type' field for a given scalar type and vector size + * + * if a vector type does not exist for the given combination, the `type` member will not exist + */ + template + struct get_vector_type {}; + + template <> struct get_vector_type { typedef float type; }; + template <> struct get_vector_type { typedef float2 type; }; + template <> struct get_vector_type { typedef float4 type; }; + + template <> struct get_vector_type { typedef double type; }; + template <> struct get_vector_type { typedef double2 type; }; + template <> struct get_vector_type { typedef double4 type; }; + + namespace detail { + template + struct get_element_type { }; + + /* only non-const specializations are required; const qualifications are automatically handled */ + template struct get_element_type { typedef float type; }; + template struct get_element_type { typedef float type; }; + template struct get_element_type { typedef float type; }; + + template struct get_element_type { typedef double type; }; + template struct get_element_type { typedef double type; }; + template struct get_element_type { typedef double type; }; + + /* handle const qualified types */ + template + struct get_element_type::value, void>::type>{ + typedef + typename std::add_const< + typename get_element_type< + typename std::remove_const::type + >::type + >::type + type; + }; + } + + namespace detail { + template __host__ __device__ + constexpr std::size_t size() { return sizeof(V) / sizeof(typename get_element_type::type); } + } + + /** returns a struct with information about a given vector or scalar type + * + * - `element_type` gives the scalar type corresponding to the type + * - `vector_type` gives the type + * - `size()` returns the number of elements of `element_type` that can exist in the type + */ + template + struct vector_traits { + typedef typename detail::get_element_type::type element_type; + typedef V vector_type; + + __host__ __device__ + static constexpr std::size_t size() { return detail::size(); } + }; + + namespace detail { + template + struct accessor { }; + + template + struct accessor { + __host__ __device__ + static constexpr typename vector_traits::element_type get(V& v, std::size_t i) { return v; } + + __host__ __device__ + static constexpr void set(V& v, std::size_t i, vector_traits::element_type value) { v = value; } + }; + + template + struct accessor { + __host__ __device__ + static constexpr typename vector_traits::element_type get(V& v, std::size_t i) { + switch (i) { + case 0: return v.x; + case 1: return v.y; + } + /* should never end up here */ + return v.x; + } + + __host__ __device__ + static constexpr void set(V& v, std::size_t i, vector_traits::element_type value) { + switch (i) { + case 0: v.x = value; + case 1: v.y = value; + } + /* should never end up here */ + } + }; + + template + struct accessor { + __host__ __device__ + static constexpr typename vector_traits::element_type get(V& v, std::size_t i) { + switch (i) { + case 0: return v.w; + case 1: return v.x; + case 
2: return v.y; + case 3: return v.z; + } + /* should never end up here */ + return v.x; + } + + __host__ __device__ + static constexpr void set(V& v, std::size_t i, vector_traits::element_type value) { + switch (i) { + case 0: v.w = value; + case 1: v.x = value; + case 2: v.y = value; + case 3: v.z = value; + } + /* should never end up here */ + } + }; + } + + /** get a value from a vector type using an index */ + template __host__ __device__ + constexpr typename vector_traits::element_type get(V& v, std::size_t i) { + return detail::accessor::size()>::get(v, i); + } + + /** set a value in a vector type using an index */ + template __host__ __device__ + constexpr void set(V& v, std::size_t i, typename vector_traits::element_type value) { + return detail::accessor::size()>::set(v, i, value); + } + + +}}}}} /* cv::dnn::cuda4dnn::csl::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TYPE_TRAITS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/pointer.hpp b/modules/dnn/src/cuda4dnn/csl/pointer.hpp index aeb298c97e25..77831f1d9970 100644 --- a/modules/dnn/src/cuda4dnn/csl/pointer.hpp +++ b/modules/dnn/src/cuda4dnn/csl/pointer.hpp @@ -127,6 +127,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { pointer ptr; }; + template + bool is_aligned(DevicePtr ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr.get()); + return addr % alignment == 0; + } + /** copies \p n elements from \p src to \p dest4 * * \param[in] src device pointer diff --git a/modules/dnn/src/cuda4dnn/csl/span.hpp b/modules/dnn/src/cuda4dnn/csl/span.hpp index f6771c761a8c..c26075856812 100644 --- a/modules/dnn/src/cuda4dnn/csl/span.hpp +++ b/modules/dnn/src/cuda4dnn/csl/span.hpp @@ -58,6 +58,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { template using view = span; + template + bool is_address_aligned(view v, std::size_t alignment) { + return is_aligned(v.data(), alignment); + } + + template + bool is_size_aligned(view v, std::size_t alignment) { + return v.size() % alignment == 0; + } + + /** @brief returns true if the address and the size of the span/view is aligned to a boundary */ + template + bool is_fully_aligned(view v, std::size_t alignment) { + return is_address_aligned(v, alignment) && is_size_aligned(v, alignment); + } + }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_SPAN_HPP */ From ad0e4c61b293bfb27687f09d250228dddc7da17b Mon Sep 17 00:00:00 2001 From: Yashas Date: Mon, 8 Jul 2019 23:20:58 +0530 Subject: [PATCH 051/129] add vectorized versions of relu, clipped_relu, power --- modules/dnn/src/cuda/activations.cu | 157 ++++++++++++++++++++++++++-- 1 file changed, 146 insertions(+), 11 deletions(-) diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index 695e0ed4f673..c0f082484769 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -3,6 +3,7 @@ // of this distribution and at http://opencv.org/license.html. 
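/* [Editor's illustration, not part of the patch]
 * A minimal sketch (hypothetical kernel, 4-element path only) of how the vector traits, the
 * get()/set() accessors and the alignment checks above are meant to be combined; it assumes
 * the span/view and grid_stride_range utilities from earlier in this series:
 */
template <class T>
__global__ void double_elements_vec4(span<T> output, view<T> input) {
    using vector_type = typename get_vector_type<T, 4>::type;

    auto dstPtr = reinterpret_cast<vector_type*>(output.data().get());
    auto srcPtr = reinterpret_cast<const vector_type*>(input.data().get());

    for (auto i : grid_stride_range(output.size() / 4)) {
        vector_type vec = srcPtr[i];
        for (int j = 0; j < 4; j++)                        /* 4 == vector_traits<vector_type>::size() */
            set(vec, j, get(vec, j) * static_cast<T>(2));  /* element access through the traits */
        dstPtr[i] = vec;
    }
}

/* host side: the vectorized kernel is only valid when both buffers are 4-element aligned,
 * e.g.  if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) { launch vec4 }
 *       else { launch the scalar kernel }
 */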
#include "math.hpp" +#include "vector_traits.hpp" #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" @@ -57,12 +58,82 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } } + template + __global__ void relu_vec4(span dest, view src, T slope) { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(dest.data().get()); + const vector_type* srcPtr = reinterpret_cast(src.data().get()); + + for (auto i : grid_stride_range(dest.size() / 4)) { + vector_type vec = srcPtr[i]; + vec.w = vec.w >= 0.0 ? vec.w : slope * vec.w; + vec.x = vec.x >= 0.0 ? vec.x : slope * vec.x; + vec.y = vec.y >= 0.0 ? vec.y : slope * vec.y; + vec.z = vec.z >= 0.0 ? vec.z : slope * vec.z; + dstPtr[i] = vec; + } + } + + template + __global__ void relu_vec2(span dest, view src, T slope) { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(dest.data().get()); + const vector_type* srcPtr = reinterpret_cast(src.data().get()); + + for (auto i : grid_stride_range(dest.size() / 2)) { + vector_type vec = srcPtr[i]; + vec.x = vec.x >= 0.0 ? vec.x : slope * vec.x; + vec.y = vec.y >= 0.0 ? vec.y : slope * vec.y; + dstPtr[i] = vec; + } + } + template __global__ void relu(span dest, view src, T slope) { for (auto i : grid_stride_range(dest.size())) dest[i] = src[i] >= 0.0 ? src[i] : slope * src[i]; } + template + __global__ void clipped_relu_vec4(span dest, view src, T floor, T ceiling) { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(dest.data().get()); + const vector_type* srcPtr = reinterpret_cast(src.data().get()); + + for (auto i : grid_stride_range(dest.size()/4)) { + using utils::max; + using utils::min; + + vector_type vec = srcPtr[i]; + vec.w = min(max(vec.w, floor), ceiling); + vec.x = min(max(vec.x, floor), ceiling); + vec.y = min(max(vec.y, floor), ceiling); + vec.z = min(max(vec.z, floor), ceiling); + dstPtr[i] = vec; + } + } + + template + __global__ void clipped_relu_vec2(span dest, view src, T floor, T ceiling) { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(dest.data().get()); + const vector_type* srcPtr = reinterpret_cast(src.data().get()); + + for (auto i : grid_stride_range(dest.size()/2)) { + using utils::max; + using utils::min; + + vector_type vec = srcPtr[i]; + vec.x = min(max(vec.x, floor), ceiling); + vec.y = min(max(vec.y, floor), ceiling); + dstPtr[i] = vec; + } + } + template __global__ void clipped_relu(span dest, view src, T floor, T ceiling) { for (auto i : grid_stride_range(dest.size())) { @@ -80,6 +151,42 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } } + template + __global__ void power_vec4(span dest, view src, T exp, T scale, T shift) { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(dest.data().get()); + const vector_type* srcPtr = reinterpret_cast(src.data().get()); + + for (auto i : grid_stride_range(dest.size()/4)) { + using utils::pow; + + vector_type vec = srcPtr[i]; + vec.w = pow(shift + scale * vec.w, exp); + vec.x = pow(shift + scale * vec.x, exp); + vec.y = pow(shift + scale * vec.y, exp); + vec.z = pow(shift + scale * vec.z, exp); + dstPtr[i] = vec; + } + } + + template + __global__ void power_vec2(span dest, view src, T exp, T scale, T shift) { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = 
reinterpret_cast(dest.data().get()); + const vector_type* srcPtr = reinterpret_cast(src.data().get()); + + for (auto i : grid_stride_range(dest.size()/2)) { + using utils::pow; + + vector_type vec = srcPtr[i]; + vec.x = pow(shift + scale * vec.x, exp); + vec.y = pow(shift + scale * vec.y, exp); + dstPtr[i] = vec; + } + } + template __global__ void power(span dest, view src, T exp, T scale, T shift) { for (auto i : grid_stride_range(dest.size())) { @@ -152,10 +259,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void relu(const Stream& stream, span dest, view src, T slope) { CV_Assert(src.size() >= dest.size()); - - auto kernel = raw::relu; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, slope); + if(is_fully_aligned(dest, 4) && is_fully_aligned(src, 4)) { + auto kernel = raw::relu_vec4; + auto policy = make_policy(kernel, dest.size() / 4, 0, stream); + launch_kernel(kernel, policy, dest, src, slope); + } else if (is_fully_aligned(dest, 2) && is_fully_aligned(src, 2)) { + auto kernel = raw::relu_vec2; + auto policy = make_policy(kernel, dest.size() / 2, 0, stream); + launch_kernel(kernel, policy, dest, src, slope); + } else { + auto kernel = raw::relu; + auto policy = make_policy(kernel, dest.size(), 0, stream); + launch_kernel(kernel, policy, dest, src, slope); + } } template void relu(const Stream&, span, view, float); @@ -166,9 +282,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(src.size() >= dest.size()); CV_Assert(floor <= ceiling); - auto kernel = raw::clipped_relu; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, floor, ceiling); + if (is_fully_aligned(dest, 4) && is_fully_aligned(src, 4)) { + auto kernel = raw::clipped_relu_vec4; + auto policy = make_policy(kernel, dest.size() / 4, 0, stream); + launch_kernel(kernel, policy, dest, src, floor, ceiling); + } else if (is_fully_aligned(dest, 2) && is_fully_aligned(src, 2)) { + auto kernel = raw::clipped_relu_vec2; + auto policy = make_policy(kernel, dest.size() / 2, 0, stream); + launch_kernel(kernel, policy, dest, src, floor, ceiling); + } else { + auto kernel = raw::clipped_relu; + auto policy = make_policy(kernel, dest.size(), 0, stream); + launch_kernel(kernel, policy, dest, src, floor, ceiling); + } } template void clipped_relu(const Stream&, span, view, float, float); @@ -189,10 +315,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void power(const Stream& stream, span dest, view src, T exp, T scale, T shift) { CV_Assert(src.size() >= dest.size()); - - auto kernel = raw::power; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, exp, scale, shift); + if (is_fully_aligned(dest, 4) && is_fully_aligned(src, 4) && dest.size() > 1024 * 16 * 4) { + auto kernel = raw::power_vec4; + auto policy = make_policy(kernel, dest.size() / 4, 0, stream); + launch_kernel(kernel, policy, dest, src, exp, scale, shift); + } else if (is_fully_aligned(dest, 2) && is_fully_aligned(src, 2) && dest.size() > 1024 * 16 * 2) { + auto kernel = raw::power_vec2; + auto policy = make_policy(kernel, dest.size() / 2, 0, stream); + launch_kernel(kernel, policy, dest, src, exp, scale, shift); + } else { + auto kernel = raw::power; + auto policy = make_policy(kernel, dest.size(), 0, stream); + launch_kernel(kernel, policy, dest, src, exp, scale, shift); + } } template 
void power(const Stream&, span, view, float, float, float); From 9414e0b368106234bf25272ddec3e4d22323d849 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 9 Jul 2019 13:08:24 +0530 Subject: [PATCH 052/129] add vectorized concat kernels --- modules/dnn/src/cuda/concat.cu | 97 ++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 5 deletions(-) diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index b4a1a1667cee..288d79df9f5f 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -2,9 +2,9 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. -#include #include "array.hpp" +#include "vector_traits.hpp" #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" @@ -12,10 +12,60 @@ #include "../cuda4dnn/csl/pointer.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { /* Reference: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu */ + template + __global__ void concat_vec4( + span output, std::size_t output_axis_size, std::size_t output_axis_offset, + view input, std::size_t input_axis_size, std::size_t concat_size) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + /* we need to copy all the elements of input to some location in the output */ + for (auto in_idx : grid_stride_range(input.size() / 4)) { + const auto idx = in_idx * 4; + const auto total_concat_size = concat_size * input_axis_size; + const auto concat_num = idx / total_concat_size; + const auto concat_index = idx % total_concat_size; + const auto top_index = concat_index + + (concat_num * output_axis_size + output_axis_offset) * concat_size; + + const auto out_idx = top_index / 4; + dstPtr[out_idx] = srcPtr[in_idx]; + } + } + + template + __global__ void concat_vec2( + span output, std::size_t output_axis_size, std::size_t output_axis_offset, + view input, std::size_t input_axis_size, std::size_t concat_size) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + /* we need to copy all the elements of input to some location in the output */ + for (auto in_idx : grid_stride_range(input.size() / 2)) { + const auto idx = in_idx * 2; + const auto total_concat_size = concat_size * input_axis_size; + const auto concat_num = idx / total_concat_size; + const auto concat_index = idx % total_concat_size; + const auto top_index = concat_index + + (concat_num * output_axis_size + output_axis_offset) * concat_size; + + const auto out_idx = top_index / 2; + dstPtr[out_idx] = srcPtr[in_idx]; + } + } + template __global__ void concat( span output, std::size_t output_axis_size, std::size_t output_axis_offset, @@ -69,6 +119,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k TensorSpan output, std::size_t output_axis_offset, TensorView input, std::size_t axis) { + /* let's call the axis of interest as the channel axis for the purpose of the following discussion + * even though it can be any axis + * + * for each batch item: + * we move all the channels from the input (which together for a single batch item is 
contiguous) + * of a batch item to its corresponding contiguous place in the output + * + * for a valid vector operation, the size of each copy block must be aligned + * input must be aligned + * all the destination locations in the output must be aligned + */ std::size_t concat_size = 1; for (int i = axis + 1; i < output.rank; i++) concat_size *= output.get_axis_size(i); @@ -76,10 +137,36 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k std::size_t input_axis_size = input.get_axis_size(axis); std::size_t output_axis_size = output.get_axis_size(axis); - auto policy = make_policy(raw::concat, input.size(), 0, stream); - launch_kernel(raw::concat, policy, - output, output_axis_size, output_axis_offset, - input, input_axis_size, concat_size); + std::size_t copy_block_size = concat_size * input_axis_size; + std::size_t copy_block_stride = concat_size * output_axis_size; + std::size_t starting_offset = output_axis_offset * concat_size; + + /* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size` + * to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride` + */ + + bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0; + bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0; + + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4) && is_aligned_4) { + auto kernel = raw::concat_vec4; + auto policy = make_policy(kernel, input.size() / 4, 0, stream); + launch_kernel(kernel, policy, + output, output_axis_size, output_axis_offset, + input, input_axis_size, concat_size); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2) && is_aligned_2) { + auto kernel = raw::concat_vec2; + auto policy = make_policy(kernel, input.size() / 2, 0, stream); + launch_kernel(kernel, policy, + output, output_axis_size, output_axis_offset, + input, input_axis_size, concat_size); + } else { + auto kernel = raw::concat; + auto policy = make_policy(kernel, input.size(), 0, stream); + launch_kernel(kernel, policy, + output, output_axis_size, output_axis_offset, + input, input_axis_size, concat_size); + } } template void concat(const Stream&, TensorSpan, std::size_t, TensorView, std::size_t); From 1357a9f85d609681a795b2f7b3856a3ff05be612 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 9 Jul 2019 14:04:42 +0530 Subject: [PATCH 053/129] improve concat_with_offsets performance --- modules/dnn/src/cuda/concat.cu | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index 288d79df9f5f..dfac2b6a108c 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -92,21 +92,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k view input, array in_strides) { for (auto i : grid_stride_range(input.size())) { - /* compute input axis indices corresponding to element 'i' */ - array in_index; - in_index[0] = i / in_strides[0]; - for (int j = 1; j < N; j++) - in_index[j] = (i % in_strides[j - 1]) / in_strides[j]; - - /* compute output axis indices corresponding to element 'i' */ - array out_index; - for (int j = 0; j < N; j++) - out_index[j] = out_offset[j] + in_index[j]; - - /* compute output element number from output axis indices */ - std::size_t oidx = 0; - for (int j = 0; j < N; j++) - oidx += out_index[j] * out_strides[j]; + int in_index = i / 
in_strides[0]; + int out_index = out_offset[0] + in_index; + int oidx = out_index * out_strides[0]; + for (int j = 1; j < N; j++) { + in_index = (i % in_strides[j - 1]) / in_strides[j]; + out_index = out_offset[j] + in_index; + oidx += out_index * out_strides[j]; + } output[oidx] = input[i]; } From c8eee86da6d80a59ee5e4549ff5bd9c07d0d534e Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 9 Jul 2019 17:08:05 +0530 Subject: [PATCH 054/129] vectorize scale and bias kernels --- modules/dnn/src/cuda/scale.cu | 170 ++++++++++++++++++++++++++++++++-- 1 file changed, 161 insertions(+), 9 deletions(-) diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu index 99978bcb239e..4a667edacca4 100644 --- a/modules/dnn/src/cuda/scale.cu +++ b/modules/dnn/src/cuda/scale.cu @@ -2,6 +2,8 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +#include "vector_traits.hpp" + #include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/kernel_utils.hpp" #include "../cuda4dnn/csl/tensor.hpp" @@ -23,6 +25,46 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k output[i] = input[i] + beta; } + template + __global__ void biasN_vec4(span output, view input, std::size_t inner_size, view bias) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + inner_size /= 4; + for (auto i : grid_stride_range(output.size() / 4)) { + const auto bias_idx = (i / inner_size) % bias.size(); + + vector_type vec = srcPtr[i]; + vec.w = vec.w + bias[bias_idx]; + vec.x = vec.x + bias[bias_idx]; + vec.y = vec.y + bias[bias_idx]; + vec.z = vec.z + bias[bias_idx]; + dstPtr[i] = vec; + } + } + + template + __global__ void biasN_vec2(span output, view input, std::size_t inner_size, view bias) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + inner_size /= 2; + for (auto i : grid_stride_range(output.size() / 2)) { + const auto bias_idx = (i / inner_size) % bias.size(); + + vector_type vec = srcPtr[i]; + vec.x = vec.x + bias[bias_idx]; + vec.y = vec.y + bias[bias_idx]; + dstPtr[i] = vec; + } + } + template __global__ void biasN(span output, view input, std::size_t inner_size, view bias) { @@ -39,6 +81,46 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k output[i] = alpha * input[i]; } + template + __global__ void scaleN_vec4(span output, view input, std::size_t inner_size, view weights) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + inner_size /= 4; + for (auto i : grid_stride_range(output.size() / 4)) { + auto scale_idx = (i / inner_size) % weights.size(); + + vector_type vec = srcPtr[i]; + vec.w = vec.w * weights[scale_idx]; + vec.x = vec.x * weights[scale_idx]; + vec.y = vec.y * weights[scale_idx]; + vec.z = vec.z * weights[scale_idx]; + dstPtr[i] = vec; + } + } + + template + __global__ void scaleN_vec2(span output, view input, std::size_t inner_size, view weights) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = 
reinterpret_cast(input.data().get()); + + inner_size /= 2; + for (auto i : grid_stride_range(output.size() / 2)) { + auto scale_idx = (i / inner_size) % weights.size(); + + vector_type vec = srcPtr[i]; + vec.x = vec.x * weights[scale_idx]; + vec.y = vec.y * weights[scale_idx]; + dstPtr[i] = vec; + } + } + template __global__ void scaleN(span output, view input, std::size_t inner_size, view weights) { @@ -55,6 +137,46 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k output[i] = alpha * input[i] + beta; } + template + __global__ void scaleN_with_biasN_vec4(span output, view input, std::size_t inner_size, view weights, view bias) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + inner_size /= 4; + for (auto i : grid_stride_range(output.size() / 4)) { + auto scale_idx = (i / inner_size) % weights.size(); + + vector_type vec = srcPtr[i]; + vec.w = vec.w * weights[scale_idx] + bias[scale_idx]; + vec.x = vec.x * weights[scale_idx] + bias[scale_idx]; + vec.y = vec.y * weights[scale_idx] + bias[scale_idx]; + vec.z = vec.z * weights[scale_idx] + bias[scale_idx]; + dstPtr[i] = vec; + } + } + + template + __global__ void scaleN_with_biasN_vec2(span output, view input, std::size_t inner_size, view weights, view bias) + { + using vector_type = typename get_vector_type::type; + + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); + + inner_size /= 2; + for (auto i : grid_stride_range(output.size() / 2)) { + auto scale_idx = (i / inner_size) % weights.size(); + + vector_type vec = srcPtr[i]; + vec.x = vec.x * weights[scale_idx] + bias[scale_idx]; + vec.y = vec.y * weights[scale_idx] + bias[scale_idx]; + dstPtr[i] = vec; + } + } + template __global__ void scaleN_with_biasN(span output, view input, std::size_t inner_size, view weights, view bias) { @@ -86,9 +208,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k { CV_Assert(is_shape_same(input, output)); - auto kernel = raw::biasN; - auto policy = make_policy(kernel, output.size(), 0, stream); - launch_kernel(kernel, policy, output, input, inner_size, bias); + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4) && inner_size % 4 == 0) { + auto kernel = raw::biasN_vec4; + auto policy = make_policy(kernel, output.size() / 4, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, bias); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2) && inner_size % 2 == 0) { + auto kernel = raw::biasN_vec2; + auto policy = make_policy(kernel, output.size() / 2, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, bias); + } else { + auto kernel = raw::biasN; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, bias); + } } template void biasN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView); @@ -115,9 +247,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k { CV_Assert(is_shape_same(input, output)); - auto kernel = raw::scaleN; - auto policy = make_policy(kernel, output.size(), 0, stream); - launch_kernel(kernel, policy, output, input, inner_size, weights); + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4) && inner_size % 4 == 0) { + auto kernel = raw::scaleN_vec4; + auto policy = 
make_policy(kernel, output.size() / 4, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2) && inner_size % 2 == 0) { + auto kernel = raw::scaleN_vec2; + auto policy = make_policy(kernel, output.size() / 2, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights); + } else { + auto kernel = raw::scaleN; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights); + } } template void scaleN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView); @@ -145,9 +287,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(is_shape_same(input, output)); CV_Assert(weights.size() == bias.size()); - auto kernel = raw::scaleN_with_biasN; - auto policy = make_policy(kernel, output.size(), 0, stream); - launch_kernel(kernel, policy, output, input, inner_size, weights, bias); + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4) && inner_size % 4 == 0) { + auto kernel = raw::scaleN_with_biasN_vec4; + auto policy = make_policy(kernel, output.size() / 4, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights, bias); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2) && inner_size % 2 == 0) { + auto kernel = raw::scaleN_with_biasN_vec2; + auto policy = make_policy(kernel, output.size() / 2, 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights, bias); + } else { + auto kernel = raw::scaleN_with_biasN; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, weights, bias); + } } template void scaleN_with_biasN(const Stream&, TensorSpan, TensorView, std::size_t, TensorView, TensorView); From 3e78b2187348c3de0e0717dbf5566ba9c9ea0a39 Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 10 Jul 2019 22:20:51 +0530 Subject: [PATCH 055/129] add support for multi-billion element tensors --- modules/dnn/src/cuda/activations.cu | 251 +++++++++--------- modules/dnn/src/cuda/concat.cu | 78 +++--- modules/dnn/src/cuda/eltwise_ops.cu | 3 +- modules/dnn/src/cuda/execution.hpp | 79 ++++++ modules/dnn/src/cuda/fill.cu | 16 +- modules/dnn/src/cuda/grid_stride_loop.hpp | 87 ++++++ modules/dnn/src/cuda/math.hpp | 4 + modules/dnn/src/cuda/normalize.cu | 57 ++-- modules/dnn/src/cuda/padding.cu | 27 +- modules/dnn/src/cuda/permute.cu | 29 +- modules/dnn/src/cuda/prior_box.cu | 25 +- modules/dnn/src/cuda/resize.cu | 53 ++-- modules/dnn/src/cuda/scale.cu | 54 ++-- modules/dnn/src/cuda/slice.cu | 37 ++- modules/dnn/src/cuda/types.hpp | 28 ++ modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp | 153 ----------- 16 files changed, 543 insertions(+), 438 deletions(-) create mode 100644 modules/dnn/src/cuda/execution.hpp create mode 100644 modules/dnn/src/cuda/grid_stride_loop.hpp create mode 100644 modules/dnn/src/cuda/types.hpp delete mode 100644 modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index c0f082484769..4feb4e822033 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -3,69 +3,76 @@ // of this distribution and at http://opencv.org/license.html. 
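/* Editorial note, not part of the patch: the kernels in this commit rely on the classic
 * CUDA grid-stride loop so that a launch with a bounded grid can cover tensors whose
 * element count exceeds the number of launched threads. A minimal, self-contained sketch
 * of the idiom with a hypothetical kernel (not from this series):
 */
#include <cstddef>

__global__ void scale_by_two(float* data, std::size_t n)
{
    /* each thread starts at its global id and strides by the total number of threads */
    const std::size_t stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
    for (std::size_t i = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < n; i += stride)
        data[i] = data[i] * 2.0f;
}

/* grid_stride_range, introduced further down in this commit, wraps this loop pattern behind
 * a range-based for interface so kernels can instead write `for (auto i : grid_stride_range(n))`.
 */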
#include "math.hpp" +#include "types.hpp" #include "vector_traits.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/span.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" #include -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + + using index_type = gpu::index_type; + using size_type = gpu::size_type; + template - __global__ void abs(span dest, view src) { - for (auto i : grid_stride_range(dest.size())) { + __global__ void abs(span output, view input) { + for (auto i : grid_stride_range(output.size())) { using utils::abs; - dest[i] = abs(src[i]); + output[i] = abs(input[i]); } } template - __global__ void tanh(span dest, view src) { - for (auto i : grid_stride_range(dest.size())) { + __global__ void tanh(span output, view input) { + for (auto i : grid_stride_range(output.size())) { using utils::tanh; - dest[i] = tanh(src[i]); + output[i] = tanh(input[i]); } } template - __global__ void sigmoid(span dest, view src) { - for (auto i : grid_stride_range(dest.size())) { + __global__ void sigmoid(span output, view input) { + for (auto i : grid_stride_range(output.size())) { using utils::sigmoid; - dest[i] = sigmoid(src[i]); + output[i] = sigmoid(input[i]); } } template - __global__ void bnll(span dest, view src) { - for (auto i : grid_stride_range(dest.size())) { + __global__ void bnll(span output, view input) { + for (auto i : grid_stride_range(output.size())) { using utils::log1pexp; - dest[i] = src[i] > 0 ? src[i] + log1pexp(-src[i]) : log1pexp(src[i]); + output[i] = input[i] > 0 ? input[i] + log1pexp(-input[i]) : log1pexp(input[i]); } } template - __global__ void elu(span dest, view src) { - for (auto i : grid_stride_range(dest.size())) { + __global__ void elu(span output, view input) { + for (auto i : grid_stride_range(output.size())) { using utils::exp; - dest[i] = src[i] >= 0 ? src[i] : (exp(src[i]) - 1); + output[i] = input[i] >= 0 ? input[i] : expm1(input[i]); } } template - __global__ void relu_vec4(span dest, view src, T slope) { + __global__ void relu_vec4(span output, view input, T slope) { using vector_type = typename get_vector_type::type; - vector_type* dstPtr = reinterpret_cast(dest.data().get()); - const vector_type* srcPtr = reinterpret_cast(src.data().get()); + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); - for (auto i : grid_stride_range(dest.size() / 4)) { + for (auto i : grid_stride_range(output.size() / 4)) { vector_type vec = srcPtr[i]; vec.w = vec.w >= 0.0 ? vec.w : slope * vec.w; vec.x = vec.x >= 0.0 ? vec.x : slope * vec.x; @@ -76,13 +83,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void relu_vec2(span dest, view src, T slope) { + __global__ void relu_vec2(span output, view input, T slope) { using vector_type = typename get_vector_type::type; - vector_type* dstPtr = reinterpret_cast(dest.data().get()); - const vector_type* srcPtr = reinterpret_cast(src.data().get()); + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); - for (auto i : grid_stride_range(dest.size() / 2)) { + for (auto i : grid_stride_range(output.size() / 2)) { vector_type vec = srcPtr[i]; vec.x = vec.x >= 0.0 ? 
vec.x : slope * vec.x; vec.y = vec.y >= 0.0 ? vec.y : slope * vec.y; @@ -91,74 +98,71 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void relu(span dest, view src, T slope) { - for (auto i : grid_stride_range(dest.size())) - dest[i] = src[i] >= 0.0 ? src[i] : slope * src[i]; + __global__ void relu(span output, view input, T slope) { + for (auto i : grid_stride_range(output.size())) + output[i] = input[i] >= 0.0 ? input[i] : slope * input[i]; } template - __global__ void clipped_relu_vec4(span dest, view src, T floor, T ceiling) { + __global__ void clipped_relu_vec4(span output, view input, T floor, T ceiling) { using vector_type = typename get_vector_type::type; - vector_type* dstPtr = reinterpret_cast(dest.data().get()); - const vector_type* srcPtr = reinterpret_cast(src.data().get()); + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); - for (auto i : grid_stride_range(dest.size()/4)) { - using utils::max; - using utils::min; + for (auto i : grid_stride_range(output.size() / 4)) { + using utils::clamp; vector_type vec = srcPtr[i]; - vec.w = min(max(vec.w, floor), ceiling); - vec.x = min(max(vec.x, floor), ceiling); - vec.y = min(max(vec.y, floor), ceiling); - vec.z = min(max(vec.z, floor), ceiling); + vec.w = clamp(vec.w, floor, ceiling); + vec.x = clamp(vec.x, floor, ceiling); + vec.y = clamp(vec.y, floor, ceiling); + vec.z = clamp(vec.z, floor, ceiling); dstPtr[i] = vec; } } template - __global__ void clipped_relu_vec2(span dest, view src, T floor, T ceiling) { + __global__ void clipped_relu_vec2(span output, view input, T floor, T ceiling) { using vector_type = typename get_vector_type::type; - vector_type* dstPtr = reinterpret_cast(dest.data().get()); - const vector_type* srcPtr = reinterpret_cast(src.data().get()); + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); - for (auto i : grid_stride_range(dest.size()/2)) { - using utils::max; - using utils::min; + for (auto i : grid_stride_range(output.size() / 2)) { + using utils::clamp; vector_type vec = srcPtr[i]; - vec.x = min(max(vec.x, floor), ceiling); - vec.y = min(max(vec.y, floor), ceiling); + vec.x = clamp(vec.x, floor, ceiling); + vec.y = clamp(vec.y, floor, ceiling); dstPtr[i] = vec; } } template - __global__ void clipped_relu(span dest, view src, T floor, T ceiling) { - for (auto i : grid_stride_range(dest.size())) { - using utils::max; - using utils::min; - dest[i] = min(max(src[i], floor), ceiling); + __global__ void clipped_relu(span output, view input, T floor, T ceiling) { + for (auto i : grid_stride_range(output.size())) { + using utils::clamp; + output[i] = clamp(input[i], floor, ceiling); } } template - __global__ void axiswise_relu(span dest, view src, std::size_t inner_size, view slope) { - for (auto i : grid_stride_range(dest.size())) { - const auto c = (i % inner_size) / slope.size(); - dest[i] = src[i] < 0 ? src[i] * slope[c] : src[i]; + __global__ void axiswise_relu(span output, view input, size_type inner_size, view slope) { + for (auto i : grid_stride_range(output.size())) { + const index_type c = (i % inner_size) / static_cast(slope.size()); + output[i] = input[i] < 0 ? 
input[i] * slope[c] : input[i]; } } template - __global__ void power_vec4(span dest, view src, T exp, T scale, T shift) { + __global__ void power_vec4(span output, view input, T exp, T scale, T shift) { using vector_type = typename get_vector_type::type; - vector_type* dstPtr = reinterpret_cast(dest.data().get()); - const vector_type* srcPtr = reinterpret_cast(src.data().get()); + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); - for (auto i : grid_stride_range(dest.size()/4)) { + for (auto i : grid_stride_range(output.size() / 4)) { using utils::pow; vector_type vec = srcPtr[i]; @@ -171,13 +175,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void power_vec2(span dest, view src, T exp, T scale, T shift) { + __global__ void power_vec2(span output, view input, T exp, T scale, T shift) { using vector_type = typename get_vector_type::type; - vector_type* dstPtr = reinterpret_cast(dest.data().get()); - const vector_type* srcPtr = reinterpret_cast(src.data().get()); + vector_type* dstPtr = reinterpret_cast(output.data().get()); + const vector_type* srcPtr = reinterpret_cast(input.data().get()); - for (auto i : grid_stride_range(dest.size()/2)) { + for (auto i : grid_stride_range(output.size() / 2)) { using utils::pow; vector_type vec = srcPtr[i]; @@ -188,89 +192,89 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void power(span dest, view src, T exp, T scale, T shift) { - for (auto i : grid_stride_range(dest.size())) { + __global__ void power(span output, view input, T exp, T scale, T shift) { + for (auto i : grid_stride_range(output.size())) { using utils::pow; - dest[i] = pow(shift + scale * src[i], exp); + output[i] = pow(shift + scale * input[i], exp); } } } template - void abs(const Stream& stream, span dest, view src) { - CV_Assert(src.size() >= dest.size()); + void abs(const Stream& stream, span output, view input) { + CV_Assert(input.size() == output.size()); auto kernel = raw::abs; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input); } - template void abs(const Stream& stream, span dest, view src); - template void abs(const Stream& stream, span dest, view src); + template void abs(const Stream& stream, span output, view input); + template void abs(const Stream& stream, span output, view input); template - void tanh(const Stream& stream, span dest, view src) { - CV_Assert(src.size() >= dest.size()); + void tanh(const Stream& stream, span output, view input) { + CV_Assert(input.size() == output.size()); auto kernel = raw::tanh; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input); } template void tanh(const Stream&, span, view); template void tanh(const Stream&, span, view); template - void sigmoid(const Stream& stream, span dest, view src) { - CV_Assert(src.size() >= dest.size()); + void sigmoid(const Stream& stream, span output, view input) { + CV_Assert(input.size() == output.size()); auto kernel = raw::sigmoid; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src); + auto policy = make_policy(kernel, output.size(), 0, 
stream); + launch_kernel(kernel, policy, output, input); } template void sigmoid(const Stream&, span, view); template void sigmoid(const Stream&, span, view); template - void bnll(const Stream& stream, span dest, view src) { - CV_Assert(src.size() >= dest.size()); + void bnll(const Stream& stream, span output, view input) { + CV_Assert(input.size() == output.size()); auto kernel = raw::bnll; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input); } template void bnll(const Stream&, span, view); template void bnll(const Stream&, span, view); template - void elu(const Stream& stream, span dest, view src) { - CV_Assert(src.size() >= dest.size()); + void elu(const Stream& stream, span output, view input) { + CV_Assert(input.size() == output.size()); auto kernel = raw::elu; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input); } template void elu(const Stream&, span, view); template void elu(const Stream&, span, view); template - void relu(const Stream& stream, span dest, view src, T slope) { - CV_Assert(src.size() >= dest.size()); - if(is_fully_aligned(dest, 4) && is_fully_aligned(src, 4)) { + void relu(const Stream& stream, span output, view input, T slope) { + CV_Assert(input.size() == output.size()); + if(is_fully_aligned(output, 4) && is_fully_aligned(input, 4)) { auto kernel = raw::relu_vec4; - auto policy = make_policy(kernel, dest.size() / 4, 0, stream); - launch_kernel(kernel, policy, dest, src, slope); - } else if (is_fully_aligned(dest, 2) && is_fully_aligned(src, 2)) { + auto policy = make_policy(kernel, output.size() / 4, 0, stream); + launch_kernel(kernel, policy, output, input, slope); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2)) { auto kernel = raw::relu_vec2; - auto policy = make_policy(kernel, dest.size() / 2, 0, stream); - launch_kernel(kernel, policy, dest, src, slope); + auto policy = make_policy(kernel, output.size() / 2, 0, stream); + launch_kernel(kernel, policy, output, input, slope); } else { auto kernel = raw::relu; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, slope); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, slope); } } @@ -278,22 +282,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void relu(const Stream&, span, view, double); template - void clipped_relu(const Stream& stream, span dest, view src, T floor, T ceiling) { - CV_Assert(src.size() >= dest.size()); + void clipped_relu(const Stream& stream, span output, view input, T floor, T ceiling) { + CV_Assert(input.size() == output.size()); CV_Assert(floor <= ceiling); - if (is_fully_aligned(dest, 4) && is_fully_aligned(src, 4)) { + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4)) { auto kernel = raw::clipped_relu_vec4; - auto policy = make_policy(kernel, dest.size() / 4, 0, stream); - launch_kernel(kernel, policy, dest, src, floor, ceiling); - } else if (is_fully_aligned(dest, 2) && is_fully_aligned(src, 2)) { + auto policy = make_policy(kernel, output.size() / 4, 0, stream); + launch_kernel(kernel, policy, output, input, floor, ceiling); + } else if (is_fully_aligned(output, 2) && 
is_fully_aligned(input, 2)) { auto kernel = raw::clipped_relu_vec2; - auto policy = make_policy(kernel, dest.size() / 2, 0, stream); - launch_kernel(kernel, policy, dest, src, floor, ceiling); + auto policy = make_policy(kernel, output.size() / 2, 0, stream); + launch_kernel(kernel, policy, output, input, floor, ceiling); } else { auto kernel = raw::clipped_relu; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, floor, ceiling); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, floor, ceiling); } } @@ -301,32 +305,33 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void clipped_relu(const Stream&, span, view, double, double); template - void axiswise_relu(const Stream& stream, span dest, view src, view slope, std::size_t inner_size) { - CV_Assert(src.size() >= dest.size()); + void axiswise_relu(const Stream& stream, span output, view input, view slope, std::size_t inner_size) { + CV_Assert(input.size() == output.size()); auto kernel = raw::axiswise_relu; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, inner_size, slope); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, inner_size, slope); } template void axiswise_relu(const Stream&, span, view, view, std::size_t); template void axiswise_relu(const Stream&, span, view, view, std::size_t); template - void power(const Stream& stream, span dest, view src, T exp, T scale, T shift) { - CV_Assert(src.size() >= dest.size()); - if (is_fully_aligned(dest, 4) && is_fully_aligned(src, 4) && dest.size() > 1024 * 16 * 4) { + void power(const Stream& stream, span output, view input, T exp, T scale, T shift) { + CV_Assert(input.size() == output.size()); + + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4) && output.size() > 1024 * 16 * 4) { auto kernel = raw::power_vec4; - auto policy = make_policy(kernel, dest.size() / 4, 0, stream); - launch_kernel(kernel, policy, dest, src, exp, scale, shift); - } else if (is_fully_aligned(dest, 2) && is_fully_aligned(src, 2) && dest.size() > 1024 * 16 * 2) { + auto policy = make_policy(kernel, output.size() / 4, 0, stream); + launch_kernel(kernel, policy, output, input, exp, scale, shift); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2) && output.size() > 1024 * 16 * 2) { auto kernel = raw::power_vec2; - auto policy = make_policy(kernel, dest.size() / 2, 0, stream); - launch_kernel(kernel, policy, dest, src, exp, scale, shift); + auto policy = make_policy(kernel, output.size() / 2, 0, stream); + launch_kernel(kernel, policy, output, input, exp, scale, shift); } else { auto kernel = raw::power; - auto policy = make_policy(kernel, dest.size(), 0, stream); - launch_kernel(kernel, policy, dest, src, exp, scale, shift); + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, exp, scale, shift); } } diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index dfac2b6a108c..4a36fe7a2d58 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -4,37 +4,44 @@ #include "array.hpp" +#include "types.hpp" #include "vector_traits.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" -#include 
"../cuda4dnn/csl/pointer.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + using index_type = gpu::index_type; + using size_type = gpu::size_type; + namespace raw { /* Reference: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu */ template __global__ void concat_vec4( - span output, std::size_t output_axis_size, std::size_t output_axis_offset, - view input, std::size_t input_axis_size, std::size_t concat_size) + span output, size_type output_axis_size, index_type output_axis_offset, + view input, size_type input_axis_size, size_type concat_size) { using vector_type = typename get_vector_type::type; vector_type* dstPtr = reinterpret_cast(output.data().get()); const vector_type* srcPtr = reinterpret_cast(input.data().get()); - /* we need to copy all the elements of input to some location in the output */ + const auto total_concat_size = concat_size * input_axis_size; + for (auto in_idx : grid_stride_range(input.size() / 4)) { - const auto idx = in_idx * 4; - const auto total_concat_size = concat_size * input_axis_size; - const auto concat_num = idx / total_concat_size; - const auto concat_index = idx % total_concat_size; - const auto top_index = concat_index + + const index_type idx = in_idx * 4; + const index_type concat_num = idx / total_concat_size; + const index_type concat_index = idx % total_concat_size; + const index_type top_index = concat_index + (concat_num * output_axis_size + output_axis_offset) * concat_size; const auto out_idx = top_index / 4; @@ -44,21 +51,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void concat_vec2( - span output, std::size_t output_axis_size, std::size_t output_axis_offset, - view input, std::size_t input_axis_size, std::size_t concat_size) + span output, size_type output_axis_size, index_type output_axis_offset, + view input, size_type input_axis_size, size_type concat_size) { using vector_type = typename get_vector_type::type; vector_type* dstPtr = reinterpret_cast(output.data().get()); const vector_type* srcPtr = reinterpret_cast(input.data().get()); - /* we need to copy all the elements of input to some location in the output */ + const auto total_concat_size = concat_size * input_axis_size; + for (auto in_idx : grid_stride_range(input.size() / 2)) { - const auto idx = in_idx * 2; - const auto total_concat_size = concat_size * input_axis_size; - const auto concat_num = idx / total_concat_size; - const auto concat_index = idx % total_concat_size; - const auto top_index = concat_index + + const index_type idx = in_idx * 2; + const index_type concat_num = idx / total_concat_size; + const index_type concat_index = idx % total_concat_size; + const index_type top_index = concat_index + (concat_num * output_axis_size + output_axis_offset) * concat_size; const auto out_idx = top_index / 2; @@ -68,15 +75,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void concat( - span output, std::size_t output_axis_size, std::size_t output_axis_offset, - view input, std::size_t input_axis_size, std::size_t concat_size) + span output, size_type output_axis_size, index_type output_axis_offset, + view input, size_type input_axis_size, size_type concat_size) { - /* we need to copy all the elements of input to some location in the output */ + /* we need to copy 
all the elements of input to some location in the output + * we copy blocks of size `total_concat_size` to some location in the output + */ + const auto total_concat_size = concat_size * input_axis_size; + for (auto idx : grid_stride_range(input.size())) { - const auto total_concat_size = concat_size * input_axis_size; - const auto concat_num = idx / total_concat_size; - const auto concat_index = idx % total_concat_size; - const auto top_index = concat_index + + const index_type concat_num = idx / total_concat_size; + const index_type concat_index = idx % total_concat_size; + const index_type top_index = concat_index + (concat_num * output_axis_size + output_axis_offset) * concat_size; output[top_index] = input[idx]; @@ -88,13 +98,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void concat_with_offsets( - span output, array out_strides, array out_offset, - view input, array in_strides) + span output, array out_strides, array out_offset, + view input, array in_strides) { for (auto i : grid_stride_range(input.size())) { - int in_index = i / in_strides[0]; - int out_index = out_offset[0] + in_index; - int oidx = out_index * out_strides[0]; + index_type in_index = i / in_strides[0]; + index_type out_index = out_offset[0] + in_index; + index_type oidx = out_index * out_strides[0]; for (int j = 1; j < N; j++) { in_index = (i % in_strides[j - 1]) / in_strides[j]; out_index = out_offset[j] + in_index; @@ -175,11 +185,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(outOffset.size() == N); CV_Assert(inStride.size() == N); - utils::array outStride_k, outOffset_k, inStride_k; + utils::array outStride_k, inStride_k; outStride_k.assign(std::begin(outStride), std::end(outStride)); - outOffset_k.assign(std::begin(outOffset), std::end(outOffset)); inStride_k.assign(std::begin(inStride), std::end(inStride)); + utils::array outOffset_k; + outOffset_k.assign(std::begin(outOffset), std::end(outOffset)); + auto kernel = raw::concat_with_offsets; auto policy = make_policy(kernel, input.size(), 0, stream); launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k); diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index 3bccc780efe5..7047eb6578ff 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -3,9 +3,10 @@ // of this distribution and at http://opencv.org/license.html. #include "math.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/execution.hpp b/modules/dnn/src/cuda/execution.hpp new file mode 100644 index 000000000000..ad3f521f771a --- /dev/null +++ b/modules/dnn/src/cuda/execution.hpp @@ -0,0 +1,79 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
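/* Editorial note, not part of the patch: execution.hpp (this new header) derives the launch
 * configuration from the CUDA occupancy API and then clamps the grid to roughly one thread
 * per work item, leaving any remainder to the kernel's grid-stride loop. A stripped-down,
 * self-contained sketch of that idea with hypothetical names and no OpenCV types:
 */
#include <cuda_runtime.h>
#include <algorithm>
#include <cstddef>

template <class Kernel>
inline void configure_launch(Kernel kernel, std::size_t work_items, int& grid_size, int& block_size)
{
    /* ask the runtime for an occupancy-maximizing block size for this kernel */
    cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0);

    /* avoid launching many more threads than there are work items; the grid-stride
     * loop inside the kernel picks up any remainder */
    if (static_cast<std::size_t>(grid_size) * block_size > work_items)
        grid_size = static_cast<int>((work_items + block_size - 1) / block_size);
    grid_size = std::max(grid_size, 1);
}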
+ +#ifndef OPENCV_DNN_CUDA4DNN_SRC_CUDA_EXECUTION_HPP +#define OPENCV_DNN_CUDA4DNN_SRC_CUDA_EXECUTION_HPP + +#include "../cuda4dnn/csl/error.hpp" +#include "../cuda4dnn/csl/stream.hpp" + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + struct execution_policy { + execution_policy(dim3 grid_size, dim3 block_size) + : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { } + + execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem) + : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { } + + execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm) + : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ StreamAccessor::get(strm) } { } + + execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm) + : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ StreamAccessor::get(strm) } { } + + dim3 grid; + dim3 block; + std::size_t sharedMem; + cudaStream_t stream; + }; + + /* this overload shouldn't be necessary; we should always try to provide a bound on the number of threads */ + /* + template inline + execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) { + int grid_size, block_size; + CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); + return execution_policy(grid_size, block_size, sharedMem, stream); + }*/ + + template inline + execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) { + CV_Assert(max_threads > 0); + + int grid_size, block_size; + CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); + if (grid_size * block_size > max_threads) { + grid_size = (max_threads + block_size - 1) / block_size; + if (block_size > max_threads) + block_size = std::max(64, max_threads); + } + + CV_Assert(grid_size >= 1 && block_size >= 1); + return execution_policy(grid_size, block_size, sharedMem, stream); + } + + template inline + void launch_kernel(Kernel kernel, Args ...args) { + auto policy = make_policy(kernel); + kernel <<>> (std::forward(args)...); + } + + template inline + void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) { + kernel <<>> (std::forward(args)...); + } + + template inline + void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) { + kernel <<>> (std::forward(args)...); + } + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_SRC_CUDA_EXECUTION_HPP */ diff --git a/modules/dnn/src/cuda/fill.cu b/modules/dnn/src/cuda/fill.cu index a28fea18468c..697fc626d831 100644 --- a/modules/dnn/src/cuda/fill.cu +++ b/modules/dnn/src/cuda/fill.cu @@ -2,31 +2,31 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
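/* Editorial note, not part of the patch: grid_stride_loop.hpp, added later in this commit,
 * exposes the grid-stride loop through begin()/end() iterators whose operator!= is really a
 * "still before the end" test, because the position advances in strides larger than one and
 * may jump past end(). The same trick in plain host C++, with hypothetical names:
 */
#include <cstddef>
#include <iostream>

class strided_range {
public:
    strided_range(std::size_t from, std::size_t to, std::size_t stride)
        : from_(from), to_(to), stride_(stride) { }

    class iterator {
    public:
        iterator(std::size_t pos, std::size_t stride) : pos_(pos), stride_(stride) { }

        std::size_t operator*() const { return pos_; }
        iterator& operator++() { pos_ += stride_; return *this; }

        /* keep looping only while pos_ is still before the end; a plain equality test
         * would miss the end when pos_ overshoots it by up to stride_ - 1 */
        bool operator!=(const iterator& other) const { return pos_ < other.pos_; }

    private:
        std::size_t pos_, stride_;
    };

    iterator begin() const { return iterator(from_, stride_); }
    iterator end() const { return iterator(to_, stride_); }

private:
    std::size_t from_, to_, stride_;
};

int main()
{
    for (auto i : strided_range(3, 20, 7))  /* visits 3, 10, 17 */
        std::cout << i << '\n';
    return 0;
}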
+#include "grid_stride_loop.hpp" +#include "execution.hpp" + #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/pointer.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" #include -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { template - __global__ void fill(span output, T value) - { + __global__ void fill(span output, T value) { for (auto i : grid_stride_range(output.size())) output[i] = value; } } template - void fill(const Stream& stream, span output, T value) - { + void fill(const Stream& stream, span output, T value) { auto kernel = raw::fill; auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, value); diff --git a/modules/dnn/src/cuda/grid_stride_loop.hpp b/modules/dnn/src/cuda/grid_stride_loop.hpp new file mode 100644 index 000000000000..8a3c97787825 --- /dev/null +++ b/modules/dnn/src/cuda/grid_stride_loop.hpp @@ -0,0 +1,87 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_SRC_CUDA_GRID_STRIDE_LOOP_HPP +#define OPENCV_DNN_CUDA4DNN_SRC_CUDA_GRID_STRIDE_LOOP_HPP + +#include "types.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + template __device__ auto getGridDim()->decltype(dim3::x); + template <> inline __device__ auto getGridDim<0>()->decltype(dim3::x) { return gridDim.x; } + template <> inline __device__ auto getGridDim<1>()->decltype(dim3::x) { return gridDim.y; } + template <> inline __device__ auto getGridDim<2>()->decltype(dim3::x) { return gridDim.z; } + + template __device__ auto getBlockDim()->decltype(dim3::x); + template <> inline __device__ auto getBlockDim<0>()->decltype(dim3::x) { return blockDim.x; } + template <> inline __device__ auto getBlockDim<1>()->decltype(dim3::x) { return blockDim.y; } + template <> inline __device__ auto getBlockDim<2>()->decltype(dim3::x) { return blockDim.z; } + + template __device__ auto getBlockIdx()->decltype(uint3::x); + template <> inline __device__ auto getBlockIdx<0>()->decltype(uint3::x) { return blockIdx.x; } + template <> inline __device__ auto getBlockIdx<1>()->decltype(uint3::x) { return blockIdx.y; } + template <> inline __device__ auto getBlockIdx<2>()->decltype(uint3::x) { return blockIdx.z; } + + template __device__ auto getThreadIdx()->decltype(uint3::x); + template <> inline __device__ auto getThreadIdx<0>()->decltype(uint3::x) { return threadIdx.x; } + template <> inline __device__ auto getThreadIdx<1>()->decltype(uint3::x) { return threadIdx.y; } + template <> inline __device__ auto getThreadIdx<2>()->decltype(uint3::x) { return threadIdx.z; } + + template + class grid_stride_range_generic { + public: + __device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { } + __device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { } + + class iterator + { + public: + __device__ iterator(index_type pos_) : pos(pos_) {} + + /* these iterators return the index when dereferenced; this allows us to use + * loop through the indices using a range based for loop + */ + __device__ index_type operator*() const { return pos; } + + __device__ iterator& operator++() { + pos += getGridDim() * 
static_cast(getBlockDim()); + return *this; + } + + __device__ bool operator!=(const iterator& other) const { + /* NOTE HACK + ** 'pos' can move in large steps (see operator++) + ** expansion of range for loop uses != as the loop conditioion + ** => operator!= must return false if 'pos' crosses the end + */ + return pos < other.pos; + } + + private: + index_type pos; + }; + + __device__ iterator begin() const { + return iterator(from + getBlockDim() * getBlockIdx() + getThreadIdx()); + } + + __device__ iterator end() const { + return iterator(to); + } + + private: + index_type from, to; + }; + + using grid_stride_range_x = grid_stride_range_generic<0>; + using grid_stride_range_y = grid_stride_range_generic<1>; + using grid_stride_range_z = grid_stride_range_generic<2>; + using grid_stride_range = grid_stride_range_x; + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_CUDA4DNN_SRC_CUDA_GRID_STRIDE_LOOP_HPP */ diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp index d715f41c0432..c310a0ab60cb 100644 --- a/modules/dnn/src/cuda/math.hpp +++ b/modules/dnn/src/cuda/math.hpp @@ -18,6 +18,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template <> inline __device__ float exp(float val) { return expf(val); } template <> inline __device__ double exp(double val) { return ::exp(val); } + template __device__ T expm1(T val); + template <> inline __device__ float expm1(float val) { return expm1f(val); } + template <> inline __device__ double expm1(double val) { return ::expm1(val); } + template __device__ T max(T x, T y) { return ::max(x, y); } template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); } template <> inline __device__ double max(double x, double y) { return fmax(x, y); } diff --git a/modules/dnn/src/cuda/normalize.cu b/modules/dnn/src/cuda/normalize.cu index 6c4c336be31a..c4ff9ddb95c9 100644 --- a/modules/dnn/src/cuda/normalize.cu +++ b/modules/dnn/src/cuda/normalize.cu @@ -4,70 +4,73 @@ #include "array.hpp" #include "math.hpp" -#include "reduce.hpp" +#include "types.hpp" #include "atomics.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { - template static + using index_type = gpu::index_type; + using size_type = gpu::size_type; + + template __global__ void zero(span output) { for (auto idx : grid_stride_range(output.size())) output[idx] = 0; } - template static - __global__ void reduce_sum_abs(span output, view input, std::size_t outer_stride, std::size_t mid_stride) - { + template + __global__ void reduce_sum_abs(span output, view input, size_type outer_stride, size_type mid_stride) { for (auto idx : grid_stride_range(input.size())) { - const auto outer_idx = idx / outer_stride; - const auto inner_idx = idx % mid_stride; + const index_type outer_idx = idx / outer_stride; + const index_type inner_idx = idx % mid_stride; - auto sum_idx = outer_idx * mid_stride + inner_idx; + const index_type sum_idx = outer_idx * mid_stride + inner_idx; atomicAdd(&output[sum_idx], utils::abs(input[idx])); } } - template static + template __global__ void reciprocal(span output, T epsilon) { for (auto idx : grid_stride_range(output.size())) output[idx] = 1 / (output[idx] + 
epsilon); } - template static - __global__ void reduce_sum_squared(span output, view input, std::size_t outer_stride, std::size_t mid_stride) - { + template + __global__ void reduce_sum_squared(span output, view input, size_type outer_stride, size_type mid_stride) { for (auto idx : grid_stride_range(input.size())) { - const auto outer_idx = idx / outer_stride; - const auto inner_idx = idx % mid_stride; + const index_type outer_idx = idx / outer_stride; + const index_type inner_idx = idx % mid_stride; - auto sum_idx = outer_idx * mid_stride + inner_idx; + const index_type sum_idx = outer_idx * mid_stride + inner_idx; atomicAdd(&output[sum_idx], input[idx] * input[idx]); } } - template static + template __global__ void rsqrt(span output, T epsilon) { for (auto idx : grid_stride_range(output.size())) output[idx] = utils::rsqrt(output[idx] + epsilon); } - template static - __global__ void apply_norm(span output, view input, std::size_t outer_stride, std::size_t mid_stride, view sums) + template + __global__ void apply_norm(span output, view input, size_type outer_stride, size_type mid_stride, view sums) { for (auto idx : grid_stride_range(output.size())) { - const auto outer_idx = idx / outer_stride; - const auto inner_idx = idx % mid_stride; + const index_type outer_idx = idx / outer_stride; + const index_type inner_idx = idx % mid_stride; - auto sum_idx = outer_idx * mid_stride + inner_idx; + const index_type sum_idx = outer_idx * mid_stride + inner_idx; output[idx] = input[idx] * sums[sum_idx]; } } @@ -78,12 +81,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k const Stream& stream, span output, view input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, T norm, T epsilon, - span workspace_) + span workspace) { CV_Assert(norm == 1 || norm == 2); - CV_Assert(workspace_.size() >= outer_size * inner_size); + CV_Assert(workspace.size() >= outer_size * inner_size); - auto sums = span(workspace_.data(), outer_size * inner_size); + auto sums = span(workspace.data(), outer_size * inner_size); auto zero_kernel = raw::zero; auto policy = make_policy(zero_kernel, sums.size(), 0, stream); diff --git a/modules/dnn/src/cuda/padding.cu b/modules/dnn/src/cuda/padding.cu index 6954bfb0c6c3..0bd0a5a16757 100644 --- a/modules/dnn/src/cuda/padding.cu +++ b/modules/dnn/src/cuda/padding.cu @@ -4,18 +4,24 @@ #include "array.hpp" #include "math.hpp" +#include "types.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" #include -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + using index_type = gpu::index_type; + using size_type = gpu::size_type; namespace raw { template @@ -23,25 +29,25 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void copy_with_reflection101( - span output, array out_strides, array start, array end, - view input, array in_strides) + span output, array out_strides, array start, array end, + view input, array in_strides) { for (auto i : grid_stride_range(output.size())) { /* compute output axis indices corresponding to element 'i' */ - array out_index; + array out_index; out_index[0] = i / out_strides[0]; for (int j = 1; j < N; j++) out_index[j] = (i % out_strides[j - 
1]) / out_strides[j]; /* compute input axis indices corresponding to output axis indices */ - using utils::abs; - array in_index; + array in_index; for (int j = 0; j < N; j++) { /* if out_index < start, the point is in the left reflection region * the reflected value's index is the absolute value of the difference * * otherwise, if the value is in the copy region, out_index - start gives the input index */ + using utils::abs; in_index[j] = abs(out_index[j] - start[j]); /* if out_index >= end, it's in the right reflection region */ @@ -50,7 +56,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } /* compute input element number from input axis indices */ - int iidx = 0; + index_type iidx = 0; for (int j = 0; j < N; j++) iidx += in_index[j] * in_strides[j]; @@ -70,10 +76,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(inStride.size() == N); CV_Assert(ranges.size() == N); - utils::array outStride_k, start_k, end_k, inStride_k; + utils::array outStride_k, inStride_k; outStride_k.assign(std::begin(outStride), std::end(outStride)); inStride_k.assign(std::begin(inStride), std::end(inStride)); + utils::array start_k, end_k; for (int i = 0; i < N; i++) { start_k[i] = ranges[i].first; end_k[i] = ranges[i].second; diff --git a/modules/dnn/src/cuda/permute.cu b/modules/dnn/src/cuda/permute.cu index 5a590970376a..88efed452693 100644 --- a/modules/dnn/src/cuda/permute.cu +++ b/modules/dnn/src/cuda/permute.cu @@ -3,36 +3,43 @@ // of this distribution and at http://opencv.org/license.html. #include "array.hpp" +#include "types.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" #include -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + using index_type = gpu::index_type; + using size_type = gpu::size_type; + namespace raw { template using array = utils::array; template __global__ void permute( - array axis_order, - span output, array outStrides, - view input, array inStrides) + array axis_order, + span output, array outStrides, + view input, array inStrides) { for (auto i : grid_stride_range(input.size())) { - int oldPosition = 0; - int newPosition = i; + index_type oldPosition = 0; + index_type newPosition = i; for (int j = 0; j < N; j++) { - int order = axis_order[j]; + auto order = axis_order[j]; oldPosition += (newPosition / outStrides[j]) * inStrides[order]; newPosition %= outStrides[j]; } @@ -53,8 +60,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(outStride.size() == N); CV_Assert(inStride.size() == N); - utils::array order_k, outStride_k, inStride_k; + utils::array order_k; order_k.assign(std::begin(order), std::end(order)); + + utils::array outStride_k, inStride_k; outStride_k.assign(std::begin(outStride), std::end(outStride)); inStride_k.assign(std::begin(inStride), std::end(inStride)); diff --git a/modules/dnn/src/cuda/prior_box.cu b/modules/dnn/src/cuda/prior_box.cu index e00511a47ce2..40ebeed5e484 100644 --- a/modules/dnn/src/cuda/prior_box.cu +++ b/modules/dnn/src/cuda/prior_box.cu @@ -4,34 +4,39 @@ #include "array.hpp" #include "math.hpp" -#include "reduce.hpp" +#include "types.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include 
"../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + using index_type = gpu::index_type; + using size_type = gpu::size_type; + template __global__ void prior_box( span output, view boxWidth, view boxHeight, view offsetX, view offsetY, - std::size_t layerWidth, std::size_t layerHeight, - std::size_t imageWidth, std::size_t imageHeight, + size_type layerWidth, size_type layerHeight, + size_type imageWidth, size_type imageHeight, T stepX, T stepY) { /* num_points contains the number of points in the feature map of interest * each iteration of the stride loop selects a point and generates prior boxes for it */ - std::size_t num_points = layerWidth * layerHeight; + size_type num_points = layerWidth * layerHeight; for (auto idx : grid_stride_range(num_points)) { - auto x = idx % layerWidth, - y = idx / layerWidth; + const index_type x = idx % layerWidth, + y = idx / layerWidth; DevicePtr output_ptr = output.data() + idx * 4 * offsetX.size() * boxWidth.size(); @@ -78,7 +83,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void prior_box_set_variance4(span output, array variance) { for (auto i : grid_stride_range(output.size())) { - const auto vidx = i % variance.size(); + const index_type vidx = i % variance.size(); output[i] = variance[vidx]; } } diff --git a/modules/dnn/src/cuda/resize.cu b/modules/dnn/src/cuda/resize.cu index 4ce1d5d4cd2a..5a205126422f 100644 --- a/modules/dnn/src/cuda/resize.cu +++ b/modules/dnn/src/cuda/resize.cu @@ -3,29 +3,34 @@ // of this distribution and at http://opencv.org/license.html. 
#include "math.hpp" +#include "types.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" -#include #include namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + using index_type = gpu::index_type; + using size_type = gpu::size_type; + template __global__ void resize_nn( - span output, std::size_t out_height, std::size_t out_width, - view input, std::size_t in_height, std::size_t in_width) + span output, size_type out_height, size_type out_width, + view input, size_type in_height, size_type in_width) { auto in_image_size = in_height * in_width; auto out_image_size = out_height * out_width; /* o2i = output to input */ - auto o2i_fx = float(in_width) / out_width; - auto o2i_fy = float(in_height) / out_height; + auto o2i_fx = static_cast(in_width) / out_width; + auto o2i_fy = static_cast(in_height) / out_height; /* think of the output and input as a collection of 2d images with the last axis * representing the width and the last but one axis representing the height @@ -33,22 +38,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k * the remaining axis together form a collection of these images */ for (auto idx : grid_stride_range(output.size())) { - auto n = idx / out_image_size; - auto x = (idx % out_image_size) % out_width; - auto y = (idx % out_image_size) / out_width; + const index_type n = idx / out_image_size; + const index_type x = (idx % out_image_size) % out_width; + const index_type y = (idx % out_image_size) / out_width; - auto in_x = __float2int_rz(x * o2i_fx); - auto in_y = __float2int_rz(y * o2i_fy); + auto in_x = static_cast(x * o2i_fx); + auto in_y = static_cast(y * o2i_fy); - auto in_idx = n * in_image_size + in_y * in_width + in_x; + index_type in_idx = n * in_image_size + in_y * in_width + in_x; output[idx] = input[in_idx]; } } template __global__ void resize_bilinear( - span output, std::size_t out_height, std::size_t out_width, - view input, std::size_t in_height, std::size_t in_width, + span output, size_type out_height, size_type out_width, + view input, size_type in_height, size_type in_width, float o2i_fy, float o2i_fx) { auto in_image_size = in_height * in_width; @@ -60,22 +65,22 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k * the remaining axis together form a collection of these images */ for (auto idx : grid_stride_range(output.size())) { - auto n = idx / out_image_size; - auto x = (idx % out_image_size) % out_width; - auto y = (idx % out_image_size) / out_width; + const index_type n = idx / out_image_size; + const index_type x = (idx % out_image_size) % out_width; + const index_type y = (idx % out_image_size) / out_width; auto in_x = x * o2i_fx; auto in_y = y * o2i_fy; - int in_x0 = __float2int_rz(in_x); - int in_y0 = __float2int_rz(in_y); + auto in_x0 = static_cast(in_x); + auto in_y0 = static_cast(in_y); using utils::min; - int in_x1 = min(in_x0 + 1, in_width - 1); - int in_y1 = min(in_y0 + 1, in_height - 1); + auto in_x1 = min(in_x0 + 1, in_width - 1); + auto in_y1 = min(in_y0 + 1, in_height - 1); - int in_offset_r0 = n * in_image_size + in_y0 * in_width; - int in_offset_r1 = n * in_image_size + in_y1 * in_width; + const index_type in_offset_r0 = n * in_image_size + in_y0 * in_width; + const 
index_type in_offset_r1 = n * in_image_size + in_y1 * in_width; auto v_00 = input[in_offset_r0 + in_x0], v_01 = input[in_offset_r0 + in_x1], diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu index 4a667edacca4..fc0894a841f9 100644 --- a/modules/dnn/src/cuda/scale.cu +++ b/modules/dnn/src/cuda/scale.cu @@ -2,32 +2,36 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. +#include "types.hpp" #include "vector_traits.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/pointer.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" #include -#include #include +#include + namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + using index_type = gpu::index_type; + using size_type = gpu::size_type; + template - __global__ void bias1(span output, view input, T beta) - { + __global__ void bias1(span output, view input, T beta) { for (auto i : grid_stride_range(output.size())) output[i] = input[i] + beta; } template - __global__ void biasN_vec4(span output, view input, std::size_t inner_size, view bias) - { + __global__ void biasN_vec4(span output, view input, size_type inner_size, view bias) { using vector_type = typename get_vector_type::type; vector_type* dstPtr = reinterpret_cast(output.data().get()); @@ -35,7 +39,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inner_size /= 4; for (auto i : grid_stride_range(output.size() / 4)) { - const auto bias_idx = (i / inner_size) % bias.size(); + const index_type bias_idx = (i / inner_size) % bias.size(); vector_type vec = srcPtr[i]; vec.w = vec.w + bias[bias_idx]; @@ -47,7 +51,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void biasN_vec2(span output, view input, std::size_t inner_size, view bias) + __global__ void biasN_vec2(span output, view input, size_type inner_size, view bias) { using vector_type = typename get_vector_type::type; @@ -56,7 +60,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inner_size /= 2; for (auto i : grid_stride_range(output.size() / 2)) { - const auto bias_idx = (i / inner_size) % bias.size(); + const index_type bias_idx = (i / inner_size) % bias.size(); vector_type vec = srcPtr[i]; vec.x = vec.x + bias[bias_idx]; @@ -66,10 +70,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void biasN(span output, view input, std::size_t inner_size, view bias) + __global__ void biasN(span output, view input, size_type inner_size, view bias) { for (auto i : grid_stride_range(output.size())) { - const auto bias_idx = (i / inner_size) % bias.size(); + const index_type bias_idx = (i / inner_size) % bias.size(); output[i] = input[i] + bias[bias_idx]; } } @@ -82,7 +86,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void scaleN_vec4(span output, view input, std::size_t inner_size, view weights) + __global__ void scaleN_vec4(span output, view input, size_type inner_size, view weights) { using vector_type = typename get_vector_type::type; @@ -91,7 +95,7 @@ namespace cv { namespace dnn { namespace 
cuda4dnn { namespace csl { namespace k inner_size /= 4; for (auto i : grid_stride_range(output.size() / 4)) { - auto scale_idx = (i / inner_size) % weights.size(); + const index_type scale_idx = (i / inner_size) % weights.size(); vector_type vec = srcPtr[i]; vec.w = vec.w * weights[scale_idx]; @@ -103,7 +107,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void scaleN_vec2(span output, view input, std::size_t inner_size, view weights) + __global__ void scaleN_vec2(span output, view input, size_type inner_size, view weights) { using vector_type = typename get_vector_type::type; @@ -112,7 +116,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inner_size /= 2; for (auto i : grid_stride_range(output.size() / 2)) { - auto scale_idx = (i / inner_size) % weights.size(); + const index_type scale_idx = (i / inner_size) % weights.size(); vector_type vec = srcPtr[i]; vec.x = vec.x * weights[scale_idx]; @@ -122,10 +126,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void scaleN(span output, view input, std::size_t inner_size, view weights) + __global__ void scaleN(span output, view input, size_type inner_size, view weights) { for (auto i : grid_stride_range(output.size())) { - const auto scale_idx = (i / inner_size) % weights.size(); + const index_type scale_idx = (i / inner_size) % weights.size(); output[i] = input[i] * weights[scale_idx]; } } @@ -138,7 +142,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void scaleN_with_biasN_vec4(span output, view input, std::size_t inner_size, view weights, view bias) + __global__ void scaleN_with_biasN_vec4(span output, view input, size_type inner_size, view weights, view bias) { using vector_type = typename get_vector_type::type; @@ -147,7 +151,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inner_size /= 4; for (auto i : grid_stride_range(output.size() / 4)) { - auto scale_idx = (i / inner_size) % weights.size(); + const index_type scale_idx = (i / inner_size) % weights.size(); vector_type vec = srcPtr[i]; vec.w = vec.w * weights[scale_idx] + bias[scale_idx]; @@ -159,7 +163,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void scaleN_with_biasN_vec2(span output, view input, std::size_t inner_size, view weights, view bias) + __global__ void scaleN_with_biasN_vec2(span output, view input, size_type inner_size, view weights, view bias) { using vector_type = typename get_vector_type::type; @@ -168,7 +172,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k inner_size /= 2; for (auto i : grid_stride_range(output.size() / 2)) { - auto scale_idx = (i / inner_size) % weights.size(); + const index_type scale_idx = (i / inner_size) % weights.size(); vector_type vec = srcPtr[i]; vec.x = vec.x * weights[scale_idx] + bias[scale_idx]; @@ -178,10 +182,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } template - __global__ void scaleN_with_biasN(span output, view input, std::size_t inner_size, view weights, view bias) + __global__ void scaleN_with_biasN(span output, view input, size_type inner_size, view weights, view bias) { for (auto i : grid_stride_range(output.size())) { - const auto scale_idx = (i / inner_size) % weights.size(); + const index_type scale_idx = (i / inner_size) % weights.size(); 
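/* Editorial note, not part of the patch: for an NCHW tensor with inner_size == H * W and
 * weights.size() == C, the flat index decomposes as i = ((n * C + c) * H + h) * W + w;
 * hence i / inner_size == n * C + c, and taking that modulo C recovers the channel c whose
 * weight and bias apply to element i.
 */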
output[i] = input[i] * weights[scale_idx] + bias[scale_idx]; } } diff --git a/modules/dnn/src/cuda/slice.cu b/modules/dnn/src/cuda/slice.cu index 3020a56bbc27..564d64b56c1e 100644 --- a/modules/dnn/src/cuda/slice.cu +++ b/modules/dnn/src/cuda/slice.cu @@ -3,32 +3,39 @@ // of this distribution and at http://opencv.org/license.html. #include "array.hpp" +#include "types.hpp" +#include "grid_stride_loop.hpp" +#include "execution.hpp" #include "../cuda4dnn/csl/kernels.hpp" -#include "../cuda4dnn/csl/kernel_utils.hpp" -#include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/tensor.hpp" +#include "../cuda4dnn/csl/span.hpp" + +#include -#include #include -#include + +#include namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { + using index_type = gpu::index_type; + using size_type = gpu::size_type; + namespace raw { - template + template using array = utils::array; - template + template __global__ void slice( - span output, array out_strides, - view input, array in_strides, array in_offset) + span output, array out_strides, + view input, array in_strides, array in_offset) { for (auto i : grid_stride_range(output.size())) { - /* compute output axis indices corresponding to element 'i' */ - int out_index = i / out_strides[0]; - int in_index = in_offset[0] + out_index; - int iidx = in_index * in_strides[0]; + index_type out_index = i / out_strides[0]; + index_type in_index = in_offset[0] + out_index; + index_type iidx = in_index * in_strides[0]; for (int j = 1; j < N; j++) { out_index = (i % out_strides[j - 1]) / out_strides[j]; in_index = in_offset[j] + out_index; @@ -40,7 +47,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } } - template static + template static void launch_slice_kernel( const Stream& stream, span output, const std::vector& outStride, @@ -50,9 +57,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k CV_Assert(inStride.size() == N); CV_Assert(inOffset.size() == N); - utils::array outStride_k, inStride_k, inOffset_k; + utils::array outStride_k, inStride_k; outStride_k.assign(std::begin(outStride), std::end(outStride)); inStride_k.assign(std::begin(inStride), std::end(inStride)); + + utils::array inOffset_k; inOffset_k.assign(std::begin(inOffset), std::end(inOffset)); auto kernel = raw::slice; diff --git a/modules/dnn/src/cuda/types.hpp b/modules/dnn/src/cuda/types.hpp new file mode 100644 index 000000000000..11f0bfa3370d --- /dev/null +++ b/modules/dnn/src/cuda/types.hpp @@ -0,0 +1,28 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA_TYPE_HPP +#define OPENCV_DNN_SRC_CUDA_TYPE_HPP + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { + + /* For indices, we can use 32bit variables or 64bit variables. The GPU registers are 32 bits in size. + * Hence, a 64bit variable requires two registers and is slightly slower than the 32bit versions for some + * operations. 
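+ *
+ * Device code should therefore prefer these aliases over std::size_t/auto for element indices,
+ * e.g. (snippet mirroring the scale kernels earlier in this series):
+ *
+ *     for (auto i : grid_stride_range(output.size())) {
+ *         const index_type scale_idx = (i / inner_size) % weights.size();
+ *         output[i] = input[i] * weights[scale_idx];
+ *     }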
+ */ + namespace gpu { +#ifdef __CUDACC__ + using size_type = int; + using index_type = int; +#else + using size_type = std::int32_t; + using index_type = std::int32_t; +#endif + } + +}}}} /* cv::dnn::cuda4dnn::csl */ + +#endif /* OPENCV_DNN_SRC_CUDA_TYPE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp b/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp deleted file mode 100644 index 68e168e952cc..000000000000 --- a/modules/dnn/src/cuda4dnn/csl/kernel_utils.hpp +++ /dev/null @@ -1,153 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#ifndef OPENCV_DNN_CUDA4DNN_CSL_KERNEL_UTILS_HPP -#define OPENCV_DNN_CUDA4DNN_CSL_KERNEL_UTILS_HPP - -#include "error.hpp" -#include "stream.hpp" -#include "nvcc_defs.hpp" - -#include - -#ifdef __CUDACC__ -#include -#endif - -namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { - -#ifdef __CUDACC__ - struct execution_policy { - execution_policy(dim3 grid_size, dim3 block_size) - : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { } - - execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem) - : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { } - - execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm) - : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ StreamAccessor::get(strm) } { } - - execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm) - : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ StreamAccessor::get(strm) } { } - - dim3 grid; - dim3 block; - std::size_t sharedMem; - cudaStream_t stream; - }; - - /* this overload shouldn't be necessary; we should always try to provide a bound on the number of threads */ - /* - template inline - execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) { - int grid_size, block_size; - CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); - return execution_policy(grid_size, block_size, sharedMem, stream); - }*/ - - template inline - execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) { - CV_Assert(max_threads > 0); - - int grid_size, block_size; - CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem)); - if (grid_size * block_size > max_threads) { - grid_size = (max_threads + block_size - 1) / block_size; - if (block_size > max_threads) - block_size = std::max(64, max_threads); - } - - CV_Assert(grid_size >= 1 && block_size >= 1); - return execution_policy(grid_size, block_size, sharedMem, stream); - } - - template inline - void launch_kernel(Kernel kernel, Args ...args) { - auto policy = make_policy(kernel); - kernel <<>> (std::forward(args)...); - } - - template inline - void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) { - kernel <<>> (std::forward(args)...); - } - - template inline - void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) { - kernel <<>> (std::forward(args)...); - } - - template CUDA4DNN_DEVICE auto getGridDim()->decltype(dim3::x); - template <> inline CUDA4DNN_DEVICE auto getGridDim<0>()->decltype(dim3::x) { return gridDim.x; } - template <> inline CUDA4DNN_DEVICE auto getGridDim<1>()->decltype(dim3::x) { 
return gridDim.y; } - template <> inline CUDA4DNN_DEVICE auto getGridDim<2>()->decltype(dim3::x) { return gridDim.z; } - - template CUDA4DNN_DEVICE auto getBlockDim()->decltype(dim3::x); - template <> inline CUDA4DNN_DEVICE auto getBlockDim<0>()->decltype(dim3::x) { return blockDim.x; } - template <> inline CUDA4DNN_DEVICE auto getBlockDim<1>()->decltype(dim3::x) { return blockDim.y; } - template <> inline CUDA4DNN_DEVICE auto getBlockDim<2>()->decltype(dim3::x) { return blockDim.z; } - - template CUDA4DNN_DEVICE auto getBlockIdx()->decltype(uint3::x); - template <> inline CUDA4DNN_DEVICE auto getBlockIdx<0>()->decltype(uint3::x) { return blockIdx.x; } - template <> inline CUDA4DNN_DEVICE auto getBlockIdx<1>()->decltype(uint3::x) { return blockIdx.y; } - template <> inline CUDA4DNN_DEVICE auto getBlockIdx<2>()->decltype(uint3::x) { return blockIdx.z; } - - template CUDA4DNN_DEVICE auto getThreadIdx()->decltype(uint3::x); - template <> inline CUDA4DNN_DEVICE auto getThreadIdx<0>()->decltype(uint3::x) { return threadIdx.x; } - template <> inline CUDA4DNN_DEVICE auto getThreadIdx<1>()->decltype(uint3::x) { return threadIdx.y; } - template <> inline CUDA4DNN_DEVICE auto getThreadIdx<2>()->decltype(uint3::x) { return threadIdx.z; } - - template - class grid_stride_range_generic { - public: - CUDA4DNN_DEVICE grid_stride_range_generic(std::size_t to_) : from(0), to(to_) { } - CUDA4DNN_DEVICE grid_stride_range_generic(std::size_t from_, std::size_t to_) : from(from_), to(to_) { } - - class iterator - { - public: - CUDA4DNN_DEVICE iterator(std::size_t pos_) : pos(pos_) {} - - CUDA4DNN_DEVICE size_t operator*() const { return pos; } - - CUDA4DNN_DEVICE iterator& operator++() { - pos += getGridDim() * getBlockDim(); - return *this; - } - - CUDA4DNN_DEVICE bool operator!=(const iterator& other) const { - /* NOTE HACK - ** 'pos' can move in large steps (see operator++) - ** expansion of range for loop uses != as the loop conditioion - ** => operator!= must return false if 'pos' crosses the end - */ - return pos < other.pos; - } - - private: - std::size_t pos; - }; - - CUDA4DNN_DEVICE iterator begin() const { - return iterator(from + getBlockDim() * getBlockIdx() + getThreadIdx()); - } - - CUDA4DNN_DEVICE iterator end() const { - return iterator(to); - } - - private: - std::size_t from, to; - }; - - using grid_stride_range_x = grid_stride_range_generic<0>; - using grid_stride_range_y = grid_stride_range_generic<1>; - using grid_stride_range_z = grid_stride_range_generic<2>; - using grid_stride_range = grid_stride_range_x; - -#endif /* __CUDACC__ */ - -}}}} /* cv::dnn::cuda4dnn::csl */ - -#endif /* OPENCV_DNN_CUDA4DNN_CSL_KERNEL_UTILS_HPP */ From ca95f5cef748cca9071e94586835f1a66fd47e2d Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 10 Jul 2019 22:51:46 +0530 Subject: [PATCH 056/129] vectorize prior box kernels --- modules/dnn/src/cuda/prior_box.cu | 59 +++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/modules/dnn/src/cuda/prior_box.cu b/modules/dnn/src/cuda/prior_box.cu index 40ebeed5e484..facebf1d8524 100644 --- a/modules/dnn/src/cuda/prior_box.cu +++ b/modules/dnn/src/cuda/prior_box.cu @@ -5,6 +5,7 @@ #include "array.hpp" #include "math.hpp" #include "types.hpp" +#include "vector_traits.hpp" #include "grid_stride_loop.hpp" #include "execution.hpp" @@ -30,6 +31,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k size_type imageWidth, size_type imageHeight, T stepX, T stepY) { + /* each box consists of two pair of 
coordinates and hence 4 values in total */ + /* since the entire output consists (first channel at least) of these boxes, + * we are garunteeed that the output is aligned to a boundary of 4 values + */ + using vector_type = typename get_vector_type::type; + vector_type* outputPtr_v4 = reinterpret_cast(output.data().get()); + /* num_points contains the number of points in the feature map of interest * each iteration of the stride loop selects a point and generates prior boxes for it */ @@ -38,26 +46,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k const index_type x = idx % layerWidth, y = idx / layerWidth; - DevicePtr output_ptr = output.data() + idx * 4 * offsetX.size() * boxWidth.size(); - + index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size(); for (int i = 0; i < boxWidth.size(); i++) { for (int j = 0; j < offsetX.size(); j++) { float center_x = (x + offsetX[j]) * stepX; float center_y = (y + offsetY[j]) * stepY; + vector_type vec; if(Normalize) { - output_ptr[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth; - output_ptr[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight; - output_ptr[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth; - output_ptr[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight; + vec.x = (center_x - boxWidth[i] * 0.5f) / imageWidth; + vec.y = (center_y - boxHeight[i] * 0.5f) / imageHeight; + vec.z = (center_x + boxWidth[i] * 0.5f) / imageWidth; + vec.w = (center_y + boxHeight[i] * 0.5f) / imageHeight; } else { - output_ptr[0] = center_x - boxWidth[i] * 0.5f; - output_ptr[1] = center_y - boxHeight[i] * 0.5f; - output_ptr[2] = center_x + boxWidth[i] * 0.5f - 1.0f; - output_ptr[3] = center_y + boxHeight[i] * 0.5f - 1.0f; + vec.x = center_x - boxWidth[i] * 0.5f; + vec.y = center_y - boxHeight[i] * 0.5f; + vec.z = center_x + boxWidth[i] * 0.5f - 1.0f; + vec.w = center_y + boxHeight[i] * 0.5f - 1.0f; } - output_ptr += 4; + outputPtr_v4[output_offset_v4] = vec; + output_offset_v4++; } } } @@ -73,8 +82,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void prior_box_set_variance1(span output, T variance) { - for (auto i : grid_stride_range(output.size())) - output[i] = variance; + using vector_type = typename get_vector_type::type; + vector_type* outputPtr_v4 = reinterpret_cast(output.data().get()); + for (auto i : grid_stride_range(output.size() / 4)) { + vector_type vec; + vec.x = variance; + vec.y = variance; + vec.z = variance; + vec.w = variance; + outputPtr_v4[i] = vec; + } } template @@ -82,9 +99,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void prior_box_set_variance4(span output, array variance) { - for (auto i : grid_stride_range(output.size())) { - const index_type vidx = i % variance.size(); - output[i] = variance[vidx]; + using vector_type = typename get_vector_type::type; + vector_type* outputPtr_v4 = reinterpret_cast(output.data().get()); + for (auto i : grid_stride_range(output.size() / 4)) { + vector_type vec; + vec.x = variance[0]; + vec.y = variance[1]; + vec.z = variance[2]; + vec.w = variance[3]; + outputPtr_v4[i] = vec; } } } @@ -142,13 +165,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k auto output_span_c2 = span(output.data() + channel_size, channel_size); if (variance.size() == 1) { auto kernel = raw::prior_box_set_variance1; - auto policy = make_policy(kernel, output_span_c2.size(), 0, stream); + auto policy = make_policy(kernel, 
output_span_c2.size() / 4, 0, stream); launch_kernel(kernel, policy, output_span_c2, variance[0]); } else { utils::array variance_k; variance_k.assign(std::begin(variance), std::end(variance)); auto kernel = raw::prior_box_set_variance4; - auto policy = make_policy(kernel, output_span_c2.size(), 0, stream); + auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream); launch_kernel(kernel, policy, output_span_c2, variance_k); } } From b678bff0a49f87b21efa45e8b38381ae57f3ab3a Mon Sep 17 00:00:00 2001 From: Yashas Date: Wed, 10 Jul 2019 23:00:11 +0530 Subject: [PATCH 057/129] fix address alignment check --- modules/dnn/src/cuda4dnn/csl/span.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/cuda4dnn/csl/span.hpp b/modules/dnn/src/cuda4dnn/csl/span.hpp index c26075856812..fa636aafefdf 100644 --- a/modules/dnn/src/cuda4dnn/csl/span.hpp +++ b/modules/dnn/src/cuda4dnn/csl/span.hpp @@ -58,17 +58,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { template using view = span; + /** returns true if the address of a span/view is aligned to \p alignment number of elements (not bytes) */ template bool is_address_aligned(view v, std::size_t alignment) { - return is_aligned(v.data(), alignment); + return is_aligned(v.data(), alignment * sizeof(T)); } + /** returns true if the size of a span/view is a multiple of \p alignment */ template bool is_size_aligned(view v, std::size_t alignment) { return v.size() % alignment == 0; } - /** @brief returns true if the address and the size of the span/view is aligned to a boundary */ + /** @brief returns true if the address and the size of the span/view is aligned + * \p alignment refers to the number of elements (not bytes) + */ template bool is_fully_aligned(view v, std::size_t alignment) { return is_address_aligned(v, alignment) && is_size_aligned(v, alignment); From 986b466f16f6851625ee98134561f376c85f1c7a Mon Sep 17 00:00:00 2001 From: Yashas Date: Thu, 11 Jul 2019 12:20:53 +0530 Subject: [PATCH 058/129] improve bias addition performance of conv/deconv/fc layers --- modules/dnn/src/layers/convolution_layer.cpp | 18 ++++++++++++++---- .../dnn/src/layers/fully_connected_layer.cpp | 13 +++++-------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index abeb6dbe862d..8485c5e3e361 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1313,17 +1313,21 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl convoluter.convolve(output, input, filtersTensor, workspace); if (hasBias() || fusedBias) - csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); + { + std::size_t inner_size = total(output_wrapper->getShape(), 2, -1); + csl::kernels::biasN(stream, output, output, inner_size, biasTensor); + } } void initCUDA( - csl::Stream stream, + csl::Stream stream_, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { + stream = std::move(stream_); cudnnHandle = std::move(cudnn_handle); auto input_wrapper = inputs[0].dynamicCast(); @@ -1432,6 +1436,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl scratch_mem_in_bytes = convoluter.get_workspace_size(); } + csl::Stream stream; csl::cudnn::Handle cudnnHandle; csl::Tensor filtersTensor, biasTensor; csl::Convolution convoluter; @@ -2076,17 +2081,21 @@ 
class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl convoluter.transpose_convolve(output, input, filtersTensor, workspace); if (hasBias() || fusedBias) - csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); + { + std::size_t inner_size = total(output_wrapper->getShape(), 2, -1); + csl::kernels::biasN(stream, output, output, inner_size, biasTensor); + } } void initCUDA( - csl::Stream stream, + csl::Stream stream_, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { + stream = std::move(stream_); cudnnHandle = std::move(cudnn_handle); auto input_wrapper = inputs[0].dynamicCast(); @@ -2214,6 +2223,7 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl scratch_mem_in_bytes = convoluter.get_workspace_size(); } + csl::Stream stream; csl::cudnn::Handle cudnnHandle; csl::Tensor filtersTensor, biasTensor; csl::TransposeConvolution convoluter; diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 8a21b67bc2d9..a162777c19ca 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -469,23 +469,20 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer csl::tensor_ops::gemm(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor); if (bias) - { - output.reshape(batch_size, 1, output_size, 1); - csl::tensor_ops::add(cudnnHandle, 1.0, output, 1.0, biasTensor); - } + csl::kernels::biasN(stream, output, output, 1, biasTensor); } } void initCUDA( - csl::Stream stream, + csl::Stream stream_, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { + stream = std::move(stream_); cublasHandle = std::move(cublas_handle); - cudnnHandle = std::move(cudnn_handle); weightsTensor = createTensorHeaderFromMat(weightsMat); CV_Assert(get_effective_rank(weightsTensor) == 2); @@ -500,9 +497,9 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer } } - csl::Tensor weightsTensor, biasTensor; + csl::Stream stream; csl::cublas::Handle cublasHandle; - csl::cudnn::Handle cudnnHandle; + csl::Tensor weightsTensor, biasTensor; #endif virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE From b0799b1ea4bdb4892e44fb15818255581b898d44 Mon Sep 17 00:00:00 2001 From: Yashas Date: Tue, 16 Jul 2019 17:43:31 +0530 Subject: [PATCH 059/129] restructure code for supporting multiple targets --- modules/dnn/src/cuda/activations.cu | 1 - modules/dnn/src/cuda/concat.cu | 1 - modules/dnn/src/cuda/eltwise_ops.cu | 1 - modules/dnn/src/cuda/fill.cu | 19 +- modules/dnn/src/cuda/normalize.cu | 18 +- modules/dnn/src/cuda/padding.cu | 1 - modules/dnn/src/cuda/permute.cu | 1 - modules/dnn/src/cuda/prior_box.cu | 35 +- modules/dnn/src/cuda/resize.cu | 1 - modules/dnn/src/cuda/scale.cu | 1 - modules/dnn/src/cuda/slice.cu | 1 - modules/dnn/src/cuda4dnn/csl/cudnn.hpp | 6 - modules/dnn/src/cuda4dnn/csl/kernels.hpp | 3 +- modules/dnn/src/cuda4dnn/csl/tensor.hpp | 30 +- modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 7 + .../src/cuda4dnn/cxx_utils/is_iterator.hpp | 31 ++ .../src/cuda4dnn/cxx_utils/make_unique.hpp | 19 + .../src/cuda4dnn/primitives/activation.hpp | 286 ++++++++++++++ .../src/cuda4dnn/primitives/batch_norm.hpp | 60 +++ .../dnn/src/cuda4dnn/primitives/concat.hpp | 88 +++++ modules/dnn/src/cuda4dnn/primitives/const.hpp | 51 +++ 
.../src/cuda4dnn/primitives/convolution.hpp | 240 ++++++++++++ .../dnn/src/cuda4dnn/primitives/eltwise.hpp | 114 ++++++ .../src/cuda4dnn/primitives/inner_product.hpp | 93 +++++ modules/dnn/src/cuda4dnn/primitives/lrn.hpp | 56 +++ .../cuda4dnn/primitives/normalize_bbox.hpp | 145 ++++++++ .../dnn/src/cuda4dnn/primitives/padding.hpp | 109 ++++++ .../dnn/src/cuda4dnn/primitives/permute.hpp | 67 ++++ .../dnn/src/cuda4dnn/primitives/pooling.hpp | 258 +++++++++++++ .../dnn/src/cuda4dnn/primitives/prior_box.hpp | 137 +++++++ .../dnn/src/cuda4dnn/primitives/reshape.hpp | 52 +++ .../dnn/src/cuda4dnn/primitives/resize.hpp | 59 +++ .../src/cuda4dnn/primitives/scale_shift.hpp | 116 ++++++ modules/dnn/src/cuda4dnn/primitives/slice.hpp | 58 +++ .../dnn/src/cuda4dnn/primitives/softmax.hpp | 53 +++ .../primitives/transpose_convolution.hpp | 225 +++++++++++ modules/dnn/src/layers/batch_norm_layer.cpp | 31 +- modules/dnn/src/layers/blank_layer.cpp | 25 +- modules/dnn/src/layers/concat_layer.cpp | 104 ++---- modules/dnn/src/layers/const_layer.cpp | 25 +- modules/dnn/src/layers/convolution_layer.cpp | 348 ++++++------------ modules/dnn/src/layers/elementwise_layers.cpp | 190 ++-------- modules/dnn/src/layers/eltwise_layer.cpp | 80 +--- modules/dnn/src/layers/flatten_layer.cpp | 35 +- .../dnn/src/layers/fully_connected_layer.cpp | 73 +--- modules/dnn/src/layers/lrn_layer.cpp | 32 +- .../dnn/src/layers/normalize_bbox_layer.cpp | 73 ++-- modules/dnn/src/layers/padding_layer.cpp | 75 +--- modules/dnn/src/layers/permute_layer.cpp | 41 +-- modules/dnn/src/layers/pooling_layer.cpp | 167 ++------- modules/dnn/src/layers/prior_box_layer.cpp | 77 ++-- modules/dnn/src/layers/reshape_layer.cpp | 34 +- modules/dnn/src/layers/resize_layer.cpp | 36 +- modules/dnn/src/layers/scale_layer.cpp | 86 +---- modules/dnn/src/layers/slice_layer.cpp | 38 +- modules/dnn/src/layers/softmax_layer.cpp | 35 +- modules/dnn/src/op_cuda.cpp | 118 ------ modules/dnn/src/op_cuda.hpp | 316 ++++++++++++---- modules/dnn/src/precomp.hpp | 1 - 59 files changed, 3056 insertions(+), 1427 deletions(-) create mode 100644 modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp create mode 100644 modules/dnn/src/cuda4dnn/cxx_utils/make_unique.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/activation.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/concat.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/const.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/convolution.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/eltwise.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/inner_product.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/lrn.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/padding.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/permute.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/pooling.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/prior_box.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/reshape.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/resize.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/slice.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/softmax.hpp create mode 100644 modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp delete mode 100644 
modules/dnn/src/op_cuda.cpp diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index 4feb4e822033..9ed3d4eb65c5 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -8,7 +8,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index 4a36fe7a2d58..52d36345275e 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -9,7 +9,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index 7047eb6578ff..3da47ddf6fbe 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -6,7 +6,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/fill.cu b/modules/dnn/src/cuda/fill.cu index 697fc626d831..b09491109990 100644 --- a/modules/dnn/src/cuda/fill.cu +++ b/modules/dnn/src/cuda/fill.cu @@ -5,7 +5,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" @@ -18,6 +17,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace kernels { namespace raw { + template + __global__ void zero(span output) { + for (auto idx : grid_stride_range(output.size())) + output[idx] = 0; + } + template __global__ void fill(span output, T value) { for (auto i : grid_stride_range(output.size())) @@ -27,9 +32,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template void fill(const Stream& stream, span output, T value) { - auto kernel = raw::fill; - auto policy = make_policy(kernel, output.size(), 0, stream); - launch_kernel(kernel, policy, output, value); + if (value == 0.0) { + auto kernel = raw::zero; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output); + } else { + auto kernel = raw::fill; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, value); + } } template void fill(const Stream&, span, float); diff --git a/modules/dnn/src/cuda/normalize.cu b/modules/dnn/src/cuda/normalize.cu index c4ff9ddb95c9..3e7c7f0f13b3 100644 --- a/modules/dnn/src/cuda/normalize.cu +++ b/modules/dnn/src/cuda/normalize.cu @@ -9,9 +9,9 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" +#include "../cuda4dnn/csl/kernels.hpp" #include @@ -23,12 +23,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k using index_type = gpu::index_type; using size_type = gpu::size_type; - template - __global__ void zero(span output) { - for (auto idx : grid_stride_range(output.size())) - output[idx] = 0; - } - template __global__ void reduce_sum_abs(span output, view input, size_type outer_stride, size_type mid_stride) { for (auto idx : grid_stride_range(input.size())) { @@ -88,13 +82,11 @@ namespace cv { 
namespace dnn { namespace cuda4dnn { namespace csl { namespace k auto sums = span(workspace.data(), outer_size * inner_size); - auto zero_kernel = raw::zero; - auto policy = make_policy(zero_kernel, sums.size(), 0, stream); - launch_kernel(zero_kernel, policy, sums); + fill(stream, sums, 0.0); if (norm == 1) { auto reduce_kernel = raw::reduce_sum_abs; - policy = make_policy(reduce_kernel, input.size(), 0, stream); + auto policy = make_policy(reduce_kernel, input.size(), 0, stream); launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size); auto reciprocal_kernel = raw::reciprocal; @@ -102,7 +94,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k launch_kernel(reciprocal_kernel, policy, sums, epsilon); } else { auto reduce_kernel = raw::reduce_sum_squared; - policy = make_policy(reduce_kernel, input.size(), 0, stream); + auto policy = make_policy(reduce_kernel, input.size(), 0, stream); launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size); auto rsqrt_kernel = raw::rsqrt; @@ -111,7 +103,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } auto scale_kernel = raw::apply_norm; - policy = make_policy(scale_kernel, output.size(), 0, stream); + auto policy = make_policy(scale_kernel, output.size(), 0, stream); launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums); } diff --git a/modules/dnn/src/cuda/padding.cu b/modules/dnn/src/cuda/padding.cu index 0bd0a5a16757..4ee2cb081383 100644 --- a/modules/dnn/src/cuda/padding.cu +++ b/modules/dnn/src/cuda/padding.cu @@ -8,7 +8,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/permute.cu b/modules/dnn/src/cuda/permute.cu index 88efed452693..541f8612359d 100644 --- a/modules/dnn/src/cuda/permute.cu +++ b/modules/dnn/src/cuda/permute.cu @@ -7,7 +7,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/prior_box.cu b/modules/dnn/src/cuda/prior_box.cu index facebf1d8524..643951dbf0ed 100644 --- a/modules/dnn/src/cuda/prior_box.cu +++ b/modules/dnn/src/cuda/prior_box.cu @@ -9,7 +9,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/span.hpp" @@ -26,10 +25,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template __global__ void prior_box( span output, - view boxWidth, view boxHeight, view offsetX, view offsetY, + view boxWidth, view boxHeight, view offsetX, view offsetY, T stepX, T stepY, size_type layerWidth, size_type layerHeight, - size_type imageWidth, size_type imageHeight, - T stepX, T stepY) + size_type imageWidth, size_type imageHeight) { /* each box consists of two pair of coordinates and hence 4 values in total */ /* since the entire output consists (first channel at least) of these boxes, @@ -115,40 +113,37 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k template static void launch_prior_box_kernel( const Stream& stream, - span output, view boxWidth, view boxHeight, view offsetX, view offsetY, - std::size_t layerWidth, 
std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight, - T stepX, T stepY) + span output, view boxWidth, view boxHeight, view offsetX, view offsetY, T stepX, T stepY, + std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight) { auto num_points = layerWidth * layerHeight; auto kernel = raw::prior_box; auto policy = make_policy(kernel, num_points, 0, stream); launch_kernel(kernel, policy, - output, boxWidth, boxHeight, offsetX, offsetY, - layerWidth, layerHeight, imageWidth, imageHeight, - stepX, stepY); + output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY, + layerWidth, layerHeight, imageWidth, imageHeight); } template void generate_prior_boxes( const Stream& stream, span output, - view boxWidth, view boxHeight, view offsetX, view offsetY, + view boxWidth, view boxHeight, view offsetX, view offsetY, T stepX, T stepY, std::vector variance, std::size_t numPriors, std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight, - T stepX, T stepY, bool normalize, bool clip) { if (normalize) { launch_prior_box_kernel( - stream, output, boxWidth, boxHeight, offsetX, offsetY, - layerWidth, layerHeight, imageWidth, imageHeight, stepX, stepY + stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY, + layerWidth, layerHeight, imageWidth, imageHeight ); } else { launch_prior_box_kernel( - stream, output, boxWidth, boxHeight, offsetX, offsetY, - layerWidth, layerHeight, imageWidth, imageHeight, stepX, stepY + stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY, + layerWidth, layerHeight, imageWidth, imageHeight ); } @@ -176,10 +171,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace k } } - template void generate_prior_boxes(const Stream&, span, view, view, view, view, - std::vector, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, float, float, bool, bool); + template void generate_prior_boxes(const Stream&, span, view, view, view, view, float, float, + std::vector, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool); - template void generate_prior_boxes(const Stream&, span, view, view, view, view, - std::vector, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, double, double, bool, bool); + template void generate_prior_boxes(const Stream&, span, view, view, view, view, double, double, + std::vector, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool); }}}}} /* cv::dnn::cuda4dnn::csl::kernels */ diff --git a/modules/dnn/src/cuda/resize.cu b/modules/dnn/src/cuda/resize.cu index 5a205126422f..31357bb6a100 100644 --- a/modules/dnn/src/cuda/resize.cu +++ b/modules/dnn/src/cuda/resize.cu @@ -7,7 +7,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/scale.cu b/modules/dnn/src/cuda/scale.cu index fc0894a841f9..99c979de04b0 100644 --- a/modules/dnn/src/cuda/scale.cu +++ b/modules/dnn/src/cuda/scale.cu @@ -7,7 +7,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda/slice.cu b/modules/dnn/src/cuda/slice.cu index 564d64b56c1e..8187ae1352bd 100644 --- a/modules/dnn/src/cuda/slice.cu +++ 
b/modules/dnn/src/cuda/slice.cu @@ -7,7 +7,6 @@ #include "grid_stride_loop.hpp" #include "execution.hpp" -#include "../cuda4dnn/csl/kernels.hpp" #include "../cuda4dnn/csl/stream.hpp" #include "../cuda4dnn/csl/tensor.hpp" #include "../cuda4dnn/csl/span.hpp" diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp index 62286bec700c..f6594ad577b3 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn.hpp @@ -8,11 +8,5 @@ #include #include "cudnn/cudnn.hpp" -#include "cudnn/convolution.hpp" -#include "cudnn/transpose_convolution.hpp" -#include "cudnn/lrn.hpp" -#include "cudnn/pooling.hpp" -#include "cudnn/softmax.hpp" -#include "cudnn/transform.hpp" #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/kernels.hpp b/modules/dnn/src/cuda4dnn/csl/kernels.hpp index 914ba34606b1..0ebbe8b4d727 100644 --- a/modules/dnn/src/cuda4dnn/csl/kernels.hpp +++ b/modules/dnn/src/cuda4dnn/csl/kernels.hpp @@ -104,12 +104,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace ke void generate_prior_boxes( const Stream& stream, span output, - view boxWidth, view boxHeight, view offsetX, view offsetY, + view boxWidth, view boxHeight, view offsetX, view offsetY, T stepX, T stepY, std::vector variance, std::size_t numPriors, std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight, - T stepX, T stepY, bool normalize, bool clip); template diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index d3f952aa0911..bd3accc296e3 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -12,6 +12,8 @@ #include "span.hpp" #include "workspace.hpp" +#include "../cxx_utils/is_iterator.hpp" + #include #include @@ -85,7 +87,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Whatever arguments are accepted by the resize methods are accepted here. */ template - Tensor(Args... sizes) { resize(std::forward(sizes)...); } + Tensor(Args&&... sizes) { resize(std::forward(sizes)...); } Tensor& operator=(const Tensor&) = delete; Tensor& operator=(Tensor&& other) noexcept { @@ -143,7 +145,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Guarantee: Strong */ template - typename std::enable_if::value, void> // TODO is_iterator + typename std::enable_if::value, void> ::type resize(ForwardItr start, ForwardItr end) { CV_Assert(start != end); CV_Assert(std::distance(start, end) <= rank); @@ -201,7 +203,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Guarantee: Strong */ template - typename std::enable_if::value, void> // TODO is_iterator + typename std::enable_if::value, void> ::type reshape(ForwardItr start, ForwardItr end) { CV_Assert(start != end); CV_Assert(std::distance(start, end) <= rank); @@ -316,7 +318,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { sizes[i] = parent.get_axis_size(i); } - /* returns the total number of elements in the span */ + /** creates a subspan of a tensor (or span); refer to subspan method for more details */ + template + TensorSpan(TensorSpan other, size_type offset, Args&&... 
args) + : TensorSpan(other.subspan(offset, std::forward(args)...)) + { + } + + /** returns the total number of elements in the span */ CUDA4DNN_HOST/*_DEVICE*/ size_type size() const noexcept { return std::accumulate(std::begin(sizes), std::end(sizes), 1, std::multiplies()); } @@ -368,7 +377,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Guarantee: Strong */ template CUDA4DNN_HOST - typename std::enable_if::value, void> // TODO is_iterator + typename std::enable_if::value, void> ::type reshape(ForwardItr start, ForwardItr end) { CV_Assert(start != end); CV_Assert(std::distance(start, end) <= rank); @@ -452,7 +461,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Guarantee: Strong */ template CUDA4DNN_HOST - typename std::enable_if::value, TensorSpan> // TODO is_iterator + typename std::enable_if::value, TensorSpan> ::type subspan(size_type offset, ForwardItr start, ForwardItr end) const { CV_Assert(start != end); CV_Assert(std::distance(start, end) <= rank); @@ -541,6 +550,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { sizes[i] = parent.get_axis_size(i); } + /** creates a subview of a tensor (or span or view); refer to subview method for more details */ + template + TensorView(TensorView other, size_type offset, Args&&... args) noexcept + : TensorView(other.subview(offset, std::forward(args)...)) + { + } + TensorView& operator=(const TensorView&) = default; TensorView& operator=(const TensorSpan& other) noexcept { TensorView tmp(other); @@ -684,7 +700,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { * Exception Guarantee: Strong */ template CUDA4DNN_HOST - typename std::enable_if::value, TensorView> // TODO is_iterator + typename std::enable_if::value, TensorView> ::type subview(size_type offset, ForwardItr start, ForwardItr end) const { CV_Assert(start != end); CV_Assert(std::distance(start, end) <= rank); diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index 490618d75233..13fccd40ef78 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -12,6 +12,13 @@ #include "cublas.hpp" #include "cudnn.hpp" +#include "cudnn/convolution.hpp" +#include "cudnn/pooling.hpp" +#include "cudnn/lrn.hpp" +#include "cudnn/softmax.hpp" +#include "cudnn/transform.hpp" +#include "cudnn/transpose_convolution.hpp" + #include #include diff --git a/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp b/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp new file mode 100644 index 000000000000..4778ca327ab0 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp @@ -0,0 +1,31 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
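+//
+// SFINAE helpers for detecting iterator types; the tensor classes use is_forward_iterator
+// with std::enable_if to gate their iterator-based resize/reshape/subspan/subview overloads.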
+ +#ifndef OPENCV_DNN_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP +#define OPENCV_DNN_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils { + + namespace detail { + template + struct is_iterator_helper : std::false_type {}; + + template + struct is_iterator_helper::iterator_category>::value, void>::type + > : std::true_type {}; + } + + template + using is_iterator = typename detail::is_iterator_helper; + + template + using is_forward_iterator = typename detail::is_iterator_helper; + +}}}} /* cv::dnn::cuda4dnn::csl::cxx_utils */ + +#endif /* OPENCV_DNN_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP */ diff --git a/modules/dnn/src/cuda4dnn/cxx_utils/make_unique.hpp b/modules/dnn/src/cuda4dnn/cxx_utils/make_unique.hpp new file mode 100644 index 000000000000..4690ab026ad4 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/cxx_utils/make_unique.hpp @@ -0,0 +1,19 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_CXX_UTILS_MAKE_UNIQUE_HPP +#define OPENCV_DNN_CUDA4DNN_CXX_UTILS_MAKE_UNIQUE_HPP + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils { + + template + std::unique_ptr make_unique(Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); + } + +}}}} /* cv::dnn::cuda4dnn::csl::cxx_utils */ + +#endif /* OPENCV_DNN_CUDA4DNN_CXX_UTILS_MAKE_UNIQUE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/activation.hpp b/modules/dnn/src/cuda4dnn/primitives/activation.hpp new file mode 100644 index 000000000000..69efd45ae927 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/activation.hpp @@ -0,0 +1,286 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
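+//
+// CUDA backend nodes for the element-wise activation layers (ReLU, ClippedReLU, ChannelwiseReLU,
+// TanH, Sigmoid, ELU, AbsVal, BNLL and Power). Each node forwards every input/output pair
+// through the corresponding csl::tensor_ops routine on the node's stream.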
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class ReLUOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ReLUOp(csl::Stream stream_, T slope_) + : stream(std::move(stream_)), slope{ slope_ } { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::relu(stream, output, input, slope); + } + } + + private: + csl::Stream stream; + const T slope; + }; + + template + class ClippedReLUOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ClippedReLUOp(csl::Stream stream_, T min_, T max_) + : stream(std::move(stream_)), min{ min_ }, max{ max_ } { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::clipped_relu(stream, output, input, min, max); + } + } + + private: + csl::Stream stream; + const T min, max; + }; + + template + class ChannelwiseReLUOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ChannelwiseReLUOp(csl::Stream stream_, const Mat& slope) + : stream(std::move(stream_)) + { + slopeTensor = csl::makeTensorHeader(slope); + csl::copyMatToTensor(slopeTensor, slope, stream); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::channelwise_relu(stream, output, input, slopeTensor); + } + } + + private: + csl::Stream stream; + csl::Tensor slopeTensor; + }; + + template + class TanHOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + TanHOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::tanh(stream, output, input); + } + } + + private: + csl::Stream stream; + }; + + template + class SigmoidOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + SigmoidOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + 
auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::sigmoid(stream, output, input); + } + } + + private: + csl::Stream stream; + }; + + template + class ELUOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ELUOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::elu(stream, output, input); + } + } + + private: + csl::Stream stream; + }; + + template + class AbsValOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + AbsValOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::abs(stream, output, input); + } + } + + private: + csl::Stream stream; + }; + + template + class BNLLOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + BNLLOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::bnll(stream, output, input); + } + } + + private: + csl::Stream stream; + }; + + template + class PowerOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + PowerOp(csl::Stream stream_, T exp_, T scale_, T shift_) + : stream(std::move(stream_)), exp{ exp_ }, scale{ scale_ }, shift{ shift_ } { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::power(stream, output, input, exp, scale, shift); + } + } + + private: + csl::Stream stream; + const T exp, scale, shift; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp b/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp new file mode 100644 index 000000000000..197226c66797 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp @@ -0,0 +1,60 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
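+//
+// CUDA backend node for the BatchNorm layer. The per-channel weights and bias are uploaded
+// once in the constructor and applied in forward() with the fused scaleN_with_biasN kernel.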
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class BatchNormOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + BatchNormOp(csl::Stream stream_, const cv::Mat& weights, const cv::Mat& bias) + : stream(std::move(stream_)) + { + biasTensor = csl::makeTensorHeader(bias); + csl::copyMatToTensor(biasTensor, bias, stream); + + weightsTensor = csl::makeTensorHeader(weights); + csl::copyMatToTensor(weightsTensor, weights, stream); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + std::size_t inner_size = 1; + for (int i = 2; i < input.rank; i++) + inner_size *= input.get_axis_size(i); + + csl::kernels::scaleN_with_biasN(stream, output, input, inner_size, weightsTensor, biasTensor); + } + + private: + csl::Stream stream; + csl::Tensor weightsTensor, biasTensor; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/concat.hpp b/modules/dnn/src/cuda4dnn/primitives/concat.hpp new file mode 100644 index 000000000000..8db80186f165 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/concat.hpp @@ -0,0 +1,88 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
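+//
+// CUDA backend node for the Concat layer. Inputs are copied into the output along concat_axis;
+// if zero padding is requested, the output is zeroed first and each input is centred within
+// the larger output slice using concat_with_offsets.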
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONCAT_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONCAT_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/pointer.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class ConcatOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ConcatOp(csl::Stream stream_, std::size_t concat_axis, bool zero_padding) + : stream(std::move(stream_)), concat_axis{ concat_axis }, zero_padding{ zero_padding } + { + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(outputs.size() == 1); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if(zero_padding) + { + auto output_shape = output_wrapper->getShape(); + + csl::memset(output.get(), 0, output.size(), stream); + + std::size_t output_concat_axis_offset = 0; + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + auto input_shape = input_wrapper->getShape(); + + std::vector offsets(input_shape.size()); + for (int j = 0; j < offsets.size(); j++) + offsets[j] = (output_shape[j] - input_shape[j]) / 2; + offsets[concat_axis] = output_concat_axis_offset; + + csl::kernels::concat_with_offsets(stream, output, input, offsets); + + output_concat_axis_offset += input.get_axis_size(concat_axis); + } + } + else + { + std::size_t output_axis_offset = 0; + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + csl::kernels::concat(stream, output, output_axis_offset, input, concat_axis); + + output_axis_offset += input.get_axis_size(concat_axis); + } + } + } + + private: + csl::Stream stream; + std::size_t concat_axis; + bool zero_padding; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONCAT_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/const.hpp b/modules/dnn/src/cuda4dnn/primitives/const.hpp new file mode 100644 index 000000000000..ffccc3244043 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/const.hpp @@ -0,0 +1,51 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
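+//
+// CUDA backend node for the Const layer: the blob is uploaded to the device once in the
+// constructor and copied into the output tensor on every forward pass.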
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONST_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONST_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class ConstOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ConstOp(csl::Stream stream_, const cv::Mat& data) + : stream(std::move(stream_)) + { + constTensor = csl::makeTensorHeader(data); + csl::copyMatToTensor(constTensor, data, stream); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(outputs.size() == 1 && inputs.size() == 0); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + csl::tensor_ops::copy(stream, output, constTensor); + } + + private: + csl::Stream stream; + csl::Tensor constTensor; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONST_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp new file mode 100644 index 000000000000..bd431865acbf --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp @@ -0,0 +1,240 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/cudnn.hpp" +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + struct ConvolutionConfiguration { + /* the size of the following vectors must be equal to the kernel size */ + std::vector kernel_size; + std::vector dilations, strides; + + enum class padding_mode { + manual, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ + valid, /* no padding is added */ + same /* TensorFlow logic is used for same padding */ + }; + + /* explicit paddings are used if and only if padMode is set to manual */ + padding_mode padMode; + std::vector pads_begin, pads_end; + + /* full shape inclusive of channel and batch axis */ + std::vector input_shape; + std::vector output_shape; + + /* group count for grouped convolution */ + std::size_t groups; + }; + + template + class ConvolutionOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias) + : stream(std::move(stream_)), cudnnHandle(std::move(handle)) + { + const auto& kernel_size = config.kernel_size; + const auto& dilations = config.dilations; + const auto& strides = config.strides; + + const auto convolution_order = kernel_size.size(); + CV_Assert(convolution_order >= 1); + + CV_Assert(convolution_order == dilations.size()); + CV_Assert(convolution_order == strides.size()); + + const auto& input_shape = config.input_shape; + const auto& output_shape = config.output_shape; + CV_Assert(input_shape.size() == output_shape.size()); + CV_Assert(input_shape.size() == 
convolution_order + 2); + + const auto groups = config.groups; + + if (convolution_order > 3) + CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported."); + + const auto rank = input_shape.size(); + const auto output_feature_maps = output_shape[1]; + const auto input_feature_maps = input_shape[1]; + const auto input_feature_maps_per_group = input_feature_maps / groups; + CV_Assert(input_feature_maps % groups == 0); + + filtersTensor = csl::makeTensorHeader(filters); + csl::copyMatToTensor(filtersTensor, filters, stream); + + if (!bias.empty()) + { + biasTensor = csl::makeTensorHeader(bias); + csl::copyMatToTensor(biasTensor, bias, stream); + } + + /* left and right are misleading as the padding is applicable for any number of dimensions + * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` + * + * `common_padding` contains the amount of padding that has to be added to both sides + * `padding_left` and `padding_right` contains the amount of padding that needs to be added + * to a particular side in addition to the common padding + */ + std::vector common_padding(rank, 0); + std::vector padding_left(rank, 0), padding_right(rank, 0); + if (config.padMode == ConvolutionConfiguration::padding_mode::manual) + { + const auto& pads_begin = config.pads_begin; + const auto& pads_end = config.pads_end; + + CV_Assert(convolution_order == pads_begin.size()); + CV_Assert(convolution_order == pads_end.size()); + + for (int i = 2; i < common_padding.size(); i++) + { + common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); + padding_left[i] = pads_begin[i - 2] - common_padding[i]; + padding_right[i] = pads_end[i - 2] - common_padding[i]; + } + } + else if (config.padMode == ConvolutionConfiguration::padding_mode::valid) + { + /* nothing to do as the paddings are already preset to zero */ + } + else if (config.padMode == ConvolutionConfiguration::padding_mode::same) + { + /* TensorFlow Logic: + * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] + * + * if total padding is odd, the extra is added towards the end + */ + for (int i = 2; i < rank; i++) + { + const auto j = i - 2; /* filter index */ + const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; + const auto required_total_padding = + std::max(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]); + + common_padding[i] = required_total_padding / 2; + padding_left[i] = 0; + padding_right[i] = required_total_padding % 2; + } + } + + /* in some scenarios, the extra padding at the end may not change the output at all */ + for (int i = 2; i < rank; i++) { + const auto j = i - 2; /* filter idx */ + const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; + const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; + std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j]; + + /* the output shape doesn't change if we decrease the total padding by at most `rem` + * provided that we decrease from the right + */ + if (rem && padding_right[i] > 0) + padding_right[i] = std::max(0, padding_right[i] - rem); + } + + auto is_not_zero = [](std::size_t i) { return i != 0; }; + if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) || + std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero)) + { + /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by + * copying the input to a bigger 
tensor and padding the ends manually + */ + auto transformed_input_shape = input_shape; + for (int i = 0; i < rank; i++) + transformed_input_shape[i] += padding_left[i] + padding_right[i]; + + transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape)); + inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right); + } + + csl::Convolution::params_type params; + if (transformedInput.empty()) + { + params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + } + else + { + /* the convolution operation will be seeing the transformed input */ + auto transformed_input_shape = transformedInput.shape(); + params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape)); + } + + auto& fshape = params.filter_shape; + fshape.resize(rank); + fshape[0] = output_feature_maps; + fshape[1] = input_feature_maps_per_group; + std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); + CV_Assert(fshape.size() == kernel_size.size() + 2); + + params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); + params.stride = strides; + params.dialation = dilations; + params.groups = config.groups; + + convoluter = csl::Convolution(cudnnHandle, params); + scratch_mem_in_bytes = convoluter.get_workspace_size(); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + if (!transformedInput.empty()) + { + inputTransformer.transform(input, transformedInput); + input = csl::TensorView(transformedInput); + } + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + convoluter.convolve(output, input, filtersTensor, workspace); + if (!biasTensor.empty()) + { + std::size_t inner_size = total(output_wrapper->getShape(), 2, -1); + csl::kernels::biasN(stream, output, output, inner_size, biasTensor); + } + } + + std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } + + private: + csl::Stream stream; + csl::cudnn::Handle cudnnHandle; + csl::Tensor filtersTensor, biasTensor; + csl::Convolution convoluter; + + csl::Tensor transformedInput; + csl::TensorTransform inputTransformer; + + std::size_t scratch_mem_in_bytes; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp new file mode 100644 index 000000000000..18695316065a --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp @@ -0,0 +1,114 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
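+
+/* EltwiseOp combines two or more tensors element-wise using max, sum or product.
+ * Per-input coefficients are only meaningful for the sum operation. With exactly two
+ * inputs a single fused kernel is launched; with more inputs the first input is copied
+ * to the output and the remaining inputs are folded in one at a time, so a coefficient
+ * sum over three inputs is effectively evaluated as
+ *
+ *   output = (c0 * x0 + c1 * x1) + c2 * x2
+ *
+ * where the first iteration also applies c0 (see the `coeff_x` handling in forward()).
+ */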
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_ELTWISE_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_ELTWISE_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + enum class eltwise_op { + max, + sum, + product + }; + + template + class EltwiseOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + template + EltwiseOp(csl::Stream stream_, eltwise_op op_, std::vector coeffs_) + : stream(std::move(stream_)), op{ op_ }, coeffs(std::begin(coeffs_), std::end(coeffs_)) + { + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() >= 2); + CV_Assert(outputs.size() == 1); + + CV_Assert(coeffs.size() == 0 || op == eltwise_op::sum); + CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size()); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (inputs.size() == 2) + { + auto input_wrapper_x = inputs[0].dynamicCast(); + auto input_x = input_wrapper_x->getView(); + + auto input_wrapper_y = inputs[1].dynamicCast(); + auto input_y = input_wrapper_y->getView(); + + switch (op) + { + case eltwise_op::max: csl::kernels::eltwise_max_2(stream, output, input_x, input_y); break; + case eltwise_op::product: csl::kernels::eltwise_prod_2(stream, output, input_x, input_y); break; + case eltwise_op::sum: + if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1)) + csl::kernels::eltwise_sum_2(stream, output, input_x, input_y); + else + csl::kernels::eltwise_sum_coeff_2(stream, output, coeffs[0], input_x, coeffs[1], input_y); + break; + } + } + else + { + auto input_wrapper_0 = inputs[0].dynamicCast(); + auto input_0 = input_wrapper_0->getView(); + + /* we first make a copy and then apply EltwiseOp cumulatively */ + csl::tensor_ops::copy(stream, output, input_0); + + for (int i = 1; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + switch (op) + { + case eltwise_op::max: csl::kernels::eltwise_max_2(stream, output, output, input); break; + case eltwise_op::product: csl::kernels::eltwise_prod_2(stream, output, output, input); break; + case eltwise_op::sum: + if (coeffs.empty() || coeffs[i] == 1) + csl::kernels::eltwise_sum_2(stream, output, output, input); + else + { + /* if this is the first op, we must scale output too */ + auto coeff_x = (i == 1) ? coeffs[0] : 1.0; + csl::kernels::eltwise_sum_coeff_2(stream, output, coeff_x, output, coeffs[i], input); + } + break; + } + } + } + } + + private: + csl::Stream stream; + eltwise_op op; + std::vector coeffs; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_ELTWISE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/inner_product.hpp b/modules/dnn/src/cuda4dnn/primitives/inner_product.hpp new file mode 100644 index 000000000000..f2775cb1f3e7 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/inner_product.hpp @@ -0,0 +1,93 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
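+
+/* InnerProductOp implements a fully connected layer as a GEMM. The input is viewed as a
+ * (batch_size, input_size) matrix, the weights as (output_size, input_size), and the
+ * output is computed as input * transpose(weights), giving (batch_size, output_size);
+ * an optional bias of length output_size is then broadcast over the batch.
+ *
+ * Illustrative shapes (not taken from any particular network): with axis = 1, an input
+ * of shape (32, 1024) and a weight matrix of shape (1000, 1024) produce an output of
+ * shape (32, 1000):
+ *
+ *   (32, 1024) x (1024, 1000) = (32, 1000)
+ */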
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/cublas.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class InnerProductOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + InnerProductOp(csl::Stream stream_, csl::cublas::Handle handle, std::size_t axis, const Mat& weights, const Mat& bias) + : stream(std::move(stream_)), cublasHandle(std::move(handle)), axis{ axis } + { + weightsTensor = csl::makeTensorHeader(weights); + CV_Assert(get_effective_rank(weightsTensor) == 2); + csl::copyMatToTensor(weightsTensor, weights, stream); + + if (!bias.empty()) + { + biasTensor = csl::makeTensorHeader(bias); + csl::copyMatToTensor(biasTensor, bias, stream); + CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.size()); + } + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + std::size_t batch_size = 1; + for (int j = 0; j < axis; j++) + batch_size *= input.get_axis_size(j); + + auto input_size = input.size() / batch_size; + CV_Assert(input_size == weightsTensor.get_axis_size(-1)); + + auto output_size = output.size() / batch_size; + CV_Assert(output_size == weightsTensor.get_axis_size(-2)); + + /* we treat the input and output as a matrix with dimensions (batch_size, input_size) + * and (batch_size, output_size) respectively + * + * weight matrix dimensions: (output_size, input_size) + * + * I(W^T) = O + * (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size) + */ + input.reshape(batch_size, input_size); + output.reshape(batch_size, output_size); + csl::tensor_ops::gemm(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor); + + if (!biasTensor.empty()) + csl::kernels::biasN(stream, output, output, 1, biasTensor); + } + } + + private: + csl::Stream stream; + csl::cublas::Handle cublasHandle; + csl::Tensor weightsTensor, biasTensor; + std::size_t axis; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/lrn.hpp b/modules/dnn/src/cuda4dnn/primitives/lrn.hpp new file mode 100644 index 000000000000..2afd3c7ba582 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/lrn.hpp @@ -0,0 +1,56 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
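+
+/* LRNOp is a thin wrapper: the csl::LRN object (backed by cuDNN) is configured once in
+ * the constructor and forward() normalizes each input/output pair independently. Only
+ * across-channel normalization is exposed here; cross-channel LRN is conventionally
+ * defined along the lines of
+ *
+ *   y_c = x_c / (bias + (alpha / n) * sum of x^2 over the n-channel window)^beta
+ *
+ * but the exact semantics are whatever csl::LRN / cuDNN implement for the given
+ * local_size, alpha, beta and bias.
+ */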
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_LRN_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_LRN_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/cudnn.hpp" +#include "../csl/tensor_ops.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + enum class lrn_type { + across_channels + }; + + template + class LRNOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + LRNOp(csl::cudnn::Handle handle, lrn_type type, std::size_t local_size, T alpha, T beta, T bias) + { + if(type == lrn_type::across_channels) + lrn = csl::LRN(std::move(handle), local_size, alpha, beta, bias, csl::LRN::lrn_type::ACROSS_CHANNELS); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + lrn.normalize(input, output); + } + } + + private: + csl::LRN lrn; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_LRN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp b/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp new file mode 100644 index 000000000000..93f3bff693a8 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp @@ -0,0 +1,145 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" +#include "../csl/tensor.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + struct NormalizeConfiguration { + std::vector input_shape; + + /* axis range across which values are normalized + * + * [0, axis_start) = outer range + * [axis_start, axis_end) = mid range + * [axis_end + 1, -1) = inner range + * + * for each location in the outer and inner range, all the values in the mid range are + * normalized together + */ + std::size_t axis_start, axis_end; + + /* 1 for L1 norm, 2 for L2 norm */ + std::size_t norm; + + /* epsilon to use to avoid divison by zero */ + T eps; + }; + + template + class NormalizeOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + template + NormalizeOp(csl::Stream stream_, const Mat& weights, const NormalizeConfiguration& config) + : stream(std::move(stream_)), weight{ 1.0 } + { + norm_order = config.norm; + epsilon = config.eps; + axis_start = config.axis_start; + axis_end = config.axis_end; + + if (!weights.empty()) + { + if (weights.total() == 1) + { + weight = weights.at(0, 0); + } + else + { + weightsTensor = csl::makeTensorHeader(weights); + csl::copyMatToTensor(weightsTensor, weights, stream); + } + } + + std::size_t outer_size = 1; + for (int i = 0; i < axis_start; i++) + outer_size *= config.input_shape[i]; + + std::size_t inner_size = 1; + for (int i = axis_end; i < config.input_shape.size(); i++) + inner_size *= config.input_shape[i]; + + scratch_mem_in_bytes = outer_size * inner_size * sizeof(T); + } + + void forward( + std::vector>& inputs, + std::vector>& 
outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + auto input_shape = input_wrapper->getShape(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + std::size_t outer_size = 1; + for (int i = 0; i < axis_start; i++) + outer_size *= input_shape[i]; + + std::size_t mid_size = 1; + for (int i = axis_start; i < axis_end; i++) + mid_size *= input_shape[i]; + + std::size_t inner_size = 1; + for (int i = axis_end; i < input_shape.size(); i++) + inner_size *= input_shape[i]; + + auto scratch_ptr = reinterpret_cast(csl::WorkspaceAccessor::get(workspace).get()); + auto scratch = csl::span(csl::DevicePtr(scratch_ptr), workspace.size()); + csl::kernels::normalize(stream, output, input, outer_size, mid_size, inner_size, norm_order, epsilon, scratch); + + /* there might be a single weight in which case `weight` will be not equal to 1.0 + * or there might be several weights + * or we don't have to scale + */ + if (weight != 1.0) + { + csl::kernels::scale1(stream, output, input, weight); + } + else if (!weightsTensor.empty()) + { + CV_Assert(weightsTensor.size() != 1); /* constructor should have set up to use `weight` */ + CV_Assert(weightsTensor.size() == mid_size); + csl::kernels::scaleN(stream, output, input, inner_size, weightsTensor); + } + } + + std::size_t get_workspace_memory_in_bytes() { return scratch_mem_in_bytes; } + + private: + csl::Stream stream; + csl::Tensor weightsTensor; + T weight; /* if there is only one weight, we use this */ + + T epsilon; + std::size_t norm_order; + std::size_t axis_start, axis_end; + + std::size_t scratch_mem_in_bytes; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/padding.hpp b/modules/dnn/src/cuda4dnn/primitives/padding.hpp new file mode 100644 index 000000000000..35a6bd9f0c59 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/padding.hpp @@ -0,0 +1,109 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
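+
+/* PaddingOp supports two modes. Constant padding first fills the whole output with
+ * `value` and then copies the input into the region described by `dstRanges`
+ * (reusing the concat_with_offsets kernel). Reflection (BORDER_REFLECT_101-style)
+ * padding is done directly by copy_with_reflection101 using per-axis source ranges.
+ * `dstRanges` is indexed by output axis and Range::all() marks an axis that is not
+ * padded.
+ *
+ * Illustrative example: padding a (1, 3, 5, 5) input by one pixel on each side of H
+ * and W produces a (1, 3, 7, 7) output with
+ *
+ *   dstRanges = { Range::all(), Range::all(), Range(1, 6), Range(1, 6) }
+ */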
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_PADDING_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_PADDING_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + enum class padding_type { + constant, + reflection101 + }; + + template + class PaddingOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + /* `ranges` is indexed by axis and contains the range in the output where the input is copied to */ + PaddingOp(csl::Stream stream_, padding_type type_, T value_, std::vector ranges) + : stream(std::move(stream_)), dstRanges(std::move(ranges)), type{ type_ }, value{ value_ } + { + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto effective_rank = get_effective_rank(input); + CV_Assert(get_effective_rank(input) == get_effective_rank(output)); + + /* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW) + * + * there could be a case where the batch axis, channel axis, and the first spatial axis are all one + * this would result in effective rank being less than the number of axes requiring padding + */ + effective_rank = std::max(effective_rank, dstRanges.size()); + + for (int i = effective_rank - dstRanges.size(); i < effective_rank; i++) + { + if (dstRanges[i] == Range::all()) + CV_Assert(input.get_axis_size(i) == output.get_axis_size(i)); + else + CV_Assert(input.get_axis_size(i) == dstRanges[i].size()); + } + + if (type == padding_type::constant) + { + csl::kernels::fill(stream, output, value); + + std::vector offsets(effective_rank, 0); + for (int i = 0; i < dstRanges.size(); i++) + { + const auto delta = effective_rank - dstRanges.size(); + if (dstRanges[i] != Range::all()) + offsets[delta + i] = dstRanges[i].start; + } + + csl::kernels::concat_with_offsets(stream, output, input, offsets); + } + else if (type == padding_type::reflection101) + { + std::vector> ranges(effective_rank); + for (int i = 0; i < effective_rank; i++) + { + const auto delta = effective_rank - dstRanges.size(); + if (i < delta || dstRanges[i - delta] == Range::all()) + ranges[i] = { 0, input.get_axis_size(i) }; + else + ranges[i] = { dstRanges[i].start, dstRanges[i].end }; + } + csl::kernels::copy_with_reflection101(stream, output, input, ranges); + } + } + + private: + csl::Stream stream; + padding_type type; + T value; + + std::vector dstRanges; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_PADDING_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/permute.hpp b/modules/dnn/src/cuda4dnn/primitives/permute.hpp new file mode 100644 index 000000000000..fc76feeb3d4f --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/permute.hpp @@ -0,0 +1,67 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
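+
+/* PermuteOp rearranges tensor axes; if the requested order turns out to be the identity
+ * permutation, the kernel launch is skipped and the data is simply copied (or left in
+ * place when the input and output share the same buffer).
+ *
+ * Illustrative example, assuming the usual convention that output axis i is taken from
+ * input axis order[i]: order = {0, 2, 3, 1} converts an NCHW tensor of shape
+ * (1, 3, 4, 5) into an NHWC tensor of shape (1, 4, 5, 3).
+ */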
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_PERMUTE_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_PERMUTE_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class PermuteOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + PermuteOp(csl::Stream stream_, std::vector order_) + : stream(std::move(stream_)), order(std::move(order_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto needsPermute = [&] { + for (int i = 0; i < order.size(); i++) + if (order[i] != i) + return true; + return false; + }(); + + if (needsPermute) + { + csl::tensor_ops::permute(stream, output, input, order); + } + else + { + if (input.get() != output.get()) + csl::tensor_ops::copy(stream, output, input); + } + } + } + + private: + csl::Stream stream; + std::vector order; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_PERMUTE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/pooling.hpp b/modules/dnn/src/cuda4dnn/primitives/pooling.hpp new file mode 100644 index 000000000000..0575b167cfd4 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/pooling.hpp @@ -0,0 +1,258 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
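+
+/* PoolingOp maps max/average pooling onto csl::Pooling (cuDNN). Ceil-mode rounding is
+ * emulated by growing `pads_end` until the output-size division becomes exact, and
+ * asymmetric padding that cuDNN cannot express is handled by padding the input into a
+ * temporary tensor before pooling (see the constructor below).
+ *
+ * Worked example of the ceil-mode adjustment: for an axis of size 8, window 3, stride 2
+ * and no explicit padding, (8 - 3) % 2 == 1, so one extra cell of padding is added at
+ * the end and the output size becomes (8 + 1 - 3) / 2 + 1 = 4, i.e. ceil((8 - 3) / 2) + 1.
+ */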
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_POOLING_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_POOLING_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/cudnn.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + struct PoolingConfiguration { + enum class pooling_mode { + max, + average_included, /* include padding while calculating average */ + average_excluded /* exclude padding while calculating average */ + }; + + pooling_mode poolMode; + + /* the size of the following vectors must be equal to the window size */ + std::vector window_size; + std::vector strides; + + enum class padding_mode { + manual, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ + valid, /* no padding is added */ + same /* TensorFlow logic is used for same padding */ + }; + + padding_mode padMode; + + /* explicit paddings are used if and only if padMode is set to manual */ + std::vector pads_begin, pads_end; + + /* the output shape is calculated using the following formula: + * output_dim = func[(input_dim + padding_left + padding_right - kernel_dim)/stride] + 1 + * + * rounding mode decides what is used as `func` + */ + enum class rounding_mode { + ceil, /* uses ceil */ + floor + }; + + rounding_mode roundMode; + + /* full shape inclusive of channel and batch axis */ + std::vector input_shape; + }; + + template + class PoolingOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + PoolingOp(csl::cudnn::Handle handle, const PoolingConfiguration& config) + : cudnnHandle(std::move(handle)) + { + const auto& window_size = config.window_size; + + const auto pooling_order = window_size.size(); + CV_Assert(pooling_order >= 1); + + const auto& strides = config.strides; + CV_Assert(pooling_order == strides.size()); + + const auto& input_shape = config.input_shape; + CV_Assert(input_shape.size() == pooling_order + 2); + + if (pooling_order > 3) + CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D pooling are supported."); + + const auto rank = input_shape.size(); + + /* left and right are misleading as the padding is applicable for any number of dimensions + * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` + * + * `common_padding` contains the amount of padding that has to be added to both sides + * `padding_left` and `padding_right` contains the amount of padding that needs to be added + * to a particular side in addition to the common padding + */ + std::vector common_padding(rank, 0); + std::vector padding_left(rank, 0), padding_right(rank, 0); + if (config.padMode == PoolingConfiguration::padding_mode::manual) + { + const auto& pads_begin = config.pads_begin; + const auto& pads_end = config.pads_end; + + CV_Assert(pooling_order == pads_begin.size()); + CV_Assert(pooling_order == pads_end.size()); + + /* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing + * otherwise, we add extra padding towards the end so that the convolution arithmetic yeilds + * the correct output size without having to deal with fancy fractional sizes + */ + auto pads_end_modified = pads_end; + if (config.roundMode == PoolingConfiguration::rounding_mode::ceil) + { + for (int i = 0; i < window_size.size(); i++) { + auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - window_size[i]) % strides[i]; + if (rem) + pads_end_modified[i] += strides[i] - rem; + } + } + + for (int i = 2; 
i < common_padding.size(); i++) + { + common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]); + padding_left[i] = pads_begin[i - 2] - common_padding[i]; + padding_right[i] = pads_end_modified[i - 2] - common_padding[i]; + } + } + else if (config.padMode == PoolingConfiguration::padding_mode::valid) + { + /* nothing to do as the paddings are already preset to zero */ + } + else if (config.padMode == PoolingConfiguration::padding_mode::same) + { + /* TensorFlow Logic: + * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] + * + * if total padding is odd, the extra is added towards the end + */ + for (int i = 2; i < rank; i++) + { + const auto j = i - 2; /* filter index */ + const auto output_dim = (input_shape[i] - 1 + strides[j]) / strides[j]; + const auto required_total_padding = + std::max(0, (output_dim - 1) * strides[j] + window_size[j] - input_shape[i]); + + common_padding[i] = required_total_padding / 2; + padding_left[i] = 0; + padding_right[i] = required_total_padding % 2; + } + } + + /* in some scenarios, the extra padding at the end may not change the output at all */ + for (int i = 2; i < rank; i++) { + const auto j = i - 2; /* filter idx */ + const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; + std::int64_t rem = (input_shape[i] + total_padding - window_size[j]) % strides[j]; + + /* the output shape doesn't change if we decrease the total padding by at most `rem` + * provided that we decrease from the right + */ + if (rem && padding_right[i] > 0) + padding_right[i] = std::max(0, padding_right[i] - rem); + } + + auto is_not_zero = [](std::size_t i) { return i != 0; }; + if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) || + std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero)) + { + /* csl::Pooling does not fully support asymmetric padding; hence, we deal with asymmetric padding by + * copying the input to a bigger tensor and padding the ends manually + * + * But we first try to avoid the transformation using cuDNN's flexibility. cuDNN can accept a smaller or + * a bigger output shape. This effectively allows us to have arbitary padding at the right. 
+ */ + if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero)) + { + /* there is padding on the left and we are forced to transform */ + auto transformed_input_shape = input_shape; + for (int i = 0; i < rank; i++) + transformed_input_shape[i] += padding_left[i] + padding_right[i]; + + transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape)); + inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right); + } + } + + csl::Pooling::params_type params; + if (transformedInput.empty()) + { + /* no transform => use original input shape */ + params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + } + else + { + /* the pooling operation will be seeing the transformed input */ + auto transformed_input_shape = transformedInput.shape(); + params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape)); + } + + auto output_shape = input_shape; + for (int i = 2; i < rank; i++) + { + auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; + output_shape[i] = (params.input_shape[i] + total_padding - window_size[i - 2]) / strides[i - 2] + 1; + } + + params.output_shape.assign(std::begin(output_shape), std::end(output_shape)); + params.window_size = window_size; + params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); + params.stride = strides; + + if (config.poolMode == PoolingConfiguration::pooling_mode::max) + { + params.type = csl::Pooling::pooling_type::MAX; + } + else if (config.poolMode == PoolingConfiguration::pooling_mode::average_included) + { + params.type = csl::Pooling::pooling_type::AVERAGE_INCLUDE_PADDING; + } + else if (config.poolMode == PoolingConfiguration::pooling_mode::average_excluded) + { + params.type = csl::Pooling::pooling_type::AVERAGE_EXCLUDE_PADDING; + } + + pooler = csl::Pooling(cudnnHandle, params); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + if (!transformedInput.empty()) + { + inputTransformer.transform(input, transformedInput); + input = csl::TensorView(transformedInput); + } + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + pooler.pool(input, output); + } + + private: + csl::cudnn::Handle cudnnHandle; + csl::Pooling pooler; + + csl::Tensor transformedInput; + csl::TensorTransform inputTransformer; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_POOLING_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/prior_box.hpp b/modules/dnn/src/cuda4dnn/primitives/prior_box.hpp new file mode 100644 index 000000000000..64412f0f9e17 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/prior_box.hpp @@ -0,0 +1,137 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
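+
+/* PriorBoxOp packs its per-feature-point parameters into a single device tensor to
+ * avoid several small uploads. The layout is
+ *
+ *   [ box_widths | box_heights | offsets_x | offsets_y ]
+ *
+ * with the two box arrays of length box_size and the two offset arrays of length
+ * offset_size. forward() reconstructs views into this tensor at offsets 0, box_size,
+ * 2 * box_size and 2 * box_size + offset_size before launching generate_prior_boxes.
+ */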
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" +#include "../csl/tensor.hpp" +#include "../csl/kernels.hpp" + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + struct PriorBoxConfiguration { + std::size_t feature_map_width, feature_map_height; + std::size_t image_width, image_height; + + /* parameters for prior boxes for each feature point */ + std::vector box_widths, box_heights; + std::vector offsets_x, offsets_y; + T stepX, stepY; + + std::vector variance; + + /* number of priors per feature point */ + std::size_t num_priors; + + /* clamps the box coordinates to [0, 1] range */ + bool clip; + + /* normalizes the box coordinates using the image dimensions */ + bool normalize; + }; + + template + class PriorBoxOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + template + PriorBoxOp(csl::Stream stream_, const PriorBoxConfiguration& config) + : stream(std::move(stream_)) + { + feature_map_width = config.feature_map_width; + feature_map_height = config.feature_map_height; + + image_width = config.image_width; + image_height = config.image_height; + + const auto& box_widths = config.box_widths; + const auto& box_heights = config.box_heights; + CV_Assert(box_widths.size() == box_heights.size()); + + box_size = box_widths.size(); + + const auto& offsets_x = config.offsets_x; + const auto& offsets_y = config.offsets_y; + CV_Assert(offsets_x.size() == offsets_y.size()); + + offset_size = offsets_x.size(); + + /* for better memory utilization and preassumably better cache performance, we merge + * the four vectors and put them in a single tensor + */ + auto total = box_widths.size() * 2 + offsets_x.size() * 2; + std::vector merged_params; + merged_params.insert(std::end(merged_params), std::begin(box_widths), std::end(box_widths)); + merged_params.insert(std::end(merged_params), std::begin(box_heights), std::end(box_heights)); + merged_params.insert(std::end(merged_params), std::begin(offsets_x), std::end(offsets_x)); + merged_params.insert(std::end(merged_params), std::begin(offsets_y), std::end(offsets_y)); + CV_Assert(merged_params.size() == total); + + paramsTensor.resize(total); + csl::memcpy(paramsTensor.get(), merged_params.data(), total, stream); /* synchronous copy */ + + const auto& variance_ = config.variance; + variance.assign(std::begin(variance_), std::end(variance_)); + + num_priors = config.num_priors; + stepX = config.stepX; + stepY = config.stepY; + clip = config.clip; + normalize = config.normalize; + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 2); /* we don't need the inputs but we are given */ + CV_Assert(outputs.size() == 1); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + /* we had stored all the parameters in a single tensor; now we create appropriate views + * for each of the parameter arrays from the single tensor + */ + auto boxWidths = csl::view(paramsTensor.get(), box_size); + auto boxHeights = csl::view(paramsTensor.get() + box_size, box_size); + auto offsetsX = csl::view(paramsTensor.get() + 2 * box_size, offset_size); + auto offsetsY = csl::view(paramsTensor.get() + 2 * box_size + offset_size, offset_size); + + csl::kernels::generate_prior_boxes(stream, output, + 
boxWidths, boxHeights, offsetsX, offsetsY, stepX, stepY, + variance, num_priors, feature_map_width, feature_map_height, image_width, image_height, normalize, clip); + } + + private: + csl::Stream stream; + csl::Tensor paramsTensor; /* widths, heights, offsetsX, offsetsY */ + + std::size_t feature_map_width, feature_map_height; + std::size_t image_width, image_height; + + std::size_t box_size, offset_size; + T stepX, stepY; + + std::vector variance; + + std::size_t num_priors; + bool clip, normalize; + }; + + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/reshape.hpp b/modules/dnn/src/cuda4dnn/primitives/reshape.hpp new file mode 100644 index 000000000000..5e7e65548da3 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/reshape.hpp @@ -0,0 +1,52 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_RESHAPE_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_RESHAPE_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class ReshapeOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ReshapeOp(csl::Stream stream_) : stream(std::move(stream_)) { } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (input.get() != output.get()) + { + input.reshape_as(output); + csl::tensor_ops::copy(stream, output, input); + } + } + } + + private: + csl::Stream stream; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_RESHAPE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/resize.hpp b/modules/dnn/src/cuda4dnn/primitives/resize.hpp new file mode 100644 index 000000000000..388b18823156 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/resize.hpp @@ -0,0 +1,59 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
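+
+/* ResizeOp performs spatial upsampling. Nearest-neighbour interpolation is driven purely
+ * by the input and output shapes (the scale factors are not passed to the kernel), while
+ * bilinear interpolation additionally uses scaleHeight and scaleWidth to locate the
+ * sampling positions.
+ *
+ * Illustrative construction for 2x bilinear upsampling (float specialization assumed):
+ *
+ *   ResizeOp<float> node(stream, interpolation_type::bilinear, 2.0f, 2.0f);
+ */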
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_RESIZE_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_RESIZE_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/kernels.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + enum class interpolation_type { + nearest_neighbour, + bilinear + }; + + template + class ResizeOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ResizeOp(csl::Stream stream_, interpolation_type type_, float scaleHeight_, float scaleWidth_) + : stream(std::move(stream_)), type{ type_ }, scaleHeight{ scaleHeight_ }, scaleWidth{ scaleWidth_ } + { + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + if (type == interpolation_type::nearest_neighbour) + csl::kernels::resize_nn(stream, output, input); + else if (type == interpolation_type::bilinear) + csl::kernels::resize_bilinear(stream, output, input, scaleHeight, scaleWidth); + } + + private: + csl::Stream stream; + interpolation_type type; + float scaleHeight, scaleWidth; /* for bilinear interpolation */ + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_RESIZE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp b/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp new file mode 100644 index 000000000000..07e2e353ddfa --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp @@ -0,0 +1,116 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
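+
+/* ScaleShiftOp applies per-channel (more generally, per-parameter-block) scaling and/or
+ * shifting. The weights/bias may come from the layer blobs or, when no blobs are stored,
+ * from a second input. forward() searches for the smallest end_axis such that the product
+ * of input_shape[axis, end_axis) equals the number of parameters; everything after
+ * end_axis forms inner_size, the number of consecutive elements scaled by one parameter.
+ *
+ * Illustrative example: input shape (1, 64, 56, 56), 64 weights, axis = 1 gives
+ * end_axis = 2 and inner_size = 56 * 56 = 3136, i.e. ordinary per-channel scaling.
+ */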
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class ScaleShiftOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + ScaleShiftOp(csl::Stream stream_, std::size_t axis, const cv::Mat& weights, const cv::Mat& bias) + : stream(std::move(stream_)), axis{ axis } + { + if (!weights.empty()) + { + weightsTensor = csl::makeTensorHeader(weights); + csl::copyMatToTensor(weightsTensor, weights, stream); + } + + if (!bias.empty()) + { + biasTensor = csl::makeTensorHeader(bias); + csl::copyMatToTensor(biasTensor, bias, stream); + } + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(outputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::TensorView weights; + if (weightsTensor.empty() && biasTensor.empty()) + { + CV_Assert(inputs.size() == 2); + + /* no explicit scale/shift values provided; use the second input as weights */ + auto wrapper = inputs[1].dynamicCast(); + weights = wrapper->getView(); + } + else if (!weightsTensor.empty()) + { + weights = csl::TensorSpan(weightsTensor); + } + + csl::TensorView bias; + if (!biasTensor.empty()) + bias = csl::TensorSpan(biasTensor); + + const auto numParams = !weights.empty() ? weights.size() : bias.size(); + CV_Assert(numParams != 0); + if (!weightsTensor.empty() && !biasTensor.empty()) + { + CV_CheckEQ(weights.size(), bias.size(), "weights and bias size are not equal"); + } + + auto input_shape = input_wrapper->getShape(); + + /* the weights/bias might require broadcasting to scale/shift */ + const int end_axis = [&] { + for (int endAxis = axis + 1; endAxis <= input_shape.size(); ++endAxis) + { + std::size_t size = 1; + for (int i = axis; i < endAxis; i++) + size *= input_shape[i]; + + if (size == numParams) + return endAxis; + } + CV_Assert(0 /* invalid weights matrix */); + }(); + + std::size_t inner_size = 1; + for (int i = end_axis; i < input_shape.size(); i++) + inner_size *= input_shape[i]; + + if (!weights.empty() && !bias.empty()) + csl::kernels::scaleN_with_biasN(stream, output, input, inner_size, weights, bias); + else if (!weights.empty()) + csl::kernels::scaleN(stream, output, input, inner_size, weights); + else + csl::kernels::biasN(stream, output, input, inner_size, bias); + } + + private: + csl::Stream stream; + csl::Tensor weightsTensor, biasTensor; + std::size_t axis; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/slice.hpp b/modules/dnn/src/cuda4dnn/primitives/slice.hpp new file mode 100644 index 000000000000..d1241d8214da --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/slice.hpp @@ -0,0 +1,58 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
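+
+/* SliceOp produces one or more crops of a single input. offsets[i] holds, per axis, the
+ * position inside the input at which output i starts; the extent of each crop is implied
+ * by the shape of the corresponding output tensor.
+ *
+ * Illustrative example: splitting a (1, 4, 6, 6) input into two equal halves along the
+ * channel axis uses offsets = { {0, 0, 0, 0}, {0, 2, 0, 0} } with both outputs shaped
+ * (1, 2, 6, 6).
+ */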
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_SLICE_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_SLICE_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class SliceOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + /* offsets is indexed by output number and each subvector is indexed by axis number */ + SliceOp(csl::Stream stream_, std::vector> offsets) + : stream(std::move(stream_)), offsets(std::move(offsets)) + { + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1); + + auto input_wrapper = inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + for (int i = 0; i < outputs.size(); ++i) + { + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::kernels::slice(stream, output, input, offsets[i]); + } + } + + private: + csl::Stream stream; + std::vector> offsets; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_SLICE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/softmax.hpp b/modules/dnn/src/cuda4dnn/primitives/softmax.hpp new file mode 100644 index 000000000000..0fd60508c21c --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/softmax.hpp @@ -0,0 +1,53 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/cudnn.hpp" +#include "../csl/tensor_ops.hpp" + +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class SoftmaxOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + SoftmaxOp(csl::cudnn::Handle handle, std::size_t axis_, bool log_) + : cudnnHandle(std::move(handle)), channel_axis{ axis_ }, log{ log_ } + { + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + for (int i = 0; i < inputs.size(); i++) + { + auto input_wrapper = inputs[i].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[i].dynamicCast(); + auto output = output_wrapper->getSpan(); + + csl::tensor_ops::softmax(cudnnHandle, output, input, channel_axis, log); + } + } + + private: + csl::cudnn::Handle cudnnHandle; + std::size_t channel_axis; + bool log; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp new file mode 100644 index 000000000000..0999b195940a --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp @@ -0,0 +1,225 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
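+
+/* TransposeConvolutionOp (deconvolution) is implemented with the backward-data pass of a
+ * regular convolution, so the configuration below is expressed in terms of the
+ * corresponding forward convolution: the filter tensor is laid out as
+ * (input_feature_maps, output_feature_maps / groups, k1, k2, ...), which is the reverse
+ * of the layout used by ConvolutionOp, and the padding arithmetic swaps the roles of the
+ * input and output shapes. Configurations that would require asymmetric padding are
+ * rejected with StsNotImplemented rather than emulated.
+ */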
+ +#ifndef OPENCV_DNN_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP +#define OPENCV_DNN_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/cudnn.hpp" +#include "../csl/stream.hpp" +#include "../csl/tensor.hpp" +#include "../csl/tensor_ops.hpp" +#include "../csl/kernels.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + struct TransposeConvolutionConfiguration { + /* other than `input_shape` and `output_shape`, all the configuration values must be provided + * for the corresponding convolution operation (not transpose convolution) + */ + + /* the size of the following vectors must be equal to the kernel size */ + std::vector kernel_size; + std::vector dilations, strides; + + enum class padding_mode { + manual, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ + valid, /* no padding is added */ + same /* TensorFlow logic is used for same padding */ + }; + + /* explicit paddings are used if and only if padMode is set to manual */ + padding_mode padMode; + std::vector pads_begin, pads_end; + + /* full shape inclusive of channel and batch axis */ + std::vector input_shape; + std::vector output_shape; + + /* group count for grouped convolution */ + std::size_t groups; + }; + + template + class TransposeConvolutionOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + TransposeConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const TransposeConvolutionConfiguration& config, const Mat& filters, const Mat& bias) + : stream(std::move(stream_)), cudnnHandle(std::move(handle)) + { + /* we make use of backward pass of convolution to perform forward pass of transpose convolution + * hence, we must setup configuration for the convolution operation and perform backward pass + */ + const auto& kernel_size = config.kernel_size; + const auto& dilations = config.dilations; + const auto& strides = config.strides; + + const auto convolution_order = kernel_size.size(); + CV_Assert(convolution_order >= 1); + + CV_Assert(convolution_order == dilations.size()); + CV_Assert(convolution_order == strides.size()); + + const auto& input_shape = config.input_shape; + const auto& output_shape = config.output_shape; + CV_Assert(input_shape.size() == output_shape.size()); + CV_Assert(input_shape.size() == convolution_order + 2); + + const auto groups = config.groups; + + if (convolution_order > 3) + CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D transpose convolution is supported."); + + const auto rank = input_shape.size(); + const auto input_feature_maps = input_shape[1]; + const auto output_feature_maps = output_shape[1]; + const auto output_feature_maps_per_group = output_feature_maps / groups; + CV_Assert(output_feature_maps % groups == 0); + + filtersTensor = csl::makeTensorHeader(filters); + csl::copyMatToTensor(filtersTensor, filters, stream); + + if (!bias.empty()) + { + CV_Assert(bias.total() == output_feature_maps); + biasTensor = csl::makeTensorHeader(bias); + csl::copyMatToTensor(biasTensor, bias, stream); + } + + /* left and right are misleading as the padding is applicable for any number of dimensions + * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` + * + * `common_padding` contains the amount of padding that has to be added to both sides + * `padding_left` and `padding_right` contains the amount of padding that needs to be added + * to a particular side in addition to 
the common padding + * + * note that we compute the padding for the convolution operation + */ + std::vector common_padding(rank, 0); + std::vector padding_left(rank, 0), padding_right(rank, 0); + if (config.padMode == TransposeConvolutionConfiguration::padding_mode::manual) + { + const auto& pads_begin = config.pads_begin; + const auto& pads_end = config.pads_end; + + CV_Assert(convolution_order == pads_begin.size()); + CV_Assert(convolution_order == pads_end.size()); + + for (int i = 2; i < common_padding.size(); i++) + { + common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); + padding_left[i] = pads_begin[i - 2] - common_padding[i]; + padding_right[i] = pads_end[i - 2] - common_padding[i]; + } + } + else if (config.padMode == TransposeConvolutionConfiguration::padding_mode::valid) + { + /* nothing to do as the paddings are already preset to zero */ + } + else if (config.padMode == TransposeConvolutionConfiguration::padding_mode::same) + { + /* TensorFlow Logic: + * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] + * + * if total padding is odd, the extra is added towards the end + */ + for (int i = 2; i < rank; i++) + { + const auto j = i - 2; /* filter index */ + const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; + const auto required_total_padding = + std::max(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - output_shape[i]); + + common_padding[i] = required_total_padding / 2; + padding_left[i] = 0; + padding_right[i] = required_total_padding % 2; + } + } + + /* in some scenarios, the extra padding at the end may not change the output at all */ + for (int i = 2; i < rank; i++) { + const auto j = i - 2; /* filter idx */ + const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; + const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; + std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j]; + + /* the output shape doesn't change if we decrease the total padding by at most `rem` + * provided that we decrease from the right + */ + if (rem && padding_right[i] > 0) + padding_right[i] = std::max(0, padding_right[i] - rem); + } + + auto is_not_zero = [](std::size_t i) { return i != 0; }; + if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) || + std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero)) + { + CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported."); + } + + csl::TransposeConvolution::params_type params; + params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + params.output_shape.assign(std::begin(output_shape), std::end(output_shape)); + + auto& fshape = params.filter_shape; + fshape.resize(rank); + fshape[0] = input_feature_maps; + fshape[1] = output_feature_maps_per_group; + std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); + CV_Assert(fshape.size() == kernel_size.size() + 2); + + params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); + params.stride = strides; + params.dialation = dilations; + params.groups = config.groups; + + convoluter = csl::TransposeConvolution(cudnnHandle, params); + scratch_mem_in_bytes = convoluter.get_workspace_size(); + } + + void forward( + std::vector>& inputs, + std::vector>& outputs, + csl::Workspace& workspace) override + { + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + + auto input_wrapper = 
inputs[0].dynamicCast(); + auto input = input_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + convoluter.transpose_convolve(output, input, filtersTensor, workspace); + if (!biasTensor.empty()) + { + std::size_t inner_size = total(output_wrapper->getShape(), 2, -1); + csl::kernels::biasN(stream, output, output, inner_size, biasTensor); + } + } + + std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } + + private: + csl::Stream stream; + csl::cudnn::Handle cudnnHandle; + csl::Tensor filtersTensor, biasTensor; + csl::TransposeConvolution convoluter; + + std::size_t scratch_mem_in_bytes; + }; + +}}} /* namespace cv::dnn::cuda4dnn */ + +#endif /* OPENCV_DNN_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP */ diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index e186dff2d74e..5ad5a79253cc 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -11,7 +11,6 @@ Implementation of Batch Normalization layer. #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include @@ -21,8 +20,8 @@ Implementation of Batch Normalization layer. #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/batch_norm.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -321,39 +320,21 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer csl::Workspace& workspace ) override { - CV_Assert(inputs.size() == 1 && outputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - auto input_shape = input_wrapper->getShape(); - std::size_t inner_size = total(input_shape, 2, -1); - - csl::kernels::scaleN_with_biasN(stream, output, input, inner_size, weightsTensor, biasTensor); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); - - weightsTensor = createTensorHeaderFromMat(weights_); - copyMatToTensor(weightsTensor, weights_, stream); - - biasTensor = createTensorHeaderFromMat(bias_); - copyMatToTensor(biasTensor, bias_, stream); + cudaNode = make_cuda_node(preferableTarget, std::move(stream), weights_, bias_); } - csl::Tensor weightsTensor, biasTensor; - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr tryAttach(const Ptr& node) CV_OVERRIDE diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 730c45ff1598..ddf653d8f1a2 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -40,12 +40,11 @@ // //M*/ #include "../precomp.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/reshape.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -122,33 +121,21 @@ class BlankLayerImpl CV_FINAL : public BlankLayer csl::Workspace& workspace ) override { - CV_UNUSED(workspace); - - for 
(std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - if (input.get() != output.get()) - csl::tensor_ops::copy(stream, output, input); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); + cudaNode = make_cuda_node(preferableTarget, std::move(stream)); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index efb52073ecbb..4d1ce2d5b561 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -52,11 +51,12 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/concat.hpp" using namespace cv::dnn::cuda4dnn; #endif + namespace cv { namespace dnn @@ -242,71 +242,6 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer } #endif -#ifdef HAVE_CUDA - void forwardCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace) override - { - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - auto output_shape = output_wrapper->getShape(); - - auto concat_axis = [&] { - auto actual_dims = output_shape.size(); - auto extra_dims = output.rank - actual_dims; - return clamp(axis, actual_dims) + extra_dims; - }(); - - if (!padding) - { - std::size_t output_axis_offset = 0; - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - csl::kernels::concat(stream, output, output_axis_offset, input, concat_axis); - - output_axis_offset += input.get_axis_size(concat_axis); - } - } - else /* if(padding) */ - { - csl::memset(output.get(), 0, output.size(), stream); - - std::size_t output_concat_axis_offset = 0; - for (size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - auto input_shape = input_wrapper->getShape(); - - std::vector offsets(input_shape.size()); - for (int j = 0; j < offsets.size(); j++) - offsets[j] = (output_shape[j] - input_shape[j]) / 2; - offsets[concat_axis] = output_concat_axis_offset; - - csl::kernels::concat_with_offsets(stream, output, input, offsets); - - output_concat_axis_offset += input.get_axis_size(concat_axis); - } - } - } - - void initCUDA( - csl::Stream stream_, - csl::cublas::Handle cublas_handle, - csl::cudnn::Handle cudnn_handle, - std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs) override - { - stream = std::move(stream_); - } - - csl::Stream stream; -#endif - void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -349,6 +284,39 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer } } } + +#ifdef HAVE_CUDA + void forwardCUDA( + std::vector>& inputs, + std::vector>& 
outputs, + csl::Workspace& workspace + ) override + { + cudaNode->forward(inputs, outputs, workspace); + } + + void initCUDA( + csl::Stream stream, + csl::cublas::Handle cublas_handle, + csl::cudnn::Handle cudnn_handle, + std::size_t& scratch_mem_in_bytes, + const std::vector>& inputs + ) override + { + auto input_wrapper = inputs[0].dynamicCast(); + + auto concat_axis = [&] { + auto actual_dims = input_wrapper->getShape().size(); + auto extra_dims = input_wrapper->getRank() - actual_dims; + return clamp(axis, actual_dims) + extra_dims; + }(); + + cudaNode = make_cuda_node(preferableTarget, std::move(stream), concat_axis, padding); + } + + std::unique_ptr cudaNode; +#endif + virtual Ptr initVkCom(const std::vector > &input) CV_OVERRIDE { #ifdef HAVE_VULKAN diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp index 841fc92c114a..ae4a4806219d 100644 --- a/modules/dnn/src/layers/const_layer.cpp +++ b/modules/dnn/src/layers/const_layer.cpp @@ -7,7 +7,6 @@ #include "../precomp.hpp" #include "../op_inf_engine.hpp" -#include "../op_cuda.hpp" #include "layers_common.hpp" #ifdef HAVE_OPENCL @@ -15,8 +14,8 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/const.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -88,34 +87,28 @@ class ConstLayerImpl CV_FINAL : public ConstLayer #endif // HAVE_INF_ENGINE #ifdef HAVE_CUDA - void forwardCUDA( + void forwardCUDA( std::vector>& inputs, std::vector>& outputs, csl::Workspace& workspace ) override { - auto output_wrapper = outputs[0].dynamicCast(); - csl::tensor_ops::copy(stream, output_wrapper->getSpan(), constTensor); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs + const std::vector>& inputs ) override { - /* host to device copy is more expensive than device to device copy; hence, we keep a copy - * of the blob in device memory and use it as the source for copy - */ - stream = std::move(stream_); - constTensor = createTensorHeaderFromMat(blobs[0]); - copyMatToTensor(constTensor, blobs[0], stream); + CV_Assert(blobs.size() == 1); + cudaNode = make_cuda_node(preferableTarget, std::move(stream), blobs[0]); } - csl::Stream stream; - csl::Tensor constTensor; + std::unique_ptr cudaNode; #endif }; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 8485c5e3e361..2d23ea8f592b 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -57,8 +56,10 @@ using namespace cv::dnn::ocl4dnn; #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" + +#include "../cuda4dnn/primitives/convolution.hpp" +#include "../cuda4dnn/primitives/transpose_convolution.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -1296,154 +1297,86 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl csl::Workspace& workspace ) override { - CV_Assert(!activ); - CV_Assert(inputs.size() == 1 && outputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); 
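/* The const, blank, concat and batch-norm hunks above all apply the same refactor: instead of
 * driving CSL kernels directly, initCUDA builds a self-contained node object through
 * make_cuda_node(...) and forwardCUDA simply delegates to it. The sketch below shows the shape
 * of that contract; every identifier ending in "Sketch", and the float-only dispatch, are
 * assumptions made for illustration rather than the patch's actual definitions.
 */
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

struct BackendWrapperSketch;   // stands in for the backend wrapper type used by the layers
struct WorkspaceSketch;        // stands in for csl::Workspace

struct CUDABackendNodeSketch
{
    virtual ~CUDABackendNodeSketch() {}
    virtual void forward(std::vector<std::shared_ptr<BackendWrapperSketch>>& inputs,
                         std::vector<std::shared_ptr<BackendWrapperSketch>>& outputs,
                         WorkspaceSketch& workspace) = 0;
    virtual std::size_t get_workspace_memory_in_bytes() const { return 0; }
};

// Factory in the spirit of make_cuda_node: selects a concrete node for the preferable target
// and forwards the remaining constructor arguments to it. NodeType<float> is expected to
// derive from CUDABackendNodeSketch.
template <template <class> class NodeType, class ...Args>
std::unique_ptr<CUDABackendNodeSketch> make_cuda_node_sketch(int /*target*/, Args&& ...args)
{
    // a single FP32 target is assumed here, so the node is always instantiated for float
    return std::unique_ptr<CUDABackendNodeSketch>(new NodeType<float>(std::forward<Args>(args)...));
}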
- auto input = input_wrapper->getView(); - - if (!transformedInput.empty()) - { - inputTransformer.transform(input, transformedInput); - input = csl::TensorView(transformedInput); - } - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - convoluter.convolve(output, input, filtersTensor, workspace); - if (hasBias() || fusedBias) - { - std::size_t inner_size = total(output_wrapper->getShape(), 2, -1); - csl::kernels::biasN(stream, output, output, inner_size, biasTensor); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); - cudnnHandle = std::move(cudnn_handle); + CV_Assert(inputs.size() == 1); - auto input_wrapper = inputs[0].dynamicCast(); + auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); - /* 1d, 2d, 3d convolutions are supported */ - CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 5); - - CV_Assert(blobs.size() >= 1); - const auto& filtersMat = blobs[0]; - - const auto rank = input_shape.size(); - const auto output_feature_maps = filtersMat.size[0]; + const auto output_feature_maps = blobs[0].size[0]; const auto input_feature_maps = input_shape[1]; - const auto input_feature_maps_per_group = filtersMat.size[1]; + const auto input_feature_maps_per_group = blobs[0].size[1]; const auto groups = input_feature_maps / input_feature_maps_per_group; - CV_Assert(input_feature_maps % input_feature_maps_per_group == 0); - const Mat& filterWeightsSource = fusedWeights ? weightsMat : filtersMat; - filtersTensor = createTensorHeaderFromMat(filterWeightsSource); - copyMatToTensor(filtersTensor, filterWeightsSource, stream); + auto output_shape = [&] { + std::vector inShape, outShape; + inShape.assign(std::begin(input_shape) + 2, std::end(input_shape)); + if (padMode.empty()) + { + for (int i = 0; i < inShape.size(); i++) + outShape.push_back((inShape[i] + pads_begin[i] + pads_end[i] - dilations[i] * (kernel_size[i] - 1) - 1) / strides[i] + 1); + } + else + { + getConvPoolOutParams(inShape, kernel_size, strides, padMode, dilations, outShape); + } - if (hasBias() || fusedBias) - { - std::vector biasShape(rank, 1); - biasShape[1] = output_feature_maps; - Mat biasMat(rank, biasShape.data(), CV_32F, &biasvec[0]); - biasTensor = createTensorHeaderFromMat(biasMat); - copyMatToTensor(biasTensor, biasMat, stream); - } + auto output_shape = input_shape; + output_shape[1] = output_feature_maps; + std::copy_backward(std::begin(outShape), std::end(outShape), std::end(output_shape)); + return output_shape; + }(); + + ConvolutionConfiguration config; + config.kernel_size.assign(std::begin(kernel_size), std::end(kernel_size)); + config.dilations.assign(std::begin(dilations), std::end(dilations)); + config.strides.assign(std::begin(strides), std::end(strides)); - /* left and right are misleading as the padding is applicable for any number of dimensions - * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` - */ - std::vector common_padding(rank, 0); - std::vector padding_left(rank, 0), padding_right(rank, 0); if (padMode.empty()) { - for (int i = 2; i < common_padding.size(); i++) - { - common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); - padding_left[i] = pads_begin[i - 2] - common_padding[i]; - padding_right[i] = pads_end[i - 2] - 
common_padding[i]; - } + config.padMode = ConvolutionConfiguration::padding_mode::manual; + config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin)); + config.pads_end.assign(std::begin(pads_end), std::end(pads_end)); } - else if (padMode == "VALID") { /* nothing to do as the paddings are already preset to zero */ } - else if (padMode == "SAME") + else if (padMode == "VALID") { - /* TensorFlow Logic: - * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] - * - * if total padding is odd, the input is padded towards the end - */ - std::vector inShape(std::begin(input_shape) + 2, std::end(input_shape)), outShape; - getConvPoolOutParams(inShape, kernel_size, strides, padMode, dilations, outShape); - - for (int i = 2; i < rank; i++) - { - const auto j = i - 2; /* filter index */ - const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; - const auto required_total_padding = - std::max(0, (outShape[j] - 1) * strides[j] + effective_kernel_size - inShape[j]); - - common_padding[i] = required_total_padding / 2; - padding_left[i] = 0; - padding_right[i] = required_total_padding % 2; - } + config.padMode = ConvolutionConfiguration::padding_mode::valid; } - else + else if (padMode == "SAME") { - CV_Error(Error::StsNotImplemented, "Specified padding mode not supported by ConvolutionLayer"); + config.padMode = ConvolutionConfiguration::padding_mode::same; } - - /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by - * copying the input to a bigger tensor and padding the ends manually - */ - for (int i = 0; i < rank; i++) - input_shape[i] += padding_left[i] + padding_right[i]; - - /* if the actual input shape and the new input shape do not match; we need to transform the input */ - transform_required = input_shape != input_wrapper->getShape(); - if (transform_required) + else { - transformedInput.resize(std::begin(input_shape), std::end(input_shape)); - inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right); + CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by ConvolutionLayer"); } - csl::Convolution::params_type params; + config.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + config.output_shape.assign(std::begin(output_shape), std::end(output_shape)); + config.groups = groups; - auto& ishape = params.input_shape; - ishape.assign(std::begin(input_shape), std::end(input_shape)); + Mat filtersMat = fusedWeights ? weightsMat : blobs[0]; + Mat biasMat = (hasBias() || fusedBias) ? 
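/* The output_shape lambda above uses the standard convolution arithmetic whenever explicit
 * paddings are given. The helper below merely restates that formula with one worked example;
 * it is an annotation, not part of the patch.
 */
#include <cstddef>

// out = (in + pad_begin + pad_end - dilation * (kernel - 1) - 1) / stride + 1
std::size_t conv_output_dim(std::size_t in, std::size_t kernel, std::size_t stride,
                            std::size_t pad_begin, std::size_t pad_end, std::size_t dilation)
{
    return (in + pad_begin + pad_end - dilation * (kernel - 1) - 1) / stride + 1;
}

// e.g. a 224x224 input with a 3x3 kernel, stride 2, padding 1 and dilation 1 yields
// (224 + 1 + 1 - 1*(3 - 1) - 1)/2 + 1 = 112 along each spatial axis.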
Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat(); + if (countNonZero(biasMat) == 0) + biasMat = Mat(); - auto& fshape = params.filter_shape; - fshape.resize(ishape.size()); - fshape[0] = output_feature_maps; - fshape[1] = input_feature_maps_per_group; - std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); - CV_Assert(fshape.size() == kernel_size.size() + 2); + cudaNode = make_cuda_node( + preferableTarget, std::move(stream), std::move(cudnn_handle), config, filtersMat, biasMat); - params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); - params.stride = strides; - params.dialation = dilations; - params.groups = groups; - - convoluter = csl::Convolution(cudnnHandle, params); - scratch_mem_in_bytes = convoluter.get_workspace_size(); + scratch_mem_in_bytes = cudaNode->get_workspace_memory_in_bytes(); } - csl::Stream stream; - csl::cudnn::Handle cudnnHandle; - csl::Tensor filtersTensor, biasTensor; - csl::Convolution convoluter; - - bool transform_required; - csl::Tensor transformedInput; - csl::TensorTransform inputTransformer; + std::unique_ptr cudaNode; #endif virtual int64 getFLOPS(const std::vector &inputs, @@ -2071,162 +2004,99 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl csl::Workspace& workspace ) override { - CV_Assert(inputs.size() == 1 && outputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - convoluter.transpose_convolve(output, input, filtersTensor, workspace); - if (hasBias() || fusedBias) - { - std::size_t inner_size = total(output_wrapper->getShape(), 2, -1); - csl::kernels::biasN(stream, output, output, inner_size, biasTensor); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); - cudnnHandle = std::move(cudnn_handle); + CV_Assert(inputs.size() == 1); - auto input_wrapper = inputs[0].dynamicCast(); + auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); - /* 1d, 2d, 3d deconvolutions are supported */ - CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 5); - - CV_Assert(blobs.size() >= 1); - const auto& filtersMat = blobs[0]; - - const auto rank = input_shape.size(); - const auto input_feature_maps = input_shape[1]; const auto output_feature_maps = numOutput; - const auto output_feature_maps_per_group = filtersMat.size[1]; + const auto output_feature_maps_per_group = blobs[0].size[1]; const auto groups = output_feature_maps / output_feature_maps_per_group; - CV_Assert(output_feature_maps % output_feature_maps_per_group == 0); - auto output_shape = input_shape; - output_shape[1] = output_feature_maps; + auto output_shape = [&] { + auto output_shape = input_shape; + output_shape[1] = output_feature_maps; + if (padMode.empty()) + { + for (int i = 0; i < kernel_size.size(); i++) + output_shape[i + 2] = + (strides[i] * (input_shape[2 + i] - 1) + kernel_size[i] - pads_begin[i] - pads_end[i] + adjust_pads[i]); + } + else if (padMode == "VALID") + { + for (int i = 0; i < kernel_size.size(); i++) + output_shape[i + 2] = (strides[i] * (input_shape[2 + i] - 1) + kernel_size[i] + adjust_pads[i]); + } + else if (padMode == "SAME") + { + for (int i = 0; i < 
kernel_size.size(); i++) + output_shape[i + 2] = (strides[i] * (input_shape[2 + i] - 1) + 1 + adjust_pads[i]); + } + else + { + CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by DeconvolutionLayer"); + } + return output_shape; + }(); + + TransposeConvolutionConfiguration config; + config.kernel_size.assign(std::begin(kernel_size), std::end(kernel_size)); + config.dilations.assign(std::begin(dilations), std::end(dilations)); + config.strides.assign(std::begin(strides), std::end(strides)); + if (padMode.empty()) { - for (int i = 0; i < kernel_size.size(); i++) - output_shape[i + 2] = - (strides[i] * (input_shape[2 + i] - 1) + kernel_size[i] - pads_begin[i] - pads_end[i] + adjust_pads[i]); + config.padMode = TransposeConvolutionConfiguration::padding_mode::manual; + config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin)); + config.pads_end.assign(std::begin(pads_end), std::end(pads_end)); } else if (padMode == "VALID") { - for (int i = 0; i < kernel_size.size(); i++) - output_shape[i + 2] = - (strides[i] * (input_shape[2 + i] - 1) + kernel_size[i] + adjust_pads[i]); + config.padMode = TransposeConvolutionConfiguration::padding_mode::valid; } else if (padMode == "SAME") { - for (int i = 0; i < kernel_size.size(); i++) - output_shape[i + 2] = (strides[i] * (input_shape[2 + i] - 1) + 1 + adjust_pads[i]); + config.padMode = TransposeConvolutionConfiguration::padding_mode::same; } else - CV_Error(Error::StsNotImplemented, "[0] Specified padding mode not supported by DeconvolutionLayer"); - - Mat filterWeightsSource = filtersMat; - if (fusedWeights) { - filterWeightsSource = weightsMat.clone(); - transpose(weightsMat, filterWeightsSource); + CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by DeconvolutionLayer"); } - filtersTensor = createTensorHeaderFromMat(filterWeightsSource); - copyMatToTensor(filtersTensor, filterWeightsSource, stream); + config.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + config.output_shape.assign(std::begin(output_shape), std::end(output_shape)); + config.groups = groups; - if (hasBias() || fusedBias) - { - biasTensor = createTensorHeaderFromMat(biasesMat); - copyMatToTensor(biasTensor, biasesMat, stream); - - std::vector biasShape(rank, 1); - biasShape[1] = output_feature_maps; - biasTensor.reshape(std::begin(biasShape), std::end(biasShape)); - } - - /* left and right are misleading as the padding is applicable for any number of dimensions - * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` - */ - std::vector common_padding(rank, 0); - std::vector padding_left(rank, 0), padding_right(rank, 0); - if (padMode.empty()) - { - for (int i = 2; i < common_padding.size(); i++) - { - common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); - padding_left[i] = pads_begin[i - 2] - common_padding[i]; - padding_right[i] = pads_end[i - 2] - common_padding[i]; - } - } - else if (padMode == "VALID") { /* nothing to do as the paddings are already preset to zero */ } - else if (padMode == "SAME") - { - /* TensorFlow Logic (for convolution): - * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] - * - * if total padding is odd, the input is padded towards the end - */ - for (int i = 2; i < rank; i++) - { - const auto j = i - 2; /* filter index */ - const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; - const auto required_total_padding = - std::max(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - 
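/* The deconvolution output_shape lambda above derives the output extents from the configured
 * padding mode. The helper below restates the explicit-padding case with one worked example,
 * again purely as an annotation.
 */
#include <cstddef>

// out = stride * (in - 1) + kernel - pad_begin - pad_end + adjust
std::size_t deconv_output_dim(std::size_t in, std::size_t kernel, std::size_t stride,
                              std::size_t pad_begin, std::size_t pad_end, std::size_t adjust)
{
    return stride * (in - 1) + kernel - pad_begin - pad_end + adjust;
}

// e.g. upsampling a 56x56 map with a 4x4 kernel, stride 2, padding 1 on each side and no
// output adjustment gives 2*(56 - 1) + 4 - 1 - 1 + 0 = 112 along each spatial axis.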
output_shape[i]); - - common_padding[i] = required_total_padding / 2; - padding_left[i] = 0; - padding_right[i] = required_total_padding % 2; - } - } - else + CV_Assert(blobs.size() >= 1); + Mat filtersMat = blobs[0]; + if (fusedWeights) { - CV_Error(Error::StsNotImplemented, "[1] Specified padding mode not supported by DeconvolutionLayer"); + filtersMat = weightsMat.clone(); + transpose(weightsMat, filtersMat); } - for (int i = 0; i < rank; i++) - if (padding_left[i] != 0 || padding_right[i] != 0) - CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported."); - - csl::TransposeConvolution::params_type params; - - auto& ishape = params.input_shape; - ishape.assign(std::begin(input_shape), std::end(input_shape)); - - auto& oshape = params.output_shape; - oshape.assign(std::begin(output_shape), std::end(output_shape)); - - auto& fshape = params.filter_shape; - fshape.resize(ishape.size()); - fshape[0] = input_feature_maps; - fshape[1] = output_feature_maps_per_group; - std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); - CV_Assert(fshape.size() == kernel_size.size() + 2); + Mat biasMat = (hasBias() || fusedBias) ? biasesMat : Mat(); + if (countNonZero(biasMat) == 0) + biasMat = Mat(); - params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); - params.stride = strides; - params.dialation = dilations; - params.groups = groups; + cudaNode = make_cuda_node( + preferableTarget, std::move(stream), std::move(cudnn_handle), config, filtersMat, biasMat); - convoluter = csl::TransposeConvolution(cudnnHandle, params); - scratch_mem_in_bytes = convoluter.get_workspace_size(); + scratch_mem_in_bytes = cudaNode->get_workspace_memory_in_bytes(); } - csl::Stream stream; - csl::cudnn::Handle cudnnHandle; - csl::Tensor filtersTensor, biasTensor; - csl::TransposeConvolution convoluter; + std::unique_ptr cudaNode; #endif virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index cad95cfa5f1d..b7cbdd8ddbd2 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -54,8 +53,8 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/activation.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -228,7 +227,6 @@ class ElementWiseLayer : public Func::Layer func.apply(src, dst, len, planeSize, cn0, cn1); } - #ifdef HAVE_CUDA void forwardCUDA( std::vector>& inputs, @@ -236,21 +234,21 @@ class ElementWiseLayer : public Func::Layer csl::Workspace& workspace ) override { - func.applyCUDA(inputs, outputs, workspace, stream); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); + cudaNode = func.initCUDA(preferableTarget, stream); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual int64 getFLOPS(const std::vector &inputs, @@ -332,23 +330,9 @@ struct ReLUFunctor } #ifdef HAVE_CUDA - void applyCUDA( - 
std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::relu(stream, output, input, slope); - } + return make_cuda_node(target, stream, slope); } #endif @@ -425,8 +409,6 @@ struct ReLUFunctor } #endif // HAVE_VULKAN - - bool tryFuse(Ptr&) { return false; } void getScaleShift(Mat&, Mat&) const {} @@ -518,23 +500,9 @@ struct ReLU6Functor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::clipped_relu(stream, output, input, minValue, maxValue); - } + return make_cuda_node(target, stream, minValue, maxValue); } #endif @@ -621,23 +589,9 @@ struct TanHFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::tanh(stream, output, input); - } + return make_cuda_node(target, stream); } #endif @@ -724,23 +678,9 @@ struct SigmoidFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::sigmoid(stream, output, input); - } + return make_cuda_node(target, stream); } #endif @@ -829,23 +769,9 @@ struct ELUFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::elu(stream, output, input); - } + return make_cuda_node(target, stream); } #endif @@ -935,23 +861,9 @@ struct AbsValFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - 
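/* Each activation functor above now only hands its parameters to a ready-made activation
 * node instead of looping over the wrappers itself. The per-element contract is unchanged;
 * the two parameterised cases are sketched below as plain scalar references, purely to
 * document that contract.
 */
// leaky ReLU as used by ReLUFunctor: identity for non-negative inputs, slope-scaled otherwise
inline float relu_ref(float x, float slope) { return x >= 0.f ? x : slope * x; }

// clipped ReLU as used by ReLU6Functor: clamp the input into [minValue, maxValue]
inline float clipped_relu_ref(float x, float minValue, float maxValue)
{
    return x < minValue ? minValue : (x > maxValue ? maxValue : x);
}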
auto output = output_wrapper->getSpan(); - - csl::tensor_ops::abs(stream, output, input); - } + return make_cuda_node(target, stream); } #endif @@ -1038,23 +950,9 @@ struct BNLLFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::bnll(stream, output, input); - } + return make_cuda_node(target, stream); } #endif @@ -1169,23 +1067,9 @@ struct PowerFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) + std::unique_ptr initCUDA(int target, csl::Stream stream) { - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::power(stream, output, input, power, scale, shift); -} + return make_cuda_node(target, stream, power, scale, shift); } #endif @@ -1345,34 +1229,10 @@ struct ChannelsPReLUFunctor #endif #ifdef HAVE_CUDA - void applyCUDA( - std::vector>& inputs, - std::vector>& outputs, - csl::Workspace& workspace, - const csl::Stream& stream - ) - { - if(!slopeTensor) - { - slopeTensor = std::make_shared>(); - *slopeTensor = createTensorHeaderFromMat(scale); - copyMatToTensor(*slopeTensor, scale, stream); - } - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::tensor_ops::channelwise_relu(stream, output, input, *slopeTensor); - } + std::unique_ptr initCUDA(int target, csl::Stream stream) + { + return make_cuda_node(target, stream, scale); } - - /* we have a shared_ptr here because csl::Tensor is non-copyable and these functors need to be copyable */ - std::shared_ptr> slopeTensor; #endif #ifdef HAVE_HALIDE diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 640908633ed4..57b9104ea908 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" @@ -51,9 +50,8 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/eltwise.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -390,80 +388,30 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer csl::Workspace& workspace ) override { - CV_Assert(outputs.size() == 1); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - /* optimized path for common case */ - if (inputs.size() == 2) - { - auto input_wrapper_x = inputs[0].dynamicCast(); - auto input_x = input_wrapper_x->getView(); - - auto input_wrapper_y = inputs[1].dynamicCast(); - auto input_y = input_wrapper_y->getView(); - - switch(op) - { - 
case MAX: csl::kernels::eltwise_max_2(stream, output, input_x, input_y); break; - case PROD: csl::kernels::eltwise_prod_2(stream, output, input_x, input_y); break; - case SUM: - if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1)) - csl::kernels::eltwise_sum_2(stream, output, input_x, input_y); - else - csl::kernels::eltwise_sum_coeff_2(stream, output, coeffs[0], input_x, coeffs[1], input_y); - break; - } - } - else - { - auto input_wrapper_0 = inputs[0].dynamicCast(); - auto input_0 = input_wrapper_0->getView(); - - /* we first make a copy and then apply EltwiseOp cumulatively */ - csl::tensor_ops::copy(stream, output, input_0); - - for (std::size_t i = 1; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - switch (op) - { - case MAX: csl::kernels::eltwise_max_2(stream, output, output, input); break; - case PROD: csl::kernels::eltwise_prod_2(stream, output, output, input); break; - case SUM: - if (coeffs.empty() || coeffs[i] == 1) - csl::kernels::eltwise_sum_2(stream, output, output, input); - else - { - /* if this is the first op, we must scale output too */ - auto coeff_x = (i == 1) ? coeffs[0] : 1.0; - csl::kernels::eltwise_sum_coeff_2(stream, output, coeff_x, output, coeffs[i], input); - } - break; - } - } - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - CV_Assert(inputs.size() >= 2); - CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size()); - CV_Assert(coeffs.size() == 0 || op == SUM); + eltwise_op op_ = [this] { + switch (op) { + case MAX: return eltwise_op::max; + case SUM: return eltwise_op::sum; + case PROD: return eltwise_op::product; + } + return eltwise_op::sum; + }(); - stream = std::move(stream_); + cudaNode = make_cuda_node(preferableTarget, std::move(stream), op_, coeffs); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr initHalide(const std::vector > &input) CV_OVERRIDE diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 16f942880f14..e46e31530dae 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -42,15 +42,14 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include #include #include #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/reshape.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -174,38 +173,24 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace) override + csl::Workspace& workspace + ) override { - CV_UNUSED(workspace); - CV_Assert(outputs.size() == 1); - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - if (input.get() != output.get()) - { - input.reshape_as(output); - csl::tensor_ops::copy(stream, output, input); - } - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle 
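/* The eltwise hunk above folds the layer's MAX/SUM/PROD switch into a single eltwise_op value
 * and passes the (possibly empty) coefficient list to the node. The loop below restates what
 * the node has to compute for the coefficient-weighted SUM case, the only mode for which
 * coefficients were accepted by the removed asserts; it is a reference, not the kernel.
 */
#include <cstddef>
#include <vector>

// output = sum_i coeffs[i] * inputs[i]; an empty coeffs vector is treated as all ones
void eltwise_sum_ref(std::vector<float>& output,
                     const std::vector<std::vector<float>>& inputs,
                     const std::vector<float>& coeffs)
{
    output.assign(inputs[0].size(), 0.f);
    for (std::size_t i = 0; i < inputs.size(); i++)
    {
        const float c = coeffs.empty() ? 1.f : coeffs[i];
        for (std::size_t j = 0; j < output.size(); j++)
            output[j] += c * inputs[i][j];
    }
}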
cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs) override + const std::vector>& inputs + ) override { - stream = std::move(stream_); + cudaNode = make_cuda_node(preferableTarget, std::move(stream)); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index a162777c19ca..f45b0585be07 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include @@ -53,8 +52,8 @@ using namespace cv::dnn::ocl4dnn; #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/inner_product.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -430,76 +429,30 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer csl::Workspace& workspace ) override { - CV_UNUSED(workspace); - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - auto actual_dims = input_wrapper->getShape().size(); - CV_Assert(get_effective_rank(input) <= actual_dims); - - auto extra_dims = input.rank - actual_dims; - auto flatten_start_axis = clamp(axis, actual_dims) + extra_dims; - - std::size_t batch_size = 1; - for (int j = 0; j < flatten_start_axis; j++) - batch_size *= input.get_axis_size(j); - - auto input_size = input.size() / batch_size; - CV_Assert(input_size == weightsTensor.get_axis_size(-1)); - - auto output_size = output.size() / batch_size; - CV_Assert(output_size == weightsTensor.get_axis_size(-2)); - - /* we treat the input and output as a matrix with dimensions (batch_size, input_size) - * and (batch_size, output_size) respectively - * - * weight matrix dimensions: (output_size, input_size) - * - * I(W^T) = O - * (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size) - */ - input.reshape(batch_size, input_size); - output.reshape(batch_size, output_size); - csl::tensor_ops::gemm(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor); - - if (bias) - csl::kernels::biasN(stream, output, output, 1, biasTensor); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); - cublasHandle = std::move(cublas_handle); + auto input_wrapper = inputs[0].dynamicCast(); - weightsTensor = createTensorHeaderFromMat(weightsMat); - CV_Assert(get_effective_rank(weightsTensor) == 2); - copyMatToTensor(weightsTensor, weightsMat, stream); + auto flatten_start_axis = [&] { + auto actual_dims = input_wrapper->getShape().size(); + auto extra_dims = input_wrapper->getRank() - actual_dims; + return clamp(axis, actual_dims) + extra_dims; + }(); - if (bias) - { - biasTensor = createTensorHeaderFromMat(biasMat); - copyMatToTensor(biasTensor, biasMat, stream); - biasTensor.reshape(-1, 1); - CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.get_axis_size(-2)); - } + auto biasMat_ = bias ? 
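/* The fully connected hunk above flattens every axis from flatten_start_axis onwards into a
 * single input_size axis and lets the node run the GEMM. Restating the comment the old code
 * carried: the input acts as a (batch_size x input_size) matrix, the weight matrix is
 * (output_size x input_size), and output = input * W^T plus an optional bias. The naive loop
 * below documents that contract only.
 */
#include <cstddef>
#include <vector>

void inner_product_ref(std::vector<float>& output, const std::vector<float>& input,
                       const std::vector<float>& weights, const std::vector<float>& bias,
                       std::size_t batch_size, std::size_t input_size, std::size_t output_size)
{
    output.assign(batch_size * output_size, 0.f);
    for (std::size_t b = 0; b < batch_size; b++)
        for (std::size_t o = 0; o < output_size; o++)
        {
            float acc = bias.empty() ? 0.f : bias[o];
            for (std::size_t i = 0; i < input_size; i++)
                acc += input[b * input_size + i] * weights[o * input_size + i];
            output[b * output_size + o] = acc;
        }
}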
biasMat : Mat(); + cudaNode = make_cuda_node(preferableTarget, std::move(stream), std::move(cublas_handle), flatten_start_axis, weightsMat, biasMat_); } - csl::Stream stream; - csl::cublas::Handle cublasHandle; - csl::Tensor weightsTensor, biasTensor; + std::unique_ptr cudaNode; #endif virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index 4e2e99489d52..91f3449ad071 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -57,8 +56,8 @@ using namespace cv::dnn::ocl4dnn; #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/lrn.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -321,20 +320,10 @@ class LRNLayerImpl CV_FINAL : public LRNLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace) override + csl::Workspace& workspace + ) override { - CV_UNUSED(workspace); - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - lrn.normalize(input, output); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( @@ -342,19 +331,18 @@ class LRNLayerImpl CV_FINAL : public LRNLayer csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs) override + const std::vector>& inputs + ) override { - cudnnHandle = std::move(cudnn_handle); - if (type != CHANNEL_NRM) CV_Error(CV_StsNotImplemented, "Only LRN across channels is supported by the CUDA backend"); float alphaSize = normBySize ? 
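/* LRN across channels, the only mode the CUDA path accepts above, divides each activation by
 * a power of a windowed sum of squares over neighbouring channels. The scalar sketch below
 * follows the usual cuDNN-style definition; the symmetric window shown assumes an odd window
 * size, and alphaSize is the value computed in the surrounding initCUDA (it already folds in
 * the normBySize distinction).
 */
#include <cmath>

// b[c] = a[c] / (bias + (alphaSize / size) * sum over the window around c of a[k]^2) ^ beta
float lrn_across_channels_ref(const float* a, int channels, int c,
                              int size, float alphaSize, float beta, float bias)
{
    const int half = size / 2;
    float sum_sq = 0.f;
    for (int k = c - half; k <= c + half; k++)
        if (k >= 0 && k < channels)
            sum_sq += a[k] * a[k];
    return a[c] / std::pow(bias + (alphaSize / size) * sum_sq, beta);
}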
alpha : alpha * size; - lrn = csl::LRN(cudnnHandle, size, alphaSize, beta, bias, csl::LRN::lrn_type::ACROSS_CHANNELS); + cudaNode = make_cuda_node(preferableTarget, + std::move(cudnn_handle), lrn_type::across_channels, size, alphaSize, beta, bias); } - csl::cudnn::Handle cudnnHandle; - csl::LRN lrn; + std::unique_ptr cudaNode; #endif virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index e9857ab7fa2c..20ba765d8823 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -42,15 +42,16 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/normalize_bbox.hpp" using namespace cv::dnn::cuda4dnn; #endif +#include + namespace cv { namespace dnn { class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer @@ -269,40 +270,10 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace) override + csl::Workspace& workspace + ) override { - CV_Assert(inputs.size() == 1 && outputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - auto input_shape = input_wrapper->getShape(); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - auto start_axis = clamp(startAxis, input_shape.size()); - auto end_axis = clamp(endAxis, input_shape.size()); - - auto outer_size = total(input_shape, 0, start_axis); - auto mid_size = total(input_shape, start_axis, end_axis + 1); - auto inner_size = total(input_shape, end_axis + 1, -1); - - auto scratch_ptr = reinterpret_cast(csl::WorkspaceAccessor::get(workspace).get()); - auto scratch = csl::span(csl::DevicePtr(scratch_ptr), workspace.size()); - csl::kernels::normalize(stream, output, input, outer_size, mid_size, inner_size, pnorm, epsilon, scratch); - - if (!blobs.empty()) { - Mat weightsMat = blobs[0]; - if (weightsMat.total() == 1) - { - csl::kernels::scale1(stream, output, input, weightsMat.at(0, 0)); - } - else - { - CV_Assert(weightsTensor.size() == mid_size); - csl::kernels::scaleN(stream, output, input, inner_size, weightsTensor); - } - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( @@ -313,28 +284,26 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer const std::vector>& inputs ) override { - stream = std::move(stream_); + if(pnorm != 1 && pnorm != 2) + CV_Error(Error::StsNotImplemented, "Unsupported normalization mode"); - if (!blobs.empty() && blobs[0].total() != 1) - { - const auto& weightsMat = blobs[0]; - weightsTensor = createTensorHeaderFromMat(weightsMat); - copyMatToTensor(weightsTensor, weightsMat, stream); - } - - auto input_wrapper = inputs[0].dynamicCast(); + auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); - auto start_axis = clamp(startAxis, input_shape.size()); - auto end_axis = clamp(endAxis, input_shape.size()); + NormalizeConfiguration config; + config.input_shape.assign(std::begin(input_shape), std::end(input_shape)); + config.axis_start = clamp(startAxis, input_shape.size()); + config.axis_end = clamp(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp 
follows [start, end) convention */ + config.norm = pnorm; + config.eps = epsilon; + + const auto& weightsMat = blobs.empty() ? Mat() : blobs[0]; + cudaNode = make_cuda_node(preferableTarget, std::move(stream_), weightsMat, config); - auto outer_size = total(input_shape, 0, start_axis); - auto inner_size = total(input_shape, end_axis + 1, -1); - scratch_mem_in_bytes = outer_size * inner_size * sizeof(float); + scratch_mem_in_bytes = cudaNode->get_workspace_memory_in_bytes(); } - csl::Tensor weightsTensor; - csl::Stream stream; + std::unique_ptr cudaNode; #endif #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp index ad714a9c0450..be9e8eb1b577 100644 --- a/modules/dnn/src/layers/padding_layer.cpp +++ b/modules/dnn/src/layers/padding_layer.cpp @@ -11,14 +11,13 @@ Implementation of padding layer, which adds paddings to input blob. #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/padding.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -176,61 +175,7 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer csl::Workspace& workspace ) override { - CV_Assert(inputs.size() == 1 && outputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - auto effective_rank = get_effective_rank(input); - CV_Assert(get_effective_rank(input) == get_effective_rank(output)); - - /* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW) - * - * there could be a case where the batch axis, channel axis, and the first spatial axis are all one - * this would result in effective rank being less than the number of axes requiring padding - */ - effective_rank = std::max(effective_rank, dstRanges.size()); - - for (int i = effective_rank - dstRanges.size(); i < effective_rank; i++) - { - if (dstRanges[i] == Range::all()) - CV_Assert(input.get_axis_size(i) == output.get_axis_size(i)); - else - CV_Assert(input.get_axis_size(i) == dstRanges[i].size()); - } - - if (paddingType == "constant") - { - csl::kernels::fill(stream, output, paddingValue); - - std::vector offsets(effective_rank, 0); - for (int i = 0; i < dstRanges.size(); i++) - { - const auto delta = effective_rank - dstRanges.size(); - if (dstRanges[i] != Range::all()) - offsets[delta + i] = dstRanges[i].start; - } - - csl::kernels::concat_with_offsets(stream, output, input, offsets); - } - else if (paddingType == "reflect") - { - std::vector> ranges(effective_rank); - for (int i = 0; i < effective_rank; i++) - { - const auto delta = effective_rank - dstRanges.size(); - if (i < delta || dstRanges[i - delta] == Range::all()) - ranges[i] = { 0, input.get_axis_size(i) }; - else - ranges[i] = { dstRanges[i].start, dstRanges[i].end }; - } - csl::kernels::copy_with_reflection101(stream, output, input, ranges); - } - else - CV_Error(Error::StsNotImplemented, "Requested padding mode is not supported by padding layer."); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( @@ -238,13 +183,21 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& 
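/* The normalize hunk above restricts the CUDA path to L1/L2 norms and expresses the reduction
 * range as the half-open interval [axis_start, axis_end). A reference for the L2 case on one
 * normalization slice follows (the optional per-channel weights are omitted); how epsilon
 * enters the denominator is an assumption here, shown as a simple lower bound.
 */
#include <cmath>
#include <cstddef>

// y[i] = x[i] / max(||x||_2, epsilon) over a slice of n consecutive elements
void l2_normalize_slice_ref(float* y, const float* x, std::size_t n, float epsilon)
{
    float sum_sq = 0.f;
    for (std::size_t i = 0; i < n; i++)
        sum_sq += x[i] * x[i];
    const float norm = std::sqrt(sum_sq);
    const float denom = norm > epsilon ? norm : epsilon;
    for (std::size_t i = 0; i < n; i++)
        y[i] = x[i] / denom;
}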
inputs + const std::vector>& inputs ) override { - stream = std::move(stream_); + padding_type ptype; + if (paddingType == "constant") + ptype = padding_type::constant; + else if (paddingType == "reflect") + ptype = padding_type::reflection101; + else + CV_Error(Error::StsNotImplemented, "Unsupported padding mode"); + + cudaNode = make_cuda_node(preferableTarget, std::move(stream_), ptype, paddingValue, dstRanges); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr initHalide(const std::vector > &inputs) CV_OVERRIDE diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 0f5985ae2328..9caabcfe66e4 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" #include @@ -53,8 +52,8 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/permute.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -384,43 +383,25 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace) override + csl::Workspace& workspace + ) override { - CV_UNUSED(workspace); - CV_Assert(outputs.size() == 1); - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - if (!_needsPermute) - { - if (input.get() != output.get()) - csl::tensor_ops::copy(stream, output, input); - } - else - { - std::vector order(std::begin(_order), std::end(_order)); - csl::tensor_ops::permute(stream, output, input, order); - } - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs) override + const std::vector>& inputs + ) override { - stream = std::move(stream_); + std::vector order(std::begin(_order), std::end(_order)); + cudaNode = make_cuda_node(preferableTarget, std::move(stream), std::move(order)); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr initVkCom(const std::vector > &input) CV_OVERRIDE diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 3d6431f6f120..05513da16011 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -43,7 +43,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "opencv2/core/hal/intrin.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -59,8 +58,8 @@ using namespace cv::dnn::ocl4dnn; #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/pooling.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -300,22 +299,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer csl::Workspace& workspace ) override { - if (computeMaxIdx) - CV_Error(Error::StsNotImplemented, "Pooling layer does not support caching max indices"); - - auto input_wrapper = inputs[0].dynamicCast(); - 
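/* The padding hunk above reduces the layer to choosing between constant fill and
 * "reflection101" padding, the latter matching OpenCV's BORDER_REFLECT_101 convention where
 * the border element itself is not duplicated. The 1-D index map below illustrates that
 * convention for a single axis; it describes the semantics only and assumes the padding is
 * smaller than the input extent.
 */
// maps an output coordinate to the input coordinate it mirrors, for an axis padded by
// pad_begin elements at the start
int reflect101_index(int out_idx, int pad_begin, int in_size)
{
    int i = out_idx - pad_begin;                 // position relative to the original data
    if (i < 0) i = -i;                           // reflect about the first element
    if (i >= in_size) i = 2 * in_size - 2 - i;   // reflect about the last element
    return i;
}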
auto input = input_wrapper->getView(); - - if (!transformedInput.empty()) - { - inputTransformer.transform(input, transformedInput); - input = csl::TensorView(transformedInput); - } - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - pooler.pool(input, output); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( @@ -326,142 +310,63 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer const std::vector>& inputs ) override { - cudnnHandle = std::move(cudnn_handle); + if (computeMaxIdx) + CV_Error(Error::StsNotImplemented, "Pooling layer does not support caching max indices"); - auto input_wrapper = inputs[0].dynamicCast(); + auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); - /* 1d, 2d, 3d pooling are supported */ - CV_Assert(input_shape.size() >= 3 || input_shape.size() <= 5); - - const auto rank = input_shape.size(); - - /* left and right are misleading as the padding is applicable for any number of dimensions - * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` - */ - std::vector common_padding(rank, 0); - std::vector padding_left(rank, 0), padding_right(rank, 0); - if (padMode.empty()) + PoolingConfiguration config; + if (type == MAX) { - /* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing - * otherwise, we add extra padding towards the end so that the convolution arithmetic yeilds - * the correct output size without having to deal with fancy fractional sizes - */ - auto pads_end_modified = pads_end; - if (ceilMode) - { - for (int i = 0; i < kernel_size.size(); i++) { - auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - kernel_size[i]) % strides[i]; - if(rem) - pads_end_modified[i] += strides[i] - rem; - } - } - - for (int i = 2; i < common_padding.size(); i++) - { - common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]); - padding_left[i] = pads_begin[i - 2] - common_padding[i]; - padding_right[i] = pads_end_modified[i - 2] - common_padding[i]; - } + config.poolMode = PoolingConfiguration::pooling_mode::max; } - else if(padMode == "VALID") { /* nothing to do as the paddings are already preset to zero */ } - else if (padMode == "SAME") + else if (type == AVE && !avePoolPaddedArea) { - /* TensorFlow Logic: - * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] - * - * if total padding is odd, the input is padded towards the end - */ - std::vector inShape(std::begin(input_shape) + 2, std::end(input_shape)), outShape; - getConvPoolOutParams(inShape, kernel_size, strides, padMode, std::vector(kernel_size.size(), 1), outShape); - - for (int i = 2; i < rank; i++) - { - const auto j = i - 2; /* window idx */ - const auto required_total_padding = - std::max(0, (outShape[j] - 1) * strides[j] + kernel_size[j] - inShape[j]); - - common_padding[i] = required_total_padding / 2; - padding_left[i] = 0; - padding_right[i] = required_total_padding % 2; - } + config.poolMode = PoolingConfiguration::pooling_mode::average_excluded; } - else + else if (type == AVE && avePoolPaddedArea) { - CV_Error(Error::StsNotImplemented, "Specified padding mode not supported by PoolingLayer"); + config.poolMode = PoolingConfiguration::pooling_mode::average_included; } - - /* in some scenarios, the extra padding may not change the output at all */ - for (int i = 2; i < rank; i++) { - auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; - auto rem = (input_shape[i] + total_padding - kernel_size[i 
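/* Pooling output extents follow the same arithmetic as convolution but with a configurable
 * rounding mode: ceil rounding keeps the last, partially covered window that floor rounding
 * would drop, which is what the removed extra-padding logic above emulated for cuDNN. The
 * helper below restates that arithmetic with one worked example; annotation only.
 */
#include <cstddef>

std::size_t pool_output_dim(std::size_t in, std::size_t kernel, std::size_t stride,
                            std::size_t pad_begin, std::size_t pad_end, bool ceil_mode)
{
    const std::size_t numer = in + pad_begin + pad_end - kernel;
    return (ceil_mode ? (numer + stride - 1) / stride : numer / stride) + 1;
}

// e.g. a 112-wide input, 3-wide window, stride 2, no padding: floor gives (112 - 3)/2 + 1 = 55
// windows, ceil gives (112 - 3 + 1)/2 + 1 = 56 (one extra window covering the tail).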
- 2]) % strides[i - 2]; - if (rem && padding_right[i] > 0) - padding_right[i]--; - } - - /* csl::Pooling supports symmetric padding only; hence, we deal with asymmetric padding by - * copying the input to a bigger tensor and padding the sides manually - */ - for (int i = 0; i < rank; i++) - input_shape[i] += padding_left[i] + padding_right[i]; - - std::vector output_shape(rank); - output_shape[0] = input_shape[0]; - output_shape[1] = input_shape[1]; - for (int i = 2; i < rank; i++) + else { - auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; - output_shape[i] = (input_shape[i] + total_padding - kernel_size[i - 2]) / strides[i - 2] + 1; + CV_Error(Error::StsNotImplemented, "Unsupported pooling mode"); } - /* try to avoid input transformation using cuDNN's flexibility */ - if (input_shape != input_wrapper->getShape() && - std::all_of(std::begin(padding_left), std::end(padding_left), [](std::size_t i) {return i == 0; })) + config.window_size.assign(std::begin(kernel_size), std::end(kernel_size)); + config.strides.assign(std::begin(strides), std::end(strides)); + + if (padMode.empty()) { - /* we don't need a transformation since cuDNN allows smaller or bigger output dimensions for - * from the dimensions calculated from the arithmetic - */ - input_shape = input_wrapper->getShape(); + config.padMode = PoolingConfiguration::padding_mode::manual; + config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin)); + config.pads_end.assign(std::begin(pads_end), std::end(pads_end)); } - - /* if the actual input shape and the new input shape do not match; we need to transform the input */ - transform_required = input_shape != input_wrapper->getShape(); - if (transform_required) + else if (padMode == "VALID") { - transformedInput.resize(std::begin(input_shape), std::end(input_shape)); - inputTransformer = csl::TensorTransform(cudnnHandle, padding_left, padding_right); + config.padMode = PoolingConfiguration::padding_mode::valid; } - - csl::Pooling::params_type params; - params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); - params.output_shape.assign(std::begin(output_shape), std::end(output_shape)); - params.window_size = kernel_size; - params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); - params.stride = strides; - - if (type == MAX) + else if (padMode == "SAME") { - params.type = csl::Pooling::pooling_type::MAX; + config.padMode = PoolingConfiguration::padding_mode::same; } - else if (type == AVE) + else { - if (avePoolPaddedArea) - params.type = csl::Pooling::pooling_type::AVERAGE_INCLUDE_PADDING; - else - params.type = csl::Pooling::pooling_type::AVERAGE_EXCLUDE_PADDING; + CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by ConvolutionLayer"); } + + if (ceilMode) + config.roundMode = PoolingConfiguration::rounding_mode::ceil; else - CV_Error(Error::StsNotImplemented, "Unsupported pooling type"); + config.roundMode = PoolingConfiguration::rounding_mode::floor; - pooler = csl::Pooling(cudnnHandle, params); - } + config.input_shape.assign(std::begin(input_shape), std::end(input_shape)); - csl::cudnn::Handle cudnnHandle; - csl::Pooling pooler; + cudaNode = make_cuda_node(preferableTarget, std::move(cudnn_handle), config); + } - bool transform_required; - csl::Tensor transformedInput; - csl::TensorTransform inputTransformer; + std::unique_ptr cudaNode; #endif virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE diff --git a/modules/dnn/src/layers/prior_box_layer.cpp 
b/modules/dnn/src/layers/prior_box_layer.cpp index 0f51a666c521..c4a014576961 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" #include @@ -54,8 +53,8 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/prior_box.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -497,61 +496,49 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace) override + csl::Workspace& workspace + ) override { - CV_Assert(inputs.size() == 2 && outputs.size() == 1); - - auto layer_input_wrapper = inputs[0].dynamicCast(); - auto layer_input = layer_input_wrapper->getView(); /* useless synchronization */ - - auto data_input_wrapper = inputs[1].dynamicCast(); - auto data_input = data_input_wrapper->getView(); /* useless synchronization */ - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - auto layerWidth = layer_input.get_axis_size(-1); - auto layerHeight = layer_input.get_axis_size(-2); - - auto imageWidth = data_input.get_axis_size(-1); - auto imageHeight = data_input.get_axis_size(-2); - - auto boxSize = _boxWidths.size(), offsetSize = _offsetsX.size(); - auto boxWidth = csl::view(paramsTensor.get(), boxSize); - auto boxHeight = csl::view(paramsTensor.get() + boxSize, boxSize); - auto offsetX = csl::view(paramsTensor.get() + 2 * boxSize, offsetSize); - auto offsetY = csl::view(paramsTensor.get() + 2 * boxSize + offsetSize, offsetSize); - - csl::kernels::generate_prior_boxes(stream, output, - boxWidth, boxHeight, offsetX, offsetY, - _variance, _numPriors, layerWidth, layerHeight, imageWidth, imageHeight, _stepX, _stepY, _bboxesNormalized, _clip); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs) override + const std::vector>& inputs + ) override { - stream = std::move(stream_); + auto feature_map_wrapper = inputs[0].dynamicCast(); + auto feature_map_shape = feature_map_wrapper->getShape(); - CV_Assert(_boxWidths.size() == _boxHeights.size()); - CV_Assert(_offsetsX.size() == _offsetsY.size()); + auto image_wrapper = inputs[1].dynamicCast(); + auto image_shape = image_wrapper->getShape(); + + PriorBoxConfiguration config; + config.feature_map_width = feature_map_shape.rbegin()[0]; + config.feature_map_height = feature_map_shape.rbegin()[1]; + config.image_width = image_shape.rbegin()[0]; + config.image_height = image_shape.rbegin()[1]; + + config.num_priors = _numPriors; + config.box_widths = _boxWidths; + config.box_heights = _boxHeights; + config.offsets_x = _offsetsX; + config.offsets_y = _offsetsY; + config.stepX = _stepX; + config.stepY = _stepY; + + config.variance = _variance; - auto total = _boxWidths.size() * 2 + _offsetsX.size() * 2; - std::vector paramsVec; - paramsVec.insert(std::end(paramsVec), std::begin(_boxWidths), std::end(_boxWidths)); - paramsVec.insert(std::end(paramsVec), std::begin(_boxHeights), std::end(_boxHeights)); - paramsVec.insert(std::end(paramsVec), std::begin(_offsetsX), std::end(_offsetsX)); - paramsVec.insert(std::end(paramsVec), 
std::begin(_offsetsY), std::end(_offsetsY)); + config.clip = _clip; + config.normalize = _bboxesNormalized; - paramsTensor.resize(total); - csl::memcpy(paramsTensor.get(), paramsVec.data(), total, stream); /* synchronous copy */ + cudaNode = make_cuda_node(preferableTarget, std::move(stream), config); } - csl::Tensor paramsTensor; /* widths, heights, offsetsX, offsetsY */ - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr initVkCom(const std::vector > &input) CV_OVERRIDE diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 9dc26f4affed..582e69082394 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -42,13 +42,12 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/reshape.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -270,37 +269,24 @@ class ReshapeLayerImpl CV_FINAL : public ReshapeLayer void forwardCUDA( std::vector>& inputs, std::vector>& outputs, - csl::Workspace& workspace) override + csl::Workspace& workspace + ) override { - CV_UNUSED(workspace); - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - if (input.get() != output.get()) - { - input.reshape_as(output); - csl::tensor_ops::copy(stream, output, input); - } - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs) override + const std::vector>& inputs + ) override { - stream = std::move(stream_); + cudaNode = make_cuda_node(preferableTarget, std::move(stream)); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index d1c71b438c9a..6adc2cf181c4 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -6,13 +6,12 @@ // Third party copyrights are property of their respective owners. 
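The PriorBox hunk above and the Resize, Scale, Slice and SoftMax hunks that follow all apply the same refactoring: the layer no longer drives CSL tensors and kernels itself; initCUDA() only translates the layer's hyper-parameters into a small configuration object and builds a cuda4dnn node with make_cuda_node(), while forwardCUDA() simply delegates to that node. The fragment below restates the pattern for a generic layer. It is an editorial sketch, not part of the patch: ExampleConfiguration and ExampleOp are placeholder names, and the template arguments (Ptr<BackendWrapper>, CUDABackendNode) are reconstructed here because the angle-bracket contents were stripped in this copy of the diff.

#ifdef HAVE_CUDA
    void forwardCUDA(
        std::vector<Ptr<BackendWrapper>>& inputs,
        std::vector<Ptr<BackendWrapper>>& outputs,
        csl::Workspace& workspace
    ) override
    {
        /* all device work happens inside the cuda4dnn primitive */
        cudaNode->forward(inputs, outputs, workspace);
    }

    void initCUDA(
        csl::Stream stream,
        csl::cublas::Handle cublas_handle,
        csl::cudnn::Handle cudnn_handle,
        std::size_t& scratch_mem_in_bytes,
        const std::vector<Ptr<BackendWrapper>>& inputs
    ) override
    {
        /* translate the layer's hyper-parameters into a backend-agnostic configuration
         * (ExampleConfiguration stands in for PoolingConfiguration, PriorBoxConfiguration, ...)
         */
        ExampleConfiguration config;   /* filled from the layer's members, e.g. kernel sizes or axes */

        cudaNode = make_cuda_node<ExampleOp>(preferableTarget, std::move(stream), config);
    }

    std::unique_ptr<CUDABackendNode> cudaNode;
#endif

Keeping the device code in the cuda4dnn primitives leaves the layer implementations free of raw kernel and cuDNN calls, which is why these hunks are mostly deletions.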
#include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/resize.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -174,34 +173,29 @@ class ResizeLayerImpl : public ResizeLayer csl::Workspace& workspace ) override { - CV_Assert(inputs.size() == 1 && outputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - if (interpolation == "nearest") - csl::kernels::resize_nn(stream, output, input); - else if (interpolation == "bilinear") - csl::kernels::resize_bilinear(stream, output, input, scaleHeight, scaleWidth); - else - CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer."); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs + const std::vector>& inputs ) override { - stream = std::move(stream_); + interpolation_type itype; + if (interpolation == "nearest") + itype = interpolation_type::nearest_neighbour; + else if (interpolation == "bilinear") + itype = interpolation_type::bilinear; + else + CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer."); + + cudaNode = make_cuda_node(preferableTarget, std::move(stream), itype, scaleHeight, scaleWidth); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 12daa889e0b4..00eec2f2d9b2 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -11,15 +11,13 @@ Implementation of Scale layer. #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/scale_shift.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -155,86 +153,30 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer csl::Workspace& workspace ) override { - CV_UNUSED(workspace); - CV_Assert(outputs.size() == 1); - CV_Assert(!blobs.empty() || inputs.size() == 2); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[0].dynamicCast(); - auto output = output_wrapper->getSpan(); - - csl::TensorView weights; - if (blobs.empty()) - { - auto wrapper = inputs[1].dynamicCast(); - weights = wrapper->getView(); - } - else if (hasWeights) - { - weights = csl::TensorSpan(weightsTensor); - } - - csl::TensorView bias; - if (hasBias) - bias = csl::TensorSpan(biasTensor); - - const auto numParams = !weights.empty() ? 
weights.size() : bias.size(); - CV_Assert(numParams != 0); - if (hasWeights && hasBias) - { - CV_CheckEQ(weights.size(), bias.size(), "Incompatible weights/bias blobs"); - } - - auto input_shape = input_wrapper->getShape(); - - /* the weights/bias might require broadcasting to scale/shift */ - int end_axis = [&] { - for (int endAxis = axis + 1; endAxis <= input_shape.size(); ++endAxis) - if (total(input_shape, axis, endAxis) == numParams) - return endAxis; - CV_Assert(0 /* invalid weights matrix */); - }(); - - std::size_t inner_size = total(input_shape, end_axis, -1); - - CV_Assert(hasWeights || hasBias); - if (hasWeights && hasBias) - csl::kernels::scaleN_with_biasN(stream, output, input, inner_size, weights, bias); - else if (hasWeights) - csl::kernels::scaleN(stream, output, input, inner_size, weights); - else - csl::kernels::biasN(stream, output, input, inner_size, bias); + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, const std::vector>& inputs ) override { - stream = std::move(stream_); - if (hasWeights) - { - weightsTensor = createTensorHeaderFromMat(blobs[0]); - copyMatToTensor(weightsTensor, blobs[0], stream); - } + CV_Assert(!blobs.empty() || inputs.size() == 2); - if (hasBias) - { - /* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0] - * in either case, it is at the end of the blobs vector => bias = blobs.back() - */ - biasTensor = createTensorHeaderFromMat(blobs.back()); - copyMatToTensor(biasTensor, blobs.back(), stream); - } + cv::Mat weightsMat = hasWeights ? blobs[0] : Mat(); + + /* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0] + * in either case, it is at the end of the blobs vector => bias = blobs.back() + */ + cv::Mat biasMat = hasBias ? 
blobs.back() : Mat(); + + cudaNode = make_cuda_node(preferableTarget, std::move(stream), axis, weightsMat, biasMat); } - csl::Tensor weightsTensor, biasTensor; - csl::Stream stream; + std::unique_ptr cudaNode; #endif virtual Ptr tryAttach(const Ptr& node) CV_OVERRIDE diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 8647f82bf325..fcfd4a589560 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -41,7 +41,6 @@ //M*/ #include "../precomp.hpp" -#include "../op_cuda.hpp" #include "../op_inf_engine.hpp" #include "layers_common.hpp" #include @@ -51,8 +50,8 @@ #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/kernels.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/slice.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -275,35 +274,30 @@ class SliceLayerImpl : public SliceLayer csl::Workspace& workspace ) override { - CV_Assert(inputs.size() == 1); - - auto input_wrapper = inputs[0].dynamicCast(); - auto input = input_wrapper->getView(); - - for (int i = 0; i < outputs.size(); ++i) - { - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - std::vector offsets; - for (const auto& range : sliceRanges[i]) - offsets.push_back(range.start); - csl::kernels::slice(stream, output, input, offsets); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( - csl::Stream stream_, + csl::Stream stream, csl::cublas::Handle cublas_handle, csl::cudnn::Handle cudnn_handle, std::size_t& scratch_mem_in_bytes, - const std::vector>& inputs + const std::vector>& inputs ) override { - stream = std::move(stream_); + std::vector> offsets; + for (const auto& ranges : sliceRanges) + { + std::vector offsets_i; + for (const auto& range : ranges) + offsets_i.push_back(range.start); + offsets.push_back(std::move(offsets_i)); + } + + cudaNode = make_cuda_node(preferableTarget, std::move(stream), std::move(offsets)); } - csl::Stream stream; + std::unique_ptr cudaNode; #endif #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 43bb6a578cc2..44f58e4f2f66 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -42,7 +42,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "../op_cuda.hpp" #include "../op_halide.hpp" #include "../op_inf_engine.hpp" #include "../op_vkcom.hpp" @@ -56,8 +55,8 @@ using namespace cv::dnn::ocl4dnn; #endif #ifdef HAVE_CUDA -#include "../cuda4dnn/csl/tensor.hpp" -#include "../cuda4dnn/csl/tensor_ops.hpp" +#include "../op_cuda.hpp" +#include "../cuda4dnn/primitives/softmax.hpp" using namespace cv::dnn::cuda4dnn; #endif @@ -301,23 +300,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer csl::Workspace& workspace ) override { - CV_UNUSED(workspace); - - for (std::size_t i = 0; i < inputs.size(); i++) - { - auto input_wrapper = inputs[i].dynamicCast(); - auto input = input_wrapper->getView(); - - auto output_wrapper = outputs[i].dynamicCast(); - auto output = output_wrapper->getSpan(); - - auto actual_dims = input_wrapper->getShape().size(); - CV_Assert(get_effective_rank(input) <= actual_dims); - - auto extra_dims = input.rank - actual_dims; - auto channel_axis = clamp(axisRaw, actual_dims) + extra_dims; - csl::tensor_ops::softmax(cudnnHandle, output, input, channel_axis, logSoftMax); - } + cudaNode->forward(inputs, outputs, workspace); } void initCUDA( @@ 
-328,10 +311,18 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer const std::vector>& inputs ) override { - cudnnHandle = std::move(cudnn_handle); + auto input_wrapper = inputs[0].dynamicCast(); + + auto channel_axis = [&] { + auto actual_dims = input_wrapper->getShape().size(); + auto extra_dims = input_wrapper->getRank() - actual_dims; + return clamp(axisRaw, actual_dims) + extra_dims; + }(); + + cudaNode = make_cuda_node(preferableTarget, std::move(cudnn_handle), channel_axis, logSoftMax); } - csl::cudnn::Handle cudnnHandle; + std::unique_ptr cudaNode; #endif virtual Ptr initVkCom(const std::vector > &inputs) CV_OVERRIDE diff --git a/modules/dnn/src/op_cuda.cpp b/modules/dnn/src/op_cuda.cpp deleted file mode 100644 index 214b2263e1ee..000000000000 --- a/modules/dnn/src/op_cuda.cpp +++ /dev/null @@ -1,118 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#include "precomp.hpp" -#include "op_cuda.hpp" - -#ifdef HAVE_CUDA -#include "cuda4dnn/csl/stream.hpp" -#include "cuda4dnn/csl/tensor.hpp" -#include "cuda4dnn/csl/pointer.hpp" -using namespace cv::dnn::cuda4dnn; -#endif - -#include -#include - -#include - -namespace cv { - namespace dnn { -#ifdef HAVE_CUDA - CUDABackendWrapperFP32::CUDABackendWrapperFP32(Mat& m) - : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) - { - CV_Assert(m.isContinuous()); - CV_Assert(m.type() == CV_32F); - CV_Assert(m.size.dims() <= tensor_type::rank); - - shape = cv::dnn::shape(m); - - shared_block = std::make_shared(); - shared_block->host_dirty = true; - shared_block->device_dirty = false; - shared_block->host = m; - shared_block->memGuard = csl::MemoryLockGuard(m.data, m.total() * sizeof(float)); - shared_block->parent = createTensorHeaderFromMat(m); - } - - CUDABackendWrapperFP32::CUDABackendWrapperFP32(const Ptr& base_, const MatShape& shape_) - : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_CUDA_FP32) - { - const Ptr base = base_.dynamicCast(); - - shape = shape_; - shared_block = base->shared_block; - } - - Ptr CUDABackendWrapperFP32::create(Mat& m) - { - return Ptr(new CUDABackendWrapperFP32(m)); - } - - Ptr CUDABackendWrapperFP32::create(const Ptr& base, const MatShape& shape) - { - return Ptr(new CUDABackendWrapperFP32(base, shape)); - } - - /* blocking */ - void CUDABackendWrapperFP32::copyToHost() { - if(shared_block->device_dirty) { - shared_block->host_dirty = false; - shared_block->device_dirty = false; - - /* If the wrapper is being reused, the device tensor might be larger in size. - * Using the device tensor does not give incorrect code, but it leads to unused regions - * of memory being copied. - * - * We use a view to ensure that only the required region of memory is copied. 
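For context on the block being deleted here: CUDABackendWrapperFP32 keeps a cv::Mat and a device tensor in sync lazily through two dirty flags. getView() triggers copyToDevice(), which uploads only if the host copy was modified and can stay non-blocking because the upload is ordered on the same stream as the kernels that consume it; getSpan() merely marks the device copy dirty, and the next copyToHost() downloads and synchronizes. The standalone sketch below illustrates just that protocol, with the real stream-ordered copies (copyMatToTensor / copyTensorToMat) replaced by stubs; it is an illustration, not code from the patch.

#include <cassert>

/* stand-ins for the stream-ordered device copies used by the real wrapper */
static void upload_to_device()  { /* host -> device, non-blocking, stream-ordered */ }
static void download_to_host()  { /* device -> host, followed by a stream synchronize */ }

struct LazySyncState
{
    bool host_dirty = false;
    bool device_dirty = false;

    void ensure_on_device()      /* what getView() does via copyToDevice() */
    {
        if (host_dirty) {
            host_dirty = device_dirty = false;
            upload_to_device();
        }
    }

    void mark_device_written()   /* what getSpan() does via setDeviceDirty() */
    {
        device_dirty = true;
        host_dirty = false;
    }

    void ensure_on_host()        /* what copyToHost() does */
    {
        if (device_dirty) {
            host_dirty = device_dirty = false;
            download_to_host();
        }
    }

    void mark_host_written()     /* what setHostDirty() does */
    {
        host_dirty = true;
        device_dirty = false;
    }
};

int main()
{
    LazySyncState state;
    state.mark_host_written();    /* the network input was written on the CPU      */
    state.ensure_on_device();     /* a layer reads it on the device                */
    state.mark_device_written();  /* a layer writes its output on the device       */
    state.ensure_on_host();       /* the caller reads the output back as a cv::Mat */
    assert(!state.host_dirty && !state.device_dirty);
    return 0;
}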
- */ - auto view = tensor_view_type(shared_block->parent).subview(0, std::begin(shape), std::end(shape)); - copyTensorToMat(shared_block->host, view, shared_block->stream); - - shared_block->stream.synchronize(); - } - } - - void CUDABackendWrapperFP32::setHostDirty() { - shared_block->device_dirty = false; - shared_block->host_dirty = true; - } - - /* non-blocking - * we don't have to block for copying to device because all operations are put into a stream which - * ensures that the operations added to the stream are performed in order - */ - void CUDABackendWrapperFP32::copyToDevice() { - if(shared_block->host_dirty) { - shared_block->host_dirty = false; - shared_block->device_dirty = false; - - auto span = tensor_span_type(shared_block->parent).subspan(0, std::begin(shape), std::end(shape)); - copyMatToTensor(span, shared_block->host, shared_block->stream); - } - } - - void CUDABackendWrapperFP32::setDeviceDirty() noexcept { - shared_block->device_dirty = true; - shared_block->host_dirty = false; - } - - void CUDABackendWrapperFP32::setStream(csl::Stream stream) noexcept { - shared_block->stream = std::move(stream); - } - - CUDABackendWrapperFP32::tensor_span_type CUDABackendWrapperFP32::getSpan() noexcept { - setDeviceDirty(); - return tensor_span_type(shared_block->parent).subspan(0, std::begin(shape), std::end(shape)); - } - - CUDABackendWrapperFP32::tensor_view_type CUDABackendWrapperFP32::getView() noexcept { - copyToDevice(); - return tensor_view_type(shared_block->parent).subview(0, std::begin(shape), std::end(shape)); - } - -#endif /* ifdef HAVE_CUDA */ - } /* namespace dnn */ -} /* namespace cv */ diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp index f0487a806163..4fe67f5f555d 100644 --- a/modules/dnn/src/op_cuda.hpp +++ b/modules/dnn/src/op_cuda.hpp @@ -8,17 +8,27 @@ #ifdef HAVE_CUDA #include "cuda4dnn/csl/stream.hpp" #include "cuda4dnn/csl/tensor.hpp" -#include "cuda4dnn/csl/pointer.hpp" +#include "cuda4dnn/cxx_utils/make_unique.hpp" #endif #include #include +#include #include +#include namespace cv { namespace dnn { - inline bool haveCUDA() { + constexpr bool IS_DNN_CUDA_TARGET(int id) { + switch (id) { + case DNN_TARGET_CUDA_FP32: + return true; + } + return false; + } + + constexpr bool haveCUDA() { #ifdef HAVE_CUDA return true; #else @@ -27,81 +37,242 @@ namespace cv { } #ifdef HAVE_CUDA - /** @brief creates csl::Tensor object from cv::Mat */ - template > inline - TensorT createTensorHeaderFromMat(const cv::Mat& mat) { - auto is_matrix_type_same_as_tensor_type = [&mat]() { - switch (mat.type()) { - case CV_32F: return std::is_same::value; - default: return false; - } - }; - CV_Assert(is_matrix_type_same_as_tensor_type()); + namespace cuda4dnn { namespace csl { + /** @brief creates Tensor object from cv::Mat (only the header is created, i.e. 
no data is copied) + * + * \tparam T element type for the tensor + */ + template > + TensorT makeTensorHeader(const Mat& mat) + { + auto sizes = shape(mat); + return TensorT(std::begin(sizes), std::end(sizes)); + } - auto sizes = shape(mat); - return TensorT(std::begin(sizes), std::end(sizes)); - } + /** @brief copies data from a cv::Mat and fills a TensorType + * + * Pre-conditions: + * - \p mat must be larger or equal to the tensor in size + * + * @note performance is best for continuous and page-locked cv::Mat + */ + template + void copyMatToTensor(const TensorSpan tensor, const Mat& mat, const Stream& stream); - /** @brief copies data from a cv::Mat and fills a TensorType - * - * Pre-conditions: - * - \p mat must be larger or equal to the tensor in size - * - * @note performance is best for continuous and page-locked cv::Mat - */ - template inline - void copyMatToTensor(const cuda4dnn::csl::TensorSpan tensor, const cv::Mat& mat, const cuda4dnn::csl::Stream& stream) { - CV_Assert(mat.total() >= tensor.size()); - - cv::Mat source = mat.isContinuous() ? mat : mat.clone(); - CV_Assert(source.isContinuous()); - - cuda4dnn::csl::memcpy(tensor.get(), reinterpret_cast(source.data), tensor.size(), stream); - } + template <> inline + void copyMatToTensor(const TensorSpan tensor, const Mat& mat, const Stream& stream) + { + /* should perhaps convert cv::Mat of different type to the required type and copy */ + CV_Assert(mat.type() == CV_32F); + CV_Assert(mat.total() >= tensor.size()); + + Mat source = mat.isContinuous() ? mat : mat.clone(); + CV_Assert(source.isContinuous()); + + memcpy(tensor.get(), reinterpret_cast(source.data), tensor.size(), stream); + } + + template <> inline + void copyMatToTensor(const TensorSpan tensor, const Mat& mat, const Stream& stream) + { + /* should perhaps convert cv::Mat of different type to the required type and copy */ + CV_Assert(mat.type() == CV_32F); + CV_Assert(mat.total() >= tensor.size()); - /** @brief copies data from a TensorType to a cv::Mat - * - * Pre-conditions: - * - \p mat must be larger or equal to the tensor in size - * - * @note performance is best for continuous and page-locked cv::Mat - */ - template inline - void copyTensorToMat(cv::Mat& mat, cuda4dnn::csl::TensorView tensor, const cuda4dnn::csl::Stream& stream) { - CV_Assert(mat.total() >= tensor.size()); + Mat source; + mat.convertTo(source, CV_64F); + CV_Assert(source.isContinuous()); + + memcpy(tensor.get(), reinterpret_cast(source.data), tensor.size(), stream); + } + + /** @brief copies data from a TensorType to a cv::Mat + * + * Pre-conditions: + * - \p mat must be larger or equal to the tensor in size + * + * @note performance is best for continuous and page-locked cv::Mat + */ + template + void copyTensorToMat(Mat& mat, TensorView tensor, const Stream& stream); + + template <> inline + void copyTensorToMat(Mat& mat, TensorView tensor, const Stream& stream) + { + CV_Assert(mat.type() == CV_32F); + CV_Assert(mat.total() >= tensor.size()); + + Mat source = mat.isContinuous() ? 
mat : mat.clone(); + CV_Assert(source.isContinuous()); + + memcpy(reinterpret_cast(source.data), tensor.get(), tensor.size(), stream); + + if (source.data != mat.data) + source.copyTo(mat); + } + + template <> inline + void copyTensorToMat(Mat& mat, TensorView tensor, const Stream& stream) + { + CV_Assert(mat.type() == CV_32F); + CV_Assert(mat.total() >= tensor.size()); + + Mat source(shape(mat), CV_64F); + CV_Assert(source.isContinuous()); + + memcpy(reinterpret_cast(source.data), tensor.get(), tensor.size(), stream); + + source.convertTo(mat, CV_32F); + } + }} /* cuda4dnn::csl */ + + /* base class for all CUDA backend/target node */ + class CUDABackendNode : public BackendNode { + public: + CUDABackendNode() : BackendNode(DNN_BACKEND_CUDA) { } + virtual ~CUDABackendNode() { } - cv::Mat source = mat.isContinuous() ? mat : mat.clone(); - CV_Assert(source.isContinuous()); + virtual void forward( + std::vector>& inputs, + std::vector>& outputs, + cuda4dnn::csl::Workspace& workspace) = 0; - cuda4dnn::csl::memcpy(reinterpret_cast(source.data), tensor.get(), tensor.size(), stream); + virtual std::size_t get_workspace_memory_in_bytes() const noexcept { return 0; } + }; - if(source.data != mat.data) - source.copyTo(mat); + /* utility function which creates a correct backend node based on `targetId` */ + template