diff --git a/.github/workflows/PR-5.x.yaml b/.github/workflows/PR-5.x.yaml
index dcd3de86347..d89d21e7fd9 100644
--- a/.github/workflows/PR-5.x.yaml
+++ b/.github/workflows/PR-5.x.yaml
@@ -15,6 +15,10 @@ jobs:
   Ubuntu2204-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U22.yaml@main
 
+# TODO:
+#  Ubuntu2404-x64:
+#    uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U24.yaml@main
+
   Ubuntu2004-x64-CUDA:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U20-Cuda.yaml@main
 
diff --git a/modules/bgsegm/test/test_backgroundsubtractor_gbh.cpp b/modules/bgsegm/test/test_backgroundsubtractor_gbh.cpp
index f42f008e13b..237441357ae 100644
--- a/modules/bgsegm/test/test_backgroundsubtractor_gbh.cpp
+++ b/modules/bgsegm/test/test_backgroundsubtractor_gbh.cpp
@@ -29,8 +29,10 @@ void CV_BackgroundSubtractorTest::run(int)
 {
     int code = cvtest::TS::OK;
     RNG& rng = ts->get_rng();
-    int type = ((unsigned int)rng)%7;  //!< pick a random type, 0 - 6, defined in types_c.h
-    int channels = 1 + ((unsigned int)rng)%4;  //!< random number of channels from 1 to 4.
+    int type = ((unsigned int)rng) % 3;
+    type = (type == 0) ? CV_8U : (type == 1) ? CV_16U : CV_32F; // 8U, 16U, 32F
+    int channels = 1 + ((unsigned int)rng)%3;
+    channels = (channels == 2) ? 4 : channels; // 1, 3, 4
     int channelsAndType = CV_MAKETYPE(type,channels);
     int width = 2 + ((unsigned int)rng)%98; //!< Mat will be 2 to 100 in width and height
     int height = 2 + ((unsigned int)rng)%98;
diff --git a/modules/cudaarithm/src/cuda/polar_cart.cu b/modules/cudaarithm/src/cuda/polar_cart.cu
index 2fb1315e619..12980e424ff 100644
--- a/modules/cudaarithm/src/cuda/polar_cart.cu
+++ b/modules/cudaarithm/src/cuda/polar_cart.cu
@@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
     GpuMat_<float> anglec(angle.reshape(1));
 
     if (angleInDegrees)
-    {
-        gridTransformTuple(zipPtr(xc, yc),
-                           tie(magc, anglec),
-                           make_tuple(
-                               binaryTupleAdapter<0, 1>(magnitude_func<float>()),
-                               binaryTupleAdapter<0, 1>(direction_func<float, true>())),
-                           stream);
-    }
+        gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, true>(), stream);
     else
-    {
-        gridTransformTuple(zipPtr(xc, yc),
-                           tie(magc, anglec),
-                           make_tuple(
-                               binaryTupleAdapter<0, 1>(magnitude_func<float>()),
-                               binaryTupleAdapter<0, 1>(direction_func<float, false>())),
-                           stream);
-    }
+        gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, false>(), stream);
 
     syncOutput(mag, _mag, stream);
     syncOutput(angle, _angle, stream);
diff --git a/modules/cudaarithm/src/cuda/split_merge.cu b/modules/cudaarithm/src/cuda/split_merge.cu
index 5b3af10775d..f0acb840a9e 100644
--- a/modules/cudaarithm/src/cuda/split_merge.cu
+++ b/modules/cudaarithm/src/cuda/split_merge.cu
@@ -67,7 +67,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
+            const std::array<GlobPtrSz<T>, 2> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1])};
+            gridMerge(d_src,
                       globPtr<typename MakeVec<T, 2>::type>(dst),
                       stream);
         }
@@ -77,7 +78,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
+            const std::array<GlobPtrSz<T>, 3> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])};
+            gridMerge(d_src,
                       globPtr<typename MakeVec<T, 3>::type>(dst),
                       stream);
         }
@@ -87,7 +89,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
        {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
+            const std::array<GlobPtrSz<T>, 4 > d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])};
+            gridMerge(d_src,
                       globPtr<typename MakeVec<T, 4>::type>(dst),
                       stream);
        }
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index cfadad648a9..b70a128558f 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -151,7 +151,12 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
     sz.width = gsrc.cols;
     sz.height = gsrc.rows;
 
+#if (CUDA_VERSION >= 12040)
+    size_t bufSize;
+#else
     int bufSize;
+#endif
+
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
 #else
@@ -162,7 +167,8 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
 #endif
 
     BufferPool pool(stream);
-    GpuMat buf = pool.getBuffer(1, bufSize, gsrc.type());
+    CV_Assert(bufSize <= std::numeric_limits<int>::max());
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), gsrc.type());
 
     // detail: https://github.com/opencv/opencv/issues/11063
     //NppStreamHandler h(StreamAccessor::getStream(stream));
@@ -227,7 +233,12 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre
     sz.width = gsrc.cols;
     sz.height = gsrc.rows;
 
+#if (CUDA_VERSION >= 12040)
+    size_t bufSize;
+#else
     int bufSize;
+#endif
+
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) );
 #else
@@ -238,7 +249,8 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre
 #endif
 
     BufferPool pool(stream);
-    GpuMat buf = pool.getBuffer(1, bufSize, gsrc.type());
+    CV_Assert(bufSize <= std::numeric_limits<int>::max());
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), gsrc.type());
 
     if(gsrc.type() == CV_8UC1)
         nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
diff --git a/modules/cudafilters/src/cuda/median_filter.cu b/modules/cudafilters/src/cuda/median_filter.cu
index ed46eb4bf94..6776428ae1a 100644
--- a/modules/cudafilters/src/cuda/median_filter.cu
+++ b/modules/cudafilters/src/cuda/median_filter.cu
@@ -53,6 +53,17 @@
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
 
+
+// The CUB library is used for the Median Filter with Wavelet Matrix,
+// which has become a standard library since CUDA 11.
+#include "wavelet_matrix_feature_support_checks.h"
+#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#include "wavelet_matrix_multi.cuh"
+#include "wavelet_matrix_2d.cuh"
+#include "wavelet_matrix_float_supporter.cuh"
+#endif
+
+
 namespace cv { namespace cuda { namespace device
 {
     __device__ void histogramAddAndSub8(int* H, const int * hist_colAdd,const int * hist_colSub){
@@ -334,4 +345,72 @@ namespace cv { namespace cuda { namespace device
 }}}
+
+#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+namespace cv { namespace cuda { namespace device
+{
+    using namespace wavelet_matrix_median;
+
+    template <int CH_NUM, typename T>
+    void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<T> src, PtrStepSz<T> dst, int radius,cudaStream_t stream){
+
+        constexpr bool is_float = std::is_same<T, float>::value;
+        constexpr static int WORD_SIZE = 32;
+        constexpr static int ThW = (std::is_same<T, uint8_t>::value ? 8 : 4);
+        constexpr static int ThH = (std::is_same<T, uint8_t>::value ?
64 : 256); + using XYIdxT = uint32_t; + using XIdxT = uint16_t; + using WM_T = typename std::conditional::type; + using MedianResT = typename std::conditional::type; + using WM2D_IMPL = WaveletMatrix2dCu5C, 512, WORD_SIZE>; + + CV_Assert(src.cols == dst.cols); + CV_Assert(dst.step % sizeof(T) == 0); + + WM2D_IMPL WM_cuda(src.rows, src.cols, is_float, false); + WM_cuda.res_cu = reinterpret_cast(dst.ptr()); + + const size_t line_num = src.cols * CH_NUM; + if (is_float) { + WMMedianFloatSupporter::WMMedianFloatSupporter float_supporter(src.rows, src.cols); + float_supporter.alloc(); + for (int y = 0; y < src.rows; ++y) { + cudaMemcpy(float_supporter.val_in_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice); + } + const auto p = WM_cuda.get_nowcu_and_buf_byte_div32(); + float_supporter.sort_and_set((XYIdxT*)p.first, p.second); + WM_cuda.construct(nullptr, stream, true); + WM_cuda.template median2d(radius, dst.step / sizeof(T), (MedianResT*)float_supporter.get_res_table(), stream); + } else { + for (int y = 0; y < src.rows; ++y) { + cudaMemcpy(WM_cuda.src_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice); + } + WM_cuda.construct(nullptr, stream); + WM_cuda.template median2d(radius, dst.step / sizeof(T), nullptr, stream); + } + WM_cuda.res_cu = nullptr; + if (!stream) { + cudaSafeCall( cudaDeviceSynchronize() ); + } + } + + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream){ + if (num_channels == 1) { + medianFiltering_wavelet_matrix_gpu<1>(src, dst, radius, stream); + } else if (num_channels == 3) { + medianFiltering_wavelet_matrix_gpu<3>(src, dst, radius, stream); + } else if (num_channels == 4) { + medianFiltering_wavelet_matrix_gpu<4>(src, dst, radius, stream); + } else { + CV_Assert(num_channels == 1 || num_channels == 3 || num_channels == 4); + } + } + + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); +}}} +#endif // __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + #endif diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh new file mode 100644 index 00000000000..9c10c223d87 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh @@ -0,0 +1,1053 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_2D_CUH__ +#define __OPENCV_WAVELET_MATRIX_2D_CUH__ + + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. +#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +#include +#include +#include "opencv2/core/cuda/warp_shuffle.hpp" + +#include "wavelet_matrix_multi.cuh" + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { + using std::vector; + using namespace std; + +template +__global__ void WaveletMatrix2dCu5C_UpSweep_gpu(const SrcT mask, const uint16_t block_pair_num, const XYIdxT size_div_w, const SrcT* __restrict__ src, DstT* __restrict__ dst, BlockT* __restrict__ nbit_bp, const XYIdxT* __restrict__ nsum_buf_test, XYIdxT* __restrict__ nsum_buf_test2, const uint32_t bv_block_byte_div32, const uint32_t buf_byte_div32, const XIdxT* __restrict__ idx_p, const XIdxT inf, XIdxT* __restrict__ wm, XIdxT* __restrict__ nxt_idx, XYIdxT* __restrict__ wm_nsum_scan_buf, const XYIdxT cwm_buf_byte_div32, BlockT* __restrict__ nbit_bp_pre) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + static_assert(ThreadsDimY % SRC_CACHE_DIV == 0, ""); + static_assert(ThreadsDimY != SRC_CACHE_DIV, "Warning: It's not efficient."); + + const size_t buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (buf_byte_div32*32ull); + const size_t bv_block_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (bv_block_byte_div32*32ull); + const size_t cwm_buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * cwm_buf_byte_div32 * 32u; + src = (SrcT*)((uint8_t*)src + buf_byte_y_offset); + dst = (DstT*)((uint8_t*)dst + 
buf_byte_y_offset); + nsum_buf_test = (XYIdxT*)((uint8_t*)nsum_buf_test + buf_byte_y_offset); + nsum_buf_test2 = (XYIdxT*)((uint8_t*)nsum_buf_test2 + buf_byte_y_offset); + nbit_bp = (BlockT*)((uint8_t*)nbit_bp + bv_block_byte_y_offset); + nbit_bp_pre = (BlockT*)((uint8_t*)nbit_bp_pre + bv_block_byte_y_offset); + + idx_p = (XIdxT*)((uint8_t*)idx_p + buf_byte_y_offset); + nxt_idx = (XIdxT*)((uint8_t*)nxt_idx + buf_byte_y_offset); + if (wm != nullptr) wm = (XIdxT*)((uint8_t*)wm + cwm_buf_byte_y_offset); + wm_nsum_scan_buf = (XYIdxT*)((uint8_t*)wm_nsum_scan_buf + cwm_buf_byte_y_offset); + + + using WarpScanX = cub::WarpScan; + using WarpScanY = cub::WarpScan; + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + static_assert(SRCB_S < 64 * 1024, ""); + + __shared__ SrcT src_val_cache[ThreadsDimY][(WARP_SIZE/SRC_CACHE_DIV)-1][WARP_SIZE]; + __shared__ XIdxT vidx_val_cache[ThreadsDimY][(WARP_SIZE/SRC_CACHE_DIV)-1][WARP_SIZE]; + + __shared__ uint4 nsum_count_sh[ThreadsDimY]; + __shared__ XYIdxT wm_zero_count_sh[ThreadsDimY]; + __shared__ XYIdxT pre_sum_share[2]; + __shared__ XYIdxT warp_scan_sums[ThreadsDimY]; + __shared__ typename WarpScanX::TempStorage s_scanStorage; + __shared__ typename WarpScanY::TempStorage s_scanStorage2; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + XYIdxT wm_zero_count = 0; + + const XYIdxT size_div_warp = size_div_w * WORD_DIV_WARP; + const XYIdxT nsum = nbit_bp[size_div_w].nsum; + const XYIdxT nsum_offset = nsum_buf_test[blockIdx.x]; + const XYIdxT nsum_pre = nbit_bp_pre[size_div_w].nsum; + + + XYIdxT nsum_idx0_org = nsum_offset; + XYIdxT nsum_idx1_org = (XYIdxT)blockIdx.x * block_pair_num * THREAD_PER_GRID + nsum - nsum_idx0_org; + nsum_idx0_org /= (XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE; + nsum_idx1_org /= (XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE; + const XYIdxT nsum_idx0_bound = (nsum_idx0_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + const XYIdxT nsum_idx1_bound = (nsum_idx1_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + uint4 nsum_count = make_uint4(0, 0, 0, 0); + + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + if (th_idx == 0) { + pre_sum_share[0] = nsum_offset; + } + + for (XYIdxT ka = 0; ka < block_pair_num; ka += WARP_SIZE / SRC_CACHE_DIV) { + const XYIdxT ibb = ((XYIdxT)blockIdx.x * block_pair_num + ka) * ThreadsDimY; + if (ibb >= size_div_warp) break; + + WarpWT my_bits = 0; + SrcT first_val; + XIdxT first_idxval; + + for (XYIdxT kb = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; kb < WARP_SIZE / SRC_CACHE_DIV; ++kb, ++i) { + if (i >= size_div_warp) break; + WarpWT bits; + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + const SrcT v = src[ij]; + const XIdxT wm_idxv = idx_p[ij]; + if (kb == 0) { + first_val = v; + first_idxval = wm_idxv; + } else { + src_val_cache[threadIdx.y][kb - 1][threadIdx.x] = v; + vidx_val_cache[threadIdx.y][kb - 1][threadIdx.x] = wm_idxv; + } + if (v <= mask) { + bits = __activemask(); + } else { + bits = ~__activemask(); + } + if (threadIdx.x == kb) { + my_bits = bits; + } + if (wm != nullptr) { + if (ij < nsum_pre) { + wm[ij] = wm_idxv; + if (wm_idxv * 2 <= inf) { + ++wm_zero_count; + } + } else { + wm[ij] = inf; + } + } + } + + XYIdxT c, t = 0; + if (threadIdx.y < ThreadsDimY) { + c = __popc(my_bits); + + WarpScanX(s_scanStorage).ExclusiveSum(c, t); + if (threadIdx.x == WARP_SIZE / SRC_CACHE_DIV - 
1) { + warp_scan_sums[threadIdx.y] = c + t; + } + } + + __syncthreads(); + + XYIdxT pre_sum = pre_sum_share[(ka & (WARP_SIZE / SRC_CACHE_DIV)) > 0 ? 1 : 0]; + XYIdxT s = threadIdx.x < ThreadsDimY ? warp_scan_sums[threadIdx.x] : 0; + WarpScanY(s_scanStorage2).ExclusiveSum(s, s); + + s = cv::cuda::device::shfl(s, threadIdx.y, WARP_SIZE); + s += t + pre_sum; + + if (SRC_CACHE_DIV == 1 || threadIdx.x < WARP_SIZE / SRC_CACHE_DIV) { + if (th_idx == THREAD_PER_GRID - WARP_SIZE + WARP_SIZE / SRC_CACHE_DIV - 1) { + pre_sum_share[(ka & (WARP_SIZE / SRC_CACHE_DIV)) == 0 ? 1 : 0] = s + c; + } + const XYIdxT bi = ibb + threadIdx.y * WARP_SIZE / SRC_CACHE_DIV + threadIdx.x; + if (bi < size_div_warp) { + static_assert(WORD_SIZE == 32, ""); + nbit_bp[bi] = BlockT{s, my_bits}; + } + } + + if (mask == 0) { + SrcT vo = first_val; + XIdxT idx_v = first_idxval; + for (XYIdxT j = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; j < WARP_SIZE / SRC_CACHE_DIV; ++j, ++i) { + if (i >= size_div_warp) break; + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const XYIdxT e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + XYIdxT rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const XYIdxT idx0 = e_nsum + rank; + XYIdxT idx = idx0; + if (vo > mask) { // 1 + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx; + } + if (idx < size_div_warp * WARP_SIZE) { + nxt_idx[idx] = idx_v; + } + if (j == WARP_SIZE / SRC_CACHE_DIV - 1) break; + vo = src_val_cache[threadIdx.y][j][threadIdx.x]; + idx_v = vidx_val_cache[threadIdx.y][j][threadIdx.x]; + } + continue; + } + const SrcT mask_2 = mask >> 1; + SrcT vo = first_val; + XIdxT idx_v = first_idxval; + for (XYIdxT j = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; j < WARP_SIZE / SRC_CACHE_DIV; ++j, ++i) { + if (i >= size_div_warp) break; + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const XYIdxT e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + XYIdxT rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const XYIdxT idx0 = e_nsum + rank; + + DstT v = (DstT)vo; + XYIdxT idx = idx0; + if (vo > mask) { // 1 + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx; + v &= mask; + } + if (idx < size_div_warp * WARP_SIZE) { + if (mask != 0) { + dst[idx] = v; + } + nxt_idx[idx] = idx_v; + } + + if (v <= mask_2) { + if (vo <= mask) { + if (idx < nsum_idx0_bound) { + nsum_count.x++; + } else { + nsum_count.y++; + } + } else { + if (idx < nsum_idx1_bound) { + nsum_count.z++; + } else { + nsum_count.w++; + } + } + } + if (j == WARP_SIZE / SRC_CACHE_DIV - 1) break; + vo = src_val_cache[threadIdx.y][j][threadIdx.x]; + idx_v = vidx_val_cache[threadIdx.y][j][threadIdx.x]; + } + } + if (blockIdx.x == gridDim.x - 1 && th_idx == 0) { + nbit_bp[size_div_warp / WORD_DIV_WARP].nsum = nsum; + } + + nsum_count.x = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.x); + nsum_count.y = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.y); + nsum_count.z = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.z); + nsum_count.w = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.w); + wm_zero_count = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(wm_zero_count); + if (threadIdx.x == 0) { + nsum_count_sh[threadIdx.y] = nsum_count; + wm_zero_count_sh[threadIdx.y] = wm_zero_count; + } + __syncthreads(); + if (threadIdx.x < ThreadsDimY) { + nsum_count = nsum_count_sh[threadIdx.x]; + nsum_count.x = 
WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.x); + nsum_count.y = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.y); + nsum_count.z = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.z); + nsum_count.w = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.w); + wm_zero_count = WarpReduceY(WarpReduceY_temp_storage).Sum(wm_zero_count_sh[threadIdx.x]); + if (th_idx == 0) { + const XYIdxT nsum_idx0_org = nsum_idx0_bound / ((XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE); + const XYIdxT nsum_idx1_org = nsum_idx1_bound / ((XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE); + if (mask != 0) { + if (nsum_count.x > 0) atomicAdd(nsum_buf_test2 + nsum_idx0_org - 1, nsum_count.x); + if (nsum_count.y > 0) atomicAdd(nsum_buf_test2 + nsum_idx0_org - 0, nsum_count.y); + if (nsum_count.z > 0) atomicAdd(nsum_buf_test2 + nsum_idx1_org - 1, nsum_count.z); + if (nsum_count.w > 0) atomicAdd(nsum_buf_test2 + nsum_idx1_org - 0, nsum_count.w); + } + if (wm != nullptr) { + wm_nsum_scan_buf[blockIdx.x] = wm_zero_count; + } + } + } +} + +template +__global__ void WaveletMatrix2dCu5C_last_gpu(const uint16_t block_pair_num, const XYIdxT size_div_w, const uint32_t buf_byte_div32, const XIdxT* __restrict__ idx_p, const XIdxT inf, XIdxT* __restrict__ wm, XYIdxT* __restrict__ wm_nsum_scan_buf, const XYIdxT cwm_buf_byte_div32, BlockT* __restrict__ nbit_bp_pre, const uint32_t bv_block_byte_div32) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + const size_t buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (buf_byte_div32*32ull); + const size_t bv_block_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (bv_block_byte_div32*32ull); + const size_t cwm_buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * cwm_buf_byte_div32 * 32u; + + idx_p = (XIdxT*)((uint8_t*)idx_p + buf_byte_y_offset); + wm = (XIdxT*)((uint8_t*)wm + cwm_buf_byte_y_offset); + wm_nsum_scan_buf = (XYIdxT*)((uint8_t*)wm_nsum_scan_buf + cwm_buf_byte_y_offset); + nbit_bp_pre = (BlockT*)((uint8_t*)nbit_bp_pre + bv_block_byte_y_offset); + const XYIdxT nsum_pre = nbit_bp_pre[size_div_w].nsum; + + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + __shared__ XYIdxT wm_zero_count_sh[ThreadsDimY]; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + XYIdxT wm_zero_count = 0; + const XYIdxT size_div_warp = size_div_w * WORD_DIV_WARP; + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + + const int block_num = block_pair_num/WARP_SIZE; + for (XYIdxT ka = 0; ka < block_num; ++ka) { + const XYIdxT ibb = ((XYIdxT)blockIdx.x * block_num + ka) * THREAD_PER_GRID + WARP_SIZE * threadIdx.y; + if (ibb >= size_div_warp) break; + + for (XYIdxT kb = 0; kb < WARP_SIZE; ++kb) { + XYIdxT i = ibb + kb; + if (i >= size_div_warp) break; + + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + + if (ij < nsum_pre) { + const XIdxT wm_idxv = idx_p[ij]; + wm[ij] = wm_idxv; + if (wm_idxv * 2 <= inf) { + ++wm_zero_count; + } + } else { + wm[ij] = inf; + } + } + } + wm_zero_count = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(wm_zero_count); + if 
(threadIdx.x == 0) { + wm_zero_count_sh[threadIdx.y] = wm_zero_count; + } + __syncthreads(); + if (threadIdx.x < ThreadsDimY) { + wm_zero_count = WarpReduceY(WarpReduceY_temp_storage).Sum(wm_zero_count_sh[threadIdx.x]); + if (th_idx == 0) { + wm_nsum_scan_buf[blockIdx.x] = wm_zero_count; + } + } +} + +template +__global__ void WaveletMatrix2dCu5C_ExclusiveSum(XYIdxT* __restrict__ nsum_scan_buf, XYIdxT* __restrict__ nsum_buf_test2, BlockT* __restrict__ nsum_p, const uint32_t buf_byte_div32, const uint32_t bv_block_byte_div32) { + + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + XYIdxT thread_data1; + XYIdxT thread_data2; + + nsum_scan_buf = (XYIdxT*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + nsum_buf_test2 = (XYIdxT*)((uint8_t*)nsum_buf_test2 + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + + thread_data1 = nsum_scan_buf[threadIdx.x]; + BlockScan(temp_storage).ExclusiveSum(thread_data1, thread_data2); + + nsum_scan_buf[threadIdx.x] = thread_data2; + nsum_buf_test2[threadIdx.x] = 0; + + if (threadIdx.x == blockDim.x - 1) { + thread_data2 += thread_data1; + nsum_p = (BlockT*)((uint8_t*)nsum_p + (size_t)blockIdx.x * (bv_block_byte_div32*32ull)); + nsum_p->nsum = thread_data2; + } +} + + +template +__global__ void WaveletMatrix2dCu5C_first_gpu_multi(const SrcT mask, uint16_t block_pair_num, const XYIdxT size_div_warp, SrcT* __restrict__ src, SrcT* __restrict__ dst, XYIdxT* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32, XIdxT* __restrict__ buf_idx, const int W, const XYIdxT WH, const SrcT* __restrict__ src_const) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + SrcT* __restrict__ dsts[CH_NUM]; + XIdxT* __restrict__ buf_idxs[CH_NUM]; + XYIdxT cs[CH_NUM]; + + __shared__ SrcT src_vbuf_org[ThreadsDimY][CH_NUM * WARP_SIZE]; + SrcT* __restrict__ src_vbuf = src_vbuf_org[threadIdx.y]; + + for (int c = 0; c < CH_NUM; ++c) { + if (CH_NUM > 1) { // constexpr + dsts[c] = (SrcT*)((uint8_t*)dst + (size_t)c * (buf_byte_div32*32ull)); + } + buf_idxs[c] = (XIdxT*)((uint8_t*)buf_idx + (size_t)c * (buf_byte_div32*32ull)); + cs[c] = 0; + } + + XYIdxT ibb = (XYIdxT)blockIdx.x * block_pair_num * ThreadsDimY; + for (XYIdxT ka = 0; ka < block_pair_num; ka += WARP_SIZE, ibb += THREAD_PER_GRID) { + for (XYIdxT kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + const XYIdxT iw = i * WARP_SIZE; + const XYIdxT idx = iw + threadIdx.x; + const XIdxT idx_v = (idx >= WH ? 
0 : idx % W); + + for (int c = 0; c < CH_NUM; ++c) { + if (MOV_SRC) { // constexpr + const XYIdxT s_idx = iw * CH_NUM + threadIdx.x + c * WARP_SIZE; + src_vbuf[threadIdx.x + c * WARP_SIZE] = src[s_idx] = src_const[s_idx]; + } else { + src_vbuf[threadIdx.x + c * WARP_SIZE] = src[iw * CH_NUM + threadIdx.x + c * WARP_SIZE]; + } + } + __syncwarp(); + for (int c = 0; c < CH_NUM; ++c) { + const SrcT v = src_vbuf[threadIdx.x * CH_NUM + c]; + if (v <= mask) { + ++cs[c]; + } + buf_idxs[c][idx] = idx_v; + if constexpr(CH_NUM > 1) { + dsts[c][idx] = v; + } + } + } + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ XYIdxT cs_sum_sh[CH_NUM][ThreadsDimY]; + for (int c = 0; c < CH_NUM; ++c) { + cs[c] = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(cs[c]); + } + if (threadIdx.x == 0) { + for (int c = 0; c < CH_NUM; ++c) { + cs_sum_sh[c][threadIdx.y] = cs[c]; + } + } + __syncthreads(); + if (threadIdx.y != 0) return; + for (int c = 0; c < CH_NUM; ++c) { + XYIdxT cs_bsum = (threadIdx.x < ThreadsDimY ? cs_sum_sh[c][threadIdx.x] : 0); + cs_bsum = WarpReduce(WarpReduce_temp_storage[0]).Sum(cs_bsum); + if (threadIdx.x == 0) { + nsum_scan_buf[blockIdx.x] = cs_bsum; + nsum_scan_buf = (XYIdxT*)((uint8_t*)nsum_scan_buf + (buf_byte_div32*32ull)); + } + } + +} + + +template +__global__ void WaveletMatrix2dCu5C_first_gpu_multi_srcunpacked(const SrcT mask, uint16_t block_pair_num, const XYIdxT size_div_warp, const SrcT* __restrict__ src, XYIdxT* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32, XIdxT* __restrict__ buf_idx, const int W, const XYIdxT WH) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + XYIdxT cs = 0; + + const int c = blockIdx.y; + buf_idx = buf_idx + c * (buf_byte_div32*32ull / sizeof(XIdxT)); + src = src + c * (buf_byte_div32*32ull / sizeof(SrcT)); + + + XYIdxT i = (XYIdxT)blockIdx.x * block_pair_num * ThreadsDimY + threadIdx.y; + + XIdxT x_idx = (i * WARP_SIZE + threadIdx.x) % W; + const XIdxT x_diff = THREAD_PER_GRID % W; + + for (XYIdxT k = 0; k < block_pair_num; ++k, i += ThreadsDimY) { + if (i >= size_div_warp) break; + const XYIdxT idx = i * WARP_SIZE + threadIdx.x; + + if (idx >= WH) x_idx = 0; + + const SrcT v = src[idx]; + if (v <= mask) { + ++cs; + } + buf_idx[idx] = x_idx; + + x_idx += x_diff; + if (x_idx >= W) x_idx -= W; + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ XYIdxT cs_sum_sh[ThreadsDimY]; + cs = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(cs); + if (threadIdx.x == 0) { + cs_sum_sh[threadIdx.y] = cs; + } + __syncthreads(); + if (threadIdx.y != 0) return; + XYIdxT cs_bsum = (threadIdx.x < ThreadsDimY ? 
cs_sum_sh[threadIdx.x] : 0); + cs_bsum = WarpReduce(WarpReduce_temp_storage[0]).Sum(cs_bsum); + if (threadIdx.x == 0) { + nsum_scan_buf += (buf_byte_div32*32ull / sizeof(XYIdxT)) * (blockIdx.y); + nsum_scan_buf[blockIdx.x] = cs_bsum; + } +} + +template +__device__ +inline IdxType WaveletMatrix2dCu5C_median2d_rank0(const IdxType i, const BlockT* __restrict__ nbit_bp) { + using WordT = decltype(BlockT::nbit); + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + + const IdxType bi = i / WORD_SIZE; + + const int ai = i % WORD_SIZE; + const BlockT block = nbit_bp[bi]; + if constexpr(WORD_SIZE == 32) { + return block.nsum + __popc(block.nbit & ((1u << ai) - 1)); + } + if constexpr(WORD_SIZE == 64) { + return block.nsum + __popcll(block.nbit & ((1ull << ai) - 1ull)); + } +} + + +template +__global__ void WaveletMatrix2dCu5C_median2d_cu( + const int H, const int W, const int res_step_num, const int r, ResT* __restrict__ res_cu, const BlockT* __restrict__ wm_nbit_bp, const uint32_t nsum_pos, + const uint32_t bv_block_h_byte_div32, const uint32_t bv_block_len, + const BlockT* __restrict__ bv_nbit_bp, const uint8_t w_bit_len, const uint8_t val_bit_len, + const ResTableT* __restrict__ res_table +) { + + const int y = blockIdx.y * THREADS_NUM_H + threadIdx.y; + if (y >= H) return; + const int x = blockIdx.x * THREADS_NUM_W + threadIdx.x; + if (x >= W) return; + + if (CH_NUM >= 2) { // constexpr + bv_nbit_bp = (BlockT*)((uint8_t*)bv_nbit_bp + bv_block_h_byte_div32 * 32ull * blockIdx.z * (VAL_BIT_LEN >= 0 ? VAL_BIT_LEN : val_bit_len)); // TODO + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp + bv_block_h_byte_div32 * 32ull * blockIdx.z * w_bit_len); + } + + XYIdxT ya, yb, k; + XIdxT xa, xb; + if (CUT_BORDER) { // constexpr + ya = y; + xa = x; + yb = y + r * 2 + 1; + xb = x + r * 2 + 1; + k = (r * 2 + 1) * (r * 2 + 1) / 2; + } else { + ya = (y < r ? 0 : y - r); + xa = (x < r ? 0 : x - r); + yb = y + r + 1; if (yb > H) yb = H; + xb = x + r + 1; if (xb > W) xb = W; + k = XYIdxT(yb - ya) * (xb - xa) / 2; + } + ValT res = 0; + ya *= (CUT_BORDER ? W + 2 * r : W); + yb *= (CUT_BORDER ? W + 2 * r : W); + + for (int8_t h = (VAL_BIT_LEN >= 0 ? 
VAL_BIT_LEN : val_bit_len); h--; ) { + const XYIdxT top0 = WaveletMatrix2dCu5C_median2d_rank0(ya, bv_nbit_bp); + const XYIdxT bot0 = WaveletMatrix2dCu5C_median2d_rank0(yb, bv_nbit_bp); + XYIdxT l_ya_xa = top0; + XYIdxT l_yb_xa = bot0; + XYIdxT l_ya_xb = top0; + XYIdxT l_yb_xb = bot0; + XYIdxT d = 0; + for (int8_t j = w_bit_len; j--; ) { + const XYIdxT zeros = wm_nbit_bp[nsum_pos].nsum; + const XYIdxT l_ya_xa_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_ya_xa, wm_nbit_bp); + const XYIdxT l_ya_xb_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_ya_xb, wm_nbit_bp); + const XYIdxT l_yb_xb_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_yb_xb, wm_nbit_bp); + const XYIdxT l_yb_xa_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_yb_xa, wm_nbit_bp); + + if (((xa >> j) & 1) == 0) { + l_ya_xa = l_ya_xa_rank0; + l_yb_xa = l_yb_xa_rank0; + } else { + d += l_ya_xa_rank0; l_ya_xa += zeros - l_ya_xa_rank0; + d -= l_yb_xa_rank0; l_yb_xa += zeros - l_yb_xa_rank0; + } + if (((xb >> j) & 1) == 0) { + l_ya_xb = l_ya_xb_rank0; + l_yb_xb = l_yb_xb_rank0; + } else { + d -= l_ya_xb_rank0; l_ya_xb += zeros - l_ya_xb_rank0; + d += l_yb_xb_rank0; l_yb_xb += zeros - l_yb_xb_rank0; + } + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp - bv_block_h_byte_div32 * 32ull); + } + if (CH_NUM >= 2) { + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp - bv_block_h_byte_div32 * 32ull * w_bit_len * (CH_NUM - 1)); + } + const XYIdxT bv_h_zeros = bv_nbit_bp[nsum_pos].nsum; + if (k < d) { + ya = top0; + yb = bot0; + } else { + k -= d; + res |= (ValT)1 << h; + ya += bv_h_zeros - top0; + yb += bv_h_zeros - bot0; + } + bv_nbit_bp = (BlockT*)((uint8_t*)bv_nbit_bp - bv_block_h_byte_div32 * 32ull); + } + + + + + if constexpr(is_same::value) { + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM + blockIdx.z] = res; + } else if (CH_NUM == 1){ + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM] = res_table[res]; + } else { + const size_t offset = size_t(CUT_BORDER ? W + 2 * r : W) * (CUT_BORDER ? 
H + 2 * r : H) * blockIdx.z; + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM + blockIdx.z] = res_table[res + offset]; + } +} + + + + +template +struct WaveletMatrix2dCu5C { + static_assert(is_same() || is_same() || is_same(), "Supports 32, 16, or 8 bits only"); + static constexpr int MAX_BIT_LEN = 8 * sizeof(ValT); + + static constexpr uint32_t WSIZE = WORD_SIZE; + static constexpr int WARP_SIZE = 32; + using T_Type = ValT; + static constexpr int THREAD_PER_GRID = TH_NUM; + static constexpr int SRC_CACHE_DIV = 2; + static constexpr int MAX_BLOCK_X = MultiWaveletMatrixImpl::MAX_BLOCK_X; + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, "WORD_SIZE must be 32 or 64"); + using WordT = typename std::conditional::type; + + static_assert(MAX_BLOCK_X <= 1024, ""); + static_assert(TH_NUM == 1024 || TH_NUM == 512 || TH_NUM == 256 || TH_NUM == 128 || TH_NUM == 64 || TH_NUM == 32, ""); + static_assert(THREAD_PER_GRID == MultiWaveletMatrixImpl::THREAD_PER_GRID, ""); + + using BlockT = typename MultiWaveletMatrixImpl::BlockT; + using WarpWT = uint32_t; + using XIdxT = uint16_t; + using YIdxT = uint16_t; + using XYIdxT = uint32_t; + static constexpr int BLOCK_TYPE = 2; + using MultiWaveletMatrixImplClass = MultiWaveletMatrixImpl; + static_assert(is_same::value, ""); + static_assert(8 * sizeof(WarpWT) == WARP_SIZE, ""); + + int H, W; + XYIdxT size = 0; + MultiWaveletMatrixImpl WM; + XYIdxT bv_zeros[MAX_BIT_LEN]; + + int w_bit_len = 0; + int val_bit_len = 0; + static constexpr int wm_num = CH_NUM; + +private: + uint8_t* bv_block_nbit_and_nsum_base_cu = nullptr; // GPU mem + uint32_t bv_block_byte_div32; + uint32_t buf_byte_div32; + uint32_t nsum_scan_buf_len; + size_t input_buf_byte; +public: + ValT* src_cu = nullptr; // GPU mem + ValT* res_cu = nullptr; + size_t bv_block_len = 0; + size_t bv_chunk_len = 0; + +#if _MSC_VER >= 1920 || __INTEL_COMPILER + inline static int bitCount64(uint64_t bits) { + return (int)_mm_popcnt_u64(bits); + } +#else + inline static int bitCount64(uint64_t bits) { + return __builtin_popcountll(bits); + } +#endif + static constexpr int get_bit_len(uint64_t val) { + return ( + (val |= val >> 1), + (val |= val >> 2), + (val |= val >> 4), + (val |= val >> 8), + (val |= val >> 16), + (val |= val >> 32), + bitCount64(val)); + // val |= val >> 1; + // val |= val >> 2; + // val |= val >> 4; + // val |= val >> 8; + // val |= val >> 16; + // val |= val >> 32; + // return bitCount64(val); + } + + WaveletMatrix2dCu5C() { + reset(0, 0); + } + WaveletMatrix2dCu5C(const int rows, const int cols, const bool use_hw_bit_len = false, const bool alloc_res = true) { + reset(rows, cols, use_hw_bit_len, alloc_res); + } + + void reset(const int rows, const int cols, const bool use_hw_bit_len = false, const bool alloc_res = true) { + H = rows; + W = cols; + if (rows == 0 || cols == 0) return; + val_bit_len = (use_hw_bit_len ? get_bit_len((uint64_t)H * W - 1) : MAX_BIT_LEN); + assert(size == 0 && src_cu == nullptr); + + size = div_ceil((uint64_t)H * W, WORD_SIZE) * WORD_SIZE; + assert(W < 65535); // That is, less than 65534. 
+ w_bit_len = get_bit_len(W); // w=7 [0,6] bl=3; w=8 [0,7] bl=4 + WM.reset(size, w_bit_len, val_bit_len * wm_num); + if (val_bit_len == 0) return; + + + bv_block_len = div_ceil(size, THREAD_PER_GRID) * THREAD_PER_GRID / WORD_SIZE + 1; + bv_block_len = div_ceil(bv_block_len, 8*2) * 8*2; + const size_t bv_block_byte = (sizeof(BlockT)) * val_bit_len * bv_block_len; + bv_block_byte_div32 = div_ceil(bv_block_byte, 32); + + cudaMalloc(&bv_block_nbit_and_nsum_base_cu, (size_t)(bv_block_byte_div32*32ull) * CH_NUM); + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + + const uint16_t block_pair_num = get_block_pair_num(); + nsum_scan_buf_len = div_ceil(size, (size_t)THREAD_PER_GRID * block_pair_num); + nsum_scan_buf_len = div_ceil(nsum_scan_buf_len, 4) * 4; + + const size_t buf_byte = + sizeof(XYIdxT) * 2 * nsum_scan_buf_len + + sizeof(XIdxT) * 2 * size + + sizeof(ValT) * (CH_NUM == 1 ? 1 : 2) * size; + buf_byte_div32 = div_ceil(buf_byte, 32); + + + input_buf_byte = sizeof(ValT) * size * CH_NUM; + cudaMalloc(&src_cu, (size_t)(buf_byte_div32*32ull) * CH_NUM + input_buf_byte); + if (src_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + + if (alloc_res) { + cudaMalloc(&res_cu, sizeof(ValT) * size * CH_NUM); + if (res_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + } + } + void release() { + size = 0; + if (src_cu != nullptr) cudaFree(src_cu); + if (bv_block_nbit_and_nsum_base_cu != nullptr) cudaFree(bv_block_nbit_and_nsum_base_cu); + if (res_cu != nullptr) cudaFree(res_cu); + src_cu = nullptr; + bv_block_nbit_and_nsum_base_cu = nullptr; + res_cu = nullptr; + } + ~WaveletMatrix2dCu5C() { release(); } + + BlockT* get_bv_block_cu(int h) const { return (BlockT*)(bv_block_nbit_and_nsum_base_cu + (bv_block_len * (sizeof(BlockT))) * h); } + + BlockT* get_bv_block_cu(int h, int c) const { return (BlockT*)((uint8_t*)get_bv_block_cu(h) + (size_t)c * (bv_block_byte_div32*32ull)); } + + + uint16_t get_block_pair_num() const { + return WM.get_block_pair_num() * MultiWaveletMatrixImpl::THREAD_PER_GRID / THREAD_PER_GRID; + } + std::pair get_nowcu_and_buf_byte_div32() { + ValT *now_cu = src_cu + (CH_NUM == 1 ? 0ull : size * (size_t)CH_NUM); + return make_pair(now_cu, buf_byte_div32); + } + + // Set data in src_cu before calling (data will be destroyed). Or set src_cu_const. 
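For orientation, the construct()/median2d() pair defined below is driven by the wrapper added in median_filter.cu earlier in this patch: the caller copies its (possibly pitched) rows into src_cu, points res_cu at the destination image, builds the structure, then queries one median per pixel. The following is a condensed sketch of that integer path, assuming an already-chosen instantiation WM2D of this class; the real wrapper also passes explicit result/tile template arguments to median2d, which are simplified here.

```cpp
// Condensed sketch of the calling convention used by medianFiltering_wavelet_matrix_gpu
// in median_filter.cu (integer path). WM2D stands for an instantiation of WaveletMatrix2dCu5C;
// the explicit template arguments passed to median2d in the real wrapper are simplified.
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>   // cv::cuda::PtrStepSz

template <int CH_NUM, typename T, typename WM2D>
void runWaveletMatrixMedian(const cv::cuda::PtrStepSz<T>& src, cv::cuda::PtrStepSz<T>& dst,
                            int radius, cudaStream_t stream)
{
    WM2D wm(src.rows, src.cols, /*use_hw_bit_len=*/false, /*alloc_res=*/false);
    wm.res_cu = reinterpret_cast<typename WM2D::T_Type*>(dst.ptr()); // write results straight into dst
    const size_t line_num = (size_t)src.cols * CH_NUM;
    for (int y = 0; y < src.rows; ++y)        // src may be pitched, so copy it row by row
        cudaMemcpy(wm.src_cu + y * line_num, src.ptr(y),
                   line_num * sizeof(T), cudaMemcpyDeviceToDevice);
    wm.construct(nullptr, stream);            // build the per-bit-plane rank structures on the GPU
    wm.template median2d<T>(radius, (int)(dst.step / sizeof(T)), nullptr, stream);
    wm.res_cu = nullptr;                      // dst memory stays owned by the caller
    if (!stream)
        cudaDeviceSynchronize();
}
```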
+ void construct(const ValT *src_cu_const = nullptr, const cudaStream_t main_stream = 0, const bool src_unpacked = false) { + assert(size > 0 && src_cu != nullptr); + if (val_bit_len == 0) return; + if (src_cu == nullptr) { printf("Build Error: memory not alloced."); return;} + + const XIdxT inf = ((XIdxT)1u << w_bit_len) - 1; + assert(W <= inf); + assert(size % WORD_SIZE == 0); + + ValT mask = ((ValT)1u << val_bit_len) - 1; + + const uint16_t block_pair_num = get_block_pair_num(); + const int grid_x = div_ceil(size, THREAD_PER_GRID * block_pair_num); + if (grid_x > MAX_BLOCK_X) { printf("over grid_x %d\n", grid_x); exit(1); } + + const dim3 grid(grid_x, wm_num); + const dim3 thread(WARP_SIZE, THREAD_PER_GRID / WARP_SIZE); + const XYIdxT size_div_w = size / WORD_SIZE; + const XYIdxT size_div_warp = size / WARP_SIZE; + assert(size % WARP_SIZE == 0); + constexpr int ThreadsDimY = THREAD_PER_GRID / WARP_SIZE; + + +#define CALC_SRCB_SIZE(SrcT) (0) + constexpr int SRCB_S_8 = CALC_SRCB_SIZE(uint8_t); + constexpr int SRCB_S_16 = CALC_SRCB_SIZE(uint16_t); + constexpr int SRCB_S_32 = CALC_SRCB_SIZE(uint32_t); +#undef CALC_SRCB_SIZE + { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_8); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_16); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint16_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_16); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint16_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_32); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint32_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_32); + } + + { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint8_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint16_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint16_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint32_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } + + const uint32_t nsum_pos = get_nsum_pos(); + + ValT *now_cu = src_cu + (CH_NUM == 1 ? 0ull : size * (size_t)CH_NUM); + ValT *nxt_cu = now_cu + size; + XYIdxT *nsum_buf_test = (XYIdxT*)(nxt_cu + size); + XYIdxT *nsum_buf_test2 = nsum_buf_test + nsum_scan_buf_len; + XIdxT *buf_idx = (XIdxT*)(nsum_buf_test2 + nsum_scan_buf_len); + XIdxT *nxt_idx = (XIdxT*)(buf_idx + size); + + + const int val_bit_len_m1 = val_bit_len - 1; + int h = val_bit_len_m1; + if (src_unpacked == true) { + if (src_cu_const != nullptr) { + printf("[Error!] not support. 
%s:%d\n", __FILE__, __LINE__); + exit(-1); + } + WaveletMatrix2dCu5C_first_gpu_multi_srcunpacked <<>> (ValT(mask / 2),block_pair_num, size_div_warp, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H); + } else if (src_cu_const == nullptr) { + WaveletMatrix2dCu5C_first_gpu_multi <<>> (ValT(mask / 2),block_pair_num, size_div_warp, src_cu, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H, src_cu_const); + } else { + WaveletMatrix2dCu5C_first_gpu_multi <<>> (ValT(mask / 2),block_pair_num, size_div_warp, src_cu, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H, src_cu_const); + } + BlockT * nsum_p = get_bv_block_cu(h) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test, nsum_buf_test2, nsum_p, buf_byte_div32, bv_block_byte_div32); + + const XYIdxT cwm_buf_byte_div32 = WM.get_buf_byte_div32(); + + if constexpr (sizeof(ValT) >= 4) for (; h > 16; --h) { + using SrcT = uint32_t; + using DstT = uint32_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + if constexpr (sizeof(ValT) >= 4) if (h == 16 || (is_same::value && h >= 0)) do { + using SrcT = uint32_t; + using DstT = uint16_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + --h; + } while(0); + if constexpr (sizeof(ValT) >= 2) for (; h > 8; --h) { + using SrcT = uint16_t; + using DstT = uint16_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? 
nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + if constexpr (sizeof(ValT) >= 2) if (h == 8 || (is_same::value && h >= 0)) do { + using SrcT = uint16_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + --h; + } while(0); + for (; h >= 0; --h) { + using SrcT = uint8_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? 
nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + { + const int h = 0; + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(h * CH_NUM); + XIdxT* cwm = WM.get_src_p(h * CH_NUM); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(h); + + WaveletMatrix2dCu5C_last_gpu <<>> (block_pair_num, size_div_w, buf_byte_div32, nxt_idx, inf, cwm, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h, bv_block_byte_div32); + } + WM.construct(main_stream, false); + } + + XYIdxT get_nsum_pos() const { + const XYIdxT size_div_w = size / WORD_SIZE; + return size_div_w; + } + + template + void median2d(const int r, const ResTableT* res_table = nullptr) { + median2d(r, -1, res_table); + } + + template + void median2d(const int r, int res_step_num = -1, const ResTableT* res_table = nullptr, const cudaStream_t main_stream = 0) { + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("Median2d Error: memory not alloced."); return;} + if (is_same::value == false && res_table == nullptr) {printf("Median2d Error: res_table is null."); return;} + static_assert(is_same::value || (sizeof(ResTableT) <= sizeof(ValT)), ""); + + static_assert(TH_W * TH_H <= 1024, "max number of threads in block"); + + if (res_step_num < 0) res_step_num = W * CH_NUM; + + constexpr int THREADS_NUM_W = TH_W; + const dim3 thread(THREADS_NUM_W, TH_H); + const dim3 grid(div_ceil((CUT_BORDER ? W - 2 * r: W), THREADS_NUM_W), div_ceil((CUT_BORDER ? H - 2 * r : H), TH_H), CH_NUM); + + + const uint32_t bv_nsum_pos = get_nsum_pos(); + const BlockT* bv_bv_block_nbit_cu_first = get_bv_block_cu(val_bit_len - 1); + + const BlockT* wm_bv_block_nbit_cu_first = WM.get_bv_block_cu(w_bit_len - 1, (val_bit_len - 1) * CH_NUM); // + const uint32_t nsum_pos = WM.get_nsum_pos(); + const uint64_t wm_bv_block_byte = WM.get_bv_block_byte(); + + if (bv_nsum_pos != nsum_pos) { printf("err! line %d", __LINE__); exit(-1); } + if (WM.get_bv_block_byte() != WM.get_bv_block_h_byte_div32() * 32ull * w_bit_len) { printf("err! line %d", __LINE__); exit(-1); } + + if (bv_block_len != WM.bv_block_len) {printf("bv_block_len error!\n"); exit(1);} + + using ResT = typename std::conditional::value, ValT, ResTableT>::type; + + const int Wc = (CUT_BORDER ? W - 2 * r : W); + const int Hc = (CUT_BORDER ? H - 2 * r : H); + + constexpr int VAL_BIT_LEN = (sizeof(ValT) < 4) ? 
MAX_BIT_LEN : -1; + WaveletMatrix2dCu5C_median2d_cu <<>> + (Hc, Wc, res_step_num, r, (ResT*)res_cu, wm_bv_block_nbit_cu_first, nsum_pos, WM.get_bv_block_h_byte_div32(), bv_block_len, + bv_bv_block_nbit_cu_first, w_bit_len, val_bit_len, res_table); + } + + template + vector> get_res() { + static_assert(sizeof(ResTableT) <= sizeof(ValT), ""); + auto res = vector>(H, vector(W)); + if (res_cu == nullptr) { printf("get_res Error: memory not alloced."); return res;} + + for (int i = 0; i < H; ++i) { + cudaMemcpy(res[i].data(), res_cu + (XYIdxT)W * i, W * sizeof(ResTableT), cudaMemcpyDeviceToHost); + } + return res; + } +}; + +} // end namespace wavelet_median +}}} //end namespace cv::cuda::device + +#endif +#endif // __OPENCV_WAVELET_MATRIX_2D_CUH__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h b/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h new file mode 100644 index 00000000000..6c47cc5e1a4 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h @@ -0,0 +1,82 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__ +#define __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__ + +#ifdef HAVE_CUDA +#include +#include +#endif // HAVE_CUDA + + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
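The version checks that follow define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ only when building against CUDA 11+ with a host compiler whose C++17 "if constexpr" support is known to be sufficient. Consuming translation units never repeat these checks; they include this header and test the macro, as median_filter.cu does earlier in this patch:

```cpp
// Consumer-side pattern (as used in median_filter.cu in this patch): include the
// feature-check header first, then compile the wavelet-matrix code only if the macro is set.
#include "wavelet_matrix_feature_support_checks.h"

#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
#include "wavelet_matrix_multi.cuh"
#include "wavelet_matrix_2d.cuh"
#include "wavelet_matrix_float_supporter.cuh"
// ... definitions that rely on the wavelet matrix are compiled only inside this branch ...
#endif
```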
+#if CUDA_VERSION >= 11000 || CUDART_VERSION >= 11000 + + +// Check `if constexpr` is available. + +// GCC has been supported since 7.1 +#if defined(__GNUC__) && (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 1)) +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + +// clang has been supported since 5.0 +#if defined(__clang__) && (__clang_major__ >= 5) +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + + +// Visual Studio has been supported since Visual Studio 2019 (16.1.2) +#if defined(_MSC_VER) && _MSC_VER >= 1921 +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + + +// I confirmed that it works with Intel C++ Compiler 2021.1.2. It did not work with icc 19.0.1. +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER > 202101 || (__INTEL_COMPILER == 202101 && __INTEL_COMPILER_UPDATE >= 2)) +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + +#endif // CUDA_VERSION +#endif // __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh new file mode 100644 index 00000000000..3a21a3c7082 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh @@ -0,0 +1,227 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ +#define __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. +#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { +namespace WMMedianFloatSupporter { + +template +__global__ void iota_idx1(IdxT *idx_in_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + idx_in_cu[i] = i; +} + +template +__global__ void split_and_iota_idx(IdxT *idx_in_cu, const ValT* val_in_cu, ValT* val_out_cu, const IdxT hw) { + const size_t i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + + static_assert(is_same::value, ""); + // static_assert(2 <= CH_NUM && CH_NUM <= 4); + using SrcTU = std::conditional_t>; + + const SrcTU *src_u = (SrcTU*)val_in_cu; + const SrcTU src_uv = src_u[i]; + + if (CH_NUM >= 1) { // constexpr + val_out_cu[i] = src_uv.x; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 2) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.y; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 3) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.z; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 4) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.w; + idx_in_cu[i] = i; + } +} + +template +__global__ void set_wm_val_1(IdxT *wm_src_p, const IdxT *idx_out_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + const IdxT j = idx_out_cu[i]; + wm_src_p[j] = i; +} + +template +__global__ void set_wm_val(IdxT *wm_src_p, const IdxT *idx_out_cu, const IdxT hw, const IdxT buf_byte_div32) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + const size_t hwc = size_t(hw) * blockIdx.y; + const IdxT j = idx_out_cu[i + hwc]; + const size_t src_offset = buf_byte_div32 * 32ull / sizeof(IdxT) * blockIdx.y; + wm_src_p[src_offset + j] = i; +} + +template +__global__ void conv_res_cu(ValT *dst, const ValT *val_out_cu, const IdxT *res_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + + const IdxT r = res_cu[i]; + dst[i] = val_out_cu[r]; +} + +template +struct WMMedianFloatSupporter { + constexpr static int blockDim = 512; + int h = 0, w = 0; + int hw_bit_len = -1; + WMMedianFloatSupporter(){}; + WMMedianFloatSupporter(int h, int w) { reset(h, w); } + ~WMMedianFloatSupporter(){ + free(); + } + ValT *val_in_cu = nullptr; + IdxT *idx_in_cu = nullptr; + ValT *val_out_cu = nullptr; + IdxT *idx_out_cu = nullptr; + void *cub_temp_storage = nullptr; + size_t cub_temp_storage_bytes; + // set: val_in + // get: val_out + // 1ch + // [val_in][idx_in][val_out][idx_out][cub_temp] + // C ch + + // [val_in][......][......][......] 
+ // AaBbCcDd + // [^^^^^^][......][valin2][idxin2] + // ABCDabcd01230123 + + // [val0in][val1in][idx0in][idx1in][val0out][val1out][idx0out][idx1out][cub_temp][d_offsets] + + void reset(const int H, const int W) { + h = H; w = W; + free(); + } + void alloc(){ + const size_t hwc = size_t(CH_NUM) * h * w; + if (CH_NUM == 1) { // constexpr + cub::DeviceRadixSort::SortPairs( + nullptr, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hwc); + cudaMalloc(&val_in_cu, 2ull * hwc * (sizeof(ValT) + sizeof(IdxT)) + cub_temp_storage_bytes); + } else { + cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hwc, CH_NUM, (int*)nullptr, (int*)nullptr); + const size_t offsets_arr_size = (CH_NUM + 1) * sizeof(int); + cudaMalloc(&val_in_cu, 2ull * hwc * (sizeof(ValT) + sizeof(IdxT)) + cub_temp_storage_bytes + offsets_arr_size); + } + idx_in_cu = (IdxT*)(val_in_cu + hwc); + val_out_cu = (ValT*)(idx_in_cu + hwc); + idx_out_cu = (IdxT*)(val_out_cu + hwc); + int *d_offsets = (int*)(idx_out_cu + hwc); + cub_temp_storage = d_offsets + (CH_NUM + 1); + } + void free() { + if (val_in_cu != nullptr) { + cudaFree(val_in_cu); + } + } + void sort_and_set(IdxT *wm_src_p, const IdxT buf_byte_div32 = 0){ + const IdxT hw = h * w; + const size_t hwc = size_t(CH_NUM) * hw; + const dim3 gridDim((hw + blockDim - 1) / blockDim, CH_NUM); + + if (CH_NUM == 1) { // constexpr + iota_idx1<<>>(idx_in_cu, hw); + cub::DeviceRadixSort::SortPairs( + cub_temp_storage, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hw); + set_wm_val_1<<>>(wm_src_p, idx_out_cu, hw); + } else { + auto idx2 = idx_out_cu; + auto val2 = val_out_cu; + auto idx3 = idx_in_cu; + auto val3 = val_in_cu; + split_and_iota_idx<<>>(idx2, val_in_cu, val2, hw); + + int h_offsets[CH_NUM + 1]; + for (size_t i = 0; i <= CH_NUM; ++i) h_offsets[i] = i * hw; + int *d_offsets = (int*)(idx_out_cu + hwc); + cudaMemcpy(d_offsets, h_offsets, (CH_NUM + 1) * sizeof(int), cudaMemcpyHostToDevice); + + + cub::DeviceSegmentedRadixSort::SortPairs( + cub_temp_storage, cub_temp_storage_bytes, val2, val3, idx2, idx3, hwc, CH_NUM, d_offsets, d_offsets + 1); + set_wm_val<<>>(wm_src_p, idx3, hw, buf_byte_div32); + } + for(hw_bit_len = 1; ; ++hw_bit_len) { + if ((1ull << hw_bit_len) >= hw) { + break; + } + } + } + const ValT* get_res_table() const { + if (CH_NUM == 1) { // constexpr + return val_out_cu; + } else { + return val_in_cu; + } + } +}; +} // end namespace WMMedianFloatSupporter +} // end namespace wavelet_matrix_median + +}}} //end namespace cv::cuda::device +#endif +#endif // __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh new file mode 100644 index 00000000000..de59d75993a --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh @@ -0,0 +1,636 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_MULTI_CUH__ +#define __OPENCV_WAVELET_MATRIX_MULTI_CUH__ + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
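Illustrative sketch (not part of the patch): the up-sweep kernel below builds its per-warp bit counts and rank offsets out of CUB warp primitives (cub::WarpScan, cub::WarpReduce). A minimal standalone example of the cub::WarpScan::ExclusiveSum pattern it relies on; the kernel name and launch shape are made up.

#include <cub/warp/warp_scan.cuh>

// One warp per block; each lane contributes one value and receives the
// exclusive prefix sum over the lower lanes -- the same primitive the
// up-sweep kernel uses to turn per-lane popcounts into rank offsets.
// Launch as: warp_exclusive_sum_demo<<<1, 32>>>(d_in, d_out);
__global__ void warp_exclusive_sum_demo(const int* in, int* out)
{
    using WarpScan = cub::WarpScan<int>;
    __shared__ typename WarpScan::TempStorage temp_storage;

    const int lane = threadIdx.x;   // 0..31
    int value = in[lane];
    int prefix;
    WarpScan(temp_storage).ExclusiveSum(value, prefix);
    out[lane] = prefix;
}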
+#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +#include +#include +#include +#include +#include +#include "opencv2/core/cuda/warp_shuffle.hpp" + + +#include + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { + using std::vector; + using namespace std; + + template + constexpr T div_ceil(T a, T b) { + return (a + b - 1) / b; + } + + +template +__global__ void WaveletMatrixMultiCu4G_UpSweep_gpu(const SrcT mask, const uint16_t block_pair_num, const IdxType size_div_w, const SrcT* __restrict__ src, DstT* __restrict__ dst, BlockT* __restrict__ nbit_bp, const IdxType* __restrict__ nsum_zeros_buf, IdxType* __restrict__ nsum_zeros_buf2, const uint32_t bv_block_byte_div32, const uint32_t buf_byte_div32) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + src = (SrcT*)((uint8_t*)src + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + dst = (DstT*)((uint8_t*)dst + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_zeros_buf = (IdxType*)((uint8_t*)nsum_zeros_buf + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_zeros_buf2 = (IdxType*)((uint8_t*)nsum_zeros_buf2 + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nbit_bp = (BlockT*)((uint8_t*)nbit_bp + (size_t)blockIdx.y * (bv_block_byte_div32*32ull)); // TODO: rename + + using WarpScan = cub::WarpScan; + using WarpScanY = cub::WarpScan; + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + constexpr size_t shmem_size = sizeof(SrcT) * (ThreadsDimY * (WARP_SIZE - 1) * WARP_SIZE); + static_assert(SRCB_S == shmem_size, ""); + static_assert(SRCB_S + DSTB_S < 64 * 1024, ""); + + constexpr int DST_BUF_SIZE = DSTB_S; + constexpr int DST_BUF_NUM_PER_WARP = DST_BUF_SIZE / (ThreadsDimY * sizeof(DstT)); // [32k/32/2=512] [48k/8/1=6114] + constexpr int DST_BUF_NUM_PER_THREAD = DST_BUF_NUM_PER_WARP / WARP_SIZE; + static_assert(DST_BUF_NUM_PER_THREAD <= WARP_SIZE, ""); + + + extern __shared__ uint8_t shmem_base[]; + SrcT* __restrict__ src_val_cache = (SrcT*)shmem_base; + DstT* __restrict__ dst_buf = (DstT*)&src_val_cache[SRCB_S] + threadIdx.y * DST_BUF_NUM_PER_WARP; //[ThreadsDimY][DST_BUF_NUM_PER_WARP]; + + __shared__ uint4 nsum_count_sh[ThreadsDimY]; + __shared__ IdxType pre_sum_share[2]; + __shared__ IdxType warp_scan_sums[WARP_SIZE]; + __shared__ typename WarpScan::TempStorage s_scanStorage; + __shared__ typename WarpScanY::TempStorage s_scanStorage2; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + const IdxType size_div_warp = size_div_w * WORD_DIV_WARP; + const IdxType nsum = nbit_bp[size_div_w].nsum; + const IdxType nsum_offset = nsum_zeros_buf[blockIdx.x]; + + + IdxType nsum_idx0_org = nsum_offset; + IdxType nsum_idx1_org = (IdxType)blockIdx.x * block_pair_num * THREAD_PER_GRID + nsum - nsum_idx0_org; + nsum_idx0_org /= (IdxType)block_pair_num * ThreadsDimY * WARP_SIZE; + nsum_idx1_org /= (IdxType)block_pair_num * ThreadsDimY * WARP_SIZE; + const IdxType nsum_idx0_bound = (nsum_idx0_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + const IdxType 
nsum_idx1_bound = (nsum_idx1_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + uint4 nsum_count = make_uint4(0, 0, 0, 0); + + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + if (th_idx == 0) { + pre_sum_share[0] = nsum_offset; + } + + + for (IdxType ka = 0; ka < block_pair_num; ka += WARP_SIZE) { + const IdxType ibb = ((IdxType)blockIdx.x * block_pair_num + ka) * ThreadsDimY; + if (ibb >= size_div_warp) break; + WarpWT my_bits = 0; + SrcT first_val; + + const IdxType src_val_cache_offset = IdxType(threadIdx.y * (WARP_SIZE - 1) - 1) * WARP_SIZE + threadIdx.x; + for (IdxType kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + WarpWT bits; + const SrcT v = src[i * WARP_SIZE + threadIdx.x]; + if (kb == 0) { + first_val = v; + } else { + src_val_cache[src_val_cache_offset + kb * WARP_SIZE] = v; + } + if (v <= mask) { + bits = __activemask(); + } else { + bits = ~__activemask(); + } + if (threadIdx.x == kb) { + my_bits = bits; + } + } + IdxType t, c = __popc(my_bits); + WarpScan(s_scanStorage).ExclusiveSum(c, t); + + if (threadIdx.x == WARP_SIZE - 1) { + warp_scan_sums[threadIdx.y] = c + t; + } + __syncthreads(); + IdxType pre_sum = pre_sum_share[(ka & WARP_SIZE) > 0]; + IdxType s = threadIdx.x < ThreadsDimY ? warp_scan_sums[threadIdx.x] : 0; + WarpScanY(s_scanStorage2).ExclusiveSum(s, s); + + s = cv::cuda::device::shfl(s, threadIdx.y, WARP_SIZE); + + s += t + pre_sum; + if (th_idx == THREAD_PER_GRID - 1) { + pre_sum_share[(ka & WARP_SIZE) == 0] = s + c; + } + const IdxType bi = ibb + th_idx; + + if (bi < size_div_warp) { + static_assert(WORD_SIZE == 32, ""); + nbit_bp[bi] = BlockT{s, my_bits}; + } + if (mask == 0) continue; + + const SrcT mask_2 = mask >> 1; + SrcT vo = first_val; + for (IdxType j = 0, i = ibb + WARP_SIZE * threadIdx.y; j < WARP_SIZE;) { + if (i >= size_div_warp) break; + + + IdxType idx0_begin, idx0_num, idx1_offset, idx01_num, idx1_offset0; + if (DST_BUF_SIZE > 0) { // constexpr + IdxType idx1_begin, idx0_end; + const IdxType ib = ::min(size_div_warp, i + DST_BUF_NUM_PER_THREAD); + const IdxType jb = j + ib - i - 1; + idx0_begin = cv::cuda::device::shfl(s, j, WARP_SIZE); + idx1_begin = i * WARP_SIZE + nsum - idx0_begin; + idx0_end = cv::cuda::device::shfl(s + c, jb, WARP_SIZE); + + idx0_num = idx0_end - idx0_begin; + idx1_offset = idx1_begin - idx0_num; + idx01_num = (ib - i) * WARP_SIZE; + idx1_offset0 = nsum - idx1_begin + idx0_num; + } + constexpr int DST_LOOP_NUM = (DST_BUF_SIZE == 0 ? 
1 : DST_BUF_NUM_PER_THREAD); + for (IdxType kb = 0; kb < DST_LOOP_NUM; ++kb, ++j, ++i) { + if (i >= size_div_warp) break; + + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const IdxType e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + IdxType rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const IdxType idx0 = e_nsum + rank; + + DstT v = (DstT)vo; + IdxType idx; + IdxType buf_idx; + if (vo > mask) { // 1 + const IdxType ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx0; + v &= mask; + buf_idx = ij - idx0 + idx1_offset0; + } else { + idx = idx0; + buf_idx = idx0 - idx0_begin; + } + if (DST_BUF_SIZE == 0) { + dst[idx] = v; + } else { + dst_buf[buf_idx] = (DstT)v; + } + + if (v <= mask_2) { + if (vo <= mask) { + if (idx < nsum_idx0_bound) { + nsum_count.x++; + } else { + assert(idx < nsum_idx0_bound + block_pair_num * ThreadsDimY * WARP_SIZE); + nsum_count.y++; + } + } else { + if (idx < nsum_idx1_bound) { + nsum_count.z++; + } else { + assert(idx < nsum_idx1_bound + block_pair_num * ThreadsDimY * WARP_SIZE); + nsum_count.w++; + } + } + } + if (j == WARP_SIZE - 1) { j = WARP_SIZE; break; } + vo = src_val_cache[(threadIdx.y * (WARP_SIZE - 1) + j) * WARP_SIZE + threadIdx.x]; + } + if (DST_BUF_SIZE > 0) { // constexpr + for (IdxType j = threadIdx.x; (int)j < DST_BUF_NUM_PER_WARP; j += WARP_SIZE) { + if (j >= idx01_num) break; + IdxType idx; + if (j < idx0_num) { // 0 + idx = j + idx0_begin; + } else { // 1 + idx = j + idx1_offset; + } + dst[idx] = dst_buf[j]; + } + } + } + } + + if (blockIdx.x == gridDim.x - 1 && th_idx == 0) { + nbit_bp[size_div_warp / WORD_DIV_WARP].nsum = nsum; + } + if (mask == 0) return; + + nsum_count.x = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.x); + nsum_count.y = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.y); + nsum_count.z = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.z); + nsum_count.w = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.w); + if (threadIdx.x == 0) { + nsum_count_sh[threadIdx.y] = nsum_count; + } + __syncthreads(); + + + if (threadIdx.x < ThreadsDimY) { + nsum_count = nsum_count_sh[threadIdx.x]; + nsum_count.x = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.x); + nsum_count.y = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.y); + nsum_count.z = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.z); + nsum_count.w = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.w); + if (th_idx == 0) { + const IdxType nsum_idx0_org = nsum_idx0_bound / ((IdxType)block_pair_num * ThreadsDimY * WARP_SIZE); + const IdxType nsum_idx1_org = nsum_idx1_bound / ((IdxType)block_pair_num * ThreadsDimY * WARP_SIZE); + if (nsum_count.x > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx0_org - 1, nsum_count.x); + if (nsum_count.y > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx0_org - 0, nsum_count.y); + if (nsum_count.z > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx1_org - 1, nsum_count.z); + if (nsum_count.w > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx1_org - 0, nsum_count.w); + } + } +} + + +template +__global__ void WaveletMatrixMultiCu4G_ExclusiveSum(IdxType* __restrict__ nsum_scan_buf, IdxType* __restrict__ nsum_zeros_buf2, BlockT* __restrict__ nsum_p, const uint32_t buf_byte_div32, const uint32_t bv_block_byte_div32) { + + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + IdxType thread_data1; + IdxType thread_data2; + nsum_scan_buf = (IdxType*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.x * 
(buf_byte_div32*32ull)); + nsum_zeros_buf2 = (IdxType*)((uint8_t*)nsum_zeros_buf2 + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + + thread_data1 = nsum_scan_buf[threadIdx.x]; + BlockScan(temp_storage).ExclusiveSum(thread_data1, thread_data2); + + nsum_scan_buf[threadIdx.x] = thread_data2; + nsum_zeros_buf2[threadIdx.x] = 0; + + if (threadIdx.x == blockDim.x - 1) { + thread_data2 += thread_data1; + nsum_p = (BlockT*)((uint8_t*)nsum_p + (size_t)blockIdx.x * (bv_block_byte_div32*32ull)); + nsum_p->nsum = thread_data2; + } +} + + +template +__global__ void WaveletMatrixMultiCu4G_first_gpu(const SrcT mask, uint16_t block_pair_num, const IdxType size_div_warp, const SrcT* __restrict__ src, IdxType* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + src = (SrcT*)((uint8_t*)src + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + IdxType cs = 0; + IdxType ibb = (IdxType)blockIdx.x * block_pair_num * ThreadsDimY; + for (IdxType ka = 0; ka < block_pair_num; ka += WARP_SIZE, ibb += THREAD_PER_GRID) { + for (IdxType kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + const SrcT v = src[i * WARP_SIZE + threadIdx.x]; + if (v <= mask) { + ++cs; + } + } + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage s_reduceStorage; + IdxType reducedValue = BlockReduce(s_reduceStorage).Sum(cs); + + if (threadIdx.y == 0 && threadIdx.x == 0) { + nsum_scan_buf = (IdxType*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_scan_buf[blockIdx.x] = reducedValue; + } +} + + + +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_Z(uint32_t r) { + return (r >= MIN_DSTBUF_KB) ? r * 1024 : 0; +} + +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_B(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_Z((r + 1) / 2); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A16(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_B(r | (r >> 16)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A8(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A16(r | (r >> 8)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A4(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A8(r | (r >> 4)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A2(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A4(r | (r >> 2)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A1(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A2(r | (r >> 1)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb(int SRCB_S) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A1(SHMEM_USE_KB - (SRCB_S + 1023) / 1024); +} + +// template +// constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb(int SRCB_S) { +// uint32_t r = (SHMEM_USE_KB - (SRCB_S + 1023) / 1024); +// r |= r >> 1; +// r |= r >> 2; +// r |= r >> 4; +// r |= r >> 8; +// r |= r >> 16; +// r = (r + 1) / 2; +// return (r >= MIN_DSTBUF_KB) ? 
r * 1024 : 0; +// } + + +template +struct WaveletMatrixMultiCu4G { + static constexpr int MAX_BIT_LEN = 8 * sizeof(T); + + static constexpr uint32_t WSIZE = WORD_SIZE; + using T_Type = T; + static constexpr int WARP_SIZE = 32; + static constexpr int THREAD_PER_GRID = TH_NUM; + static constexpr int MAX_BLOCK_X = 1024; + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, "WORD SIZE must be 32 or 64"); + using WordT = typename std::conditional::type; + + static constexpr int SHMEM_USE_KB = 64; + + struct __align__(8) BLOCK32_T { uint32_t nsum; union { uint32_t nbit; uint32_t nbit_a[1];}; }; + struct __align__(4) BLOCK64_T { uint32_t nsum; union { uint64_t nbit; uint32_t nbit_a[2];}; }; + using BlockT = typename std::conditional::type; + + static constexpr int MIN_DSTBUF_KB = 4; + static constexpr int BLOCK_TYPE = 2; + using WarpWT = uint32_t; + static_assert(8 * sizeof(WarpWT) == WARP_SIZE, "bits of WarpWT must be WARP_SIZE"); + + IdxType size = 0; + int wm_num = 0; + +private: + T* src_cu = nullptr; + uint8_t* bv_block_nbit_and_nsum_base_cu = nullptr; + uint32_t bv_block_byte_div32; + uint32_t buf_byte_div32; +public: + size_t bv_block_len = 0; + IdxType bv_zeros[MAX_BIT_LEN]; + int bit_len = 0; + + WaveletMatrixMultiCu4G(IdxType _n = 0, int _bit_len = 0, int num = 0) { + reset(_n, _bit_len, num); + } + void reset(IdxType _n, int _bit_len, int _num) { + cudaError_t err; + assert(size == 0 && src_cu == nullptr && 0 <= _bit_len && _bit_len <= MAX_BIT_LEN); + bit_len = _bit_len; + wm_num = _num; + if (_n == 0 || wm_num == 0) return; + size = div_ceil(_n, WORD_SIZE) * WORD_SIZE; + bv_block_len = div_ceil(size, THREAD_PER_GRID) * THREAD_PER_GRID / WORD_SIZE + 1; + bv_block_len = div_ceil(bv_block_len, 8*2) * 8*2; + + const size_t bv_block_byte = (sizeof(BlockT)) * bit_len * bv_block_len; + if (bv_block_byte % 32 != 0) { printf("bv_block_byte not 32n!"); exit(-1); } + bv_block_byte_div32 = div_ceil(bv_block_byte, 32); + + err = cudaMalloc(&bv_block_nbit_and_nsum_base_cu, (size_t)(bv_block_byte_div32*32ull) * _num); + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); release(); return; } + + const uint16_t block_pair_num = get_block_pair_num(); + const IdxType nsum_scan_buf_len = div_ceil(size, (size_t)THREAD_PER_GRID * block_pair_num); + + const size_t buf_byte = sizeof(IdxType) * 2 * nsum_scan_buf_len + sizeof(T) * size * 2; + buf_byte_div32 = div_ceil(buf_byte, 32); + err = cudaMalloc(&src_cu, (size_t)(buf_byte_div32*32ull) * _num); + if (src_cu == nullptr) { printf("GPU Memory Alloc Error! 
%s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); release(); return; } + + } + void release() { + size = 0; + if (src_cu != nullptr) cudaFree(src_cu); + if (bv_block_nbit_and_nsum_base_cu != nullptr) cudaFree(bv_block_nbit_and_nsum_base_cu); + src_cu = nullptr; + bv_block_nbit_and_nsum_base_cu = nullptr; + } + ~WaveletMatrixMultiCu4G() { release(); } + + BlockT* get_bv_block_cu(int h) const { return (BlockT*)(bv_block_nbit_and_nsum_base_cu + (sizeof(BlockT)) * bv_block_len * h); } + BlockT* get_bv_block_cu(int h, int c) const { return (BlockT*)((uint8_t*)get_bv_block_cu(h) + (size_t)c * (bv_block_byte_div32*32ull)); } + uint64_t get_bv_block_byte() const { return (bv_block_byte_div32*32ull); } + + + T* get_src_p(int c) const { return src_cu + (size_t)(buf_byte_div32*32ull) / (sizeof(T)) * c; } + + uint16_t get_block_pair_num() const { + constexpr int x_chunk = 65536 / THREAD_PER_GRID / WARP_SIZE; // To make the pixels assigned per grid a multiple of 65536. + static_assert(x_chunk > 0, ""); + const uint64_t total_gridx = div_ceil(size, THREAD_PER_GRID * WARP_SIZE); + uint64_t block_pair_num_org = div_ceil(total_gridx, MAX_BLOCK_X); + if (block_pair_num_org <= x_chunk) { + block_pair_num_org--; + block_pair_num_org |= block_pair_num_org >> 1; + block_pair_num_org |= block_pair_num_org >> 2; + block_pair_num_org |= block_pair_num_org >> 4; + block_pair_num_org |= block_pair_num_org >> 8; + block_pair_num_org++; + } else { + block_pair_num_org = div_ceil(block_pair_num_org, x_chunk) * x_chunk; + } + block_pair_num_org *= WARP_SIZE; + + if (block_pair_num_org >= (1LL << (8 * sizeof(uint16_t)))) { printf("over block_pair_num %ld\n", block_pair_num_org); exit(1); } + return (uint16_t)block_pair_num_org; + } + std::pair get_nsum_buf_and_buf_byte() const { + IdxType* nsum_buf = (IdxType*)(src_cu + 2ull * size); + return { nsum_buf, (buf_byte_div32*32ull) }; + } + IdxType* get_nsum_buf(int c) const { + IdxType* nsum_buf = (IdxType*)(src_cu + 2ull * size); + return (IdxType*)((uint8_t*)nsum_buf + (size_t)(buf_byte_div32*32ull) * c); + } + uint64_t get_buf_byte() const { return (buf_byte_div32*32ull); } + uint32_t get_buf_byte_div32() const { return buf_byte_div32; } + + IdxType get_nsum_scan_buf_len(uint16_t block_pair_num) const { + return div_ceil(size, THREAD_PER_GRID * block_pair_num); + } + IdxType get_nsum_scan_buf_len() const { + const uint16_t block_pair_num = get_block_pair_num(); + return get_nsum_scan_buf_len(block_pair_num); + } + + // Set data in src_cu before calling (data will be destroyed). 
+ void construct(const cudaStream_t main_stream = 0, const bool run_first = true) { + assert(size > 0 && src_cu != nullptr); + if (size == 0 || wm_num == 0) return; + if (src_cu == nullptr) { printf("Build Error: memory not alloced."); return;} + + T mask = ((T)1 << bit_len) - 1; + + const uint16_t block_pair_num = get_block_pair_num(); + const int grid_x = div_ceil(size, THREAD_PER_GRID * block_pair_num); + if (grid_x > MAX_BLOCK_X) { printf("over grid_x %d\n", grid_x); exit(1); } + + + const dim3 grid(grid_x, wm_num); + const dim3 thread(WARP_SIZE, THREAD_PER_GRID / WARP_SIZE); + const IdxType size_div_w = size / WORD_SIZE; + const IdxType size_div_warp = size / WARP_SIZE; + assert(size % WARP_SIZE == 0); + constexpr int ThreadsDimY = THREAD_PER_GRID / WARP_SIZE; + + + const int nsum_scan_buf_len = get_nsum_scan_buf_len(block_pair_num); // same grid_x + + +#define CALC_SRCB_SIZE(SrcT) (sizeof(SrcT) * (ThreadsDimY * (WARP_SIZE - 1) * WARP_SIZE)) + constexpr int SRCB_S_T = CALC_SRCB_SIZE(T); + constexpr int SRCB_S_8 = CALC_SRCB_SIZE(uint8_t); +#undef CALC_SRCB_SIZE + constexpr int BLOCK_SHMEM_KB = SHMEM_USE_KB * THREAD_PER_GRID / 1024; + constexpr int DSTB_S_T = WaveletMatrixMultiCu4G_get_dstbuf_kb(SRCB_S_T); + constexpr int DSTB_S_8 = WaveletMatrixMultiCu4G_get_dstbuf_kb(SRCB_S_8); + static_assert(SHMEM_USE_KB >= 64 || THREAD_PER_GRID == 1024, "if SHMEM_USE_KB < 64, THREAD_PER_GRID must 1024"); + static_assert(SRCB_S_T + DSTB_S_T <= BLOCK_SHMEM_KB * 1024 && ((DSTB_S_T == 0 && SRCB_S_T + MIN_DSTBUF_KB * 1024> BLOCK_SHMEM_KB * 1024) || (DSTB_S_T >= MIN_DSTBUF_KB * 1024 && SRCB_S_T + DSTB_S_T * 2 > BLOCK_SHMEM_KB * 1024)), ""); + static_assert(SRCB_S_8 + DSTB_S_8 <= BLOCK_SHMEM_KB * 1024 && ((DSTB_S_8 == 0 && SRCB_S_8 + MIN_DSTBUF_KB * 1024> BLOCK_SHMEM_KB * 1024) || (DSTB_S_8 >= MIN_DSTBUF_KB * 1024 && SRCB_S_8 + DSTB_S_8 * 2 > BLOCK_SHMEM_KB * 1024)), ""); + + { using SrcT = T; using DstT = T; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_T + DSTB_S_T); + } { using SrcT = T; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_T + DSTB_S_T); + } { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_8 + DSTB_S_8); + } + + T* now_cu = src_cu; + T* nxt_cu = src_cu + size; + IdxType* nsum_zeros_buf = (IdxType*)(nxt_cu + size); + IdxType* nsum_zeros_buf2 = nsum_zeros_buf + nsum_scan_buf_len; + + const uint32_t nsum_pos = get_nsum_pos(); + + int h = bit_len - 1; + if (run_first) { + WaveletMatrixMultiCu4G_first_gpu <<>> (T(mask / 2), block_pair_num, size_div_warp, src_cu, nsum_zeros_buf, buf_byte_div32); + } + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf, nsum_zeros_buf2, get_bv_block_cu(h) + nsum_pos, buf_byte_div32, bv_block_byte_div32); + + for (; h > 8; --h) { + using SrcT = T; + using DstT = T; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, 
nsum_zeros_buf2); + swap(now_cu, nxt_cu); + } + if (h == 8 || (is_same::value && bit_len <= 8 && h == bit_len - 1)) { + using SrcT = T; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + if (h == 0) return; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, nsum_zeros_buf2); + swap(now_cu, nxt_cu); + --h; + } + + for (; h >= 0; --h) { + using SrcT = uint8_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, nsum_zeros_buf2); + swap(now_cu, nxt_cu); + } + } + + IdxType get_nsum_pos() const { + const IdxType size_div_w = size / WORD_SIZE; + return size_div_w; + } + IdxType get_bv_block_h_byte_div32() const { + return (bv_block_len * (sizeof(WordT) + sizeof(IdxType))) / 32u; + } +}; + +} // end namespace wavelet_median +}}} //end namespace cv::cuda::device + +#endif +#endif // __OPENCV_WAVELET_MATRIX_MULTI_CUH__ diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp index daab3acde10..2ae789c856d 100644 --- a/modules/cudafilters/src/filtering.cpp +++ b/modules/cudafilters/src/filtering.cpp @@ -72,6 +72,8 @@ Ptr cv::cuda::createColumnSumFilter(int, int, int, int, int, Scalar) { t Ptr cv::cuda::createMedianFilter(int srcType, int _windowSize, int _partitions){ throw_no_cuda(); return Ptr();} #else +#include +#include namespace { @@ -1047,12 +1049,20 @@ Ptr cv::cuda::createColumnSumFilter(int srcType, int dstType, int ksize, //////////////////////////////////////////////////////////////////////////////////////////////////// // Median Filter +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
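Illustrative sketch (not part of the patch): with the wavelet-matrix path enabled, cv::cuda::createMedianFilter() accepts 8U, 16U and 32F inputs with 1, 3 or 4 channels rather than CV_8UC1 only. A short example against the public API this file extends; the variable names are made up, and the partitions argument is left at its default because the wavelet-matrix path does not use it.

#include <opencv2/cudafilters.hpp>

void median_blur_hdr(const cv::cuda::GpuMat& src /* e.g. CV_16UC3 */, cv::cuda::GpuMat& dst)
{
    // 5x5 median; source types such as CV_16UC3 or CV_32FC4 are now accepted.
    cv::Ptr<cv::cuda::Filter> median = cv::cuda::createMedianFilter(src.type(), 5);
    median->apply(src, dst);
}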
+#include "cuda/wavelet_matrix_feature_support_checks.h" namespace cv { namespace cuda { namespace device { void medianFiltering_gpu(const PtrStepSzb src, PtrStepSzb dst, PtrStepSzi devHist, PtrStepSzi devCoarseHist,int kernel, int partitions, cudaStream_t stream); + +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); +#endif }}} namespace @@ -1074,7 +1084,15 @@ namespace MedianFilter::MedianFilter(int srcType, int _windowSize, int _partitions) : windowSize(_windowSize),partitions(_partitions) { - CV_Assert( srcType == CV_8UC1 ); +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 + || srcType == CV_16UC1 || srcType == CV_16UC3 || srcType == CV_16UC4 + || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4); +#else + if (srcType != CV_8UC1) { + CV_Error(Error::StsNotImplemented, "If CUDA version is below 10, only implementations that support CV_8UC1 are available"); + } +#endif CV_Assert(windowSize>=3); CV_Assert(_partitions>=1); @@ -1094,6 +1112,18 @@ namespace // Kernel needs to be half window size int kernel=windowSize/2; +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + const int depth = src.depth(); + if (depth == CV_8U) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else if (depth == CV_16U) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else if (depth == CV_32F) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else { + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F); + } +#else CV_Assert(kernel < src.rows); CV_Assert(kernel < src.cols); @@ -1107,6 +1137,7 @@ namespace devCoarseHist.setTo(0, _stream); medianFiltering_gpu(src,dst,devHist, devCoarseHist,kernel,partitions,StreamAccessor::getStream(_stream)); +# endif } } diff --git a/modules/cudafilters/test/test_filters.cpp b/modules/cudafilters/test/test_filters.cpp index bb235dad093..432b5d2a5ac 100644 --- a/modules/cudafilters/test/test_filters.cpp +++ b/modules/cudafilters/test/test_filters.cpp @@ -43,6 +43,7 @@ #include "test_precomp.hpp" #ifdef HAVE_CUDA +#include "../src/cuda/wavelet_matrix_feature_support_checks.h" namespace opencv_test { namespace { @@ -647,7 +648,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, MorphEx, testing::Combine( // Median -PARAM_TEST_CASE(Median, cv::cuda::DeviceInfo, cv::Size, MatDepth, KernelSize, UseRoi) +PARAM_TEST_CASE(Median, cv::cuda::DeviceInfo, cv::Size, MatType, KernelSize, UseRoi) { cv::cuda::DeviceInfo devInfo; cv::Size size; @@ -681,7 +682,7 @@ CUDA_TEST_P(Median, Accuracy) cv::Mat dst_gold; cv::medianBlur(src,dst_gold,kernel); - cv::Rect rect(kernel+1,0,src.cols-(2*kernel+1),src.rows); + cv::Rect rect(kernel/2, kernel/2, src.cols-(kernel-1), src.rows-(kernel-1)); cv::Mat dst_gold_no_border = dst_gold(rect); cv::cuda::GpuMat dst_no_border = cv::cuda::GpuMat(dst, rect); @@ -703,6 +704,17 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, Median, testing::Combine( WHOLE_SUBMAT) ); -}} // namespace +INSTANTIATE_TEST_CASE_P(CUDA_Filters_Median_HDR, Median, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values( + MatType(CV_8UC3), MatType(CV_8UC4), + MatType(CV_16U), MatType(CV_16UC3), MatType(CV_16UC4), + 
MatType(CV_32F), MatType(CV_32FC3), MatType(CV_32FC4)), + testing::Values(KernelSize(3), KernelSize(5)), + WHOLE_SUBMAT) + ); + +}} // namespace #endif // HAVE_CUDA diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index 177bf75b1ac..51a5ce1a83e 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -281,8 +281,13 @@ cv::Ptr cv::cuda::createCLAHE(double clipLimit, cv::Size tileGr namespace { +#if (CUDA_VERSION >= 12040) + typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, size_t* hpBufferSize); + typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], size_t* hpBufferSize); +#else typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize); typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize); +#endif template struct NppHistogramEvenFuncC1 { @@ -315,7 +320,11 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, levels, &buf_size); BufferPool pool(stream); @@ -349,7 +358,11 @@ namespace Npp32s* pHist[] = {hist[0].ptr(), hist[1].ptr(), hist[2].ptr(), hist[3].ptr()}; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, levels, &buf_size); BufferPool pool(stream); @@ -419,7 +432,11 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, levels.cols, &buf_size); BufferPool pool(stream); @@ -460,7 +477,11 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, nLevels, &buf_size); BufferPool pool(stream); diff --git a/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp index 151e949a617..5bd1737aa3c 100644 --- a/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp @@ -154,6 +154,17 @@ namespace block_reduce_detail val = smem[tid]; } + + // merge + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const tuple& smem, @@ -172,15 +183,6 @@ namespace block_reduce_detail For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); } - // merge - - template - __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) - { - T reg = smem[tid + delta]; - smem[tid] = val = op(val, reg); - } - template @@ -214,6 +216,41 @@ namespace block_reduce_detail } #endif +#else + template + __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& val, uint tid) + { + For<0, tuple_size >::value>::loadToSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& val, uint tid) + { + For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(const tuple& smem, const tuple& val, uint tid, uint delta, const tuple& op) + { + For<0, tuple_size >::value>::merge(smem, val, tid, delta, op); + } + + // mergeShfl + + template + __device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, 
const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + + template + __device__ __forceinline__ void mergeShfl(const tuple& val, uint delta, uint width, const tuple& op) + { + For<0, tuple_size >::value>::mergeShfl(val, delta, width, op); + } +#endif + // Generic template struct Generic diff --git a/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp b/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp index 4af834a446e..43876decc92 100644 --- a/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp +++ b/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp @@ -160,6 +160,7 @@ namespace block_reduce_key_val_detail data = smem[tid]; } +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const tuple& smem, @@ -241,6 +242,67 @@ namespace block_reduce_key_val_detail { For<0, tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); } +#else + template + __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& data, uint tid) + { + For<0, tuple_size >::value>::loadToSmem(smem, data, tid); + } + + template + __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& data, uint tid) + { + For<0, tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + // copyVals + + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta) + { + svals[tid] = val = svals[tid + delta]; + } + + template + __device__ __forceinline__ void copyVals(const tuple& svals, const tuple& val, uint tid, uint delta) + { + For<0, tuple_size >::value>::copy(svals, val, tid, delta); + } + + // merge + + template + __device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + + template + __device__ void merge(volatile K* skeys, K& key, const tuple& svals, const tuple& val, const Cmp& cmp, uint tid, uint delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + + template + __device__ __forceinline__ void merge(const tuple& skeys, const tuple& key, + const tuple& svals, const tuple& val, + const tuple& cmp, uint tid, uint delta) + { + For<0, tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } +#endif // Generic diff --git a/modules/cudev/include/opencv2/cudev/block/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/reduce.hpp index 06f59a16ae9..9dde278da84 100644 --- a/modules/cudev/include/opencv2/cudev/block/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/block/reduce.hpp @@ -51,6 +51,7 @@ #include "../warp/reduce.hpp" #include "detail/reduce.hpp" #include "detail/reduce_key_val.hpp" +#include namespace cv { namespace cudev { @@ -65,6 +66,7 @@ __device__ __forceinline__ void blockReduce(volatile T* smem, T& val, uint tid, block_reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); } +#if (CUDART_VERSION < 12040) template (skeys, key, svals, val, tid, cmp); } +#else + +template +__device__ __forceinline__ void blockReduce(const tuple& smem, + const tuple& val, + uint tid, + const tuple& op) +{ + block_reduce_detail::Dispatcher::reductor::template reduce&, const tuple&, const tuple&>(smem, val, tid, op); +} + +// blockReduceKeyVal + +template +__device__ __forceinline__ void blockReduceKeyVal(volatile K* 
skeys, K& key, volatile V* svals, V& val, uint tid, const Cmp& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); +} + +template +__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, const tuple& svals, const tuple& val, uint tid, const Cmp& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce&, const tuple&, const Cmp&>(skeys, key, svals, val, tid, cmp); +} + +template +__device__ __forceinline__ void blockReduceKeyVal(const tuple& skeys, const tuple& key, const tuple& svals, const tuple& val, uint tid, const tuple& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce< const tuple&, const tuple&, const tuple&, const tuple&, const tuple&>(skeys, key, svals, val, tid, cmp); +} + +#endif + //! @} }} diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp index 3f512060165..df8bed3a948 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp @@ -157,28 +157,47 @@ namespace grid_split_merge_detail template struct MergeImpl<2, Policy> { template - __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { mergeC2(get<0>(src), get<1>(src), dst, mask, rows, cols, stream); } + + template + __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + mergeC2(src[0], src[1], dst, mask, rows, cols, stream); + } + }; template struct MergeImpl<3, Policy> { template - __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { mergeC3(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream); } + + template + __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + mergeC3(src[0], src[1], src[2], dst, mask, rows, cols, stream); + } }; template struct MergeImpl<4, Policy> { template - __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { mergeC4(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream); } + + template + __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + mergeC4(src[0], src[1], src[2], src[3], dst, mask, rows, cols, stream); + } }; // split diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp index 557797d7c85..4e901ac751d 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp @@ -179,6 +179,23 @@ namespace grid_transform_detail dst(y, x) = saturate_cast(op(src1(y, x), src2(y, 
x))); } + // transformSimple, 2 outputs + // The overloads are added for polar_cart.cu to compute magnitude and phase with single call + // the previous implementation with touple causes cuda namespace clash. See https://github.com/opencv/opencv_contrib/issues/3690 + template + __global__ void transformSimple(const SrcPtr1 src1, const SrcPtr2 src2, GlobPtr dst1, GlobPtr dst2, + const BinOp1 op1, const BinOp2 op2, const MaskPtr mask, const int rows, const int cols) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= cols || y >= rows || !mask(y, x)) + return; + + dst1(y, x) = saturate_cast(op1(src1(y, x), src2(y, x))); + dst2(y, x) = saturate_cast(op2(src1(y, x), src2(y, x))); + } + // transformSmart template @@ -248,6 +265,52 @@ namespace grid_transform_detail } } + // transformSmart, 2 outputs + // The overloads are added for polar_cart.cu to compute magnitude and phase with single call + // the previous implementation with touple causes cuda namespace clash. See https://github.com/opencv/opencv_contrib/issues/3690 + template + __global__ void transformSmart(const GlobPtr src1_, const GlobPtr src2_, + GlobPtr dst1_, GlobPtr dst2_, + const BinOp1 op1, const BinOp2 op2, const MaskPtr mask, const int rows, const int cols) + { + typedef typename MakeVec::type read_type1; + typedef typename MakeVec::type read_type2; + typedef typename MakeVec::type write_type1; + typedef typename MakeVec::type write_type2; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x_shifted = x * SHIFT; + + if (y < rows) + { + const SrcType1* src1 = src1_.row(y); + const SrcType2* src2 = src2_.row(y); + DstType1* dst1 = dst1_.row(y); + DstType2* dst2 = dst2_.row(y); + + if (x_shifted + SHIFT - 1 < cols) + { + const read_type1 src1_n_el = ((const read_type1*)src1)[x]; + const read_type2 src2_n_el = ((const read_type2*)src2)[x]; + + OpUnroller::unroll(src1_n_el, src2_n_el, ((write_type1*)dst1)[x], op1, mask, x_shifted, y); + OpUnroller::unroll(src1_n_el, src2_n_el, ((write_type2*)dst2)[x], op2, mask, x_shifted, y); + } + else + { + for (int real_x = x_shifted; real_x < cols; ++real_x) + { + if (mask(y, real_x)) + { + dst1[real_x] = op1(src1[real_x], src2[real_x]); + dst2[real_x] = op2(src1[real_x], src2[real_x]); + } + } + } + } + } + // TransformDispatcher template struct TransformDispatcher; @@ -279,6 +342,20 @@ namespace grid_transform_detail if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } + + template + __host__ static void call(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + const dim3 block(Policy::block_size_x, Policy::block_size_y); + const dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); + + transformSimple<<>>(src1, src2, dst1, dst2, op1, op2, mask, rows, cols); + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } }; template struct TransformDispatcher @@ -336,6 +413,33 @@ namespace grid_transform_detail if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } + + template + __host__ static void call(const GlobPtr& src1, const GlobPtr& src2, + const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + if (Policy::shift == 1 || + 
!isAligned(src1.data, Policy::shift * sizeof(SrcType1)) || !isAligned(src1.step, Policy::shift * sizeof(SrcType1)) || + !isAligned(src2.data, Policy::shift * sizeof(SrcType2)) || !isAligned(src2.step, Policy::shift * sizeof(SrcType2)) || + !isAligned(dst1.data, Policy::shift * sizeof(DstType1)) || !isAligned(dst1.step, Policy::shift * sizeof(DstType1))|| + !isAligned(dst2.data, Policy::shift * sizeof(DstType2)) || !isAligned(dst2.step, Policy::shift * sizeof(DstType2)) + ) + { + TransformDispatcher::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + return; + } + + const dim3 block(Policy::block_size_x, Policy::block_size_y); + const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y)); + + transformSmart<<>>(src1, src2, dst1, dst2, op1, op2, mask, rows, cols); + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } + }; template @@ -350,6 +454,13 @@ namespace grid_transform_detail TransformDispatcher::call(src1, src2, dst, op, mask, rows, cols, stream); } + template + __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + TransformDispatcher::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + } + template __host__ void transform_unary(const GlobPtr& src, const GlobPtr& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { @@ -362,6 +473,15 @@ namespace grid_transform_detail TransformDispatcher::cn == 1 && VecTraits::cn == 1 && VecTraits::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream); } + template + __host__ void transform_binary(const GlobPtr& src1, const GlobPtr& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + TransformDispatcher::cn == 1 && VecTraits::cn == 1 && + VecTraits::cn == 1 && VecTraits::cn == 1 && + Policy::shift != 1, Policy>::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + } + // transform_tuple template struct Unroll diff --git a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp index 5c92a813ed8..115d8c55e46 100644 --- a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp @@ -72,11 +72,11 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_& dst, const Ma dst.create(rows, cols); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - shrinkPtr(mask), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); } template @@ -90,7 +90,7 @@ __host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz& dst, CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), shrinkPtr(dst), shrinkPtr(mask), rows, cols, @@ -107,11 +107,11 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_& dst, Stream& dst.create(rows, cols); - 
grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - WithOutMask(), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); } template @@ -124,13 +124,87 @@ __host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz& dst, CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - WithOutMask(), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, GpuMat_& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = getRows(src); + const int cols = getCols(src); + + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + dst.create(rows, cols); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, const GlobPtrSz& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); } +template +__host__ void gridMergeArray_(const std::array& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + dst.create(rows, cols); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + + +/////////////////////////////////////////////////////////////// + template __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -522,6 +596,30 @@ __host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz& dst, S gridMerge_(src, dst, stream); } +template +__host__ void gridMergeArray(const std::array& src, GpuMat_& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, mask, stream); +} + +template +__host__ void gridMerge(const std::array& src, const GlobPtrSz& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, mask, stream); +} + +template +__host__ void gridMerge(const std::array& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + 
gridMergeArray_(src, dst, stream); +} + +template +__host__ void gridMerge(const std::array& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, stream); +} + template __host__ void gridSplit(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) { diff --git a/modules/cudev/include/opencv2/cudev/grid/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/transform.hpp index 4f7d191e64b..f89cdf5d484 100644 --- a/modules/cudev/include/opencv2/cudev/grid/transform.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/transform.hpp @@ -121,6 +121,22 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, Gpu grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + dst1.create(rows, cols); + dst2.create(rows, cols); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -134,6 +150,22 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, con grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(dst1) == rows && getCols(dst1) == cols ); + CV_Assert( getRows(dst2) == rows && getCols(dst2) == cols ); + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); +} + + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst, const BinOp& op, Stream& stream = Stream::Null()) { @@ -147,6 +179,21 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, Gpu grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const BinOp1& op1, const BinOp2& op2, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + + dst1.create(rows, cols); + dst2.create(rows, cols); + + 
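// Editorial note: with both destinations allocated to the common source size, the call below hands
// (src1, src2, dst1, dst2, op1, op2) to the grid_transform_detail::transform_binary overload added
// earlier in this patch, so both functors are evaluated over the same source pair in one fused
// kernel launch instead of two separate grid transforms. A minimal sketch of the public overload,
// assuming cudev's plus/minus functors and float inputs already on the GPU:
//
//   GpuMat_<float> a, b, sum, diff;
//   gridTransformBinary(a, b, sum, diff, plus<float>(), minus<float>());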
grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const BinOp& op, Stream& stream = Stream::Null()) { @@ -159,6 +206,20 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, con grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const BinOp1& op1, const BinOp2& op2, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(dst1) == rows && getCols(dst1) == cols ); + CV_Assert( getRows(dst2) == rows && getCols(dst2) == cols ); + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -449,24 +510,54 @@ __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuM gridTransformBinary_(src1, src2, dst, op, mask, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const Op1& op1, const Op2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, mask, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, mask, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const Op1& op1, const Op2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, mask, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst, const Op& op, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, + GpuMat_& dst1, GpuMat_& dst2, + const Op1& op1, const Op2& op2, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const Op& op, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, + const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const Op1& op1, const Op2& op2, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, stream); +} + template __host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { diff 
--git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp index 2024a7e01a2..98c115fa1bf 100644 --- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp +++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp @@ -118,6 +118,18 @@ __host__ GlobPtrSz globPtr(const GpuMat& mat) return p; } +template +__host__ GlobPtrSz globPtr(const GpuMat_& mat) +{ + GlobPtrSz p; + p.data = (T*) mat.data; + p.step = mat.step; + p.rows = mat.rows; + p.cols = mat.cols; + return p; +} + + template struct PtrTraits< GlobPtrSz > : PtrTraitsBase, GlobPtr > { }; diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp index e68f4cf61f5..f60ab5d0eb7 100644 --- a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp +++ b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp @@ -49,6 +49,9 @@ #include "../common.hpp" #include "../util/tuple.hpp" #include "traits.hpp" +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) +#include +#endif namespace cv { namespace cudev { @@ -175,4 +178,27 @@ template struct PtrTraits< ZipPtrSz > : PtrTraitsBase }} +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template< class... Types > +struct tuple_size< cv::cudev::ZipPtr > > +: tuple_size > { }; + +template< class... Types > +struct tuple_size< cv::cudev::ZipPtrSz > > +: tuple_size > { }; + + +template +struct tuple_element > > +: tuple_element > { }; + +template +struct tuple_element > > +: tuple_element > { }; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif #endif diff --git a/modules/cudev/test/test_split_merge.cu b/modules/cudev/test/test_split_merge.cu index b25c8b96d6f..598b6b80ac2 100644 --- a/modules/cudev/test/test_split_merge.cu +++ b/modules/cudev/test/test_split_merge.cu @@ -70,7 +70,8 @@ public: GpuMat_ d_src2(src2); GpuMat_::type> dst; - gridMerge(zipPtr(d_src1, d_src2), dst); + std::array, 2 > d_src = {globPtr(d_src1), globPtr(d_src2)}; + gridMerge(d_src, dst); Mat dst_gold; Mat srcs[] = {src1, src2}; @@ -93,8 +94,10 @@ public: GpuMat_ d_src2(src2); GpuMat_ d_src3(src3); + std::array, 3 > d_src = {globPtr(d_src1), globPtr(d_src2), globPtr(d_src3)}; + GpuMat_::type> dst; - gridMerge(zipPtr(d_src1, d_src2, d_src3), dst); + gridMerge(d_src, dst); Mat dst_gold; Mat srcs[] = {src1, src2, src3}; diff --git a/modules/face/src/facemarkLBF.cpp b/modules/face/src/facemarkLBF.cpp index 1ca1eeb545e..e5c062d18e3 100644 --- a/modules/face/src/facemarkLBF.cpp +++ b/modules/face/src/facemarkLBF.cpp @@ -661,24 +661,22 @@ FacemarkLBFImpl::BBox::BBox(double _x, double _y, double w, double h) { // Project absolute shape to relative shape binding to this bbox Mat FacemarkLBFImpl::BBox::project(const Mat &shape) const { - Mat_ res(shape.rows, shape.cols); - const Mat_ &shape_ = (Mat_)shape; + Mat res(shape.rows, shape.cols, CV_64FC1); for (int i = 0; i < shape.rows; i++) { - res(i, 0) = (shape_(i, 0) - x_center) / x_scale; - res(i, 1) = (shape_(i, 1) - y_center) / y_scale; + res.at(i, 0) = (shape.at(i, 0) - x_center) / x_scale; + res.at(i, 1) = (shape.at(i, 1) - y_center) / y_scale; } - return std::move(res); + return res; } // Project relative shape to absolute shape binding to this bbox Mat FacemarkLBFImpl::BBox::reproject(const Mat &shape) const { - Mat_ res(shape.rows, shape.cols); - const Mat_ &shape_ = (Mat_)shape; + Mat res(shape.rows, shape.cols, 
CV_64FC1); for (int i = 0; i < shape.rows; i++) { - res(i, 0) = shape_(i, 0)*x_scale + x_center; - res(i, 1) = shape_(i, 1)*y_scale + y_center; + res.at(i, 0) = shape.at(i, 0)*x_scale + x_center; + res.at(i, 1) = shape.at(i, 1)*y_scale + y_center; } - return std::move(res); + return res; } Mat FacemarkLBFImpl::getMeanShape(std::vector >_shapes, std::vector &bboxes) { @@ -997,7 +995,7 @@ void FacemarkLBFImpl::RandomForest::train(std::vector &imgs, std::vector lbf_feat(1, landmark_n*trees_n); + Mat lbf_feat(1, landmark_n*trees_n, CV_32SC1); double scale; Mat_ rotate; calcSimilarityTransform(bbox.project(current_shape), mean_shape, scale, rotate); @@ -1036,10 +1034,10 @@ Mat FacemarkLBFImpl::RandomForest::generateLBF(Mat &img, Mat ¤t_shape, BBo idx = 2 * idx + 1; } } - lbf_feat(i*trees_n + j) = (i*trees_n + j)*base + code; + lbf_feat.at(i*trees_n + j) = (i*trees_n + j)*base + code; } } - return std::move(lbf_feat); + return lbf_feat; } void FacemarkLBFImpl::RandomForest::write(FileStorage fs, int k) { @@ -1365,7 +1363,7 @@ Mat FacemarkLBFImpl::Regressor::supportVectorRegression( Mat FacemarkLBFImpl::Regressor::globalRegressionPredict(const Mat &lbf, int stage) { const Mat_ &weight = (Mat_)gl_regression_weights[stage]; - Mat_ delta_shape(weight.rows / 2, 2); + Mat delta_shape(weight.rows / 2, 2, CV_64FC1); const double *w_ptr = NULL; const int *lbf_ptr = lbf.ptr(0); @@ -1374,14 +1372,14 @@ Mat FacemarkLBFImpl::Regressor::globalRegressionPredict(const Mat &lbf, int stag w_ptr = weight.ptr(2 * i); double y = 0; for (int j = 0; j < lbf.cols; j++) y += w_ptr[lbf_ptr[j]]; - delta_shape(i, 0) = y; + delta_shape.at(i, 0) = y; w_ptr = weight.ptr(2 * i + 1); y = 0; for (int j = 0; j < lbf.cols; j++) y += w_ptr[lbf_ptr[j]]; - delta_shape(i, 1) = y; + delta_shape.at(i, 1) = y; } - return std::move(delta_shape); + return delta_shape; } // Regressor::globalRegressionPredict Mat FacemarkLBFImpl::Regressor::predict(Mat &img, BBox &bbox) { diff --git a/modules/face/src/mace.cpp b/modules/face/src/mace.cpp index 2c09560bfe5..3b7a236828c 100644 --- a/modules/face/src/mace.cpp +++ b/modules/face/src/mace.cpp @@ -102,11 +102,11 @@ struct MACEImpl CV_FINAL : MACE { Mat complexInput; merge(input, 2, complexInput); - Mat_ dftImg(IMGSIZE*2, IMGSIZE*2, 0.0); + Mat dftImg(IMGSIZE*2, IMGSIZE*2, CV_64FC2, 0.0); complexInput.copyTo(dftImg(Rect(0,0,IMGSIZE,IMGSIZE))); dft(dftImg, dftImg); - return std::move(dftImg); + return dftImg; } diff --git a/modules/intensity_transform/src/bimef.cpp b/modules/intensity_transform/src/bimef.cpp index 1abbf0a3a26..26788918cac 100644 --- a/modules/intensity_transform/src/bimef.cpp +++ b/modules/intensity_transform/src/bimef.cpp @@ -226,15 +226,15 @@ static Mat solveLinearEquation(const Mat_& img, Mat_& W_h_, Mat_ tin(img_t.ptr(), img_t.rows*img_t.cols); Eigen::VectorXf x = cg.solve(tin); - Mat_ tout(img.rows, img.cols); - tout.forEach( + Mat tout(img.rows, img.cols, CV_32FC1); + tout.forEach( [&](float &pixel, const int * position) -> void { pixel = x(position[1]*img.rows + position[0]); } ); - return std::move(tout); + return tout; } static Mat_ tsmooth(const Mat_& src, float lambda=0.01f, float sigma=3.0f, float sharpness=0.001f) diff --git a/modules/mcc/test/test_mcc.cpp b/modules/mcc/test/test_mcc.cpp index 374b829b4b2..37bd1f11fc7 100644 --- a/modules/mcc/test/test_mcc.cpp +++ b/modules/mcc/test/test_mcc.cpp @@ -102,7 +102,9 @@ TEST(CV_mcc_ccm_test, detect_Macbeth) // check Macbeth corners vector corners = checker->getBox(); - EXPECT_MAT_NEAR(gold_corners, corners, 3.6); // diff 
3.57385 in ARM only + // diff 3.57385 corresponds to ARM v8 + // diff 4.37915 corresponds to Ubuntu 24.04 x86_64 configuration + EXPECT_MAT_NEAR(gold_corners, corners, 4.38); // read gold chartsRGB node = fs["chartsRGB"]; @@ -112,7 +114,7 @@ TEST(CV_mcc_ccm_test, detect_Macbeth) // check chartsRGB Mat chartsRGB = checker->getChartsRGB(); - EXPECT_MAT_NEAR(goldChartsRGB.col(1), chartsRGB.col(1), 0.25); // diff 0.240634 in ARM only + EXPECT_MAT_NEAR(goldChartsRGB.col(1), chartsRGB.col(1), 0.3); // diff 0.292077 on Ubuntu 20.04 ARM64 } TEST(CV_mcc_ccm_test, compute_ccm) diff --git a/modules/ovis/CMakeLists.txt b/modules/ovis/CMakeLists.txt index b9937d6b2c9..2ab6eed0b8b 100644 --- a/modules/ovis/CMakeLists.txt +++ b/modules/ovis/CMakeLists.txt @@ -24,7 +24,7 @@ ocv_glob_module_sources() ocv_module_include_directories() ocv_create_module() -ocv_add_samples(opencv_aruco) +ocv_add_samples(opencv_objdetect opencv_aruco) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-parameter) ocv_target_link_libraries(${the_module} ${OGRE_LIBRARIES}) diff --git a/modules/ovis/samples/aruco_ar_demo.cpp b/modules/ovis/samples/aruco_ar_demo.cpp index a49c6ac9516..b33c9cc5772 100644 --- a/modules/ovis/samples/aruco_ar_demo.cpp +++ b/modules/ovis/samples/aruco_ar_demo.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include @@ -24,10 +24,12 @@ int main() const double focal_length = 800.0; // aruco - Ptr adict = aruco::getPredefinedDictionary(aruco::DICT_4X4_50); - //Mat out_img; - //aruco::drawMarker(adict, 0, 400, out_img); - //imshow("marker", out_img); + aruco::Dictionary adict = aruco::getPredefinedDictionary(aruco::DICT_4X4_50); + + aruco::ArucoDetector detector(adict); + Mat out_img; + adict.generateImageMarker(0, 400, out_img); + imshow("marker", out_img); // random calibration data, your mileage may vary Mat1d cm = Mat1d::zeros(3, 3); // init empty matrix @@ -53,7 +55,7 @@ int main() while (ovis::waitKey(1) != KEY_ESCAPE) { cap.read(img); win->setBackground(img); - aruco::detectMarkers(img, adict, corners, ids); + detector.detectMarkers(img, corners, ids); waitKey(1); diff --git a/modules/ovis/samples/aruco_ar_demo.py b/modules/ovis/samples/aruco_ar_demo.py index 72aeeaebea3..5877d915d9e 100644 --- a/modules/ovis/samples/aruco_ar_demo.py +++ b/modules/ovis/samples/aruco_ar_demo.py @@ -3,7 +3,7 @@ # aruco adict = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_4X4_50) -cv.imshow("marker", cv.aruco.drawMarker(adict, 0, 400)) +cv.imshow("marker", adict.generateImageMarker(0, 400)) # random calibration data. your mileage may vary.
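# Editorial note (illustration, not part of the sample): with the object-based aruco API used in the
# C++ demo above, marker detection in Python follows the same pattern, e.g.
#   detector = cv.aruco.ArucoDetector(adict)
#   corners, ids, _rejected = detector.detectMarkers(img)
# assuming `img` is a frame captured elsewhere in the script.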
imsize = (800, 600) diff --git a/modules/ovis/src/ovis.cpp b/modules/ovis/src/ovis.cpp index c210faa6cd4..487d87ad2c5 100644 --- a/modules/ovis/src/ovis.cpp +++ b/modules/ovis/src/ovis.cpp @@ -588,6 +588,7 @@ class WindowSceneImpl : public WindowScene cam->setDebugDisplayEnabled(true); cam->setNearClipDistance(1e-9); cam->setFarClipDistance(zFar); + cam->getFrustumExtents(); // force update _setCameraIntrinsics(cam, K, imsize); diff --git a/modules/rgbd/test/test_colored_kinfu.cpp b/modules/rgbd/test/test_colored_kinfu.cpp index 9a4787e3b0a..b66d496987c 100644 --- a/modules/rgbd/test/test_colored_kinfu.cpp +++ b/modules/rgbd/test/test_colored_kinfu.cpp @@ -158,8 +158,8 @@ struct Scene { virtual ~Scene() {} static Ptr create(int nScene, Size sz, Matx33f _intr, float _depthFactor); - virtual Mat depth(Affine3f pose) = 0; - virtual Mat rgb(Affine3f pose) = 0; + virtual Mat_ depth(const Affine3f& pose) = 0; + virtual Mat_ rgb(const Affine3f& pose) = 0; virtual std::vector getPoses() = 0; }; @@ -198,7 +198,7 @@ struct CubeSpheresScene : Scene return res; } - Mat depth(Affine3f pose) override + Mat_ depth(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -206,10 +206,10 @@ struct CubeSpheresScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } - Mat rgb(Affine3f pose) override + Mat_ rgb(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -217,7 +217,7 @@ struct CubeSpheresScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderColorInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } std::vector getPoses() override @@ -305,7 +305,7 @@ struct RotatingScene : Scene return res; } - Mat depth(Affine3f pose) override + Mat_ depth(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -313,10 +313,10 @@ struct RotatingScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } - Mat rgb(Affine3f pose) override + Mat_ rgb(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -324,7 +324,7 @@ struct RotatingScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderColorInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } std::vector getPoses() override diff --git a/modules/rgbd/test/test_kinfu.cpp b/modules/rgbd/test/test_kinfu.cpp index a47d6b74a4c..03863b6ecbd 100644 --- a/modules/rgbd/test/test_kinfu.cpp +++ b/modules/rgbd/test/test_kinfu.cpp @@ -141,7 +141,7 @@ struct CubeSpheresScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return static_cast(frame); } std::vector getPoses() override @@ -237,7 +237,7 @@ struct RotatingScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return static_cast(frame); } std::vector getPoses() override diff --git a/modules/tracking/doc/tracking.bib b/modules/tracking/doc/tracking.bib index 78ce6c32fa1..ce49bb4026e 100644 --- a/modules/tracking/doc/tracking.bib +++ b/modules/tracking/doc/tracking.bib @@ -76,3 +76,43 @@ @Article{Lukezic_IJCV2018 journal={International Journal of Computer Vision}, year={2018}, } + +@article{chaumette:inria-00350283, + title={{Visual servo 
control, Part I: Basic approaches}}, + author={Chaumette, Fran{\c c}ois and Hutchinson, S.}, + url={https://inria.hal.science/inria-00350283}, + journal={{IEEE Robotics and Automation Magazine}}, + publisher={{Institute of Electrical and Electronics Engineers}}, + volume={13}, + number={4}, + pages={82-90}, + year={2006}, + pdf={https://inria.hal.science/inria-00350283/file/2006_ieee_ram_chaumette.pdf}, + hal_id={inria-00350283}, + hal_version={v1}, +} + +@article{chaumette:inria-00350638, + title={{Visual servo control, Part II: Advanced approaches}}, + author={Chaumette, Fran{\c c}ois and Hutchinson, S.}, + url={https://inria.hal.science/inria-00350638}, + journal={{IEEE Robotics and Automation Magazine}}, + publisher={{Institute of Electrical and Electronics Engineers}}, + volume={14}, + number={1}, + pages={109-118}, + year={2007}, + pdf={https://inria.hal.science/inria-00350638/file/2007_ieee_ram_chaumette.pdf}, + hal_id={inria-00350638}, + hal_version={v1}, +} + +@article{Hutchinson1996ATO, + title={A tutorial on visual servo control}, + author={Seth A. Hutchinson and Gregory Hager and Peter Corke}, + journal={IEEE Trans. Robotics Autom.}, + year={1996}, + volume={12}, + pages={651-670}, + url={https://api.semanticscholar.org/CorpusID:1814423} +} diff --git a/modules/tracking/include/opencv2/tracking/twist.hpp b/modules/tracking/include/opencv2/tracking/twist.hpp index 8d998beda33..1452a00cddd 100644 --- a/modules/tracking/include/opencv2/tracking/twist.hpp +++ b/modules/tracking/include/opencv2/tracking/twist.hpp @@ -16,7 +16,7 @@ inline namespace tracking * @brief Compute the camera twist from a set of 2D pixel locations, their * velocities, depth values and intrinsic parameters of the camera. The pixel * velocities are usually obtained from optical flow algorithms, both dense and - * sparse flow can be used to compute the flow between images and duv computed by + * sparse flow can be used to compute the flow between images and \p duv computed by * dividing the flow by the time interval between the images. * * @param uv 2xN matrix of 2D pixel locations @@ -30,9 +30,10 @@ CV_EXPORTS cv::Vec6d computeTwist(const cv::Mat& uv, const cv::Mat& duv, const c const cv::Mat& K); /** - * @brief Compute the interaction matrix for a set of 2D pixels. This is usually + * @brief Compute the interaction matrix ( @cite Hutchinson1996ATO @cite chaumette:inria-00350283 + * @cite chaumette:inria-00350638 ) for a set of 2D pixels. This is usually * used in visual servoing applications to command a robot to move at desired pixel - * locations/velocities. By inverting this matrix one can estimate camera spatial + * locations/velocities. By inverting this matrix, one can estimate camera spatial * velocity i.e., the twist. * * @param uv 2xN matrix of 2D pixel locations @@ -41,8 +42,8 @@ CV_EXPORTS cv::Vec6d computeTwist(const cv::Mat& uv, const cv::Mat& duv, const c * @param J 2Nx6 interaction matrix * */ -CV_EXPORTS void getInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K, - cv::Mat& J); +CV_EXPORTS void computeInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K, + cv::Mat& J); //! 
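// Editorial note on the formula behind computeInteractionMatrix (the standard point-feature
// interaction matrix from the cited visual-servoing references; this sketch is not part of the
// header): for an image point at normalized coordinates (x, y) with depth Z, the 2x6 block that
// gets stacked into J is
//
//   [ -1/Z    0     x/Z      x*y     -(1 + x^2)    y  ]
//   [   0   -1/Z    y/Z    1 + y^2      -x*y      -x  ]
//
// For example, x = y = 1 and Z = 2 give the rows (-0.5, 0, 0.5, 1, -2, 1) and
// (0, -0.5, 0.5, 2, -1, -1), matching the expected values in test_twist.cpp below.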
@} diff --git a/modules/tracking/src/twist.cpp b/modules/tracking/src/twist.cpp index 1ff84c42582..bad6661ff1c 100644 --- a/modules/tracking/src/twist.cpp +++ b/modules/tracking/src/twist.cpp @@ -9,7 +9,7 @@ namespace detail inline namespace tracking { -void getInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K, cv::Mat& J) +void computeInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K, cv::Mat& J) { CV_Assert(uv.cols == depths.cols); CV_Assert(depths.type() == CV_32F); @@ -64,7 +64,7 @@ cv::Vec6d computeTwist(const cv::Mat& uv, const cv::Mat& duv, const cv::Mat& dep CV_Assert(uv.cols * 2 == duv.rows); cv::Mat J; - getInteractionMatrix(uv, depths, K, J); + computeInteractionMatrix(uv, depths, K, J); cv::Mat Jinv; cv::invert(J, Jinv, cv::DECOMP_SVD); cv::Mat twist = Jinv * duv; diff --git a/modules/tracking/test/test_twist.cpp b/modules/tracking/test/test_twist.cpp index 3911f28aea8..f365d811848 100644 --- a/modules/tracking/test/test_twist.cpp +++ b/modules/tracking/test/test_twist.cpp @@ -39,7 +39,7 @@ TEST_F(TwistTest, TestInteractionMatrix) cv::Mat uv = cv::Mat(2, 1, CV_32F, {1.0f, 1.0f}); cv::Mat depth = cv::Mat(1, 1, CV_32F, {2.0f}); - getInteractionMatrix(uv, depth, K, J); + computeInteractionMatrix(uv, depth, K, J); ASSERT_EQ(J.cols, 6); ASSERT_EQ(J.rows, 2); float expected[2][6] = {{-0.5f, 0.0f, 0.5f, 1.0f, -2.0f, 1.0f}, @@ -87,7 +87,7 @@ TEST_F(TwistTest, TestComputeWithNonZeroPixelVelocities) float duv_data[] = {1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 4.0f}; cv::Mat duv = cv::Mat(6, 1, CV_32F, duv_data); - getInteractionMatrix(uv, depth, K, J); + computeInteractionMatrix(uv, depth, K, J); ASSERT_EQ(J.cols, 6); ASSERT_EQ(J.rows, 6); float expected_jac[6][6] = {{-1.0f, 0.0f, 1.0f, 1.0f, -2.0f, 1.0f}, diff --git a/modules/videostab/src/global_motion.cpp b/modules/videostab/src/global_motion.cpp index 5bef5b1e74f..eb42d6954d6 100644 --- a/modules/videostab/src/global_motion.cpp +++ b/modules/videostab/src/global_motion.cpp @@ -82,7 +82,7 @@ namespace videostab { // does isotropic normalization -static Mat normalizePoints(int npoints, Point2f *points) +static Mat_ normalizePoints(int npoints, Point2f *points) { float cx = 0.f, cy = 0.f; for (int i = 0; i < npoints; ++i) @@ -113,32 +113,32 @@ static Mat normalizePoints(int npoints, Point2f *points) T(0,0) = T(1,1) = s; T(0,2) = -cx*s; T(1,2) = -cy*s; - return std::move(T); + return T; } static Mat estimateGlobMotionLeastSquaresTranslation( int npoints, Point2f *points0, Point2f *points1, float *rmse) { - Mat_ M = Mat::eye(3, 3, CV_32F); + Mat M = Mat::eye(3, 3, CV_32FC1); for (int i = 0; i < npoints; ++i) { - M(0,2) += points1[i].x - points0[i].x; - M(1,2) += points1[i].y - points0[i].y; + M.at(0,2) += points1[i].x - points0[i].x; + M.at(1,2) += points1[i].y - points0[i].y; } - M(0,2) /= npoints; - M(1,2) /= npoints; + M.at(0,2) /= npoints; + M.at(1,2) /= npoints; if (rmse) { *rmse = 0; for (int i = 0; i < npoints; ++i) - *rmse += sqr(points1[i].x - points0[i].x - M(0,2)) + - sqr(points1[i].y - points0[i].y - M(1,2)); + *rmse += sqr(points1[i].x - points0[i].x - M.at(0,2)) + + sqr(points1[i].y - points0[i].y - M.at(1,2)); *rmse = std::sqrt(*rmse / npoints); } - return std::move(M); + return M; } @@ -194,16 +194,16 @@ static Mat estimateGlobMotionLeastSquaresRotation( // A*sin(alpha) + B*cos(alpha) = 0 float C = std::sqrt(A*A + B*B); - Mat_ M = Mat::eye(3, 3, CV_32F); + Mat M = Mat::eye(3, 3, CV_32F); if ( C != 0 ) { float sinAlpha = - B / C; float cosAlpha = A / C; - M(0,0) = cosAlpha; - 
M(1,1) = M(0,0); - M(0,1) = sinAlpha; - M(1,0) = - M(0,1); + M.at(0,0) = cosAlpha; + M.at(1,1) = cosAlpha; + M.at(0,1) = sinAlpha; + M.at(1,0) = - sinAlpha; } if (rmse) @@ -213,16 +213,16 @@ static Mat estimateGlobMotionLeastSquaresRotation( { p0 = points0[i]; p1 = points1[i]; - *rmse += sqr(p1.x - M(0,0)*p0.x - M(0,1)*p0.y) + - sqr(p1.y - M(1,0)*p0.x - M(1,1)*p0.y); + *rmse += sqr(p1.x - M.at(0,0)*p0.x - M.at(0,1)*p0.y) + + sqr(p1.y - M.at(1,0)*p0.x - M.at(1,1)*p0.y); } *rmse = std::sqrt(*rmse / npoints); } - return std::move(M); + return M; } -static Mat estimateGlobMotionLeastSquaresRigid( +static Mat estimateGlobMotionLeastSquaresRigid( int npoints, Point2f *points0, Point2f *points1, float *rmse) { Point2f mean0(0.f, 0.f); @@ -250,15 +250,15 @@ static Mat estimateGlobMotionLeastSquaresRigid( A(1,1) += pt1.y * pt0.y; } - Mat_ M = Mat::eye(3, 3, CV_32F); + Mat M = Mat::eye(3, 3, CV_32FC1); SVD svd(A); Mat_ R = svd.u * svd.vt; Mat tmp(M(Rect(0,0,2,2))); R.copyTo(tmp); - M(0,2) = mean1.x - R(0,0)*mean0.x - R(0,1)*mean0.y; - M(1,2) = mean1.y - R(1,0)*mean0.x - R(1,1)*mean0.y; + M.at(0,2) = mean1.x - R(0,0)*mean0.x - R(0,1)*mean0.y; + M.at(1,2) = mean1.y - R(1,0)*mean0.x - R(1,1)*mean0.y; if (rmse) { @@ -267,13 +267,13 @@ static Mat estimateGlobMotionLeastSquaresRigid( { pt0 = points0[i]; pt1 = points1[i]; - *rmse += sqr(pt1.x - M(0,0)*pt0.x - M(0,1)*pt0.y - M(0,2)) + - sqr(pt1.y - M(1,0)*pt0.x - M(1,1)*pt0.y - M(1,2)); + *rmse += sqr(pt1.x - M.at(0,0)*pt0.x - M.at(0,1)*pt0.y - M.at(0,2)) + + sqr(pt1.y - M.at(1,0)*pt0.x - M.at(1,1)*pt0.y - M.at(1,2)); } *rmse = std::sqrt(*rmse / npoints); } - return std::move(M); + return M; } @@ -404,7 +404,7 @@ Mat estimateGlobalMotionRansac( // best hypothesis std::vector bestIndices(params.size); - Mat_ bestM; + Mat bestM; int ninliersMax = -1; RNG rng(0); @@ -469,8 +469,8 @@ Mat estimateGlobalMotionRansac( { p0 = points0_[i]; p1 = points1_[i]; - x = bestM(0,0)*p0.x + bestM(0,1)*p0.y + bestM(0,2); - y = bestM(1,0)*p0.x + bestM(1,1)*p0.y + bestM(1,2); + x = bestM.at(0,0)*p0.x + bestM.at(0,1)*p0.y + bestM.at(0,2); + y = bestM.at(1,0)*p0.x + bestM.at(1,1)*p0.y + bestM.at(1,2); if (sqr(x - p1.x) + sqr(y - p1.y) < params.thresh * params.thresh) { subset0[j] = p0; @@ -484,7 +484,7 @@ Mat estimateGlobalMotionRansac( if (ninliers) *ninliers = ninliersMax; - return std::move(bestM); + return bestM; } @@ -505,7 +505,7 @@ Mat MotionEstimatorRansacL2::estimate(InputArray points0, InputArray points1, bo // find motion int ninliers = 0; - Mat_ M; + Mat M; if (motionModel() != MM_HOMOGRAPHY) M = estimateGlobalMotionRansac( @@ -527,7 +527,7 @@ Mat MotionEstimatorRansacL2::estimate(InputArray points0, InputArray points1, bo if (ok) *ok = false; } - return std::move(M); + return M; } @@ -675,13 +675,13 @@ FromFileMotionReader::FromFileMotionReader(const String &path) Mat FromFileMotionReader::estimate(const Mat &/*frame0*/, const Mat &/*frame1*/, bool *ok) { - Mat_ M(3, 3); + Mat M(3, 3, CV_32FC1); bool ok_; - file_ >> M(0,0) >> M(0,1) >> M(0,2) - >> M(1,0) >> M(1,1) >> M(1,2) - >> M(2,0) >> M(2,1) >> M(2,2) >> ok_; + file_ >> M.at(0,0) >> M.at(0,1) >> M.at(0,2) + >> M.at(1,0) >> M.at(1,1) >> M.at(1,2) + >> M.at(2,0) >> M.at(2,1) >> M.at(2,2) >> ok_; if (ok) *ok = ok_; - return std::move(M); + return M; } @@ -696,12 +696,13 @@ ToFileMotionWriter::ToFileMotionWriter(const String &path, Ptr M = motionEstimator_->estimate(frame0, frame1, &ok_); - file_ << M(0,0) << " " << M(0,1) << " " << M(0,2) << " " - << M(1,0) << " " << M(1,1) << " " << M(1,2) << " " - << M(2,0) << " " 
<< M(2,1) << " " << M(2,2) << " " << ok_ << std::endl; + Mat M = motionEstimator_->estimate(frame0, frame1, &ok_); + file_ << M.at(0,0) << " " << M.at(0,1) << " " << M.at(0,2) << " " + << M.at(1,0) << " " << M.at(1,1) << " " << M.at(1,2) << " " + << M.at(2,0) << " " << M.at(2,1) << " " << M.at(2,2) << " " + << ok_ << std::endl; if (ok) *ok = ok_; - return std::move(M); + return M; } diff --git a/modules/ximgproc/doc/ximgproc.bib b/modules/ximgproc/doc/ximgproc.bib index f081f54d3ce..279bac1d115 100644 --- a/modules/ximgproc/doc/ximgproc.bib +++ b/modules/ximgproc/doc/ximgproc.bib @@ -54,6 +54,13 @@ @incollection{Kaiming10 publisher={Springer} } +@article{Kaiming15, + title={Fast guided filter}, + author={He, Kaiming and Sun, Jian}, + journal={arXiv preprint arXiv:1505.00996}, + year={2015} +} + @inproceedings{Lee14, title={Outdoor place recognition in urban environments using straight lines}, author={Lee, Jin Han and Lee, Sehyung and Zhang, Guoxuan and Lim, Jongwoo and Chung, Wan Kyun and Suh, Il Hong}, diff --git a/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp b/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp index 82be7c71b7f..19b05451147 100644 --- a/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp +++ b/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp @@ -123,15 +123,15 @@ void dtFilter(InputArray guide, InputArray src, OutputArray dst, double sigmaSpa ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// -/** @brief Interface for realizations of Guided Filter. +/** @brief Interface for realizations of (Fast) Guided Filter. -For more details about this filter see @cite Kaiming10 . +For more details about this filter see @cite Kaiming10 @cite Kaiming15 . */ class CV_EXPORTS_W GuidedFilter : public Algorithm { public: - /** @brief Apply Guided Filter to the filtering image. + /** @brief Apply (Fast) Guided Filter to the filtering image. @param src filtering image with any numbers of channels. @@ -153,11 +153,14 @@ channels then only first 3 channels will be used. @param eps regularization term of Guided Filter. \f${eps}^2\f$ is similar to the sigma in the color space into bilateralFilter. -For more details about Guided Filter parameters, see the original article @cite Kaiming10 . +@param scale subsample factor of Fast Guided Filter, use a scale less than 1 to speeds up computation +with almost no visible degradation. (e.g. scale==0.5 shrinks the image by 2x inside the filter) + +For more details about (Fast) Guided Filter parameters, see the original articles @cite Kaiming10 @cite Kaiming15 . */ -CV_EXPORTS_W Ptr createGuidedFilter(InputArray guide, int radius, double eps); +CV_EXPORTS_W Ptr createGuidedFilter(InputArray guide, int radius, double eps, double scale = 1.0); -/** @brief Simple one-line Guided Filter call. +/** @brief Simple one-line (Fast) Guided Filter call. If you have multiple images to filter with the same guided image then use GuidedFilter interface to avoid extra computations on initialization stage. @@ -176,8 +179,11 @@ space into bilateralFilter. @param dDepth optional depth of the output image. +@param scale subsample factor of Fast Guided Filter, use a scale less than 1 to speeds up computation +with almost no visible degradation. (e.g. 
scale==0.5 shrinks the image by 2x inside the filter) + @sa bilateralFilter, dtFilter, amFilter */ -CV_EXPORTS_W void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth = -1); +CV_EXPORTS_W void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth = -1, double scale = 1.0); ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// diff --git a/modules/ximgproc/perf/perf_guided_filter.cpp b/modules/ximgproc/perf/perf_guided_filter.cpp index 9a0058a5505..7649acea9b3 100644 --- a/modules/ximgproc/perf/perf_guided_filter.cpp +++ b/modules/ximgproc/perf/perf_guided_filter.cpp @@ -7,11 +7,11 @@ namespace opencv_test { namespace { CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3); CV_ENUM(SrcTypes, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3); -typedef tuple GFParams; +typedef tuple GFParams; typedef TestBaseWithParam GuidedFilterPerfTest; -PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::all(), Values(sz1080p, sz2K)) ) +PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::all(), Values(sz1080p, sz2K), Values(1./1, 1./2, 1./3, 1./4)) ) { RNG rng(0); @@ -19,6 +19,7 @@ PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::al int guideType = get<0>(params); int srcType = get<1>(params); Size sz = get<2>(params); + double scale = get<3>(params); Mat guide(sz, guideType); Mat src(sz, srcType); @@ -30,7 +31,7 @@ PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::al { int radius = rng.uniform(5, 30); double eps = rng.uniform(0.1, 1e5); - guidedFilter(guide, src, dst, radius, eps); + guidedFilter(guide, src, dst, radius, eps, -1, scale); } SANITY_CHECK_NOTHING(); diff --git a/modules/ximgproc/src/guided_filter.cpp b/modules/ximgproc/src/guided_filter.cpp index 11fa0f6fcb0..9d8333aecba 100644 --- a/modules/ximgproc/src/guided_filter.cpp +++ b/modules/ximgproc/src/guided_filter.cpp @@ -128,7 +128,7 @@ class GuidedFilterImpl : public GuidedFilter { public: - static Ptr create(InputArray guide, int radius, double eps); + static Ptr create(InputArray guide, int radius, double eps, double scale); void filter(InputArray src, OutputArray dst, int dDepth = -1) CV_OVERRIDE; @@ -136,10 +136,13 @@ class GuidedFilterImpl : public GuidedFilter int radius; double eps; + double scale; int h, w; + int hOriginal, wOriginal; vector guideCn; vector guideCnMean; + vector guideCnOriginal; SymArray2D covarsInv; @@ -149,7 +152,7 @@ class GuidedFilterImpl : public GuidedFilter GuidedFilterImpl() {} - void init(InputArray guide, int radius, double eps); + void init(InputArray guide, int radius, double eps, double scale); void computeCovGuide(SymArray2D& covars); @@ -167,6 +170,16 @@ class GuidedFilterImpl : public GuidedFilter src.convertTo(dst, CV_32F); } + inline void subsample(Mat& src, Mat& dst) + { + resize(src, dst, Size(w, h), 0, 0, INTER_LINEAR); + } + + inline void upsample(Mat& src, Mat& dst) + { + resize(src, dst, Size(wOriginal, hOriginal), 0, 0, INTER_LINEAR); + } + private: /*Routines to parallelize boxFilter and convertTo*/ typedef void (GuidedFilterImpl::*TransformFunc)(Mat& src, Mat& dst); @@ -203,6 +216,20 @@ class GuidedFilterImpl : public GuidedFilter parallel_for_(pb.getRange(), pb); } + template + void parSubsample(V &src, V &dst) + { + GFTransform_ParBody pb(*this, src, dst, &GuidedFilterImpl::subsample); + 
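// Editorial note: this is the subsampling half of the Fast Guided Filter (@cite Kaiming15). With
// scale < 1 the guide and source channels are resized down to (w, h) = scale * (wOriginal, hOriginal),
// the box-filter statistics and the per-pixel alpha/beta coefficients are computed at the reduced
// size, and alpha/beta are upsampled (parUpsample below) before ApplyTransform runs over the
// full-resolution guide channels kept in guideCnOriginal. Illustrative call through the public API,
// assuming a guide/src pair of equal size:
//
//   cv::ximgproc::guidedFilter(guide, src, dst, /*radius=*/8, /*eps=*/1e-2, /*dDepth=*/-1, /*scale=*/0.5);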
parallel_for_(pb.getRange(), pb); + } + + template + void parUpsample(V &src, V &dst) + { + GFTransform_ParBody pb(*this, src, dst, &GuidedFilterImpl::upsample); + parallel_for_(pb.getRange(), pb); + } + private: /*Parallel body classes*/ inline void runParBody(const ParallelLoopBody& pb) @@ -582,7 +609,7 @@ void GuidedFilterImpl::ApplyTransform_ParBody::operator()(const Range& range) co { float *_g[4]; for (int gi = 0; gi < gf.gCnNum; gi++) - _g[gi] = gf.guideCn[gi].ptr(i); + _g[gi] = gf.guideCnOriginal[gi].ptr(i); float *betaDst, *g, *a; for (int si = 0; si < srcCnNum; si++) @@ -593,7 +620,7 @@ void GuidedFilterImpl::ApplyTransform_ParBody::operator()(const Range& range) co a = alpha[si][gi].ptr(i); g = _g[gi]; - add_mul(betaDst, a, g, gf.w); + add_mul(betaDst, a, g, gf.wOriginal); } } } @@ -666,28 +693,42 @@ void GuidedFilterImpl::getWalkPattern(int eid, int &cn1, int &cn2) cn2 = wdata[6 * 2 * (gCnNum-1) + 6 + eid]; } -Ptr GuidedFilterImpl::create(InputArray guide, int radius, double eps) +Ptr GuidedFilterImpl::create(InputArray guide, int radius, double eps, double scale) { GuidedFilterImpl *gf = new GuidedFilterImpl(); - gf->init(guide, radius, eps); + gf->init(guide, radius, eps, scale); return Ptr(gf); } -void GuidedFilterImpl::init(InputArray guide, int radius_, double eps_) +void GuidedFilterImpl::init(InputArray guide, int radius_, double eps_, double scale_) { CV_Assert( !guide.empty() && radius_ >= 0 && eps_ >= 0 ); CV_Assert( (guide.depth() == CV_32F || guide.depth() == CV_8U || guide.depth() == CV_16U) && (guide.channels() <= 3) ); + CV_Assert( scale_ <= 1.0 ); radius = radius_; eps = eps_; + scale = scale_; - splitFirstNChannels(guide, guideCn, 3); - gCnNum = (int)guideCn.size(); - h = guideCn[0].rows; - w = guideCn[0].cols; + splitFirstNChannels(guide, guideCnOriginal, 3); + gCnNum = (int)guideCnOriginal.size(); + hOriginal = guideCnOriginal[0].rows; + wOriginal = guideCnOriginal[0].cols; + h = int(hOriginal * scale); + w = int(wOriginal * scale); + + parConvertToWorkType(guideCnOriginal, guideCnOriginal); + if (scale < 1.0) + { + guideCn.resize(gCnNum); + parSubsample(guideCnOriginal, guideCn); + } + else + { + guideCn = guideCnOriginal; + } guideCnMean.resize(gCnNum); - parConvertToWorkType(guideCn, guideCn); parMeanFilter(guideCn, guideCnMean); SymArray2D covars; @@ -712,7 +753,7 @@ void GuidedFilterImpl::computeCovGuide(SymArray2D& covars) void GuidedFilterImpl::filter(InputArray src, OutputArray dst, int dDepth /*= -1*/) { CV_Assert( !src.empty() && (src.depth() == CV_32F || src.depth() == CV_8U) ); - if (src.rows() != h || src.cols() != w) + if (src.rows() != hOriginal || src.cols() != wOriginal) { CV_Error(Error::StsBadSize, "Size of filtering image must be equal to size of guide image"); return; @@ -725,6 +766,11 @@ void GuidedFilterImpl::filter(InputArray src, OutputArray dst, int dDepth /*= -1 vector& srcCnMean = srcCn; split(src, srcCn); + if (scale < 1.0) + { + parSubsample(srcCn, srcCn); + } + if (src.depth() != CV_32F) { parConvertToWorkType(srcCn, srcCn); @@ -749,7 +795,13 @@ void GuidedFilterImpl::filter(InputArray src, OutputArray dst, int dDepth /*= -1 parMeanFilter(beta, beta); parMeanFilter(alpha, alpha); - runParBody(ApplyTransform_ParBody(*this, alpha, beta)); + if (scale < 1.0) + { + parUpsample(beta, beta); + parUpsample(alpha, alpha); + } + + parallel_for_(Range(0, hOriginal), ApplyTransform_ParBody(*this, alpha, beta)); if (dDepth != CV_32F) { for (int i = 0; i < srcCnNum; i++) @@ -782,15 +834,15 @@ void 
GuidedFilterImpl::computeCovGuideAndSrc(vector& srcCn, vector& sr ////////////////////////////////////////////////////////////////////////// CV_EXPORTS_W -Ptr createGuidedFilter(InputArray guide, int radius, double eps) +Ptr createGuidedFilter(InputArray guide, int radius, double eps, double scale) { - return Ptr(GuidedFilterImpl::create(guide, radius, eps)); + return Ptr(GuidedFilterImpl::create(guide, radius, eps, scale)); } CV_EXPORTS_W -void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth) +void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth, double scale) { - Ptr gf = createGuidedFilter(guide, radius, eps); + Ptr gf = createGuidedFilter(guide, radius, eps, scale); gf->filter(src, dst, dDepth); } diff --git a/modules/ximgproc/src/thinning.cpp b/modules/ximgproc/src/thinning.cpp index 00017fe0acb..76e8ad15425 100644 --- a/modules/ximgproc/src/thinning.cpp +++ b/modules/ximgproc/src/thinning.cpp @@ -96,6 +96,10 @@ static void thinningIteration(Mat img, int iter, int thinningType){ Mat marker = Mat::zeros(img.size(), CV_8UC1); int rows = img.rows; int cols = img.cols; + marker.col(0).setTo(1); + marker.col(cols - 1).setTo(1); + marker.row(0).setTo(1); + marker.row(rows - 1).setTo(1); if(thinningType == THINNING_ZHANGSUEN){ marker.forEach([=](uchar& value, const int postion[]) { @@ -133,6 +137,7 @@ static void thinningIteration(Mat img, int iter, int thinningType){ //int m1 = iter == 0 ? (p2 * p4 * p6) : (p2 * p4 * p8); //int m2 = iter == 0 ? (p4 * p6 * p8) : (p2 * p6 * p8); //if (A == 1 && (B >= 2 && B <= 6) && m1 == 0 && m2 == 0) value = 0; + // else value = 1; }); } if(thinningType == THINNING_GUOHALL){ @@ -170,6 +175,7 @@ static void thinningIteration(Mat img, int iter, int thinningType){ //int N = N1 < N2 ? N1 : N2; //int m = iter == 0 ? 
((p6 | p7 | (!p9)) & p8) : ((p2 | p3 | (!p5)) & p4); //if ((C == 1) && ((N >= 2) && ((N <= 3)) & (m == 0))) value = 0; + // else value = 1; }); } @@ -183,16 +189,17 @@ void thinning(InputArray input, OutputArray output, int thinningType){ // Enforce the range of the input image to be in between 0 - 255 processed /= 255; - Mat prev = Mat::zeros(processed.size(), CV_8UC1); + Mat prev = processed.clone(); Mat diff; do { thinningIteration(processed, 0, thinningType); thinningIteration(processed, 1, thinningType); absdiff(processed, prev, diff); + if (!hasNonZero(diff)) break; processed.copyTo(prev); } - while (countNonZero(diff) > 0); + while (true); processed *= 255; diff --git a/modules/ximgproc/test/test_guided_filter.cpp b/modules/ximgproc/test/test_guided_filter.cpp index b8293da65d0..c2263ecdd3b 100644 --- a/modules/ximgproc/test/test_guided_filter.cpp +++ b/modules/ximgproc/test/test_guided_filter.cpp @@ -64,6 +64,16 @@ static Mat convertTypeAndSize(Mat src, int dstType, Size dstSize) return dst; } +static double laplacianVariance(Mat src) +{ + Mat laplacian; + Laplacian(src, laplacian, CV_64F); + Scalar mean, stddev; + meanStdDev(laplacian, mean, stddev); + double variance = stddev.val[0] * stddev.val[0]; + return variance; +} + class GuidedFilterRefImpl : public GuidedFilter { int height, width, rad, chNum; @@ -350,6 +360,46 @@ TEST_P(GuidedFilterTest, accuracy) } } +TEST_P(GuidedFilterTest, accuracyFastGuidedFilter) +{ + int radius = 8; + double eps = 1; + + GFParams params = GetParam(); + string guideFileName = get<1>(params); + string srcFileName = get<2>(params); + int guideCnNum = 3; + int srcCnNum = get<0>(params); + + Mat guide = imread(getOpenCVExtraDir() + guideFileName); + Mat src = imread(getOpenCVExtraDir() + srcFileName); + ASSERT_TRUE(!guide.empty() && !src.empty()); + + Size dstSize(guide.cols, guide.rows); + guide = convertTypeAndSize(guide, CV_MAKE_TYPE(guide.depth(), guideCnNum), dstSize); + src = convertTypeAndSize(src, CV_MAKE_TYPE(src.depth(), srcCnNum), dstSize); + Mat outputRef; + ximgproc::guidedFilter(guide, src, outputRef, radius, eps); + + for (double scale : {1./2, 1./3, 1./4}) { + Mat outputFastGuidedFilter; + ximgproc::guidedFilter(guide, src, outputFastGuidedFilter, radius, eps, -1, scale); + + Mat guideNaiveDownsampled, srcNaiveDownsampled, outputNaiveDownsampled; + resize(guide, guideNaiveDownsampled, {}, scale, scale, INTER_LINEAR); + resize(src, srcNaiveDownsampled, {}, scale, scale, INTER_LINEAR); + ximgproc::guidedFilter(guideNaiveDownsampled, srcNaiveDownsampled, outputNaiveDownsampled, radius, eps); + resize(outputNaiveDownsampled, outputNaiveDownsampled, dstSize, 0, 0, INTER_LINEAR); + + double laplacianVarianceFastGuidedFilter = laplacianVariance(outputFastGuidedFilter); + double laplacianVarianceNaiveDownsampled = laplacianVariance(outputNaiveDownsampled); + EXPECT_GT(laplacianVarianceFastGuidedFilter, laplacianVarianceNaiveDownsampled); + + double normL2 = cv::norm(outputFastGuidedFilter, outputRef, NORM_L2) / guide.total(); + EXPECT_LE(normL2, 1.0/48.0/scale); + } +} + TEST_P(GuidedFilterTest, smallParamsIssue) { GFParams params = GetParam(); diff --git a/modules/ximgproc/test/test_sparse_match_interpolator.cpp b/modules/ximgproc/test/test_sparse_match_interpolator.cpp index 261d6109bf9..f53a5add027 100644 --- a/modules/ximgproc/test/test_sparse_match_interpolator.cpp +++ b/modules/ximgproc/test/test_sparse_match_interpolator.cpp @@ -17,22 +17,22 @@ Mat readOpticalFlow( const String& path ) // CV_Assert(sizeof(float) == 4); //FIXME: ensure 
right sizes of int and float - here and in writeOpticalFlow() - Mat_ flow; + Mat flow; std::ifstream file(path.c_str(), std::ios_base::binary); if ( !file.good() ) - return std::move(flow); // no file - return empty matrix + return flow; // no file - return empty matrix float tag; file.read((char*) &tag, sizeof(float)); if ( tag != FLOW_TAG_FLOAT ) - return std::move(flow); + return flow; int width, height; file.read((char*) &width, 4); file.read((char*) &height, 4); - flow.create(height, width); + flow.create(height, width, CV_32FC2); for ( int i = 0; i < flow.rows; ++i ) { @@ -44,14 +44,14 @@ Mat readOpticalFlow( const String& path ) if ( !file.good() ) { flow.release(); - return std::move(flow); + return flow; } - flow(i, j) = u; + flow.at(i, j) = u; } } file.close(); - return std::move(flow); + return flow; } CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3) diff --git a/modules/ximgproc/test/test_thinning.cpp b/modules/ximgproc/test/test_thinning.cpp index 733fe85d473..7d5c5ac480f 100644 --- a/modules/ximgproc/test/test_thinning.cpp +++ b/modules/ximgproc/test/test_thinning.cpp @@ -6,9 +6,12 @@ namespace opencv_test { namespace { -static int createTestImage(Mat& src) +static int createTestImage(Mat1b& src) { - src = Mat::zeros(Size(256, 256), CV_8UC1); + src = Mat1b::zeros(Size(256, 256)); + // Create a corner point that should not be affected. + src(0, 0) = 255; + for (int x = 50; x < src.cols - 50; x += 50) { cv::circle(src, Point(x, x/2), 30 + x/2, Scalar(255), 5); @@ -20,13 +23,14 @@ static int createTestImage(Mat& src) TEST(ximgproc_Thinning, simple_ZHANGSUEN) { - Mat src; + Mat1b src; int src_pixels = createTestImage(src); - Mat dst; + Mat1b dst; thinning(src, dst, THINNING_ZHANGSUEN); int dst_pixels = countNonZero(dst); EXPECT_LE(dst_pixels, src_pixels); + EXPECT_EQ(dst(0, 0), 255); #if 0 imshow("src", src); imshow("dst", dst); waitKey(); @@ -35,13 +39,14 @@ TEST(ximgproc_Thinning, simple_ZHANGSUEN) TEST(ximgproc_Thinning, simple_GUOHALL) { - Mat src; + Mat1b src; int src_pixels = createTestImage(src); - Mat dst; + Mat1b dst; thinning(src, dst, THINNING_GUOHALL); int dst_pixels = countNonZero(dst); EXPECT_LE(dst_pixels, src_pixels); + EXPECT_EQ(dst(0, 0), 255); #if 0 imshow("src", src); imshow("dst", dst); waitKey();
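// Editorial note on the thinning changes above (sketch, not part of the tests): the marker's border
// rows and columns are now pre-set to 1, so border pixels, such as the corner point added to the test
// image, survive every iteration, which is what the new dst(0, 0) == 255 checks verify. The main loop
// also starts from prev = processed.clone() and exits as soon as absdiff() between two passes has no
// nonzero element (hasNonZero), instead of running countNonZero over the whole diff each pass.
// The public API is unchanged; an illustrative call:
//
//   cv::Mat1b src = ..., skeleton;
//   cv::ximgproc::thinning(src, skeleton, cv::ximgproc::THINNING_GUOHALL);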