From b603a10002b98a2f7d9132a0c8dc21fec3cc7339 Mon Sep 17 00:00:00 2001
From: Ray Wang
Date: Fri, 19 Jan 2024 13:58:56 +0800
Subject: [PATCH 01/12] implement fast guided filter

---
 modules/ximgproc/doc/ximgproc.bib             |  7 ++
 .../include/opencv2/ximgproc/edge_filter.hpp  | 20 +++--
 modules/ximgproc/perf/perf_guided_filter.cpp  |  7 +-
 modules/ximgproc/src/guided_filter.cpp        | 88 +++++++++++++++----
 modules/ximgproc/test/test_guided_filter.cpp  | 50 +++++++++++
 5 files changed, 144 insertions(+), 28 deletions(-)

diff --git a/modules/ximgproc/doc/ximgproc.bib b/modules/ximgproc/doc/ximgproc.bib
index f081f54d3ce..279bac1d115 100644
--- a/modules/ximgproc/doc/ximgproc.bib
+++ b/modules/ximgproc/doc/ximgproc.bib
@@ -54,6 +54,13 @@ @incollection{Kaiming10
   publisher={Springer}
 }
 
+@article{Kaiming15,
+  title={Fast guided filter},
+  author={He, Kaiming and Sun, Jian},
+  journal={arXiv preprint arXiv:1505.00996},
+  year={2015}
+}
+
 @inproceedings{Lee14,
   title={Outdoor place recognition in urban environments using straight lines},
   author={Lee, Jin Han and Lee, Sehyung and Zhang, Guoxuan and Lim, Jongwoo and Chung, Wan Kyun and Suh, Il Hong},
diff --git a/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp b/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp
index 82be7c71b7f..19b05451147 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/edge_filter.hpp
@@ -123,15 +123,15 @@ void dtFilter(InputArray guide, InputArray src, OutputArray dst, double sigmaSpa
 //////////////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////
 
-/** @brief Interface for realizations of Guided Filter.
+/** @brief Interface for realizations of (Fast) Guided Filter.
 
-For more details about this filter see @cite Kaiming10 .
+For more details about this filter see @cite Kaiming10 @cite Kaiming15 .
 */
 class CV_EXPORTS_W GuidedFilter : public Algorithm
 {
 public:
-    /** @brief Apply Guided Filter to the filtering image.
+    /** @brief Apply (Fast) Guided Filter to the filtering image.
 
     @param src filtering image with any number of channels.
 
     @param dst output image.
 
     @param dDepth optional depth of the output image. dDepth can be set to -1, which will be equivalent
     to src.depth().
     */
     CV_WRAP virtual void filter(InputArray src, OutputArray dst, int dDepth = -1) = 0;
 };
 
 /** @brief Factory method, creates an instance of GuidedFilter and produces initialization routines.
 
 @param guide guided image (or array of images) with up to 3 channels; if it has more than 3 channels,
 then only the first 3 channels will be used.
 
 @param radius radius of Guided Filter.
 
 @param eps regularization term of Guided Filter. \f${eps}^2\f$ is similar to the sigma in the color
 space of bilateralFilter.
 
-For more details about Guided Filter parameters, see the original article @cite Kaiming10 .
+@param scale subsample factor of Fast Guided Filter; use a scale less than 1 to speed up computation
+with almost no visible degradation (e.g. scale==0.5 shrinks the image by 2x inside the filter).
+
+For more details about (Fast) Guided Filter parameters, see the original articles @cite Kaiming10 @cite Kaiming15 .
 */
-CV_EXPORTS_W Ptr<GuidedFilter> createGuidedFilter(InputArray guide, int radius, double eps);
+CV_EXPORTS_W Ptr<GuidedFilter> createGuidedFilter(InputArray guide, int radius, double eps, double scale = 1.0);
 
-/** @brief Simple one-line Guided Filter call.
+/** @brief Simple one-line (Fast) Guided Filter call.
 
 If you have multiple images to filter with the same guided image, then use the GuidedFilter interface
 to avoid extra computations on the initialization stage.
 
 @param guide guided image (or array of images) with up to 3 channels; if it has more than 3 channels,
 then only the first 3 channels will be used.
 
 @param src filtering image with any number of channels.
 
 @param dst output image.
 
 @param radius radius of Guided Filter.
 
 @param eps regularization term of Guided Filter. \f${eps}^2\f$ is similar to the sigma in the color
 space of bilateralFilter.
 
 @param dDepth optional depth of the output image.
 
+@param scale subsample factor of Fast Guided Filter; use a scale less than 1 to speed up computation
+with almost no visible degradation. (e.g.
scale==0.5 shrinks the image by 2x inside the filter) + @sa bilateralFilter, dtFilter, amFilter */ -CV_EXPORTS_W void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth = -1); +CV_EXPORTS_W void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth = -1, double scale = 1.0); ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// diff --git a/modules/ximgproc/perf/perf_guided_filter.cpp b/modules/ximgproc/perf/perf_guided_filter.cpp index 9a0058a5505..7649acea9b3 100644 --- a/modules/ximgproc/perf/perf_guided_filter.cpp +++ b/modules/ximgproc/perf/perf_guided_filter.cpp @@ -7,11 +7,11 @@ namespace opencv_test { namespace { CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3); CV_ENUM(SrcTypes, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3); -typedef tuple GFParams; +typedef tuple GFParams; typedef TestBaseWithParam GuidedFilterPerfTest; -PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::all(), Values(sz1080p, sz2K)) ) +PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::all(), Values(sz1080p, sz2K), Values(1./1, 1./2, 1./3, 1./4)) ) { RNG rng(0); @@ -19,6 +19,7 @@ PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::al int guideType = get<0>(params); int srcType = get<1>(params); Size sz = get<2>(params); + double scale = get<3>(params); Mat guide(sz, guideType); Mat src(sz, srcType); @@ -30,7 +31,7 @@ PERF_TEST_P( GuidedFilterPerfTest, perf, Combine(GuideTypes::all(), SrcTypes::al { int radius = rng.uniform(5, 30); double eps = rng.uniform(0.1, 1e5); - guidedFilter(guide, src, dst, radius, eps); + guidedFilter(guide, src, dst, radius, eps, -1, scale); } SANITY_CHECK_NOTHING(); diff --git a/modules/ximgproc/src/guided_filter.cpp b/modules/ximgproc/src/guided_filter.cpp index 11fa0f6fcb0..9d8333aecba 100644 --- a/modules/ximgproc/src/guided_filter.cpp +++ b/modules/ximgproc/src/guided_filter.cpp @@ -128,7 +128,7 @@ class GuidedFilterImpl : public GuidedFilter { public: - static Ptr create(InputArray guide, int radius, double eps); + static Ptr create(InputArray guide, int radius, double eps, double scale); void filter(InputArray src, OutputArray dst, int dDepth = -1) CV_OVERRIDE; @@ -136,10 +136,13 @@ class GuidedFilterImpl : public GuidedFilter int radius; double eps; + double scale; int h, w; + int hOriginal, wOriginal; vector guideCn; vector guideCnMean; + vector guideCnOriginal; SymArray2D covarsInv; @@ -149,7 +152,7 @@ class GuidedFilterImpl : public GuidedFilter GuidedFilterImpl() {} - void init(InputArray guide, int radius, double eps); + void init(InputArray guide, int radius, double eps, double scale); void computeCovGuide(SymArray2D& covars); @@ -167,6 +170,16 @@ class GuidedFilterImpl : public GuidedFilter src.convertTo(dst, CV_32F); } + inline void subsample(Mat& src, Mat& dst) + { + resize(src, dst, Size(w, h), 0, 0, INTER_LINEAR); + } + + inline void upsample(Mat& src, Mat& dst) + { + resize(src, dst, Size(wOriginal, hOriginal), 0, 0, INTER_LINEAR); + } + private: /*Routines to parallelize boxFilter and convertTo*/ typedef void (GuidedFilterImpl::*TransformFunc)(Mat& src, Mat& dst); @@ -203,6 +216,20 @@ class GuidedFilterImpl : public GuidedFilter parallel_for_(pb.getRange(), pb); } + template + void parSubsample(V &src, V &dst) + { + GFTransform_ParBody pb(*this, src, dst, &GuidedFilterImpl::subsample); + 
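+        // GFTransform_ParBody binds a member transform (here GuidedFilterImpl::subsample) and
+        // applies it to each channel Mat of src; getRange() spans the channel count, so the
+        // per-channel resizes below run in parallel.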
parallel_for_(pb.getRange(), pb); + } + + template + void parUpsample(V &src, V &dst) + { + GFTransform_ParBody pb(*this, src, dst, &GuidedFilterImpl::upsample); + parallel_for_(pb.getRange(), pb); + } + private: /*Parallel body classes*/ inline void runParBody(const ParallelLoopBody& pb) @@ -582,7 +609,7 @@ void GuidedFilterImpl::ApplyTransform_ParBody::operator()(const Range& range) co { float *_g[4]; for (int gi = 0; gi < gf.gCnNum; gi++) - _g[gi] = gf.guideCn[gi].ptr(i); + _g[gi] = gf.guideCnOriginal[gi].ptr(i); float *betaDst, *g, *a; for (int si = 0; si < srcCnNum; si++) @@ -593,7 +620,7 @@ void GuidedFilterImpl::ApplyTransform_ParBody::operator()(const Range& range) co a = alpha[si][gi].ptr(i); g = _g[gi]; - add_mul(betaDst, a, g, gf.w); + add_mul(betaDst, a, g, gf.wOriginal); } } } @@ -666,28 +693,42 @@ void GuidedFilterImpl::getWalkPattern(int eid, int &cn1, int &cn2) cn2 = wdata[6 * 2 * (gCnNum-1) + 6 + eid]; } -Ptr GuidedFilterImpl::create(InputArray guide, int radius, double eps) +Ptr GuidedFilterImpl::create(InputArray guide, int radius, double eps, double scale) { GuidedFilterImpl *gf = new GuidedFilterImpl(); - gf->init(guide, radius, eps); + gf->init(guide, radius, eps, scale); return Ptr(gf); } -void GuidedFilterImpl::init(InputArray guide, int radius_, double eps_) +void GuidedFilterImpl::init(InputArray guide, int radius_, double eps_, double scale_) { CV_Assert( !guide.empty() && radius_ >= 0 && eps_ >= 0 ); CV_Assert( (guide.depth() == CV_32F || guide.depth() == CV_8U || guide.depth() == CV_16U) && (guide.channels() <= 3) ); + CV_Assert( scale_ <= 1.0 ); radius = radius_; eps = eps_; + scale = scale_; - splitFirstNChannels(guide, guideCn, 3); - gCnNum = (int)guideCn.size(); - h = guideCn[0].rows; - w = guideCn[0].cols; + splitFirstNChannels(guide, guideCnOriginal, 3); + gCnNum = (int)guideCnOriginal.size(); + hOriginal = guideCnOriginal[0].rows; + wOriginal = guideCnOriginal[0].cols; + h = int(hOriginal * scale); + w = int(wOriginal * scale); + + parConvertToWorkType(guideCnOriginal, guideCnOriginal); + if (scale < 1.0) + { + guideCn.resize(gCnNum); + parSubsample(guideCnOriginal, guideCn); + } + else + { + guideCn = guideCnOriginal; + } guideCnMean.resize(gCnNum); - parConvertToWorkType(guideCn, guideCn); parMeanFilter(guideCn, guideCnMean); SymArray2D covars; @@ -712,7 +753,7 @@ void GuidedFilterImpl::computeCovGuide(SymArray2D& covars) void GuidedFilterImpl::filter(InputArray src, OutputArray dst, int dDepth /*= -1*/) { CV_Assert( !src.empty() && (src.depth() == CV_32F || src.depth() == CV_8U) ); - if (src.rows() != h || src.cols() != w) + if (src.rows() != hOriginal || src.cols() != wOriginal) { CV_Error(Error::StsBadSize, "Size of filtering image must be equal to size of guide image"); return; @@ -725,6 +766,11 @@ void GuidedFilterImpl::filter(InputArray src, OutputArray dst, int dDepth /*= -1 vector& srcCnMean = srcCn; split(src, srcCn); + if (scale < 1.0) + { + parSubsample(srcCn, srcCn); + } + if (src.depth() != CV_32F) { parConvertToWorkType(srcCn, srcCn); @@ -749,7 +795,13 @@ void GuidedFilterImpl::filter(InputArray src, OutputArray dst, int dDepth /*= -1 parMeanFilter(beta, beta); parMeanFilter(alpha, alpha); - runParBody(ApplyTransform_ParBody(*this, alpha, beta)); + if (scale < 1.0) + { + parUpsample(beta, beta); + parUpsample(alpha, alpha); + } + + parallel_for_(Range(0, hOriginal), ApplyTransform_ParBody(*this, alpha, beta)); if (dDepth != CV_32F) { for (int i = 0; i < srcCnNum; i++) @@ -782,15 +834,15 @@ void 
GuidedFilterImpl::computeCovGuideAndSrc(vector& srcCn, vector& sr ////////////////////////////////////////////////////////////////////////// CV_EXPORTS_W -Ptr createGuidedFilter(InputArray guide, int radius, double eps) +Ptr createGuidedFilter(InputArray guide, int radius, double eps, double scale) { - return Ptr(GuidedFilterImpl::create(guide, radius, eps)); + return Ptr(GuidedFilterImpl::create(guide, radius, eps, scale)); } CV_EXPORTS_W -void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth) +void guidedFilter(InputArray guide, InputArray src, OutputArray dst, int radius, double eps, int dDepth, double scale) { - Ptr gf = createGuidedFilter(guide, radius, eps); + Ptr gf = createGuidedFilter(guide, radius, eps, scale); gf->filter(src, dst, dDepth); } diff --git a/modules/ximgproc/test/test_guided_filter.cpp b/modules/ximgproc/test/test_guided_filter.cpp index b8293da65d0..c2263ecdd3b 100644 --- a/modules/ximgproc/test/test_guided_filter.cpp +++ b/modules/ximgproc/test/test_guided_filter.cpp @@ -64,6 +64,16 @@ static Mat convertTypeAndSize(Mat src, int dstType, Size dstSize) return dst; } +static double laplacianVariance(Mat src) +{ + Mat laplacian; + Laplacian(src, laplacian, CV_64F); + Scalar mean, stddev; + meanStdDev(laplacian, mean, stddev); + double variance = stddev.val[0] * stddev.val[0]; + return variance; +} + class GuidedFilterRefImpl : public GuidedFilter { int height, width, rad, chNum; @@ -350,6 +360,46 @@ TEST_P(GuidedFilterTest, accuracy) } } +TEST_P(GuidedFilterTest, accuracyFastGuidedFilter) +{ + int radius = 8; + double eps = 1; + + GFParams params = GetParam(); + string guideFileName = get<1>(params); + string srcFileName = get<2>(params); + int guideCnNum = 3; + int srcCnNum = get<0>(params); + + Mat guide = imread(getOpenCVExtraDir() + guideFileName); + Mat src = imread(getOpenCVExtraDir() + srcFileName); + ASSERT_TRUE(!guide.empty() && !src.empty()); + + Size dstSize(guide.cols, guide.rows); + guide = convertTypeAndSize(guide, CV_MAKE_TYPE(guide.depth(), guideCnNum), dstSize); + src = convertTypeAndSize(src, CV_MAKE_TYPE(src.depth(), srcCnNum), dstSize); + Mat outputRef; + ximgproc::guidedFilter(guide, src, outputRef, radius, eps); + + for (double scale : {1./2, 1./3, 1./4}) { + Mat outputFastGuidedFilter; + ximgproc::guidedFilter(guide, src, outputFastGuidedFilter, radius, eps, -1, scale); + + Mat guideNaiveDownsampled, srcNaiveDownsampled, outputNaiveDownsampled; + resize(guide, guideNaiveDownsampled, {}, scale, scale, INTER_LINEAR); + resize(src, srcNaiveDownsampled, {}, scale, scale, INTER_LINEAR); + ximgproc::guidedFilter(guideNaiveDownsampled, srcNaiveDownsampled, outputNaiveDownsampled, radius, eps); + resize(outputNaiveDownsampled, outputNaiveDownsampled, dstSize, 0, 0, INTER_LINEAR); + + double laplacianVarianceFastGuidedFilter = laplacianVariance(outputFastGuidedFilter); + double laplacianVarianceNaiveDownsampled = laplacianVariance(outputNaiveDownsampled); + EXPECT_GT(laplacianVarianceFastGuidedFilter, laplacianVarianceNaiveDownsampled); + + double normL2 = cv::norm(outputFastGuidedFilter, outputRef, NORM_L2) / guide.total(); + EXPECT_LE(normL2, 1.0/48.0/scale); + } +} + TEST_P(GuidedFilterTest, smallParamsIssue) { GFParams params = GetParam(); From fb69ae38cc774916e6132409c8245698ebe0d310 Mon Sep 17 00:00:00 2001 From: catree Date: Sun, 21 Apr 2024 13:57:39 +0200 Subject: [PATCH 02/12] Use "compute" instead of "get" term. Add some classical visual servo references. 
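A minimal usage sketch of the renamed API for reviewers (input shapes follow the Doxygen
comments touched below; the sample values and the cv::detail::tracking qualification are
illustrative assumptions, not part of this patch):

    #include <opencv2/tracking/twist.hpp>

    // two tracked pixels (2xN), stacked pixel velocities (2Nx1), depths (1xN), intrinsics K
    cv::Mat uv     = (cv::Mat_<float>(2, 2) << 320.f, 400.f,
                                               240.f, 260.f);
    cv::Mat duv    = (cv::Mat_<float>(4, 1) << 1.f, 0.5f, -0.3f, 0.2f);
    cv::Mat depths = (cv::Mat_<float>(1, 2) << 2.f, 2.5f);
    cv::Mat K      = (cv::Mat_<float>(3, 3) << 800.f,   0.f, 320.f,
                                                 0.f, 800.f, 240.f,
                                                 0.f,   0.f,   1.f);

    cv::Mat J;
    cv::detail::tracking::computeInteractionMatrix(uv, depths, K, J);         // J is 2Nx6
    cv::Vec6d twist = cv::detail::tracking::computeTwist(uv, duv, depths, K);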
--- modules/tracking/doc/tracking.bib | 40 +++++++++++++++++++ .../include/opencv2/tracking/twist.hpp | 11 ++--- modules/tracking/src/twist.cpp | 4 +- modules/tracking/test/test_twist.cpp | 4 +- 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/modules/tracking/doc/tracking.bib b/modules/tracking/doc/tracking.bib index 78ce6c32fa1..ce49bb4026e 100644 --- a/modules/tracking/doc/tracking.bib +++ b/modules/tracking/doc/tracking.bib @@ -76,3 +76,43 @@ @Article{Lukezic_IJCV2018 journal={International Journal of Computer Vision}, year={2018}, } + +@article{chaumette:inria-00350283, + title={{Visual servo control, Part I: Basic approaches}}, + author={Chaumette, Fran{\c c}ois and Hutchinson, S.}, + url={https://inria.hal.science/inria-00350283}, + journal={{IEEE Robotics and Automation Magazine}}, + publisher={{Institute of Electrical and Electronics Engineers}}, + volume={13}, + number={4}, + pages={82-90}, + year={2006}, + pdf={https://inria.hal.science/inria-00350283/file/2006_ieee_ram_chaumette.pdf}, + hal_id={inria-00350283}, + hal_version={v1}, +} + +@article{chaumette:inria-00350638, + title={{Visual servo control, Part II: Advanced approaches}}, + author={Chaumette, Fran{\c c}ois and Hutchinson, S.}, + url={https://inria.hal.science/inria-00350638}, + journal={{IEEE Robotics and Automation Magazine}}, + publisher={{Institute of Electrical and Electronics Engineers}}, + volume={14}, + number={1}, + pages={109-118}, + year={2007}, + pdf={https://inria.hal.science/inria-00350638/file/2007_ieee_ram_chaumette.pdf}, + hal_id={inria-00350638}, + hal_version={v1}, +} + +@article{Hutchinson1996ATO, + title={A tutorial on visual servo control}, + author={Seth A. Hutchinson and Gregory Hager and Peter Corke}, + journal={IEEE Trans. Robotics Autom.}, + year={1996}, + volume={12}, + pages={651-670}, + url={https://api.semanticscholar.org/CorpusID:1814423} +} diff --git a/modules/tracking/include/opencv2/tracking/twist.hpp b/modules/tracking/include/opencv2/tracking/twist.hpp index 8d998beda33..1452a00cddd 100644 --- a/modules/tracking/include/opencv2/tracking/twist.hpp +++ b/modules/tracking/include/opencv2/tracking/twist.hpp @@ -16,7 +16,7 @@ inline namespace tracking * @brief Compute the camera twist from a set of 2D pixel locations, their * velocities, depth values and intrinsic parameters of the camera. The pixel * velocities are usually obtained from optical flow algorithms, both dense and - * sparse flow can be used to compute the flow between images and duv computed by + * sparse flow can be used to compute the flow between images and \p duv computed by * dividing the flow by the time interval between the images. * * @param uv 2xN matrix of 2D pixel locations @@ -30,9 +30,10 @@ CV_EXPORTS cv::Vec6d computeTwist(const cv::Mat& uv, const cv::Mat& duv, const c const cv::Mat& K); /** - * @brief Compute the interaction matrix for a set of 2D pixels. This is usually + * @brief Compute the interaction matrix ( @cite Hutchinson1996ATO @cite chaumette:inria-00350283 + * @cite chaumette:inria-00350638 ) for a set of 2D pixels. This is usually * used in visual servoing applications to command a robot to move at desired pixel - * locations/velocities. By inverting this matrix one can estimate camera spatial + * locations/velocities. By inverting this matrix, one can estimate camera spatial * velocity i.e., the twist. 
 *
 * @param uv 2xN matrix of 2D pixel locations
 * @param depths 1xN matrix of depth values
 * @param K 3x3 camera intrinsics matrix
 * @param J 2Nx6 interaction matrix
 *
 */
-CV_EXPORTS void getInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K,
-                                     cv::Mat& J);
+CV_EXPORTS void computeInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K,
+                                         cv::Mat& J);
 
 //! @}
 
diff --git a/modules/tracking/src/twist.cpp b/modules/tracking/src/twist.cpp
index 1ff84c42582..bad6661ff1c 100644
--- a/modules/tracking/src/twist.cpp
+++ b/modules/tracking/src/twist.cpp
@@ -9,7 +9,7 @@ namespace detail
 inline namespace tracking
 {
 
-void getInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K, cv::Mat& J)
+void computeInteractionMatrix(const cv::Mat& uv, const cv::Mat& depths, const cv::Mat& K, cv::Mat& J)
 {
     CV_Assert(uv.cols == depths.cols);
     CV_Assert(depths.type() == CV_32F);
@@ -64,7 +64,7 @@ cv::Vec6d computeTwist(const cv::Mat& uv, const cv::Mat& duv, const cv::Mat& dep
     CV_Assert(uv.cols * 2 == duv.rows);
 
     cv::Mat J;
-    getInteractionMatrix(uv, depths, K, J);
+    computeInteractionMatrix(uv, depths, K, J);
     cv::Mat Jinv;
     cv::invert(J, Jinv, cv::DECOMP_SVD);
     cv::Mat twist = Jinv * duv;
diff --git a/modules/tracking/test/test_twist.cpp b/modules/tracking/test/test_twist.cpp
index 3911f28aea8..f365d811848 100644
--- a/modules/tracking/test/test_twist.cpp
+++ b/modules/tracking/test/test_twist.cpp
@@ -39,7 +39,7 @@ TEST_F(TwistTest, TestInteractionMatrix)
     cv::Mat uv = cv::Mat(2, 1, CV_32F, {1.0f, 1.0f});
     cv::Mat depth = cv::Mat(1, 1, CV_32F, {2.0f});
 
-    getInteractionMatrix(uv, depth, K, J);
+    computeInteractionMatrix(uv, depth, K, J);
     ASSERT_EQ(J.cols, 6);
     ASSERT_EQ(J.rows, 2);
     float expected[2][6] = {{-0.5f, 0.0f, 0.5f, 1.0f, -2.0f, 1.0f},
@@ -87,7 +87,7 @@ TEST_F(TwistTest, TestComputeWithNonZeroPixelVelocities)
     float duv_data[] = {1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 4.0f};
     cv::Mat duv = cv::Mat(6, 1, CV_32F, duv_data);
 
-    getInteractionMatrix(uv, depth, K, J);
+    computeInteractionMatrix(uv, depth, K, J);
     ASSERT_EQ(J.cols, 6);
     ASSERT_EQ(J.rows, 6);
     float expected_jac[6][6] = {{-1.0f, 0.0f, 1.0f, 1.0f, -2.0f, 1.0f},

From 4e766a039eda04b75f28f6905e4192d99b7874c7 Mon Sep 17 00:00:00 2001
From: sdy623
Date: Tue, 23 Apr 2024 21:43:40 +0900
Subject: [PATCH 03/12] cudaarithm: fix the compile failure of CUDA 12.4.x.
A slight API change of NPP nppiMeanStdDevGetBufferHostSize_8u_C1R The type of bufSize is size_t instead of int in CUDA 12.4.x --- modules/cudaarithm/src/reductions.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index cfadad648a9..b70a128558f 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -151,7 +151,12 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream) sz.width = gsrc.cols; sz.height = gsrc.rows; +#if (CUDA_VERSION >= 12040) + size_t bufSize; +#else int bufSize; +#endif + #if (CUDA_VERSION <= 4020) nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) ); #else @@ -162,7 +167,8 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream) #endif BufferPool pool(stream); - GpuMat buf = pool.getBuffer(1, bufSize, gsrc.type()); + CV_Assert(bufSize <= std::numeric_limits::max()); + GpuMat buf = pool.getBuffer(1, static_cast(bufSize), gsrc.type()); // detail: https://github.com/opencv/opencv/issues/11063 //NppStreamHandler h(StreamAccessor::getStream(stream)); @@ -227,7 +233,12 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre sz.width = gsrc.cols; sz.height = gsrc.rows; +#if (CUDA_VERSION >= 12040) + size_t bufSize; +#else int bufSize; +#endif + #if (CUDA_VERSION <= 4020) nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) ); #else @@ -238,7 +249,8 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre #endif BufferPool pool(stream); - GpuMat buf = pool.getBuffer(1, bufSize, gsrc.type()); + CV_Assert(bufSize <= std::numeric_limits::max()); + GpuMat buf = pool.getBuffer(1, static_cast(bufSize), gsrc.type()); if(gsrc.type() == CV_8UC1) nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr(), static_cast(gsrc.step), gmask.ptr(), static_cast(gmask.step), From 6b1faf0d9bc174c5fb40c4941bac615392436e81 Mon Sep 17 00:00:00 2001 From: Pavel Rojtberg Date: Wed, 15 May 2024 08:30:24 +0200 Subject: [PATCH 04/12] Merge pull request #3735 from paroj:ovisup ovis: force camera extent update to get correct bbox #3735 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
- [x] The feature is well documented and sample code can be built with the project CMake --- fixes #3732 --- modules/ovis/CMakeLists.txt | 2 +- modules/ovis/samples/aruco_ar_demo.cpp | 14 ++++++++------ modules/ovis/samples/aruco_ar_demo.py | 2 +- modules/ovis/src/ovis.cpp | 1 + 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/modules/ovis/CMakeLists.txt b/modules/ovis/CMakeLists.txt index 912b86a590d..aff20c2b63e 100644 --- a/modules/ovis/CMakeLists.txt +++ b/modules/ovis/CMakeLists.txt @@ -24,7 +24,7 @@ ocv_glob_module_sources() ocv_module_include_directories() ocv_create_module() -ocv_add_samples(opencv_aruco) +ocv_add_samples(opencv_objdetect opencv_aruco) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-parameter) ocv_target_link_libraries(${the_module} ${OGRE_LIBRARIES}) diff --git a/modules/ovis/samples/aruco_ar_demo.cpp b/modules/ovis/samples/aruco_ar_demo.cpp index 2398a7182a3..0b265beea84 100644 --- a/modules/ovis/samples/aruco_ar_demo.cpp +++ b/modules/ovis/samples/aruco_ar_demo.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include @@ -24,10 +24,12 @@ int main() const double focal_length = 800.0; // aruco - Ptr adict = aruco::getPredefinedDictionary(aruco::DICT_4X4_50); - //Mat out_img; - //aruco::drawMarker(adict, 0, 400, out_img); - //imshow("marker", out_img); + aruco::Dictionary adict = aruco::getPredefinedDictionary(aruco::DICT_4X4_50); + + aruco::ArucoDetector detector(adict); + Mat out_img; + adict.generateImageMarker(0, 400, out_img); + imshow("marker", out_img); // random calibration data, your mileage may vary Mat1d cm = Mat1d::zeros(3, 3); // init empty matrix @@ -53,7 +55,7 @@ int main() while (ovis::waitKey(1) != KEY_ESCAPE) { cap.read(img); win->setBackground(img); - aruco::detectMarkers(img, adict, corners, ids); + detector.detectMarkers(img, corners, ids); waitKey(1); diff --git a/modules/ovis/samples/aruco_ar_demo.py b/modules/ovis/samples/aruco_ar_demo.py index 72aeeaebea3..5877d915d9e 100644 --- a/modules/ovis/samples/aruco_ar_demo.py +++ b/modules/ovis/samples/aruco_ar_demo.py @@ -3,7 +3,7 @@ # aruco adict = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_4X4_50) -cv.imshow("marker", cv.aruco.drawMarker(adict, 0, 400)) +cv.imshow("marker", adict.generateImageMarker(0, 400)) # random calibration data. your mileage may vary. imsize = (800, 600) diff --git a/modules/ovis/src/ovis.cpp b/modules/ovis/src/ovis.cpp index 8c0e3431e4c..38dfcbc2ee0 100644 --- a/modules/ovis/src/ovis.cpp +++ b/modules/ovis/src/ovis.cpp @@ -588,6 +588,7 @@ class WindowSceneImpl : public WindowScene cam->setDebugDisplayEnabled(true); cam->setNearClipDistance(1e-9); cam->setFarClipDistance(zFar); + cam->getFrustumExtents(); // force update _setCameraIntrinsics(cam, K, imsize); From cb08ac6b2ea890e3355c27caa782d3bfb1a7da69 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov <2536374+asmorkalov@users.noreply.github.com> Date: Wed, 15 May 2024 14:20:55 +0300 Subject: [PATCH 05/12] Merge pull request #3734 from asmorkalov:as/std_move_warning Fixed Wredundant-move produced by GCC 13.2 (Ubuntu 24.04). #3734 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. 
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- modules/face/src/facemarkLBF.cpp | 32 ++++--- modules/face/src/mace.cpp | 4 +- modules/mcc/test/test_mcc.cpp | 6 +- modules/rgbd/perf/perf_tsdf.cpp | 6 +- modules/rgbd/test/ocl/test_tsdf.cpp | 2 +- modules/rgbd/test/test_colored_kinfu.cpp | 20 ++--- modules/rgbd/test/test_kinfu.cpp | 4 +- modules/rgbd/test/test_tsdf.cpp | 2 +- modules/videostab/src/global_motion.cpp | 83 ++++++++++--------- .../test/test_sparse_match_interpolator.cpp | 14 ++-- 10 files changed, 87 insertions(+), 86 deletions(-) diff --git a/modules/face/src/facemarkLBF.cpp b/modules/face/src/facemarkLBF.cpp index df58c6319b7..49cd8f63fea 100644 --- a/modules/face/src/facemarkLBF.cpp +++ b/modules/face/src/facemarkLBF.cpp @@ -661,24 +661,22 @@ FacemarkLBFImpl::BBox::BBox(double _x, double _y, double w, double h) { // Project absolute shape to relative shape binding to this bbox Mat FacemarkLBFImpl::BBox::project(const Mat &shape) const { - Mat_ res(shape.rows, shape.cols); - const Mat_ &shape_ = (Mat_)shape; + Mat res(shape.rows, shape.cols, CV_64FC1); for (int i = 0; i < shape.rows; i++) { - res(i, 0) = (shape_(i, 0) - x_center) / x_scale; - res(i, 1) = (shape_(i, 1) - y_center) / y_scale; + res.at(i, 0) = (shape.at(i, 0) - x_center) / x_scale; + res.at(i, 1) = (shape.at(i, 1) - y_center) / y_scale; } - return std::move(res); + return res; } // Project relative shape to absolute shape binding to this bbox Mat FacemarkLBFImpl::BBox::reproject(const Mat &shape) const { - Mat_ res(shape.rows, shape.cols); - const Mat_ &shape_ = (Mat_)shape; + Mat res(shape.rows, shape.cols, CV_64FC1); for (int i = 0; i < shape.rows; i++) { - res(i, 0) = shape_(i, 0)*x_scale + x_center; - res(i, 1) = shape_(i, 1)*y_scale + y_center; + res.at(i, 0) = shape.at(i, 0)*x_scale + x_center; + res.at(i, 1) = shape.at(i, 1)*y_scale + y_center; } - return std::move(res); + return res; } Mat FacemarkLBFImpl::getMeanShape(std::vector >_shapes, std::vector &bboxes) { @@ -997,7 +995,7 @@ void FacemarkLBFImpl::RandomForest::train(std::vector &imgs, std::vector lbf_feat(1, landmark_n*trees_n); + Mat lbf_feat(1, landmark_n*trees_n, CV_32SC1); double scale; Mat_ rotate; calcSimilarityTransform(bbox.project(current_shape), mean_shape, scale, rotate); @@ -1036,10 +1034,10 @@ Mat FacemarkLBFImpl::RandomForest::generateLBF(Mat &img, Mat ¤t_shape, BBo idx = 2 * idx + 1; } } - lbf_feat(i*trees_n + j) = (i*trees_n + j)*base + code; + lbf_feat.at(i*trees_n + j) = (i*trees_n + j)*base + code; } } - return std::move(lbf_feat); + return lbf_feat; } void FacemarkLBFImpl::RandomForest::write(FileStorage fs, int k) { @@ -1365,7 +1363,7 @@ Mat FacemarkLBFImpl::Regressor::supportVectorRegression( Mat FacemarkLBFImpl::Regressor::globalRegressionPredict(const Mat &lbf, int stage) { const Mat_ &weight = (Mat_)gl_regression_weights[stage]; - Mat_ delta_shape(weight.rows / 2, 2); + Mat delta_shape(weight.rows / 2, 2, CV_64FC1); const double *w_ptr = NULL; const int *lbf_ptr = lbf.ptr(0); @@ -1374,14 +1372,14 @@ Mat FacemarkLBFImpl::Regressor::globalRegressionPredict(const Mat &lbf, int stag 
w_ptr = weight.ptr(2 * i); double y = 0; for (int j = 0; j < lbf.cols; j++) y += w_ptr[lbf_ptr[j]]; - delta_shape(i, 0) = y; + delta_shape.at(i, 0) = y; w_ptr = weight.ptr(2 * i + 1); y = 0; for (int j = 0; j < lbf.cols; j++) y += w_ptr[lbf_ptr[j]]; - delta_shape(i, 1) = y; + delta_shape.at(i, 1) = y; } - return std::move(delta_shape); + return delta_shape; } // Regressor::globalRegressionPredict Mat FacemarkLBFImpl::Regressor::predict(Mat &img, BBox &bbox) { diff --git a/modules/face/src/mace.cpp b/modules/face/src/mace.cpp index 2c09560bfe5..3b7a236828c 100644 --- a/modules/face/src/mace.cpp +++ b/modules/face/src/mace.cpp @@ -102,11 +102,11 @@ struct MACEImpl CV_FINAL : MACE { Mat complexInput; merge(input, 2, complexInput); - Mat_ dftImg(IMGSIZE*2, IMGSIZE*2, 0.0); + Mat dftImg(IMGSIZE*2, IMGSIZE*2, CV_64FC2, 0.0); complexInput.copyTo(dftImg(Rect(0,0,IMGSIZE,IMGSIZE))); dft(dftImg, dftImg); - return std::move(dftImg); + return dftImg; } diff --git a/modules/mcc/test/test_mcc.cpp b/modules/mcc/test/test_mcc.cpp index 374b829b4b2..37bd1f11fc7 100644 --- a/modules/mcc/test/test_mcc.cpp +++ b/modules/mcc/test/test_mcc.cpp @@ -102,7 +102,9 @@ TEST(CV_mcc_ccm_test, detect_Macbeth) // check Macbeth corners vector corners = checker->getBox(); - EXPECT_MAT_NEAR(gold_corners, corners, 3.6); // diff 3.57385 in ARM only + // diff 3.57385 corresponds to ARM v8 + // diff 4.37915 correspnds to Ubuntu 24.04 x86_64 configuration + EXPECT_MAT_NEAR(gold_corners, corners, 4.38); // read gold chartsRGB node = fs["chartsRGB"]; @@ -112,7 +114,7 @@ TEST(CV_mcc_ccm_test, detect_Macbeth) // check chartsRGB Mat chartsRGB = checker->getChartsRGB(); - EXPECT_MAT_NEAR(goldChartsRGB.col(1), chartsRGB.col(1), 0.25); // diff 0.240634 in ARM only + EXPECT_MAT_NEAR(goldChartsRGB.col(1), chartsRGB.col(1), 0.3); // diff 0.292077 on Ubuntu 20.04 ARM64 } TEST(CV_mcc_ccm_test, compute_ccm) diff --git a/modules/rgbd/perf/perf_tsdf.cpp b/modules/rgbd/perf/perf_tsdf.cpp index da928f98f22..999ddb51673 100644 --- a/modules/rgbd/perf/perf_tsdf.cpp +++ b/modules/rgbd/perf/perf_tsdf.cpp @@ -91,7 +91,7 @@ struct Scene { virtual ~Scene() {} static Ptr create(Size sz, Matx33f _intr, float _depthFactor, bool onlySemisphere); - virtual Mat depth(Affine3f pose) = 0; + virtual Mat_ depth(const Affine3f& pose) = 0; virtual std::vector getPoses() = 0; }; @@ -131,7 +131,7 @@ struct SemisphereScene : Scene return res; } - Mat depth(Affine3f pose) override + Mat_ depth(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -139,7 +139,7 @@ struct SemisphereScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor, onlySemisphere)); - return std::move(frame); + return frame; } std::vector getPoses() override diff --git a/modules/rgbd/test/ocl/test_tsdf.cpp b/modules/rgbd/test/ocl/test_tsdf.cpp index 1c55e1f4001..fa3d593cb4f 100644 --- a/modules/rgbd/test/ocl/test_tsdf.cpp +++ b/modules/rgbd/test/ocl/test_tsdf.cpp @@ -143,7 +143,7 @@ struct SemisphereScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor, onlySemisphere)); - return std::move(frame); + return static_cast(frame); } std::vector getPoses() override diff --git a/modules/rgbd/test/test_colored_kinfu.cpp b/modules/rgbd/test/test_colored_kinfu.cpp index 4303b260e3e..0ff90704e93 100644 --- a/modules/rgbd/test/test_colored_kinfu.cpp +++ b/modules/rgbd/test/test_colored_kinfu.cpp @@ -158,8 +158,8 @@ struct Scene { virtual ~Scene() {} static Ptr create(int 
nScene, Size sz, Matx33f _intr, float _depthFactor); - virtual Mat depth(Affine3f pose) = 0; - virtual Mat rgb(Affine3f pose) = 0; + virtual Mat_ depth(const Affine3f& pose) = 0; + virtual Mat_ rgb(const Affine3f& pose) = 0; virtual std::vector getPoses() = 0; }; @@ -198,7 +198,7 @@ struct CubeSpheresScene : Scene return res; } - Mat depth(Affine3f pose) override + Mat_ depth(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -206,10 +206,10 @@ struct CubeSpheresScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } - Mat rgb(Affine3f pose) override + Mat_ rgb(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -217,7 +217,7 @@ struct CubeSpheresScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderColorInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } std::vector getPoses() override @@ -305,7 +305,7 @@ struct RotatingScene : Scene return res; } - Mat depth(Affine3f pose) override + Mat_ depth(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -313,10 +313,10 @@ struct RotatingScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } - Mat rgb(Affine3f pose) override + Mat_ rgb(const Affine3f& pose) override { Mat_ frame(frameSize); Reprojector reproj(intr); @@ -324,7 +324,7 @@ struct RotatingScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderColorInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return frame; } std::vector getPoses() override diff --git a/modules/rgbd/test/test_kinfu.cpp b/modules/rgbd/test/test_kinfu.cpp index 18059013ee7..e9c821f1577 100644 --- a/modules/rgbd/test/test_kinfu.cpp +++ b/modules/rgbd/test/test_kinfu.cpp @@ -141,7 +141,7 @@ struct CubeSpheresScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return static_cast(frame); } std::vector getPoses() override @@ -237,7 +237,7 @@ struct RotatingScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor)); - return std::move(frame); + return static_cast(frame); } std::vector getPoses() override diff --git a/modules/rgbd/test/test_tsdf.cpp b/modules/rgbd/test/test_tsdf.cpp index 31a137854c3..f5a88180923 100644 --- a/modules/rgbd/test/test_tsdf.cpp +++ b/modules/rgbd/test/test_tsdf.cpp @@ -140,7 +140,7 @@ struct SemisphereScene : Scene Range range(0, frame.rows); parallel_for_(range, RenderInvoker(frame, pose, reproj, depthFactor, onlySemisphere)); - return std::move(frame); + return static_cast(frame); } std::vector getPoses() override diff --git a/modules/videostab/src/global_motion.cpp b/modules/videostab/src/global_motion.cpp index 5bef5b1e74f..eb42d6954d6 100644 --- a/modules/videostab/src/global_motion.cpp +++ b/modules/videostab/src/global_motion.cpp @@ -82,7 +82,7 @@ namespace videostab { // does isotropic normalization -static Mat normalizePoints(int npoints, Point2f *points) +static Mat_ normalizePoints(int npoints, Point2f *points) { float cx = 0.f, cy = 0.f; for (int i = 0; i < npoints; ++i) @@ -113,32 +113,32 @@ static Mat normalizePoints(int npoints, Point2f *points) T(0,0) = T(1,1) = s; T(0,2) = -cx*s; T(1,2) = -cy*s; - return std::move(T); + return T; } static Mat 
estimateGlobMotionLeastSquaresTranslation( int npoints, Point2f *points0, Point2f *points1, float *rmse) { - Mat_ M = Mat::eye(3, 3, CV_32F); + Mat M = Mat::eye(3, 3, CV_32FC1); for (int i = 0; i < npoints; ++i) { - M(0,2) += points1[i].x - points0[i].x; - M(1,2) += points1[i].y - points0[i].y; + M.at(0,2) += points1[i].x - points0[i].x; + M.at(1,2) += points1[i].y - points0[i].y; } - M(0,2) /= npoints; - M(1,2) /= npoints; + M.at(0,2) /= npoints; + M.at(1,2) /= npoints; if (rmse) { *rmse = 0; for (int i = 0; i < npoints; ++i) - *rmse += sqr(points1[i].x - points0[i].x - M(0,2)) + - sqr(points1[i].y - points0[i].y - M(1,2)); + *rmse += sqr(points1[i].x - points0[i].x - M.at(0,2)) + + sqr(points1[i].y - points0[i].y - M.at(1,2)); *rmse = std::sqrt(*rmse / npoints); } - return std::move(M); + return M; } @@ -194,16 +194,16 @@ static Mat estimateGlobMotionLeastSquaresRotation( // A*sin(alpha) + B*cos(alpha) = 0 float C = std::sqrt(A*A + B*B); - Mat_ M = Mat::eye(3, 3, CV_32F); + Mat M = Mat::eye(3, 3, CV_32F); if ( C != 0 ) { float sinAlpha = - B / C; float cosAlpha = A / C; - M(0,0) = cosAlpha; - M(1,1) = M(0,0); - M(0,1) = sinAlpha; - M(1,0) = - M(0,1); + M.at(0,0) = cosAlpha; + M.at(1,1) = cosAlpha; + M.at(0,1) = sinAlpha; + M.at(1,0) = - sinAlpha; } if (rmse) @@ -213,16 +213,16 @@ static Mat estimateGlobMotionLeastSquaresRotation( { p0 = points0[i]; p1 = points1[i]; - *rmse += sqr(p1.x - M(0,0)*p0.x - M(0,1)*p0.y) + - sqr(p1.y - M(1,0)*p0.x - M(1,1)*p0.y); + *rmse += sqr(p1.x - M.at(0,0)*p0.x - M.at(0,1)*p0.y) + + sqr(p1.y - M.at(1,0)*p0.x - M.at(1,1)*p0.y); } *rmse = std::sqrt(*rmse / npoints); } - return std::move(M); + return M; } -static Mat estimateGlobMotionLeastSquaresRigid( +static Mat estimateGlobMotionLeastSquaresRigid( int npoints, Point2f *points0, Point2f *points1, float *rmse) { Point2f mean0(0.f, 0.f); @@ -250,15 +250,15 @@ static Mat estimateGlobMotionLeastSquaresRigid( A(1,1) += pt1.y * pt0.y; } - Mat_ M = Mat::eye(3, 3, CV_32F); + Mat M = Mat::eye(3, 3, CV_32FC1); SVD svd(A); Mat_ R = svd.u * svd.vt; Mat tmp(M(Rect(0,0,2,2))); R.copyTo(tmp); - M(0,2) = mean1.x - R(0,0)*mean0.x - R(0,1)*mean0.y; - M(1,2) = mean1.y - R(1,0)*mean0.x - R(1,1)*mean0.y; + M.at(0,2) = mean1.x - R(0,0)*mean0.x - R(0,1)*mean0.y; + M.at(1,2) = mean1.y - R(1,0)*mean0.x - R(1,1)*mean0.y; if (rmse) { @@ -267,13 +267,13 @@ static Mat estimateGlobMotionLeastSquaresRigid( { pt0 = points0[i]; pt1 = points1[i]; - *rmse += sqr(pt1.x - M(0,0)*pt0.x - M(0,1)*pt0.y - M(0,2)) + - sqr(pt1.y - M(1,0)*pt0.x - M(1,1)*pt0.y - M(1,2)); + *rmse += sqr(pt1.x - M.at(0,0)*pt0.x - M.at(0,1)*pt0.y - M.at(0,2)) + + sqr(pt1.y - M.at(1,0)*pt0.x - M.at(1,1)*pt0.y - M.at(1,2)); } *rmse = std::sqrt(*rmse / npoints); } - return std::move(M); + return M; } @@ -404,7 +404,7 @@ Mat estimateGlobalMotionRansac( // best hypothesis std::vector bestIndices(params.size); - Mat_ bestM; + Mat bestM; int ninliersMax = -1; RNG rng(0); @@ -469,8 +469,8 @@ Mat estimateGlobalMotionRansac( { p0 = points0_[i]; p1 = points1_[i]; - x = bestM(0,0)*p0.x + bestM(0,1)*p0.y + bestM(0,2); - y = bestM(1,0)*p0.x + bestM(1,1)*p0.y + bestM(1,2); + x = bestM.at(0,0)*p0.x + bestM.at(0,1)*p0.y + bestM.at(0,2); + y = bestM.at(1,0)*p0.x + bestM.at(1,1)*p0.y + bestM.at(1,2); if (sqr(x - p1.x) + sqr(y - p1.y) < params.thresh * params.thresh) { subset0[j] = p0; @@ -484,7 +484,7 @@ Mat estimateGlobalMotionRansac( if (ninliers) *ninliers = ninliersMax; - return std::move(bestM); + return bestM; } @@ -505,7 +505,7 @@ Mat MotionEstimatorRansacL2::estimate(InputArray 
points0, InputArray points1, bo // find motion int ninliers = 0; - Mat_ M; + Mat M; if (motionModel() != MM_HOMOGRAPHY) M = estimateGlobalMotionRansac( @@ -527,7 +527,7 @@ Mat MotionEstimatorRansacL2::estimate(InputArray points0, InputArray points1, bo if (ok) *ok = false; } - return std::move(M); + return M; } @@ -675,13 +675,13 @@ FromFileMotionReader::FromFileMotionReader(const String &path) Mat FromFileMotionReader::estimate(const Mat &/*frame0*/, const Mat &/*frame1*/, bool *ok) { - Mat_ M(3, 3); + Mat M(3, 3, CV_32FC1); bool ok_; - file_ >> M(0,0) >> M(0,1) >> M(0,2) - >> M(1,0) >> M(1,1) >> M(1,2) - >> M(2,0) >> M(2,1) >> M(2,2) >> ok_; + file_ >> M.at(0,0) >> M.at(0,1) >> M.at(0,2) + >> M.at(1,0) >> M.at(1,1) >> M.at(1,2) + >> M.at(2,0) >> M.at(2,1) >> M.at(2,2) >> ok_; if (ok) *ok = ok_; - return std::move(M); + return M; } @@ -696,12 +696,13 @@ ToFileMotionWriter::ToFileMotionWriter(const String &path, Ptr M = motionEstimator_->estimate(frame0, frame1, &ok_); - file_ << M(0,0) << " " << M(0,1) << " " << M(0,2) << " " - << M(1,0) << " " << M(1,1) << " " << M(1,2) << " " - << M(2,0) << " " << M(2,1) << " " << M(2,2) << " " << ok_ << std::endl; + Mat M = motionEstimator_->estimate(frame0, frame1, &ok_); + file_ << M.at(0,0) << " " << M.at(0,1) << " " << M.at(0,2) << " " + << M.at(1,0) << " " << M.at(1,1) << " " << M.at(1,2) << " " + << M.at(2,0) << " " << M.at(2,1) << " " << M.at(2,2) << " " + << ok_ << std::endl; if (ok) *ok = ok_; - return std::move(M); + return M; } diff --git a/modules/ximgproc/test/test_sparse_match_interpolator.cpp b/modules/ximgproc/test/test_sparse_match_interpolator.cpp index 261d6109bf9..f53a5add027 100644 --- a/modules/ximgproc/test/test_sparse_match_interpolator.cpp +++ b/modules/ximgproc/test/test_sparse_match_interpolator.cpp @@ -17,22 +17,22 @@ Mat readOpticalFlow( const String& path ) // CV_Assert(sizeof(float) == 4); //FIXME: ensure right sizes of int and float - here and in writeOpticalFlow() - Mat_ flow; + Mat flow; std::ifstream file(path.c_str(), std::ios_base::binary); if ( !file.good() ) - return std::move(flow); // no file - return empty matrix + return flow; // no file - return empty matrix float tag; file.read((char*) &tag, sizeof(float)); if ( tag != FLOW_TAG_FLOAT ) - return std::move(flow); + return flow; int width, height; file.read((char*) &width, 4); file.read((char*) &height, 4); - flow.create(height, width); + flow.create(height, width, CV_32FC2); for ( int i = 0; i < flow.rows; ++i ) { @@ -44,14 +44,14 @@ Mat readOpticalFlow( const String& path ) if ( !file.good() ) { flow.release(); - return std::move(flow); + return flow; } - flow(i, j) = u; + flow.at(i, j) = u; } } file.close(); - return std::move(flow); + return flow; } CV_ENUM(GuideTypes, CV_8UC1, CV_8UC3) From 96c9c85d654aaaccbf9542a4450a1396712caa83 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 15 May 2024 17:03:40 +0300 Subject: [PATCH 06/12] More warning fixes for Ubuntu 24.04. 
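The recurring pattern behind these warnings, as a minimal illustrative sketch (not code from
this patch): GCC 13 applies the C++20 implicit-move rules, so a returned local is already
treated as an rvalue, making an explicit std::move redundant (it can even inhibit NRVO):

    cv::Mat makeEye()
    {
        cv::Mat m = cv::Mat::eye(3, 3, CV_32FC1);
        return m;                // OK: NRVO or implicit move applies here
        // return std::move(m);  // -Wredundant-move under GCC 13
    }

The fixes therefore switch Mat_<T> locals to plain cv::Mat (or cast once on return) so the
implicit move applies without the redundant std::move.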
--- modules/intensity_transform/src/bimef.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/intensity_transform/src/bimef.cpp b/modules/intensity_transform/src/bimef.cpp index 2500d2abe8a..9cad67578f0 100644 --- a/modules/intensity_transform/src/bimef.cpp +++ b/modules/intensity_transform/src/bimef.cpp @@ -226,15 +226,15 @@ static Mat solveLinearEquation(const Mat_& img, Mat_& W_h_, Mat_ tin(img_t.ptr(), img_t.rows*img_t.cols); Eigen::VectorXf x = cg.solve(tin); - Mat_ tout(img.rows, img.cols); - tout.forEach( + Mat tout(img.rows, img.cols, CV_32FC1); + tout.forEach( [&](float &pixel, const int * position) -> void { pixel = x(position[1]*img.rows + position[0]); } ); - return std::move(tout); + return tout; } static Mat_ tsmooth(const Mat_& src, float lambda=0.01f, float sigma=3.0f, float sharpness=0.001f) From ff694dbc901adadfb708fa809f9f44225ebdd3c3 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 16 May 2024 14:59:02 +0300 Subject: [PATCH 07/12] Added Ubuntu 24.04 to regular CI. --- .github/workflows/PR-4.x.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml index f33cc37d5d5..586d406b209 100644 --- a/.github/workflows/PR-4.x.yaml +++ b/.github/workflows/PR-4.x.yaml @@ -15,6 +15,9 @@ jobs: Ubuntu2204-x64: uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-U22.yaml@main + Ubuntu2404-x64: + uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-U24.yaml@main + Ubuntu2004-x64-CUDA: uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-U20-Cuda.yaml@main From e46ba342b251fdbe0df34be0da1ba85577aa1e94 Mon Sep 17 00:00:00 2001 From: TumoiYorozu Date: Wed, 22 May 2024 17:53:03 +0900 Subject: [PATCH 08/12] Merge pull request #3627 from TumoiYorozu:wavelet_matrix_median_filter_cuda MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implemented fast median filter for CUDA using Wavelet Matrix, a constant-time, HDR-compatible method #3627 I replaced the existing CUDA implementation of the histogram-based median filter with an implementation of a new wavelet matrix-based median filter algorithm, which I presented at SIGGRAPH Asia 2022. This paper won the Best Paper Award in the journal track of technical papers (ACM Transactions on Graphics). This new algorithm, like the histogram method, has the property that the window radius does not affect the computation time, and is several times faster than the histogram method. Furthermore, while the histogram method does not support HDR and only supports 8U images, this new algorithm supports HDR and also supports 16U and 32F images. I (the author) have published the implementation on my personal GitHub and made some modifications for OpenCV to make it accessible from OpenCV. I used the CUB library, which is part of the standard toolkit since CUDA 11.0. Therefore, depending on the CUDA_VERSION, the code is written to use the new algorithm for versions 11.0 and above, and the existing histogram method for versions 10 and below. Regarding the old histogram-based code, the CPU version of the median filter supports 16U and 32F for window sizes up to 5, but it seems that the histogram CUDA version of the median filter does not. Also, the number of channels supported is different: the CPU version supports 1, 3, and 4 channels, while the CUDA version supports only 1 channel. In addition, for the CUDA version of the histogram method, pixels at the edges of the image, i.e. 
where the window is insufficient, were set to zero. For example, if the window size is 7, the width of the 3 pixels at the top, bottom, left, and right were not calculated correctly. When checking the tests, it was found that they compared with the CPU version by cropping the edges with rect, and also the cropping area was too wide, with 8 pixels cropped from the top, bottom, left, and right when the window size was 7. In this PR, I first corrected the rect range for the tests so that both the old histogram method and the new wavelet matrix method can pass. Also, the CUDA version now supports 16U, 32F, and multi-channel formats such as 3 and 4 channels. In addition, while the CPU version only supports window sizes up to 5 for HDR, the new CUDA Wavelet Matrix method supports sizes of 7 and above. Additionally, I have added new tests for 16U, 32F, and multi-channel formats, specifically 3 and 4 channels. Paper’s project page: [Constant Time Median Filter using 2D Wavelet Matrix | Interactive Graphics & Engineering Lab](https://cgenglab.github.io/en/publication/sigga22_wmatrix_median/) My implementation (as author): [GitHub - TumoiYorozu/WMatrixMedian](https://github.com/TumoiYorozu/WMatrixMedian) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch ~~- [ ] There is a reference to the original bug report and related work~~ - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake --- modules/cudafilters/src/cuda/median_filter.cu | 79 ++ .../src/cuda/wavelet_matrix_2d.cuh | 1053 +++++++++++++++++ .../wavelet_matrix_feature_support_checks.h | 82 ++ .../cuda/wavelet_matrix_float_supporter.cuh | 227 ++++ .../src/cuda/wavelet_matrix_multi.cuh | 636 ++++++++++ modules/cudafilters/src/filtering.cpp | 33 +- modules/cudafilters/test/test_filters.cpp | 18 +- 7 files changed, 2124 insertions(+), 4 deletions(-) create mode 100644 modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh create mode 100644 modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h create mode 100644 modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh create mode 100644 modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh diff --git a/modules/cudafilters/src/cuda/median_filter.cu b/modules/cudafilters/src/cuda/median_filter.cu index ed46eb4bf94..6776428ae1a 100644 --- a/modules/cudafilters/src/cuda/median_filter.cu +++ b/modules/cudafilters/src/cuda/median_filter.cu @@ -53,6 +53,17 @@ #include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
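+// (The support-check header below is expected to define
+// __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ only for CUDA 11.0 and newer,
+// so older toolkits transparently keep the histogram-based implementation.)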
+#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#include "wavelet_matrix_multi.cuh" +#include "wavelet_matrix_2d.cuh" +#include "wavelet_matrix_float_supporter.cuh" +#endif + + namespace cv { namespace cuda { namespace device { __device__ void histogramAddAndSub8(int* H, const int * hist_colAdd,const int * hist_colSub){ @@ -334,4 +345,72 @@ namespace cv { namespace cuda { namespace device }}} + +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +namespace cv { namespace cuda { namespace device + { + using namespace wavelet_matrix_median; + + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius,cudaStream_t stream){ + + constexpr bool is_float = std::is_same::value; + constexpr static int WORD_SIZE = 32; + constexpr static int ThW = (std::is_same::value ? 8 : 4); + constexpr static int ThH = (std::is_same::value ? 64 : 256); + using XYIdxT = uint32_t; + using XIdxT = uint16_t; + using WM_T = typename std::conditional::type; + using MedianResT = typename std::conditional::type; + using WM2D_IMPL = WaveletMatrix2dCu5C, 512, WORD_SIZE>; + + CV_Assert(src.cols == dst.cols); + CV_Assert(dst.step % sizeof(T) == 0); + + WM2D_IMPL WM_cuda(src.rows, src.cols, is_float, false); + WM_cuda.res_cu = reinterpret_cast(dst.ptr()); + + const size_t line_num = src.cols * CH_NUM; + if (is_float) { + WMMedianFloatSupporter::WMMedianFloatSupporter float_supporter(src.rows, src.cols); + float_supporter.alloc(); + for (int y = 0; y < src.rows; ++y) { + cudaMemcpy(float_supporter.val_in_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice); + } + const auto p = WM_cuda.get_nowcu_and_buf_byte_div32(); + float_supporter.sort_and_set((XYIdxT*)p.first, p.second); + WM_cuda.construct(nullptr, stream, true); + WM_cuda.template median2d(radius, dst.step / sizeof(T), (MedianResT*)float_supporter.get_res_table(), stream); + } else { + for (int y = 0; y < src.rows; ++y) { + cudaMemcpy(WM_cuda.src_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice); + } + WM_cuda.construct(nullptr, stream); + WM_cuda.template median2d(radius, dst.step / sizeof(T), nullptr, stream); + } + WM_cuda.res_cu = nullptr; + if (!stream) { + cudaSafeCall( cudaDeviceSynchronize() ); + } + } + + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream){ + if (num_channels == 1) { + medianFiltering_wavelet_matrix_gpu<1>(src, dst, radius, stream); + } else if (num_channels == 3) { + medianFiltering_wavelet_matrix_gpu<3>(src, dst, radius, stream); + } else if (num_channels == 4) { + medianFiltering_wavelet_matrix_gpu<4>(src, dst, radius, stream); + } else { + CV_Assert(num_channels == 1 || num_channels == 3 || num_channels == 4); + } + } + + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); +}}} +#endif // __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + #endif diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh new file mode 100644 index 
00000000000..9c10c223d87 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh @@ -0,0 +1,1053 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_2D_CUH__ +#define __OPENCV_WAVELET_MATRIX_2D_CUH__ + + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
+#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +#include +#include +#include "opencv2/core/cuda/warp_shuffle.hpp" + +#include "wavelet_matrix_multi.cuh" + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { + using std::vector; + using namespace std; + +template +__global__ void WaveletMatrix2dCu5C_UpSweep_gpu(const SrcT mask, const uint16_t block_pair_num, const XYIdxT size_div_w, const SrcT* __restrict__ src, DstT* __restrict__ dst, BlockT* __restrict__ nbit_bp, const XYIdxT* __restrict__ nsum_buf_test, XYIdxT* __restrict__ nsum_buf_test2, const uint32_t bv_block_byte_div32, const uint32_t buf_byte_div32, const XIdxT* __restrict__ idx_p, const XIdxT inf, XIdxT* __restrict__ wm, XIdxT* __restrict__ nxt_idx, XYIdxT* __restrict__ wm_nsum_scan_buf, const XYIdxT cwm_buf_byte_div32, BlockT* __restrict__ nbit_bp_pre) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + static_assert(ThreadsDimY % SRC_CACHE_DIV == 0, ""); + static_assert(ThreadsDimY != SRC_CACHE_DIV, "Warning: It's not efficient."); + + const size_t buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (buf_byte_div32*32ull); + const size_t bv_block_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (bv_block_byte_div32*32ull); + const size_t cwm_buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * cwm_buf_byte_div32 * 32u; + src = (SrcT*)((uint8_t*)src + buf_byte_y_offset); + dst = (DstT*)((uint8_t*)dst + buf_byte_y_offset); + nsum_buf_test = (XYIdxT*)((uint8_t*)nsum_buf_test + buf_byte_y_offset); + nsum_buf_test2 = (XYIdxT*)((uint8_t*)nsum_buf_test2 + buf_byte_y_offset); + nbit_bp = (BlockT*)((uint8_t*)nbit_bp + bv_block_byte_y_offset); + nbit_bp_pre = (BlockT*)((uint8_t*)nbit_bp_pre + bv_block_byte_y_offset); + + idx_p = (XIdxT*)((uint8_t*)idx_p + buf_byte_y_offset); + nxt_idx = (XIdxT*)((uint8_t*)nxt_idx + buf_byte_y_offset); + if (wm != nullptr) wm = (XIdxT*)((uint8_t*)wm + cwm_buf_byte_y_offset); + wm_nsum_scan_buf = (XYIdxT*)((uint8_t*)wm_nsum_scan_buf + cwm_buf_byte_y_offset); + + + using WarpScanX = cub::WarpScan; + using WarpScanY = cub::WarpScan; + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + static_assert(SRCB_S < 64 * 1024, ""); + + __shared__ SrcT src_val_cache[ThreadsDimY][(WARP_SIZE/SRC_CACHE_DIV)-1][WARP_SIZE]; + __shared__ XIdxT vidx_val_cache[ThreadsDimY][(WARP_SIZE/SRC_CACHE_DIV)-1][WARP_SIZE]; + + __shared__ uint4 nsum_count_sh[ThreadsDimY]; + __shared__ XYIdxT wm_zero_count_sh[ThreadsDimY]; + __shared__ XYIdxT pre_sum_share[2]; + __shared__ XYIdxT warp_scan_sums[ThreadsDimY]; + __shared__ typename WarpScanX::TempStorage s_scanStorage; + __shared__ typename WarpScanY::TempStorage s_scanStorage2; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + XYIdxT wm_zero_count = 0; + + const XYIdxT size_div_warp = size_div_w * WORD_DIV_WARP; + const XYIdxT nsum = nbit_bp[size_div_w].nsum; + const XYIdxT nsum_offset = nsum_buf_test[blockIdx.x]; + const XYIdxT nsum_pre = nbit_bp_pre[size_div_w].nsum; + + + XYIdxT 
nsum_idx0_org = nsum_offset; + XYIdxT nsum_idx1_org = (XYIdxT)blockIdx.x * block_pair_num * THREAD_PER_GRID + nsum - nsum_idx0_org; + nsum_idx0_org /= (XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE; + nsum_idx1_org /= (XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE; + const XYIdxT nsum_idx0_bound = (nsum_idx0_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + const XYIdxT nsum_idx1_bound = (nsum_idx1_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + uint4 nsum_count = make_uint4(0, 0, 0, 0); + + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + if (th_idx == 0) { + pre_sum_share[0] = nsum_offset; + } + + for (XYIdxT ka = 0; ka < block_pair_num; ka += WARP_SIZE / SRC_CACHE_DIV) { + const XYIdxT ibb = ((XYIdxT)blockIdx.x * block_pair_num + ka) * ThreadsDimY; + if (ibb >= size_div_warp) break; + + WarpWT my_bits = 0; + SrcT first_val; + XIdxT first_idxval; + + for (XYIdxT kb = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; kb < WARP_SIZE / SRC_CACHE_DIV; ++kb, ++i) { + if (i >= size_div_warp) break; + WarpWT bits; + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + const SrcT v = src[ij]; + const XIdxT wm_idxv = idx_p[ij]; + if (kb == 0) { + first_val = v; + first_idxval = wm_idxv; + } else { + src_val_cache[threadIdx.y][kb - 1][threadIdx.x] = v; + vidx_val_cache[threadIdx.y][kb - 1][threadIdx.x] = wm_idxv; + } + if (v <= mask) { + bits = __activemask(); + } else { + bits = ~__activemask(); + } + if (threadIdx.x == kb) { + my_bits = bits; + } + if (wm != nullptr) { + if (ij < nsum_pre) { + wm[ij] = wm_idxv; + if (wm_idxv * 2 <= inf) { + ++wm_zero_count; + } + } else { + wm[ij] = inf; + } + } + } + + XYIdxT c, t = 0; + if (threadIdx.y < ThreadsDimY) { + c = __popc(my_bits); + + WarpScanX(s_scanStorage).ExclusiveSum(c, t); + if (threadIdx.x == WARP_SIZE / SRC_CACHE_DIV - 1) { + warp_scan_sums[threadIdx.y] = c + t; + } + } + + __syncthreads(); + + XYIdxT pre_sum = pre_sum_share[(ka & (WARP_SIZE / SRC_CACHE_DIV)) > 0 ? 1 : 0]; + XYIdxT s = threadIdx.x < ThreadsDimY ? warp_scan_sums[threadIdx.x] : 0; + WarpScanY(s_scanStorage2).ExclusiveSum(s, s); + + s = cv::cuda::device::shfl(s, threadIdx.y, WARP_SIZE); + s += t + pre_sum; + + if (SRC_CACHE_DIV == 1 || threadIdx.x < WARP_SIZE / SRC_CACHE_DIV) { + if (th_idx == THREAD_PER_GRID - WARP_SIZE + WARP_SIZE / SRC_CACHE_DIV - 1) { + pre_sum_share[(ka & (WARP_SIZE / SRC_CACHE_DIV)) == 0 ? 
1 : 0] = s + c; + } + const XYIdxT bi = ibb + threadIdx.y * WARP_SIZE / SRC_CACHE_DIV + threadIdx.x; + if (bi < size_div_warp) { + static_assert(WORD_SIZE == 32, ""); + nbit_bp[bi] = BlockT{s, my_bits}; + } + } + + if (mask == 0) { + SrcT vo = first_val; + XIdxT idx_v = first_idxval; + for (XYIdxT j = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; j < WARP_SIZE / SRC_CACHE_DIV; ++j, ++i) { + if (i >= size_div_warp) break; + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const XYIdxT e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + XYIdxT rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const XYIdxT idx0 = e_nsum + rank; + XYIdxT idx = idx0; + if (vo > mask) { // 1 + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx; + } + if (idx < size_div_warp * WARP_SIZE) { + nxt_idx[idx] = idx_v; + } + if (j == WARP_SIZE / SRC_CACHE_DIV - 1) break; + vo = src_val_cache[threadIdx.y][j][threadIdx.x]; + idx_v = vidx_val_cache[threadIdx.y][j][threadIdx.x]; + } + continue; + } + const SrcT mask_2 = mask >> 1; + SrcT vo = first_val; + XIdxT idx_v = first_idxval; + for (XYIdxT j = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; j < WARP_SIZE / SRC_CACHE_DIV; ++j, ++i) { + if (i >= size_div_warp) break; + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const XYIdxT e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + XYIdxT rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const XYIdxT idx0 = e_nsum + rank; + + DstT v = (DstT)vo; + XYIdxT idx = idx0; + if (vo > mask) { // 1 + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx; + v &= mask; + } + if (idx < size_div_warp * WARP_SIZE) { + if (mask != 0) { + dst[idx] = v; + } + nxt_idx[idx] = idx_v; + } + + if (v <= mask_2) { + if (vo <= mask) { + if (idx < nsum_idx0_bound) { + nsum_count.x++; + } else { + nsum_count.y++; + } + } else { + if (idx < nsum_idx1_bound) { + nsum_count.z++; + } else { + nsum_count.w++; + } + } + } + if (j == WARP_SIZE / SRC_CACHE_DIV - 1) break; + vo = src_val_cache[threadIdx.y][j][threadIdx.x]; + idx_v = vidx_val_cache[threadIdx.y][j][threadIdx.x]; + } + } + if (blockIdx.x == gridDim.x - 1 && th_idx == 0) { + nbit_bp[size_div_warp / WORD_DIV_WARP].nsum = nsum; + } + + nsum_count.x = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.x); + nsum_count.y = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.y); + nsum_count.z = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.z); + nsum_count.w = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.w); + wm_zero_count = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(wm_zero_count); + if (threadIdx.x == 0) { + nsum_count_sh[threadIdx.y] = nsum_count; + wm_zero_count_sh[threadIdx.y] = wm_zero_count; + } + __syncthreads(); + if (threadIdx.x < ThreadsDimY) { + nsum_count = nsum_count_sh[threadIdx.x]; + nsum_count.x = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.x); + nsum_count.y = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.y); + nsum_count.z = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.z); + nsum_count.w = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.w); + wm_zero_count = WarpReduceY(WarpReduceY_temp_storage).Sum(wm_zero_count_sh[threadIdx.x]); + if (th_idx == 0) { + const XYIdxT nsum_idx0_org = nsum_idx0_bound / ((XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE); + const XYIdxT nsum_idx1_org = nsum_idx1_bound / ((XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE); + if (mask != 0) { + if 
(nsum_count.x > 0) atomicAdd(nsum_buf_test2 + nsum_idx0_org - 1, nsum_count.x); + if (nsum_count.y > 0) atomicAdd(nsum_buf_test2 + nsum_idx0_org - 0, nsum_count.y); + if (nsum_count.z > 0) atomicAdd(nsum_buf_test2 + nsum_idx1_org - 1, nsum_count.z); + if (nsum_count.w > 0) atomicAdd(nsum_buf_test2 + nsum_idx1_org - 0, nsum_count.w); + } + if (wm != nullptr) { + wm_nsum_scan_buf[blockIdx.x] = wm_zero_count; + } + } + } +} + +template +__global__ void WaveletMatrix2dCu5C_last_gpu(const uint16_t block_pair_num, const XYIdxT size_div_w, const uint32_t buf_byte_div32, const XIdxT* __restrict__ idx_p, const XIdxT inf, XIdxT* __restrict__ wm, XYIdxT* __restrict__ wm_nsum_scan_buf, const XYIdxT cwm_buf_byte_div32, BlockT* __restrict__ nbit_bp_pre, const uint32_t bv_block_byte_div32) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + const size_t buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (buf_byte_div32*32ull); + const size_t bv_block_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (bv_block_byte_div32*32ull); + const size_t cwm_buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * cwm_buf_byte_div32 * 32u; + + idx_p = (XIdxT*)((uint8_t*)idx_p + buf_byte_y_offset); + wm = (XIdxT*)((uint8_t*)wm + cwm_buf_byte_y_offset); + wm_nsum_scan_buf = (XYIdxT*)((uint8_t*)wm_nsum_scan_buf + cwm_buf_byte_y_offset); + nbit_bp_pre = (BlockT*)((uint8_t*)nbit_bp_pre + bv_block_byte_y_offset); + const XYIdxT nsum_pre = nbit_bp_pre[size_div_w].nsum; + + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + __shared__ XYIdxT wm_zero_count_sh[ThreadsDimY]; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + XYIdxT wm_zero_count = 0; + const XYIdxT size_div_warp = size_div_w * WORD_DIV_WARP; + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + + const int block_num = block_pair_num/WARP_SIZE; + for (XYIdxT ka = 0; ka < block_num; ++ka) { + const XYIdxT ibb = ((XYIdxT)blockIdx.x * block_num + ka) * THREAD_PER_GRID + WARP_SIZE * threadIdx.y; + if (ibb >= size_div_warp) break; + + for (XYIdxT kb = 0; kb < WARP_SIZE; ++kb) { + XYIdxT i = ibb + kb; + if (i >= size_div_warp) break; + + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + + if (ij < nsum_pre) { + const XIdxT wm_idxv = idx_p[ij]; + wm[ij] = wm_idxv; + if (wm_idxv * 2 <= inf) { + ++wm_zero_count; + } + } else { + wm[ij] = inf; + } + } + } + wm_zero_count = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(wm_zero_count); + if (threadIdx.x == 0) { + wm_zero_count_sh[threadIdx.y] = wm_zero_count; + } + __syncthreads(); + if (threadIdx.x < ThreadsDimY) { + wm_zero_count = WarpReduceY(WarpReduceY_temp_storage).Sum(wm_zero_count_sh[threadIdx.x]); + if (th_idx == 0) { + wm_nsum_scan_buf[blockIdx.x] = wm_zero_count; + } + } +} + +template +__global__ void WaveletMatrix2dCu5C_ExclusiveSum(XYIdxT* __restrict__ nsum_scan_buf, XYIdxT* __restrict__ nsum_buf_test2, BlockT* __restrict__ nsum_p, const uint32_t buf_byte_div32, const uint32_t bv_block_byte_div32) { + + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage 
temp_storage; + + XYIdxT thread_data1; + XYIdxT thread_data2; + + nsum_scan_buf = (XYIdxT*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + nsum_buf_test2 = (XYIdxT*)((uint8_t*)nsum_buf_test2 + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + + thread_data1 = nsum_scan_buf[threadIdx.x]; + BlockScan(temp_storage).ExclusiveSum(thread_data1, thread_data2); + + nsum_scan_buf[threadIdx.x] = thread_data2; + nsum_buf_test2[threadIdx.x] = 0; + + if (threadIdx.x == blockDim.x - 1) { + thread_data2 += thread_data1; + nsum_p = (BlockT*)((uint8_t*)nsum_p + (size_t)blockIdx.x * (bv_block_byte_div32*32ull)); + nsum_p->nsum = thread_data2; + } +} + + +template +__global__ void WaveletMatrix2dCu5C_first_gpu_multi(const SrcT mask, uint16_t block_pair_num, const XYIdxT size_div_warp, SrcT* __restrict__ src, SrcT* __restrict__ dst, XYIdxT* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32, XIdxT* __restrict__ buf_idx, const int W, const XYIdxT WH, const SrcT* __restrict__ src_const) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + SrcT* __restrict__ dsts[CH_NUM]; + XIdxT* __restrict__ buf_idxs[CH_NUM]; + XYIdxT cs[CH_NUM]; + + __shared__ SrcT src_vbuf_org[ThreadsDimY][CH_NUM * WARP_SIZE]; + SrcT* __restrict__ src_vbuf = src_vbuf_org[threadIdx.y]; + + for (int c = 0; c < CH_NUM; ++c) { + if (CH_NUM > 1) { // constexpr + dsts[c] = (SrcT*)((uint8_t*)dst + (size_t)c * (buf_byte_div32*32ull)); + } + buf_idxs[c] = (XIdxT*)((uint8_t*)buf_idx + (size_t)c * (buf_byte_div32*32ull)); + cs[c] = 0; + } + + XYIdxT ibb = (XYIdxT)blockIdx.x * block_pair_num * ThreadsDimY; + for (XYIdxT ka = 0; ka < block_pair_num; ka += WARP_SIZE, ibb += THREAD_PER_GRID) { + for (XYIdxT kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + const XYIdxT iw = i * WARP_SIZE; + const XYIdxT idx = iw + threadIdx.x; + const XIdxT idx_v = (idx >= WH ? 0 : idx % W); + + for (int c = 0; c < CH_NUM; ++c) { + if (MOV_SRC) { // constexpr + const XYIdxT s_idx = iw * CH_NUM + threadIdx.x + c * WARP_SIZE; + src_vbuf[threadIdx.x + c * WARP_SIZE] = src[s_idx] = src_const[s_idx]; + } else { + src_vbuf[threadIdx.x + c * WARP_SIZE] = src[iw * CH_NUM + threadIdx.x + c * WARP_SIZE]; + } + } + __syncwarp(); + for (int c = 0; c < CH_NUM; ++c) { + const SrcT v = src_vbuf[threadIdx.x * CH_NUM + c]; + if (v <= mask) { + ++cs[c]; + } + buf_idxs[c][idx] = idx_v; + if constexpr(CH_NUM > 1) { + dsts[c][idx] = v; + } + } + } + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ XYIdxT cs_sum_sh[CH_NUM][ThreadsDimY]; + for (int c = 0; c < CH_NUM; ++c) { + cs[c] = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(cs[c]); + } + if (threadIdx.x == 0) { + for (int c = 0; c < CH_NUM; ++c) { + cs_sum_sh[c][threadIdx.y] = cs[c]; + } + } + __syncthreads(); + if (threadIdx.y != 0) return; + for (int c = 0; c < CH_NUM; ++c) { + XYIdxT cs_bsum = (threadIdx.x < ThreadsDimY ? 
cs_sum_sh[c][threadIdx.x] : 0); + cs_bsum = WarpReduce(WarpReduce_temp_storage[0]).Sum(cs_bsum); + if (threadIdx.x == 0) { + nsum_scan_buf[blockIdx.x] = cs_bsum; + nsum_scan_buf = (XYIdxT*)((uint8_t*)nsum_scan_buf + (buf_byte_div32*32ull)); + } + } + +} + + +template +__global__ void WaveletMatrix2dCu5C_first_gpu_multi_srcunpacked(const SrcT mask, uint16_t block_pair_num, const XYIdxT size_div_warp, const SrcT* __restrict__ src, XYIdxT* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32, XIdxT* __restrict__ buf_idx, const int W, const XYIdxT WH) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + XYIdxT cs = 0; + + const int c = blockIdx.y; + buf_idx = buf_idx + c * (buf_byte_div32*32ull / sizeof(XIdxT)); + src = src + c * (buf_byte_div32*32ull / sizeof(SrcT)); + + + XYIdxT i = (XYIdxT)blockIdx.x * block_pair_num * ThreadsDimY + threadIdx.y; + + XIdxT x_idx = (i * WARP_SIZE + threadIdx.x) % W; + const XIdxT x_diff = THREAD_PER_GRID % W; + + for (XYIdxT k = 0; k < block_pair_num; ++k, i += ThreadsDimY) { + if (i >= size_div_warp) break; + const XYIdxT idx = i * WARP_SIZE + threadIdx.x; + + if (idx >= WH) x_idx = 0; + + const SrcT v = src[idx]; + if (v <= mask) { + ++cs; + } + buf_idx[idx] = x_idx; + + x_idx += x_diff; + if (x_idx >= W) x_idx -= W; + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ XYIdxT cs_sum_sh[ThreadsDimY]; + cs = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(cs); + if (threadIdx.x == 0) { + cs_sum_sh[threadIdx.y] = cs; + } + __syncthreads(); + if (threadIdx.y != 0) return; + XYIdxT cs_bsum = (threadIdx.x < ThreadsDimY ? cs_sum_sh[threadIdx.x] : 0); + cs_bsum = WarpReduce(WarpReduce_temp_storage[0]).Sum(cs_bsum); + if (threadIdx.x == 0) { + nsum_scan_buf += (buf_byte_div32*32ull / sizeof(XYIdxT)) * (blockIdx.y); + nsum_scan_buf[blockIdx.x] = cs_bsum; + } +} + +template +__device__ +inline IdxType WaveletMatrix2dCu5C_median2d_rank0(const IdxType i, const BlockT* __restrict__ nbit_bp) { + using WordT = decltype(BlockT::nbit); + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + + const IdxType bi = i / WORD_SIZE; + + const int ai = i % WORD_SIZE; + const BlockT block = nbit_bp[bi]; + if constexpr(WORD_SIZE == 32) { + return block.nsum + __popc(block.nbit & ((1u << ai) - 1)); + } + if constexpr(WORD_SIZE == 64) { + return block.nsum + __popcll(block.nbit & ((1ull << ai) - 1ull)); + } +} + + +template +__global__ void WaveletMatrix2dCu5C_median2d_cu( + const int H, const int W, const int res_step_num, const int r, ResT* __restrict__ res_cu, const BlockT* __restrict__ wm_nbit_bp, const uint32_t nsum_pos, + const uint32_t bv_block_h_byte_div32, const uint32_t bv_block_len, + const BlockT* __restrict__ bv_nbit_bp, const uint8_t w_bit_len, const uint8_t val_bit_len, + const ResTableT* __restrict__ res_table +) { + + const int y = blockIdx.y * THREADS_NUM_H + threadIdx.y; + if (y >= H) return; + const int x = blockIdx.x * THREADS_NUM_W + threadIdx.x; + if (x >= W) return; + + if (CH_NUM >= 2) { // constexpr + bv_nbit_bp = (BlockT*)((uint8_t*)bv_nbit_bp + bv_block_h_byte_div32 * 32ull * blockIdx.z * (VAL_BIT_LEN >= 0 ? 
VAL_BIT_LEN : val_bit_len)); // TODO + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp + bv_block_h_byte_div32 * 32ull * blockIdx.z * w_bit_len); + } + + XYIdxT ya, yb, k; + XIdxT xa, xb; + if (CUT_BORDER) { // constexpr + ya = y; + xa = x; + yb = y + r * 2 + 1; + xb = x + r * 2 + 1; + k = (r * 2 + 1) * (r * 2 + 1) / 2; + } else { + ya = (y < r ? 0 : y - r); + xa = (x < r ? 0 : x - r); + yb = y + r + 1; if (yb > H) yb = H; + xb = x + r + 1; if (xb > W) xb = W; + k = XYIdxT(yb - ya) * (xb - xa) / 2; + } + ValT res = 0; + ya *= (CUT_BORDER ? W + 2 * r : W); + yb *= (CUT_BORDER ? W + 2 * r : W); + + for (int8_t h = (VAL_BIT_LEN >= 0 ? VAL_BIT_LEN : val_bit_len); h--; ) { + const XYIdxT top0 = WaveletMatrix2dCu5C_median2d_rank0(ya, bv_nbit_bp); + const XYIdxT bot0 = WaveletMatrix2dCu5C_median2d_rank0(yb, bv_nbit_bp); + XYIdxT l_ya_xa = top0; + XYIdxT l_yb_xa = bot0; + XYIdxT l_ya_xb = top0; + XYIdxT l_yb_xb = bot0; + XYIdxT d = 0; + for (int8_t j = w_bit_len; j--; ) { + const XYIdxT zeros = wm_nbit_bp[nsum_pos].nsum; + const XYIdxT l_ya_xa_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_ya_xa, wm_nbit_bp); + const XYIdxT l_ya_xb_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_ya_xb, wm_nbit_bp); + const XYIdxT l_yb_xb_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_yb_xb, wm_nbit_bp); + const XYIdxT l_yb_xa_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_yb_xa, wm_nbit_bp); + + if (((xa >> j) & 1) == 0) { + l_ya_xa = l_ya_xa_rank0; + l_yb_xa = l_yb_xa_rank0; + } else { + d += l_ya_xa_rank0; l_ya_xa += zeros - l_ya_xa_rank0; + d -= l_yb_xa_rank0; l_yb_xa += zeros - l_yb_xa_rank0; + } + if (((xb >> j) & 1) == 0) { + l_ya_xb = l_ya_xb_rank0; + l_yb_xb = l_yb_xb_rank0; + } else { + d -= l_ya_xb_rank0; l_ya_xb += zeros - l_ya_xb_rank0; + d += l_yb_xb_rank0; l_yb_xb += zeros - l_yb_xb_rank0; + } + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp - bv_block_h_byte_div32 * 32ull); + } + if (CH_NUM >= 2) { + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp - bv_block_h_byte_div32 * 32ull * w_bit_len * (CH_NUM - 1)); + } + const XYIdxT bv_h_zeros = bv_nbit_bp[nsum_pos].nsum; + if (k < d) { + ya = top0; + yb = bot0; + } else { + k -= d; + res |= (ValT)1 << h; + ya += bv_h_zeros - top0; + yb += bv_h_zeros - bot0; + } + bv_nbit_bp = (BlockT*)((uint8_t*)bv_nbit_bp - bv_block_h_byte_div32 * 32ull); + } + + + + + if constexpr(is_same::value) { + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM + blockIdx.z] = res; + } else if (CH_NUM == 1){ + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM] = res_table[res]; + } else { + const size_t offset = size_t(CUT_BORDER ? W + 2 * r : W) * (CUT_BORDER ? 
H + 2 * r : H) * blockIdx.z; + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM + blockIdx.z] = res_table[res + offset]; + } +} + + + + +template +struct WaveletMatrix2dCu5C { + static_assert(is_same() || is_same() || is_same(), "Supports 32, 16, or 8 bits only"); + static constexpr int MAX_BIT_LEN = 8 * sizeof(ValT); + + static constexpr uint32_t WSIZE = WORD_SIZE; + static constexpr int WARP_SIZE = 32; + using T_Type = ValT; + static constexpr int THREAD_PER_GRID = TH_NUM; + static constexpr int SRC_CACHE_DIV = 2; + static constexpr int MAX_BLOCK_X = MultiWaveletMatrixImpl::MAX_BLOCK_X; + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, "WORD_SIZE must be 32 or 64"); + using WordT = typename std::conditional::type; + + static_assert(MAX_BLOCK_X <= 1024, ""); + static_assert(TH_NUM == 1024 || TH_NUM == 512 || TH_NUM == 256 || TH_NUM == 128 || TH_NUM == 64 || TH_NUM == 32, ""); + static_assert(THREAD_PER_GRID == MultiWaveletMatrixImpl::THREAD_PER_GRID, ""); + + using BlockT = typename MultiWaveletMatrixImpl::BlockT; + using WarpWT = uint32_t; + using XIdxT = uint16_t; + using YIdxT = uint16_t; + using XYIdxT = uint32_t; + static constexpr int BLOCK_TYPE = 2; + using MultiWaveletMatrixImplClass = MultiWaveletMatrixImpl; + static_assert(is_same::value, ""); + static_assert(8 * sizeof(WarpWT) == WARP_SIZE, ""); + + int H, W; + XYIdxT size = 0; + MultiWaveletMatrixImpl WM; + XYIdxT bv_zeros[MAX_BIT_LEN]; + + int w_bit_len = 0; + int val_bit_len = 0; + static constexpr int wm_num = CH_NUM; + +private: + uint8_t* bv_block_nbit_and_nsum_base_cu = nullptr; // GPU mem + uint32_t bv_block_byte_div32; + uint32_t buf_byte_div32; + uint32_t nsum_scan_buf_len; + size_t input_buf_byte; +public: + ValT* src_cu = nullptr; // GPU mem + ValT* res_cu = nullptr; + size_t bv_block_len = 0; + size_t bv_chunk_len = 0; + +#if _MSC_VER >= 1920 || __INTEL_COMPILER + inline static int bitCount64(uint64_t bits) { + return (int)_mm_popcnt_u64(bits); + } +#else + inline static int bitCount64(uint64_t bits) { + return __builtin_popcountll(bits); + } +#endif + static constexpr int get_bit_len(uint64_t val) { + return ( + (val |= val >> 1), + (val |= val >> 2), + (val |= val >> 4), + (val |= val >> 8), + (val |= val >> 16), + (val |= val >> 32), + bitCount64(val)); + // val |= val >> 1; + // val |= val >> 2; + // val |= val >> 4; + // val |= val >> 8; + // val |= val >> 16; + // val |= val >> 32; + // return bitCount64(val); + } + + WaveletMatrix2dCu5C() { + reset(0, 0); + } + WaveletMatrix2dCu5C(const int rows, const int cols, const bool use_hw_bit_len = false, const bool alloc_res = true) { + reset(rows, cols, use_hw_bit_len, alloc_res); + } + + void reset(const int rows, const int cols, const bool use_hw_bit_len = false, const bool alloc_res = true) { + H = rows; + W = cols; + if (rows == 0 || cols == 0) return; + val_bit_len = (use_hw_bit_len ? get_bit_len((uint64_t)H * W - 1) : MAX_BIT_LEN); + assert(size == 0 && src_cu == nullptr); + + size = div_ceil((uint64_t)H * W, WORD_SIZE) * WORD_SIZE; + assert(W < 65535); // That is, less than 65534. 
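+        // Note: W < 65535 means W can be at most 65534. Column indexes are stored
+        // as XIdxT (uint16_t), and construct() below reserves
+        // inf = (1 << w_bit_len) - 1 as a sentinel, so every real x-index in
+        // [0, W-1] must stay below that sentinel (65535 when w_bit_len == 16).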
+ w_bit_len = get_bit_len(W); // w=7 [0,6] bl=3; w=8 [0,7] bl=4 + WM.reset(size, w_bit_len, val_bit_len * wm_num); + if (val_bit_len == 0) return; + + + bv_block_len = div_ceil(size, THREAD_PER_GRID) * THREAD_PER_GRID / WORD_SIZE + 1; + bv_block_len = div_ceil(bv_block_len, 8*2) * 8*2; + const size_t bv_block_byte = (sizeof(BlockT)) * val_bit_len * bv_block_len; + bv_block_byte_div32 = div_ceil(bv_block_byte, 32); + + cudaMalloc(&bv_block_nbit_and_nsum_base_cu, (size_t)(bv_block_byte_div32*32ull) * CH_NUM); + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + + const uint16_t block_pair_num = get_block_pair_num(); + nsum_scan_buf_len = div_ceil(size, (size_t)THREAD_PER_GRID * block_pair_num); + nsum_scan_buf_len = div_ceil(nsum_scan_buf_len, 4) * 4; + + const size_t buf_byte = + sizeof(XYIdxT) * 2 * nsum_scan_buf_len + + sizeof(XIdxT) * 2 * size + + sizeof(ValT) * (CH_NUM == 1 ? 1 : 2) * size; + buf_byte_div32 = div_ceil(buf_byte, 32); + + + input_buf_byte = sizeof(ValT) * size * CH_NUM; + cudaMalloc(&src_cu, (size_t)(buf_byte_div32*32ull) * CH_NUM + input_buf_byte); + if (src_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + + if (alloc_res) { + cudaMalloc(&res_cu, sizeof(ValT) * size * CH_NUM); + if (res_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + } + } + void release() { + size = 0; + if (src_cu != nullptr) cudaFree(src_cu); + if (bv_block_nbit_and_nsum_base_cu != nullptr) cudaFree(bv_block_nbit_and_nsum_base_cu); + if (res_cu != nullptr) cudaFree(res_cu); + src_cu = nullptr; + bv_block_nbit_and_nsum_base_cu = nullptr; + res_cu = nullptr; + } + ~WaveletMatrix2dCu5C() { release(); } + + BlockT* get_bv_block_cu(int h) const { return (BlockT*)(bv_block_nbit_and_nsum_base_cu + (bv_block_len * (sizeof(BlockT))) * h); } + + BlockT* get_bv_block_cu(int h, int c) const { return (BlockT*)((uint8_t*)get_bv_block_cu(h) + (size_t)c * (bv_block_byte_div32*32ull)); } + + + uint16_t get_block_pair_num() const { + return WM.get_block_pair_num() * MultiWaveletMatrixImpl::THREAD_PER_GRID / THREAD_PER_GRID; + } + std::pair get_nowcu_and_buf_byte_div32() { + ValT *now_cu = src_cu + (CH_NUM == 1 ? 0ull : size * (size_t)CH_NUM); + return make_pair(now_cu, buf_byte_div32); + } + + // Set data in src_cu before calling (data will be destroyed). Or set src_cu_const. 
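+    // construct() builds the wavelet matrix level by level, from the most
+    // significant value bit down to the least significant one. Each UpSweep pass
+    // stably partitions the values by the current bit (0-branch before 1-branch),
+    // records the level's bit-vector together with its rank-acceleration blocks,
+    // and the ExclusiveSum kernel turns the per-grid-block zero counts into the
+    // global write offsets used by the next level.
+    //
+    // Worked example for one level (illustrative only): 3-bit values {6,2,5,1},
+    // splitting on bit 2 (mask == 3, so "v <= mask" marks a 0-bit):
+    //   current bit   : 1 0 1 0          -> bit-vector for this level
+    //   stable split  : {2,1} then {6,5} -> order handed to the next level
+    //   zeros (nsum)  : 2                -> 1-branch writes start at offset 2,
+    //                                       and 6,5 are masked to 2,1 (v &= mask).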
+ void construct(const ValT *src_cu_const = nullptr, const cudaStream_t main_stream = 0, const bool src_unpacked = false) { + assert(size > 0 && src_cu != nullptr); + if (val_bit_len == 0) return; + if (src_cu == nullptr) { printf("Build Error: memory not alloced."); return;} + + const XIdxT inf = ((XIdxT)1u << w_bit_len) - 1; + assert(W <= inf); + assert(size % WORD_SIZE == 0); + + ValT mask = ((ValT)1u << val_bit_len) - 1; + + const uint16_t block_pair_num = get_block_pair_num(); + const int grid_x = div_ceil(size, THREAD_PER_GRID * block_pair_num); + if (grid_x > MAX_BLOCK_X) { printf("over grid_x %d\n", grid_x); exit(1); } + + const dim3 grid(grid_x, wm_num); + const dim3 thread(WARP_SIZE, THREAD_PER_GRID / WARP_SIZE); + const XYIdxT size_div_w = size / WORD_SIZE; + const XYIdxT size_div_warp = size / WARP_SIZE; + assert(size % WARP_SIZE == 0); + constexpr int ThreadsDimY = THREAD_PER_GRID / WARP_SIZE; + + +#define CALC_SRCB_SIZE(SrcT) (0) + constexpr int SRCB_S_8 = CALC_SRCB_SIZE(uint8_t); + constexpr int SRCB_S_16 = CALC_SRCB_SIZE(uint16_t); + constexpr int SRCB_S_32 = CALC_SRCB_SIZE(uint32_t); +#undef CALC_SRCB_SIZE + { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_8); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_16); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint16_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_16); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint16_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_32); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint32_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_32); + } + + { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint8_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint16_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint16_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint32_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } + + const uint32_t nsum_pos = get_nsum_pos(); + + ValT *now_cu = src_cu + (CH_NUM == 1 ? 0ull : size * (size_t)CH_NUM); + ValT *nxt_cu = now_cu + size; + XYIdxT *nsum_buf_test = (XYIdxT*)(nxt_cu + size); + XYIdxT *nsum_buf_test2 = nsum_buf_test + nsum_scan_buf_len; + XIdxT *buf_idx = (XIdxT*)(nsum_buf_test2 + nsum_scan_buf_len); + XIdxT *nxt_idx = (XIdxT*)(buf_idx + size); + + + const int val_bit_len_m1 = val_bit_len - 1; + int h = val_bit_len_m1; + if (src_unpacked == true) { + if (src_cu_const != nullptr) { + printf("[Error!] not support. 
%s:%d\n", __FILE__, __LINE__); + exit(-1); + } + WaveletMatrix2dCu5C_first_gpu_multi_srcunpacked <<>> (ValT(mask / 2),block_pair_num, size_div_warp, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H); + } else if (src_cu_const == nullptr) { + WaveletMatrix2dCu5C_first_gpu_multi <<>> (ValT(mask / 2),block_pair_num, size_div_warp, src_cu, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H, src_cu_const); + } else { + WaveletMatrix2dCu5C_first_gpu_multi <<>> (ValT(mask / 2),block_pair_num, size_div_warp, src_cu, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H, src_cu_const); + } + BlockT * nsum_p = get_bv_block_cu(h) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test, nsum_buf_test2, nsum_p, buf_byte_div32, bv_block_byte_div32); + + const XYIdxT cwm_buf_byte_div32 = WM.get_buf_byte_div32(); + + if constexpr (sizeof(ValT) >= 4) for (; h > 16; --h) { + using SrcT = uint32_t; + using DstT = uint32_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + if constexpr (sizeof(ValT) >= 4) if (h == 16 || (is_same::value && h >= 0)) do { + using SrcT = uint32_t; + using DstT = uint16_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + --h; + } while(0); + if constexpr (sizeof(ValT) >= 2) for (; h > 8; --h) { + using SrcT = uint16_t; + using DstT = uint16_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? 
nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + if constexpr (sizeof(ValT) >= 2) if (h == 8 || (is_same::value && h >= 0)) do { + using SrcT = uint16_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + --h; + } while(0); + for (; h >= 0; --h) { + using SrcT = uint8_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? 
nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + { + const int h = 0; + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(h * CH_NUM); + XIdxT* cwm = WM.get_src_p(h * CH_NUM); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(h); + + WaveletMatrix2dCu5C_last_gpu <<>> (block_pair_num, size_div_w, buf_byte_div32, nxt_idx, inf, cwm, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h, bv_block_byte_div32); + } + WM.construct(main_stream, false); + } + + XYIdxT get_nsum_pos() const { + const XYIdxT size_div_w = size / WORD_SIZE; + return size_div_w; + } + + template + void median2d(const int r, const ResTableT* res_table = nullptr) { + median2d(r, -1, res_table); + } + + template + void median2d(const int r, int res_step_num = -1, const ResTableT* res_table = nullptr, const cudaStream_t main_stream = 0) { + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("Median2d Error: memory not alloced."); return;} + if (is_same::value == false && res_table == nullptr) {printf("Median2d Error: res_table is null."); return;} + static_assert(is_same::value || (sizeof(ResTableT) <= sizeof(ValT)), ""); + + static_assert(TH_W * TH_H <= 1024, "max number of threads in block"); + + if (res_step_num < 0) res_step_num = W * CH_NUM; + + constexpr int THREADS_NUM_W = TH_W; + const dim3 thread(THREADS_NUM_W, TH_H); + const dim3 grid(div_ceil((CUT_BORDER ? W - 2 * r: W), THREADS_NUM_W), div_ceil((CUT_BORDER ? H - 2 * r : H), TH_H), CH_NUM); + + + const uint32_t bv_nsum_pos = get_nsum_pos(); + const BlockT* bv_bv_block_nbit_cu_first = get_bv_block_cu(val_bit_len - 1); + + const BlockT* wm_bv_block_nbit_cu_first = WM.get_bv_block_cu(w_bit_len - 1, (val_bit_len - 1) * CH_NUM); // + const uint32_t nsum_pos = WM.get_nsum_pos(); + const uint64_t wm_bv_block_byte = WM.get_bv_block_byte(); + + if (bv_nsum_pos != nsum_pos) { printf("err! line %d", __LINE__); exit(-1); } + if (WM.get_bv_block_byte() != WM.get_bv_block_h_byte_div32() * 32ull * w_bit_len) { printf("err! line %d", __LINE__); exit(-1); } + + if (bv_block_len != WM.bv_block_len) {printf("bv_block_len error!\n"); exit(1);} + + using ResT = typename std::conditional::value, ValT, ResTableT>::type; + + const int Wc = (CUT_BORDER ? W - 2 * r : W); + const int Hc = (CUT_BORDER ? H - 2 * r : H); + + constexpr int VAL_BIT_LEN = (sizeof(ValT) < 4) ? 
MAX_BIT_LEN : -1;
+        // (Assumed reconstruction: the explicit template arguments and the launch
+        // configuration were lost in extraction; the non-deducible parameters are
+        // listed in the kernel's declaration order.)
+        WaveletMatrix2dCu5C_median2d_cu<THREADS_NUM_W, TH_H, CH_NUM, VAL_BIT_LEN, CUT_BORDER, ValT, XYIdxT, XIdxT> <<<grid, thread, 0, main_stream>>>
+            (Hc, Wc, res_step_num, r, (ResT*)res_cu, wm_bv_block_nbit_cu_first, nsum_pos, WM.get_bv_block_h_byte_div32(), bv_block_len,
+            bv_bv_block_nbit_cu_first, w_bit_len, val_bit_len, res_table);
+    }
+
+    template <typename ResTableT>
+    vector<vector<ResTableT>> get_res() {
+        static_assert(sizeof(ResTableT) <= sizeof(ValT), "");
+        auto res = vector<vector<ResTableT>>(H, vector<ResTableT>(W));
+        if (res_cu == nullptr) { printf("get_res Error: memory not alloced."); return res;}
+
+        for (int i = 0; i < H; ++i) {
+            cudaMemcpy(res[i].data(), res_cu + (XYIdxT)W * i, W * sizeof(ResTableT), cudaMemcpyDeviceToHost);
+        }
+        return res;
+    }
+};
+
+} // end namespace wavelet_matrix_median
+}}} //end namespace cv::cuda::device
+
+#endif
+#endif // __OPENCV_WAVELET_MATRIX_2D_CUH__
diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h b/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h
new file mode 100644
index 00000000000..6c47cc5e1a4
--- /dev/null
+++ b/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h
@@ -0,0 +1,82 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__
+#define __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__
+
+#ifdef HAVE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif // HAVE_CUDA
+
+
+// The CUB library is used for the Median Filter with Wavelet Matrix;
+// CUB has been bundled with the CUDA Toolkit since CUDA 11.
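+//
+// Usage sketch (this is how the other files in this patch consume the header):
+//
+//   #include "wavelet_matrix_feature_support_checks.h"
+//   #ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+//   /* CUB- and `if constexpr`-based wavelet matrix implementation */
+//   #endif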
+#if CUDA_VERSION >= 11000 || CUDART_VERSION >= 11000
+
+
+// Check that `if constexpr` is available.
+
+// GCC supports `if constexpr` since 7.1
+#if defined(__GNUC__) && (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 1))
+#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#endif
+
+// clang supports `if constexpr` since 5.0
+#if defined(__clang__) && (__clang_major__ >= 5)
+#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#endif
+
+
+// Visual Studio supports `if constexpr` since Visual Studio 2019 (16.1.2)
+#if defined(_MSC_VER) && _MSC_VER >= 1921
+#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#endif
+
+
+// Confirmed to work with Intel C++ Compiler 2021.1.2; it did not work with icc 19.0.1.
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER > 202101 || (__INTEL_COMPILER == 202101 && __INTEL_COMPILER_UPDATE >= 2))
+#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#endif
+
+#endif // CUDA_VERSION
+#endif // __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__
diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh
new file mode 100644
index 00000000000..3a21a3c7082
--- /dev/null
+++ b/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh
@@ -0,0 +1,227 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ +#define __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. +#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { +namespace WMMedianFloatSupporter { + +template +__global__ void iota_idx1(IdxT *idx_in_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + idx_in_cu[i] = i; +} + +template +__global__ void split_and_iota_idx(IdxT *idx_in_cu, const ValT* val_in_cu, ValT* val_out_cu, const IdxT hw) { + const size_t i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + + static_assert(is_same::value, ""); + // static_assert(2 <= CH_NUM && CH_NUM <= 4); + using SrcTU = std::conditional_t>; + + const SrcTU *src_u = (SrcTU*)val_in_cu; + const SrcTU src_uv = src_u[i]; + + if (CH_NUM >= 1) { // constexpr + val_out_cu[i] = src_uv.x; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 2) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.y; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 3) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.z; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 4) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.w; + idx_in_cu[i] = i; + } +} + +template +__global__ void set_wm_val_1(IdxT *wm_src_p, const IdxT *idx_out_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + const IdxT j = idx_out_cu[i]; + wm_src_p[j] = i; +} + +template +__global__ void set_wm_val(IdxT *wm_src_p, const IdxT *idx_out_cu, const IdxT hw, const IdxT buf_byte_div32) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + const size_t hwc = size_t(hw) * blockIdx.y; + const IdxT j = idx_out_cu[i + hwc]; + const size_t src_offset = buf_byte_div32 * 32ull / sizeof(IdxT) * blockIdx.y; + wm_src_p[src_offset + j] = i; +} + +template +__global__ void conv_res_cu(ValT *dst, const ValT *val_out_cu, const IdxT *res_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + + const IdxT r = res_cu[i]; + dst[i] = val_out_cu[r]; +} + +template +struct WMMedianFloatSupporter { + constexpr static int blockDim = 512; + int h = 0, w = 0; + int hw_bit_len = -1; + WMMedianFloatSupporter(){}; + WMMedianFloatSupporter(int h, int w) { reset(h, w); } + ~WMMedianFloatSupporter(){ + free(); + } + ValT *val_in_cu = nullptr; + IdxT *idx_in_cu = nullptr; + ValT *val_out_cu = nullptr; + IdxT *idx_out_cu = nullptr; + void *cub_temp_storage = nullptr; + size_t cub_temp_storage_bytes; + // set: val_in + // get: val_out + // 1ch + // [val_in][idx_in][val_out][idx_out][cub_temp] + // C ch + + // [val_in][......][......][......] 
+ // AaBbCcDd + // [^^^^^^][......][valin2][idxin2] + // ABCDabcd01230123 + + // [val0in][val1in][idx0in][idx1in][val0out][val1out][idx0out][idx1out][cub_temp][d_offsets] + + void reset(const int H, const int W) { + h = H; w = W; + free(); + } + void alloc(){ + const size_t hwc = size_t(CH_NUM) * h * w; + if (CH_NUM == 1) { // constexpr + cub::DeviceRadixSort::SortPairs( + nullptr, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hwc); + cudaMalloc(&val_in_cu, 2ull * hwc * (sizeof(ValT) + sizeof(IdxT)) + cub_temp_storage_bytes); + } else { + cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hwc, CH_NUM, (int*)nullptr, (int*)nullptr); + const size_t offsets_arr_size = (CH_NUM + 1) * sizeof(int); + cudaMalloc(&val_in_cu, 2ull * hwc * (sizeof(ValT) + sizeof(IdxT)) + cub_temp_storage_bytes + offsets_arr_size); + } + idx_in_cu = (IdxT*)(val_in_cu + hwc); + val_out_cu = (ValT*)(idx_in_cu + hwc); + idx_out_cu = (IdxT*)(val_out_cu + hwc); + int *d_offsets = (int*)(idx_out_cu + hwc); + cub_temp_storage = d_offsets + (CH_NUM + 1); + } + void free() { + if (val_in_cu != nullptr) { + cudaFree(val_in_cu); + } + } + void sort_and_set(IdxT *wm_src_p, const IdxT buf_byte_div32 = 0){ + const IdxT hw = h * w; + const size_t hwc = size_t(CH_NUM) * hw; + const dim3 gridDim((hw + blockDim - 1) / blockDim, CH_NUM); + + if (CH_NUM == 1) { // constexpr + iota_idx1<<>>(idx_in_cu, hw); + cub::DeviceRadixSort::SortPairs( + cub_temp_storage, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hw); + set_wm_val_1<<>>(wm_src_p, idx_out_cu, hw); + } else { + auto idx2 = idx_out_cu; + auto val2 = val_out_cu; + auto idx3 = idx_in_cu; + auto val3 = val_in_cu; + split_and_iota_idx<<>>(idx2, val_in_cu, val2, hw); + + int h_offsets[CH_NUM + 1]; + for (size_t i = 0; i <= CH_NUM; ++i) h_offsets[i] = i * hw; + int *d_offsets = (int*)(idx_out_cu + hwc); + cudaMemcpy(d_offsets, h_offsets, (CH_NUM + 1) * sizeof(int), cudaMemcpyHostToDevice); + + + cub::DeviceSegmentedRadixSort::SortPairs( + cub_temp_storage, cub_temp_storage_bytes, val2, val3, idx2, idx3, hwc, CH_NUM, d_offsets, d_offsets + 1); + set_wm_val<<>>(wm_src_p, idx3, hw, buf_byte_div32); + } + for(hw_bit_len = 1; ; ++hw_bit_len) { + if ((1ull << hw_bit_len) >= hw) { + break; + } + } + } + const ValT* get_res_table() const { + if (CH_NUM == 1) { // constexpr + return val_out_cu; + } else { + return val_in_cu; + } + } +}; +} // end namespace WMMedianFloatSupporter +} // end namespace wavelet_matrix_median + +}}} //end namespace cv::cuda::device +#endif +#endif // __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh new file mode 100644 index 00000000000..de59d75993a --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh @@ -0,0 +1,636 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_WAVELET_MATRIX_MULTI_CUH__
+#define __OPENCV_WAVELET_MATRIX_MULTI_CUH__
+
+// The CUB library is used for the Median Filter with Wavelet Matrix;
+// CUB has been bundled with the CUDA Toolkit since CUDA 11.
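+//
+// The kernels below build bit-vectors with rank acceleration: each BlockT packs
+// one machine word of bits (nbit) together with a running population count
+// (nsum). A minimal sketch of the resulting O(1) rank query, mirroring
+// WaveletMatrix2dCu5C_median2d_rank0 in wavelet_matrix_2d.cuh (32-bit words):
+//
+//   __device__ uint32_t rank0(uint32_t i, const BlockT* bv) {
+//       const BlockT b = bv[i / 32];                     // block containing bit i
+//       return b.nsum                                    // set bits before this block
+//            + __popc(b.nbit & ((1u << (i % 32)) - 1u)); // set bits below i within it
+//   }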
+#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +#include +#include +#include +#include +#include +#include "opencv2/core/cuda/warp_shuffle.hpp" + + +#include + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { + using std::vector; + using namespace std; + + template + constexpr T div_ceil(T a, T b) { + return (a + b - 1) / b; + } + + +template +__global__ void WaveletMatrixMultiCu4G_UpSweep_gpu(const SrcT mask, const uint16_t block_pair_num, const IdxType size_div_w, const SrcT* __restrict__ src, DstT* __restrict__ dst, BlockT* __restrict__ nbit_bp, const IdxType* __restrict__ nsum_zeros_buf, IdxType* __restrict__ nsum_zeros_buf2, const uint32_t bv_block_byte_div32, const uint32_t buf_byte_div32) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + src = (SrcT*)((uint8_t*)src + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + dst = (DstT*)((uint8_t*)dst + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_zeros_buf = (IdxType*)((uint8_t*)nsum_zeros_buf + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_zeros_buf2 = (IdxType*)((uint8_t*)nsum_zeros_buf2 + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nbit_bp = (BlockT*)((uint8_t*)nbit_bp + (size_t)blockIdx.y * (bv_block_byte_div32*32ull)); // TODO: rename + + using WarpScan = cub::WarpScan; + using WarpScanY = cub::WarpScan; + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + constexpr size_t shmem_size = sizeof(SrcT) * (ThreadsDimY * (WARP_SIZE - 1) * WARP_SIZE); + static_assert(SRCB_S == shmem_size, ""); + static_assert(SRCB_S + DSTB_S < 64 * 1024, ""); + + constexpr int DST_BUF_SIZE = DSTB_S; + constexpr int DST_BUF_NUM_PER_WARP = DST_BUF_SIZE / (ThreadsDimY * sizeof(DstT)); // [32k/32/2=512] [48k/8/1=6114] + constexpr int DST_BUF_NUM_PER_THREAD = DST_BUF_NUM_PER_WARP / WARP_SIZE; + static_assert(DST_BUF_NUM_PER_THREAD <= WARP_SIZE, ""); + + + extern __shared__ uint8_t shmem_base[]; + SrcT* __restrict__ src_val_cache = (SrcT*)shmem_base; + DstT* __restrict__ dst_buf = (DstT*)&src_val_cache[SRCB_S] + threadIdx.y * DST_BUF_NUM_PER_WARP; //[ThreadsDimY][DST_BUF_NUM_PER_WARP]; + + __shared__ uint4 nsum_count_sh[ThreadsDimY]; + __shared__ IdxType pre_sum_share[2]; + __shared__ IdxType warp_scan_sums[WARP_SIZE]; + __shared__ typename WarpScan::TempStorage s_scanStorage; + __shared__ typename WarpScanY::TempStorage s_scanStorage2; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + const IdxType size_div_warp = size_div_w * WORD_DIV_WARP; + const IdxType nsum = nbit_bp[size_div_w].nsum; + const IdxType nsum_offset = nsum_zeros_buf[blockIdx.x]; + + + IdxType nsum_idx0_org = nsum_offset; + IdxType nsum_idx1_org = (IdxType)blockIdx.x * block_pair_num * THREAD_PER_GRID + nsum - nsum_idx0_org; + nsum_idx0_org /= (IdxType)block_pair_num * ThreadsDimY * WARP_SIZE; + nsum_idx1_org /= (IdxType)block_pair_num * ThreadsDimY * WARP_SIZE; + const IdxType nsum_idx0_bound = (nsum_idx0_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + const IdxType 
nsum_idx1_bound = (nsum_idx1_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + uint4 nsum_count = make_uint4(0, 0, 0, 0); + + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + if (th_idx == 0) { + pre_sum_share[0] = nsum_offset; + } + + + for (IdxType ka = 0; ka < block_pair_num; ka += WARP_SIZE) { + const IdxType ibb = ((IdxType)blockIdx.x * block_pair_num + ka) * ThreadsDimY; + if (ibb >= size_div_warp) break; + WarpWT my_bits = 0; + SrcT first_val; + + const IdxType src_val_cache_offset = IdxType(threadIdx.y * (WARP_SIZE - 1) - 1) * WARP_SIZE + threadIdx.x; + for (IdxType kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + WarpWT bits; + const SrcT v = src[i * WARP_SIZE + threadIdx.x]; + if (kb == 0) { + first_val = v; + } else { + src_val_cache[src_val_cache_offset + kb * WARP_SIZE] = v; + } + if (v <= mask) { + bits = __activemask(); + } else { + bits = ~__activemask(); + } + if (threadIdx.x == kb) { + my_bits = bits; + } + } + IdxType t, c = __popc(my_bits); + WarpScan(s_scanStorage).ExclusiveSum(c, t); + + if (threadIdx.x == WARP_SIZE - 1) { + warp_scan_sums[threadIdx.y] = c + t; + } + __syncthreads(); + IdxType pre_sum = pre_sum_share[(ka & WARP_SIZE) > 0]; + IdxType s = threadIdx.x < ThreadsDimY ? warp_scan_sums[threadIdx.x] : 0; + WarpScanY(s_scanStorage2).ExclusiveSum(s, s); + + s = cv::cuda::device::shfl(s, threadIdx.y, WARP_SIZE); + + s += t + pre_sum; + if (th_idx == THREAD_PER_GRID - 1) { + pre_sum_share[(ka & WARP_SIZE) == 0] = s + c; + } + const IdxType bi = ibb + th_idx; + + if (bi < size_div_warp) { + static_assert(WORD_SIZE == 32, ""); + nbit_bp[bi] = BlockT{s, my_bits}; + } + if (mask == 0) continue; + + const SrcT mask_2 = mask >> 1; + SrcT vo = first_val; + for (IdxType j = 0, i = ibb + WARP_SIZE * threadIdx.y; j < WARP_SIZE;) { + if (i >= size_div_warp) break; + + + IdxType idx0_begin, idx0_num, idx1_offset, idx01_num, idx1_offset0; + if (DST_BUF_SIZE > 0) { // constexpr + IdxType idx1_begin, idx0_end; + const IdxType ib = ::min(size_div_warp, i + DST_BUF_NUM_PER_THREAD); + const IdxType jb = j + ib - i - 1; + idx0_begin = cv::cuda::device::shfl(s, j, WARP_SIZE); + idx1_begin = i * WARP_SIZE + nsum - idx0_begin; + idx0_end = cv::cuda::device::shfl(s + c, jb, WARP_SIZE); + + idx0_num = idx0_end - idx0_begin; + idx1_offset = idx1_begin - idx0_num; + idx01_num = (ib - i) * WARP_SIZE; + idx1_offset0 = nsum - idx1_begin + idx0_num; + } + constexpr int DST_LOOP_NUM = (DST_BUF_SIZE == 0 ? 
1 : DST_BUF_NUM_PER_THREAD); + for (IdxType kb = 0; kb < DST_LOOP_NUM; ++kb, ++j, ++i) { + if (i >= size_div_warp) break; + + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const IdxType e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + IdxType rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const IdxType idx0 = e_nsum + rank; + + DstT v = (DstT)vo; + IdxType idx; + IdxType buf_idx; + if (vo > mask) { // 1 + const IdxType ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx0; + v &= mask; + buf_idx = ij - idx0 + idx1_offset0; + } else { + idx = idx0; + buf_idx = idx0 - idx0_begin; + } + if (DST_BUF_SIZE == 0) { + dst[idx] = v; + } else { + dst_buf[buf_idx] = (DstT)v; + } + + if (v <= mask_2) { + if (vo <= mask) { + if (idx < nsum_idx0_bound) { + nsum_count.x++; + } else { + assert(idx < nsum_idx0_bound + block_pair_num * ThreadsDimY * WARP_SIZE); + nsum_count.y++; + } + } else { + if (idx < nsum_idx1_bound) { + nsum_count.z++; + } else { + assert(idx < nsum_idx1_bound + block_pair_num * ThreadsDimY * WARP_SIZE); + nsum_count.w++; + } + } + } + if (j == WARP_SIZE - 1) { j = WARP_SIZE; break; } + vo = src_val_cache[(threadIdx.y * (WARP_SIZE - 1) + j) * WARP_SIZE + threadIdx.x]; + } + if (DST_BUF_SIZE > 0) { // constexpr + for (IdxType j = threadIdx.x; (int)j < DST_BUF_NUM_PER_WARP; j += WARP_SIZE) { + if (j >= idx01_num) break; + IdxType idx; + if (j < idx0_num) { // 0 + idx = j + idx0_begin; + } else { // 1 + idx = j + idx1_offset; + } + dst[idx] = dst_buf[j]; + } + } + } + } + + if (blockIdx.x == gridDim.x - 1 && th_idx == 0) { + nbit_bp[size_div_warp / WORD_DIV_WARP].nsum = nsum; + } + if (mask == 0) return; + + nsum_count.x = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.x); + nsum_count.y = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.y); + nsum_count.z = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.z); + nsum_count.w = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.w); + if (threadIdx.x == 0) { + nsum_count_sh[threadIdx.y] = nsum_count; + } + __syncthreads(); + + + if (threadIdx.x < ThreadsDimY) { + nsum_count = nsum_count_sh[threadIdx.x]; + nsum_count.x = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.x); + nsum_count.y = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.y); + nsum_count.z = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.z); + nsum_count.w = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.w); + if (th_idx == 0) { + const IdxType nsum_idx0_org = nsum_idx0_bound / ((IdxType)block_pair_num * ThreadsDimY * WARP_SIZE); + const IdxType nsum_idx1_org = nsum_idx1_bound / ((IdxType)block_pair_num * ThreadsDimY * WARP_SIZE); + if (nsum_count.x > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx0_org - 1, nsum_count.x); + if (nsum_count.y > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx0_org - 0, nsum_count.y); + if (nsum_count.z > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx1_org - 1, nsum_count.z); + if (nsum_count.w > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx1_org - 0, nsum_count.w); + } + } +} + + +template +__global__ void WaveletMatrixMultiCu4G_ExclusiveSum(IdxType* __restrict__ nsum_scan_buf, IdxType* __restrict__ nsum_zeros_buf2, BlockT* __restrict__ nsum_p, const uint32_t buf_byte_div32, const uint32_t bv_block_byte_div32) { + + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + IdxType thread_data1; + IdxType thread_data2; + nsum_scan_buf = (IdxType*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.x * 
(buf_byte_div32*32ull)); + nsum_zeros_buf2 = (IdxType*)((uint8_t*)nsum_zeros_buf2 + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + + thread_data1 = nsum_scan_buf[threadIdx.x]; + BlockScan(temp_storage).ExclusiveSum(thread_data1, thread_data2); + + nsum_scan_buf[threadIdx.x] = thread_data2; + nsum_zeros_buf2[threadIdx.x] = 0; + + if (threadIdx.x == blockDim.x - 1) { + thread_data2 += thread_data1; + nsum_p = (BlockT*)((uint8_t*)nsum_p + (size_t)blockIdx.x * (bv_block_byte_div32*32ull)); + nsum_p->nsum = thread_data2; + } +} + + +template +__global__ void WaveletMatrixMultiCu4G_first_gpu(const SrcT mask, uint16_t block_pair_num, const IdxType size_div_warp, const SrcT* __restrict__ src, IdxType* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + src = (SrcT*)((uint8_t*)src + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + IdxType cs = 0; + IdxType ibb = (IdxType)blockIdx.x * block_pair_num * ThreadsDimY; + for (IdxType ka = 0; ka < block_pair_num; ka += WARP_SIZE, ibb += THREAD_PER_GRID) { + for (IdxType kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + const SrcT v = src[i * WARP_SIZE + threadIdx.x]; + if (v <= mask) { + ++cs; + } + } + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage s_reduceStorage; + IdxType reducedValue = BlockReduce(s_reduceStorage).Sum(cs); + + if (threadIdx.y == 0 && threadIdx.x == 0) { + nsum_scan_buf = (IdxType*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_scan_buf[blockIdx.x] = reducedValue; + } +} + + + +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_Z(uint32_t r) { + return (r >= MIN_DSTBUF_KB) ? r * 1024 : 0; +} + +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_B(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_Z((r + 1) / 2); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A16(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_B(r | (r >> 16)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A8(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A16(r | (r >> 8)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A4(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A8(r | (r >> 4)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A2(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A4(r | (r >> 2)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A1(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A2(r | (r >> 1)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb(int SRCB_S) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A1(SHMEM_USE_KB - (SRCB_S + 1023) / 1024); +} + +// template +// constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb(int SRCB_S) { +// uint32_t r = (SHMEM_USE_KB - (SRCB_S + 1023) / 1024); +// r |= r >> 1; +// r |= r >> 2; +// r |= r >> 4; +// r |= r >> 8; +// r |= r >> 16; +// r = (r + 1) / 2; +// return (r >= MIN_DSTBUF_KB) ? 
r * 1024 : 0; +// } + + +template +struct WaveletMatrixMultiCu4G { + static constexpr int MAX_BIT_LEN = 8 * sizeof(T); + + static constexpr uint32_t WSIZE = WORD_SIZE; + using T_Type = T; + static constexpr int WARP_SIZE = 32; + static constexpr int THREAD_PER_GRID = TH_NUM; + static constexpr int MAX_BLOCK_X = 1024; + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, "WORD SIZE must be 32 or 64"); + using WordT = typename std::conditional::type; + + static constexpr int SHMEM_USE_KB = 64; + + struct __align__(8) BLOCK32_T { uint32_t nsum; union { uint32_t nbit; uint32_t nbit_a[1];}; }; + struct __align__(4) BLOCK64_T { uint32_t nsum; union { uint64_t nbit; uint32_t nbit_a[2];}; }; + using BlockT = typename std::conditional::type; + + static constexpr int MIN_DSTBUF_KB = 4; + static constexpr int BLOCK_TYPE = 2; + using WarpWT = uint32_t; + static_assert(8 * sizeof(WarpWT) == WARP_SIZE, "bits of WarpWT must be WARP_SIZE"); + + IdxType size = 0; + int wm_num = 0; + +private: + T* src_cu = nullptr; + uint8_t* bv_block_nbit_and_nsum_base_cu = nullptr; + uint32_t bv_block_byte_div32; + uint32_t buf_byte_div32; +public: + size_t bv_block_len = 0; + IdxType bv_zeros[MAX_BIT_LEN]; + int bit_len = 0; + + WaveletMatrixMultiCu4G(IdxType _n = 0, int _bit_len = 0, int num = 0) { + reset(_n, _bit_len, num); + } + void reset(IdxType _n, int _bit_len, int _num) { + cudaError_t err; + assert(size == 0 && src_cu == nullptr && 0 <= _bit_len && _bit_len <= MAX_BIT_LEN); + bit_len = _bit_len; + wm_num = _num; + if (_n == 0 || wm_num == 0) return; + size = div_ceil(_n, WORD_SIZE) * WORD_SIZE; + bv_block_len = div_ceil(size, THREAD_PER_GRID) * THREAD_PER_GRID / WORD_SIZE + 1; + bv_block_len = div_ceil(bv_block_len, 8*2) * 8*2; + + const size_t bv_block_byte = (sizeof(BlockT)) * bit_len * bv_block_len; + if (bv_block_byte % 32 != 0) { printf("bv_block_byte not 32n!"); exit(-1); } + bv_block_byte_div32 = div_ceil(bv_block_byte, 32); + + err = cudaMalloc(&bv_block_nbit_and_nsum_base_cu, (size_t)(bv_block_byte_div32*32ull) * _num); + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); release(); return; } + + const uint16_t block_pair_num = get_block_pair_num(); + const IdxType nsum_scan_buf_len = div_ceil(size, (size_t)THREAD_PER_GRID * block_pair_num); + + const size_t buf_byte = sizeof(IdxType) * 2 * nsum_scan_buf_len + sizeof(T) * size * 2; + buf_byte_div32 = div_ceil(buf_byte, 32); + err = cudaMalloc(&src_cu, (size_t)(buf_byte_div32*32ull) * _num); + if (src_cu == nullptr) { printf("GPU Memory Alloc Error! 
%s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); release(); return; } + + } + void release() { + size = 0; + if (src_cu != nullptr) cudaFree(src_cu); + if (bv_block_nbit_and_nsum_base_cu != nullptr) cudaFree(bv_block_nbit_and_nsum_base_cu); + src_cu = nullptr; + bv_block_nbit_and_nsum_base_cu = nullptr; + } + ~WaveletMatrixMultiCu4G() { release(); } + + BlockT* get_bv_block_cu(int h) const { return (BlockT*)(bv_block_nbit_and_nsum_base_cu + (sizeof(BlockT)) * bv_block_len * h); } + BlockT* get_bv_block_cu(int h, int c) const { return (BlockT*)((uint8_t*)get_bv_block_cu(h) + (size_t)c * (bv_block_byte_div32*32ull)); } + uint64_t get_bv_block_byte() const { return (bv_block_byte_div32*32ull); } + + + T* get_src_p(int c) const { return src_cu + (size_t)(buf_byte_div32*32ull) / (sizeof(T)) * c; } + + uint16_t get_block_pair_num() const { + constexpr int x_chunk = 65536 / THREAD_PER_GRID / WARP_SIZE; // To make the pixels assigned per grid a multiple of 65536. + static_assert(x_chunk > 0, ""); + const uint64_t total_gridx = div_ceil(size, THREAD_PER_GRID * WARP_SIZE); + uint64_t block_pair_num_org = div_ceil(total_gridx, MAX_BLOCK_X); + if (block_pair_num_org <= x_chunk) { + block_pair_num_org--; + block_pair_num_org |= block_pair_num_org >> 1; + block_pair_num_org |= block_pair_num_org >> 2; + block_pair_num_org |= block_pair_num_org >> 4; + block_pair_num_org |= block_pair_num_org >> 8; + block_pair_num_org++; + } else { + block_pair_num_org = div_ceil(block_pair_num_org, x_chunk) * x_chunk; + } + block_pair_num_org *= WARP_SIZE; + + if (block_pair_num_org >= (1LL << (8 * sizeof(uint16_t)))) { printf("over block_pair_num %ld\n", block_pair_num_org); exit(1); } + return (uint16_t)block_pair_num_org; + } + std::pair get_nsum_buf_and_buf_byte() const { + IdxType* nsum_buf = (IdxType*)(src_cu + 2ull * size); + return { nsum_buf, (buf_byte_div32*32ull) }; + } + IdxType* get_nsum_buf(int c) const { + IdxType* nsum_buf = (IdxType*)(src_cu + 2ull * size); + return (IdxType*)((uint8_t*)nsum_buf + (size_t)(buf_byte_div32*32ull) * c); + } + uint64_t get_buf_byte() const { return (buf_byte_div32*32ull); } + uint32_t get_buf_byte_div32() const { return buf_byte_div32; } + + IdxType get_nsum_scan_buf_len(uint16_t block_pair_num) const { + return div_ceil(size, THREAD_PER_GRID * block_pair_num); + } + IdxType get_nsum_scan_buf_len() const { + const uint16_t block_pair_num = get_block_pair_num(); + return get_nsum_scan_buf_len(block_pair_num); + } + + // Set data in src_cu before calling (data will be destroyed). 
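+    // Construction proceeds plane by plane: each UpSweep pass stable-partitions
+    // the values by the current bit (h = bit_len - 1 down to 0) and records the
+    // bit vector plus prefix counts for that plane. Once the remaining mask fits
+    // in 8 bits, the ping-pong buffers switch from T to uint8_t (see the h == 8
+    // branch below) to reduce global-memory traffic on the low planes.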
+ void construct(const cudaStream_t main_stream = 0, const bool run_first = true) { + assert(size > 0 && src_cu != nullptr); + if (size == 0 || wm_num == 0) return; + if (src_cu == nullptr) { printf("Build Error: memory not alloced."); return;} + + T mask = ((T)1 << bit_len) - 1; + + const uint16_t block_pair_num = get_block_pair_num(); + const int grid_x = div_ceil(size, THREAD_PER_GRID * block_pair_num); + if (grid_x > MAX_BLOCK_X) { printf("over grid_x %d\n", grid_x); exit(1); } + + + const dim3 grid(grid_x, wm_num); + const dim3 thread(WARP_SIZE, THREAD_PER_GRID / WARP_SIZE); + const IdxType size_div_w = size / WORD_SIZE; + const IdxType size_div_warp = size / WARP_SIZE; + assert(size % WARP_SIZE == 0); + constexpr int ThreadsDimY = THREAD_PER_GRID / WARP_SIZE; + + + const int nsum_scan_buf_len = get_nsum_scan_buf_len(block_pair_num); // same grid_x + + +#define CALC_SRCB_SIZE(SrcT) (sizeof(SrcT) * (ThreadsDimY * (WARP_SIZE - 1) * WARP_SIZE)) + constexpr int SRCB_S_T = CALC_SRCB_SIZE(T); + constexpr int SRCB_S_8 = CALC_SRCB_SIZE(uint8_t); +#undef CALC_SRCB_SIZE + constexpr int BLOCK_SHMEM_KB = SHMEM_USE_KB * THREAD_PER_GRID / 1024; + constexpr int DSTB_S_T = WaveletMatrixMultiCu4G_get_dstbuf_kb(SRCB_S_T); + constexpr int DSTB_S_8 = WaveletMatrixMultiCu4G_get_dstbuf_kb(SRCB_S_8); + static_assert(SHMEM_USE_KB >= 64 || THREAD_PER_GRID == 1024, "if SHMEM_USE_KB < 64, THREAD_PER_GRID must 1024"); + static_assert(SRCB_S_T + DSTB_S_T <= BLOCK_SHMEM_KB * 1024 && ((DSTB_S_T == 0 && SRCB_S_T + MIN_DSTBUF_KB * 1024> BLOCK_SHMEM_KB * 1024) || (DSTB_S_T >= MIN_DSTBUF_KB * 1024 && SRCB_S_T + DSTB_S_T * 2 > BLOCK_SHMEM_KB * 1024)), ""); + static_assert(SRCB_S_8 + DSTB_S_8 <= BLOCK_SHMEM_KB * 1024 && ((DSTB_S_8 == 0 && SRCB_S_8 + MIN_DSTBUF_KB * 1024> BLOCK_SHMEM_KB * 1024) || (DSTB_S_8 >= MIN_DSTBUF_KB * 1024 && SRCB_S_8 + DSTB_S_8 * 2 > BLOCK_SHMEM_KB * 1024)), ""); + + { using SrcT = T; using DstT = T; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_T + DSTB_S_T); + } { using SrcT = T; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_T + DSTB_S_T); + } { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_8 + DSTB_S_8); + } + + T* now_cu = src_cu; + T* nxt_cu = src_cu + size; + IdxType* nsum_zeros_buf = (IdxType*)(nxt_cu + size); + IdxType* nsum_zeros_buf2 = nsum_zeros_buf + nsum_scan_buf_len; + + const uint32_t nsum_pos = get_nsum_pos(); + + int h = bit_len - 1; + if (run_first) { + WaveletMatrixMultiCu4G_first_gpu <<>> (T(mask / 2), block_pair_num, size_div_warp, src_cu, nsum_zeros_buf, buf_byte_div32); + } + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf, nsum_zeros_buf2, get_bv_block_cu(h) + nsum_pos, buf_byte_div32, bv_block_byte_div32); + + for (; h > 8; --h) { + using SrcT = T; + using DstT = T; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, 
nsum_zeros_buf2);
+            swap(now_cu, nxt_cu);
+        }
+        if (h == 8 || (is_same::value && bit_len <= 8 && h == bit_len - 1)) {
+            using SrcT = T;
+            using DstT = uint8_t;
+            mask >>= 1;
+            BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h);
+            WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32);
+            if (h == 0) return;
+            BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos;
+            WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32);
+            swap(nsum_zeros_buf, nsum_zeros_buf2);
+            swap(now_cu, nxt_cu);
+            --h;
+        }
+
+        for (; h >= 0; --h) {
+            using SrcT = uint8_t;
+            using DstT = uint8_t;
+            mask >>= 1;
+            BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h);
+            WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32);
+            if (h == 0) break;
+            BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos;
+            WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32);
+            swap(nsum_zeros_buf, nsum_zeros_buf2);
+            swap(now_cu, nxt_cu);
+        }
+    }
+
+    IdxType get_nsum_pos() const {
+        const IdxType size_div_w = size / WORD_SIZE;
+        return size_div_w;
+    }
+    IdxType get_bv_block_h_byte_div32() const {
+        return (bv_block_len * (sizeof(WordT) + sizeof(IdxType))) / 32u;
+    }
+};
+
+} // end namespace wavelet_matrix_median
+}}} // end namespace cv::cuda::device
+
+#endif
+#endif // __OPENCV_WAVELET_MATRIX_MULTI_CUH__
diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp
index daab3acde10..2ae789c856d 100644
--- a/modules/cudafilters/src/filtering.cpp
+++ b/modules/cudafilters/src/filtering.cpp
@@ -72,6 +72,8 @@ Ptr cv::cuda::createColumnSumFilter(int, int, int, int, int, Scalar) { t
 Ptr cv::cuda::createMedianFilter(int srcType, int _windowSize, int _partitions){ throw_no_cuda(); return Ptr();}
 #else
+#include
+#include
 
 namespace
 {
@@ -1047,12 +1049,20 @@ Ptr cv::cuda::createColumnSumFilter(int srcType, int dstType, int ksize,
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Median Filter
+// The CUB library is used for the Median Filter with Wavelet Matrix;
+// CUB has shipped as a standard part of the CUDA Toolkit since CUDA 11.
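+// The include below pulls in the __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+// feature gate; when it is defined, medianFiltering_wavelet_matrix_gpu handles
+// CV_8U/CV_16U/CV_32F inputs, and the histogram-based 8-bit implementation
+// remains the fallback when the macro is absent.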
+#include "cuda/wavelet_matrix_feature_support_checks.h" namespace cv { namespace cuda { namespace device { void medianFiltering_gpu(const PtrStepSzb src, PtrStepSzb dst, PtrStepSzi devHist, PtrStepSzi devCoarseHist,int kernel, int partitions, cudaStream_t stream); + +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); +#endif }}} namespace @@ -1074,7 +1084,15 @@ namespace MedianFilter::MedianFilter(int srcType, int _windowSize, int _partitions) : windowSize(_windowSize),partitions(_partitions) { - CV_Assert( srcType == CV_8UC1 ); +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 + || srcType == CV_16UC1 || srcType == CV_16UC3 || srcType == CV_16UC4 + || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4); +#else + if (srcType != CV_8UC1) { + CV_Error(Error::StsNotImplemented, "If CUDA version is below 10, only implementations that support CV_8UC1 are available"); + } +#endif CV_Assert(windowSize>=3); CV_Assert(_partitions>=1); @@ -1094,6 +1112,18 @@ namespace // Kernel needs to be half window size int kernel=windowSize/2; +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + const int depth = src.depth(); + if (depth == CV_8U) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else if (depth == CV_16U) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else if (depth == CV_32F) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else { + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F); + } +#else CV_Assert(kernel < src.rows); CV_Assert(kernel < src.cols); @@ -1107,6 +1137,7 @@ namespace devCoarseHist.setTo(0, _stream); medianFiltering_gpu(src,dst,devHist, devCoarseHist,kernel,partitions,StreamAccessor::getStream(_stream)); +# endif } } diff --git a/modules/cudafilters/test/test_filters.cpp b/modules/cudafilters/test/test_filters.cpp index bb235dad093..432b5d2a5ac 100644 --- a/modules/cudafilters/test/test_filters.cpp +++ b/modules/cudafilters/test/test_filters.cpp @@ -43,6 +43,7 @@ #include "test_precomp.hpp" #ifdef HAVE_CUDA +#include "../src/cuda/wavelet_matrix_feature_support_checks.h" namespace opencv_test { namespace { @@ -647,7 +648,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, MorphEx, testing::Combine( // Median -PARAM_TEST_CASE(Median, cv::cuda::DeviceInfo, cv::Size, MatDepth, KernelSize, UseRoi) +PARAM_TEST_CASE(Median, cv::cuda::DeviceInfo, cv::Size, MatType, KernelSize, UseRoi) { cv::cuda::DeviceInfo devInfo; cv::Size size; @@ -681,7 +682,7 @@ CUDA_TEST_P(Median, Accuracy) cv::Mat dst_gold; cv::medianBlur(src,dst_gold,kernel); - cv::Rect rect(kernel+1,0,src.cols-(2*kernel+1),src.rows); + cv::Rect rect(kernel/2, kernel/2, src.cols-(kernel-1), src.rows-(kernel-1)); cv::Mat dst_gold_no_border = dst_gold(rect); cv::cuda::GpuMat dst_no_border = cv::cuda::GpuMat(dst, rect); @@ -703,6 +704,17 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, Median, testing::Combine( WHOLE_SUBMAT) ); -}} // namespace +INSTANTIATE_TEST_CASE_P(CUDA_Filters_Median_HDR, Median, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values( + MatType(CV_8UC3), MatType(CV_8UC4), + MatType(CV_16U), MatType(CV_16UC3), MatType(CV_16UC4), + 
MatType(CV_32F), MatType(CV_32FC3), MatType(CV_32FC4)), + testing::Values(KernelSize(3), KernelSize(5)), + WHOLE_SUBMAT) + ); + +}} // namespace #endif // HAVE_CUDA From 8c16a489a1fa9e2ccf21ce8eeb7fbb1affe548b1 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Thu, 23 May 2024 22:14:29 +0200 Subject: [PATCH 09/12] Get code to compile with CUDA 12.4 This fixes https://github.com/opencv/opencv_contrib/issues/3741 --- modules/cudaimgproc/src/histogram.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index 177bf75b1ac..51a5ce1a83e 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -281,8 +281,13 @@ cv::Ptr cv::cuda::createCLAHE(double clipLimit, cv::Size tileGr namespace { +#if (CUDA_VERSION >= 12040) + typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, size_t* hpBufferSize); + typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], size_t* hpBufferSize); +#else typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize); typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize); +#endif template struct NppHistogramEvenFuncC1 { @@ -315,7 +320,11 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, levels, &buf_size); BufferPool pool(stream); @@ -349,7 +358,11 @@ namespace Npp32s* pHist[] = {hist[0].ptr(), hist[1].ptr(), hist[2].ptr(), hist[3].ptr()}; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, levels, &buf_size); BufferPool pool(stream); @@ -419,7 +432,11 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, levels.cols, &buf_size); BufferPool pool(stream); @@ -460,7 +477,11 @@ namespace sz.width = src.cols; sz.height = src.rows; +#if (CUDA_VERSION >= 12040) + size_t buf_size; +#else int buf_size; +#endif get_buf_size(sz, nLevels, &buf_size); BufferPool pool(stream); From d131e7a991e05a02e1724b29b5ed82c346b75f59 Mon Sep 17 00:00:00 2001 From: Vincent Rabaud Date: Thu, 30 May 2024 12:41:21 +0200 Subject: [PATCH 10/12] Merge pull request #3743 from vrabaud:thinning Misc thinning fixes. #3743 - edges could be modified - 2 sets of iterations were always done even if the first set did not return any change - countNonZero is slow compared to hasNonZero ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. 
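The core of the change is the convergence loop in thinning(). A condensed view of the new loop, restated from the diff below (identifiers as in thinning.cpp; hasNonZero can stop at the first non-zero element, while countNonZero always scans the whole diff image):

    Mat prev = processed.clone();   // seeding with the input lets an idle first pass exit immediately
    Mat diff;
    do {
        thinningIteration(processed, 0, thinningType);
        thinningIteration(processed, 1, thinningType);
        absdiff(processed, prev, diff);
        if (!hasNonZero(diff)) break;   // converged: skip the now-redundant copy
        processed.copyTo(prev);
    } while (true);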
--- modules/ximgproc/src/thinning.cpp | 11 +++++++++-- modules/ximgproc/test/test_thinning.cpp | 17 +++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/modules/ximgproc/src/thinning.cpp b/modules/ximgproc/src/thinning.cpp index 00017fe0acb..76e8ad15425 100644 --- a/modules/ximgproc/src/thinning.cpp +++ b/modules/ximgproc/src/thinning.cpp @@ -96,6 +96,10 @@ static void thinningIteration(Mat img, int iter, int thinningType){ Mat marker = Mat::zeros(img.size(), CV_8UC1); int rows = img.rows; int cols = img.cols; + marker.col(0).setTo(1); + marker.col(cols - 1).setTo(1); + marker.row(0).setTo(1); + marker.row(rows - 1).setTo(1); if(thinningType == THINNING_ZHANGSUEN){ marker.forEach([=](uchar& value, const int postion[]) { @@ -133,6 +137,7 @@ static void thinningIteration(Mat img, int iter, int thinningType){ //int m1 = iter == 0 ? (p2 * p4 * p6) : (p2 * p4 * p8); //int m2 = iter == 0 ? (p4 * p6 * p8) : (p2 * p6 * p8); //if (A == 1 && (B >= 2 && B <= 6) && m1 == 0 && m2 == 0) value = 0; + // else value = 1; }); } if(thinningType == THINNING_GUOHALL){ @@ -170,6 +175,7 @@ static void thinningIteration(Mat img, int iter, int thinningType){ //int N = N1 < N2 ? N1 : N2; //int m = iter == 0 ? ((p6 | p7 | (!p9)) & p8) : ((p2 | p3 | (!p5)) & p4); //if ((C == 1) && ((N >= 2) && ((N <= 3)) & (m == 0))) value = 0; + // else value = 1; }); } @@ -183,16 +189,17 @@ void thinning(InputArray input, OutputArray output, int thinningType){ // Enforce the range of the input image to be in between 0 - 255 processed /= 255; - Mat prev = Mat::zeros(processed.size(), CV_8UC1); + Mat prev = processed.clone(); Mat diff; do { thinningIteration(processed, 0, thinningType); thinningIteration(processed, 1, thinningType); absdiff(processed, prev, diff); + if (!hasNonZero(diff)) break; processed.copyTo(prev); } - while (countNonZero(diff) > 0); + while (true); processed *= 255; diff --git a/modules/ximgproc/test/test_thinning.cpp b/modules/ximgproc/test/test_thinning.cpp index 733fe85d473..7d5c5ac480f 100644 --- a/modules/ximgproc/test/test_thinning.cpp +++ b/modules/ximgproc/test/test_thinning.cpp @@ -6,9 +6,12 @@ namespace opencv_test { namespace { -static int createTestImage(Mat& src) +static int createTestImage(Mat1b& src) { - src = Mat::zeros(Size(256, 256), CV_8UC1); + src = Mat1b::zeros(Size(256, 256)); + // Create a corner point that should not be affected. 
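+    // (An isolated foreground pixel must survive thinning; the tests below pin
+    // this behaviour with EXPECT_EQ(dst(0, 0), 255).)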
+    src(0, 0) = 255;
+
     for (int x = 50; x < src.cols - 50; x += 50)
     {
         cv::circle(src, Point(x, x/2), 30 + x/2, Scalar(255), 5);
@@ -20,13 +23,14 @@ static int createTestImage(Mat& src)
 
 TEST(ximgproc_Thinning, simple_ZHANGSUEN)
 {
-    Mat src;
+    Mat1b src;
     int src_pixels = createTestImage(src);
 
-    Mat dst;
+    Mat1b dst;
     thinning(src, dst, THINNING_ZHANGSUEN);
     int dst_pixels = countNonZero(dst);
     EXPECT_LE(dst_pixels, src_pixels);
+    EXPECT_EQ(dst(0, 0), 255);
 #if 0
     imshow("src", src); imshow("dst", dst); waitKey();
@@ -35,13 +39,14 @@ TEST(ximgproc_Thinning, simple_ZHANGSUEN)
 
 TEST(ximgproc_Thinning, simple_GUOHALL)
 {
-    Mat src;
+    Mat1b src;
     int src_pixels = createTestImage(src);
 
-    Mat dst;
+    Mat1b dst;
    thinning(src, dst, THINNING_GUOHALL);
     int dst_pixels = countNonZero(dst);
     EXPECT_LE(dst_pixels, src_pixels);
+    EXPECT_EQ(dst(0, 0), 255);
 #if 0
     imshow("src", src); imshow("dst", dst); waitKey();

From 1ed3dd2c53888e3289afdb22ec4e9ebbff3dba87 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <2536374+asmorkalov@users.noreply.github.com>
Date: Thu, 30 May 2024 15:48:48 +0300
Subject: [PATCH 11/12] Merge pull request #3744 from asmorkalov:as/variadic_tuple

Added CUDA 12.4+ support #3744

Tries to fix https://github.com/opencv/opencv_contrib/issues/3690 for CUDA 12.4+

Related patch to main repo: https://github.com/opencv/opencv/pull/25658

Changes:
- Added branches to support the new variadic implementation of thrust::tuple
- Added a branch that uses std::array instead of std::tuple in the split-merge and grid operations. The new branch gets rid of the namespace clash between cv::cuda in OpenCV and ::cuda in the CUDA standard library (injected by Thrust). The old tuple branches are kept for compatibility with existing code and with CUDA versions before 12.4.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake --- modules/cudaarithm/src/cuda/polar_cart.cu | 18 +-- modules/cudaarithm/src/cuda/split_merge.cu | 9 +- .../opencv2/cudev/block/detail/reduce.hpp | 55 ++++++-- .../cudev/block/detail/reduce_key_val.hpp | 62 +++++++++ .../include/opencv2/cudev/block/reduce.hpp | 35 +++++ .../opencv2/cudev/grid/detail/split_merge.hpp | 25 +++- .../opencv2/cudev/grid/detail/transform.hpp | 120 ++++++++++++++++ .../opencv2/cudev/grid/split_merge.hpp | 130 +++++++++++++++--- .../include/opencv2/cudev/grid/transform.hpp | 91 ++++++++++++ .../include/opencv2/cudev/ptr2d/glob.hpp | 12 ++ .../cudev/include/opencv2/cudev/ptr2d/zip.hpp | 22 +++ modules/cudev/test/test_split_merge.cu | 7 +- 12 files changed, 537 insertions(+), 49 deletions(-) diff --git a/modules/cudaarithm/src/cuda/polar_cart.cu b/modules/cudaarithm/src/cuda/polar_cart.cu index 2fb1315e619..12980e424ff 100644 --- a/modules/cudaarithm/src/cuda/polar_cart.cu +++ b/modules/cudaarithm/src/cuda/polar_cart.cu @@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu GpuMat_ anglec(angle.reshape(1)); if (angleInDegrees) - { - gridTransformTuple(zipPtr(xc, yc), - tie(magc, anglec), - make_tuple( - binaryTupleAdapter<0, 1>(magnitude_func()), - binaryTupleAdapter<0, 1>(direction_func())), - stream); - } + gridTransformBinary(xc, yc, magc, anglec, magnitude_func(), direction_func(), stream); else - { - gridTransformTuple(zipPtr(xc, yc), - tie(magc, anglec), - make_tuple( - binaryTupleAdapter<0, 1>(magnitude_func()), - binaryTupleAdapter<0, 1>(direction_func())), - stream); - } + gridTransformBinary(xc, yc, magc, anglec, magnitude_func(), direction_func(), stream); syncOutput(mag, _mag, stream); syncOutput(angle, _angle, stream); diff --git a/modules/cudaarithm/src/cuda/split_merge.cu b/modules/cudaarithm/src/cuda/split_merge.cu index 5b3af10775d..f0acb840a9e 100644 --- a/modules/cudaarithm/src/cuda/split_merge.cu +++ b/modules/cudaarithm/src/cuda/split_merge.cu @@ -67,7 +67,8 @@ namespace { static void call(const GpuMat* src, GpuMat& dst, Stream& stream) { - gridMerge(zipPtr(globPtr(src[0]), globPtr(src[1])), + const std::array, 2> d_src = {globPtr(src[0]), globPtr(src[1])}; + gridMerge(d_src, globPtr::type>(dst), stream); } @@ -77,7 +78,8 @@ namespace { static void call(const GpuMat* src, GpuMat& dst, Stream& stream) { - gridMerge(zipPtr(globPtr(src[0]), globPtr(src[1]), globPtr(src[2])), + const std::array, 3> d_src = {globPtr(src[0]), globPtr(src[1]), globPtr(src[2])}; + gridMerge(d_src, globPtr::type>(dst), stream); } @@ -87,7 +89,8 @@ namespace { static void call(const GpuMat* src, GpuMat& dst, Stream& stream) { - gridMerge(zipPtr(globPtr(src[0]), globPtr(src[1]), globPtr(src[2]), globPtr(src[3])), + const std::array, 4 > d_src = {globPtr(src[0]), globPtr(src[1]), globPtr(src[2]), globPtr(src[3])}; + gridMerge(d_src, globPtr::type>(dst), stream); } diff --git a/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp index 151e949a617..5bd1737aa3c 100644 --- a/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp @@ -154,6 +154,17 @@ namespace block_reduce_detail val = smem[tid]; } + + // merge + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + +#if 
(CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const tuple& smem, @@ -172,15 +183,6 @@ namespace block_reduce_detail For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); } - // merge - - template - __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) - { - T reg = smem[tid + delta]; - smem[tid] = val = op(val, reg); - } - template @@ -214,6 +216,41 @@ namespace block_reduce_detail } #endif +#else + template + __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& val, uint tid) + { + For<0, tuple_size >::value>::loadToSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& val, uint tid) + { + For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(const tuple& smem, const tuple& val, uint tid, uint delta, const tuple& op) + { + For<0, tuple_size >::value>::merge(smem, val, tid, delta, op); + } + + // mergeShfl + + template + __device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + + template + __device__ __forceinline__ void mergeShfl(const tuple& val, uint delta, uint width, const tuple& op) + { + For<0, tuple_size >::value>::mergeShfl(val, delta, width, op); + } +#endif + // Generic template struct Generic diff --git a/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp b/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp index 4af834a446e..43876decc92 100644 --- a/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp +++ b/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp @@ -160,6 +160,7 @@ namespace block_reduce_key_val_detail data = smem[tid]; } +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const tuple& smem, @@ -241,6 +242,67 @@ namespace block_reduce_key_val_detail { For<0, tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); } +#else + template + __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& data, uint tid) + { + For<0, tuple_size >::value>::loadToSmem(smem, data, tid); + } + + template + __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& data, uint tid) + { + For<0, tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + // copyVals + + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta) + { + svals[tid] = val = svals[tid + delta]; + } + + template + __device__ __forceinline__ void copyVals(const tuple& svals, const tuple& val, uint tid, uint delta) + { + For<0, tuple_size >::value>::copy(svals, val, tid, delta); + } + + // merge + + template + __device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + + template + __device__ void merge(volatile K* skeys, K& key, const tuple& svals, const tuple& val, const Cmp& cmp, uint tid, uint delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + + template + __device__ __forceinline__ void merge(const tuple& skeys, const tuple& key, + const tuple& svals, const tuple& val, + const tuple& cmp, uint tid, uint delta) + { + For<0, 
tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } +#endif // Generic diff --git a/modules/cudev/include/opencv2/cudev/block/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/reduce.hpp index 06f59a16ae9..9dde278da84 100644 --- a/modules/cudev/include/opencv2/cudev/block/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/block/reduce.hpp @@ -51,6 +51,7 @@ #include "../warp/reduce.hpp" #include "detail/reduce.hpp" #include "detail/reduce_key_val.hpp" +#include namespace cv { namespace cudev { @@ -65,6 +66,7 @@ __device__ __forceinline__ void blockReduce(volatile T* smem, T& val, uint tid, block_reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); } +#if (CUDART_VERSION < 12040) template (skeys, key, svals, val, tid, cmp); } +#else + +template +__device__ __forceinline__ void blockReduce(const tuple& smem, + const tuple& val, + uint tid, + const tuple& op) +{ + block_reduce_detail::Dispatcher::reductor::template reduce&, const tuple&, const tuple&>(smem, val, tid, op); +} + +// blockReduceKeyVal + +template +__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, uint tid, const Cmp& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); +} + +template +__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, const tuple& svals, const tuple& val, uint tid, const Cmp& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce&, const tuple&, const Cmp&>(skeys, key, svals, val, tid, cmp); +} + +template +__device__ __forceinline__ void blockReduceKeyVal(const tuple& skeys, const tuple& key, const tuple& svals, const tuple& val, uint tid, const tuple& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce< const tuple&, const tuple&, const tuple&, const tuple&, const tuple&>(skeys, key, svals, val, tid, cmp); +} + +#endif + //! 
@}

}}
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp
index 3f512060165..df8bed3a948 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp
@@ -157,28 +157,47 @@ namespace grid_split_merge_detail
     template struct MergeImpl<2, Policy>
     {
         template
-        __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
         {
             mergeC2(get<0>(src), get<1>(src), dst, mask, rows, cols, stream);
         }
+
+        template
+        __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            mergeC2(src[0], src[1], dst, mask, rows, cols, stream);
+        }
+
     };
 
     template struct MergeImpl<3, Policy>
     {
         template
-        __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
         {
             mergeC3(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream);
         }
+
+        template
+        __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            mergeC3(src[0], src[1], src[2], dst, mask, rows, cols, stream);
+        }
     };
 
     template struct MergeImpl<4, Policy>
     {
         template
-        __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
         {
             mergeC4(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream);
         }
+
+        template
+        __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            mergeC4(src[0], src[1], src[2], src[3], dst, mask, rows, cols, stream);
+        }
     };
 
     // split
diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp
index 557797d7c85..4e901ac751d 100644
--- a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp
@@ -179,6 +179,23 @@ namespace grid_transform_detail
         dst(y, x) = saturate_cast(op(src1(y, x), src2(y, x)));
     }
 
+    // transformSimple, 2 outputs
+    // These overloads are added for polar_cart.cu to compute magnitude and phase with a single call;
+    // the previous tuple-based implementation caused a CUDA namespace clash. See https://github.com/opencv/opencv_contrib/issues/3690
+    template
+    __global__ void transformSimple(const SrcPtr1 src1, const SrcPtr2 src2, GlobPtr dst1, GlobPtr dst2,
+                                    const BinOp1 op1, const BinOp2 op2, const MaskPtr mask, const int rows, const int cols)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= cols || y >= rows || !mask(y, x))
+            return;
+
+        dst1(y, x) = saturate_cast(op1(src1(y, x), src2(y, x)));
+        dst2(y, x) = saturate_cast(op2(src1(y, x), src2(y, x)));
+    }
+
     // transformSmart
 
     template
@@ -248,6 +265,52 @@ namespace grid_transform_detail
         }
     }
 
+    // transformSmart, 2 outputs
+    // These overloads are added for polar_cart.cu to compute magnitude and phase with a single call;
+    // the previous tuple-based implementation caused a CUDA namespace clash. See https://github.com/opencv/opencv_contrib/issues/3690
+    template
+    __global__ void transformSmart(const GlobPtr src1_, const GlobPtr src2_,
+                                   GlobPtr dst1_, GlobPtr dst2_,
+                                   const BinOp1 op1, const BinOp2 op2, const MaskPtr mask, const int rows, const int cols)
+    {
+        typedef typename MakeVec::type read_type1;
+        typedef typename MakeVec::type read_type2;
+        typedef typename MakeVec::type write_type1;
+        typedef typename MakeVec::type write_type2;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x_shifted = x * SHIFT;
+
+        if (y < rows)
+        {
+            const SrcType1* src1 = src1_.row(y);
+            const SrcType2* src2 = src2_.row(y);
+            DstType1* dst1 = dst1_.row(y);
+            DstType2* dst2 = dst2_.row(y);
+
+            if (x_shifted + SHIFT - 1 < cols)
+            {
+                const read_type1 src1_n_el = ((const read_type1*)src1)[x];
+                const read_type2 src2_n_el = ((const read_type2*)src2)[x];
+
+                OpUnroller::unroll(src1_n_el, src2_n_el, ((write_type1*)dst1)[x], op1, mask, x_shifted, y);
+                OpUnroller::unroll(src1_n_el, src2_n_el, ((write_type2*)dst2)[x], op2, mask, x_shifted, y);
+            }
+            else
+            {
+                for (int real_x = x_shifted; real_x < cols; ++real_x)
+                {
+                    if (mask(y, real_x))
+                    {
+                        dst1[real_x] = op1(src1[real_x], src2[real_x]);
+                        dst2[real_x] = op2(src1[real_x], src2[real_x]);
+                    }
+                }
+            }
+        }
+    }
+
     // TransformDispatcher
     template struct TransformDispatcher;
 
@@ -279,6 +342,20 @@ namespace grid_transform_detail
             if (stream == 0)
                 CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
         }
+
+        template
+        __host__ static void call(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr& dst1, const GlobPtr& dst2,
+                                  const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            const dim3 block(Policy::block_size_x, Policy::block_size_y);
+            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
+
+            transformSimple<<>>(src1, src2, dst1, dst2, op1, op2, mask, rows, cols);
+            CV_CUDEV_SAFE_CALL( cudaGetLastError() );
+
+            if (stream == 0)
+                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+        }
     };
 
     template struct TransformDispatcher
@@ -336,6 +413,33 @@ namespace grid_transform_detail
             if (stream == 0)
                 CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
         }
+
+        template
+        __host__ static void call(const GlobPtr& src1, const GlobPtr& src2,
+                                  const GlobPtr& dst1, const GlobPtr& dst2,
+                                  const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+        {
+            if (Policy::shift == 1 ||
+                !isAligned(src1.data, Policy::shift * sizeof(SrcType1)) || !isAligned(src1.step, Policy::shift * sizeof(SrcType1)) ||
+                !isAligned(src2.data, Policy::shift * sizeof(SrcType2)) || !isAligned(src2.step, Policy::shift *
sizeof(SrcType2)) || + !isAligned(dst1.data, Policy::shift * sizeof(DstType1)) || !isAligned(dst1.step, Policy::shift * sizeof(DstType1))|| + !isAligned(dst2.data, Policy::shift * sizeof(DstType2)) || !isAligned(dst2.step, Policy::shift * sizeof(DstType2)) + ) + { + TransformDispatcher::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + return; + } + + const dim3 block(Policy::block_size_x, Policy::block_size_y); + const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y)); + + transformSmart<<>>(src1, src2, dst1, dst2, op1, op2, mask, rows, cols); + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } + }; template @@ -350,6 +454,13 @@ namespace grid_transform_detail TransformDispatcher::call(src1, src2, dst, op, mask, rows, cols, stream); } + template + __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + TransformDispatcher::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + } + template __host__ void transform_unary(const GlobPtr& src, const GlobPtr& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { @@ -362,6 +473,15 @@ namespace grid_transform_detail TransformDispatcher::cn == 1 && VecTraits::cn == 1 && VecTraits::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream); } + template + __host__ void transform_binary(const GlobPtr& src1, const GlobPtr& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + TransformDispatcher::cn == 1 && VecTraits::cn == 1 && + VecTraits::cn == 1 && VecTraits::cn == 1 && + Policy::shift != 1, Policy>::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + } + // transform_tuple template struct Unroll diff --git a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp index 5c92a813ed8..115d8c55e46 100644 --- a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp @@ -72,11 +72,11 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_& dst, const Ma dst.create(rows, cols); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - shrinkPtr(mask), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); } template @@ -90,7 +90,7 @@ __host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz& dst, CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), shrinkPtr(dst), shrinkPtr(mask), rows, cols, @@ -107,11 +107,11 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_& dst, Stream& dst.create(rows, cols); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - WithOutMask(), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + 
shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); } template @@ -124,13 +124,87 @@ __host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz& dst, CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - WithOutMask(), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, GpuMat_& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = getRows(src); + const int cols = getCols(src); + + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + dst.create(rows, cols); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, const GlobPtrSz& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); } +template +__host__ void gridMergeArray_(const std::array& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + dst.create(rows, cols); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + + +/////////////////////////////////////////////////////////////// + template __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -522,6 +596,30 @@ __host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz& dst, S gridMerge_(src, dst, stream); } +template +__host__ void gridMergeArray(const std::array& src, GpuMat_& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, mask, stream); +} + +template +__host__ void gridMerge(const std::array& src, const GlobPtrSz& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, mask, stream); +} + +template +__host__ void gridMerge(const std::array& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, stream); +} + +template +__host__ void gridMerge(const std::array& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, stream); +} + template __host__ void gridSplit(const SrcPtr& 
src, const tuple< GpuMat_&, GpuMat_& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) { diff --git a/modules/cudev/include/opencv2/cudev/grid/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/transform.hpp index 4f7d191e64b..f89cdf5d484 100644 --- a/modules/cudev/include/opencv2/cudev/grid/transform.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/transform.hpp @@ -121,6 +121,22 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, Gpu grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + dst1.create(rows, cols); + dst2.create(rows, cols); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -134,6 +150,22 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, con grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(dst1) == rows && getCols(dst1) == cols ); + CV_Assert( getRows(dst2) == rows && getCols(dst2) == cols ); + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); +} + + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst, const BinOp& op, Stream& stream = Stream::Null()) { @@ -147,6 +179,21 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, Gpu grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const BinOp1& op1, const BinOp2& op2, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + + dst1.create(rows, cols); + dst2.create(rows, cols); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformBinary_(const SrcPtr1& src1, 
const SrcPtr2& src2, const GlobPtrSz& dst, const BinOp& op, Stream& stream = Stream::Null()) { @@ -159,6 +206,20 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, con grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const BinOp1& op1, const BinOp2& op2, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(dst1) == rows && getCols(dst1) == cols ); + CV_Assert( getRows(dst2) == rows && getCols(dst2) == cols ); + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -449,24 +510,54 @@ __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuM gridTransformBinary_(src1, src2, dst, op, mask, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const Op1& op1, const Op2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, mask, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, mask, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const Op1& op1, const Op2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, mask, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst, const Op& op, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, + GpuMat_& dst1, GpuMat_& dst2, + const Op1& op1, const Op2& op2, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const Op& op, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, + const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const Op1& op1, const Op2& op2, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, stream); +} + template __host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp index 2024a7e01a2..98c115fa1bf 100644 --- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp +++ 
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
index 2024a7e01a2..98c115fa1bf 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
@@ -118,6 +118,18 @@ __host__ GlobPtrSz<T> globPtr(const GpuMat& mat)
     return p;
 }
 
+template <typename T>
+__host__ GlobPtrSz<T> globPtr(const GpuMat_<T>& mat)
+{
+    GlobPtrSz<T> p;
+    p.data = (T*) mat.data;
+    p.step = mat.step;
+    p.rows = mat.rows;
+    p.cols = mat.cols;
+    return p;
+}
+
+
 template <typename T> struct PtrTraits< GlobPtrSz<T> > : PtrTraitsBase<GlobPtrSz<T>, GlobPtr<T> >
 {
 };
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
index e68f4cf61f5..f5a3f8c85d4 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
@@ -49,6 +49,7 @@
 #include "../common.hpp"
 #include "../util/tuple.hpp"
 #include "traits.hpp"
+#include <cuda/std/tuple>
 
 namespace cv { namespace cudev {
 
@@ -175,4 +176,25 @@ template <class PtrTuple> struct PtrTraits< ZipPtrSz<PtrTuple> > : PtrTraitsBase
 
 }}
 
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+template< class... Types >
+struct tuple_size< cv::cudev::ZipPtr< tuple< Types... > > >
+: tuple_size< tuple< Types... > > { };
+
+template< class... Types >
+struct tuple_size< cv::cudev::ZipPtrSz< tuple< Types... > > >
+: tuple_size< tuple< Types... > > { };
+
+
+template< size_t N, class... Types >
+struct tuple_element< N, cv::cudev::ZipPtr< tuple< Types... > > >
+: tuple_element< N, tuple< Types... > > { };
+
+template< size_t N, class... Types >
+struct tuple_element< N, cv::cudev::ZipPtrSz< tuple< Types... > > >
+: tuple_element< N, tuple< Types... > > { };
+
+_LIBCUDACXX_END_NAMESPACE_STD
+
 #endif
diff --git a/modules/cudev/test/test_split_merge.cu b/modules/cudev/test/test_split_merge.cu
index b25c8b96d6f..598b6b80ac2 100644
--- a/modules/cudev/test/test_split_merge.cu
+++ b/modules/cudev/test/test_split_merge.cu
@@ -70,7 +70,8 @@ public:
         GpuMat_<T> d_src2(src2);
 
         GpuMat_<typename MakeVec<T, 2>::type> dst;
-        gridMerge(zipPtr(d_src1, d_src2), dst);
+        std::array< GlobPtrSz<T>, 2 > d_src = {globPtr(d_src1), globPtr(d_src2)};
+        gridMerge(d_src, dst);
 
         Mat dst_gold;
         Mat srcs[] = {src1, src2};
@@ -93,8 +94,10 @@ public:
         GpuMat_<T> d_src2(src2);
         GpuMat_<T> d_src3(src3);
 
+        std::array< GlobPtrSz<T>, 3 > d_src = {globPtr(d_src1), globPtr(d_src2), globPtr(d_src3)};
+
         GpuMat_<typename MakeVec<T, 3>::type> dst;
-        gridMerge(zipPtr(d_src1, d_src2, d_src3), dst);
+        gridMerge(d_src, dst);
 
         Mat dst_gold;
         Mat srcs[] = {src1, src2, src3};
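Why the _LIBCUDACXX_BEGIN_NAMESPACE_STD block: from CUDA 12.4 on, thrust::tuple is routed through cuda::std::tuple, so libcu++'s tuple protocol (tuple_size, tuple_element) must also recognize ZipPtr/ZipPtrSz, which inherit from the tuple they wrap. A compile-time sketch of the contract these specializations establish, assuming a CUDA 12.4+ toolchain where cv::cudev::tuple resolves to cuda::std::tuple; the static_asserts are illustrative, not part of the test suite:

    #include <cuda/std/tuple>
    #include <cuda/std/type_traits>
    #include <opencv2/cudev.hpp>

    using namespace cv::cudev;

    // A zipped pair of pointers; the zip.hpp specializations let libcu++'s
    // tuple traits look through ZipPtr at the tuple it wraps.
    using TwoPtrs = tuple< GlobPtr<uchar>, GlobPtr<float> >;

    static_assert(cuda::std::tuple_size< ZipPtr<TwoPtrs> >::value == 2,
                  "ZipPtr advertises the arity of the wrapped tuple");
    static_assert(cuda::std::is_same< cuda::std::tuple_element<1, ZipPtr<TwoPtrs> >::type,
                                      GlobPtr<float> >::value,
                  "tuple_element forwards to the wrapped tuple");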
From b236c71c2f8d983403c35a0cea8bec0432a4b0fe Mon Sep 17 00:00:00 2001
From: Peter Rekdal Khan-Sunde
Date: Thu, 6 Jun 2024 13:26:15 +0200
Subject: [PATCH 12/12] Merge pull request #3751 from peters:4.x

Add conditional include for <cuda/std/tuple> to support CUDA 12.4+ #3751

Fixes https://github.com/opencv/opencv_contrib/issues/3752

Adds a preprocessor check so that <cuda/std/tuple> is included only for CUDA versions 12.4 and above. This keeps the header backward compatible with older CUDA toolkits.

Related to pull request #3744

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
index f5a3f8c85d4..f60ab5d0eb7 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
@@ -49,7 +49,9 @@
 #include "../common.hpp"
 #include "../util/tuple.hpp"
 #include "traits.hpp"
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4))
 #include <cuda/std/tuple>
+#endif
 
 namespace cv { namespace cudev {
 
@@ -176,6 +178,7 @@ template <class PtrTuple> struct PtrTraits< ZipPtrSz<PtrTuple> > : PtrTraitsBase
 
 }}
 
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4))
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 
 template< class... Types >
@@ -198,3 +201,4 @@ struct tuple_element< N, cv::cudev::ZipPtrSz< tuple< Types... > > >
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif
+#endif
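Taken on its own, the guard reads: pull in <cuda/std/tuple> only when compiling with nvcc 12.4 or newer, since __CUDACC_VER_MAJOR__ and __CUDACC_VER_MINOR__ are defined only by nvcc. A standalone sketch of the same pattern; HAS_CUDA_STD_TUPLE is a hypothetical convenience macro, not something this patch defines:

    // Host-only builds and CUDA toolkits older than 12.4 skip the header,
    // because they either lack __CUDACC_VER_MAJOR__ or fail the version test.
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4))
    #  include <cuda/std/tuple>
    #  define HAS_CUDA_STD_TUPLE 1  // hypothetical feature flag
    #else
    #  define HAS_CUDA_STD_TUPLE 0
    #endif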