Commit eba158f

Merge pull request #25230 from hanliutong/rvv-conv
Optimize int8 layers in DNN modules by using RISC-V Vector intrinsics. #25230

This patch optimizes 3 functions in the int8 layers by using RVV Native Intrinsics.

The patch was tested on QEMU with VLEN=128 and VLEN=256 using `./bin/opencv_test_dnn --gtest_filter="*Int8*"`. On a real device (k230, VLEN=128), `EfficientDet_int8` in `opencv_perf_dnn` showed a 1.46x performance improvement.

| Name of Test                               | Original | Optimized | Speed-up |
| ------------------------------------------ | -------- | --------- | -------- |
| EfficientDet_int8::DNNTestNetwork::OCV/CPU | 2843.467 | 1947.013  | 1.46     |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under the Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV.
- [ ] The PR is proposed to the proper branch.
- [ ] There is a reference to the original bug report and related work.
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake.
1 parent 8c540a5 · commit eba158f
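For context on what "RVV Native Intrinsics" means here: the guards added below compile the RVV paths only when the toolchain implements v0.11+ of the RISC-V Vector intrinsics spec (`__riscv_v_intrinsic >= 11000`; the macro encodes major·1,000,000 + minor·1,000), which is where the `__riscv_`-prefixed names live. As a minimal sketch of that intrinsic style — an illustration with a hypothetical `dot_i8_rvv`, not code from this patch — a strip-mined int8 dot product looks like this:

```cpp
// Illustrative sketch only, not from this patch. Assumes a compiler with
// RVV intrinsics v0.11+ (__riscv_v_intrinsic >= 11000).
#include <riscv_vector.h>
#include <stdint.h>
#include <stddef.h>

// Hypothetical helper: the widening int8 dot product at the heart of
// int8 convolution / GEMM kernels.
int32_t dot_i8_rvv(const int8_t* a, const int8_t* b, size_t n)
{
    vint32m1_t acc = __riscv_vmv_v_x_i32m1(0, 1);  // int32 scalar accumulator
    for (size_t i = 0; i < n; )
    {
        size_t vl = __riscv_vsetvl_e8m1(n - i);    // strip-mining: lanes this pass
        vint8m1_t va = __riscv_vle8_v_i8m1(a + i, vl);
        vint8m1_t vb = __riscv_vle8_v_i8m1(b + i, vl);
        // Widening multiply: int8 x int8 -> int16, so products cannot overflow.
        vint16m2_t prod = __riscv_vwmul_vv_i16m2(va, vb, vl);
        // Widening reduction: sum the int16 lanes into the int32 accumulator.
        acc = __riscv_vwredsum_vs_i16m2_i32m1(prod, acc, vl);
        i += vl;
    }
    return __riscv_vmv_x_s_i32m1_i32(acc);
}
```

Because `vsetvl` picks the lane count at run time, the same binary adapts to VLEN=128 and VLEN=256 alike, which is why the QEMU test matrix above covers both.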

File tree

4 files changed: +460 -3 lines changed


modules/dnn/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -5,7 +5,7 @@ endif()
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 
 ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
```
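For context: `ocv_add_dispatched_file_force_all` compiles the listed source once per ISA and exposes each build under its own namespace (`opt_AVX2`, `opt_LASX`, and now `opt_RVV`), so this one-word change is what makes `opt_RVV::fastConv`, `opt_RVV::fastDepthwiseConv`, and `opt_RVV::fastGEMM1T` available to the call sites below. A minimal sketch of that build-time/run-time split, with hypothetical stand-ins (`hasRVV`, `fastConvSketch`) for OpenCV's real probe and kernels:

```cpp
// Sketch of the per-ISA namespace pattern; hasRVV and fastConvSketch are
// hypothetical stand-ins for checkHardwareSupport(CPU_RVV) and the kernels.
#include <cstdio>

namespace opt_RVV  { void fastConvSketch() { std::puts("RVV kernel");      } }
namespace baseline { void fastConvSketch() { std::puts("baseline kernel"); } }

static bool hasRVV() { return false; }   // runtime CPU-feature probe stand-in

void runConv()
{
    if (hasRVV())                        // decided at run time, per machine
        opt_RVV::fastConvSketch();       // ISA-specific build of the same source
    else
        baseline::fastConvSketch();      // portable fallback build
}
```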

modules/dnn/src/int8layers/convolution_layer.cpp

Lines changed: 16 additions & 1 deletion

```diff
@@ -702,13 +702,14 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
         int blk_size_cn;
         int inpZp, outZp;
         const std::vector<float>* multiplier;
 
         ParallelConv()
             : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
+              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false)
             , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
         {}
 
@@ -765,6 +766,7 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
 
         p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
+        p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;
 
         int kernel_d = isConv3D? kernel_size[0] : 1;
         int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -970,6 +972,13 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                         biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if(useRVV)
+                    opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                        stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                        biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                else
+            #endif
             #if CV_RVP052
                 if(isConv2D)
                     opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
@@ -1356,6 +1365,12 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                         outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if(useRVV)
+                    opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                        outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                else
+            #endif
             #if CV_RVP052
                 if(isConv2D)
                     opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
```
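Whichever branch wins the dispatch above, `fastDepthwiseConv` and `fastConv` receive the same quantization parameters (`inpZp`, `outZp`, per-channel `multptr`), so every output pixel ends in the same requantization step. A scalar sketch of that step, inferred from those parameter names — the exact rounding and bias folding are assumptions, not OpenCV's verbatim code:

```cpp
// Inferred requantization of one int8 output: acc is the raw int32 accumulator,
// bias and mult are the channel's bias and multiplier. Assumption-level sketch;
// the real kernels may fold the bias earlier or round differently.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t requantize(int32_t acc, int32_t bias, float mult, int outZp)
{
    int32_t out = (int32_t)std::lround((acc + bias) * mult) + outZp;
    return (int8_t)std::min(127, std::max(-128, out));  // saturate to int8 range
}
```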

modules/dnn/src/int8layers/fully_connected_layer.cpp

Lines changed: 8 additions & 1 deletion

```diff
@@ -228,7 +228,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
     {
     public:
         FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false) {}
 
         static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                         const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -253,6 +253,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
             p.useAVX2 = checkHardwareSupport(CPU_AVX2);
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
             p.useLASX = checkHardwareSupport(CPU_LASX);
+            p.useRVV = checkHardwareSupport(CPU_RVV);
 
             parallel_for_(Range(0, nstripes), p, nstripes);
         }
@@ -303,6 +304,11 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
                     opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if( useRVV)
+                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
+            #endif
             #if CV_RVP052
                 if( 1 )
                     opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
@@ -363,6 +369,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
     };
 
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
```
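To make the `fastGEMM1T` contract concrete: the call site hands it one int8 input vector (`sptr`), a weight matrix (`wptr` with row stride `wstep`), per-output bias and multiplier arrays, and the output zero point, and expects `nw` requantized int8 outputs. A plausible scalar equivalent, reconstructed from those arguments — the loop body is an illustrative assumption, not the committed RVV code:

```cpp
// Scalar reference for the fastGEMM1T call above: dst = requantize(W * s + bias).
// Parameter names mirror the call site; the body is an illustrative assumption.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static void fastGEMM1T_ref(const int8_t* sptr, const int8_t* wptr, size_t wstep,
                           const int* biasptr, const float* multptr,
                           int8_t* dptr, int nw, int vecsize, int outZp)
{
    for (int i = 0; i < nw; i++)                // one int8 output per weight row
    {
        const int8_t* w = wptr + (size_t)i * wstep;
        int32_t acc = biasptr[i];               // start from this output's bias
        for (int k = 0; k < vecsize; k++)       // int8 x int8 products, int32 sum
            acc += (int32_t)sptr[k] * (int32_t)w[k];
        int32_t out = (int32_t)std::lround(acc * multptr[i]) + outZp;
        dptr[i] = (int8_t)std::min(127, std::max(-128, out));  // saturate
    }
}
```

An RVV kernel like `opt_RVV::fastGEMM1T` would vectorize the inner `k` loop with exactly the widening multiply/reduce pattern sketched near the top of this page.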
