Commit eba158f

Merge pull request #25230 from hanliutong/rvv-conv
Optimize int8 layers in DNN modules by using RISC-V Vector intrinsics. #25230

This patch optimizes 3 functions in the int8 layers by using RVV Native Intrinsics.

The patch was tested on QEMU with VLEN=128 and VLEN=256 using `./bin/opencv_test_dnn --gtest_filter="*Int8*"`. On a real device (k230, VLEN=128), `EfficientDet_int8` in `opencv_perf_dnn` showed a 1.46x performance improvement.

| Name of Test                               | Original | Optimized | Speed-up |
| ------------------------------------------ | -------- | --------- | -------- |
| EfficientDet_int8::DNNTestNetwork::OCV/CPU | 2843.467 | 1947.013  | 1.46     |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under the Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV.
- [ ] The PR is proposed to the proper branch.
- [ ] There is a reference to the original bug report and related work.
- [ ] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake.
1 parent 8c540a5 · commit eba158f
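For context on what "RVV Native Intrinsics" means here: the guards added below compile the RVV paths only when the toolchain implements v0.11+ of the RISC-V Vector intrinsics spec (`__riscv_v_intrinsic >= 11000`; the macro encodes major·1,000,000 + minor·1,000), which is where the `__riscv_`-prefixed names live. As a minimal sketch of that intrinsic style — an illustration with a hypothetical `dot_i8_rvv`, not code from this patch — a strip-mined int8 dot product looks like this:

```cpp
// Illustrative sketch only, not from this patch. Assumes a compiler with
// RVV intrinsics v0.11+ (__riscv_v_intrinsic >= 11000).
#include <riscv_vector.h>
#include <stdint.h>
#include <stddef.h>

// Hypothetical helper: the widening int8 dot product at the heart of
// int8 convolution / GEMM kernels.
int32_t dot_i8_rvv(const int8_t* a, const int8_t* b, size_t n)
{
    vint32m1_t acc = __riscv_vmv_v_x_i32m1(0, 1);  // int32 scalar accumulator
    for (size_t i = 0; i < n; )
    {
        size_t vl = __riscv_vsetvl_e8m1(n - i);    // strip-mining: lanes this pass
        vint8m1_t va = __riscv_vle8_v_i8m1(a + i, vl);
        vint8m1_t vb = __riscv_vle8_v_i8m1(b + i, vl);
        // Widening multiply: int8 x int8 -> int16, so products cannot overflow.
        vint16m2_t prod = __riscv_vwmul_vv_i16m2(va, vb, vl);
        // Widening reduction: sum the int16 lanes into the int32 accumulator.
        acc = __riscv_vwredsum_vs_i16m2_i32m1(prod, acc, vl);
        i += vl;
    }
    return __riscv_vmv_x_s_i32m1_i32(acc);
}
```

Because `vsetvl` picks the lane count at run time, the same binary adapts to VLEN=128 and VLEN=256 alike, which is why the QEMU test matrix above covers both.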

File tree

4 files changed: +460 -3 lines changed


modules/dnn/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

```diff
@@ -5,7 +5,7 @@ endif()
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 
 ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
```
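For context: `ocv_add_dispatched_file_force_all` compiles the listed source once per ISA and exposes each build under its own namespace (`opt_AVX2`, `opt_LASX`, and now `opt_RVV`), so this one-word change is what makes `opt_RVV::fastConv`, `opt_RVV::fastDepthwiseConv`, and `opt_RVV::fastGEMM1T` available to the call sites below. A minimal sketch of that build-time/run-time split, with hypothetical stand-ins (`hasRVV`, `fastConvSketch`) for OpenCV's real probe and kernels:

```cpp
// Sketch of the per-ISA namespace pattern; hasRVV and fastConvSketch are
// hypothetical stand-ins for checkHardwareSupport(CPU_RVV) and the kernels.
#include <cstdio>

namespace opt_RVV  { void fastConvSketch() { std::puts("RVV kernel");      } }
namespace baseline { void fastConvSketch() { std::puts("baseline kernel"); } }

static bool hasRVV() { return false; }   // runtime CPU-feature probe stand-in

void runConv()
{
    if (hasRVV())                        // decided at run time, per machine
        opt_RVV::fastConvSketch();       // ISA-specific build of the same source
    else
        baseline::fastConvSketch();      // portable fallback build
}
```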

modules/dnn/src/int8layers/convolution_layer.cpp

Lines changed: 16 additions & 1 deletion

```diff
@@ -702,13 +702,14 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
         int blk_size_cn;
         int inpZp, outZp;
         const std::vector<float>* multiplier;
 
         ParallelConv()
             : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
+              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false)
             , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
         {}
 
@@ -765,6 +766,7 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
 
         p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
+        p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;
 
         int kernel_d = isConv3D? kernel_size[0] : 1;
         int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -970,6 +972,13 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                         biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if(useRVV)
+                    opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                        stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                        biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                else
+            #endif
             #if CV_RVP052
                 if(isConv2D)
                     opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
@@ -1356,6 +1365,12 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                         outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if(useRVV)
+                    opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                        outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                else
+            #endif
             #if CV_RVP052
                 if(isConv2D)
                     opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
```
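Whichever branch wins the dispatch above, `fastDepthwiseConv` and `fastConv` receive the same quantization parameters (`inpZp`, `outZp`, per-channel `multptr`), so every output pixel ends in the same requantization step. A scalar sketch of that step, inferred from those parameter names — the exact rounding and bias folding are assumptions, not OpenCV's verbatim code:

```cpp
// Inferred requantization of one int8 output: acc is the raw int32 accumulator,
// bias and mult are the channel's bias and multiplier. Assumption-level sketch;
// the real kernels may fold the bias earlier or round differently.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t requantize(int32_t acc, int32_t bias, float mult, int outZp)
{
    int32_t out = (int32_t)std::lround((acc + bias) * mult) + outZp;
    return (int8_t)std::min(127, std::max(-128, out));  // saturate to int8 range
}
```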

modules/dnn/src/int8layers/fully_connected_layer.cpp

Lines changed: 8 additions & 1 deletion

```diff
@@ -228,7 +228,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
     {
     public:
         FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false) {}
 
         static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                         const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -253,6 +253,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
             p.useAVX2 = checkHardwareSupport(CPU_AVX2);
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
             p.useLASX = checkHardwareSupport(CPU_LASX);
+            p.useRVV = checkHardwareSupport(CPU_RVV);
 
             parallel_for_(Range(0, nstripes), p, nstripes);
         }
@@ -303,6 +304,11 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
                     opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if( useRVV)
+                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
+            #endif
             #if CV_RVP052
                 if( 1 )
                     opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
@@ -363,6 +369,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
     };
 
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
```
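To make the `fastGEMM1T` contract concrete: the call site hands it one int8 input vector (`sptr`), a weight matrix (`wptr` with row stride `wstep`), per-output bias and multiplier arrays, and the output zero point, and expects `nw` requantized int8 outputs. A plausible scalar equivalent, reconstructed from those arguments — the loop body is an illustrative assumption, not the committed RVV code:

```cpp
// Scalar reference for the fastGEMM1T call above: dst = requantize(W * s + bias).
// Parameter names mirror the call site; the body is an illustrative assumption.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static void fastGEMM1T_ref(const int8_t* sptr, const int8_t* wptr, size_t wstep,
                           const int* biasptr, const float* multptr,
                           int8_t* dptr, int nw, int vecsize, int outZp)
{
    for (int i = 0; i < nw; i++)                // one int8 output per weight row
    {
        const int8_t* w = wptr + (size_t)i * wstep;
        int32_t acc = biasptr[i];               // start from this output's bias
        for (int k = 0; k < vecsize; k++)       // int8 x int8 products, int32 sum
            acc += (int32_t)sptr[k] * (int32_t)w[k];
        int32_t out = (int32_t)std::lround(acc * multptr[i]) + outZp;
        dptr[i] = (int8_t)std::min(127, std::max(-128, out));  // saturate
    }
}
```

An RVV kernel like `opt_RVV::fastGEMM1T` would vectorize the inner `k` loop with exactly the widening multiply/reduce pattern sketched near the top of this page.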
