Commit 54597ea

try to fix crash in perf_dnn by adding a compute branch for AVX/AVX2

1 parent: b0eddeb

File tree

3 files changed, +68 −2 lines

modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp

Lines changed: 54 additions & 0 deletions
@@ -9,6 +9,8 @@ namespace dnn {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
 
 #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
 
@@ -78,6 +80,58 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
     _mm256_zeroupper();
 }
 
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
+{
+    CV_Assert(convNR == 24);
+    __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0;
+
+    if (outLen > 8)
+    {
+        for (int p = 0; p < np; p++, a++, b += convNR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+            c1 = _mm256_fmadd_ps(b1, a0, c1);
+            c2 = _mm256_fmadd_ps(b2, a0, c2);
+        }
+    }
+    else
+    {
+        for (int p = 0; p < np; p++, a++, b += convNR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+        }
+    }
+
+    if (init_c)
+    {
+        c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0);
+        c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1);
+        c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2);
+    }
+
+    if (ifMinMaxAct)
+    {
+        __m256 vmax = _mm256_set1_ps(maxval);
+        __m256 vmin = _mm256_set1_ps(minval);
+
+        c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax);
+        c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax);
+        c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax);
+    }
+
+    _mm256_storeu_ps(c, c0);
+    _mm256_storeu_ps(c + 8, c1);
+    _mm256_storeu_ps(c + 16, c2);
+    _mm256_zeroupper();
+}
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
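For orientation, the new kernel is a vectorized single-output-row multiply-accumulate. The scalar restatement below is my own illustration, not part of the commit (the name convBlockMR1_ref and its bounds handling are assumptions): it computes c[j] = bias + sum over p of a[p]*b[p*convNR + j] for j < outLen, optionally accumulates into the existing contents of c, and optionally clamps to [minval, maxval].

#include <algorithm>

// Hypothetical scalar reference for the AVX convBlockMR1 above (illustration
// only, not in the commit): np rank-1 updates into one row of outputs,
// optional fused add, optional min/max (clamped-ReLU-style) activation.
void convBlockMR1_ref(int np, const float* a, const float* b, float* c,
                      float bias, bool init_c, float minval, float maxval,
                      bool ifMinMaxAct, int outLen, int convNR)
{
    for (int j = 0; j < outLen; j++)
    {
        float s = bias;
        for (int p = 0; p < np; p++)
            s += a[p] * b[p * convNR + j];             // accumulate over np taps
        if (init_c)
            s += c[j];                                 // fused add of prior output
        if (ifMinMaxAct)
            s = std::min(std::max(s, minval), maxval); // clamp activation
        c[j] = s;
    }
}

One difference worth noting: the vector kernel unconditionally stores all convNR == 24 lanes (and, when init_c is set, reads them too) regardless of outLen, so it appears to assume the caller provides a c buffer padded to at least 24 floats; the outLen > 8 test only decides whether one or three FMA streams run in the hot loop.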

modules/dnn/src/layers/cpu_kernels/convolution.cpp

Lines changed: 13 additions & 2 deletions
@@ -456,7 +456,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
     int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
 
     int ksize = Dk*Hk*Wk;
-    bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1;
+    bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1
+                    && pad_front == 0 && pad_left == 0 && pad_top == 0;
     int DkHkWkCg = Dk*Hk*Wk*Cg;
 
     std::vector<int> ofstab_(Hk*Wk*Dk*4, 0);
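The tightened condition reflects why the 1x1 shortcut exists at all: with a 1x1 kernel and unit stride, each output pixel maps one-to-one onto an input pixel, so the input tensor can be fed to the GEMM micro-kernels without an im2row step; any nonzero padding grows the output grid past the input and breaks that identity. A quick check with the standard output-size formula (illustrative helper, not from the patch):

// Standard convolution output-size arithmetic (illustrative, not in the patch).
int convOutSize(int in, int k, int stride, int pad, int dilation)
{
    return (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}
// convOutSize(32, 1, 1, /*pad=*/0, 1) == 32 -> output == input, the raw input
//     can stand in for the im2row buffer
// convOutSize(32, 1, 1, /*pad=*/1, 1) == 34 -> border outputs have no backing
//     input pixel, so the shortcut would index out of bounds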
@@ -502,7 +503,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
         }
     }
 
-    int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;
+    int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR;
 
     // Friendly to L1 cache
     const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
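For scale (assuming CONV_NR == 24, the value the new AVX kernel asserts): the generic path keeps ceil(56/24) == 3 output stripes in flight, while the depthwise-remain path is now pinned to a single stripe, matching its single-row convBlockMR1 kernel and the K_BLOCK_SIZE == 1 choice just below.

// Illustrative stripe arithmetic; CONV_NR == 24 is assumed from the
// CV_Assert(convNR == 24) in the new AVX kernel.
constexpr int CONV_NR_EX = 24;
constexpr int MAX_STRIPES_GENERIC = (56 + CONV_NR_EX - 1) / CONV_NR_EX; // ceil(56/24)
static_assert(MAX_STRIPES_GENERIC == 3, "rounded-up division");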
@@ -949,6 +950,16 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                 cptr = cptr0;
             }
 
+#if CV_TRY_AVX2
+            if (conv->useAVX2)
+                opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+            else
+#endif
+#if CV_TRY_AVX
+            if (conv->useAVX)
+                opt_AVX::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+            else
+#endif
             convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
 
             if (ifBuffer)
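The block above follows OpenCV's usual two-level dispatch idiom: a compile-time guard (CV_TRY_AVX*) around a runtime CPU check (conv->useAVX*), with each else deliberately dangling across the #endif so the chain terminates at the universal kernel. A self-contained sketch of the pattern with hypothetical names:

// Standalone sketch of the fall-through dispatch idiom (hypothetical names;
// the real code uses CV_TRY_AVX2/CV_TRY_AVX and conv->useAVX2/conv->useAVX).
#include <cstdio>

#define MY_TRY_AVX2 1   // compile-time: toolchain can emit AVX2
#define MY_TRY_AVX  1   // compile-time: toolchain can emit AVX

void dispatch(bool useAVX2, bool useAVX)
{
#if MY_TRY_AVX2
    if (useAVX2)
        std::printf("AVX2 kernel\n");      // e.g. opt_AVX2::convBlockMR1(...)
    else
#endif
#if MY_TRY_AVX
    if (useAVX)
        std::printf("AVX kernel\n");       // e.g. opt_AVX::convBlockMR1(...)
    else
#endif
        std::printf("baseline kernel\n");  // portable fallback, always compiled
}

int main()
{
    dispatch(true,  true);   // prints "AVX2 kernel"
    dispatch(false, true);   // prints "AVX kernel"
    dispatch(false, false);  // prints "baseline kernel"
    return 0;
}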

modules/dnn/test/test_onnx_importer.cpp

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ TEST_P(Test_ONNX_layers, Convolution3D_bias)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
     }
     testONNXModels("conv3d_bias");
+    testONNXModels("conv3d_depthwise_bias"); // kernel 1x1
 }
 
 TEST_P(Test_ONNX_layers, Two_convolution)
