
Commit a8ae90b (1 parent: b0eddeb)

try to fix crash in perf_dnn by adding a compute branch for AVX/AVX2

2 files changed: +69 −1 lines

modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp

Lines changed: 58 additions & 0 deletions

@@ -9,6 +9,8 @@ namespace dnn {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
 
 #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
 
@@ -78,6 +80,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
     _mm256_zeroupper();
 }
 
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
+{
+#if CONV_NR == 24
+    __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0;
+
+    if (outLen > 8)
+    {
+        for (int p = 0; p < np; p++, a++, b += CONV_NR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+            c1 = _mm256_fmadd_ps(b1, a0, c1);
+            c2 = _mm256_fmadd_ps(b2, a0, c2);
+        }
+    }
+    else
+    {
+        for (int p = 0; p < np; p++, a++, b += CONV_NR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+        }
+    }
+
+    if (init_c)
+    {
+        c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0);
+        c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1);
+        c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2);
+    }
+
+    if (ifMinMaxAct)
+    {
+        __m256 vmax = _mm256_set1_ps(maxval);
+        __m256 vmin = _mm256_set1_ps(minval);
+
+        c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax);
+        c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax);
+        c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax);
+    }
+
+    _mm256_storeu_ps(c, c0);
+    _mm256_storeu_ps(c + 8, c1);
+    _mm256_storeu_ps(c + 16, c2);
+    _mm256_zeroupper();
+#else
+#error "unsupported CONV_NR in convBlockMR1."
+#endif
+}
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
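
For context, the kernel added above is a single-output-row ("MR = 1") convolution micro-kernel: it accumulates np fused multiply-adds of one input value against 24 packed weights on top of the bias, optionally adds the existing output values (the fused-add path, driven by init_c), and optionally clamps to [minval, maxval] for fused activations. Below is a minimal scalar reference of that contract; it is a sketch for readability only, not OpenCV code, and the function name is ours.

#include <algorithm>

// Scalar sketch of what the AVX convBlockMR1 above computes (convNR == 24).
// Illustrative only -- not part of OpenCV.
static void convBlockMR1_scalar_ref(int np, const float* a, const float* b, float* c,
                                    float bias, bool init_c, float minval, float maxval,
                                    bool ifMinMaxAct, int outLen, int convNR)
{
    float acc[24];
    for (int j = 0; j < convNR; j++)
        acc[j] = bias;                              // accumulators start at the bias

    // The vector code only accumulates into the first 8 lanes when outLen <= 8.
    const int lanes = (outLen > 8) ? convNR : 8;
    for (int p = 0; p < np; p++)
        for (int j = 0; j < lanes; j++)
            acc[j] += a[p] * b[p * convNR + j];     // rank-1 update, one input value at a time

    for (int j = 0; j < convNR; j++)
    {
        float v = acc[j];
        if (init_c)                                 // fused-add: accumulate into existing c
            v += c[j];
        if (ifMinMaxAct)                            // fused clamp activation (e.g. ReLU6-style)
            v = std::min(std::max(v, minval), maxval);
        c[j] = v;                                   // all 24 lanes are stored, like the AVX path
    }
}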

modules/dnn/src/layers/cpu_kernels/convolution.cpp

Lines changed: 11 additions & 1 deletion

@@ -502,7 +502,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
         }
     }
 
-    int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;
+    int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR;
 
     // Friendly to L1 cache
     const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
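
To make the changed line concrete: in a CONV_NR == 24 build (the configuration the first file's `#if CONV_NR == 24` branch targets), the general path keeps MAX_STRIPES = (56 + 24 − 1)/24 = ceil(56/24) = 3, while the CONV_TYPE_DEPTHWISE_REMAIN path is now capped at a single stripe, matching the K_BLOCK_SIZE of 1 it already uses on the line below. This cap is presumably part of what keeps the depthwise-remain branch inside its per-task buffers, and thus the likely connection to the perf_dnn crash named in the commit message.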
@@ -949,6 +949,16 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                         cptr = cptr0;
                     }
 
+#if CV_TRY_AVX2
+                    if (conv->useAVX2)
+                        opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+                    else
+#endif
+#if CV_TRY_AVX
+                    if (conv->useAVX)
+                        opt_AVX::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+                    else
+#endif
                     convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
 
                     if (ifBuffer)
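
The new call site follows OpenCV's usual two-level dispatch: a compile-time guard (CV_TRY_AVX2 means the toolchain can emit AVX2 code) wrapping a runtime check (conv->useAVX2 means this CPU reports the feature), with each guarded dangling `else` chaining into the next candidate so exactly one kernel runs. A self-contained sketch of that pattern, with illustrative names (TRY_*, Ctx, and the kernel_* functions are ours, not OpenCV's):

#include <cstdio>

// Stand-ins for OpenCV's compile-time/runtime dispatch pair.
#define TRY_AVX2 1   // "the compiler can emit AVX2" (cf. CV_TRY_AVX2)
#define TRY_AVX  1   // "the compiler can emit AVX"  (cf. CV_TRY_AVX)

struct Ctx { bool useAVX2, useAVX; };  // runtime CPU feature flags (cf. conv->useAVX2 / conv->useAVX)

static void kernel_avx2() { std::puts("AVX2 kernel"); }     // cf. opt_AVX2::convBlockMR1
static void kernel_avx()  { std::puts("AVX kernel"); }      // cf. opt_AVX::convBlockMR1
static void kernel_base() { std::puts("baseline kernel"); } // cf. plain convBlockMR1

static void dispatch(const Ctx& ctx)
{
    // Each #if-guarded branch ends in a dangling `else`, so after preprocessing
    // the next guarded `if` (or the baseline call) becomes its else-body.
#if TRY_AVX2
    if (ctx.useAVX2)
        kernel_avx2();
    else
#endif
#if TRY_AVX
    if (ctx.useAVX)
        kernel_avx();
    else
#endif
    kernel_base();
}

int main()
{
    dispatch({true,  true});   // prints "AVX2 kernel"
    dispatch({false, true});   // prints "AVX kernel"
    dispatch({false, false});  // prints "baseline kernel"
    return 0;
}

If either TRY_* macro is 0, its branch disappears entirely and the chain still parses, which is why the unchanged baseline call can serve as the final fallback.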
