Commit 54597ea

try to fix crash in perf_dnn by adding a compute branch for AVX/AVX2

1 parent: b0eddeb

File tree

3 files changed, +68 −2 lines

modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp

Lines changed: 54 additions & 0 deletions
@@ -9,6 +9,8 @@ namespace dnn {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
 
 #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
 
@@ -78,6 +80,58 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
     _mm256_zeroupper();
 }
 
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
+{
+    CV_Assert(convNR == 24);
+    __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0;
+
+    if (outLen > 8)
+    {
+        for (int p = 0; p < np; p++, a++, b += convNR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+            c1 = _mm256_fmadd_ps(b1, a0, c1);
+            c2 = _mm256_fmadd_ps(b2, a0, c2);
+        }
+    }
+    else
+    {
+        for (int p = 0; p < np; p++, a++, b += convNR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+        }
+    }
+
+    if (init_c)
+    {
+        c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0);
+        c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1);
+        c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2);
+    }
+
+    if (ifMinMaxAct)
+    {
+        __m256 vmax = _mm256_set1_ps(maxval);
+        __m256 vmin = _mm256_set1_ps(minval);
+
+        c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax);
+        c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax);
+        c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax);
+    }
+
+    _mm256_storeu_ps(c, c0);
+    _mm256_storeu_ps(c + 8, c1);
+    _mm256_storeu_ps(c + 16, c2);
+    _mm256_zeroupper();
+}
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
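For orientation, the new kernel is a vectorized single-output-row multiply-accumulate. The scalar restatement below is my own illustration, not part of the commit (the name convBlockMR1_ref and its bounds handling are assumptions): it computes c[j] = bias + sum over p of a[p]*b[p*convNR + j] for j < outLen, optionally accumulates into the existing contents of c, and optionally clamps to [minval, maxval].

#include <algorithm>

// Hypothetical scalar reference for the AVX convBlockMR1 above (illustration
// only, not in the commit): np rank-1 updates into one row of outputs,
// optional fused add, optional min/max (clamped-ReLU-style) activation.
void convBlockMR1_ref(int np, const float* a, const float* b, float* c,
                      float bias, bool init_c, float minval, float maxval,
                      bool ifMinMaxAct, int outLen, int convNR)
{
    for (int j = 0; j < outLen; j++)
    {
        float s = bias;
        for (int p = 0; p < np; p++)
            s += a[p] * b[p * convNR + j];             // accumulate over np taps
        if (init_c)
            s += c[j];                                 // fused add of prior output
        if (ifMinMaxAct)
            s = std::min(std::max(s, minval), maxval); // clamp activation
        c[j] = s;
    }
}

One difference worth noting: the vector kernel unconditionally stores all convNR == 24 lanes (and, when init_c is set, reads them too) regardless of outLen, so it appears to assume the caller provides a c buffer padded to at least 24 floats; the outLen > 8 test only decides whether one or three FMA streams run in the hot loop.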

modules/dnn/src/layers/cpu_kernels/convolution.cpp

Lines changed: 13 additions & 2 deletions
@@ -456,7 +456,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
     int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;
 
     int ksize = Dk*Hk*Wk;
-    bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1;
+    bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1
+                    && pad_front == 0 && pad_left == 0 && pad_top == 0;
     int DkHkWkCg = Dk*Hk*Wk*Cg;
 
     std::vector<int> ofstab_(Hk*Wk*Dk*4, 0);
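The tightened condition reflects why the 1x1 shortcut exists at all: with a 1x1 kernel and unit stride, each output pixel maps one-to-one onto an input pixel, so the input tensor can be fed to the GEMM micro-kernels without an im2row step; any nonzero padding grows the output grid past the input and breaks that identity. A quick check with the standard output-size formula (illustrative helper, not from the patch):

// Standard convolution output-size arithmetic (illustrative, not in the patch).
int convOutSize(int in, int k, int stride, int pad, int dilation)
{
    return (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}
// convOutSize(32, 1, 1, /*pad=*/0, 1) == 32 -> output == input, the raw input
//     can stand in for the im2row buffer
// convOutSize(32, 1, 1, /*pad=*/1, 1) == 34 -> border outputs have no backing
//     input pixel, so the shortcut would index out of bounds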
@@ -502,7 +503,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
         }
     }
 
-    int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;
+    int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR;
 
     // Friendly to L1 cache
     const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
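For scale (assuming CONV_NR == 24, the value the new AVX kernel asserts): the generic path keeps ceil(56/24) == 3 output stripes in flight, while the depthwise-remain path is now pinned to a single stripe, matching its single-row convBlockMR1 kernel and the K_BLOCK_SIZE == 1 choice just below.

// Illustrative stripe arithmetic; CONV_NR == 24 is assumed from the
// CV_Assert(convNR == 24) in the new AVX kernel.
constexpr int CONV_NR_EX = 24;
constexpr int MAX_STRIPES_GENERIC = (56 + CONV_NR_EX - 1) / CONV_NR_EX; // ceil(56/24)
static_assert(MAX_STRIPES_GENERIC == 3, "rounded-up division");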
@@ -949,6 +950,16 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                 cptr = cptr0;
             }
 
+#if CV_TRY_AVX2
+            if (conv->useAVX2)
+                opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+            else
+#endif
+#if CV_TRY_AVX
+            if (conv->useAVX)
+                opt_AVX::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+            else
+#endif
             convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
 
             if (ifBuffer)
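The block above follows OpenCV's usual two-level dispatch idiom: a compile-time guard (CV_TRY_AVX*) around a runtime CPU check (conv->useAVX*), with each else deliberately dangling across the #endif so the chain terminates at the universal kernel. A self-contained sketch of the pattern with hypothetical names:

// Standalone sketch of the fall-through dispatch idiom (hypothetical names;
// the real code uses CV_TRY_AVX2/CV_TRY_AVX and conv->useAVX2/conv->useAVX).
#include <cstdio>

#define MY_TRY_AVX2 1   // compile-time: toolchain can emit AVX2
#define MY_TRY_AVX  1   // compile-time: toolchain can emit AVX

void dispatch(bool useAVX2, bool useAVX)
{
#if MY_TRY_AVX2
    if (useAVX2)
        std::printf("AVX2 kernel\n");      // e.g. opt_AVX2::convBlockMR1(...)
    else
#endif
#if MY_TRY_AVX
    if (useAVX)
        std::printf("AVX kernel\n");       // e.g. opt_AVX::convBlockMR1(...)
    else
#endif
        std::printf("baseline kernel\n");  // portable fallback, always compiled
}

int main()
{
    dispatch(true,  true);   // prints "AVX2 kernel"
    dispatch(false, true);   // prints "AVX kernel"
    dispatch(false, false);  // prints "baseline kernel"
    return 0;
}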

modules/dnn/test/test_onnx_importer.cpp

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ TEST_P(Test_ONNX_layers, Convolution3D_bias)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
     }
     testONNXModels("conv3d_bias");
+    testONNXModels("conv3d_depthwise_bias"); // kernel 1x1
 }
 
 TEST_P(Test_ONNX_layers, Two_convolution)
