
Commit a8ae90b (1 parent: b0eddeb)

try to fix crash in perf_dnn by adding a compute branch for AVX/AVX2

2 files changed: +69 −1 lines

modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp

Lines changed: 58 additions & 0 deletions

@@ -9,6 +9,8 @@ namespace dnn {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR);
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
 
 #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
 
@@ -78,6 +80,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
     _mm256_zeroupper();
 }
 
+void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+                  const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
+{
+#if CONV_NR == 24
+    __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0;
+
+    if (outLen > 8)
+    {
+        for (int p = 0; p < np; p++, a++, b += CONV_NR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+            c1 = _mm256_fmadd_ps(b1, a0, c1);
+            c2 = _mm256_fmadd_ps(b2, a0, c2);
+        }
+    }
+    else
+    {
+        for (int p = 0; p < np; p++, a++, b += CONV_NR)
+        {
+            __m256 a0 = _mm256_set1_ps(a[0]);
+            __m256 b0 = _mm256_loadu_ps(b);
+
+            c0 = _mm256_fmadd_ps(b0, a0, c0);
+        }
+    }
+
+    if (init_c)
+    {
+        c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0);
+        c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1);
+        c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2);
+    }
+
+    if (ifMinMaxAct)
+    {
+        __m256 vmax = _mm256_set1_ps(maxval);
+        __m256 vmin = _mm256_set1_ps(minval);
+
+        c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax);
+        c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax);
+        c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax);
+    }
+
+    _mm256_storeu_ps(c, c0);
+    _mm256_storeu_ps(c + 8, c1);
+    _mm256_storeu_ps(c + 16, c2);
+    _mm256_zeroupper();
+#else
+#error "unsupported CONV_NR in convBlockMR1."
+#endif
+}
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
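
For context, the kernel added above is a single-output-row ("MR = 1") convolution micro-kernel: it accumulates np fused multiply-adds of one input value against 24 packed weights on top of the bias, optionally adds the existing output values (the fused-add path, driven by init_c), and optionally clamps to [minval, maxval] for fused activations. Below is a minimal scalar reference of that contract; it is a sketch for readability only, not OpenCV code, and the function name is ours.

#include <algorithm>

// Scalar sketch of what the AVX convBlockMR1 above computes (convNR == 24).
// Illustrative only -- not part of OpenCV.
static void convBlockMR1_scalar_ref(int np, const float* a, const float* b, float* c,
                                    float bias, bool init_c, float minval, float maxval,
                                    bool ifMinMaxAct, int outLen, int convNR)
{
    float acc[24];
    for (int j = 0; j < convNR; j++)
        acc[j] = bias;                              // accumulators start at the bias

    // The vector code only accumulates into the first 8 lanes when outLen <= 8.
    const int lanes = (outLen > 8) ? convNR : 8;
    for (int p = 0; p < np; p++)
        for (int j = 0; j < lanes; j++)
            acc[j] += a[p] * b[p * convNR + j];     // rank-1 update, one input value at a time

    for (int j = 0; j < convNR; j++)
    {
        float v = acc[j];
        if (init_c)                                 // fused-add: accumulate into existing c
            v += c[j];
        if (ifMinMaxAct)                            // fused clamp activation (e.g. ReLU6-style)
            v = std::min(std::max(v, minval), maxval);
        c[j] = v;                                   // all 24 lanes are stored, like the AVX path
    }
}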

modules/dnn/src/layers/cpu_kernels/convolution.cpp

Lines changed: 11 additions & 1 deletion

@@ -502,7 +502,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
         }
     }
 
-    int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;
+    int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR;
 
     // Friendly to L1 cache
     const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
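
To make the changed line concrete: in a CONV_NR == 24 build (the configuration the first file's `#if CONV_NR == 24` branch targets), the general path keeps MAX_STRIPES = (56 + 24 − 1)/24 = ceil(56/24) = 3, while the CONV_TYPE_DEPTHWISE_REMAIN path is now capped at a single stripe, matching the K_BLOCK_SIZE of 1 it already uses on the line below. This cap is presumably part of what keeps the depthwise-remain branch inside its per-task buffers, and thus the likely connection to the perf_dnn crash named in the commit message.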
@@ -949,6 +949,16 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                         cptr = cptr0;
                     }
 
+#if CV_TRY_AVX2
+                    if (conv->useAVX2)
+                        opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+                    else
+#endif
+#if CV_TRY_AVX
+                    if (conv->useAVX)
+                        opt_AVX::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+                    else
+#endif
                     convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
 
                     if (ifBuffer)
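
The new call site follows OpenCV's usual two-level dispatch: a compile-time guard (CV_TRY_AVX2 means the toolchain can emit AVX2 code) wrapping a runtime check (conv->useAVX2 means this CPU reports the feature), with each guarded dangling `else` chaining into the next candidate so exactly one kernel runs. A self-contained sketch of that pattern, with illustrative names (TRY_*, Ctx, and the kernel_* functions are ours, not OpenCV's):

#include <cstdio>

// Stand-ins for OpenCV's compile-time/runtime dispatch pair.
#define TRY_AVX2 1   // "the compiler can emit AVX2" (cf. CV_TRY_AVX2)
#define TRY_AVX  1   // "the compiler can emit AVX"  (cf. CV_TRY_AVX)

struct Ctx { bool useAVX2, useAVX; };  // runtime CPU feature flags (cf. conv->useAVX2 / conv->useAVX)

static void kernel_avx2() { std::puts("AVX2 kernel"); }     // cf. opt_AVX2::convBlockMR1
static void kernel_avx()  { std::puts("AVX kernel"); }      // cf. opt_AVX::convBlockMR1
static void kernel_base() { std::puts("baseline kernel"); } // cf. plain convBlockMR1

static void dispatch(const Ctx& ctx)
{
    // Each #if-guarded branch ends in a dangling `else`, so after preprocessing
    // the next guarded `if` (or the baseline call) becomes its else-body.
#if TRY_AVX2
    if (ctx.useAVX2)
        kernel_avx2();
    else
#endif
#if TRY_AVX
    if (ctx.useAVX)
        kernel_avx();
    else
#endif
    kernel_base();
}

int main()
{
    dispatch({true,  true});   // prints "AVX2 kernel"
    dispatch({false, true});   // prints "AVX kernel"
    dispatch({false, false});  // prints "baseline kernel"
    return 0;
}

If either TRY_* macro is 0, its branch disappears entirely and the chain still parses, which is why the unchanged baseline call can serve as the final fallback.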
