@@ -9,6 +9,8 @@ namespace dnn {
9
9
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
10
10
11
11
void convBlock (int np, const float * a, const float * b, float * c, int ldc, bool init_c, const int convMR, const int convNR);
12
+ void convBlockMR1 (int np, const float * a, const float * b, float *c, const float bias, bool init_c,
13
+ const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
12
14
13
15
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
14
16
@@ -78,6 +80,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
78
80
_mm256_zeroupper ();
79
81
}
80
82
83
+ void convBlockMR1 (int np, const float * a, const float * b, float *c, const float bias, bool init_c,
84
+ const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
85
+ {
86
+ #if CONV_NR == 24
87
+ __m256 c0 = _mm256_set1_ps (bias), c1 = c0, c2 = c0;
88
+
89
+ if (outLen > 8 )
90
+ {
91
+ for (int p = 0 ; p < np; p++, a++, b += CONV_NR)
92
+ {
93
+ __m256 a0 = _mm256_set1_ps (a[0 ]);
94
+ __m256 b0 = _mm256_loadu_ps (b), b1 = _mm256_loadu_ps (b + 8 ), b2 = _mm256_loadu_ps (b + 16 );
95
+
96
+ c0 = _mm256_fmadd_ps (b0, a0, c0);
97
+ c1 = _mm256_fmadd_ps (b1, a0, c1);
98
+ c2 = _mm256_fmadd_ps (b2, a0, c2);
99
+ }
100
+ }
101
+ else
102
+ {
103
+ for (int p = 0 ; p < np; p++, a++, b += CONV_NR)
104
+ {
105
+ __m256 a0 = _mm256_set1_ps (a[0 ]);
106
+ __m256 b0 = _mm256_loadu_ps (b);
107
+
108
+ c0 = _mm256_fmadd_ps (b0, a0, c0);
109
+ }
110
+ }
111
+
112
+ if (init_c)
113
+ {
114
+ c0 = _mm256_add_ps (_mm256_loadu_ps (c), c0);
115
+ c1 = _mm256_add_ps (_mm256_loadu_ps (c + 8 ), c1);
116
+ c2 = _mm256_add_ps (_mm256_loadu_ps (c + 16 ), c2);
117
+ }
118
+
119
+ if (ifMinMaxAct)
120
+ {
121
+ __m256 vmax = _mm256_set1_ps (maxval);
122
+ __m256 vmin = _mm256_set1_ps (minval);
123
+
124
+ c0 = _mm256_min_ps (_mm256_max_ps (c0, vmin), vmax);
125
+ c1 = _mm256_min_ps (_mm256_max_ps (c1, vmin), vmax);
126
+ c2 = _mm256_min_ps (_mm256_max_ps (c2, vmin), vmax);
127
+ }
128
+
129
+ _mm256_storeu_ps (c, c0);
130
+ _mm256_storeu_ps (c + 8 , c1);
131
+ _mm256_storeu_ps (c + 16 , c2);
132
+ _mm256_zeroupper ();
133
+ #else
134
+ #error "unsupported CONV_NR in convBlockMR1."
135
+ #endif
136
+ }
137
+
138
+
81
139
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
82
140
83
141
CV_CPU_OPTIMIZATION_NAMESPACE_END
0 commit comments