Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit e21e962

Browse files
alex-davicenko-arm and jwright-arm
authored and committed
Add SVE2 implementation of vpx_highbd_convolve12_vert
Add an Arm SVE2 implementation of vpx_highbd_convolve12_vert and associated unit tests. Change-Id: Ibd086b6db1769fa90e428426eb99073151ba2c00
1 parent 764fbf9 commit e21e962

4 files changed

Lines changed: 154 additions & 5 deletions

File tree

test/convolve_test.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1818,8 +1818,11 @@ WRAP12TAP(convolve12_neon, 12)
18181818

18191819
#if HAVE_SVE2
18201820
WRAP12TAP(convolve12_horiz_sve2, 8)
1821+
WRAP12TAP(convolve12_vert_sve2, 8)
18211822
WRAP12TAP(convolve12_horiz_sve2, 10)
1823+
WRAP12TAP(convolve12_vert_sve2, 10)
18221824
WRAP12TAP(convolve12_horiz_sve2, 12)
1825+
WRAP12TAP(convolve12_vert_sve2, 12)
18231826
#endif // HAVE_SVE2
18241827

18251828
WRAP12TAP(convolve12_horiz_c, 8)
@@ -2189,15 +2192,15 @@ INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest,
21892192

21902193
#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
21912194
const ConvolveFunctions12Tap convolve12tap_8bit_sve2(
2192-
wrap_convolve12_horiz_sve2_8, wrap_convolve12_vert_c_8, wrap_convolve12_c_8,
2193-
8);
2195+
wrap_convolve12_horiz_sve2_8, wrap_convolve12_vert_sve2_8,
2196+
wrap_convolve12_c_8, 8);
21942197

21952198
const ConvolveFunctions12Tap convolve12tap_10bit_sve2(
2196-
wrap_convolve12_horiz_sve2_10, wrap_convolve12_vert_c_10,
2199+
wrap_convolve12_horiz_sve2_10, wrap_convolve12_vert_sve2_10,
21972200
wrap_convolve12_c_10, 10);
21982201

21992202
const ConvolveFunctions12Tap convolve12tap_12bit_sve2(
2200-
wrap_convolve12_horiz_sve2_12, wrap_convolve12_vert_c_12,
2203+
wrap_convolve12_horiz_sve2_12, wrap_convolve12_vert_sve2_12,
22012204
wrap_convolve12_c_12, 12);
22022205

22032206
const Convolve12TapParam kArrayConvolve12Tap_sve2[] = {

vp9/common/vp9_rtcd_defs.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ ()
206206

207207
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
208208
add_proto qw/void vpx_highbd_convolve12_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
209-
specialize qw/vpx_highbd_convolve12_vert ssse3 avx2 neon/;
209+
specialize qw/vpx_highbd_convolve12_vert ssse3 avx2 neon sve2/;
210210

211211
add_proto qw/void vpx_highbd_convolve12_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
212212
specialize qw/vpx_highbd_convolve12_horiz ssse3 avx2 neon sve2/;

vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "./vpx_config.h"
1818
#include "vpx/vpx_integer.h"
1919
#include "vpx_dsp/arm/mem_neon.h"
20+
#include "vpx_dsp/arm/transpose_neon.h"
2021
#include "vp9/encoder/vp9_temporal_filter.h"
2122
#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
2223
#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
@@ -132,3 +133,120 @@ void vpx_highbd_convolve12_horiz_sve2(const uint16_t *src, ptrdiff_t src_stride,
132133
h -= 2;
133134
} while (h != 0);
134135
}
136+
137+
// Produces four filtered, clamped high-bitdepth output samples from three
// groups of pre-arranged source vectors.
//
//   s0, s1, s2   - two int16x8_t vectors each; element [0] feeds output
//                  samples 0-1 and element [1] feeds output samples 2-3.
//                  The caller passes the row groups 0-3, 4-7 and 8-11 of the
//                  12-tap window here.
//   filter_0_7   - filter taps 0..7.
//   filter_4_11  - filter taps 4..11 (overlaps filter_0_7 by four taps).
//   max          - per-lane upper bound applied to the result.
//
// NOTE(review): vpx_dotq_lane_s16 is defined outside this view; presumably it
// accumulates, per 64-bit lane, a dot product of four 16-bit samples with the
// group of four taps selected by the last argument - confirm in
// vpx_neon_sve_bridge.h. Under that reading the three calls below use taps
// 0-3, 4-7 and 8-11 respectively.
static INLINE uint16x4_t highbd_convolve12_4_v(const int16x8_t s0[2],
138+
const int16x8_t s1[2],
139+
const int16x8_t s2[2],
140+
const int16x8_t filter_0_7,
141+
const int16x8_t filter_4_11,
142+
const uint16x4_t max) {
143+
// 64-bit accumulators for output samples 0 and 1, starting from zero.
int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
144+
sum01 = vpx_dotq_lane_s16(sum01, s1[0], filter_0_7, 1);
145+
// Tap group 1 of filter_4_11 is elements 4..7 of that vector, i.e. taps 8-11.
sum01 = vpx_dotq_lane_s16(sum01, s2[0], filter_4_11, 1);
146+
147+
// Same three accumulation steps for output samples 2 and 3.
int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
148+
sum23 = vpx_dotq_lane_s16(sum23, s1[1], filter_0_7, 1);
149+
sum23 = vpx_dotq_lane_s16(sum23, s2[1], filter_4_11, 1);
150+
151+
// Narrow each 64-bit sum to 32 bits and join the halves into one vector.
int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
152+
153+
// Rounding right shift by FILTER_BITS with saturating narrow to unsigned
// 16-bit, so negative sums become 0.
uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
154+
155+
// Clamp to the caller-supplied maximum sample value.
return vmin_u16(res, max);
156+
}
157+
158+
// Vertical 12-tap convolution for high-bitdepth (16-bit) samples.
//
//   src, src_stride  - source samples and row stride, in elements.
//   dst, dst_stride  - destination samples and row stride, in elements.
//   filter           - kernel table; the kernel at index y0_q4 is used.
//   x0_q4, x_step_q4 - not used here except when forwarded to the C fallback.
//   y0_q4            - index of the kernel to apply.
//   y_step_q4        - must be 16 for this path; any other value is delegated
//                      to vpx_highbd_convolve12_vert_c.
//   w, h             - block size; asserted to be w in {8, 16, 32} and h a
//                      multiple of 4.
//   bd               - bit depth; output is clamped to (1 << bd) - 1.
//
// The block is processed in 4-sample-wide column strips (outer loop), and each
// strip is processed four output rows at a time (inner loop).
void vpx_highbd_convolve12_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
159+
uint16_t *dst, ptrdiff_t dst_stride,
160+
const InterpKernel12 *filter, int x0_q4,
161+
int x_step_q4, int y0_q4, int y_step_q4,
162+
int w, int h, int bd) {
163+
// Scaling not supported by SVE2 implementation.
164+
if (y_step_q4 != 16) {
165+
vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter,
166+
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
167+
return;
168+
}
169+
// Both loops below terminate on an exact count of zero after stepping by 4,
// so w and h must be multiples of 4.
assert(w == 32 || w == 16 || w == 8);
170+
assert(h % 4 == 0);
171+
172+
// Two overlapping 8-element loads cover the selected kernel:
// elements 0..7 and elements 4..11.
const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]);
173+
const int16x8_t filter_4_11 = vld1q_s16(filter[y0_q4] + 4);
174+
175+
// Largest representable sample value for the given bit depth.
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
176+
177+
// Step back to the first row of the filter window.
// NOTE(review): MAX_FILTER_TAP is defined elsewhere; presumably 12, which
// gives a 5-row offset - confirm.
src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
178+
179+
do {
180+
// The samples are read through a signed 16-bit pointer because the load
// helpers and the filter kernel below operate on int16 vectors.
const int16_t *s = (const int16_t *)src;
181+
uint16_t *d = dst;
182+
int height = h;
183+
184+
// Prime the window with the first 11 rows of this strip; the inner loop
// supplies four more rows per iteration.
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
185+
load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
186+
&s9, &sA);
187+
s += 11 * src_stride;
188+
189+
// One pair of vectors per group of four consecutive rows (named after the
// rows it was built from).
// NOTE(review): transpose_concat_s16_4x4 is defined in transpose_neon.h,
// outside this view; presumably it transposes the 4x4 block so each output
// column's four samples are contiguous - confirm.
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
190+
s6789[2], s789A[2];
191+
transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]);
192+
transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]);
193+
transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]);
194+
transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]);
195+
transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]);
196+
transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]);
197+
transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]);
198+
transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]);
199+
200+
do {
201+
// Load the next four source rows for this strip.
int16x4_t sB, sC, sD, sE;
202+
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
203+
204+
// Build the four row groups that involve the newly loaded rows.
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
205+
transpose_concat_s16_4x4(s8, s9, sA, sB, &s89AB[0], &s89AB[1]);
206+
transpose_concat_s16_4x4(s9, sA, sB, sC, &s9ABC[0], &s9ABC[1]);
207+
transpose_concat_s16_4x4(sA, sB, sC, sD, &sABCD[0], &sABCD[1]);
208+
transpose_concat_s16_4x4(sB, sC, sD, sE, &sBCDE[0], &sBCDE[1]);
209+
210+
// Output row k combines the groups starting at rows k, k + 4 and k + 8,
// which together span the 12 rows of that row's filter window.
uint16x4_t d0 = highbd_convolve12_4_v(s0123, s4567, s89AB, filter_0_7,
211+
filter_4_11, max);
212+
uint16x4_t d1 = highbd_convolve12_4_v(s1234, s5678, s9ABC, filter_0_7,
213+
filter_4_11, max);
214+
uint16x4_t d2 = highbd_convolve12_4_v(s2345, s6789, sABCD, filter_0_7,
215+
filter_4_11, max);
216+
uint16x4_t d3 = highbd_convolve12_4_v(s3456, s789A, sBCDE, filter_0_7,
217+
filter_4_11, max);
218+
219+
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
220+
221+
// Prepare block for next iteration - reusing as much as possible.
222+
// Shuffle everything up four rows.
223+
s0123[0] = s4567[0];
224+
s0123[1] = s4567[1];
225+
s1234[0] = s5678[0];
226+
s1234[1] = s5678[1];
227+
s2345[0] = s6789[0];
228+
s2345[1] = s6789[1];
229+
s3456[0] = s789A[0];
230+
s3456[1] = s789A[1];
231+
s4567[0] = s89AB[0];
232+
s4567[1] = s89AB[1];
233+
s5678[0] = s9ABC[0];
234+
s5678[1] = s9ABC[1];
235+
s6789[0] = sABCD[0];
236+
s6789[1] = sABCD[1];
237+
s789A[0] = sBCDE[0];
238+
s789A[1] = sBCDE[1];
239+
240+
// Only the last three raw rows are needed again: they become rows 8, 9
// and A for the transposes at the top of the next iteration.
s8 = sC;
241+
s9 = sD;
242+
sA = sE;
243+
244+
s += 4 * src_stride;
245+
d += 4 * dst_stride;
246+
height -= 4;
247+
} while (height != 0);
248+
// Advance to the next 4-sample-wide column strip.
src += 4;
249+
dst += 4;
250+
w -= 4;
251+
} while (w != 0);
252+
}

vpx_dsp/arm/mem_neon.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,34 @@ static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
499499
*s3 = vld1_s16(s);
500500
}
501501

502+
// Loads 11 consecutive rows of four int16 values each.
//
//   s        - pointer to the first element of the first row.
//   p        - row stride, in int16 elements (added to s between rows).
//   s0..s10  - outputs; row i is stored to *s<i>.
//
// The pointer is advanced between loads only, so nothing beyond the four
// elements of the eleventh row is addressed.
static INLINE void load_s16_4x11(const int16_t *s, const ptrdiff_t p,
503+
int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
504+
int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
505+
int16x4_t *s6, int16x4_t *s7, int16x4_t *s8,
506+
int16x4_t *s9, int16x4_t *s10) {
507+
*s0 = vld1_s16(s);
508+
s += p;
509+
*s1 = vld1_s16(s);
510+
s += p;
511+
*s2 = vld1_s16(s);
512+
s += p;
513+
*s3 = vld1_s16(s);
514+
s += p;
515+
*s4 = vld1_s16(s);
516+
s += p;
517+
*s5 = vld1_s16(s);
518+
s += p;
519+
*s6 = vld1_s16(s);
520+
s += p;
521+
*s7 = vld1_s16(s);
522+
s += p;
523+
*s8 = vld1_s16(s);
524+
s += p;
525+
*s9 = vld1_s16(s);
526+
s += p;
527+
*s10 = vld1_s16(s);
528+
}
529+
502530
static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
503531
const uint16x4_t s0, const uint16x4_t s1,
504532
const uint16x4_t s2, const uint16x4_t s3) {

0 commit comments

Comments
 (0)