|
17 | 17 | #include "./vpx_config.h" |
18 | 18 | #include "vpx/vpx_integer.h" |
19 | 19 | #include "vpx_dsp/arm/mem_neon.h" |
| 20 | +#include "vpx_dsp/arm/transpose_neon.h" |
20 | 21 | #include "vp9/encoder/vp9_temporal_filter.h" |
21 | 22 | #include "vpx_dsp/arm/vpx_neon_sve_bridge.h" |
22 | 23 | #include "vpx_dsp/arm/vpx_neon_sve2_bridge.h" |
@@ -132,3 +133,120 @@ void vpx_highbd_convolve12_horiz_sve2(const uint16_t *src, ptrdiff_t src_stride, |
132 | 133 | h -= 2; |
133 | 134 | } while (h != 0); |
134 | 135 | } |
| 136 | + |
| 137 | +static INLINE uint16x4_t highbd_convolve12_4_v(const int16x8_t s0[2], |
| 138 | + const int16x8_t s1[2], |
| 139 | + const int16x8_t s2[2], |
| 140 | + const int16x8_t filter_0_7, |
| 141 | + const int16x8_t filter_4_11, |
| 142 | + const uint16x4_t max) { |
| 143 | + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); |
| 144 | + sum01 = vpx_dotq_lane_s16(sum01, s1[0], filter_0_7, 1); |
| 145 | + sum01 = vpx_dotq_lane_s16(sum01, s2[0], filter_4_11, 1); |
| 146 | + |
| 147 | + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); |
| 148 | + sum23 = vpx_dotq_lane_s16(sum23, s1[1], filter_0_7, 1); |
| 149 | + sum23 = vpx_dotq_lane_s16(sum23, s2[1], filter_4_11, 1); |
| 150 | + |
| 151 | + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); |
| 152 | + |
| 153 | + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); |
| 154 | + |
| 155 | + return vmin_u16(res, max); |
| 156 | +} |
| 157 | + |
| 158 | +void vpx_highbd_convolve12_vert_sve2(const uint16_t *src, ptrdiff_t src_stride, |
| 159 | + uint16_t *dst, ptrdiff_t dst_stride, |
| 160 | + const InterpKernel12 *filter, int x0_q4, |
| 161 | + int x_step_q4, int y0_q4, int y_step_q4, |
| 162 | + int w, int h, int bd) { |
| 163 | + // Scaling not supported by SVE2 implementation. |
| 164 | + if (y_step_q4 != 16) { |
| 165 | + vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, |
| 166 | + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); |
| 167 | + return; |
| 168 | + } |
| 169 | + assert(w == 32 || w == 16 || w == 8); |
| 170 | + assert(h % 4 == 0); |
| 171 | + |
| 172 | + const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]); |
| 173 | + const int16x8_t filter_4_11 = vld1q_s16(filter[y0_q4] + 4); |
| 174 | + |
| 175 | + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); |
| 176 | + |
| 177 | + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); |
| 178 | + |
| 179 | + do { |
| 180 | + const int16_t *s = (const int16_t *)src; |
| 181 | + uint16_t *d = dst; |
| 182 | + int height = h; |
| 183 | + |
| 184 | + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; |
| 185 | + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, |
| 186 | + &s9, &sA); |
| 187 | + s += 11 * src_stride; |
| 188 | + |
| 189 | + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], |
| 190 | + s6789[2], s789A[2]; |
| 191 | + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); |
| 192 | + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); |
| 193 | + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); |
| 194 | + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); |
| 195 | + transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]); |
| 196 | + transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]); |
| 197 | + transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]); |
| 198 | + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); |
| 199 | + |
| 200 | + do { |
| 201 | + int16x4_t sB, sC, sD, sE; |
| 202 | + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); |
| 203 | + |
| 204 | + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; |
| 205 | + transpose_concat_s16_4x4(s8, s9, sA, sB, &s89AB[0], &s89AB[1]); |
| 206 | + transpose_concat_s16_4x4(s9, sA, sB, sC, &s9ABC[0], &s9ABC[1]); |
| 207 | + transpose_concat_s16_4x4(sA, sB, sC, sD, &sABCD[0], &sABCD[1]); |
| 208 | + transpose_concat_s16_4x4(sB, sC, sD, sE, &sBCDE[0], &sBCDE[1]); |
| 209 | + |
| 210 | + uint16x4_t d0 = highbd_convolve12_4_v(s0123, s4567, s89AB, filter_0_7, |
| 211 | + filter_4_11, max); |
| 212 | + uint16x4_t d1 = highbd_convolve12_4_v(s1234, s5678, s9ABC, filter_0_7, |
| 213 | + filter_4_11, max); |
| 214 | + uint16x4_t d2 = highbd_convolve12_4_v(s2345, s6789, sABCD, filter_0_7, |
| 215 | + filter_4_11, max); |
| 216 | + uint16x4_t d3 = highbd_convolve12_4_v(s3456, s789A, sBCDE, filter_0_7, |
| 217 | + filter_4_11, max); |
| 218 | + |
| 219 | + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); |
| 220 | + |
| 221 | + // Prepare block for next iteration - reusing as much as possible. |
| 222 | + // Shuffle everything up four rows. |
| 223 | + s0123[0] = s4567[0]; |
| 224 | + s0123[1] = s4567[1]; |
| 225 | + s1234[0] = s5678[0]; |
| 226 | + s1234[1] = s5678[1]; |
| 227 | + s2345[0] = s6789[0]; |
| 228 | + s2345[1] = s6789[1]; |
| 229 | + s3456[0] = s789A[0]; |
| 230 | + s3456[1] = s789A[1]; |
| 231 | + s4567[0] = s89AB[0]; |
| 232 | + s4567[1] = s89AB[1]; |
| 233 | + s5678[0] = s9ABC[0]; |
| 234 | + s5678[1] = s9ABC[1]; |
| 235 | + s6789[0] = sABCD[0]; |
| 236 | + s6789[1] = sABCD[1]; |
| 237 | + s789A[0] = sBCDE[0]; |
| 238 | + s789A[1] = sBCDE[1]; |
| 239 | + |
| 240 | + s8 = sC; |
| 241 | + s9 = sD; |
| 242 | + sA = sE; |
| 243 | + |
| 244 | + s += 4 * src_stride; |
| 245 | + d += 4 * dst_stride; |
| 246 | + height -= 4; |
| 247 | + } while (height != 0); |
| 248 | + src += 4; |
| 249 | + dst += 4; |
| 250 | + w -= 4; |
| 251 | + } while (w != 0); |
| 252 | +} |
0 commit comments