1 /*
2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <arm_neon.h>
14
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
20 #include "aom_dsp/arm/aom_neon_sve2_bridge.h"
21 #include "aom_dsp/arm/mem_neon.h"
22 #include "aom_ports/mem.h"
23 #include "av1/common/convolve.h"
24 #include "av1/common/filter.h"
25
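// Permute indices for the SVE2 16-bit dot products: each group of eight
// entries holds two four-sample windows at consecutive start positions.
// Entries that wrap past index 7 select from a second source vector via TBL2
// and are adjusted at runtime by adding svcnth() (see the 'correction'
// vectors computed where this table is loaded).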
26 DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = {
27 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
28 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2,
29 };
30
static INLINE uint16x4_t convolve12_4_x(
32 int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
33 const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) {
34 int16x8_t permuted_samples[6];
35 permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
36 permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
37 permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
38 permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
39 permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
40 permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
41
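  // Accumulate the 12-tap filter in three groups of four taps: filter_0_7
  // lane group 0 covers taps 0-3, lane group 1 covers taps 4-7, and
  // filter_4_11 lane group 1 covers taps 8-11.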
42 int64x2_t sum01 =
43 aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
44 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
45 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
46
47 int64x2_t sum23 =
48 aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
49 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
50 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
51
52 int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
53 uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS);
54
55 return vmin_u16(res, max);
56 }
57
static INLINE uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1,
59 int16x8_t s2, int16x8_t filter_0_7,
60 int16x8_t filter_4_11, int64x2_t offset,
61 uint16x8x4_t permute_tbl,
62 uint16x8_t max) {
63 int16x8_t permuted_samples[8];
64 permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
65 permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
66 permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
67 permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
68 permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
69 permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
70 permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
71 permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);
72
73 int64x2_t sum01 =
74 aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
75 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
76 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
77
78 int64x2_t sum23 =
79 aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
80 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
81 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
82
83 int64x2_t sum45 =
84 aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
85 sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
86 sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);
87
88 int64x2_t sum67 =
89 aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
90 sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
91 sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);
92
93 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
94 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
95
96 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
97 vqrshrun_n_s32(sum4567, FILTER_BITS));
98
99 return vminq_u16(res, max);
100 }
101
static INLINE void highbd_convolve_x_sr_12tap_sve2(
103 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
104 int width, int height, const int16_t *y_filter_ptr,
105 ConvolveParams *conv_params, int bd) {
  // This shim allows us to do only one rounding shift instead of two.
107 const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));
108
109 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
110 const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
111
112 uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);
113 // Scale indices by size of the true vector length to avoid reading from an
114 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
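  // Each 64-bit constant packs per-lane increments of 0 or 1 in 16-bit lanes;
  // multiplying by svcnth() turns the ones into the true vector lane count so
  // that only the wrapped entries are moved onto the second TBL2 operand.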
115 uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64(
116 vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
117 permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0);
118
119 uint16x8_t correction1 = vreinterpretq_u16_u64(
120 vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
121 vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
122 permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1);
123
124 if (width == 4) {
125 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
126 const int16_t *s = (const int16_t *)src;
127
128 do {
129 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
130 load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
131 load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7);
132
133 uint16x4_t d0 = convolve12_4_x(s0, s1, y_filter_0_7, y_filter_4_11,
134 offset, permute_tbl, max);
135 uint16x4_t d1 = convolve12_4_x(s2, s3, y_filter_0_7, y_filter_4_11,
136 offset, permute_tbl, max);
137 uint16x4_t d2 = convolve12_4_x(s4, s5, y_filter_0_7, y_filter_4_11,
138 offset, permute_tbl, max);
139 uint16x4_t d3 = convolve12_4_x(s6, s7, y_filter_0_7, y_filter_4_11,
140 offset, permute_tbl, max);
141
142 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
143
144 s += 4 * src_stride;
145 dst += 4 * dst_stride;
146 height -= 4;
147 } while (height != 0);
148 } else {
149 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
150
151 do {
152 const int16_t *s = (const int16_t *)src;
153 uint16_t *d = dst;
154 int w = width;
155
156 do {
157 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
158 load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
159 load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10);
160 load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11);
161
162 uint16x8_t d0 = convolve12_8_x(s0, s1, s2, y_filter_0_7, y_filter_4_11,
163 offset, permute_tbl, max);
164 uint16x8_t d1 = convolve12_8_x(s3, s4, s5, y_filter_0_7, y_filter_4_11,
165 offset, permute_tbl, max);
166 uint16x8_t d2 = convolve12_8_x(s6, s7, s8, y_filter_0_7, y_filter_4_11,
167 offset, permute_tbl, max);
168 uint16x8_t d3 = convolve12_8_x(s9, s10, s11, y_filter_0_7,
169 y_filter_4_11, offset, permute_tbl, max);
170
171 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
172
173 s += 8;
174 d += 8;
175 w -= 8;
176 } while (w != 0);
177 src += 4 * src_stride;
178 dst += 4 * dst_stride;
179 height -= 4;
180 } while (height != 0);
181 }
182 }
183
static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
185 int64x2_t offset, uint16x8_t max) {
186 int64x2_t sum[8];
187 sum[0] = aom_sdotq_s16(offset, s0[0], filter);
188 sum[1] = aom_sdotq_s16(offset, s0[1], filter);
189 sum[2] = aom_sdotq_s16(offset, s0[2], filter);
190 sum[3] = aom_sdotq_s16(offset, s0[3], filter);
191 sum[4] = aom_sdotq_s16(offset, s0[4], filter);
192 sum[5] = aom_sdotq_s16(offset, s0[5], filter);
193 sum[6] = aom_sdotq_s16(offset, s0[6], filter);
194 sum[7] = aom_sdotq_s16(offset, s0[7], filter);
195
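  // Each accumulator holds taps 0-3 in its low lane and taps 4-7 in its high
  // lane; the pairwise adds fold them into one complete 8-tap sum per 64-bit
  // lane, packing two output pixels per vector.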
196 sum[0] = vpaddq_s64(sum[0], sum[1]);
197 sum[2] = vpaddq_s64(sum[2], sum[3]);
198 sum[4] = vpaddq_s64(sum[4], sum[5]);
199 sum[6] = vpaddq_s64(sum[6], sum[7]);
200
201 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
202 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));
203
204 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
205 vqrshrun_n_s32(sum4567, FILTER_BITS));
206
207 return vminq_u16(res, max);
208 }
209
static INLINE void highbd_convolve_x_sr_8tap_sve2(
211 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
212 int width, int height, const int16_t *y_filter_ptr,
213 ConvolveParams *conv_params, int bd) {
214 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows us to do only one rounding shift instead of two.
216 const int64_t offset = 1 << (conv_params->round_0 - 1);
217 const int64x2_t offset_lo = vcombine_s64((int64x1_t)(offset), vdup_n_s64(0));
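  // Keep the offset in the low lane only: the pairwise adds in convolve8_8_x
  // sum both lanes of each accumulator, so every output picks up the rounding
  // offset exactly once.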
218
219 const int16x8_t filter = vld1q_s16(y_filter_ptr);
220
221 do {
222 const int16_t *s = (const int16_t *)src;
223 uint16_t *d = dst;
224 int w = width;
225
226 do {
227 int16x8_t s0[8], s1[8], s2[8], s3[8];
228 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
229 &s0[4], &s0[5], &s0[6], &s0[7]);
230 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
231 &s1[4], &s1[5], &s1[6], &s1[7]);
232 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
233 &s2[4], &s2[5], &s2[6], &s2[7]);
234 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
235 &s3[4], &s3[5], &s3[6], &s3[7]);
236
237 uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max);
238 uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max);
239 uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max);
240 uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max);
241
242 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
243
244 s += 8;
245 d += 8;
246 w -= 8;
247 } while (w != 0);
248 src += 4 * src_stride;
249 dst += 4 * dst_stride;
250 height -= 4;
251 } while (height != 0);
252 }
253
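// convolve4_8_x (and convolve4_8_2d_h) produce their eight outputs in the
// order 0, 4, 1, 5, 2, 6, 3, 7; this table gathers them back into sequential
// order.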
254 // clang-format off
255 DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = {
256 0, 2, 4, 6, 1, 3, 5, 7,
257 };
258 // clang-format on
259
static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter,
261 int64x2_t offset,
262 uint16x8x2_t permute_tbl,
263 uint16x4_t max) {
264 int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
265 int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);
266
267 int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
268 int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);
269
270 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
271 uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
272
273 return vmin_u16(res, max);
274 }
275
static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
277 int64x2_t offset, uint16x8_t tbl,
278 uint16x8_t max) {
279 int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
280 int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
281 int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
282 int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);
283
284 int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
285 int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
286
287 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS),
288 vqrshrun_n_s32(sum2637, FILTER_BITS));
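  // res currently holds outputs 0,4,1,5,2,6,3,7 - restore sequential order.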
289 res = aom_tbl_u16(res, tbl);
290
291 return vminq_u16(res, max);
292 }
293
static INLINE void highbd_convolve_x_sr_4tap_sve2(
295 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
296 int width, int height, const int16_t *x_filter_ptr,
297 ConvolveParams *conv_params, int bd) {
  // This shim allows us to do only one rounding shift instead of two.
299 const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));
300
301 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
302 const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));
303
304 if (width == 4) {
305 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
306 uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);
307
308 const int16_t *s = (const int16_t *)(src);
309
310 do {
311 int16x8_t s0, s1, s2, s3;
312 load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);
313
314 uint16x4_t d0 = convolve4_4_x(s0, filter, offset, permute_tbl, max);
315 uint16x4_t d1 = convolve4_4_x(s1, filter, offset, permute_tbl, max);
316 uint16x4_t d2 = convolve4_4_x(s2, filter, offset, permute_tbl, max);
317 uint16x4_t d3 = convolve4_4_x(s3, filter, offset, permute_tbl, max);
318
319 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
320
321 s += 4 * src_stride;
322 dst += 4 * dst_stride;
323 height -= 4;
324 } while (height != 0);
325 } else {
326 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
327 uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);
328
329 do {
330 const int16_t *s = (const int16_t *)(src);
331 uint16_t *d = dst;
332 int w = width;
333
334 do {
335 int16x8_t s0[4], s1[4], s2[4], s3[4];
336 load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
337 load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
338 load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
339 load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
340
341 uint16x8_t d0 = convolve4_8_x(s0, filter, offset, idx, max);
342 uint16x8_t d1 = convolve4_8_x(s1, filter, offset, idx, max);
343 uint16x8_t d2 = convolve4_8_x(s2, filter, offset, idx, max);
344 uint16x8_t d3 = convolve4_8_x(s3, filter, offset, idx, max);
345
346 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
347
348 s += 8;
349 d += 8;
350 w -= 8;
351 } while (w != 0);
352 src += 4 * src_stride;
353 dst += 4 * dst_stride;
354 height -= 4;
355 } while (height != 0);
356 }
357 }
358
void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride,
360 uint16_t *dst, int dst_stride, int w, int h,
361 const InterpFilterParams *filter_params_x,
362 const int subpel_x_qn,
363 ConvolveParams *conv_params, int bd) {
364 if (w == 2 || h == 2) {
365 av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
366 filter_params_x, subpel_x_qn, conv_params, bd);
367 return;
368 }
369
370 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
371
372 if (x_filter_taps == 6) {
373 av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
374 filter_params_x, subpel_x_qn, conv_params,
375 bd);
376 return;
377 }
378
379 const int horiz_offset = filter_params_x->taps / 2 - 1;
380 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
381 filter_params_x, subpel_x_qn & SUBPEL_MASK);
382
383 src -= horiz_offset;
384
385 if (x_filter_taps == 12) {
386 highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
387 x_filter_ptr, conv_params, bd);
388 return;
389 }
390
391 if (x_filter_taps == 8) {
392 highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
393 x_filter_ptr, conv_params, bd);
394 return;
395 }
396
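  // The 4-tap kernel occupies entries 2..5 of the 8-tap coefficient array, so
  // advance the source by two samples to line the nonzero taps up with the
  // right inputs.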
397 highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w, h,
398 x_filter_ptr, conv_params, bd);
399 }
400
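// Used with TBL2 on (oldest transposed block, newest transposed block) to
// advance the sliding window of source rows by one, two or three rows, so
// only one of every four transposed blocks needs to be rebuilt per iteration.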
401 // clang-format off
402 DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
403 // Shift left and insert new last column in transposed 4x4 block.
404 1, 2, 3, 0, 5, 6, 7, 4,
405 // Shift left and insert two new columns in transposed 4x4 block.
406 2, 3, 0, 1, 6, 7, 4, 5,
407 // Shift left and insert three new columns in transposed 4x4 block.
408 3, 0, 1, 2, 7, 4, 5, 6,
409 };
410 // clang-format on
411
static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1,
413 int16x4_t s2, int16x4_t s3,
414 int16x8_t res[2]) {
415 // Transpose 16-bit elements and concatenate result rows as follows:
416 // s0: 00, 01, 02, 03
417 // s1: 10, 11, 12, 13
418 // s2: 20, 21, 22, 23
419 // s3: 30, 31, 32, 33
420 //
421 // res[0]: 00 10 20 30 01 11 21 31
422 // res[1]: 02 12 22 32 03 13 23 33
423
424 int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
425 int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
426 int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
427 int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
428
429 int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
430 int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
431
432 int32x4x2_t s0123 = vzipq_s32(s01, s23);
433
434 res[0] = vreinterpretq_s16_s32(s0123.val[0]);
435 res[1] = vreinterpretq_s16_s32(s0123.val[1]);
436 }
437
static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1,
439 int16x8_t s2, int16x8_t s3,
440 int16x8_t res[4]) {
441 // Transpose 16-bit elements and concatenate result rows as follows:
442 // s0: 00, 01, 02, 03, 04, 05, 06, 07
443 // s1: 10, 11, 12, 13, 14, 15, 16, 17
444 // s2: 20, 21, 22, 23, 24, 25, 26, 27
445 // s3: 30, 31, 32, 33, 34, 35, 36, 37
446 //
447 // res[0]: 00 10 20 30 01 11 21 31
448 // res[1]: 02 12 22 32 03 13 23 33
449 // res[2]: 04 14 24 34 05 15 25 35
450 // res[3]: 06 16 26 36 07 17 27 37
451
452 int16x8x2_t tr01_16 = vzipq_s16(s0, s1);
453 int16x8x2_t tr23_16 = vzipq_s16(s2, s3);
454
455 int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]),
456 vreinterpretq_s32_s16(tr23_16.val[0]));
457 int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]),
458 vreinterpretq_s32_s16(tr23_16.val[1]));
459
460 res[0] = vreinterpretq_s16_s32(tr01_32.val[0]);
461 res[1] = vreinterpretq_s16_s32(tr01_32.val[1]);
462 res[2] = vreinterpretq_s16_s32(tr23_32.val[0]);
463 res[3] = vreinterpretq_s16_s32(tr23_32.val[1]);
464 }
465
static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
467 uint16x8_t tbl, int16x8_t res[4]) {
468 res[0] = aom_tbl2_s16(t0[0], t1[0], tbl);
469 res[1] = aom_tbl2_s16(t0[1], t1[1], tbl);
470 res[2] = aom_tbl2_s16(t0[2], t1[2], tbl);
471 res[3] = aom_tbl2_s16(t0[3], t1[3], tbl);
472 }
473
static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2],
475 uint16x8_t tbl, int16x8_t res[2]) {
476 res[0] = aom_tbl2_s16(t0[0], t1[0], tbl);
477 res[1] = aom_tbl2_s16(t0[1], t1[1], tbl);
478 }
479
static INLINE uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2],
481 int16x8_t s2[2],
482 int16x8_t filter_0_7,
483 int16x8_t filter_4_11,
484 uint16x4_t max) {
485 int64x2_t sum[2];
486
487 sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
488 sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1);
489 sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1);
490
491 sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
492 sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1);
493 sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1);
494
495 int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
496
497 uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS);
498
499 return vmin_u16(res, max);
500 }
501
static INLINE void highbd_convolve_y_sr_12tap_sve2(
503 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
504 int width, int height, const int16_t *y_filter_ptr, int bd) {
505 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
506 const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
507
508 uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
509 // Scale indices by size of the true vector length to avoid reading from an
510 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
511 uint16x8_t correction0 =
512 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
513 merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
514
515 uint16x8_t correction1 =
516 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
517 merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
518
519 uint16x8_t correction2 =
520 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
521 merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
522
523 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
524
525 do {
526 int16_t *s = (int16_t *)src;
527 uint16_t *d = dst;
528 int h = height;
529
530 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
531 load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
532 &s9, &sA);
533 s += 11 * src_stride;
534
535 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
536 s6789[2], s789A[2];
537 transpose_concat_4x4(s0, s1, s2, s3, s0123);
538 transpose_concat_4x4(s1, s2, s3, s4, s1234);
539 transpose_concat_4x4(s2, s3, s4, s5, s2345);
540 transpose_concat_4x4(s3, s4, s5, s6, s3456);
541 transpose_concat_4x4(s4, s5, s6, s7, s4567);
542 transpose_concat_4x4(s5, s6, s7, s8, s5678);
543 transpose_concat_4x4(s6, s7, s8, s9, s6789);
544 transpose_concat_4x4(s7, s8, s9, sA, s789A);
545
546 do {
547 int16x4_t sB, sC, sD, sE;
548 load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
549
550 int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
551 transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
552
553 // Use the above transpose and reuse data from the previous loop to get
554 // the rest.
555 aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
556 aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
557 aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
558
559 uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7,
560 y_filter_4_11, max);
561 uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7,
562 y_filter_4_11, max);
563 uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7,
564 y_filter_4_11, max);
565 uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7,
566 y_filter_4_11, max);
567
568 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
569
570 // Prepare block for next iteration - re-using as much as possible.
571 // Shuffle everything up four rows.
572 s0123[0] = s4567[0];
573 s0123[1] = s4567[1];
574 s1234[0] = s5678[0];
575 s1234[1] = s5678[1];
576 s2345[0] = s6789[0];
577 s2345[1] = s6789[1];
578 s3456[0] = s789A[0];
579 s3456[1] = s789A[1];
580 s4567[0] = s89AB[0];
581 s4567[1] = s89AB[1];
582 s5678[0] = s9ABC[0];
583 s5678[1] = s9ABC[1];
584 s6789[0] = sABCD[0];
585 s6789[1] = sABCD[1];
586 s789A[0] = sBCDE[0];
587 s789A[1] = sBCDE[1];
588
589 s += 4 * src_stride;
590 d += 4 * dst_stride;
591 h -= 4;
592 } while (h != 0);
593 src += 4;
594 dst += 4;
595 width -= 4;
596 } while (width != 0);
597 }
598
static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
600 int16x8_t samples_hi[2],
601 int16x8_t filter,
602 uint16x4_t max) {
603 int64x2_t sum01 =
604 aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
605 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
606
607 int64x2_t sum23 =
608 aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
609 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
610
611 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
612 uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
613 return vmin_u16(res, max);
614 }
615
static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
617 int16x8_t samples_hi[4],
618 int16x8_t filter,
619 uint16x8_t max) {
620 int64x2_t sum01 =
621 aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
622 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
623
624 int64x2_t sum23 =
625 aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
626 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
627
628 int64x2_t sum45 =
629 aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
630 sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);
631
632 int64x2_t sum67 =
633 aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
634 sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);
635
636 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
637 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
638 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
639 vqrshrun_n_s32(sum4567, FILTER_BITS));
640 return vminq_u16(res, max);
641 }
642
void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
644 uint16_t *dst, ptrdiff_t dst_stride,
645 int width, int height,
646 const int16_t *filter_y, int bd) {
  assert(width >= 4 && height >= 4);
648
649 const int16x8_t y_filter = vld1q_s16(filter_y);
650
651 uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
652 // Scale indices by size of the true vector length to avoid reading from an
653 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
654 uint16x8_t correction0 =
655 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
656 merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
657
658 uint16x8_t correction1 =
659 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
660 merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
661
662 uint16x8_t correction2 =
663 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
664 merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
665
666 if (width == 4) {
667 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
668 int16_t *s = (int16_t *)src;
669
670 int16x4_t s0, s1, s2, s3, s4, s5, s6;
671 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
672 s += 7 * src_stride;
673
674 // This operation combines a conventional transpose and the sample permute
675 // required before computing the dot product.
676 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
677 transpose_concat_4x4(s0, s1, s2, s3, s0123);
678 transpose_concat_4x4(s1, s2, s3, s4, s1234);
679 transpose_concat_4x4(s2, s3, s4, s5, s2345);
680 transpose_concat_4x4(s3, s4, s5, s6, s3456);
681
682 do {
683 int16x4_t s7, s8, s9, s10;
684 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
685
686 int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
687 // Transpose and shuffle the 4 lines that were loaded.
688 transpose_concat_4x4(s7, s8, s9, s10, s789A);
689
690 // Merge new data into block from previous iteration.
691 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
692 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
693 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
694
695 uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max);
696 uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max);
697 uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max);
698 uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max);
699
700 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
701
702 // Prepare block for next iteration - re-using as much as possible.
703 // Shuffle everything up four rows.
704 s0123[0] = s4567[0];
705 s0123[1] = s4567[1];
706 s1234[0] = s5678[0];
707 s1234[1] = s5678[1];
708 s2345[0] = s6789[0];
709 s2345[1] = s6789[1];
710 s3456[0] = s789A[0];
711 s3456[1] = s789A[1];
712 s += 4 * src_stride;
713 dst += 4 * dst_stride;
714 height -= 4;
715 } while (height != 0);
716 } else {
717 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
718
719 do {
720 int h = height;
721 int16_t *s = (int16_t *)src;
722 uint16_t *d = dst;
723
724 int16x8_t s0, s1, s2, s3, s4, s5, s6;
725 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
726 s += 7 * src_stride;
727
728 // This operation combines a conventional transpose and the sample permute
729 // required before computing the dot product.
730 int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
731 transpose_concat_8x4(s0, s1, s2, s3, s0123);
732 transpose_concat_8x4(s1, s2, s3, s4, s1234);
733 transpose_concat_8x4(s2, s3, s4, s5, s2345);
734 transpose_concat_8x4(s3, s4, s5, s6, s3456);
735
736 do {
737 int16x8_t s7, s8, s9, s10;
738 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
739
740 int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
741 // Transpose and shuffle the 4 lines that were loaded.
742 transpose_concat_8x4(s7, s8, s9, s10, s789A);
743
744 // Merge new data into block from previous iteration.
745 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
746 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
747 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
748
749 uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max);
750 uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max);
751 uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max);
752 uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max);
753
754 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
755
756 // Prepare block for next iteration - re-using as much as possible.
757 // Shuffle everything up four rows.
758 s0123[0] = s4567[0];
759 s0123[1] = s4567[1];
760 s0123[2] = s4567[2];
761 s0123[3] = s4567[3];
762 s1234[0] = s5678[0];
763 s1234[1] = s5678[1];
764 s1234[2] = s5678[2];
765 s1234[3] = s5678[3];
766 s2345[0] = s6789[0];
767 s2345[1] = s6789[1];
768 s2345[2] = s6789[2];
769 s2345[3] = s6789[3];
770 s3456[0] = s789A[0];
771 s3456[1] = s789A[1];
772 s3456[2] = s789A[2];
773 s3456[3] = s789A[3];
774
775 s += 4 * src_stride;
776 d += 4 * dst_stride;
777 h -= 4;
778 } while (h != 0);
779 src += 8;
780 dst += 8;
781 width -= 8;
782 } while (width != 0);
783 }
784 }
785
static INLINE uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2],
787 int16x8_t filter,
788 uint16x4_t max) {
789 int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0);
790 int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0);
791
792 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
793 uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
794 return vmin_u16(res, max);
795 }
796
static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
798 int16x8_t filter,
799 uint16x8_t max) {
800 int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0);
801 int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0);
802 int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[2], filter, 0);
803 int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[3], filter, 0);
804
805 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
806 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
807 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
808 vqrshrun_n_s32(sum4567, FILTER_BITS));
809 return vminq_u16(res, max);
810 }
811
void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride,
813 uint16_t *dst, ptrdiff_t dst_stride,
814 int width, int height,
815 const int16_t *filter_y, int bd) {
  assert(width >= 4 && height >= 4);
817
818 const int16x8_t y_filter =
819 vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
820
821 if (width == 4) {
822 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
823 int16_t *s = (int16_t *)src;
824
825 int16x4_t s0, s1, s2;
826 load_s16_4x3(s, src_stride, &s0, &s1, &s2);
827 s += 3 * src_stride;
828
829 do {
830 int16x4_t s3, s4, s5, s6;
831 load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
832
833 // This operation combines a conventional transpose and the sample permute
834 // required before computing the dot product.
835 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
836 transpose_concat_4x4(s0, s1, s2, s3, s0123);
837 transpose_concat_4x4(s1, s2, s3, s4, s1234);
838 transpose_concat_4x4(s2, s3, s4, s5, s2345);
839 transpose_concat_4x4(s3, s4, s5, s6, s3456);
840
841 uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max);
842 uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max);
843 uint16x4_t d2 = highbd_convolve4_4_y(s2345, y_filter, max);
844 uint16x4_t d3 = highbd_convolve4_4_y(s3456, y_filter, max);
845
846 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
847
848 // Shuffle everything up four rows.
849 s0 = s4;
850 s1 = s5;
851 s2 = s6;
852
853 s += 4 * src_stride;
854 dst += 4 * dst_stride;
855 height -= 4;
856 } while (height != 0);
857 } else {
858 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
859
860 do {
861 int h = height;
862 int16_t *s = (int16_t *)src;
863 uint16_t *d = dst;
864
865 int16x8_t s0, s1, s2;
866 load_s16_8x3(s, src_stride, &s0, &s1, &s2);
867 s += 3 * src_stride;
868
869 do {
870 int16x8_t s3, s4, s5, s6;
871 load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
872
873 // This operation combines a conventional transpose and the sample
874 // permute required before computing the dot product.
875 int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
876 transpose_concat_8x4(s0, s1, s2, s3, s0123);
877 transpose_concat_8x4(s1, s2, s3, s4, s1234);
878 transpose_concat_8x4(s2, s3, s4, s5, s2345);
879 transpose_concat_8x4(s3, s4, s5, s6, s3456);
880
881 uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max);
882 uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max);
883 uint16x8_t d2 = highbd_convolve4_8_y(s2345, y_filter, max);
884 uint16x8_t d3 = highbd_convolve4_8_y(s3456, y_filter, max);
885
886 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
887
888 // Shuffle everything up four rows.
889 s0 = s4;
890 s1 = s5;
891 s2 = s6;
892
893 s += 4 * src_stride;
894 d += 4 * dst_stride;
895 h -= 4;
896 } while (h != 0);
897 src += 8;
898 dst += 8;
899 width -= 8;
900 } while (width != 0);
901 }
902 }
903
void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride,
905 uint16_t *dst, int dst_stride, int w, int h,
906 const InterpFilterParams *filter_params_y,
907 const int subpel_y_qn, int bd) {
908 if (w == 2 || h == 2) {
909 av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
910 filter_params_y, subpel_y_qn, bd);
911 return;
912 }
913 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
914
915 if (y_filter_taps == 6) {
916 av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
917 filter_params_y, subpel_y_qn, bd);
918 return;
919 }
920
921 const int vert_offset = filter_params_y->taps / 2 - 1;
922 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
923 filter_params_y, subpel_y_qn & SUBPEL_MASK);
924
925 src -= vert_offset * src_stride;
926
927 if (y_filter_taps > 8) {
928 highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
929 y_filter_ptr, bd);
930 return;
931 }
932
933 if (y_filter_taps == 4) {
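    // The 4-tap kernel occupies entries 2..5 of the 8-tap coefficient array,
    // so start two rows further down and skip the rows that the zero taps
    // would have multiplied.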
934 highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst,
935 dst_stride, w, h, y_filter_ptr, bd);
936 return;
937 }
938
939 highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
940 y_filter_ptr, bd);
941 }
942
static INLINE uint16x4_t convolve12_4_2d_h(
944 int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
945 const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) {
946 int16x8_t permuted_samples[6];
947 permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
948 permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
949 permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
950 permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
951 permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
952 permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
953
954 int64x2_t sum01 =
955 aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
956 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
957 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
958
959 int64x2_t sum23 =
960 aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
961 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
962 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
963
964 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
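  // shift is negative, so vqrshlq_s32 performs a rounding shift right by
  // round_0.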
965 sum0123 = vqrshlq_s32(sum0123, shift);
966 return vqmovun_s32(sum0123);
967 }
968
static INLINE uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1,
970 int16x8_t s2, int16x8_t filter_0_7,
971 int16x8_t filter_4_11,
972 int64x2_t offset, int32x4_t shift,
973 uint16x8x4_t permute_tbl) {
974 int16x8_t permuted_samples[8];
975 permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
976 permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
977 permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
978 permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
979 permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
980 permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
981 permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
982 permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);
983
984 int64x2_t sum01 =
985 aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
986 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
987 sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
988
989 int64x2_t sum23 =
990 aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
991 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
992 sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
993
994 int64x2_t sum45 =
995 aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
996 sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
997 sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);
998
999 int64x2_t sum67 =
1000 aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
1001 sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
1002 sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);
1003
1004 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1005 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
1006
1007 sum0123 = vqrshlq_s32(sum0123, shift);
1008 sum4567 = vqrshlq_s32(sum4567, shift);
1009
1010 return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
1011 }
1012
static INLINE void highbd_convolve_2d_sr_horiz_12tap_sve2(
1014 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
1015 int width, int height, const int16_t *y_filter_ptr,
1016 ConvolveParams *conv_params, const int x_offset) {
1017 const int64x2_t offset = vdupq_n_s64(x_offset);
1018 const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
1019
1020 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
1021 const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
1022
1023 uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);
1024 // Scale indices by size of the true vector length to avoid reading from an
1025 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
1026 uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64(
1027 vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
1028 permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0);
1029
1030 uint16x8_t correction1 = vreinterpretq_u16_u64(
1031 vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
1032 vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
1033 permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1);
1034
1035 if (width == 4) {
1036 const int16_t *s = (const int16_t *)src;
1037
1038 do {
1039 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
1040 load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
1041 load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7);
1042
1043 uint16x4_t d0 = convolve12_4_2d_h(s0, s1, y_filter_0_7, y_filter_4_11,
1044 offset, shift, permute_tbl);
1045 uint16x4_t d1 = convolve12_4_2d_h(s2, s3, y_filter_0_7, y_filter_4_11,
1046 offset, shift, permute_tbl);
1047 uint16x4_t d2 = convolve12_4_2d_h(s4, s5, y_filter_0_7, y_filter_4_11,
1048 offset, shift, permute_tbl);
1049 uint16x4_t d3 = convolve12_4_2d_h(s6, s7, y_filter_0_7, y_filter_4_11,
1050 offset, shift, permute_tbl);
1051
1052 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
1053
1054 dst += 4 * dst_stride;
1055 s += 4 * src_stride;
1056 height -= 4;
1057 } while (height > 0);
1058 } else {
1059 do {
1060 const int16_t *s = (const int16_t *)src;
1061 uint16_t *d = dst;
1062 int w = width;
1063
1064 do {
1065 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
1066 load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
1067 load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10);
1068 load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11);
1069
1070 uint16x8_t d0 =
1071 convolve12_8_2d_h(s0, s1, s2, y_filter_0_7, y_filter_4_11, offset,
1072 shift, permute_tbl);
1073 uint16x8_t d1 =
1074 convolve12_8_2d_h(s3, s4, s5, y_filter_0_7, y_filter_4_11, offset,
1075 shift, permute_tbl);
1076 uint16x8_t d2 =
1077 convolve12_8_2d_h(s6, s7, s8, y_filter_0_7, y_filter_4_11, offset,
1078 shift, permute_tbl);
1079 uint16x8_t d3 =
1080 convolve12_8_2d_h(s9, s10, s11, y_filter_0_7, y_filter_4_11, offset,
1081 shift, permute_tbl);
1082
1083 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1084
1085 s += 8;
1086 d += 8;
1087 w -= 8;
1088 } while (w != 0);
1089 src += 4 * src_stride;
1090 dst += 4 * dst_stride;
1091 height -= 4;
1092 } while (height > 0);
1093 }
1094 }
1095
static INLINE uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter,
1097 int64x2_t offset, int32x4_t shift) {
1098 int64x2_t sum[8];
1099 sum[0] = aom_sdotq_s16(offset, s0[0], filter);
1100 sum[1] = aom_sdotq_s16(offset, s0[1], filter);
1101 sum[2] = aom_sdotq_s16(offset, s0[2], filter);
1102 sum[3] = aom_sdotq_s16(offset, s0[3], filter);
1103 sum[4] = aom_sdotq_s16(offset, s0[4], filter);
1104 sum[5] = aom_sdotq_s16(offset, s0[5], filter);
1105 sum[6] = aom_sdotq_s16(offset, s0[6], filter);
1106 sum[7] = aom_sdotq_s16(offset, s0[7], filter);
1107
1108 sum[0] = vpaddq_s64(sum[0], sum[1]);
1109 sum[2] = vpaddq_s64(sum[2], sum[3]);
1110 sum[4] = vpaddq_s64(sum[4], sum[5]);
1111 sum[6] = vpaddq_s64(sum[6], sum[7]);
1112
1113 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
1114 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));
1115
1116 sum0123 = vqrshlq_s32(sum0123, shift);
1117 sum4567 = vqrshlq_s32(sum4567, shift);
1118
1119 return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
1120 }
1121
static INLINE void highbd_convolve_2d_sr_horiz_8tap_sve2(
1123 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
1124 int width, int height, const int16_t *y_filter_ptr,
1125 ConvolveParams *conv_params, const int x_offset) {
1126 const int64x2_t offset = vdupq_n_s64(x_offset);
1127 const int64x2_t offset_lo = vcombine_s64(vget_low_s64(offset), vdup_n_s64(0));
1128 const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
1129
1130 const int16x8_t filter = vld1q_s16(y_filter_ptr);
1131
1132 do {
1133 const int16_t *s = (const int16_t *)src;
1134 uint16_t *d = dst;
1135 int w = width;
1136
1137 do {
1138 int16x8_t s0[8], s1[8], s2[8], s3[8];
1139 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1140 &s0[4], &s0[5], &s0[6], &s0[7]);
1141 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
1142 &s1[4], &s1[5], &s1[6], &s1[7]);
1143 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
1144 &s2[4], &s2[5], &s2[6], &s2[7]);
1145 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
1146 &s3[4], &s3[5], &s3[6], &s3[7]);
1147
1148 uint16x8_t d0 = convolve8_8_2d_h(s0, filter, offset_lo, shift);
1149 uint16x8_t d1 = convolve8_8_2d_h(s1, filter, offset_lo, shift);
1150 uint16x8_t d2 = convolve8_8_2d_h(s2, filter, offset_lo, shift);
1151 uint16x8_t d3 = convolve8_8_2d_h(s3, filter, offset_lo, shift);
1152
1153 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1154
1155 s += 8;
1156 d += 8;
1157 w -= 8;
1158 } while (w != 0);
1159 src += 4 * src_stride;
1160 dst += 4 * dst_stride;
1161 height -= 4;
1162 } while (height > 0);
1163 }
1164
static INLINE uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter,
1166 int64x2_t offset, int32x4_t shift,
1167 uint16x8x2_t permute_tbl) {
1168 int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
1169 int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);
1170
1171 int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
1172 int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);
1173
1174 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1175 sum0123 = vqrshlq_s32(sum0123, shift);
1176 return vqmovun_s32(sum0123);
1177 }
1178
static INLINE uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter,
1180 int64x2_t offset, int32x4_t shift,
1181 uint16x8_t tbl) {
1182 int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
1183 int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
1184 int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
1185 int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);
1186
1187 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
1188 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
1189
1190 sum0123 = vqrshlq_s32(sum0123, shift);
1191 sum4567 = vqrshlq_s32(sum4567, shift);
1192
1193 uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
1194 return aom_tbl_u16(res, tbl);
1195 }
1196
static INLINE void highbd_convolve_2d_sr_horiz_4tap_sve2(
1198 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
1199 int width, int height, const int16_t *x_filter_ptr,
1200 ConvolveParams *conv_params, const int x_offset) {
1201 const int64x2_t offset = vdupq_n_s64(x_offset);
1202 const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
1203
1204 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
1205 const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));
1206
1207 if (width == 4) {
1208 const int16_t *s = (const int16_t *)(src);
1209
1210 uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);
1211
1212 do {
1213 int16x8_t s0, s1, s2, s3;
1214 load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);
1215
1216 uint16x4_t d0 = convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl);
1217 uint16x4_t d1 = convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl);
1218 uint16x4_t d2 = convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl);
1219 uint16x4_t d3 = convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl);
1220
1221 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
1222
1223 s += 4 * src_stride;
1224 dst += 4 * dst_stride;
1225 height -= 4;
1226 } while (height > 0);
1227 } else {
1228 uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);
1229
1230 do {
1231 const int16_t *s = (const int16_t *)(src);
1232 uint16_t *d = dst;
1233 int w = width;
1234
1235 do {
1236 int16x8_t s0[8], s1[8], s2[8], s3[8];
1237 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1238 &s0[4], &s0[5], &s0[6], &s0[7]);
1239 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
1240 &s1[4], &s1[5], &s1[6], &s1[7]);
1241 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
1242 &s2[4], &s2[5], &s2[6], &s2[7]);
1243 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
1244 &s3[4], &s3[5], &s3[6], &s3[7]);
1245
1246 uint16x8_t d0 = convolve4_8_2d_h(s0, filter, offset, shift, idx);
1247 uint16x8_t d1 = convolve4_8_2d_h(s1, filter, offset, shift, idx);
1248 uint16x8_t d2 = convolve4_8_2d_h(s2, filter, offset, shift, idx);
1249 uint16x8_t d3 = convolve4_8_2d_h(s3, filter, offset, shift, idx);
1250
1251 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1252
1253 s += 8;
1254 d += 8;
1255 w -= 8;
1256 } while (w != 0);
1257 src += 4 * src_stride;
1258 dst += 4 * dst_stride;
1259 height -= 4;
1260 } while (height > 0);
1261 }
1262 }
1263
static INLINE uint16x4_t highbd_convolve12_4_2d_v(
1265 int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7,
1266 int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) {
1267 int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0);
1268 sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
1269 sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);
1270
1271 int64x2_t sum23 = aom_svdot_lane_s16(offset, s0[1], filter_0_7, 0);
1272 sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
1273 sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);
1274
1275 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1276 sum0123 = vshlq_s32(sum0123, shift);
1277
1278 uint16x4_t res = vqmovun_s32(sum0123);
1279
1280 return vmin_u16(res, max);
1281 }
1282
static INLINE void highbd_convolve_2d_sr_vert_12tap_sve2(
1284 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
1285 int width, int height, const int16_t *y_filter_ptr,
1286 ConvolveParams *conv_params, int bd, const int y_offset) {
1287 const int64x2_t offset = vdupq_n_s64(y_offset);
1288 const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
1289
1290 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
1291 const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
1292
1293 uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
1294 // Scale indices by size of the true vector length to avoid reading from an
1295 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
1296 uint16x8_t correction0 =
1297 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
1298 merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
1299
1300 uint16x8_t correction1 =
1301 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
1302 merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
1303
1304 uint16x8_t correction2 =
1305 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
1306 merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
1307
1308 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
1309
1310 do {
1311 int16_t *s = (int16_t *)src;
1312 uint16_t *d = (uint16_t *)dst;
1313 int h = height;
1314
1315 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
1316 load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
1317 &s9, &sA);
1318 s += 11 * src_stride;
1319
1320 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
1321 s6789[2], s789A[2];
1322 // This operation combines a conventional transpose and the sample permute
1323 // required before computing the dot product.
1324 transpose_concat_4x4(s0, s1, s2, s3, s0123);
1325 transpose_concat_4x4(s1, s2, s3, s4, s1234);
1326 transpose_concat_4x4(s2, s3, s4, s5, s2345);
1327 transpose_concat_4x4(s3, s4, s5, s6, s3456);
1328 transpose_concat_4x4(s4, s5, s6, s7, s4567);
1329 transpose_concat_4x4(s5, s6, s7, s8, s5678);
1330 transpose_concat_4x4(s6, s7, s8, s9, s6789);
1331 transpose_concat_4x4(s7, s8, s9, sA, s789A);
1332
1333 do {
1334 int16x4_t sB, sC, sD, sE;
1335 load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
1336
1337 int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
1338 transpose_concat_4x4(sB, sC, sD, sE, sBCDE);
1339
1340 // Use the above transpose and reuse data from the previous loop to get
1341 // the rest.
1342 aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
1343 aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
1344 aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
1345
1346 uint16x4_t d0 = highbd_convolve12_4_2d_v(
1347 s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, shift, offset, max);
1348 uint16x4_t d1 = highbd_convolve12_4_2d_v(
1349 s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, shift, offset, max);
1350 uint16x4_t d2 = highbd_convolve12_4_2d_v(
1351 s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, shift, offset, max);
1352 uint16x4_t d3 = highbd_convolve12_4_2d_v(
1353 s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, shift, offset, max);
1354
1355 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1356
1357 // Prepare block for next iteration - re-using as much as possible.
1358 // Shuffle everything up four rows.
1359 s0123[0] = s4567[0];
1360 s0123[1] = s4567[1];
1361 s1234[0] = s5678[0];
1362 s1234[1] = s5678[1];
1363 s2345[0] = s6789[0];
1364 s2345[1] = s6789[1];
1365 s3456[0] = s789A[0];
1366 s3456[1] = s789A[1];
1367 s4567[0] = s89AB[0];
1368 s4567[1] = s89AB[1];
1369 s5678[0] = s9ABC[0];
1370 s5678[1] = s9ABC[1];
1371 s6789[0] = sABCD[0];
1372 s6789[1] = sABCD[1];
1373 s789A[0] = sBCDE[0];
1374 s789A[1] = sBCDE[1];
1375
1376 s += 4 * src_stride;
1377 d += 4 * dst_stride;
1378 h -= 4;
1379 } while (h != 0);
1380 src += 4;
1381 dst += 4;
1382 width -= 4;
1383 } while (width != 0);
1384 }
1385
1386 static INLINE uint16x4_t highbd_convolve8_4_2d_v(
1387 int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter,
1388 int32x4_t shift, int64x2_t offset, uint16x4_t max) {
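// Each vector in samples_lo[]/samples_hi[] covers two output columns: every
// 64-bit lane holds four vertically adjacent samples for one column, with
// samples_lo providing rows 0-3 and samples_hi rows 4-7. Filter lane 0
// supplies taps 0-3 and lane 1 taps 4-7 of the 8-tap filter.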
1389 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
1390 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
1391
1392 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
1393 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
1394
1395 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1396 sum0123 = vshlq_s32(sum0123, shift);
1397
1398 uint16x4_t res = vqmovun_s32(sum0123);
1399 return vmin_u16(res, max);
1400 }
1401
1402 static INLINE uint16x8_t highbd_convolve8_8_2d_v(
1403 int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter,
1404 int32x4_t shift, int64x2_t offset, uint16x8_t max) {
1405 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
1406 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
1407
1408 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
1409 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
1410
1411 int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0);
1412 sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);
1413
1414 int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0);
1415 sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);
1416
1417 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1418 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
1419
1420 sum0123 = vshlq_s32(sum0123, shift);
1421 sum4567 = vshlq_s32(sum4567, shift);
1422
1423 uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
1424 return vminq_u16(res, max);
1425 }
1426
1427 void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src,
1428 ptrdiff_t src_stride, uint16_t *dst,
1429 ptrdiff_t dst_stride, int width,
1430 int height, const int16_t *filter_y,
1431 ConvolveParams *conv_params, int bd,
1432 const int y_offset) {
1433 assert(width >= 4 && height >= 4);
1434 const int64x2_t offset = vdupq_n_s64(y_offset);
1435 const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
1436 const int16x8_t y_filter = vld1q_s16(filter_y);
1437
1438 uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
1439 // Scale indices by the true vector length to avoid reading from an
1440 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
1441 uint16x8_t correction0 =
1442 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
1443 merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
1444
1445 uint16x8_t correction1 =
1446 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
1447 merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
1448
1449 uint16x8_t correction2 =
1450 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
1451 merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
1452
1453 if (width == 4) {
1454 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
1455 int16_t *s = (int16_t *)src;
1456
1457 int16x4_t s0, s1, s2, s3, s4, s5, s6;
1458 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
1459 s += 7 * src_stride;
1460
1461 // This operation combines a conventional transpose and the sample permute
1462 // required before computing the dot product.
1463 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
1464 transpose_concat_4x4(s0, s1, s2, s3, s0123);
1465 transpose_concat_4x4(s1, s2, s3, s4, s1234);
1466 transpose_concat_4x4(s2, s3, s4, s5, s2345);
1467 transpose_concat_4x4(s3, s4, s5, s6, s3456);
1468
1469 do {
1470 int16x4_t s7, s8, s9, s10;
1471 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
1472
1473 int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
1474 // Transpose and shuffle the 4 lines that were loaded.
1475 transpose_concat_4x4(s7, s8, s9, s10, s789A);
1476
1477 // Merge new data into block from previous iteration.
1478 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
1479 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
1480 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
1481
1482 uint16x4_t d0 =
1483 highbd_convolve8_4_2d_v(s0123, s4567, y_filter, shift, offset, max);
1484 uint16x4_t d1 =
1485 highbd_convolve8_4_2d_v(s1234, s5678, y_filter, shift, offset, max);
1486 uint16x4_t d2 =
1487 highbd_convolve8_4_2d_v(s2345, s6789, y_filter, shift, offset, max);
1488 uint16x4_t d3 =
1489 highbd_convolve8_4_2d_v(s3456, s789A, y_filter, shift, offset, max);
1490
1491 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
1492
1493 // Prepare block for next iteration - re-using as much as possible.
1494 // Shuffle everything up four rows.
1495 s0123[0] = s4567[0];
1496 s0123[1] = s4567[1];
1497 s1234[0] = s5678[0];
1498 s1234[1] = s5678[1];
1499 s2345[0] = s6789[0];
1500 s2345[1] = s6789[1];
1501 s3456[0] = s789A[0];
1502 s3456[1] = s789A[1];
1503
1504 s += 4 * src_stride;
1505 dst += 4 * dst_stride;
1506 height -= 4;
1507 } while (height != 0);
1508 } else {
1509 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
1510
1511 do {
1512 int h = height;
1513 int16_t *s = (int16_t *)src;
1514 uint16_t *d = dst;
1515
1516 int16x8_t s0, s1, s2, s3, s4, s5, s6;
1517 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
1518 s += 7 * src_stride;
1519
1520 // This operation combines a conventional transpose and the sample permute
1521 // required before computing the dot product.
1522 int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
1523 transpose_concat_8x4(s0, s1, s2, s3, s0123);
1524 transpose_concat_8x4(s1, s2, s3, s4, s1234);
1525 transpose_concat_8x4(s2, s3, s4, s5, s2345);
1526 transpose_concat_8x4(s3, s4, s5, s6, s3456);
1527
1528 do {
1529 int16x8_t s7, s8, s9, s10;
1530 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
1531
1532 int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
1533 // Transpose and shuffle the 4 lines that were loaded.
1534 transpose_concat_8x4(s7, s8, s9, s10, s789A);
1535
1536 // Merge new data into block from previous iteration.
1537 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
1538 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
1539 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
1540
1541 uint16x8_t d0 =
1542 highbd_convolve8_8_2d_v(s0123, s4567, y_filter, shift, offset, max);
1543 uint16x8_t d1 =
1544 highbd_convolve8_8_2d_v(s1234, s5678, y_filter, shift, offset, max);
1545 uint16x8_t d2 =
1546 highbd_convolve8_8_2d_v(s2345, s6789, y_filter, shift, offset, max);
1547 uint16x8_t d3 =
1548 highbd_convolve8_8_2d_v(s3456, s789A, y_filter, shift, offset, max);
1549
1550 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1551
1552 // Prepare block for next iteration - re-using as much as possible.
1553 // Shuffle everything up four rows.
1554 s0123[0] = s4567[0];
1555 s0123[1] = s4567[1];
1556 s0123[2] = s4567[2];
1557 s0123[3] = s4567[3];
1558 s1234[0] = s5678[0];
1559 s1234[1] = s5678[1];
1560 s1234[2] = s5678[2];
1561 s1234[3] = s5678[3];
1562 s2345[0] = s6789[0];
1563 s2345[1] = s6789[1];
1564 s2345[2] = s6789[2];
1565 s2345[3] = s6789[3];
1566 s3456[0] = s789A[0];
1567 s3456[1] = s789A[1];
1568 s3456[2] = s789A[2];
1569 s3456[3] = s789A[3];
1570
1571 s += 4 * src_stride;
1572 d += 4 * dst_stride;
1573 h -= 4;
1574 } while (h != 0);
1575 src += 8;
1576 dst += 8;
1577 width -= 8;
1578 } while (width != 0);
1579 }
1580 }
1581
1582 static INLINE uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2],
1583 int16x8_t filter,
1584 int32x4_t shift,
1585 int64x2_t offset,
1586 uint16x4_t max) {
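// 4-tap variant: each 64-bit lane of samples[] holds the four vertically
// adjacent samples for one output column, so a single dot product against
// filter lane 0 (taps 0-3; lane 1 is zero padding) produces each result.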
1587 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
1588 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
1589
1590 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1591 sum0123 = vshlq_s32(sum0123, shift);
1592
1593 uint16x4_t res = vqmovun_s32(sum0123);
1594 return vmin_u16(res, max);
1595 }
1596
1597 static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
1598 int16x8_t filter,
1599 int32x4_t shift,
1600 int64x2_t offset,
1601 uint16x8_t max) {
1602 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
1603 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
1604 int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0);
1605 int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0);
1606
1607 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
1608 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
1609
1610 sum0123 = vshlq_s32(sum0123, shift);
1611 sum4567 = vshlq_s32(sum4567, shift);
1612
1613 uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
1614 return vminq_u16(res, max);
1615 }
1616
1617 void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src,
1618 ptrdiff_t src_stride, uint16_t *dst,
1619 ptrdiff_t dst_stride, int width,
1620 int height, const int16_t *filter_y,
1621 ConvolveParams *conv_params, int bd,
1622 const int y_offset) {
1623 assert(width >= 4 && height >= 4);
1624 const int64x2_t offset = vdupq_n_s64(y_offset);
1625 const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
1626
1627 const int16x8_t y_filter =
1628 vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
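// 4-tap kernels store their non-zero coefficients in the middle of an 8-tap
// array (positions 2-5), so load those four taps and zero the upper half of
// the vector; only filter lane 0 is used by the 4-tap helpers above.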
1629
1630 if (width == 4) {
1631 const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
1632 int16_t *s = (int16_t *)(src);
1633
1634 int16x4_t s0, s1, s2;
1635 load_s16_4x3(s, src_stride, &s0, &s1, &s2);
1636 s += 3 * src_stride;
1637
1638 do {
1639 int16x4_t s3, s4, s5, s6;
1640 load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
1641
1642 // This operation combines a conventional transpose and the sample permute
1643 // required before computing the dot product.
1644 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
1645 transpose_concat_4x4(s0, s1, s2, s3, s0123);
1646 transpose_concat_4x4(s1, s2, s3, s4, s1234);
1647 transpose_concat_4x4(s2, s3, s4, s5, s2345);
1648 transpose_concat_4x4(s3, s4, s5, s6, s3456);
1649
1650 uint16x4_t d0 =
1651 highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max);
1652 uint16x4_t d1 =
1653 highbd_convolve4_4_2d_v(s1234, y_filter, shift, offset, max);
1654 uint16x4_t d2 =
1655 highbd_convolve4_4_2d_v(s2345, y_filter, shift, offset, max);
1656 uint16x4_t d3 =
1657 highbd_convolve4_4_2d_v(s3456, y_filter, shift, offset, max);
1658
1659 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
1660
1661 // Shuffle everything up four rows.
1662 s0 = s4;
1663 s1 = s5;
1664 s2 = s6;
1665
1666 s += 4 * src_stride;
1667 dst += 4 * dst_stride;
1668 height -= 4;
1669 } while (height != 0);
1670 } else {
1671 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
1672
1673 do {
1674 int h = height;
1675 int16_t *s = (int16_t *)(src);
1676 uint16_t *d = dst;
1677
1678 int16x8_t s0, s1, s2;
1679 load_s16_8x3(s, src_stride, &s0, &s1, &s2);
1680 s += 3 * src_stride;
1681
1682 do {
1683 int16x8_t s3, s4, s5, s6;
1684 load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
1685
1686 // This operation combines a conventional transpose and the sample
1687 // permute required before computing the dot product.
1688 int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
1689 transpose_concat_8x4(s0, s1, s2, s3, s0123);
1690 transpose_concat_8x4(s1, s2, s3, s4, s1234);
1691 transpose_concat_8x4(s2, s3, s4, s5, s2345);
1692 transpose_concat_8x4(s3, s4, s5, s6, s3456);
1693
1694 uint16x8_t d0 =
1695 highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max);
1696 uint16x8_t d1 =
1697 highbd_convolve4_8_2d_v(s1234, y_filter, shift, offset, max);
1698 uint16x8_t d2 =
1699 highbd_convolve4_8_2d_v(s2345, y_filter, shift, offset, max);
1700 uint16x8_t d3 =
1701 highbd_convolve4_8_2d_v(s3456, y_filter, shift, offset, max);
1702
1703 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1704
1705 // Shuffle everything up four rows.
1706 s0 = s4;
1707 s1 = s5;
1708 s2 = s6;
1709
1710 s += 4 * src_stride;
1711 d += 4 * dst_stride;
1712 h -= 4;
1713 } while (h != 0);
1714 src += 8;
1715 dst += 8;
1716 width -= 8;
1717 } while (width != 0);
1718 }
1719 }
1720
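// A minimal scalar model of the two-pass pipeline that the dispatch function
// below wires together (illustrative sketch only: r, c, x_taps and y_taps are
// placeholder names, the blocked SIMD layout is ignored, and the non-compound
// case where conv_params->round_0 + conv_params->round_1 == 2 * FILTER_BITS
// is assumed):
//
//   // Horizontal pass into the intermediate buffer, offset to keep the
//   // result unsigned.
//   int32_t sum_h = x_offset;  // 1 << (bd + FILTER_BITS - 1)
//   for (int k = 0; k < x_taps; ++k)
//     sum_h += x_filter_ptr[k] * src_ptr[r * src_stride + c + k];
//   im_block[r * im_stride + c] =
//       ROUND_POWER_OF_TWO(sum_h, conv_params->round_0);
//
//   // Vertical pass; y_offset folds in the rounding shim and removes the
//   // propagated horizontal offset (see the y_offset derivation below).
//   int64_t sum_v = y_offset;
//   for (int k = 0; k < y_taps; ++k)
//     sum_v += y_filter_ptr[k] * im_block[(r + k) * im_stride + c];
//   dst[r * dst_stride + c] =
//       clamp((int)(sum_v >> conv_params->round_1), 0, (1 << bd) - 1);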
1721 void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride,
1722 uint16_t *dst, int dst_stride, int w, int h,
1723 const InterpFilterParams *filter_params_x,
1724 const InterpFilterParams *filter_params_y,
1725 const int subpel_x_qn,
1726 const int subpel_y_qn,
1727 ConvolveParams *conv_params, int bd) {
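// Blocks narrower or shorter than 4 samples are not handled by the SVE2
// kernels in this file (they all process 4 rows and 4 columns at a time), so
// fall back to the reference C implementation.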
1728 if (w == 2 || h == 2) {
1729 av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
1730 filter_params_x, filter_params_y, subpel_x_qn,
1731 subpel_y_qn, conv_params, bd);
1732 return;
1733 }
1734
1735 DECLARE_ALIGNED(16, uint16_t,
1736 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
1737 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
1738 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1739
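// There is no 6-tap SVE2 path in this file, so defer those filters to the
// standard Neon implementation.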
1740 if (x_filter_taps == 6 || y_filter_taps == 6) {
1741 av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h,
1742 filter_params_x, filter_params_y,
1743 subpel_x_qn, subpel_y_qn, conv_params, bd);
1744 return;
1745 }
1746
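// Filters with fewer than 4 non-zero taps (e.g. bilinear) are run through the
// 4-tap kernels, so clamp the tap counts used to size the intermediate block
// and to center the source pointer.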
1747 const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps;
1748 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1749
1750 const int im_stride = MAX_SB_SIZE;
1751 const int vert_offset = clamped_y_taps / 2 - 1;
1752 const int horiz_offset = clamped_x_taps / 2 - 1;
1753 const int x_offset = (1 << (bd + FILTER_BITS - 1));
1754 const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1755 // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to use a
1756 // plain, non-rounding shift instead of a rounding saturating shift.
1757 const int y_offset =
1758 (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
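// Here -(1 << (y_offset_bits - 1)) cancels the x_offset added by the
// horizontal pass once it has been scaled by the vertical filter's DC gain
// (the taps sum to 1 << FILTER_BITS), and (1 << (conv_params->round_1 - 1))
// pre-adds the rounding constant so that the plain shift by round_1 in the
// vertical kernels rounds to nearest.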
1759
1760 const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
1761
1762 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
1763 filter_params_x, subpel_x_qn & SUBPEL_MASK);
1764 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1765 filter_params_y, subpel_y_qn & SUBPEL_MASK);
1766 const int im_h = h + clamped_y_taps - 1;
1767
1768 if (x_filter_taps > 8) {
1769 highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block,
1770 im_stride, w, im_h, x_filter_ptr,
1771 conv_params, x_offset);
1772
1773 highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride,
1774 w, h, y_filter_ptr, conv_params, bd,
1775 y_offset);
1776 return;
1777 }
1778
1779 if (x_filter_taps <= 4) {
1780 highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block,
1781 im_stride, w, im_h, x_filter_ptr,
1782 conv_params, x_offset);
1783 } else {
1784 highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block,
1785 im_stride, w, im_h, x_filter_ptr,
1786 conv_params, x_offset);
1787 }
1788
1789 if (y_filter_taps <= 4) {
1790 highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride,
1791 w, h, y_filter_ptr, conv_params, bd,
1792 y_offset);
1793 } else {
1794 highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride,
1795 w, h, y_filter_ptr, conv_params, bd,
1796 y_offset);
1797 }
1798 }
1799