/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/arm/compound_convolve_neon.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

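// Byte-shuffle indices for vqtbl1q_u8: each 16-byte row gathers four
// overlapping 4-sample windows ({ x, x+1, x+2, x+3 } for consecutive x) so
// that each USDOT dot product sees the source samples it needs.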
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

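// Horizontal pass for a single row of a width-4 block: computes four 16-bit
// intermediate outputs with a 4-tap filter (taps packed in lane 0 of
// x_filter) using the i8mm USDOT (unsigned x signed) dot-product instruction.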
static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
                                         const int8x8_t x_filter,
                                         const uint8x16_t permute_tbl,
                                         const int32x4_t horiz_const) {
  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);

  // First 4 output values.
  int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0);

  // We halved the convolution filter values so -1 from the right shift.
  return vshrn_n_s32(sum, ROUND0_BITS - 1);
}

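// Horizontal pass for a single row of a block of width >= 8: computes eight
// 16-bit intermediate outputs with an 8-tap filter (taps split across lanes 0
// and 1 of x_filter), accumulating two USDOT results per group of four.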
static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                                         const int8x8_t x_filter,
                                         const uint8x16x3_t permute_tbl,
                                         const int32x4_t horiz_const) {
  uint8x16_t permuted_samples[3];
  int32x4_t sum[2];

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);

  // First 4 output values.
  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
  // Second 4 output values.
  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0);
  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);

  // Narrow and re-pack.
  // We halved the convolution filter values so -1 from the right shift.
  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
}

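// Horizontal (first) pass of the 2D compound convolution: filters im_h rows
// of the source into the 16-bit intermediate buffer im_block, using a 4-tap
// filter for width-4 blocks and an 8-tap filter otherwise.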
static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm(
    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
    const int16_t *x_filter_ptr, const int im_h, int w) {
  const int bd = 8;
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
                                            (1 << ((ROUND0_BITS - 1) - 1)));

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
      int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
      int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
      int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);

      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      uint8x16_t s0 = vld1q_u8(src_ptr);

      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);

      vst1_s16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
        int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
        int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
        int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0 = vld1q_u8(s);

        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);

        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

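// 2D (horizontal + vertical) distance-weighted compound convolution. The
// horizontal pass writes into an intermediate buffer; the vertical pass then
// dispatches to 6-tap or 8-tap kernels, optionally averaging (basic or
// distance weighted) with the existing compound prediction buffer.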
void av1_dist_wtd_convolve_2d_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;

  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride,
                                       x_filter_ptr, im_h, w);

  if (clamped_y_taps == 6) {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  } else {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  }
}

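// Horizontal-only pass for a single row of a width-4 block: returns four
// 16-bit compound-domain outputs. The compound round_offset is folded into
// the accumulator (pre-shifted) so it survives the final narrowing shift.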
static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
                                       const int8x8_t x_filter,
                                       const uint8x16_t permute_tbl,
                                       const int32x4_t round_offset) {
  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);

  // First 4 output values.
  int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0);

  // We halved the convolution filter values so -1 from the right shift.
  return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
}

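// Horizontal-only pass for a single row of a block of width >= 8: returns
// eight 16-bit compound-domain outputs computed with an 8-tap filter.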
static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
                                       const int8x8_t x_filter,
                                       const uint8x16x3_t permute_tbl,
                                       const int32x4_t round_offset) {
  uint8x16_t permuted_samples[3];
  int32x4_t sum[2];

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);

  // First 4 output values.
  sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
  // Second 4 output values.
  sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0);
  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);

  // Narrow and re-pack.
  // We halved the convolution filter values so -1 from the right shift.
  int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
                               vshrn_n_s32(sum[1], ROUND0_BITS - 1));
  return vreinterpretq_u16_s16(res);
}

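// Horizontal-only compound convolution with distance-weighted averaging:
// filters the source, blends with the existing compound prediction in
// conv_params->dst using fwd_offset/bck_offset weights, and writes the final
// 8-bit result to dst8.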
static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t round_offset_shim = vdupq_n_s32(
      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d1 =
          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d2 =
          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d3 =
          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 =
            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d1 =
            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d2 =
            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d3 =
            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

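// Horizontal-only compound convolution with basic (unweighted) averaging of
// the new prediction and the existing compound prediction in
// conv_params->dst; the averaged 8-bit result is written to dst8.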
static INLINE void dist_wtd_convolve_x_avg_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t round_offset_shim = vdupq_n_s32(
      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d1 =
          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d2 =
          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d3 =
          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                            round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 =
            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d1 =
            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d2 =
            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d3 =
            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

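// Horizontal-only compound convolution without averaging: writes the 16-bit
// compound prediction directly into conv_params->dst for a later blend.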
static INLINE void dist_wtd_convolve_x_neon_i8mm(
    const uint8_t *src, int src_stride, int w, int h,
    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t round_offset_shim = vdupq_n_s32(
      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d1 =
          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d2 =
          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d3 =
          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);

      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 =
            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d1 =
            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d2 =
            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d3 =
            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

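// Public entry point for the horizontal-only distance-weighted compound
// convolution: dispatches on do_average / use_dist_wtd_comp_avg to the
// appropriate kernel above.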
void av1_dist_wtd_convolve_x_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  if (conv_params->do_average) {
    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
      dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
          src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
          subpel_x_qn, conv_params);
    } else {
      dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w,
                                        h, filter_params_x, subpel_x_qn,
                                        conv_params);
    }
  } else {
    dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x,
                                  subpel_x_qn, conv_params);
  }
}