/*
 * Copyright (c) 2021 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

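/* Permute table for the SDOT-based horizontal filters. Each group of four
 * byte indices selects four consecutive samples, with successive groups
 * shifted along by one sample, so that a single SDOT instruction applies one
 * 4-tap half of the 8-tap filter to four adjacent output pixels. */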
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

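/* Byte indices used by transpose_concat_4x4() and transpose_concat_8x4()
 * below to gather four 8-byte input rows column by column (bytes 0, 8, 16 and
 * 24 are the first samples of rows 0-3, and so on). */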
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
};

DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
  /* Shift left and insert new last column in transposed 4x4 block. */
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  /* Shift left and insert two new columns in transposed 4x4 block. */
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  /* Shift left and insert three new columns in transposed 4x4 block. */
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h) {
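  /* Narrow the int16 filter taps to int8 for use with SDOT instructions. The
   * 8-bit samples are unsigned; 'range_limit' shifts each sample x to
   * (x - 128) before the signed dot product, and since
   *   sum(filter[k] * (x[k] - 128))
   *       = sum(filter[k] * x[k]) - 128 * sum(filter[k]),
   * adding 'correction' = 128 * sum(filter[k]) back in restores the value the
   * unsigned convolution would have produced. */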
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x16_t s0, s1, s2, s3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);
  assert(h % 4 == 3);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

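  /* The 8-tap horizontal filter reads 3 samples to the left of each output
   * position (taps cover src[x - 3] .. src[x + 4]), so step the source
   * pointer back. */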
  src -= 3;

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
    int16x4_t d0, d1, d2, d3;
    uint8x8_t d01, d23;

    do {
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
      d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
      d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
      d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
     * further details on possible values of block height. */
    load_u8_16x3(src, src_stride, &s0, &s1, &s2);

    d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
    d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
    d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);

    store_u8(dst + 0 * dst_stride, dst_stride, d01);
    store_u8_4x1(dst + 2 * dst_stride, d23);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    const uint8_t *s;
    uint8_t *d;
    int width;
    uint8x8_t d0, d1, d2, d3;

    do {
      width = w;
      s = src;
      d = dst;
      do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
     * further details on possible values of block height. */
    width = w;
    s = src;
    d = dst;
    do {
      load_u8_16x3(s, src_stride, &s0, &s1, &s2);

      d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
      d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
      d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);

      store_u8_8x3(d, dst_stride, d0, d1, d2);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
  }
}

void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *filter, int x0_q4,
                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x16_t s0, s1, s2, s3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
    do {
      int16x4_t t0, t1, t2, t3;
      uint8x8_t d01, d23;

      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    const uint8_t *s;
    uint8_t *d;
    int width;
    uint8x8_t d0, d1, d2, d3;

    do {
      width = w;
      s = src;
      d = dst;
      do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
                                          ptrdiff_t src_stride, uint8_t *dst,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *filter,
                                          int x0_q4, int x_step_q4, int y0_q4,
                                          int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x16_t s0, s1, s2, s3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
    do {
      int16x4_t t0, t1, t2, t3;
      uint8x8_t d01, d23, dd01, dd23;

      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);

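      /* Load the existing destination rows and average them with the new
       * result using a rounding halving add. */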
      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    const uint8_t *s;
    uint8_t *d;
    int width;
    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;

    do {
      width = w;
      s = src;
      d = dst;
      do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);

        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b,
                                        const uint8x16_t permute_tbl) {
  /* Transpose 8-bit elements and concatenate result rows as follows:
   * a0: 00, 01, 02, 03, XX, XX, XX, XX
   * a1: 10, 11, 12, 13, XX, XX, XX, XX
   * a2: 20, 21, 22, 23, XX, XX, XX, XX
   * a3: 30, 31, 32, 33, XX, XX, XX, XX
   *
   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
   *
   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
   * as an argument is preferable to loading it directly from memory as this
   * inline helper is called many times from the same parent function.
   */

  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
  *b = vqtbl2q_s8(samples, permute_tbl);
}

static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b0,
                                        int8x16_t *b1,
                                        const uint8x16x2_t permute_tbl) {
  /* Transpose 8-bit elements and concatenate result rows as follows:
   * a0: 00, 01, 02, 03, 04, 05, 06, 07
   * a1: 10, 11, 12, 13, 14, 15, 16, 17
   * a2: 20, 21, 22, 23, 24, 25, 26, 27
   * a3: 30, 31, 32, 33, 34, 35, 36, 37
   *
   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
   *
   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
   * as an argument is preferable to loading it directly from memory as this
   * inline helper is called many times from the same parent function.
   */

  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
}

void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x8_t range_limit = vdup_n_u8(128);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
  int8x16x2_t samples_LUT;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

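  /* The 8-tap vertical filter reads 3 rows above and 4 rows below each output
   * row, so start 3 rows back from the first source row. */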
  src -= 3 * src_stride;

  if (w == 4) {
    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
    int16x4_t d0, d1, d2, d3;
    uint8x8_t d01, d23;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
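    /* Rows 7-9 are not loaded until the main loop below, so use zeros as
     * placeholders. The lanes they feed into s4567/s5678/s6789 are rewritten
     * by the merge step in the first loop iteration. */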
    s7 = vdup_n_s8(0);
    s8 = vdup_n_s8(0);
    s9 = vdup_n_s8(0);

    /* This operation combines a conventional transpose and the sample permute
     * (see horizontal case) required before computing the dot product.
     */
    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);

    do {
      uint8x8_t t7, t8, t9, t10;

      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);

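      /* The table lookups below rebuild s4567/s5678/s6789 from
       * {s3456, s78910} instead of re-transposing all eight rows (see
       * dot_prod_merge_block_tbl above). */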
      /* Merge new data into block from previous iteration. */
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

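      /* convolve8_4_sdot_partial() applies the low four filter taps to its
       * first sample block and the high four taps to its second, giving the
       * full 8-tap vertical filter for a row of four output pixels. */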
      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      /* Prepare block for next iteration - re-using as much as possible. */
      /* Shuffle everything up four rows. */
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
        s6789_hi, s78910_lo, s78910_hi;
    uint8x8_t d0, d1, d2, d3;
    const uint8_t *s;
    uint8_t *d;
    int height;

    do {
      height = h;
      s = src;
      d = dst;

      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
      s7 = vdup_n_s8(0);
      s8 = vdup_n_s8(0);
      s9 = vdup_n_s8(0);

      /* This operation combines a conventional transpose and the sample
       * permute (see horizontal case) required before computing the dot
       * product.
       */
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
                           tran_concat_tbl);

      do {
        uint8x8_t t7, t8, t9, t10;

        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
                             tran_concat_tbl);

        /* Merge new data into block from previous iteration. */
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
                                      correction, filters);
        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
                                      correction, filters);
        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
                                      correction, filters);
        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
                                      correction, filters);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        /* Prepare block for next iteration - re-using as much as possible. */
        /* Shuffle everything up four rows. */
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x8_t range_limit = vdup_n_u8(128);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
  int8x16x2_t samples_LUT;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
    int16x4_t d0, d1, d2, d3;
    uint8x8_t d01, d23, dd01, dd23;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
    s7 = vdup_n_s8(0);
    s8 = vdup_n_s8(0);
    s9 = vdup_n_s8(0);

    /* This operation combines a conventional transpose and the sample permute
     * (see horizontal case) required before computing the dot product.
     */
    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);

    do {
      uint8x8_t t7, t8, t9, t10;

      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);

      /* Merge new data into block from previous iteration. */
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      /* Prepare block for next iteration - re-using as much as possible. */
      /* Shuffle everything up four rows. */
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
        s6789_hi, s78910_lo, s78910_hi;
    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
    const uint8_t *s;
    uint8_t *d;
    int height;

    do {
      height = h;
      s = src;
      d = dst;

      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
      s7 = vdup_n_s8(0);
      s8 = vdup_n_s8(0);
      s9 = vdup_n_s8(0);

      /* This operation combines a conventional transpose and the sample
       * permute (see horizontal case) required before computing the dot
       * product.
       */
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
                           tran_concat_tbl);

      do {
        uint8x8_t t7, t8, t9, t10;

        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
                             tran_concat_tbl);

        /* Merge new data into block from previous iteration. */
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
                                      correction, filters);
        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
                                      correction, filters);
        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
                                      correction, filters);
        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
                                      correction, filters);

        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        /* Prepare block for next iteration - re-using as much as possible. */
        /* Shuffle everything up four rows. */
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}