/*
 *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

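/* Each 16-byte row of this table gathers four overlapping windows of four
 * consecutive source bytes (offsets 0-3, 1-4, 2-5, 3-6, then 4-7, 5-8, ...),
 * so that every 32-bit lane of the SDOT-based horizontal filters below
 * computes a four-tap partial sum from consecutive pixels. */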
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

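/* Index i, i+8, i+16, i+24 select byte i from each of four concatenated
 * 8-byte rows, so a single TBL interleaves the rows into column order. Used
 * by the transpose_concat_4x4 and transpose_concat_8x4 helpers below. */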
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
};

DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
  /* Shift left and insert new last column in transposed 4x4 block. */
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  /* Shift left and insert two new columns in transposed 4x4 block. */
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  /* Shift left and insert three new columns in transposed 4x4 block. */
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h) {
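  /* The SDOT instruction multiplies signed 8-bit values, so the unsigned
   * source pixels are shifted from [0, 255] to [-128, 127] by subtracting
   * 'range_limit' (128) inside the convolve8_*_sdot helpers. 'correction'
   * (128 * sum of the filter taps) is added to each accumulator to undo
   * that shift. */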
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x16_t s0, s1, s2, s3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);
  assert(h % 4 == 3);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
    int16x4_t d0, d1, d2, d3;
    uint8x8_t d01, d23;

    do {
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
      d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
      d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
      d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
     * further details on possible values of block height. */
    load_u8_16x3(src, src_stride, &s0, &s1, &s2);

    d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
    d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
    d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);

    store_u8(dst + 0 * dst_stride, dst_stride, d01);
    store_u8_4x1(dst + 2 * dst_stride, d23);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    const uint8_t *s;
    uint8_t *d;
    int width;
    uint8x8_t d0, d1, d2, d3;

    do {
      width = w;
      s = src;
      d = dst;
      do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 3);

    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
     * further details on possible values of block height. */
    width = w;
    s = src;
    d = dst;
    do {
      load_u8_16x3(s, src_stride, &s0, &s1, &s2);

      d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
      d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
      d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);

      store_u8_8x3(d, dst_stride, d0, d1, d2);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
  }
}

void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *filter, int x0_q4,
                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x16_t s0, s1, s2, s3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
    do {
      int16x4_t t0, t1, t2, t3;
      uint8x8_t d01, d23;

      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    const uint8_t *s;
    uint8_t *d;
    int width;
    uint8x8_t d0, d1, d2, d3;

    do {
      width = w;
      s = src;
      d = dst;
      do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
                                          ptrdiff_t src_stride, uint8_t *dst,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *filter, int x0_q4,
                                          int x_step_q4, int y0_q4,
                                          int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x16_t range_limit = vdupq_n_u8(128);
  uint8x16_t s0, s1, s2, s3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
    do {
      int16x4_t t0, t1, t2, t3;
      uint8x8_t d01, d23, dd01, dd23;

      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);

      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    const uint8_t *s;
    uint8_t *d;
    int width;
    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;

    do {
      width = w;
      s = src;
      d = dst;
      do {
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);

        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}

static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b,
                                        const uint8x16_t permute_tbl) {
  /* Transpose 8-bit elements and concatenate result rows as follows:
   * a0: 00, 01, 02, 03, XX, XX, XX, XX
   * a1: 10, 11, 12, 13, XX, XX, XX, XX
   * a2: 20, 21, 22, 23, XX, XX, XX, XX
   * a3: 30, 31, 32, 33, XX, XX, XX, XX
   *
   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
   *
   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
   * as an argument is preferable to loading it directly from memory as this
   * inline helper is called many times from the same parent function.
   */

  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
  *b = vqtbl2q_s8(samples, permute_tbl);
}

static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
                                        int8x8_t a3, int8x16_t *b0,
                                        int8x16_t *b1,
                                        const uint8x16x2_t permute_tbl) {
  /* Transpose 8-bit elements and concatenate result rows as follows:
   * a0: 00, 01, 02, 03, 04, 05, 06, 07
   * a1: 10, 11, 12, 13, 14, 15, 16, 17
   * a2: 20, 21, 22, 23, 24, 25, 26, 27
   * a3: 30, 31, 32, 33, 34, 35, 36, 37
   *
   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
   *
   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
   * as an argument is preferable to loading it directly from memory as this
   * inline helper is called many times from the same parent function.
   */

  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
}

void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x8_t range_limit = vdup_n_u8(128);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
  int8x16x2_t samples_LUT;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
    int16x4_t d0, d1, d2, d3;
    uint8x8_t d01, d23;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
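    /* Rows 7-9 are zeroed placeholders so the initial transposes below have
     * four full rows; the s4567/s5678/s6789 blocks that depend on them are
     * rebuilt from freshly loaded rows via merge_block_tbl at the start of
     * the first loop iteration, before they are read. */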
    s7 = vdup_n_s8(0);
    s8 = vdup_n_s8(0);
    s9 = vdup_n_s8(0);

    /* This operation combines a conventional transpose and the sample permute
     * (see horizontal case) required before computing the dot product.
     */
    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);

    do {
      uint8x8_t t7, t8, t9, t10;

      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);

      /* Merge new data into block from previous iteration. */
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      /* Prepare block for next iteration - re-using as much as possible. */
      /* Shuffle everything up four rows. */
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
        s6789_hi, s78910_lo, s78910_hi;
    uint8x8_t d0, d1, d2, d3;
    const uint8_t *s;
    uint8_t *d;
    int height;

    do {
      height = h;
      s = src;
      d = dst;

      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
      s7 = vdup_n_s8(0);
      s8 = vdup_n_s8(0);
      s9 = vdup_n_s8(0);

      /* This operation combines a conventional transpose and the sample permute
       * (see horizontal case) required before computing the dot product.
       */
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
                           tran_concat_tbl);

      do {
        uint8x8_t t7, t8, t9, t10;

        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
                             tran_concat_tbl);

        /* Merge new data into block from previous iteration. */
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
                                      correction, filters);
        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
                                      correction, filters);
        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
                                      correction, filters);
        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
                                      correction, filters);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        /* Prepare block for next iteration - re-using as much as possible. */
        /* Shuffle everything up four rows. */
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h) {
  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
  const uint8x8_t range_limit = vdup_n_u8(128);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
  int8x16x2_t samples_LUT;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
    int16x4_t d0, d1, d2, d3;
    uint8x8_t d01, d23, dd01, dd23;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
    s7 = vdup_n_s8(0);
    s8 = vdup_n_s8(0);
    s9 = vdup_n_s8(0);

    /* This operation combines a conventional transpose and the sample permute
     * (see horizontal case) required before computing the dot product.
     */
    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);

    do {
      uint8x8_t t7, t8, t9, t10;

      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);

      /* Merge new data into block from previous iteration. */
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      /* Prepare block for next iteration - re-using as much as possible. */
      /* Shuffle everything up four rows. */
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
        s6789_hi, s78910_lo, s78910_hi;
    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
    const uint8_t *s;
    uint8_t *d;
    int height;

    do {
      height = h;
      s = src;
      d = dst;

      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
      s7 = vdup_n_s8(0);
      s8 = vdup_n_s8(0);
      s9 = vdup_n_s8(0);

      /* This operation combines a conventional transpose and the sample permute
       * (see horizontal case) required before computing the dot product.
       */
      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
                           tran_concat_tbl);
      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
                           tran_concat_tbl);

      do {
        uint8x8_t t7, t8, t9, t10;

        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
                             tran_concat_tbl);

        /* Merge new data into block from previous iteration. */
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
                                      correction, filters);
        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
                                      correction, filters);
        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
                                      correction, filters);
        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
                                      correction, filters);

        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        d0 = vrhadd_u8(d0, dd0);
        d1 = vrhadd_u8(d1, dd1);
        d2 = vrhadd_u8(d2, dd2);
        d3 = vrhadd_u8(d3, dd3);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        /* Prepare block for next iteration - re-using as much as possible. */
        /* Shuffle everything up four rows. */
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}