/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
  return res;
}

static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
  return res;
}

static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
  return res;
}

static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
}

static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
  vst1_u8(ptr + 2 * 8, a.val[2]);
  vst1_u8(ptr + 3 * 8, a.val[3]);
}

static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
  vst1q_u16(ptr + 0 * 8, a.val[0]);
  vst1q_u16(ptr + 1 * 8, a.val[1]);
}

static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
  vst1q_u16(ptr + 0 * 8, a.val[0]);
  vst1q_u16(ptr + 1 * 8, a.val[1]);
  vst1q_u16(ptr + 2 * 8, a.val[2]);
  vst1q_u16(ptr + 3 * 8, a.val[3]);
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
  return res;
}

static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
}

static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
  vst1_u8(ptr + 2 * 8, a.val[2]);
  vst1_u8(ptr + 3 * 8, a.val[3]);
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)
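
// The helpers below load or store small pixel blocks. The <lanes>x<rows>
// suffix gives the number of elements accessed per row and the number of
// rows; consecutive rows are separated by the pitch/stride argument,
// measured in elements.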

static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
}

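// Load 8 bytes from each of two rows and combine them into one uint8x16_t.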
static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}

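// Load 4 bytes from each of two rows into the two halves of a uint8x8_t.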
static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

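// Load 2 uint16_t values from each of two rows into the two halves of a
// uint16x4_t.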
static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
}

static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
}

static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
}

static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6, uint16x4_t *const s7) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
}

static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10, uint16x4_t *const s11,
                                 uint16x4_t *const s12, uint16x4_t *const s13) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
  s += p;
  *s11 = vld1_u16(s);
  s += p;
  *s12 = vld1_u16(s);
  s += p;
  *s13 = vld1_u16(s);
}

static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
}

static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
}

static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
}

static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
}

static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3,
                                 const uint16x4_t s4, const uint16x4_t s5) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
  s += dst_stride;
  vst1_u16(s, s4);
  s += dst_stride;
  vst1_u16(s, s5);
}

static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2, const uint16x4_t s3,
                                  const uint16x4_t s4, const uint16x4_t s5,
                                  const uint16x4_t s6, const uint16x4_t s7,
                                  const uint16x4_t s8, const uint16x4_t s9,
                                  const uint16x4_t s10, const uint16x4_t s11) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
  s += dst_stride;
  vst1_u16(s, s4);
  s += dst_stride;
  vst1_u16(s, s5);
  s += dst_stride;
  vst1_u16(s, s6);
  s += dst_stride;
  vst1_u16(s, s7);
  s += dst_stride;
  vst1_u16(s, s8);
  s += dst_stride;
  vst1_u16(s, s9);
  s += dst_stride;
  vst1_u16(s, s10);
  s += dst_stride;
  vst1_u16(s, s11);
}

static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
}

static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3,
                                 const int16x4_t s4, const int16x4_t s5,
                                 const int16x4_t s6, const int16x4_t s7) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
  s += dst_stride;
  vst1_s16(s, s4);
  s += dst_stride;
  vst1_s16(s, s5);
  s += dst_stride;
  vst1_s16(s, s6);
  s += dst_stride;
  vst1_s16(s, s7);
}

static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
}

static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

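// Load a 32-bit chunk into one lane of a vector. On AArch64 an unaligned
// lane load is used directly; on 32-bit Arm the data goes through memcpy
// first so that the compiler cannot assume 4-byte alignment of the source.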
#if AOM_ARCH_AARCH64
#define load_unaligned_u32_2x1_lane(v, p, lane)              \
  do {                                                       \
    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)               \
  do {                                                        \
    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)
#else
#define load_unaligned_u32_2x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vset_lane_u32(tmp, (v), (lane));      \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vsetq_lane_u32(tmp, (v), (lane));     \
  } while (0)
#endif

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

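// Load 2 sets of 2 bytes when alignment is not guaranteed.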
static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

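// Load 4 bytes into the low half of a uint8x8_t when alignment is not
// guaranteed; the upper half is zeroed.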
static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

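// Load 4 bytes and duplicate them into both halves of a uint8x8_t when
// alignment is not guaranteed.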
static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

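// Load 2 bytes and duplicate them across all four 16-bit lanes of the
// result when alignment is not guaranteed.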
static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  a_u16 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u16);
}

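// Load 4 bytes from each of two rows when alignment is not guaranteed.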
static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

load_u8_16x8(const uint8_t * s,ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2,uint8x16_t * const s3,uint8x16_t * const s4,uint8x16_t * const s5,uint8x16_t * const s6,uint8x16_t * const s7)1283 static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
1284                                 uint8x16_t *const s0, uint8x16_t *const s1,
1285                                 uint8x16_t *const s2, uint8x16_t *const s3,
1286                                 uint8x16_t *const s4, uint8x16_t *const s5,
1287                                 uint8x16_t *const s6, uint8x16_t *const s7) {
1288   *s0 = vld1q_u8(s);
1289   s += p;
1290   *s1 = vld1q_u8(s);
1291   s += p;
1292   *s2 = vld1q_u8(s);
1293   s += p;
1294   *s3 = vld1q_u8(s);
1295   s += p;
1296   *s4 = vld1q_u8(s);
1297   s += p;
1298   *s5 = vld1q_u8(s);
1299   s += p;
1300   *s6 = vld1q_u8(s);
1301   s += p;
1302   *s7 = vld1q_u8(s);
1303 }
1304 
load_u8_16x5(const uint8_t * s,ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2,uint8x16_t * const s3,uint8x16_t * const s4)1305 static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
1306                                 uint8x16_t *const s0, uint8x16_t *const s1,
1307                                 uint8x16_t *const s2, uint8x16_t *const s3,
1308                                 uint8x16_t *const s4) {
1309   *s0 = vld1q_u8(s);
1310   s += p;
1311   *s1 = vld1q_u8(s);
1312   s += p;
1313   *s2 = vld1q_u8(s);
1314   s += p;
1315   *s3 = vld1q_u8(s);
1316   s += p;
1317   *s4 = vld1q_u8(s);
1318 }
1319 
load_u8_16x4(const uint8_t * s,ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2,uint8x16_t * const s3)1320 static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
1321                                 uint8x16_t *const s0, uint8x16_t *const s1,
1322                                 uint8x16_t *const s2, uint8x16_t *const s3) {
1323   *s0 = vld1q_u8(s);
1324   s += p;
1325   *s1 = vld1q_u8(s);
1326   s += p;
1327   *s2 = vld1q_u8(s);
1328   s += p;
1329   *s3 = vld1q_u8(s);
1330 }
1331 
load_u8_16x3(const uint8_t * s,ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2)1332 static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
1333                                 uint8x16_t *const s0, uint8x16_t *const s1,
1334                                 uint8x16_t *const s2) {
1335   *s0 = vld1q_u8(s);
1336   s += p;
1337   *s1 = vld1q_u8(s);
1338   s += p;
1339   *s2 = vld1q_u8(s);
1340 }
1341 
load_u16_8x8(const uint16_t * s,const ptrdiff_t p,uint16x8_t * s0,uint16x8_t * s1,uint16x8_t * s2,uint16x8_t * s3,uint16x8_t * s4,uint16x8_t * s5,uint16x8_t * s6,uint16x8_t * s7)1342 static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
1343                                 uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
1344                                 uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
1345                                 uint16x8_t *s6, uint16x8_t *s7) {
1346   *s0 = vld1q_u16(s);
1347   s += p;
1348   *s1 = vld1q_u16(s);
1349   s += p;
1350   *s2 = vld1q_u16(s);
1351   s += p;
1352   *s3 = vld1q_u16(s);
1353   s += p;
1354   *s4 = vld1q_u16(s);
1355   s += p;
1356   *s5 = vld1q_u16(s);
1357   s += p;
1358   *s6 = vld1q_u16(s);
1359   s += p;
1360   *s7 = vld1q_u16(s);
1361 }
1362 
// Load four rows of 16 uint16_t elements each; every row is returned as a
// pair of vectors, e.g. row 0 lands in *s0 (elements 0-7) and *s1 (8-15).
static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);
  s += p;
  *s2 = vld1q_u16(s);
  *s3 = vld1q_u16(s + 8);
  s += p;
  *s4 = vld1q_u16(s);
  *s5 = vld1q_u16(s + 8);
  s += p;
  *s6 = vld1q_u16(s);
  *s7 = vld1q_u16(s + 8);
}

static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u16_u32(a_u32);
}

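// The memcpy-based loaders in this group copy each short row into a scalar
// before inserting it into a vector lane. Reading the row directly through a
// wider scalar pointer would risk a misaligned access and violate strict
// aliasing; memcpy is well-defined and compilers lower it to a single load.
// Sketch (illustrative only; `buf` and `stride` stand in for caller values):
//
//   uint16x4_t v = load_unaligned_u16_2x2(buf, stride);
//   // Lanes 0-1 hold buf[0..1]; lanes 2-3 hold (buf + stride)[0..1].
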
static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t a;
  uint64x2_t a_u64;

  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vdupq_n_u64(0);
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf,
                                               uint32_t stride) {
  int64_t a;
  int64x2_t a_s64;
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vdupq_n_s64(0);
  a_s64 = vsetq_lane_s64(a, a_s64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vsetq_lane_s64(a, a_s64, 1);
  return vreinterpretq_s16_s64(a_s64);
}

static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  *tu0 = load_unaligned_u16_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u16_4x2(buf, stride);
}

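// Usage sketch (illustrative only; `buf` and `stride` stand in for caller
// values):
//
//   uint16x8_t rows01, rows23;
//   load_unaligned_u16_4x4(buf, stride, &rows01, &rows23);
//   // rows01 packs rows 0 and 1 (four elements each); rows23 packs rows 2-3.
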
static inline void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static inline void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

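// load_tran_low_to_s16q and store_s16q_to_tran_low convert between the 32-bit
// tran_low_t coefficient layout and packed int16x8_t vectors; the narrowing
// load is only value-preserving when every coefficient fits in 16 bits.
// Round-trip sketch (illustrative only; `coeffs` stands in for a
// caller-provided buffer):
//
//   int16x8_t v = load_tran_low_to_s16q(coeffs);
//   store_s16q_to_tran_low(coeffs, v);  // Writes the same eight values back.
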
static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  const int32x4_t v0 = vmovl_s16(a);
  vst1q_s32(buf, v0);
}

static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes; however, this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t ret = vdup_n_u8(0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
  return ret;
}

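// Usage sketch (illustrative only; `table` and `offsets` stand in for caller
// values):
//
//   int16x8_t idx = vld1q_s16(offsets);  // Eight gather offsets into `table`.
//   uint8x8_t g = load_u8_gather_s16_x8(table, idx);
//   // Lane i of g holds table[offsets[i]].
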
// The `lane` parameter here must be an immediate.
#define store_u8_2x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                         \
  } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                         \
  } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
  do {                                                           \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &a, 4);                                          \
  } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
  do {                                                             \
    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &a, 8);                                            \
  } while (0)

#define store_s16_4x1_lane(dst, src, lane)                        \
  do {                                                            \
    int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
    memcpy(dst, &a, 8);                                           \
  } while (0)

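// The helpers below wrap these macros with hard-coded lane numbers, since
// vget_lane/vgetq_lane require the lane to be a compile-time constant.
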
// Store the low 16 bits from a single vector.
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32 bits from a single vector.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Store two blocks of 16 bits from a single vector.
static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
}

// Store four blocks of 16 bits from a single vector.
static inline void store_u8x2_strided_x4(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 2);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 3);
}

// Store two blocks of 32 bits from a single vector.
static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  dst += stride;
  store_u8_4x1_lane(dst, src, 1);
}

// Store four blocks of 32 bits from a single vector.
static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
}

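// Usage sketch (illustrative only; `dst` and `dst_stride` stand in for caller
// values): scatter a 16-byte vector to the four rows of a 4x4 block.
//
//   store_u8x4_strided_x4(dst, dst_stride, pixels);
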
// Store the low 32 bits from a single vector.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32 bits from a single vector.
static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_2x1_lane(dst, src, 1);
}

// Store two blocks of 64 bits from a single vector.
static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_4x1_lane(dst, src, 1);
}

// Store two blocks of 64 bits from a single vector.
static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
                                          int16x8_t src) {
  store_s16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_s16_4x1_lane(dst, src, 1);
}

#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_