/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
  return res;
}

static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)

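// With these fallbacks in place, callers can use the xN intrinsics
// unconditionally, e.g. loading 32 contiguous bytes from an arbitrary buffer:
//   uint8x16x2_t rows = vld1q_u8_x2(buf);
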
static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

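// Load 8 bytes from each of two rows, one stride apart, into the low and high
// halves of a uint8x16_t.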
static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}

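// Load 4 bytes from each of two rows, stride bytes apart, into the low and
// high halves of a uint8x8_t.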
static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

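// Load 2 uint16_t values from each of two rows, stride elements apart, into
// the low and high halves of a uint16x4_t.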
static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

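// The load_<type>_<lanes>x<rows> helpers below load <rows> rows of <lanes>
// elements each into separate vectors; consecutive rows are one stride apart.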
static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
}

static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

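// The store_<type>_<lanes>x<rows> helpers below store <rows> vectors of
// <lanes> elements each to consecutive rows, one stride apart.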
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

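// Load 2 sets of 2 bytes when alignment is not guaranteed.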
static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

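// Load 4 bytes into the low half of a uint8x8_t when alignment is not
// guaranteed; the upper half is zeroed.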
static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u32;

  memcpy(&a, buf, 2);
  a_u32 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u32);
}

static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);
  s += p;
  *s2 = vld1q_u16(s);
  *s3 = vld1q_u16(s + 8);
  s += p;
  *s4 = vld1q_u16(s);
  *s5 = vld1q_u16(s + 8);
  s += p;
  *s6 = vld1q_u16(s);
  *s7 = vld1q_u16(s + 8);
}

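// Load 2 sets of 2 uint16_t values when alignment is not guaranteed.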
static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u16_u32(a_u32);
}

static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t a;
  uint64x2_t a_u64;

  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vdupq_n_u64(0);
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  *tu0 = load_unaligned_u16_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u16_4x2(buf, stride);
}

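// Load or store four rows of four 32-bit elements, with rows p elements
// apart.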
static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

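// Helpers for converting between 32-bit tran_low_t coefficient buffers and
// 16-bit Neon vectors.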
static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  const int32x4_t v0 = vmovl_s16(a);
  vst1q_s32(buf, v0);
}

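// Gather 8 uint8_t values from src at the offsets given in indices.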
static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes, however this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t ret = vdup_n_u8(0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
  return ret;
}

// The `lane` parameter here must be an immediate.
#define store_u8_2x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                         \
  } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                         \
  } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
  do {                                                           \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &a, 4);                                          \
  } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
  do {                                                             \
    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &a, 8);                                            \
  } while (0)

// Store the low 16-bits from a single vector.
static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32-bits from a single vector.
static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Store two blocks of 16-bits from a single vector.
static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
}

// Store two blocks of 32-bits from a single vector.
static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  dst += stride;
  store_u8_4x1_lane(dst, src, 1);
}

// Store four blocks of 32-bits from a single vector.
static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
}

// Store the low 32-bits from a single vector.
static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32-bits from a single vector.
static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_2x1_lane(dst, src, 1);
}

// Store two blocks of 64-bits from a single vector.
static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_4x1_lane(dst, src, 1);
}

#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_