/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8

static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9
#endif  // defined(__GNUC__) && !defined(__clang__)
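
/* Illustrative sketch, not part of the original header: with the fallbacks
   above in place, the xN load intrinsics can be used uniformly, regardless of
   whether the compiler provides them natively. The helper name and parameters
   below are hypothetical. */
static INLINE void example_copy_32_bytes(uint8_t *dst, const uint8_t *src) {
  // Maps to the compiler builtin where available, or to a fallback above.
  const uint8x16x2_t v = vld1q_u8_x2(src);
  vst1q_u8(dst + 0, v.val[0]);
  vst1q_u8(dst + 16, v.val[1]);
}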

static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require an immediate (compile-time constant) lane index,
   so we must use #defines to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)
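
/* Illustrative sketch (hypothetical helper, not part of the original API):
   the lane index passed to load_u8_4x1() must be written as a literal at the
   call site because vld1_lane_u32() requires a compile-time constant. */
static INLINE uint8x8_t example_load_4x2(const uint8_t *src, int stride) {
  uint8x8_t d = vdup_n_u8(0);
  load_u8_4x1(src, &d, 0);           // First row's 4 bytes into lane 0.
  load_u8_4x1(src + stride, &d, 1);  // Second row's 4 bytes into lane 1.
  return d;
}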

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* These intrinsics require an immediate (compile-time constant) lane index,
   so we must use #defines to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)
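
/* Illustrative sketch (hypothetical helper): scatters the two 4-byte halves
   of an 8-byte vector to two rows, again with immediate lane indices. */
static INLINE void example_store_4x2(uint8_t *dst, int stride,
                                     const uint8x8_t s) {
  store_u8_4x1(dst, s, 0);           // Low 4 bytes to the first row.
  store_u8_4x1(dst + stride, s, 1);  // High 4 bytes to the second row.
}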

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
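
/* Illustrative sketch (hypothetical helper): gathering four unaligned 4-byte
   rows into a single q register can serve as a building block for small-block
   metrics, e.g. a 4x4 sum of absolute differences. */
static INLINE uint32_t example_sad_4x4(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride) {
  const uint8x16_t s = load_unaligned_u8q(src, src_stride);
  const uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
  // Absolute differences of all 16 bytes, then widening pairwise reductions.
  const uint8x16_t diff = vabdq_u8(s, r);
  const uint16x8_t sum16 = vpaddlq_u8(diff);
  const uint32x4_t sum32 = vpaddlq_u16(sum16);
  const uint64x2_t sum64 = vpaddlq_u32(sum32);
  return (uint32_t)(vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1));
}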

static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  *tu0 = vset_lane_u32(a, *tu0, 1);
}

/* These intrinsics require an immediate (compile-time constant) lane index,
   so we must use #defines to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)

#define store_unaligned_u8_2x1(dst, src, lane)         \
  do {                                                 \
    uint16_t a;                                        \
    a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                \
  } while (0)

static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  *tu0 = vset_lane_u16(a, *tu0, 1);
}
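
/* Illustrative sketch (hypothetical helper): round-trips a 2x2 block between
   buffers with arbitrary strides using the unaligned 2-byte helpers above. */
static INLINE void example_copy_2x2(uint8_t *dst, int dst_stride,
                                    const uint8_t *src, int src_stride) {
  uint16x4_t t;
  load_unaligned_u8_2x2(src, src_stride, &t);
  const uint8x8_t rows = vreinterpret_u8_u16(t);
  store_unaligned_u8_2x1(dst, rows, 0);               // Row 0 from lane 0.
  store_unaligned_u8_2x1(dst + dst_stride, rows, 1);  // Row 1 from lane 1.
}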

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vdupq_n_u64(a);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vdupq_n_u64(a);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

// Similar to store_s16q_to_tran_low(), but stores the second half of the
// vector at an offset of 8 (instead of 4) so that the output layout matches
// the C implementation. The offset in the function name indicates that the
// caller should advance the pointer by at least 4 after each
// store_s16q_to_tran_low_offset_4() call.
static INLINE void store_s16q_to_tran_low_offset_4(tran_low_t *buf,
                                                   const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 8, v1);
}
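
/* Illustrative sketch (hypothetical caller): two 8-lane results a and b are
   interleaved in groups of 4 by advancing the pointer by 4 between calls,
   giving the layout { a0..a3, b0..b3, a4..a7, b4..b7 } in out[]. */
static INLINE void example_interleave_tran_low(tran_low_t *out,
                                               const int16x8_t a,
                                               const int16x8_t b) {
  store_s16q_to_tran_low_offset_4(out, a);      // out[0..3] and out[8..11].
  store_s16q_to_tran_low_offset_4(out + 4, b);  // out[4..7] and out[12..15].
}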

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_