/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8

static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9
#endif  // defined(__GNUC__) && !defined(__clang__)

static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)

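// Usage sketch (illustrative, not used by libaom; `src` and `stride` are
// hypothetical): the lane argument must be a compile-time constant, which is
// why a macro is used instead of a function.
//
//   uint8x8_t d = vdup_n_u8(0);
//   load_u8_4x1(src, &d, 0);           // src[0..3] -> lanes 0..3 of d
//   load_u8_4x1(src + stride, &d, 1);  // next row  -> lanes 4..7 of d
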
static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)

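// Usage sketch (illustrative; `dst` and `stride` are hypothetical): the
// counterpart of load_u8_4x1() above, writing one 4-byte group per call, with
// the lane again required to be an immediate.
//
//   store_u8_4x1(dst, d, 0);           // lanes 0..3 of d -> dst[0..3]
//   store_u8_4x1(dst + stride, d, 1);  // lanes 4..7 of d -> next row
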
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

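// Usage sketch (illustrative; `src` and `src_stride` are hypothetical):
// gathers two rows of a 4-wide block into one vector without assuming any
// particular alignment of `src` or `src_stride`.
//
//   uint8x8_t rows01 = load_unaligned_u8(src, src_stride);
//   // lanes 0..3 hold row 0, lanes 4..7 hold row 1.
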
// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

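// Usage sketch (illustrative; `src` and `src_stride` are hypothetical): the
// 128-bit variant packs four rows of a 4-wide block into one vector, and when
// the rows are contiguous (stride == 4) it collapses to a single 16-byte load.
//
//   uint8x16_t rows0123 = load_unaligned_u8q(src, src_stride);
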
static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  *tu0 = vset_lane_u32(a, *tu0, 1);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)

#define store_unaligned_u8_2x1(dst, src, lane)         \
  do {                                                 \
    uint16_t a;                                        \
    a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                \
  } while (0)

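// Usage sketch (illustrative; `dst` and `stride` are hypothetical): these
// write through memcpy, so the stores stay safe when `dst` has no particular
// alignment; the lane must still be an immediate.
//
//   store_unaligned_u8_4x1(dst, d, 0);           // lanes 0..3 of d, 4 bytes
//   store_unaligned_u8_2x1(dst + stride, d, 0);  // lanes 0..1 of d, 2 bytes
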
static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  *tu0 = vset_lane_u16(a, *tu0, 1);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vdupq_n_u64(a);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vdupq_n_u64(a);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

// Stores the second (high) half of the result at an offset of 8 elements
// (instead of 4) so that the output layout matches that of the C
// implementation; the function is otherwise the same as
// store_s16q_to_tran_low(). The "offset_4" in the name signifies that the
// caller should advance the pointer by at least 4 after each
// store_s16q_to_tran_low_offset_4() call.
static INLINE void store_s16q_to_tran_low_offset_4(tran_low_t *buf,
                                                   const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 8, v1);
}

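// Plausible calling pattern (illustrative; `coeffs` is a hypothetical
// tran_low_t buffer): because the high half lands at buf + 8, advancing the
// pointer by 4 between calls interleaves the results of two vectors over 16
// contiguous coefficients.
//
//   store_s16q_to_tran_low_offset_4(coeffs, a);      // coeffs[0..3], coeffs[8..11]
//   store_s16q_to_tran_low_offset_4(coeffs + 4, b);  // coeffs[4..7], coeffs[12..15]
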
#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_