/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
#define AOM_AV1_COMMON_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

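// The helpers below load or store a block of rows to/from a strided buffer.
// The pointer argument addresses the first row; 'p' (or 'dst_stride') is the
// row stride, measured in elements of the pointed-to type.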
static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)
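/* Example usage (illustrative sketch): 'lane' must be a compile-time
   constant, which is why this is a macro rather than a function. Here
   'src' is a hypothetical uint8_t pointer:

     uint8x8_t d = vdup_n_u8(0);
     load_u8_4x1(src, &d, 0);  // loads 4 bytes from src into lane 0 of d
*/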

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}
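/* Example usage (illustrative sketch, with hypothetical 'src'/'src_stride'):

     uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
     load_u8_8x8(src, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
     // r0..r7 now hold eight consecutive rows of 8 pixels each.
*/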

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)
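/* Example usage (illustrative sketch): stores the 4 bytes in lane 0 of 'd'
   to a hypothetical, suitably aligned destination pointer 'dst':

     store_u8_4x1(dst, d, 0);
*/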

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
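/* Example usage (illustrative sketch, with hypothetical 'src'/'src_stride'):

     // Gathers a 4x4 block of bytes into one 128-bit register, taking the
     // single vld1q_u8 fast path when the rows are contiguous (stride == 4).
     const uint8x16_t block = load_unaligned_u8q(src, src_stride);
*/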

static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vset_lane_u32(a, *tu3, 0);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)
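/* Example usage (illustrative sketch): writes the 4 bytes in lane 1 of 'd'
   to a hypothetical, possibly unaligned destination pointer 'dst' (the
   memcpy makes the store alignment-safe):

     store_unaligned_u8_4x1(dst, d, 1);
*/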

static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 0);
  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 1);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vsetq_lane_u64(a, *tu1, 0);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

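// tran_low_t is 32 bits wide here: these helpers narrow int32 transform
// coefficients to int16 vectors on load and sign-extend them back on store.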
static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}
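/* Example round trip (illustrative sketch, with a hypothetical 'coeffs'
   buffer of at least 8 tran_low_t values; vmovn_s32 truncates, so the
   coefficients are assumed to fit in int16):

     int16x8_t v = load_tran_low_to_s16q(coeffs);
     store_s16q_to_tran_low(coeffs, v);
*/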

#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_