/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
  return res;
}

static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
  return res;
}

static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
  return res;
}

static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
}

static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
  vst1_u8(ptr + 2 * 8, a.val[2]);
  vst1_u8(ptr + 3 * 8, a.val[3]);
}

static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) {
  vst1q_u16(ptr + 0 * 8, a.val[0]);
  vst1q_u16(ptr + 1 * 8, a.val[1]);
}

static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) {
  vst1q_u16(ptr + 0 * 8, a.val[0]);
  vst1q_u16(ptr + 1 * 8, a.val[1]);
  vst1q_u16(ptr + 2 * 8, a.val[2]);
  vst1q_u16(ptr + 3 * 8, a.val[3]);
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) {
  int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) {
  int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8),
                        vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } };
  return res;
}

static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
}

static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) {
  vst1_u8(ptr + 0 * 8, a.val[0]);
  vst1_u8(ptr + 1 * 8, a.val[1]);
  vst1_u8(ptr + 2 * 8, a.val[2]);
  vst1_u8(ptr + 3 * 8, a.val[3]);
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)

static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
}

static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static inline uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}
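
// Illustrative sketch (not used elsewhere in this header): average two
// 4-byte pixel groups using the helper above. `a` and `b` are hypothetical
// caller-provided pointers; as with load_u8_4x1 itself, all four bytes at
// each pointer must be readable as a single 32-bit unit.
static inline uint8x8_t example_avg_u8_4x1(const uint8_t *a,
                                           const uint8_t *b) {
  // Only the low four lanes carry data; the high lanes are zero.
  return vrhadd_u8(load_u8_4x1(a), load_u8_4x1(b));
}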

static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
}

static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
}

static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
}

static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
}

static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6, uint16x4_t *const s7) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
}

static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10, uint16x4_t *const s11,
                                 uint16x4_t *const s12,
                                 uint16x4_t *const s13) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
  s += p;
  *s11 = vld1_u16(s);
  s += p;
  *s12 = vld1_u16(s);
  s += p;
  *s13 = vld1_u16(s);
}

static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
}

static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
}

static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
}

static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
}

static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3,
                                 const uint16x4_t s4, const uint16x4_t s5) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
  s += dst_stride;
  vst1_u16(s, s4);
  s += dst_stride;
  vst1_u16(s, s5);
}

static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride,
                                  const uint16x4_t s0, const uint16x4_t s1,
                                  const uint16x4_t s2, const uint16x4_t s3,
                                  const uint16x4_t s4, const uint16x4_t s5,
                                  const uint16x4_t s6, const uint16x4_t s7,
                                  const uint16x4_t s8, const uint16x4_t s9,
                                  const uint16x4_t s10, const uint16x4_t s11) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
  s += dst_stride;
  vst1_u16(s, s4);
  s += dst_stride;
  vst1_u16(s, s5);
  s += dst_stride;
  vst1_u16(s, s6);
  s += dst_stride;
  vst1_u16(s, s7);
  s += dst_stride;
  vst1_u16(s, s8);
  s += dst_stride;
  vst1_u16(s, s9);
  s += dst_stride;
  vst1_u16(s, s10);
  s += dst_stride;
  vst1_u16(s, s11);
}

static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
}

static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3,
                                 const int16x4_t s4, const int16x4_t s5,
                                 const int16x4_t s6, const int16x4_t s7) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
  s += dst_stride;
  vst1_s16(s, s4);
  s += dst_stride;
  vst1_s16(s, s5);
  s += dst_stride;
  vst1_s16(s, s6);
  s += dst_stride;
  vst1_s16(s, s7);
}

static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
}

static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

#if AOM_ARCH_AARCH64
#define load_unaligned_u32_2x1_lane(v, p, lane)              \
  do {                                                       \
    (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane)               \
  do {                                                        \
    (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \
  } while (0)
#else
#define load_unaligned_u32_2x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vset_lane_u32(tmp, (v), (lane));      \
  } while (0)

#define load_unaligned_u32_4x1_lane(v, p, lane) \
  do {                                          \
    uint32_t tmp;                               \
    memcpy(&tmp, (p), 4);                       \
    (v) = vsetq_lane_u32(tmp, (v), (lane));     \
  } while (0)
#endif  // AOM_ARCH_AARCH64
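
// Illustrative sketch: fill both 32-bit lanes of a vector from two rows of a
// hypothetical byte buffer with no alignment guarantee, using the macros
// above. The lane arguments must be immediates, as they are here.
static inline uint8x8_t example_load_u8_4x2_lanes(const uint8_t *buf,
                                                  int stride) {
  uint32x2_t v = vdup_n_u32(0);
  load_unaligned_u32_2x1_lane(v, buf, 0);
  load_unaligned_u32_2x1_lane(v, buf + stride, 1);
  return vreinterpret_u8_u32(v);
}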

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}
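
// Illustrative sketch: absolute differences between two 4x2 blocks whose
// rows may be unaligned, reduced pairwise to four 16-bit partial sums.
// `src`, `ref` and the strides are hypothetical caller-provided values.
static inline uint16x4_t example_abd_4x2(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  const uint8x8_t s = load_unaligned_u8(src, src_stride);
  const uint8x8_t r = load_unaligned_u8(ref, ref_stride);
  // Widening pairwise add of the per-byte absolute differences.
  return vpaddl_u8(vabd_u8(s, r));
}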

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  a_u16 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u16);
}

static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
}

static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
}

static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);
  s += p;
  *s2 = vld1q_u16(s);
  *s3 = vld1q_u16(s + 8);
  s += p;
  *s4 = vld1q_u16(s);
  *s5 = vld1q_u16(s + 8);
  s += p;
  *s6 = vld1q_u16(s);
  *s7 = vld1q_u16(s + 8);
}

static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u16_u32(a_u32);
}

static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t a;
  uint64x2_t a_u64;

  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vdupq_n_u64(0);
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf,
                                               uint32_t stride) {
  int64_t a;
  int64x2_t a_s64;
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vdupq_n_s64(0);
  a_s64 = vsetq_lane_s64(a, a_s64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_s64 = vsetq_lane_s64(a, a_s64, 1);
  return vreinterpretq_s16_s64(a_s64);
}

static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  *tu0 = load_unaligned_u16_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u16_4x2(buf, stride);
}

static inline void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static inline void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  const int32x4_t v0 = vmovl_s16(a);
  vst1q_s32(buf, v0);
}
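
// Illustrative sketch: round-trip eight coefficients through tran_low_t
// storage with the two helpers above. load_tran_low_to_s16q narrows to 16
// bits, so this is only lossless for values that fit in int16_t.
static inline void example_tran_low_roundtrip_8(tran_low_t *buf) {
  const int16x8_t coeffs = load_tran_low_to_s16q(buf);
  store_s16q_to_tran_low(buf, coeffs);
}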

static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes, however this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t ret = vdup_n_u8(0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
  return ret;
}
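
// Illustrative sketch: reverse eight bytes using the gather helper above.
// Real callers typically compute `indices` from a filter or warp model; a
// constant index vector is used here purely for demonstration.
static inline uint8x8_t example_reverse_u8x8(const uint8_t *src) {
  const int16_t kRevIdx[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
  return load_u8_gather_s16_x8(src, vld1q_s16(kRevIdx));
}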

// The `lane` parameter here must be an immediate.
#define store_u8_2x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                         \
  } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                         \
  } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
  do {                                                           \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &a, 4);                                          \
  } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
  do {                                                             \
    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &a, 8);                                            \
  } while (0)

#define store_s16_4x1_lane(dst, src, lane)                        \
  do {                                                            \
    int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \
    memcpy(dst, &a, 8);                                           \
  } while (0)

// Store the low 16 bits from a single vector.
static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32 bits from a single vector.
static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Store two blocks of 16 bits from a single vector.
static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
}

static inline void store_u8x2_strided_x4(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 2);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 3);
}

// Store two blocks of 32 bits from a single vector.
static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  dst += stride;
  store_u8_4x1_lane(dst, src, 1);
}

// Store four blocks of 32 bits from a single vector.
static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
}
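
// Illustrative sketch: copy a 4x4 block of 8-bit pixels between buffers with
// hypothetical strides, pairing load_unaligned_u8q with the strided store
// above.
static inline void example_copy_u8_4x4(uint8_t *dst, ptrdiff_t dst_stride,
                                       const uint8_t *src, int src_stride) {
  const uint8x16_t rows = load_unaligned_u8q(src, src_stride);
  store_u8x4_strided_x4(dst, dst_stride, rows);
}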

// Store the low 32 bits from a single vector.
static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32 bits from a single vector.
static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_2x1_lane(dst, src, 1);
}

// Store two blocks of 64 bits from a single vector.
static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_4x1_lane(dst, src, 1);
}
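
// Illustrative sketch: copy a 4x2 block of 16-bit pixels through a single
// vector using load_unaligned_u16_4x2 and the strided store above; the
// pointers and strides are hypothetical caller-provided values.
static inline void example_copy_u16_4x2(uint16_t *dst, uint32_t dst_stride,
                                        const uint16_t *src,
                                        uint32_t src_stride) {
  const uint16x8_t rows = load_unaligned_u16_4x2(src, src_stride);
  store_u16x4_strided_x2(dst, dst_stride, rows);
}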

// Store two blocks of 64 bits from a single vector.
static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride,
                                          int16x8_t src) {
  store_s16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_s16_4x1_lane(dst, src, 1);
}

#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane
#undef store_s16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_