/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
#define AOM_AOM_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"

// Support for xN Neon intrinsics is lacking in some compilers.
#if defined(__arm__) || defined(_M_ARM)
#define ARM_32_BIT
#endif

// DEFICIENT_CLANG_32_BIT includes clang-cl.
#if defined(__clang__) && defined(ARM_32_BIT) && \
    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
#endif

#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
#define GCC_32_BIT
#endif

#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)

static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}

static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}

static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
  uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
  return res;
}

static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}

#elif defined(__GNUC__) && !defined(__clang__)  // GCC 64-bit.
#if __GNUC__ < 8
static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
  return res;
}
#endif  // __GNUC__ < 8

#if __GNUC__ < 9
static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
                         vld1q_u8(ptr + 2 * 16) } };
  return res;
}
#endif  // __GNUC__ < 9

// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards.
#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
  uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
                         vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
  return res;
}
#endif  // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805
#endif  // defined(__GNUC__) && !defined(__clang__)

static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
  return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
}

// Load four bytes into the low half of a uint8x8_t, zero the upper half.
static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  return ret;
}

static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
  uint8x8_t ret = vdup_n_u8(0);
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
  p += stride;
  ret = vreinterpret_u8_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
  return ret;
}

static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
  uint16x4_t ret = vdup_n_u16(0);
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
  p += stride;
  ret = vreinterpret_u16_u32(
      vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
  return ret;
}

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4, uint16x4_t *const s5,
                                uint16x4_t *const s6) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
}

static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
}

static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10, int16x4_t *const s11) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
  s += p;
  *s11 = vld1_s16(s);
}

static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
                                 int16x4_t *const s0, int16x4_t *const s1,
                                 int16x4_t *const s2, int16x4_t *const s3,
                                 int16x4_t *const s4, int16x4_t *const s5,
                                 int16x4_t *const s6, int16x4_t *const s7,
                                 int16x4_t *const s8, int16x4_t *const s9,
                                 int16x4_t *const s10) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
  s += p;
  *s8 = vld1_s16(s);
  s += p;
  *s9 = vld1_s16(s);
  s += p;
  *s10 = vld1_s16(s);
}

static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3,
                                 uint16x4_t *const s4, uint16x4_t *const s5,
                                 uint16x4_t *const s6, uint16x4_t *const s7,
                                 uint16x4_t *const s8, uint16x4_t *const s9,
                                 uint16x4_t *const s10) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
  *s5 = vld1_u16(s);
  s += p;
  *s6 = vld1_u16(s);
  s += p;
  *s7 = vld1_u16(s);
  s += p;
  *s8 = vld1_u16(s);
  s += p;
  *s9 = vld1_u16(s);
  s += p;
  *s10 = vld1_u16(s);
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
}

static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
}

static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
}

static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3,
                                uint16x4_t *const s4) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
  *s4 = vld1_u16(s);
  s += p;
}

static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
}

static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
}

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
                                uint8x8_t *const s0, uint8x8_t *const s1,
                                uint8x8_t *const s2, uint8x8_t *const s3,
                                uint8x8_t *const s4, uint8x8_t *const s5,
                                uint8x8_t *const s6, uint8x8_t *const s7,
                                uint8x8_t *const s8, uint8x8_t *const s9,
                                uint8x8_t *const s10) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
  s += p;
  *s8 = vld1_u8(s);
  s += p;
  *s9 = vld1_u8(s);
  s += p;
  *s10 = vld1_u8(s);
}

static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
}

static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
}

static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
                                 int16x8_t *const s0, int16x8_t *const s1,
                                 int16x8_t *const s2, int16x8_t *const s3,
                                 int16x8_t *const s4, int16x8_t *const s5,
                                 int16x8_t *const s6, int16x8_t *const s7,
                                 int16x8_t *const s8, int16x8_t *const s9,
                                 int16x8_t *const s10, int16x8_t *const s11) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
  s += p;
  *s8 = vld1q_s16(s);
  s += p;
  *s9 = vld1q_s16(s);
  s += p;
  *s10 = vld1q_s16(s);
  s += p;
  *s11 = vld1q_s16(s);
}

static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7,
                                 uint16x8_t *const s8, uint16x8_t *const s9,
                                 uint16x8_t *const s10) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3,
                                uint16x8_t *const s4, uint16x8_t *const s5,
                                uint16x8_t *const s6) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
}

static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
}

static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
}

static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  memcpy(&a, buf, 4);
  buf += stride;
  uint32x2_t a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32;
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdupq_n_u32(a);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vsetq_lane_u32(a, a_u32, 2);
  memcpy(&a, buf, 4);
  a_u32 = vsetq_lane_u32(a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
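
// Illustrative usage sketch (the pointer and stride names are placeholders,
// not libaom API): gather a 4x4 block of bytes from a strided 8-bit buffer
// into one 128-bit vector, e.g. ahead of widening to 16 bits for
// accumulation:
//   const uint8x16_t block_4x4 = load_unaligned_u8q(src_ptr, src_stride);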

static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
  uint16_t a;
  uint16x4_t a_u16;

  memcpy(&a, buf, 2);
  buf += stride;
  a_u16 = vdup_n_u16(a);
  memcpy(&a, buf, 2);
  a_u16 = vset_lane_u16(a, a_u16, 1);
  return vreinterpret_u8_u16(a_u16);
}

static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(0);
  a_u32 = vset_lane_u32(a, a_u32, 0);
  return vreinterpret_u8_u32(a_u32);
}

static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  a_u32 = vdup_n_u32(a);
  return vreinterpret_u8_u32(a_u32);
}

static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
  uint16_t a;
  uint16x4_t a_u32;

  memcpy(&a, buf, 2);
  a_u32 = vdup_n_u16(a);
  return vreinterpret_u8_u16(a_u32);
}

static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1) {
  *tu0 = load_unaligned_u8_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u8_4x2(buf, stride);
}

static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  *tu2 = load_unaligned_u8_4x2(buf, stride);
}

static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint8x8_t *tu0, uint8x8_t *tu1,
                                         uint8x8_t *tu2, uint8x8_t *tu3) {
  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
  buf += 4 * stride;
  load_unaligned_u8_4x4(buf, stride, tu2, tu3);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
                                uint16x8_t *s6, uint16x8_t *s7) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
}

static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3,
                                 uint16x8_t *const s4, uint16x8_t *const s5,
                                 uint16x8_t *const s6, uint16x8_t *const s7) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + 8);
  s += p;
  *s2 = vld1q_u16(s);
  *s3 = vld1q_u16(s + 8);
  s += p;
  *s4 = vld1q_u16(s);
  *s5 = vld1q_u16(s + 8);
  s += p;
  *s6 = vld1q_u16(s);
  *s7 = vld1q_u16(s + 8);
}

static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
                                                int stride) {
  uint32_t a;
  uint32x2_t a_u32;

  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vdup_n_u32(a);
  memcpy(&a, buf, 4);
  a_u32 = vset_lane_u32(a, a_u32, 1);
  return vreinterpret_u16_u32(a_u32);
}

static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
  uint64_t a;
  uint64x1_t a_u64 = vdup_n_u64(0);
  memcpy(&a, buf, 8);
  a_u64 = vset_lane_u64(a, a_u64, 0);
  return vreinterpret_u16_u64(a_u64);
}

static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
                                                uint32_t stride) {
  uint64_t a;
  uint64x2_t a_u64;

  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vdupq_n_u64(0);
  a_u64 = vsetq_lane_u64(a, a_u64, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  a_u64 = vsetq_lane_u64(a, a_u64, 1);
  return vreinterpretq_u16_u64(a_u64);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint16x8_t *tu0, uint16x8_t *tu1) {
  *tu0 = load_unaligned_u16_4x2(buf, stride);
  buf += 2 * stride;
  *tu1 = load_unaligned_u16_4x2(buf, stride);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
}

static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
  const int32x4_t v0 = vmovl_s16(a);
  vst1q_s32(buf, v0);
}

static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
                                              int16x8_t indices) {
  // Recent Clang and GCC versions correctly identify that this zero-broadcast
  // is redundant. Alternatively we could load and broadcast the zeroth element
  // and then replace the other lanes, however this is slower than loading a
  // single element without broadcast on some micro-architectures.
  uint8x8_t ret = vdup_n_u8(0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
  ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
  return ret;
}
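
// Illustrative usage sketch (the names ref_row and offsets are placeholders):
// gather eight bytes at per-lane offsets, e.g. horizontal positions computed
// per output pixel. All eight indices must be valid offsets into ref_row:
//   const uint8x8_t samples = load_u8_gather_s16_x8(ref_row, offsets);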

// The `lane` parameter here must be an immediate.
#define store_u8_2x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
    memcpy(dst, &a, 2);                                         \
  } while (0)

#define store_u8_4x1_lane(dst, src, lane)                       \
  do {                                                          \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                         \
  } while (0)

#define store_u16_2x1_lane(dst, src, lane)                       \
  do {                                                           \
    uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
    memcpy(dst, &a, 4);                                          \
  } while (0)

#define store_u16_4x1_lane(dst, src, lane)                         \
  do {                                                             \
    uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
    memcpy(dst, &a, 8);                                            \
  } while (0)
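
// Because the macros above expand to vget_lane_*/vgetq_lane_* intrinsics, the
// lane argument must be a compile-time constant; for example,
// store_u8_2x1_lane(dst, src, 0) compiles, whereas passing a runtime variable
// as the lane does not.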

// Store the low 16-bits from a single vector.
static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
}

// Store the low 32-bits from a single vector.
static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
}

// Store two blocks of 16-bits from a single vector.
static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
                                         uint8x8_t src) {
  store_u8_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u8_2x1_lane(dst, src, 1);
}

// Store two blocks of 32-bits from a single vector.
static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
                                         uint8x8_t src) {
  store_u8_4x1_lane(dst, src, 0);
  dst += stride;
  store_u8_4x1_lane(dst, src, 1);
}

// Store four blocks of 32-bits from a single vector.
static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
                                         uint8x16_t src) {
  store_u8_4x1_lane(dst, vget_low_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_low_u8(src), 1);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 0);
  dst += stride;
  store_u8_4x1_lane(dst, vget_high_u8(src), 1);
}

// Store the low 32-bits from a single vector.
static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
}

// Store two blocks of 32-bits from a single vector.
static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x4_t src) {
  store_u16_2x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_2x1_lane(dst, src, 1);
}

// Store two blocks of 64-bits from a single vector.
static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
                                          uint16x8_t src) {
  store_u16_4x1_lane(dst, src, 0);
  dst += dst_stride;
  store_u16_4x1_lane(dst, src, 1);
}

#undef store_u8_2x1_lane
#undef store_u8_4x1_lane
#undef store_u16_2x1_lane
#undef store_u16_4x1_lane

#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_