/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
#define AOM_AV1_COMMON_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>

static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)
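/* Usage sketch (illustrative only; 'src' and 'd' are hypothetical names):
 *   uint8x8_t d = vdup_n_u8(0);
 *   load_u8_4x1(src, &d, 0);  // bytes src[0..3] land in lanes 0..3 of d
 * The lane index must be a compile-time constant, which is why this is a
 * macro rather than a function.
 */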

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)
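/* Usage sketch (illustrative only; 'dst' and 'v' are hypothetical names):
 *   store_u8_4x1(dst, v, 1);  // writes bytes 4..7 of uint8x8_t v to dst[0..3]
 * As above, the lane index must be a compile-time constant.
 */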

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

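/* The load_unaligned_* helpers below go through memcpy so that rows can be
 * read from pointers with no particular alignment without invoking undefined
 * behaviour; the compiler typically lowers the memcpy to a plain load.
 */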
static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vset_lane_u32(a, *tu3, 0);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)
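/* Usage sketch (illustrative only; 'dst' is a hypothetical, possibly
 * unaligned byte pointer and 'v' a uint8x8_t):
 *   store_unaligned_u8_4x1(dst, v, 0);  // copies bytes 0..3 of v to dst[0..3]
 * The intermediate memcpy keeps the 4-byte store free of alignment
 * assumptions; the lane index must again be a compile-time constant.
 */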

static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 0);
  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 1);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vsetq_lane_u64(a, *tu1, 0);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_