1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3 
4 // This header file provides a simple API translation layer
5 // between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 //   John W. Ratcliff <jratcliffscarab@gmail.com>
11 //   Brandon Rowlett <browlett@nvidia.com>
12 //   Ken Fast <kfast@gdeb.com>
13 //   Eric van Beurden <evanbeurden@nvidia.com>
14 //   Alexander Potylitsin <apotylitsin@nvidia.com>
15 //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 //   Jim Huang <jserv@biilabs.io>
17 //   Mark Cheng <marktwtn@biilabs.io>
18 //   Malcolm James MacLeod <malcolm@gulden.com>
19 //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 //   Sebastian Pop <spop@amazon.com>
21 //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 //   Danila Kutenin <danilak@google.com>
23 //   François Turban (JishinMaster) <francois.turban@gmail.com>
24 //   Pei-Hsuan Hung <afcidk@gmail.com>
25 //   Yang-Hao Yuan <yanghau@biilabs.io>
26 
27 /*
28  * sse2neon is freely redistributable under the MIT License.
29  *
30  * Permission is hereby granted, free of charge, to any person obtaining a copy
31  * of this software and associated documentation files (the "Software"), to deal
32  * in the Software without restriction, including without limitation the rights
33  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34  * copies of the Software, and to permit persons to whom the Software is
35  * furnished to do so, subject to the following conditions:
36  *
37  * The above copyright notice and this permission notice shall be included in
38  * all copies or substantial portions of the Software.
39  *
40  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46  * SOFTWARE.
47  */
48 
49 /* Tunable configurations */
50 
51 /* Enable precise implementation of _mm_min_ps and _mm_max_ps
52  * This slows down the computation a bit, but gives results consistent with
53  * x86 SSE2 (e.g. it avoids holes or NaN pixels in rendering results).
54  */
55 #ifndef SSE2NEON_PRECISE_MINMAX
56 #define SSE2NEON_PRECISE_MINMAX (0)
57 #endif
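/* Usage sketch: the macro above only defaults to 0 when it is not already
 * defined, so a translation unit can opt in to the precise behaviour by
 * defining it before including this header, e.g.
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #include "sse2neon.h"
 */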
58 
59 #if defined(__GNUC__) || defined(__clang__)
60 #pragma push_macro("FORCE_INLINE")
61 #pragma push_macro("ALIGN_STRUCT")
62 #define FORCE_INLINE static inline __attribute__((always_inline))
63 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
64 #else
65 #error "Macro name collisions may happen with unsupported compiler."
66 #ifdef FORCE_INLINE
67 #undef FORCE_INLINE
68 #endif
69 #define FORCE_INLINE static inline
70 #ifndef ALIGN_STRUCT
71 #define ALIGN_STRUCT(x) __declspec(align(x))
72 #endif
73 #endif
74 
75 #include <stdint.h>
76 #include <stdlib.h>
77 
78 /* Architecture-specific build options */
79 /* FIXME: #pragma GCC push_options is only available on GCC */
80 #if defined(__GNUC__)
81 #if defined(__arm__) && __ARM_ARCH == 7
82 /* According to ARM C Language Extensions Architecture specification,
83  * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
84  * architecture supported.
85  */
86 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
87 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
88 #endif
89 #pragma GCC push_options
90 #pragma GCC target("fpu=neon")
91 #elif defined(__aarch64__)
92 #pragma GCC push_options
93 #pragma GCC target("+simd")
94 #else
95 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
96 #endif
97 #endif
98 
99 #include <arm_neon.h>
100 
101 /* Rounding functions require either AArch64 instructions or a libm fallback */
102 #if !defined(__aarch64__)
103 #include <math.h>
104 #endif
105 
106 /* "__has_builtin" can be used to query support for built-in functions
107  * provided by gcc/clang and other compilers that support it.
108  */
109 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
110 /* Compatibility with gcc <= 9 */
111 #if __GNUC__ <= 9
112 #define __has_builtin(x) HAS##x
113 #define HAS__builtin_popcount 1
114 #define HAS__builtin_popcountll 1
115 #else
116 #define __has_builtin(x) 0
117 #endif
118 #endif
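/* Sketch of how the fallback above behaves: on gcc <= 9 the token-pasting
 * definition rewrites a query such as
 *
 *   #if __has_builtin(__builtin_popcount)
 *
 * into "#if HAS__builtin_popcount", which the HAS... macros above make true;
 * a builtin without such a HAS... macro expands to an undefined identifier and
 * therefore evaluates to 0 in the #if, just as the query evaluates to 0 on
 * other old compilers where __has_builtin(x) is defined as 0.
 */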
119 
120 /**
121  * MACRO for shuffle parameter for _mm_shuffle_ps().
122  * Argument fp3 is a digit [0-3] that selects the float from argument "b"
123  * of _mm_shuffle_ps to be placed in fp3 of the result; fp2 does the same
124  * for fp2 of the result. fp1 is a digit [0-3] that selects the float from
125  * argument "a" of _mm_shuffle_ps to be placed in fp1 of the result;
126  * fp0 does the same for fp0 of the result.
127  */
128 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
129     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
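/* A small usage sketch (assuming an _mm_shuffle_ps wrapper defined elsewhere
 * in this header or provided by the caller): _MM_SHUFFLE(3, 2, 1, 0) packs the
 * four 2-bit lane selectors into the immediate 0xE4 (binary 11 10 01 00),
 * which is the identity selection:
 *
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *   // r0 := a0, r1 := a1, r2 := b2, r3 := b3
 */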
130 
131 /* Rounding mode macros. */
132 #define _MM_FROUND_TO_NEAREST_INT 0x00
133 #define _MM_FROUND_TO_NEG_INF 0x01
134 #define _MM_FROUND_TO_POS_INF 0x02
135 #define _MM_FROUND_TO_ZERO 0x03
136 #define _MM_FROUND_CUR_DIRECTION 0x04
137 #define _MM_FROUND_NO_EXC 0x08
138 
139 /* Indicates an immediate constant argument in a given range */
140 #define __constrange(a, b) const
141 
142 /* A few intrinsics accept traditional data types like ints or floats, but
143  * most operate on data types that are specific to SSE.
144  * If a vector type ends in d, it contains doubles, and if it does not have
145  * a suffix, it contains floats. An integer vector type can contain any type
146  * of integer, from chars to shorts to unsigned long longs.
147  */
148 typedef int64x1_t __m64;
149 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
150 // On 32-bit Arm, float64x2_t is not supported, so the __m128d data type
151 // must be represented differently for the related intrinsic conversions
152 // below.
153 #if defined(__aarch64__)
154 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
155 #else
156 typedef float32x4_t __m128d;
157 #endif
158 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
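/* Illustrative sketch of how these typedefs are used with the set intrinsics
 * defined further down in this header:
 *
 *   __m128  f = _mm_set1_ps(1.0f);     // four packed floats
 *   __m128i i = _mm_set1_epi32(42);    // packed integers (lane width is
 *                                      // chosen by the intrinsic, not the type)
 *   __m128d d = _mm_set_pd(2.0, 1.0);  // two packed doubles
 */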
159 
160 /* type-safe casting between types */
161 
162 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
163 #define vreinterpretq_m128_f32(x) (x)
164 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
165 
166 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
167 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
168 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
169 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
170 
171 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
172 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
173 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
174 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
175 
176 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
177 #define vreinterpretq_f32_m128(x) (x)
178 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
179 
180 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
181 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
182 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
183 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
184 
185 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
186 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
187 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
188 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
189 
190 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
191 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
192 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
193 #define vreinterpretq_m128i_s64(x) (x)
194 
195 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
196 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
197 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
198 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
199 
200 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
201 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
202 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
203 #define vreinterpretq_s64_m128i(x) (x)
204 
205 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
206 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
207 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
208 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
209 
210 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
211 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
212 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
213 #define vreinterpret_m64_s64(x) (x)
214 
215 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
216 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
217 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
218 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
219 
220 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
221 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
222 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
223 
224 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
225 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
226 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
227 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
228 
229 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
230 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
231 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
232 #define vreinterpret_s64_m64(x) (x)
233 
234 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
235 
236 #if defined(__aarch64__)
237 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
238 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
239 
240 #define vreinterpretq_m128d_f64(x) (x)
241 
242 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
243 
244 #define vreinterpretq_f64_m128d(x) (x)
245 #else
246 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
247 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
248 
249 #define vreinterpretq_m128d_f32(x) (x)
250 
251 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
252 
253 #define vreinterpretq_f32_m128d(x) (x)
254 #endif
255 
256 // A union called 'SIMDVec' is defined in this header file; it can be used
257 // by applications which attempt to access the contents of an __m128 struct
258 // directly.  It is important to note that accessing the __m128 struct directly
259 // is considered bad coding practice by Microsoft: @see:
260 // https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
261 //
262 // However, some legacy source code may try to access the contents of an __m128
263 // struct directly so the developer can use the SIMDVec as an alias for it.  Any
264 // casting must be done manually by the developer, as you cannot cast or
265 // otherwise alias the base NEON data type for intrinsic operations.
266 //
267 // This union is intended to allow direct access to an __m128 variable using the names
268 // that the MSVC compiler provides.  This union should really only be used when
269 // trying to access the members of the vector as integer values.  GCC/clang
270 // allow native access to the float members through a simple array access
271 // operator (in C since 4.6, in C++ since 4.8).
272 //
273 // Ideally direct accesses to SIMD vectors should not be used since it can cause
274 // a performance hit.  If it really is needed however, the original __m128
275 // variable can be aliased with a pointer to this union and used to access
276 // individual components.  The use of this union should be hidden behind a macro
277 // that is used throughout the codebase to access the members instead of always
278 // declaring this type of variable.
279 typedef union ALIGN_STRUCT(16) SIMDVec {
280     float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
281     int8_t m128_i8[16];    // as signed 8-bit integers.
282     int16_t m128_i16[8];   // as signed 16-bit integers.
283     int32_t m128_i32[4];   // as signed 32-bit integers.
284     int64_t m128_i64[2];   // as signed 64-bit integers.
285     uint8_t m128_u8[16];   // as unsigned 8-bit integers.
286     uint16_t m128_u16[8];  // as unsigned 16-bit integers.
287     uint32_t m128_u32[4];  // as unsigned 32-bit integers.
288     uint64_t m128_u64[2];  // as unsigned 64-bit integers.
289 } SIMDVec;
290 
291 // casting using SIMDVec
292 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
293 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
294 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
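/* Usage sketch for the casting macros above (subject to the caveats about
 * direct element access): extract the n-th 32-bit lane of an __m128i through
 * the SIMDVec alias:
 *
 *   __m128i v = _mm_set_epi32(3, 2, 1, 0);               // defined further below
 *   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // lane0 == 0
 */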
295 
296 /* Backwards compatibility for compilers lacking specific type support */
297 
298 // Older GCC does not provide the vld1q_u8_x4 intrinsic
299 #if defined(__GNUC__) && !defined(__clang__)
300 #if __GNUC__ <= 9
301 FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
302 {
303     uint8x16x4_t ret;
304     ret.val[0] = vld1q_u8(p + 0);
305     ret.val[1] = vld1q_u8(p + 16);
306     ret.val[2] = vld1q_u8(p + 32);
307     ret.val[3] = vld1q_u8(p + 48);
308     return ret;
309 }
310 #endif
311 #endif
312 
313 /* Function Naming Conventions
314  * The naming convention of SSE intrinsics is straightforward. A generic SSE
315  * intrinsic function is given as follows:
316  *   _mm_<name>_<data_type>
317  *
318  * The parts of this format are given as follows:
319  * 1. <name> describes the operation performed by the intrinsic
320  * 2. <data_type> identifies the data type of the function's primary arguments
321  *
322  * This last part, <data_type>, is a little complicated. It identifies the
323  * content of the input values, and can be set to any of the following values:
324  * + ps - vectors contain floats (ps stands for packed single-precision)
325  * + pd - vectors contain doubles (pd stands for packed double-precision)
326  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
327  *                            signed integers
328  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329  *                            unsigned integers
330  * + si128 - unspecified 128-bit vector or 256-bit vector
331  * + m128/m128i/m128d - identifies input vector types when they are different
332  *                      than the type of the returned vector
333  *
334  * For example, _mm_setzero_ps. The _mm implies that the function returns
335  * a 128-bit vector. The _ps at the end implies that the argument vectors
336  * contain floats.
337  *
338  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
339  *   // Set packed 16-bit integers: 128 bits, 8 shorts of 16 bits each
340  *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
341  *   // Set packed 8-bit integers:
342  *   // 128 bits, 16 chars of 8 bits each
343  *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
344  *                                  4, 5, 12, 13, 6, 7, 14, 15);
345  *   // Shuffle packed 8-bit integers
346  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
347  *
348  * Data (Number, Binary, Byte Index):
349     +------+------+------+------+------+------+------+------+
350     |      1      |      2      |      3      |      4      | Number
351     +------+------+------+------+------+------+------+------+
352     | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
353     +------+------+------+------+------+------+------+------+
354     |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
355     +------+------+------+------+------+------+------+------+
356 
357     +------+------+------+------+------+------+------+------+
358     |      5      |      6      |      7      |      8      | Number
359     +------+------+------+------+------+------+------+------+
360     | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
361     +------+------+------+------+------+------+------+------+
362     |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
363     +------+------+------+------+------+------+------+------+
364  * Index (Byte Index):
365     +------+------+------+------+------+------+------+------+
366     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
367     +------+------+------+------+------+------+------+------+
368 
369     +------+------+------+------+------+------+------+------+
370     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
371     +------+------+------+------+------+------+------+------+
372  * Result:
373     +------+------+------+------+------+------+------+------+
374     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
375     +------+------+------+------+------+------+------+------+
376     | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
377     +------+------+------+------+------+------+------+------+
378     |     256     |      2      |      5      |      6      | Number
379     +------+------+------+------+------+------+------+------+
380 
381     +------+------+------+------+------+------+------+------+
382     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
383     +------+------+------+------+------+------+------+------+
384     | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
385     +------+------+------+------+------+------+------+------+
386     |      3      |      7      |      4      |      8      | Number
387     +------+------+------+------+------+------+------+------+
388  */
389 
390 /* Set/get methods */
391 
392 /* Constants for use with _mm_prefetch.  */
393 enum _mm_hint {
394     _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
395     _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
396     _MM_HINT_T1 = 2,   /* load data to L2 cache only */
397     _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
398     _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
399     _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
400     _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
401     _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
402 };
403 
404 // Loads one cache line of data from address p to a location closer to the
405 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
406 FORCE_INLINE void _mm_prefetch(const void *p, int i)
407 {
408     (void) i;
409     __builtin_prefetch(p);
410 }
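/* Usage sketch: the int hint argument is accepted for API compatibility but is
 * ignored by this implementation, so every _MM_HINT_* value behaves the same,
 * e.g. (with "buf" standing in for any readable address):
 *
 *   _mm_prefetch(buf, _MM_HINT_T0);
 */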
411 
412 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
413 //
414 //   dst[31:0] := a[31:0]
415 //
416 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
417 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
418 {
419     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
420 }
421 
422 // Sets the 128-bit value to zero
423 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
424 FORCE_INLINE __m128i _mm_setzero_si128(void)
425 {
426     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
427 }
428 
429 // Clears the four single-precision, floating-point values.
430 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
431 FORCE_INLINE __m128 _mm_setzero_ps(void)
432 {
433     return vreinterpretq_m128_f32(vdupq_n_f32(0));
434 }
435 
436 // Sets the four single-precision, floating-point values to w.
437 //
438 //   r0 := r1 := r2 := r3 := w
439 //
440 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
441 FORCE_INLINE __m128 _mm_set1_ps(float _w)
442 {
443     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
444 }
445 
446 // Sets the four single-precision, floating-point values to w.
447 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
448 FORCE_INLINE __m128 _mm_set_ps1(float _w)
449 {
450     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
451 }
452 
453 // Sets the four single-precision, floating-point values to the four inputs.
454 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
455 FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
456 {
457     float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
458     return vreinterpretq_m128_f32(vld1q_f32(data));
459 }
460 
461 // Copy single-precision (32-bit) floating-point element a to the lower element
462 // of dst, and zero the upper 3 elements.
463 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
464 FORCE_INLINE __m128 _mm_set_ss(float a)
465 {
466     float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
467     return vreinterpretq_m128_f32(vld1q_f32(data));
468 }
469 
470 // Sets the four single-precision, floating-point values to the four inputs in
471 // reverse order.
472 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
473 FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
474 {
475     float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
476     return vreinterpretq_m128_f32(vld1q_f32(data));
477 }
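/* Ordering sketch for the two setters above: _mm_set_ps takes its arguments
 * from the highest lane down, while _mm_setr_ps takes them in memory (reverse)
 * order, so the following two calls build the same vector {1, 2, 3, 4}:
 *
 *   __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   __m128 v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 */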
478 
479 // Sets the 8 signed 16-bit integer values in reverse order.
480 //
481 // Return Value
482 //   r0 := w0
483 //   r1 := w1
484 //   ...
485 //   r7 := w7
486 FORCE_INLINE __m128i _mm_setr_epi16(short w0,
487                                     short w1,
488                                     short w2,
489                                     short w3,
490                                     short w4,
491                                     short w5,
492                                     short w6,
493                                     short w7)
494 {
495     int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
496     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
497 }
498 
499 // Sets the 4 signed 32-bit integer values in reverse order
500 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
501 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
502 {
503     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
504     return vreinterpretq_m128i_s32(vld1q_s32(data));
505 }
506 
507 // Set packed 64-bit integers in dst with the supplied values in reverse order.
508 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
509 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
510 {
511     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
512 }
513 
514 // Sets the 16 signed 8-bit integer values to b.
515 //
516 //   r0 := b
517 //   r1 := b
518 //   ...
519 //   r15 := b
520 //
521 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
522 FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
523 {
524     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
525 }
526 
527 // Sets the 8 signed 16-bit integer values to w.
528 //
529 //   r0 := w
530 //   r1 := w
531 //   ...
532 //   r7 := w
533 //
534 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
535 FORCE_INLINE __m128i _mm_set1_epi16(short w)
536 {
537     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
538 }
539 
540 // Sets the 16 signed 8-bit integer values.
541 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
542 FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
543                                   signed char b14,
544                                   signed char b13,
545                                   signed char b12,
546                                   signed char b11,
547                                   signed char b10,
548                                   signed char b9,
549                                   signed char b8,
550                                   signed char b7,
551                                   signed char b6,
552                                   signed char b5,
553                                   signed char b4,
554                                   signed char b3,
555                                   signed char b2,
556                                   signed char b1,
557                                   signed char b0)
558 {
559     int8_t ALIGN_STRUCT(16)
560         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
561                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
562                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
563                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
564     return (__m128i) vld1q_s8(data);
565 }
566 
567 // Sets the 8 signed 16-bit integer values.
568 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
569 FORCE_INLINE __m128i _mm_set_epi16(short i7,
570                                    short i6,
571                                    short i5,
572                                    short i4,
573                                    short i3,
574                                    short i2,
575                                    short i1,
576                                    short i0)
577 {
578     int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
579     return vreinterpretq_m128i_s16(vld1q_s16(data));
580 }
581 
582 // Sets the 16 signed 8-bit integer values in reverse order.
583 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
584 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
585                                    signed char b1,
586                                    signed char b2,
587                                    signed char b3,
588                                    signed char b4,
589                                    signed char b5,
590                                    signed char b6,
591                                    signed char b7,
592                                    signed char b8,
593                                    signed char b9,
594                                    signed char b10,
595                                    signed char b11,
596                                    signed char b12,
597                                    signed char b13,
598                                    signed char b14,
599                                    signed char b15)
600 {
601     int8_t ALIGN_STRUCT(16)
602         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
603                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
604                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
605                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
606     return (__m128i) vld1q_s8(data);
607 }
608 
609 // Sets the 4 signed 32-bit integer values to i.
610 //
611 //   r0 := i
612 //   r1 := i
613 //   r2 := i
614 //   r3 := i
615 //
616 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
617 FORCE_INLINE __m128i _mm_set1_epi32(int _i)
618 {
619     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
620 }
621 
622 // Sets the 2 signed 64-bit integer values to i.
623 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
624 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
625 {
626     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
627 }
628 
629 // Sets the 2 signed 64-bit integer values to i.
630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
631 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
632 {
633     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
634 }
635 
636 // Sets the 4 signed 32-bit integer values.
637 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
638 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
639 {
640     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
641     return vreinterpretq_m128i_s32(vld1q_s32(data));
642 }
643 
644 // Returns the __m128i structure with its two 64-bit integer values
645 // initialized to the values of the two 64-bit integers passed in.
646 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
647 FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
648 {
649     int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
650     return vreinterpretq_m128i_s64(vld1q_s64(data));
651 }
652 
653 // Returns the __m128i structure with its two 64-bit integer values
654 // initialized to the values of the two 64-bit integers passed in.
655 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
656 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
657 {
658     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
659 }
660 
661 // Set packed double-precision (64-bit) floating-point elements in dst with the
662 // supplied values.
663 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
664 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
665 {
666     double ALIGN_STRUCT(16) data[2] = {e0, e1};
667 #if defined(__aarch64__)
668     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
669 #else
670     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
671 #endif
672 }
673 
674 // Stores four single-precision, floating-point values.
675 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
676 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
677 {
678     vst1q_f32(p, vreinterpretq_f32_m128(a));
679 }
680 
681 // Stores four single-precision, floating-point values.
682 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
683 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
684 {
685     vst1q_f32(p, vreinterpretq_f32_m128(a));
686 }
687 
688 // Stores four 32-bit integer values (as a __m128i value) at the address p.
689 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
690 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
691 {
692     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
693 }
694 
695 // Stores four 32-bit integer values (as a __m128i value) at the address p.
696 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
697 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
698 {
699     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
700 }
701 
702 // Stores the lower single-precision, floating-point value.
703 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
704 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
705 {
706     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
707 }
708 
709 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
710 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
711 // or a general-protection exception may be generated.
712 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
713 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
714 {
715 #if defined(__aarch64__)
716     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
717 #else
718     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
719 #endif
720 }
721 
722 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
723 // elements) from a into memory. mem_addr does not need to be aligned on any
724 // particular boundary.
725 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
726 FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
727 {
728     _mm_store_pd(mem_addr, a);
729 }
730 
731 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
732 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
733 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
734 {
735     uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
736     uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
737     *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
738 }
739 
740 // Stores the lower two single-precision floating point values of a to the
741 // address p.
742 //
743 //   *p0 := a0
744 //   *p1 := a1
745 //
746 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
747 FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
748 {
749     *p = vreinterpret_m64_f32(vget_low_f32(a));
750 }
751 
752 // Stores the upper two single-precision, floating-point values of a to the
753 // address p.
754 //
755 //   *p0 := a2
756 //   *p1 := a3
757 //
758 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
759 FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
760 {
761     *p = vreinterpret_m64_f32(vget_high_f32(a));
762 }
763 
764 // Loads a single single-precision, floating-point value, copying it into all
765 // four words
766 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
767 FORCE_INLINE __m128 _mm_load1_ps(const float *p)
768 {
769     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
770 }
771 
772 // Load a single-precision (32-bit) floating-point element from memory into all
773 // elements of dst.
774 //
775 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
776 //   dst[63:32] := MEM[mem_addr+31:mem_addr]
777 //   dst[95:64] := MEM[mem_addr+31:mem_addr]
778 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
779 //
780 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
781 #define _mm_load_ps1 _mm_load1_ps
782 
783 // Sets the lower two single-precision, floating-point values with 64
784 // bits of data loaded from the address p; the upper two values are passed
785 // through from a.
786 //
787 // Return Value
788 //   r0 := *p0
789 //   r1 := *p1
790 //   r2 := a2
791 //   r3 := a3
792 //
793 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
794 FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
795 {
796     return vreinterpretq_m128_f32(
797         vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
798 }
799 
800 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
801 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
802 // general-protection exception may be generated.
803 //
804 //   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
805 //   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
806 //   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
807 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
808 //
809 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
810 FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
811 {
812     float32x4_t v = vrev64q_f32(vld1q_f32(p));
813     return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
814 }
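/* How the reversal above works (sketch): vld1q_f32 gives {p0, p1, p2, p3},
 * vrev64q_f32 swaps within each 64-bit half to {p1, p0, p3, p2}, and
 * vextq_f32(v, v, 2) rotates by two lanes, producing {p3, p2, p1, p0}. */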
815 
816 // Sets the upper two single-precision, floating-point values with 64
817 // bits of data loaded from the address p; the lower two values are passed
818 // through from a.
819 //
820 //   r0 := a0
821 //   r1 := a1
822 //   r2 := *p0
823 //   r3 := *p1
824 //
825 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
826 FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
827 {
828     return vreinterpretq_m128_f32(
829         vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
830 }
831 
832 // Loads four single-precision, floating-point values.
833 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
834 FORCE_INLINE __m128 _mm_load_ps(const float *p)
835 {
836     return vreinterpretq_m128_f32(vld1q_f32(p));
837 }
838 
839 // Loads four single-precision, floating-point values.
840 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
841 FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
842 {
843     // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
844     // are equivalent.
845     return vreinterpretq_m128_f32(vld1q_f32(p));
846 }
847 
848 // Load unaligned 16-bit integer from memory into the first element of dst.
849 //
850 //   dst[15:0] := MEM[mem_addr+15:mem_addr]
851 //   dst[MAX:16] := 0
852 //
853 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
854 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
855 {
856     return vreinterpretq_m128i_s16(
857         vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
858 }
859 
860 // Load unaligned 64-bit integer from memory into the first element of dst.
861 //
862 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
863 //   dst[MAX:64] := 0
864 //
865 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
866 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
867 {
868     return vreinterpretq_m128i_s64(
869         vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
870 }
871 
872 // Load a double-precision (64-bit) floating-point element from memory into the
873 // lower of dst, and zero the upper element. mem_addr does not need to be
874 // aligned on any particular boundary.
875 //
876 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
877 //   dst[127:64] := 0
878 //
879 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
880 FORCE_INLINE __m128d _mm_load_sd(const double *p)
881 {
882 #if defined(__aarch64__)
883     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
884 #else
885     const float *fp = (const float *) p;
886     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
887     return vreinterpretq_m128d_f32(vld1q_f32(data));
888 #endif
889 }
890 
891 // Loads two double-precision, floating-point values from 16-byte aligned
892 // memory.
893 //
894 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
895 //
896 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
897 FORCE_INLINE __m128d _mm_load_pd(const double *p)
898 {
899 #if defined(__aarch64__)
900     return vreinterpretq_m128d_f64(vld1q_f64(p));
901 #else
902     const float *fp = (const float *) p;
903     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
904     return vreinterpretq_m128d_f32(vld1q_f32(data));
905 #endif
906 }
907 
908 // Loads two double-precision, floating-point values from unaligned memory.
909 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
910 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
911 {
912     return _mm_load_pd(p);
913 }
914 
915 // Loads a single-precision, floating-point value into the low word and
916 // clears the upper three words.
917 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
918 FORCE_INLINE __m128 _mm_load_ss(const float *p)
919 {
920     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
921 }
922 
923 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
924 {
925     /* Load the lower 64 bits of the value pointed to by p into the
926      * lower 64 bits of the result, zeroing the upper 64 bits of the result.
927      */
928     return vreinterpretq_m128i_s32(
929         vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
930 }
931 
932 // Load a double-precision (64-bit) floating-point element from memory into the
933 // lower element of dst, and copy the upper element from a to dst. mem_addr does
934 // not need to be aligned on any particular boundary.
935 //
936 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
937 //   dst[127:64] := a[127:64]
938 //
939 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
_mm_loadl_pd(__m128d a,const double * p)940 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
941 {
942 #if defined(__aarch64__)
943     return vreinterpretq_m128d_f64(
944         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
945 #else
946     return vreinterpretq_m128d_f32(
947         vcombine_f32(vld1_f32((const float *) p),
948                      vget_high_f32(vreinterpretq_f32_m128d(a))));
949 #endif
950 }
951 
952 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
953 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
954 // general-protection exception may be generated.
955 //
956 //   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
957 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
958 //
959 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
960 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
961 {
962 #if defined(__aarch64__)
963     float64x2_t v = vld1q_f64(p);
964     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
965 #else
966     int64x2_t v = vld1q_s64((const int64_t *) p);
967     return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
968 #endif
969 }
970 
971 // Sets the low word to the single-precision, floating-point value of b
972 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
973 FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
974 {
975     return vreinterpretq_m128_f32(
976         vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
977                        vreinterpretq_f32_m128(a), 0));
978 }
979 
980 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
981 // upper element.
982 //
983 //   dst[63:0] := a[63:0]
984 //   dst[127:64] := 0
985 //
986 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
987 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
988 {
989     return vreinterpretq_m128i_s64(
990         vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
991 }
992 
993 // Return vector of type __m128 with undefined elements.
994 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
995 FORCE_INLINE __m128 _mm_undefined_ps(void)
996 {
997     __m128 a;
998     return a;
999 }
1000 
1001 /* Logic/Binary operations */
1002 
1003 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1004 // values of a and b.
1005 //
1006 //   r0 := ~a0 & b0
1007 //   r1 := ~a1 & b1
1008 //   r2 := ~a2 & b2
1009 //   r3 := ~a3 & b3
1010 //
1011 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1012 FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1013 {
1014     return vreinterpretq_m128_s32(
1015         vbicq_s32(vreinterpretq_s32_m128(b),
1016                   vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
1017 }
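/* Why the operands are swapped (sketch): vbicq_s32(x, y) computes x & ~y,
 * whereas _mm_andnot_ps(a, b) is specified as (~a) & b, so b must be passed as
 * the first NEON operand and a as the second.  The same reasoning applies to
 * the _mm_andnot_pd/_mm_andnot_si128 variants below. */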
1018 
1019 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1020 // elements in a and then AND with b, and store the results in dst.
1021 //
1022 //   FOR j := 0 to 1
1023 // 	     i := j*64
1024 // 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1025 //   ENDFOR
1026 //
1027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
1028 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1029 {
1030     // *NOTE* argument swap
1031     return vreinterpretq_m128d_s64(
1032         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1033 }
1034 
1035 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1036 // 128-bit value in a.
1037 //
1038 //   r := (~a) & b
1039 //
1040 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
1041 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1042 {
1043     return vreinterpretq_m128i_s32(
1044         vbicq_s32(vreinterpretq_s32_m128i(b),
1045                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
1046 }
1047 
1048 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1049 // b.
1050 //
1051 //   r := a & b
1052 //
1053 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
1054 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1055 {
1056     return vreinterpretq_m128i_s32(
1057         vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1058 }
1059 
1060 // Computes the bitwise AND of the four single-precision, floating-point values
1061 // of a and b.
1062 //
1063 //   r0 := a0 & b0
1064 //   r1 := a1 & b1
1065 //   r2 := a2 & b2
1066 //   r3 := a3 & b3
1067 //
1068 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1069 FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1070 {
1071     return vreinterpretq_m128_s32(
1072         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1073 }
1074 
1075 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1076 // elements in a and b, and store the results in dst.
1077 //
1078 //   FOR j := 0 to 1
1079 //     i := j*64
1080 //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1081 //   ENDFOR
1082 //
1083 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
1084 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1085 {
1086     return vreinterpretq_m128d_s64(
1087         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1088 }
1089 
1090 // Computes the bitwise OR of the four single-precision, floating-point values
1091 // of a and b.
1092 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
1093 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1094 {
1095     return vreinterpretq_m128_s32(
1096         vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1097 }
1098 
1099 // Computes the bitwise XOR (exclusive OR) of the four single-precision,
1100 // floating-point values of a and b.
1101 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
1102 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1103 {
1104     return vreinterpretq_m128_s32(
1105         veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1106 }
1107 
1108 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1109 // elements in a and b, and store the results in dst.
1110 //
1111 //   FOR j := 0 to 1
1112 //      i := j*64
1113 //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1114 //   ENDFOR
1115 //
1116 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
1117 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1118 {
1119     return vreinterpretq_m128d_s64(
1120         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1121 }
1122 
1123 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1124 //
1125 //   r := a | b
1126 //
1127 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
1128 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1129 {
1130     return vreinterpretq_m128i_s32(
1131         vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1132 }
1133 
1134 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1135 // b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
1136 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1137 {
1138     return vreinterpretq_m128i_s32(
1139         veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1140 }
1141 
1142 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1143 // from a, and store the results in dst.
1144 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
1145 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1146 {
1147 #if __has_builtin(__builtin_shufflevector)
1148     return vreinterpretq_m128_f32(__builtin_shufflevector(
1149         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1150 #else
1151     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1152     float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1153     float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1154     return vreinterpretq_m128_f32(vld1q_f32(data));
1155 #endif
1156 }
1157 
1158 // Duplicate even-indexed single-precision (32-bit) floating-point elements
1159 // from a, and store the results in dst.
1160 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
1161 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1162 {
1163 #if __has_builtin(__builtin_shufflevector)
1164     return vreinterpretq_m128_f32(__builtin_shufflevector(
1165         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1166 #else
1167     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1168     float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1169     float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1170     return vreinterpretq_m128_f32(vld1q_f32(data));
1171 #endif
1172 }
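/* Lane sketch for the two duplication helpers above: for a = {a0, a1, a2, a3},
 * _mm_movehdup_ps(a) yields {a1, a1, a3, a3} and _mm_moveldup_ps(a) yields
 * {a0, a0, a2, a2}. */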
1173 
1174 // Moves the upper two values of B into the lower two values of A.
1175 //
1176 //   r3 := a3
1177 //   r2 := a2
1178 //   r1 := b3
1179 //   r0 := b2
1180 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1181 {
1182     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1183     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1184     return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1185 }
1186 
1187 // Moves the lower two values of B into the upper two values of A.
1188 //
1189 //   r3 := b1
1190 //   r2 := b0
1191 //   r1 := a1
1192 //   r0 := a0
1193 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1194 {
1195     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1196     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1197     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1198 }
1199 
1200 // Compute the absolute value of packed signed 32-bit integers in a, and store
1201 // the unsigned results in dst.
1202 //
1203 //   FOR j := 0 to 3
1204 //     i := j*32
1205 //     dst[i+31:i] := ABS(a[i+31:i])
1206 //   ENDFOR
1207 //
1208 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
1209 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1210 {
1211     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1212 }
1213 
1214 // Compute the absolute value of packed signed 16-bit integers in a, and store
1215 // the unsigned results in dst.
1216 //
1217 //   FOR j := 0 to 7
1218 //     i := j*16
1219 //     dst[i+15:i] := ABS(a[i+15:i])
1220 //   ENDFOR
1221 //
1222 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
1223 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1224 {
1225     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1226 }
1227 
1228 // Compute the absolute value of packed signed 8-bit integers in a, and store
1229 // the unsigned results in dst.
1230 //
1231 //   FOR j := 0 to 15
1232 //     i := j*8
1233 //     dst[i+7:i] := ABS(a[i+7:i])
1234 //   ENDFOR
1235 //
1236 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
1237 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1238 {
1239     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1240 }
1241 
1242 // Compute the absolute value of packed signed 32-bit integers in a, and store
1243 // the unsigned results in dst.
1244 //
1245 //   FOR j := 0 to 1
1246 //     i := j*32
1247 //     dst[i+31:i] := ABS(a[i+31:i])
1248 //   ENDFOR
1249 //
1250 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
1251 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1252 {
1253     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1254 }
1255 
1256 // Compute the absolute value of packed signed 16-bit integers in a, and store
1257 // the unsigned results in dst.
1258 //
1259 //   FOR j := 0 to 3
1260 //     i := j*16
1261 //     dst[i+15:i] := ABS(a[i+15:i])
1262 //   ENDFOR
1263 //
1264 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
1265 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1266 {
1267     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1268 }
1269 
1270 // Compute the absolute value of packed signed 8-bit integers in a, and store
1271 // the unsigned results in dst.
1272 //
1273 //   FOR j := 0 to 7
1274 //     i := j*8
1275 //     dst[i+7:i] := ABS(a[i+7:i])
1276 //   ENDFOR
1277 //
1278 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
1279 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1280 {
1281     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1282 }
1283 
1284 // Takes the upper 64 bits of a and places them in the low end of the result;
1285 // takes the lower 64 bits of b and places them in the high end of the result.
1286 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1287 {
1288     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1289     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1290     return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1291 }
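/* Naming sketch for the helpers in this block: _mm_shuffle_ps_WXYZ appears to
 * implement _mm_shuffle_ps(a, b, _MM_SHUFFLE(W, X, Y, Z)) for that fixed
 * immediate; e.g. _mm_shuffle_ps_1032 above corresponds to
 * _MM_SHUFFLE(1, 0, 3, 2) and returns {a2, a3, b0, b1} in lanes 0..3. */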
1292 
1293 // Takes the lower two 32-bit values from a, swaps them, and places them in
1294 // the low end of the result; takes the upper two 32-bit values from b, swaps
1295 // them, and places them in the high end of the result.
1296 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1297 {
1298     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1299     float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1300     return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1301 }
1302 
1303 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1304 {
1305     float32x2_t a21 = vget_high_f32(
1306         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1307     float32x2_t b03 = vget_low_f32(
1308         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1309     return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1310 }
1311 
1312 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1313 {
1314     float32x2_t a03 = vget_low_f32(
1315         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1316     float32x2_t b21 = vget_high_f32(
1317         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1318     return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1319 }
1320 
1321 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1322 {
1323     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1324     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1325     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1326 }
1327 
1328 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1329 {
1330     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1331     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1332     return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1333 }
1334 
1335 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1336 {
1337     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1338     float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1339     return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1340 }
1341 
1342 // Keeps the low 64 bits of a in the low end of the result and puts the high
1343 // 64 bits of b in the high end of the result.
1344 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1345 {
1346     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1347     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1348     return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1349 }
1350 
1351 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1352 {
1353     float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1354     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1355     return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1356 }
1357 
1358 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1359 {
1360     float32x2_t a22 =
1361         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1362     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1363     return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1364 }
1365 
1366 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1367 {
1368     float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1369     float32x2_t b22 =
1370         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1371     return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1372 }
1373 
1374 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1375 {
1376     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1377     float32x2_t a22 =
1378         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1379     float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1380     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1381     return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1382 }
1383 
1384 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1385 {
1386     float32x2_t a33 =
1387         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1388     float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1389     return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1390 }
1391 
1392 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1393 {
1394     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1395     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1396     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1397     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1398     return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1399 }
1400 
1401 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1402 {
1403     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1404     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1405     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1406     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1407     return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1408 }
1409 
1410 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1411 {
1412     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1413     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1414     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1415     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1416     return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1417 }
1418 
1419 // NEON does not support a general purpose permute intrinsic
1420 // Selects four specific single-precision, floating-point values from a and b,
1421 // based on the mask imm.
1422 //
1423 // C equivalent:
1424 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1425 //                                 __constrange(0, 255) int imm) {
1426 //       __m128 ret;
1427 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
1428 //       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
1429 //       return ret;
1430 //   }
1431 //
1432 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1433 #define _mm_shuffle_ps_default(a, b, imm)                                  \
1434     __extension__({                                                        \
1435         float32x4_t ret;                                                   \
1436         ret = vmovq_n_f32(                                                 \
1437             vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
1438         ret = vsetq_lane_f32(                                              \
1439             vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1440             ret, 1);                                                       \
1441         ret = vsetq_lane_f32(                                              \
1442             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1443             ret, 2);                                                       \
1444         ret = vsetq_lane_f32(                                              \
1445             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1446             ret, 3);                                                       \
1447         vreinterpretq_m128_f32(ret);                                       \
1448     })
1449 
1450 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1451 // int imm)
1452 #if __has_builtin(__builtin_shufflevector)
1453 #define _mm_shuffle_ps(a, b, imm)                                \
1454     __extension__({                                              \
1455         float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
1456         float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
1457         float32x4_t _shuf = __builtin_shufflevector(             \
1458             _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1459             (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1460         vreinterpretq_m128_f32(_shuf);                           \
1461     })
1462 #else  // generic
1463 #define _mm_shuffle_ps(a, b, imm)                          \
1464     __extension__({                                        \
1465         __m128 ret;                                        \
1466         switch (imm) {                                     \
1467         case _MM_SHUFFLE(1, 0, 3, 2):                      \
1468             ret = _mm_shuffle_ps_1032((a), (b));           \
1469             break;                                         \
1470         case _MM_SHUFFLE(2, 3, 0, 1):                      \
1471             ret = _mm_shuffle_ps_2301((a), (b));           \
1472             break;                                         \
1473         case _MM_SHUFFLE(0, 3, 2, 1):                      \
1474             ret = _mm_shuffle_ps_0321((a), (b));           \
1475             break;                                         \
1476         case _MM_SHUFFLE(2, 1, 0, 3):                      \
1477             ret = _mm_shuffle_ps_2103((a), (b));           \
1478             break;                                         \
1479         case _MM_SHUFFLE(1, 0, 1, 0):                      \
1480             ret = _mm_movelh_ps((a), (b));                 \
1481             break;                                         \
1482         case _MM_SHUFFLE(1, 0, 0, 1):                      \
1483             ret = _mm_shuffle_ps_1001((a), (b));           \
1484             break;                                         \
1485         case _MM_SHUFFLE(0, 1, 0, 1):                      \
1486             ret = _mm_shuffle_ps_0101((a), (b));           \
1487             break;                                         \
1488         case _MM_SHUFFLE(3, 2, 1, 0):                      \
1489             ret = _mm_shuffle_ps_3210((a), (b));           \
1490             break;                                         \
1491         case _MM_SHUFFLE(0, 0, 1, 1):                      \
1492             ret = _mm_shuffle_ps_0011((a), (b));           \
1493             break;                                         \
1494         case _MM_SHUFFLE(0, 0, 2, 2):                      \
1495             ret = _mm_shuffle_ps_0022((a), (b));           \
1496             break;                                         \
1497         case _MM_SHUFFLE(2, 2, 0, 0):                      \
1498             ret = _mm_shuffle_ps_2200((a), (b));           \
1499             break;                                         \
1500         case _MM_SHUFFLE(3, 2, 0, 2):                      \
1501             ret = _mm_shuffle_ps_3202((a), (b));           \
1502             break;                                         \
1503         case _MM_SHUFFLE(3, 2, 3, 2):                      \
1504             ret = _mm_movehl_ps((b), (a));                 \
1505             break;                                         \
1506         case _MM_SHUFFLE(1, 1, 3, 3):                      \
1507             ret = _mm_shuffle_ps_1133((a), (b));           \
1508             break;                                         \
1509         case _MM_SHUFFLE(2, 0, 1, 0):                      \
1510             ret = _mm_shuffle_ps_2010((a), (b));           \
1511             break;                                         \
1512         case _MM_SHUFFLE(2, 0, 0, 1):                      \
1513             ret = _mm_shuffle_ps_2001((a), (b));           \
1514             break;                                         \
1515         case _MM_SHUFFLE(2, 0, 3, 2):                      \
1516             ret = _mm_shuffle_ps_2032((a), (b));           \
1517             break;                                         \
1518         default:                                           \
1519             ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1520             break;                                         \
1521         }                                                  \
1522         ret;                                               \
1523     })
1524 #endif
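
// Usage sketch (illustrative; the helper name is hypothetical): selecting
// lanes with _mm_shuffle_ps and _MM_SHUFFLE. This particular selector takes
// the high half of a for the low half of the result and the low half of b
// for the high half.
FORCE_INLINE __m128 sse2neon_example_swap_halves_ps(__m128 a, __m128 b)
{
    /* dst = { a[2], a[3], b[0], b[1] } in SSE element order */
    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2));
}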
1525 
1526 // Takes the upper 64 bits of a and places them in the low end of the result;
1527 // takes the lower 64 bits of a and places them in the high end of the result.
1528 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1529 {
1530     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1531     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1532     return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1533 }
1534 
1535 // Takes the lower two 32-bit values from a, swaps them, and places them in
1536 // the low end of the result; takes the upper two 32-bit values from a, swaps
1537 // them, and places them in the high end of the result.
1538 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1539 {
1540     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1541     int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1542     return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1543 }
1544 
1545 // Rotates the least significant 32 bits into the most significant 32 bits,
1546 // and shifts the rest down.
1547 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1548 {
1549     return vreinterpretq_m128i_s32(
1550         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1551 }
1552 
1553 // Rotates the most significant 32 bits into the least significant 32 bits,
1554 // and shifts the rest up.
1555 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1556 {
1557     return vreinterpretq_m128i_s32(
1558         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1559 }
1560 
1561 // Gets the lower 64 bits of a and places them in both the upper and the
1562 // lower 64 bits of the result.
1563 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1564 {
1565     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1566     return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1567 }
1568 
1569 // Gets the lower 64 bits of a, swaps elements 0 and 1, and places them in the
1570 // lower 64 bits of the result; the unswapped low 64 bits fill the upper half.
1571 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1572 {
1573     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1574     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1575     return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1576 }
1577 
1578 // Gets the lower 64 bits of a, swaps elements 0 and 1, and places the
1579 // swapped pair in the upper 64 bits of the result; the same swapped pair
1580 // also fills the lower 64 bits.
1581 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1582 {
1583     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1584     return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1585 }
1586 
1587 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1588 {
1589     int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1590     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1591     return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1592 }
1593 
1594 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1595 {
1596     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1597     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1598     return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1599 }
1600 
1601 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1602 {
1603     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1604     int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1605     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1606 }
1607 
1608 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
1609 // corresponding 8-bit element of b, and store the results in dst.
1610 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
1611 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1612 {
1613     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
1614     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
1615     uint8x16_t idx_masked =
1616         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
1617 #if defined(__aarch64__)
1618     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1619 #elif defined(__GNUC__)
1620     int8x16_t ret;
1621     // %e and %f represent the even and odd D registers
1622     // respectively.
1623     __asm__ __volatile__(
1624         "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1625         "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1626         : [ret] "=&w"(ret)
1627         : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1628     return vreinterpretq_m128i_s8(ret);
1629 #else
1630     // Fallback: split the table into two D registers and use two VTBL2 lookups
1631     int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1632     return vreinterpretq_m128i_s8(
1633         vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1634                     vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1635 #endif
1636 }
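
// Usage sketch (illustrative; the helper name is hypothetical): reversing the
// byte order of a vector with _mm_shuffle_epi8. The index vector is built with
// plain NEON so the sketch stays self-contained.
FORCE_INLINE __m128i sse2neon_example_reverse_bytes(__m128i v)
{
    static const int8_t rev_idx[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                       7,  6,  5,  4,  3,  2,  1, 0};
    return _mm_shuffle_epi8(v, vreinterpretq_m128i_s8(vld1q_s8(rev_idx)));
}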
1637 
1638 // C equivalent:
1639 //   __m128i _mm_shuffle_epi32_default(__m128i a,
1640 //                                     __constrange(0, 255) int imm) {
1641 //       __m128i ret;
1642 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
1643 //       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
1644 //       return ret;
1645 //   }
1646 #define _mm_shuffle_epi32_default(a, imm)                                   \
1647     __extension__({                                                         \
1648         int32x4_t ret;                                                      \
1649         ret = vmovq_n_s32(                                                  \
1650             vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
1651         ret = vsetq_lane_s32(                                               \
1652             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
1653             ret, 1);                                                        \
1654         ret = vsetq_lane_s32(                                               \
1655             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1656             ret, 2);                                                        \
1657         ret = vsetq_lane_s32(                                               \
1658             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1659             ret, 3);                                                        \
1660         vreinterpretq_m128i_s32(ret);                                       \
1661     })
1662 
1663 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1664 // int imm)
1665 #if defined(__aarch64__)
1666 #define _mm_shuffle_epi32_splat(a, imm)                          \
1667     __extension__({                                              \
1668         vreinterpretq_m128i_s32(                                 \
1669             vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1670     })
1671 #else
1672 #define _mm_shuffle_epi32_splat(a, imm)                                      \
1673     __extension__({                                                          \
1674         vreinterpretq_m128i_s32(                                             \
1675             vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1676     })
1677 #endif
1678 
1679 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
1680 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
1681 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
1682 //                                        __constrange(0,255) int imm)
1683 #if __has_builtin(__builtin_shufflevector)
1684 #define _mm_shuffle_epi32(a, imm)                              \
1685     __extension__({                                            \
1686         int32x4_t _input = vreinterpretq_s32_m128i(a);         \
1687         int32x4_t _shuf = __builtin_shufflevector(             \
1688             _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1689             ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
1690         vreinterpretq_m128i_s32(_shuf);                        \
1691     })
1692 #else  // generic
1693 #define _mm_shuffle_epi32(a, imm)                        \
1694     __extension__({                                      \
1695         __m128i ret;                                     \
1696         switch (imm) {                                   \
1697         case _MM_SHUFFLE(1, 0, 3, 2):                    \
1698             ret = _mm_shuffle_epi_1032((a));             \
1699             break;                                       \
1700         case _MM_SHUFFLE(2, 3, 0, 1):                    \
1701             ret = _mm_shuffle_epi_2301((a));             \
1702             break;                                       \
1703         case _MM_SHUFFLE(0, 3, 2, 1):                    \
1704             ret = _mm_shuffle_epi_0321((a));             \
1705             break;                                       \
1706         case _MM_SHUFFLE(2, 1, 0, 3):                    \
1707             ret = _mm_shuffle_epi_2103((a));             \
1708             break;                                       \
1709         case _MM_SHUFFLE(1, 0, 1, 0):                    \
1710             ret = _mm_shuffle_epi_1010((a));             \
1711             break;                                       \
1712         case _MM_SHUFFLE(1, 0, 0, 1):                    \
1713             ret = _mm_shuffle_epi_1001((a));             \
1714             break;                                       \
1715         case _MM_SHUFFLE(0, 1, 0, 1):                    \
1716             ret = _mm_shuffle_epi_0101((a));             \
1717             break;                                       \
1718         case _MM_SHUFFLE(2, 2, 1, 1):                    \
1719             ret = _mm_shuffle_epi_2211((a));             \
1720             break;                                       \
1721         case _MM_SHUFFLE(0, 1, 2, 2):                    \
1722             ret = _mm_shuffle_epi_0122((a));             \
1723             break;                                       \
1724         case _MM_SHUFFLE(3, 3, 3, 2):                    \
1725             ret = _mm_shuffle_epi_3332((a));             \
1726             break;                                       \
1727         case _MM_SHUFFLE(0, 0, 0, 0):                    \
1728             ret = _mm_shuffle_epi32_splat((a), 0);       \
1729             break;                                       \
1730         case _MM_SHUFFLE(1, 1, 1, 1):                    \
1731             ret = _mm_shuffle_epi32_splat((a), 1);       \
1732             break;                                       \
1733         case _MM_SHUFFLE(2, 2, 2, 2):                    \
1734             ret = _mm_shuffle_epi32_splat((a), 2);       \
1735             break;                                       \
1736         case _MM_SHUFFLE(3, 3, 3, 3):                    \
1737             ret = _mm_shuffle_epi32_splat((a), 3);       \
1738             break;                                       \
1739         default:                                         \
1740             ret = _mm_shuffle_epi32_default((a), (imm)); \
1741             break;                                       \
1742         }                                                \
1743         ret;                                             \
1744     })
1745 #endif
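
// Usage sketch (illustrative; the helper name is hypothetical): broadcasting
// lane 0 to all four 32-bit positions with _mm_shuffle_epi32, which maps to
// the splat case above.
FORCE_INLINE __m128i sse2neon_example_broadcast_lane0_epi32(__m128i a)
{
    return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0));
}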
1746 
1747 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
1748 // by imm.
1749 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
1750 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
1751 //                                                   __constrange(0,255) int
1752 //                                                   imm)
1753 #define _mm_shufflelo_epi16_function(a, imm)                                  \
1754     __extension__({                                                           \
1755         int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
1756         int16x4_t lowBits = vget_low_s16(ret);                                \
1757         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
1758         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1759                              1);                                              \
1760         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1761                              2);                                              \
1762         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1763                              3);                                              \
1764         vreinterpretq_m128i_s16(ret);                                         \
1765     })
1766 
1767 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
1768 //                                          __constrange(0,255) int imm)
1769 #if __has_builtin(__builtin_shufflevector)
1770 #define _mm_shufflelo_epi16(a, imm)                                  \
1771     __extension__({                                                  \
1772         int16x8_t _input = vreinterpretq_s16_m128i(a);               \
1773         int16x8_t _shuf = __builtin_shufflevector(                   \
1774             _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
1775             (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
1776         vreinterpretq_m128i_s16(_shuf);                              \
1777     })
1778 #else  // generic
1779 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
1780 #endif
1781 
1782 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1783 // by imm.
1784 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1785 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1786 //                                                   __constrange(0,255) int
1787 //                                                   imm)
1788 #define _mm_shufflehi_epi16_function(a, imm)                                   \
1789     __extension__({                                                            \
1790         int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
1791         int16x4_t highBits = vget_high_s16(ret);                               \
1792         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
1793         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1794                              5);                                               \
1795         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1796                              6);                                               \
1797         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1798                              7);                                               \
1799         vreinterpretq_m128i_s16(ret);                                          \
1800     })
1801 
1802 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
1803 //                                          __constrange(0,255) int imm)
1804 #if __has_builtin(__builtin_shufflevector)
1805 #define _mm_shufflehi_epi16(a, imm)                             \
1806     __extension__({                                             \
1807         int16x8_t _input = vreinterpretq_s16_m128i(a);          \
1808         int16x8_t _shuf = __builtin_shufflevector(              \
1809             _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
1810             (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
1811             (((imm) >> 6) & 0x3) + 4);                          \
1812         vreinterpretq_m128i_s16(_shuf);                         \
1813     })
1814 #else  // generic
1815 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
1816 #endif
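
// Usage sketch (illustrative; the helper name is hypothetical): combining the
// low and high 16-bit shuffles to reverse the four elements within each
// 64-bit half of the vector.
FORCE_INLINE __m128i sse2neon_example_reverse_epi16_in_halves(__m128i a)
{
    __m128i lo = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
    return _mm_shufflehi_epi16(lo, _MM_SHUFFLE(0, 1, 2, 3));
}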
1817 
1818 // Blend packed 16-bit integers from a and b using control mask imm8, and store
1819 // the results in dst.
1820 //
1821 //   FOR j := 0 to 7
1822 //       i := j*16
1823 //       IF imm8[j]
1824 //           dst[i+15:i] := b[i+15:i]
1825 //       ELSE
1826 //           dst[i+15:i] := a[i+15:i]
1827 //       FI
1828 //   ENDFOR
1829 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
1830 //                                      __constrange(0,255) int imm)
1831 #define _mm_blend_epi16(a, b, imm)                                        \
1832     __extension__({                                                       \
1833         const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
1834                                    ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
1835                                    ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
1836                                    ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
1837                                    ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
1838                                    ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
1839                                    ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
1840                                    ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
1841         uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
1842         uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
1843         uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
1844         vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
1845     })
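
// Usage sketch (illustrative; the helper name is hypothetical): with the
// constant mask 0xAA, _mm_blend_epi16 picks the odd-indexed 16-bit elements
// from b and the even-indexed ones from a.
FORCE_INLINE __m128i sse2neon_example_blend_odd_epi16(__m128i a, __m128i b)
{
    return _mm_blend_epi16(a, b, 0xAA);
}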
1846 
1847 // Blend packed 8-bit integers from a and b using mask, and store the results in
1848 // dst.
1849 //
1850 //   FOR j := 0 to 15
1851 //       i := j*8
1852 //       IF mask[i+7]
1853 //           dst[i+7:i] := b[i+7:i]
1854 //       ELSE
1855 //           dst[i+7:i] := a[i+7:i]
1856 //       FI
1857 //   ENDFOR
1858 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1859 {
1860     // Use a signed shift right to create a mask with the sign bit
1861     uint8x16_t mask =
1862         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1863     uint8x16_t a = vreinterpretq_u8_m128i(_a);
1864     uint8x16_t b = vreinterpretq_u8_m128i(_b);
1865     return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1866 }
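
// Usage sketch (illustrative; the helper name is hypothetical): driving
// _mm_blendv_epi8 with a NEON byte comparison to keep the per-byte unsigned
// maximum of a and b.
FORCE_INLINE __m128i sse2neon_example_max_epu8(__m128i a, __m128i b)
{
    /* gt has 0xFF (sign bit set) exactly where b > a, so those bytes come
     * from b and every other byte comes from a. */
    uint8x16_t gt =
        vcgtq_u8(vreinterpretq_u8_m128i(b), vreinterpretq_u8_m128i(a));
    return _mm_blendv_epi8(a, b, vreinterpretq_m128i_u8(gt));
}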
1867 
1868 /* Shifts */
1869 
1870 
1871 // Shift packed 16-bit integers in a right by imm while shifting in sign
1872 // bits, and store the results in dst.
1873 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
1874 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1875 {
1876     const int count = (imm & ~15) ? 15 : imm;
1877     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1878 }
1879 
1880 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1881 // shifting in zeros.
1882 //
1883 //   r0 := a0 << count
1884 //   r1 := a1 << count
1885 //   ...
1886 //   r7 := a7 << count
1887 //
1888 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
1889 #define _mm_slli_epi16(a, imm)                                   \
1890     __extension__({                                              \
1891         __m128i ret;                                             \
1892         if ((imm) <= 0) {                                        \
1893             ret = a;                                             \
1894         } else if ((imm) > 15) {                                 \
1895             ret = _mm_setzero_si128();                           \
1896         } else {                                                 \
1897             ret = vreinterpretq_m128i_s16(                       \
1898                 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
1899         }                                                        \
1900         ret;                                                     \
1901     })
1902 
1903 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1904 // shifting in zeros.
1905 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
1906 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
1907 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
1908 {
1909     if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1910         return a;
1911     if (imm > 31) /* TODO: add unlikely macro */
1912         return _mm_setzero_si128();
1913     return vreinterpretq_m128i_s32(
1914         vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
1915 }
1916 
1917 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
1918 // store the results in dst.
1919 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
1920 {
1921     if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1922         return a;
1923     if (imm > 63) /* TODO: add unlikely macro */
1924         return _mm_setzero_si128();
1925     return vreinterpretq_m128i_s64(
1926         vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
1927 }
1928 
1929 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
1930 // store the results in dst.
1931 //
1932 //   FOR j := 0 to 7
1933 //     i := j*16
1934 //     IF imm8[7:0] > 15
1935 //       dst[i+15:i] := 0
1936 //     ELSE
1937 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
1938 //     FI
1939 //   ENDFOR
1940 //
1941 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
1942 #define _mm_srli_epi16(a, imm)                                             \
1943     __extension__({                                                        \
1944         __m128i ret;                                                       \
1945         if ((imm) == 0) {                                                  \
1946             ret = a;                                                       \
1947         } else if (0 < (imm) && (imm) < 16) {                              \
1948             ret = vreinterpretq_m128i_u16(                                 \
1949                 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
1950         } else {                                                           \
1951             ret = _mm_setzero_si128();                                     \
1952         }                                                                  \
1953         ret;                                                               \
1954     })
1955 
1956 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
1957 // store the results in dst.
1958 //
1959 //   FOR j := 0 to 3
1960 //     i := j*32
1961 //     IF imm8[7:0] > 31
1962 //       dst[i+31:i] := 0
1963 //     ELSE
1964 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
1965 //     FI
1966 //   ENDFOR
1967 //
1968 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
1969 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
1970 #define _mm_srli_epi32(a, imm)                                             \
1971     __extension__({                                                        \
1972         __m128i ret;                                                       \
1973         if ((imm) == 0) {                                                  \
1974             ret = a;                                                       \
1975         } else if (0 < (imm) && (imm) < 32) {                              \
1976             ret = vreinterpretq_m128i_u32(                                 \
1977                 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
1978         } else {                                                           \
1979             ret = _mm_setzero_si128();                                     \
1980         }                                                                  \
1981         ret;                                                               \
1982     })
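
// Usage sketch (illustrative; the helper name is hypothetical): a logical
// right shift by a constant is an unsigned division by a power of two, here
// dividing every 32-bit lane by 16.
FORCE_INLINE __m128i sse2neon_example_div_by_16_epu32(__m128i a)
{
    return _mm_srli_epi32(a, 4);
}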
1983 
1984 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
1985 // store the results in dst.
1986 //
1987 //   FOR j := 0 to 1
1988 //     i := j*64
1989 //     IF imm8[7:0] > 63
1990 //       dst[i+63:i] := 0
1991 //     ELSE
1992 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
1993 //     FI
1994 //   ENDFOR
1995 //
1996 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
1997 #define _mm_srli_epi64(a, imm)                                             \
1998     __extension__({                                                        \
1999         __m128i ret;                                                       \
2000         if ((imm) == 0) {                                                  \
2001             ret = a;                                                       \
2002         } else if (0 < (imm) && (imm) < 64) {                              \
2003             ret = vreinterpretq_m128i_u64(                                 \
2004                 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
2005         } else {                                                           \
2006             ret = _mm_setzero_si128();                                     \
2007         }                                                                  \
2008         ret;                                                               \
2009     })
2010 
2011 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2012 // and store the results in dst.
2013 //
2014 //   FOR j := 0 to 3
2015 //     i := j*32
2016 //     IF imm8[7:0] > 31
2017 //       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2018 //     ELSE
2019 //       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2020 //     FI
2021 //   ENDFOR
2022 //
2023 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2024 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
2025 #define _mm_srai_epi32(a, imm)                                             \
2026     __extension__({                                                        \
2027         __m128i ret;                                                       \
2028         if ((imm) == 0) {                                                  \
2029             ret = a;                                                       \
2030         } else if (0 < (imm) && (imm) < 32) {                              \
2031             ret = vreinterpretq_m128i_s32(                                 \
2032                 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
2033         } else {                                                           \
2034             ret = vreinterpretq_m128i_s32(                                 \
2035                 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
2036         }                                                                  \
2037         ret;                                                               \
2038     })
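
// Usage sketch (illustrative; the helper name is hypothetical): an arithmetic
// shift right by 31 turns each 32-bit lane into 0 or -1 depending on its
// sign, which is handy as a per-lane select mask.
FORCE_INLINE __m128i sse2neon_example_sign_mask_epi32(__m128i a)
{
    return _mm_srai_epi32(a, 31);
}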
2039 
2040 // Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
2041 // imm must be an immediate.
2042 //
2043 //   r := srl(a, imm*8)
2044 //
2045 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2046 // FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
2047 #define _mm_srli_si128(a, imm)                                              \
2048     __extension__({                                                         \
2049         __m128i ret;                                                        \
2050         if ((imm) <= 0) {                                                   \
2051             ret = a;                                                        \
2052         } else if ((imm) > 15) {                                            \
2053             ret = _mm_setzero_si128();                                      \
2054         } else {                                                            \
2055             ret = vreinterpretq_m128i_s8(                                   \
2056                 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
2057         }                                                                   \
2058         ret;                                                                \
2059     })
2060 
2061 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2062 // must be an immediate.
2063 //
2064 //   r := a << (imm * 8)
2065 //
2066 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2067 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
2068 #define _mm_slli_si128(a, imm)                                          \
2069     __extension__({                                                     \
2070         __m128i ret;                                                    \
2071         if ((imm) <= 0) {                                               \
2072             ret = a;                                                    \
2073         } else if ((imm) > 15) {                                        \
2074             ret = _mm_setzero_si128();                                  \
2075         } else {                                                        \
2076             ret = vreinterpretq_m128i_s8(vextq_s8(                      \
2077                 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
2078         }                                                               \
2079         ret;                                                            \
2080     })
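
// Usage sketch (illustrative; the helper name and the fixed 4-byte offset are
// hypothetical): ORing a right byte-shift of lo with a left byte-shift of hi
// emulates a concatenate-and-extract, yielding bytes { lo[4..15], hi[0..3] }.
FORCE_INLINE __m128i sse2neon_example_align_by_4(__m128i hi, __m128i lo)
{
    __m128i lo_part = _mm_srli_si128(lo, 4);
    __m128i hi_part = _mm_slli_si128(hi, 12);
    return vreinterpretq_m128i_s8(vorrq_s8(vreinterpretq_s8_m128i(lo_part),
                                           vreinterpretq_s8_m128i(hi_part)));
}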
2081 
2082 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2083 // shifting in zeros.
2084 //
2085 //   r0 := a0 << count
2086 //   r1 := a1 << count
2087 //   ...
2088 //   r7 := a7 << count
2089 //
2090 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
2091 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2092 {
2093     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2094     if (c > 15)
2095         return _mm_setzero_si128();
2096 
2097     int16x8_t vc = vdupq_n_s16((int16_t) c);
2098     return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2099 }
2100 
2101 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2102 // shifting in zeros.
2103 //
2104 // r0 := a0 << count
2105 // r1 := a1 << count
2106 // r2 := a2 << count
2107 // r3 := a3 << count
2108 //
2109 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
2110 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2111 {
2112     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2113     if (c > 31)
2114         return _mm_setzero_si128();
2115 
2116     int32x4_t vc = vdupq_n_s32((int32_t) c);
2117     return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2118 }
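
// Usage sketch (illustrative; the helper name is hypothetical): the shift
// count of _mm_sll_epi32 is taken from the low 64 bits of its second
// argument, built here with plain NEON. n is expected to be non-negative.
FORCE_INLINE __m128i sse2neon_example_shift_left_epi32_by(__m128i a, int n)
{
    __m128i count = vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) n));
    return _mm_sll_epi32(a, count);
}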
2119 
2120 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2121 // shifting in zeros.
2122 //
2123 // r0 := a0 << count
2124 // r1 := a1 << count
2125 //
2126 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
2127 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2128 {
2129     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2130     if (c > 63)
2131         return _mm_setzero_si128();
2132 
2133     int64x2_t vc = vdupq_n_s64((int64_t) c);
2134     return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2135 }
2136 
2137 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2138 // while shifting in zeros.
2139 //
2140 // r0 := srl(a0, count)
2141 // r1 := srl(a1, count)
2142 // ...
2143 // r7 := srl(a7, count)
2144 //
2145 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
2146 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2147 {
2148     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2149     if (c > 15)
2150         return _mm_setzero_si128();
2151 
2152     int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2153     return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2154 }
2155 
2156 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2157 // while shifting in zeros.
2158 //
2159 // r0 := srl(a0, count)
2160 // r1 := srl(a1, count)
2161 // r2 := srl(a2, count)
2162 // r3 := srl(a3, count)
2163 //
2164 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
2165 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2166 {
2167     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2168     if (c > 31)
2169         return _mm_setzero_si128();
2170 
2171     int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2172     return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2173 }
2174 
2175 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2176 // while shifting in zeros.
2177 //
2178 // r0 := srl(a0, count)
2179 // r1 := srl(a1, count)
2180 //
2181 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
2182 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2183 {
2184     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2185     if (c > 63)
2186         return _mm_setzero_si128();
2187 
2188     int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2189     return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2190 }
2191 
2192 // NEON does not provide a version of this function.
2193 // Creates a 16-bit mask from the most significant bits of the 16 signed or
2194 // unsigned 8-bit integers in a and zero extends the upper bits.
2195 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
2196 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2197 {
2198 #if defined(__aarch64__)
2199     uint8x16_t input = vreinterpretq_u8_m128i(a);
2200     const int8_t ALIGN_STRUCT(16)
2201         xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2202     const uint8x16_t mask_and = vdupq_n_u8(0x80);
2203     const int8x16_t mask_shift = vld1q_s8(xr);
2204     const uint8x16_t mask_result =
2205         vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2206     uint8x8_t lo = vget_low_u8(mask_result);
2207     uint8x8_t hi = vget_high_u8(mask_result);
2208 
2209     return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2210 #else
2211     // Use increasingly wide shifts+adds to collect the sign bits
2212     // together.
2213     // Since the widening shifts would be rather confusing to follow in little
2214     // endian, everything will be illustrated in big endian order instead. This
2215     // has a different result - the bits would actually be reversed on a big
2216     // endian machine.
2217 
2218     // Starting input (only half the elements are shown):
2219     // 89 ff 1d c0 00 10 99 33
2220     uint8x16_t input = vreinterpretq_u8_m128i(a);
2221 
2222     // Shift out everything but the sign bits with an unsigned shift right.
2223     //
2224     // Bytes of the vector::
2225     // 89 ff 1d c0 00 10 99 33
2226     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
2227     //  |  |  |  |  |  |  |  |
2228     // 01 01 00 01 00 00 01 00
2229     //
2230     // Bits of first important lane(s):
2231     // 10001001 (89)
2232     // \______
2233     //        |
2234     // 00000001 (01)
2235     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2236 
2237     // Merge the even lanes together with a 16-bit unsigned shift right + add.
2238     // 'xx' represents garbage data which will be ignored in the final result.
2239     // In the important bytes, the add functions like a binary OR.
2240     //
2241     // 01 01 00 01 00 00 01 00
2242     //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
2243     //    \|    \|    \|    \|
2244     // xx 03 xx 01 xx 00 xx 02
2245     //
2246     // 00000001 00000001 (01 01)
2247     //        \_______ |
2248     //                \|
2249     // xxxxxxxx xxxxxx11 (xx 03)
2250     uint32x4_t paired16 =
2251         vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2252 
2253     // Repeat with a wider 32-bit shift + add.
2254     // xx 03 xx 01 xx 00 xx 02
2255     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
2256     //     14))
2257     //          \|          \|
2258     // xx xx xx 0d xx xx xx 02
2259     //
2260     // 00000011 00000001 (03 01)
2261     //        \\_____ ||
2262     //         '----.\||
2263     // xxxxxxxx xxxx1101 (xx 0d)
2264     uint64x2_t paired32 =
2265         vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2266 
2267     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2268     // lanes. xx xx xx 0d xx xx xx 02
2269     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
2270     //            28))
2271     //                      \|
2272     // xx xx xx xx xx xx xx d2
2273     //
2274     // 00001101 00000010 (0d 02)
2275     //     \   \___ |  |
2276     //      '---.  \|  |
2277     // xxxxxxxx 11010010 (xx d2)
2278     uint8x16_t paired64 =
2279         vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2280 
2281     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2282     // xx xx xx xx xx xx xx d2
2283     //                      ||  return paired64[0]
2284     //                      d2
2285     // Note: Little endian would return the correct value 4b (01001011) instead.
2286     return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2287 #endif
2288 }
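
// Usage sketch (illustrative; the helper name is hypothetical, and
// __builtin_ctz assumes GCC/Clang): a typical use of _mm_movemask_epi8 is
// turning a byte comparison into a bitmask and locating the first match.
FORCE_INLINE int sse2neon_example_find_first_equal_byte(__m128i a, __m128i b)
{
    uint8x16_t eq =
        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
    int mask = _mm_movemask_epi8(vreinterpretq_m128i_u8(eq));
    return mask ? __builtin_ctz(mask) : -1; /* byte index, or -1 if none */
}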
2289 
2290 // Copy the lower 64-bit integer in a to dst.
2291 //
2292 //   dst[63:0] := a[63:0]
2293 //
2294 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
2295 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2296 {
2297     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2298 }
2299 
2300 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2301 // element.
2302 //
2303 //   dst[63:0] := a[63:0]
2304 //   dst[127:64] := 0
2305 //
2306 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
2307 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2308 {
2309     return vreinterpretq_m128i_s64(
2310         vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2311 }
2312 
2313 // NEON does not provide this method
2314 // Creates a 4-bit mask from the most significant bits of the four
2315 // single-precision, floating-point values.
2316 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2317 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2318 {
2319     uint32x4_t input = vreinterpretq_u32_m128(a);
2320 #if defined(__aarch64__)
2321     static const int32x4_t shift = {0, 1, 2, 3};
2322     uint32x4_t tmp = vshrq_n_u32(input, 31);
2323     return vaddvq_u32(vshlq_u32(tmp, shift));
2324 #else
2325     // Uses the exact same method as _mm_movemask_epi8, see that for details.
2326     // Shift out everything but the sign bits with a 32-bit unsigned shift
2327     // right.
2328     uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2329     // Merge the two pairs together with a 64-bit unsigned shift right + add.
2330     uint8x16_t paired =
2331         vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2332     // Extract the result.
2333     return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2334 #endif
2335 }
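
// Usage sketch (illustrative; the helper name is hypothetical): checking that
// all four lanes of a comparison result are true by testing for the full
// 4-bit mask.
FORCE_INLINE int sse2neon_example_all_lanes_true(__m128 cmp)
{
    return _mm_movemask_ps(cmp) == 0xF;
}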
2336 
2337 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2338 // all 1's, and return 1 if the result is zero, otherwise return 0.
2339 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
2340 FORCE_INLINE int _mm_test_all_ones(__m128i a)
2341 {
2342     return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2343            ~(uint64_t) 0;
2344 }
2345 
2346 // Compute the bitwise AND of 128 bits (representing integer data) in a and
2347 // mask, and return 1 if the result is zero, otherwise return 0.
2348 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
2349 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2350 {
2351     int64x2_t a_and_mask =
2352         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2353     return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2354                                                                            : 1;
2355 }
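
// Usage sketch (illustrative; the helper name is hypothetical): testing two
// vectors for exact equality by checking that their XOR (computed with plain
// NEON) is all zero.
FORCE_INLINE int sse2neon_example_vectors_equal(__m128i a, __m128i b)
{
    __m128i diff = vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
    return _mm_test_all_zeros(diff, diff);
}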
2356 
2357 /* Math operations */
2358 
2359 // Subtracts the four single-precision, floating-point values of a and b.
2360 //
2361 //   r0 := a0 - b0
2362 //   r1 := a1 - b1
2363 //   r2 := a2 - b2
2364 //   r3 := a3 - b3
2365 //
2366 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2367 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2368 {
2369     return vreinterpretq_m128_f32(
2370         vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2371 }
2372 
2373 // Subtract the lower single-precision (32-bit) floating-point element in b from
2374 // the lower single-precision (32-bit) floating-point element in a, store the
2375 // result in the lower element of dst, and copy the upper 3 packed elements from
2376 // a to the upper elements of dst.
2377 //
2378 //   dst[31:0] := a[31:0] - b[31:0]
2379 //   dst[127:32] := a[127:32]
2380 //
2381 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2382 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2383 {
2384     return _mm_move_ss(a, _mm_sub_ps(a, b));
2385 }
2386 
2387 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2388 // and store the results in dst.
2389 //    r0 := a0 - b0
2390 //    r1 := a1 - b1
2391 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2392 {
2393     return vreinterpretq_m128i_s64(
2394         vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2395 }
2396 
2397 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2398 // unsigned 32-bit integers of a.
2399 //
2400 //   r0 := a0 - b0
2401 //   r1 := a1 - b1
2402 //   r2 := a2 - b2
2403 //   r3 := a3 - b3
2404 //
2405 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
2406 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2407 {
2408     return vreinterpretq_m128i_s32(
2409         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2410 }
2411 
2412 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2413 {
2414     return vreinterpretq_m128i_s16(
2415         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2416 }
2417 
2418 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2419 {
2420     return vreinterpretq_m128i_s8(
2421         vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2422 }
2423 
2424 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2425 //
2426 //   dst[63:0] := a[63:0] - b[63:0]
2427 //
2428 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
2429 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2430 {
2431     return vreinterpret_m64_s64(
2432         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2433 }
2434 
2435 // Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
2436 // integers of a and saturates.
2437 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
2438 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2439 {
2440     return vreinterpretq_m128i_u16(
2441         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2442 }
2443 
2444 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2445 // integers of a and saturates.
2446 //
2447 //   r0 := UnsignedSaturate(a0 - b0)
2448 //   r1 := UnsignedSaturate(a1 - b1)
2449 //   ...
2450 //   r15 := UnsignedSaturate(a15 - b15)
2451 //
2452 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
2453 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2454 {
2455     return vreinterpretq_m128i_u8(
2456         vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2457 }
2458 
2459 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2460 // of a and saturates.
2461 //
2462 //   r0 := SignedSaturate(a0 - b0)
2463 //   r1 := SignedSaturate(a1 - b1)
2464 //   ...
2465 //   r15 := SignedSaturate(a15 - b15)
2466 //
2467 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
2468 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2469 {
2470     return vreinterpretq_m128i_s8(
2471         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2472 }
2473 
2474 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2475 // of a and saturates.
2476 //
2477 //   r0 := SignedSaturate(a0 - b0)
2478 //   r1 := SignedSaturate(a1 - b1)
2479 //   ...
2480 //   r7 := SignedSaturate(a7 - b7)
2481 //
2482 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
2483 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2484 {
2485     return vreinterpretq_m128i_s16(
2486         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2487 }
2488 
2489 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2490 {
2491     return vreinterpretq_m128i_u16(
2492         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2493 }
2494 
2495 // Negate packed 8-bit integers in a when the corresponding signed
2496 // 8-bit integer in b is negative, and store the results in dst.
2497 // Elements in dst are zeroed out when the corresponding element
2498 // in b is zero.
2499 //
2500 //   for i in 0..15
2501 //     if b[i] < 0
2502 //       r[i] := -a[i]
2503 //     else if b[i] == 0
2504 //       r[i] := 0
2505 //     else
2506 //       r[i] := a[i]
2507 //     fi
2508 //   done
2509 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2510 {
2511     int8x16_t a = vreinterpretq_s8_m128i(_a);
2512     int8x16_t b = vreinterpretq_s8_m128i(_b);
2513 
2514     // signed shift right: faster than vclt
2515     // (b < 0) ? 0xFF : 0
2516     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2517 
2518     // (b == 0) ? 0xFF : 0
2519 #if defined(__aarch64__)
2520     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2521 #else
2522     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2523 #endif
2524 
2525     // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
2526     // based on ltMask
2527     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2528     // res = masked & (~zeroMask)
2529     int8x16_t res = vbicq_s8(masked, zeroMask);
2530 
2531     return vreinterpretq_m128i_s8(res);
2532 }
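// Worked example of the mask trick above (illustrative sketch, single lane):
// for a lane with a = 5 and b = -3,
//   ltMask   = b >> 7 (arithmetic)        = 0xFF
//   zeroMask = (b == 0) ? 0xFF : 0x00     = 0x00
//   masked   = bsl(ltMask, -a, a)         = -5
//   res      = masked & ~zeroMask         = -5
// and for b = 0 the zeroMask clears the lane, matching x86 PSIGNB.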
2533 
2534 // Negate packed 16-bit integers in a when the corresponding signed
2535 // 16-bit integer in b is negative, and store the results in dst.
2536 // Elements in dst are zeroed out when the corresponding element
2537 // in b is zero.
2538 //
2539 //   for i in 0..7
2540 //     if b[i] < 0
2541 //       r[i] := -a[i]
2542 //     else if b[i] == 0
2543 //       r[i] := 0
2544 //     else
2545 //       r[i] := a[i]
2546 //     fi
2547 //   done
2548 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2549 {
2550     int16x8_t a = vreinterpretq_s16_m128i(_a);
2551     int16x8_t b = vreinterpretq_s16_m128i(_b);
2552 
2553     // signed shift right: faster than vclt
2554     // (b < 0) ? 0xFFFF : 0
2555     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2556     // (b == 0) ? 0xFFFF : 0
2557 #if defined(__aarch64__)
2558     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2559 #else
2560     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2561 #endif
2562 
2563     // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
2564     // 'a') based on ltMask
2565     int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2566     // res = masked & (~zeroMask)
2567     int16x8_t res = vbicq_s16(masked, zeroMask);
2568     return vreinterpretq_m128i_s16(res);
2569 }
2570 
2571 // Negate packed 32-bit integers in a when the corresponding signed
2572 // 32-bit integer in b is negative, and store the results in dst.
2573 // Elements in dst are zeroed out when the corresponding element
2574 // in b is zero.
2575 //
2576 //   for i in 0..3
2577 //     if b[i] < 0
2578 //       r[i] := -a[i]
2579 //     else if b[i] == 0
2580 //       r[i] := 0
2581 //     else
2582 //       r[i] := a[i]
2583 //     fi
2584 //   done
2585 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2586 {
2587     int32x4_t a = vreinterpretq_s32_m128i(_a);
2588     int32x4_t b = vreinterpretq_s32_m128i(_b);
2589 
2590     // signed shift right: faster than vclt
2591     // (b < 0) ? 0xFFFFFFFF : 0
2592     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2593 
2594     // (b == 0) ? 0xFFFFFFFF : 0
2595 #if defined(__aarch64__)
2596     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2597 #else
2598     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2599 #endif
2600 
2601     // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
2602     // 'a') based on ltMask
2603     int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2604     // res = masked & (~zeroMask)
2605     int32x4_t res = vbicq_s32(masked, zeroMask);
2606     return vreinterpretq_m128i_s32(res);
2607 }
2608 
2609 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
2610 // integer in b is negative, and store the results in dst. Elements in dst are
2611 // zeroed out when the corresponding element in b is zero.
2612 //
2613 //   FOR j := 0 to 3
2614 //      i := j*16
2615 //      IF b[i+15:i] < 0
2616 //        dst[i+15:i] := -(a[i+15:i])
2617 //      ELSE IF b[i+15:i] == 0
2618 //        dst[i+15:i] := 0
2619 //      ELSE
2620 //        dst[i+15:i] := a[i+15:i]
2621 //      FI
2622 //   ENDFOR
2623 //
2624 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
2625 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2626 {
2627     int16x4_t a = vreinterpret_s16_m64(_a);
2628     int16x4_t b = vreinterpret_s16_m64(_b);
2629 
2630     // signed shift right: faster than vclt
2631     // (b < 0) ? 0xFFFF : 0
2632     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2633 
2634     // (b == 0) ? 0xFFFF : 0
2635 #if defined(__aarch64__)
2636     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2637 #else
2638     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2639 #endif
2640 
2641     // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
2642     // based on ltMask
2643     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2644     // res = masked & (~zeroMask)
2645     int16x4_t res = vbic_s16(masked, zeroMask);
2646 
2647     return vreinterpret_m64_s16(res);
2648 }
2649 
2650 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
2651 // integer in b is negative, and store the results in dst. Elements in dst are
2652 // zeroed out when the corresponding element in b is zero.
2653 //
2654 //   FOR j := 0 to 1
2655 //      i := j*32
2656 //      IF b[i+31:i] < 0
2657 //        dst[i+31:i] := -(a[i+31:i])
2658 //      ELSE IF b[i+31:i] == 0
2659 //        dst[i+31:i] := 0
2660 //      ELSE
2661 //        dst[i+31:i] := a[i+31:i]
2662 //      FI
2663 //   ENDFOR
2664 //
2665 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
2666 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2667 {
2668     int32x2_t a = vreinterpret_s32_m64(_a);
2669     int32x2_t b = vreinterpret_s32_m64(_b);
2670 
2671     // signed shift right: faster than vclt
2672     // (b < 0) ? 0xFFFFFFFF : 0
2673     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2674 
2675     // (b == 0) ? 0xFFFFFFFF : 0
2676 #if defined(__aarch64__)
2677     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2678 #else
2679     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2680 #endif
2681 
2682     // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
2683     // based on ltMask
2684     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2685     // res = masked & (~zeroMask)
2686     int32x2_t res = vbic_s32(masked, zeroMask);
2687 
2688     return vreinterpret_m64_s32(res);
2689 }
2690 
2691 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
2692 // in b is negative, and store the results in dst. Elements in dst are zeroed out
2693 // when the corresponding element in b is zero.
2694 //
2695 //   FOR j := 0 to 7
2696 //      i := j*8
2697 //      IF b[i+7:i] < 0
2698 //        dst[i+7:i] := -(a[i+7:i])
2699 //      ELSE IF b[i+7:i] == 0
2700 //        dst[i+7:i] := 0
2701 //      ELSE
2702 //        dst[i+7:i] := a[i+7:i]
2703 //      FI
2704 //   ENDFOR
2705 //
2706 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
2707 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2708 {
2709     int8x8_t a = vreinterpret_s8_m64(_a);
2710     int8x8_t b = vreinterpret_s8_m64(_b);
2711 
2712     // signed shift right: faster than vclt
2713     // (b < 0) ? 0xFF : 0
2714     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2715 
2716     // (b == 0) ? 0xFF : 0
2717 #if defined(__aarch64__)
2718     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2719 #else
2720     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2721 #endif
2722 
2723     // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
2724     // based on ltMask
2725     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2726     // res = masked & (~zeroMask)
2727     int8x8_t res = vbic_s8(masked, zeroMask);
2728 
2729     return vreinterpret_m64_s8(res);
2730 }
2731 
2732 // Average packed unsigned 16-bit integers in a and b, and store the results in
2733 // dst.
2734 //
2735 //   FOR j := 0 to 3
2736 //     i := j*16
2737 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2738 //   ENDFOR
2739 //
2740 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
2741 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2742 {
2743     return vreinterpret_m64_u16(
2744         vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2745 }
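// Note (illustrative): vrhadd_u16 is NEON's rounding halving add, computing
// (a + b + 1) >> 1 in a widened intermediate, which is exactly the PAVGW
// rounding shown above, e.g. (3 + 4 + 1) >> 1 = 4.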
2746 
2747 // Average packed unsigned 8-bit integers in a and b, and store the results in
2748 // dst.
2749 //
2750 //   FOR j := 0 to 7
2751 //     i := j*8
2752 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2753 //   ENDFOR
2754 //
2755 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
2756 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2757 {
2758     return vreinterpret_m64_u8(
2759         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2760 }
2761 
2762 // Average packed unsigned 8-bit integers in a and b, and store the results in
2763 // dst.
2764 //
2765 //   FOR j := 0 to 7
2766 //     i := j*8
2767 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2768 //   ENDFOR
2769 //
2770 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2771 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2772 
2773 // Average packed unsigned 16-bit integers in a and b, and store the results in
2774 // dst.
2775 //
2776 //   FOR j := 0 to 3
2777 //     i := j*16
2778 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2779 //   ENDFOR
2780 //
2781 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2782 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2783 
2784 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
2785 // unsigned 8-bit integers in b and rounds.
2786 //
2787 //   r0 := (a0 + b0) / 2
2788 //   r1 := (a1 + b1) / 2
2789 //   ...
2790 //   r15 := (a15 + b15) / 2
2791 //
2792 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
2793 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2794 {
2795     return vreinterpretq_m128i_u8(
2796         vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2797 }
2798 
2799 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2800 // unsigned 16-bit integers in b and rounds.
2801 //
2802 //   r0 := (a0 + b0) / 2
2803 //   r1 := (a1 + b1) / 2
2804 //   ...
2805 //   r7 := (a7 + b7) / 2
2806 //
2807 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2808 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2809 {
2810     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
2811                                  vreinterpretq_u16_m128i(b));
2812 }
2813 
2814 // Adds the four single-precision, floating-point values of a and b.
2815 //
2816 //   r0 := a0 + b0
2817 //   r1 := a1 + b1
2818 //   r2 := a2 + b2
2819 //   r3 := a3 + b3
2820 //
2821 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
2822 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2823 {
2824     return vreinterpretq_m128_f32(
2825         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2826 }
2827 
2828 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2829 // store the results in dst.
2830 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2831 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2832 {
2833 #if defined(__aarch64__)
2834     return vreinterpretq_m128d_f64(
2835         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2836 #else
2837     double *da = (double *) &a;
2838     double *db = (double *) &b;
2839     double c[2];
2840     c[0] = da[0] + db[0];
2841     c[1] = da[1] + db[1];
2842     return vld1q_f32((float32_t *) c);
2843 #endif
2844 }
2845 
2846 // Add 64-bit integers a and b, and store the result in dst.
2847 //
2848 //   dst[63:0] := a[63:0] + b[63:0]
2849 //
2850 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2851 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2852 {
2853     return vreinterpret_m64_s64(
2854         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2855 }
2856 
2857 // Adds the scalar single-precision floating point values of a and b.
2858 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
2859 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2860 {
2861     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2862     float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2863     // the upper values in the result must be the remnants of <a>.
2864     return vreinterpretq_m128_f32(vaddq_f32(a, value));
2865 }
2866 
2867 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2868 // unsigned 64-bit integers in b.
2869 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2870 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2871 {
2872     return vreinterpretq_m128i_s64(
2873         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2874 }
2875 
2876 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2877 // unsigned 32-bit integers in b.
2878 //
2879 //   r0 := a0 + b0
2880 //   r1 := a1 + b1
2881 //   r2 := a2 + b2
2882 //   r3 := a3 + b3
2883 //
2884 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2885 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2886 {
2887     return vreinterpretq_m128i_s32(
2888         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2889 }
2890 
2891 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2892 // unsigned 16-bit integers in b.
2893 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2894 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2895 {
2896     return vreinterpretq_m128i_s16(
2897         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2898 }
2899 
2900 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2901 // unsigned 8-bit integers in b.
2902 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2903 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2904 {
2905     return vreinterpretq_m128i_s8(
2906         vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2907 }
2908 
2909 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2910 // and saturates.
2911 //
2912 //   r0 := SignedSaturate(a0 + b0)
2913 //   r1 := SignedSaturate(a1 + b1)
2914 //   ...
2915 //   r7 := SignedSaturate(a7 + b7)
2916 //
2917 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
2918 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2919 {
2920     return vreinterpretq_m128i_s16(
2921         vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2922 }
2923 
2924 // Add packed signed 8-bit integers in a and b using saturation, and store the
2925 // results in dst.
2926 //
2927 //   FOR j := 0 to 15
2928 //     i := j*8
2929 //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2930 //   ENDFOR
2931 //
2932 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
2933 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2934 {
2935     return vreinterpretq_m128i_s8(
2936         vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2937 }
2938 
2939 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
2940 // b and saturates.
2941 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
2942 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2943 {
2944     return vreinterpretq_m128i_u8(
2945         vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2946 }
2947 
2948 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
2949 // unsigned 16-bit integers from b.
2950 //
2951 //   r0 := (a0 * b0)[15:0]
2952 //   r1 := (a1 * b1)[15:0]
2953 //   ...
2954 //   r7 := (a7 * b7)[15:0]
2955 //
2956 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
2957 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
2958 {
2959     return vreinterpretq_m128i_s16(
2960         vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2961 }
2962 
2963 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
2964 // unsigned 32-bit integers from b.
2965 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
2966 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
2967 {
2968     return vreinterpretq_m128i_s32(
2969         vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2970 }
2971 
2972 // Multiply the packed unsigned 16-bit integers in a and b, producing
2973 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2974 // integers in dst.
2975 //
2976 //   FOR j := 0 to 3
2977 //      i := j*16
2978 //      tmp[31:0] := a[i+15:i] * b[i+15:i]
2979 //      dst[i+15:i] := tmp[31:16]
2980 //   ENDFOR
2981 //
2982 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2983 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2984 
2985 // Multiplies the four single-precision, floating-point values of a and b.
2986 //
2987 //   r0 := a0 * b0
2988 //   r1 := a1 * b1
2989 //   r2 := a2 * b2
2990 //   r3 := a3 * b3
2991 //
2992 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2993 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2994 {
2995     return vreinterpretq_m128_f32(
2996         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2997 }
2998 
2999 // Multiply the lower single-precision (32-bit) floating-point element in a and
3000 // b, store the result in the lower element of dst, and copy the upper 3 packed
3001 // elements from a to the upper elements of dst.
3002 //
3003 //   dst[31:0] := a[31:0] * b[31:0]
3004 //   dst[127:32] := a[127:32]
3005 //
3006 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
3007 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3008 {
3009     return _mm_move_ss(a, _mm_mul_ps(a, b));
3010 }
3011 
3012 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3013 // a and b, and store the unsigned 64-bit results in dst.
3014 //
3015 //   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3016 //   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
3017 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3018 {
3019     // vmull_u32 upcasts instead of masking, so we downcast.
3020     uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3021     uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3022     return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3023 }
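// Note (illustrative): vmovn_u64 narrows each 64-bit lane to its low 32 bits,
// which on a little-endian lane view are exactly the even 32-bit elements
// a0/a2 and b0/b2 that PMULUDQ multiplies, so no explicit masking is needed
// before the widening multiply.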
3024 
3025 // Multiply the low unsigned 32-bit integers from a and b, and store the
3026 // unsigned 64-bit result in dst.
3027 //
3028 //   dst[63:0] := a[31:0] * b[31:0]
3029 //
3030 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
3031 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3032 {
3033     return vreinterpret_m64_u64(vget_low_u64(
3034         vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3035 }
3036 
3037 // Multiply the low signed 32-bit integers from each packed 64-bit element in
3038 // a and b, and store the signed 64-bit results in dst.
3039 //
3040 //   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3041 //   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
3042 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3043 {
3044     // vmull_s32 upcasts instead of masking, so we downcast.
3045     int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3046     int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3047     return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3048 }
3049 
3050 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3051 // integers from b.
3052 //
3053 //   r0 := (a0 * b0) + (a1 * b1)
3054 //   r1 := (a2 * b2) + (a3 * b3)
3055 //   r2 := (a4 * b4) + (a5 * b5)
3056 //   r3 := (a6 * b6) + (a7 * b7)
3057 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
3058 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3059 {
3060     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3061                               vget_low_s16(vreinterpretq_s16_m128i(b)));
3062     int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3063                                vget_high_s16(vreinterpretq_s16_m128i(b)));
3064 
3065     int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3066     int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3067 
3068     return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3069 }
3070 
3071 // Multiply packed signed 16-bit integers in a and b, producing intermediate
3072 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3073 // the packed 16-bit integers in dst.
3074 //
3075 //   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3076 //   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3077 //   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3078 //   ...
3079 //   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
3080 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3081 {
3082     // Has issues due to saturation
3083     // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3084 
3085     // Multiply
3086     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3087                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
3088     int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3089                                  vget_high_s16(vreinterpretq_s16_m128i(b)));
3090 
3091     // Rounding narrowing shift right
3092     // narrow = (int16_t)((mul + 16384) >> 15);
3093     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3094     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3095 
3096     // Join together
3097     return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3098 }
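// Worked example of the rounding narrow above (illustrative, lane 0 only):
// for a0 = 0x4000 (0.5 in Q15) and b0 = 0x2000 (0.25 in Q15),
//   mul    = 0x4000 * 0x2000      = 0x08000000
//   narrow = (mul + 0x4000) >> 15 = 0x1000   (0.125 in Q15)
// which matches the x86 definition ((a0 * b0 >> 14) + 1) >> 1.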
3099 
3100 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3101 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3102 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3103 // and pack the saturated results in dst.
3104 //
3105 //   FOR j := 0 to 7
3106 //      i := j*16
3107 //      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3108 //      a[i+7:i]*b[i+7:i] )
3109 //   ENDFOR
3110 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3111 {
3112 #if defined(__aarch64__)
3113     uint8x16_t a = vreinterpretq_u8_m128i(_a);
3114     int8x16_t b = vreinterpretq_s8_m128i(_b);
3115     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3116                              vmovl_s8(vget_low_s8(b)));
3117     int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3118                              vmovl_s8(vget_high_s8(b)));
3119     return vreinterpretq_m128i_s16(
3120         vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3121 #else
3122     // This would be much simpler if x86 would choose to zero extend OR sign
3123     // extend, not both. This could probably be optimized better.
3124     uint16x8_t a = vreinterpretq_u16_m128i(_a);
3125     int16x8_t b = vreinterpretq_s16_m128i(_b);
3126 
3127     // Zero extend a
3128     int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3129     int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3130 
3131     // Sign extend by shifting left then shifting right.
3132     int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3133     int16x8_t b_odd = vshrq_n_s16(b, 8);
3134 
3135     // multiply
3136     int16x8_t prod1 = vmulq_s16(a_even, b_even);
3137     int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3138 
3139     // saturated add
3140     return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3141 #endif
3142 }
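// Worked example for the ARMv7 path above (illustrative, one lane pair): with
// a0 = 0xFF (zero-extended to 255) and b0 = 0xFF (sign-extended to -1),
//   prod1 = 255 * (-1) = -255
// so the unsigned/signed split matters; treating both inputs as signed would
// give (-1) * (-1) = 1 instead.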
3143 
3144 // Computes the fused multiply-add of 32-bit floating point numbers.
3145 //
3146 // Return Value
3147 // Multiplies A and B, and adds C to the temporary result before returning it.
3148 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
3149 FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3150 {
3151 #if defined(__aarch64__)
3152     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3153                                             vreinterpretq_f32_m128(b),
3154                                             vreinterpretq_f32_m128(a)));
3155 #else
3156     return _mm_add_ps(_mm_mul_ps(a, b), c);
3157 #endif
3158 }
3159 
3160 // Alternatively add and subtract packed single-precision (32-bit)
3161 // floating-point elements in a to/from packed elements in b, and store the
3162 // results in dst.
3163 //
3164 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
3165 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3166 {
3167     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3168     return _mm_fmadd_ps(b, mask, a);
3169 }
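// Illustrative expansion of the trick above: with mask = {-1, 1, -1, 1},
//   dst = b * mask + a = { a0 - b0, a1 + b1, a2 - b2, a3 + b3 }
// which is exactly ADDSUBPS (subtract in the even lanes, add in the odd ones).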
3170 
3171 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3172 // b, then horizontally sum each consecutive 8 differences to produce two
3173 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3174 // 16 bits of 64-bit elements in dst.
3175 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
3176 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3177 {
3178     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3179     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3180     uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3181     uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3182     return (__m128i) vsetq_lane_u16(r4, r, 4);
3183 }
3184 
3185 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3186 // b, then horizontally sum each consecutive 8 differences to produce four
3187 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3188 // 16 bits of dst.
3189 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
3190 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3191 {
3192     uint16x4_t t =
3193         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3194     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3195     return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3196 }
3197 
3198 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3199 // b, then horizontally sum each consecutive 8 differences to produce four
3200 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3201 // 16 bits of dst.
3202 //
3203 //   FOR j := 0 to 7
3204 //      i := j*8
3205 //      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3206 //   ENDFOR
3207 //   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
3208 //   tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
3209 //
3210 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3211 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3212 
3213 // Divides the four single-precision, floating-point values of a and b.
3214 //
3215 //   r0 := a0 / b0
3216 //   r1 := a1 / b1
3217 //   r2 := a2 / b2
3218 //   r3 := a3 / b3
3219 //
3220 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
3221 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3222 {
3223 #if defined(__aarch64__)
3224     return vreinterpretq_m128_f32(
3225         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3226 #else
3227     float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3228     float32x4_t recip1 =
3229         vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3230     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3231 #endif
3232 }
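// Note on the ARMv7 path above (illustrative): vrecpeq_f32 returns an estimate
// with roughly 8 bits of precision, and each vrecpsq_f32/vmulq_f32 pair is one
// Newton-Raphson step,
//   recip' = recip * (2 - b * recip)
// which roughly doubles the number of correct bits before the final multiply
// by a.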
3233 
3234 // Divides the scalar single-precision floating point value of a by b.
3235 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
3236 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3237 {
3238     float32_t value =
3239         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3240     return vreinterpretq_m128_f32(
3241         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3242 }
3243 
3244 // Computes the approximations of reciprocals of the four single-precision,
3245 // floating-point values of a.
3246 // https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
3247 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3248 {
3249     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3250     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3251     return vreinterpretq_m128_f32(recip);
3252 }
3253 
3254 // Compute the approximate reciprocal of the lower single-precision (32-bit)
3255 // floating-point element in a, store the result in the lower element of dst,
3256 // and copy the upper 3 packed elements from a to the upper elements of dst. The
3257 // maximum relative error for this approximation is less than 1.5*2^-12.
3258 //
3259 //   dst[31:0] := (1.0 / a[31:0])
3260 //   dst[127:32] := a[127:32]
3261 //
3262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
3263 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3264 {
3265     return _mm_move_ss(a, _mm_rcp_ps(a));
3266 }
3267 
3268 // Computes the approximations of square roots of the four single-precision,
3269 // floating-point values of a. First computes reciprocal square roots and then
3270 // reciprocals of the four values.
3271 //
3272 //   r0 := sqrt(a0)
3273 //   r1 := sqrt(a1)
3274 //   r2 := sqrt(a2)
3275 //   r3 := sqrt(a3)
3276 //
3277 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
3278 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3279 {
3280 #if defined(__aarch64__)
3281     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3282 #else
3283     float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3284     float32x4_t sq = vrecpeq_f32(recipsq);
3285     // ??? use step versions of both sqrt and recip for better accuracy?
3286     return vreinterpretq_m128_f32(sq);
3287 #endif
3288 }
3289 
3290 // Computes the approximation of the square root of the scalar single-precision
3291 // floating point value of in.
3292 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
3293 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3294 {
3295     float32_t value =
3296         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3297     return vreinterpretq_m128_f32(
3298         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3299 }
3300 
3301 // Computes the approximations of the reciprocal square roots of the four
3302 // single-precision floating point values of in.
3303 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
3304 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3305 {
3306     return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3307 }
3308 
3309 // Compute the approximate reciprocal square root of the lower single-precision
3310 // (32-bit) floating-point element in a, store the result in the lower element
3311 // of dst, and copy the upper 3 packed elements from a to the upper elements of
3312 // dst.
3313 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
3314 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3315 {
3316     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3317 }
3318 
3319 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3320 // values in dst.
3321 //
3322 //   FOR j := 0 to 3
3323 //      i := j*16
3324 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3325 //   ENDFOR
3326 //
3327 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3328 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3329 {
3330     return vreinterpret_m64_s16(
3331         vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3332 }
3333 
3334 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3335 // values in dst.
3336 //
3337 //   FOR j := 0 to 3
3338 //      i := j*16
3339 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3340 //   ENDFOR
3341 //
3342 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3343 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3344 
3345 // Computes the maximums of the four single-precision, floating-point values of
3346 // a and b.
3347 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
3348 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3349 {
3350 #if SSE2NEON_PRECISE_MINMAX
3351     float32x4_t _a = vreinterpretq_f32_m128(a);
3352     float32x4_t _b = vreinterpretq_f32_m128(b);
3353     return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3354 #else
3355     return vreinterpretq_m128_f32(
3356         vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3357 #endif
3358 }
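// Note (illustrative): with SSE2NEON_PRECISE_MINMAX enabled, the bit-select
// form mirrors x86 MAXPS, which returns the second operand when either input
// is NaN (vcltq_f32 yields 0 for unordered lanes, so _b is selected), whereas
// plain vmaxq_f32 follows NEON's own NaN-propagation rules.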
3359 
3360 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3361 // values in dst.
3362 //
3363 //   FOR j := 0 to 7
3364 //      i := j*8
3365 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3366 //   ENDFOR
3367 //
3368 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3369 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3370 {
3371     return vreinterpret_m64_u8(
3372         vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3373 }
3374 
3375 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3376 // values in dst.
3377 //
3378 //   FOR j := 0 to 7
3379 //      i := j*8
3380 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3381 //   ENDFOR
3382 //
3383 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3384 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3385 
3386 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3387 // values in dst.
3388 //
3389 //   FOR j := 0 to 3
3390 //      i := j*16
3391 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3392 //   ENDFOR
3393 //
3394 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3395 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3396 {
3397     return vreinterpret_m64_s16(
3398         vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3399 }
3400 
3401 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3402 // values in dst.
3403 //
3404 //   FOR j := 0 to 3
3405 //      i := j*16
3406 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3407 //   ENDFOR
3408 //
3409 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3410 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
3411 
3412 // Computes the minima of the four single-precision, floating-point values of a
3413 // and b.
3414 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
3415 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3416 {
3417 #if SSE2NEON_PRECISE_MINMAX
3418     float32x4_t _a = vreinterpretq_f32_m128(a);
3419     float32x4_t _b = vreinterpretq_f32_m128(b);
3420     return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3421 #else
3422     return vreinterpretq_m128_f32(
3423         vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3424 #endif
3425 }
3426 
3427 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3428 // values in dst.
3429 //
3430 //   FOR j := 0 to 7
3431 //      i := j*8
3432 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3433 //   ENDFOR
3434 //
3435 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3436 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3437 {
3438     return vreinterpret_m64_u8(
3439         vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3440 }
3441 
3442 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3443 // values in dst.
3444 //
3445 //   FOR j := 0 to 7
3446 //      i := j*8
3447 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3448 //   ENDFOR
3449 //
3450 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3451 #define _m_pminub(a, b) _mm_min_pu8(a, b)
3452 
3453 // Computes the maximum of the two lower scalar single-precision floating point
3454 // values of a and b.
3455 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
3456 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3457 {
3458     float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3459     return vreinterpretq_m128_f32(
3460         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3461 }
3462 
3463 // Computes the minimum of the two lower scalar single-precision floating point
3464 // values of a and b.
3465 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
3466 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3467 {
3468     float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3469     return vreinterpretq_m128_f32(
3470         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3471 }
3472 
3473 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3474 // 16 unsigned 8-bit integers from b.
3475 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
3476 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3477 {
3478     return vreinterpretq_m128i_u8(
3479         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3480 }
3481 
3482 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3483 // 16 unsigned 8-bit integers from b.
3484 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
3485 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3486 {
3487     return vreinterpretq_m128i_u8(
3488         vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3489 }
3490 
3491 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3492 // signed 16-bit integers from b.
3493 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
3494 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3495 {
3496     return vreinterpretq_m128i_s16(
3497         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3498 }
3499 
3500 // Compare packed signed 8-bit integers in a and b, and store packed maximum
3501 // values in dst.
3502 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
3503 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3504 {
3505     return vreinterpretq_m128i_s8(
3506         vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3507 }
3508 
3509 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3510 // signed 16-bit integers from b.
3511 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
3512 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3513 {
3514     return vreinterpretq_m128i_s16(
3515         vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3516 }
3517 
3518 // epi versions of min/max
3519 // Computes the pairwise maximums of the four signed 32-bit integer values of a
3520 // and b.
3521 //
3522 // A 128-bit parameter that can be defined with the following equations:
3523 //   r0 := (a0 > b0) ? a0 : b0
3524 //   r1 := (a1 > b1) ? a1 : b1
3525 //   r2 := (a2 > b2) ? a2 : b2
3526 //   r3 := (a3 > b3) ? a3 : b3
3527 //
3528 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
3529 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3530 {
3531     return vreinterpretq_m128i_s32(
3532         vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3533 }
3534 
3535 // Computes the pairwise minima of the four signed 32-bit integer values of a
3536 // and b.
3537 //
3538 // A 128-bit parameter that can be defined with the following equations:
3539 //   r0 := (a0 < b0) ? a0 : b0
3540 //   r1 := (a1 < b1) ? a1 : b1
3541 //   r2 := (a2 < b2) ? a2 : b2
3542 //   r3 := (a3 < b3) ? a3 : b3
3543 //
3544 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
3545 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3546 {
3547     return vreinterpretq_m128i_s32(
3548         vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3549 }
3550 
3551 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3552 // values in dst.
3553 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
3554 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3555 {
3556     return vreinterpretq_m128i_u32(
3557         vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3558 }
3559 
3560 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3561 // values in dst.
3562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
3563 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3564 {
3565     return vreinterpretq_m128i_u32(
3566         vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3567 }
3568 
3569 // Multiply the packed unsigned 16-bit integers in a and b, producing
3570 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3571 // integers in dst.
3572 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
3573 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3574 {
3575     return vreinterpret_m64_u16(vshrn_n_u32(
3576         vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3577 }
3578 
3579 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3580 // integers from b.
3581 //
3582 //   r0 := (a0 * b0)[31:16]
3583 //   r1 := (a1 * b1)[31:16]
3584 //   ...
3585 //   r7 := (a7 * b7)[31:16]
3586 //
3587 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
3588 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3589 {
3590     /* FIXME: issue with large values because of result saturation */
3591     // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3592     // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3593     // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3594     int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3595     int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3596     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3597     int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3598     int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3599     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3600     uint16x8x2_t r =
3601         vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3602     return vreinterpretq_m128i_u16(r.val[1]);
3603 }
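// Note on the implementation above (illustrative): each vmull_s16 yields full
// 32-bit products, and vuzpq_u16 de-interleaves their 16-bit halves, so
// r.val[1] collects the high half of every product; e.g. for a0 = b0 = 0x4000
// the product 0x10000000 contributes 0x1000 to lane 0.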
3604 
3605 // Computes pairwise add of each argument as single-precision, floating-point
3606 // values a and b.
3607 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
3608 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3609 {
3610 #if defined(__aarch64__)
3611     return vreinterpretq_m128_f32(
3612         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3613 #else
3614     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3615     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3616     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3617     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3618     return vreinterpretq_m128_f32(
3619         vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3620 #endif
3621 }
3622 
3623 // Computes pairwise add of each argument as a 16-bit signed or unsigned integer
3624 // values a and b.
3625 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3626 {
3627     int16x8_t a = vreinterpretq_s16_m128i(_a);
3628     int16x8_t b = vreinterpretq_s16_m128i(_b);
3629 #if defined(__aarch64__)
3630     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3631 #else
3632     return vreinterpretq_m128i_s16(
3633         vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3634                      vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3635 #endif
3636 }
3637 
3638 // Horizontally subtract adjacent pairs of single-precision (32-bit)
3639 // floating-point elements in a and b, and pack the results in dst.
3640 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
3641 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3642 {
3643 #if defined(__aarch64__)
3644     return vreinterpretq_m128_f32(vsubq_f32(
3645         vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3646         vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3647 #else
3648     float32x4x2_t c =
3649         vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3650     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3651 #endif
3652 }
3653 
3654 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3655 // signed 16-bit results in dst.
3656 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
3657 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3658 {
3659     return vreinterpret_m64_s16(
3660         vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3661 }
3662 
3663 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3664 // signed 32-bit results in dst.
3665 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
3666 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3667 {
3668     return vreinterpret_m64_s32(
3669         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3670 }
3671 
3672 // Computes pairwise difference of each argument as a 16-bit signed or unsigned
3673 // integer values a and b.
3674 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3675 {
3676     int32x4_t a = vreinterpretq_s32_m128i(_a);
3677     int32x4_t b = vreinterpretq_s32_m128i(_b);
3678     // Interleave using vshrn/vmovn
3679     // [a0|a2|a4|a6|b0|b2|b4|b6]
3680     // [a1|a3|a5|a7|b1|b3|b5|b7]
3681     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3682     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3683     // Subtract
3684     return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3685 }
3686 
3687 // Computes saturated pairwise add of each argument as a 16-bit signed
3688 // integer values a and b.
3689 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3690 {
3691 #if defined(__aarch64__)
3692     int16x8_t a = vreinterpretq_s16_m128i(_a);
3693     int16x8_t b = vreinterpretq_s16_m128i(_b);
3694     return vreinterpretq_s64_s16(
3695         vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3696 #else
3697     int32x4_t a = vreinterpretq_s32_m128i(_a);
3698     int32x4_t b = vreinterpretq_s32_m128i(_b);
3699     // Interleave using vshrn/vmovn
3700     // [a0|a2|a4|a6|b0|b2|b4|b6]
3701     // [a1|a3|a5|a7|b1|b3|b5|b7]
3702     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3703     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3704     // Saturated add
3705     return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3706 #endif
3707 }
3708 
3709 // Computes saturated pairwise difference of each argument as a 16-bit signed
3710 // integer values a and b.
3711 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
3712 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3713 {
3714 #if defined(__aarch64__)
3715     int16x8_t a = vreinterpretq_s16_m128i(_a);
3716     int16x8_t b = vreinterpretq_s16_m128i(_b);
3717     return vreinterpretq_s64_s16(
3718         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3719 #else
3720     int32x4_t a = vreinterpretq_s32_m128i(_a);
3721     int32x4_t b = vreinterpretq_s32_m128i(_b);
3722     // Interleave using vshrn/vmovn
3723     // [a0|a2|a4|a6|b0|b2|b4|b6]
3724     // [a1|a3|a5|a7|b1|b3|b5|b7]
3725     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3726     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3727     // Saturated subtract
3728     return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3729 #endif
3730 }
3731 
3732 // Computes pairwise add of each argument as a 32-bit signed or unsigned integer
3733 // values a and b.
3734 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3735 {
3736     int32x4_t a = vreinterpretq_s32_m128i(_a);
3737     int32x4_t b = vreinterpretq_s32_m128i(_b);
3738     return vreinterpretq_m128i_s32(
3739         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3740                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3741 }
3742 
3743 // Computes pairwise difference of each argument as a 32-bit signed or unsigned
3744 // integer values a and b.
3745 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3746 {
3747     int64x2_t a = vreinterpretq_s64_m128i(_a);
3748     int64x2_t b = vreinterpretq_s64_m128i(_b);
3749     // Interleave using vshrn/vmovn
3750     // [a0|a2|b0|b2]
3751     // [a1|a3|b1|b3]
3752     int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3753     int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3754     // Subtract
3755     return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3756 }
3757 
3758 // Kahan summation for accurate summation of floating-point numbers.
3759 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
3760 FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3761 {
3762     y -= *c;
3763     float t = *sum + y;
3764     *c = (t - *sum) - y;
3765     *sum = t;
3766 }
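// Illustrative use of the helper above (a sketch, not part of the public API),
// assuming an array data[0..n-1]:
//
//   float sum = 0.0f, c = 0.0f;
//   for (int i = 0; i < n; i++)
//       sse2neon_kadd_f32(&sum, &c, data[i]);
//
// The compensation term c captures the low-order bits lost in each addition,
// so the final sum is closer to the exact result than naive accumulation.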
3767 
3768 // Conditionally multiply the packed single-precision (32-bit) floating-point
3769 // elements in a and b using the high 4 bits in imm8, sum the four products,
3770 // and conditionally store the sum in dst using the low 4 bits of imm.
3771 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
3772 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3773 {
3774 #if defined(__aarch64__)
3775     /* shortcuts */
3776     if (imm == 0xFF) {
3777         return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3778     }
3779     if (imm == 0x7F) {
3780         float32x4_t m = _mm_mul_ps(a, b);
3781         m[3] = 0;
3782         return _mm_set1_ps(vaddvq_f32(m));
3783     }
3784 #endif
3785 
3786     float s = 0, c = 0;
3787     float32x4_t f32a = vreinterpretq_f32_m128(a);
3788     float32x4_t f32b = vreinterpretq_f32_m128(b);
3789 
3790     /* To improve the accuracy of floating-point summation, Kahan algorithm
3791      * is used for each operation.
3792      */
3793     if (imm & (1 << 4))
3794         sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3795     if (imm & (1 << 5))
3796         sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3797     if (imm & (1 << 6))
3798         sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3799     if (imm & (1 << 7))
3800         sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3801     s += c;
3802 
3803     float32x4_t res = {
3804         (imm & 0x1) ? s : 0,
3805         (imm & 0x2) ? s : 0,
3806         (imm & 0x4) ? s : 0,
3807         (imm & 0x8) ? s : 0,
3808     };
3809     return vreinterpretq_m128_f32(res);
3810 }
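
// Illustrative sketch (hypothetical values): a three-component dot product
// with the result broadcast to every lane.
//
//   __m128 v0 = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);  // w lane unused
//   __m128 v1 = _mm_set_ps(0.0f, 6.0f, 5.0f, 4.0f);
//   __m128 d  = _mm_dp_ps(v0, v1, 0x7F);  // 1*4 + 2*5 + 3*6 = 32 in all lanes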
3811 
3812 /* Compare operations */
3813 
3814 // Compares for less than
3815 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
3816 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3817 {
3818     return vreinterpretq_m128_u32(
3819         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3820 }
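
// Illustrative sketch (hypothetical values): the comparison intrinsics return
// an all-ones/all-zeros mask per lane, which can drive a blend
// (_mm_blendv_ps is defined later in this file).
//
//   __m128 v    = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f);
//   __m128 zero = _mm_setzero_ps();
//   __m128 mask = _mm_cmplt_ps(v, zero);         // lanes where v < 0
//   __m128 r    = _mm_blendv_ps(v, zero, mask);  // negatives clamped: {0, 2, 0, 4}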
3821 
3822 // Compares for less than
3823 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
3824 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
3825 {
3826     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
3827 }
3828 
3829 // Compares for greater than.
3830 //
3831 //   r0 := (a0 > b0) ? 0xffffffff : 0x0
3832 //   r1 := (a1 > b1) ? 0xffffffff : 0x0
3833 //   r2 := (a2 > b2) ? 0xffffffff : 0x0
3834 //   r3 := (a3 > b3) ? 0xffffffff : 0x0
3835 //
3836 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
3837 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
3838 {
3839     return vreinterpretq_m128_u32(
3840         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3841 }
3842 
3843 // Compares for greater than.
3844 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
3845 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
3846 {
3847     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
3848 }
3849 
3850 // Compares for greater than or equal.
3851 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
3852 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
3853 {
3854     return vreinterpretq_m128_u32(
3855         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3856 }
3857 
3858 // Compares for greater than or equal.
3859 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
3860 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
3861 {
3862     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
3863 }
3864 
3865 // Compares for less than or equal.
3866 //
3867 //   r0 := (a0 <= b0) ? 0xffffffff : 0x0
3868 //   r1 := (a1 <= b1) ? 0xffffffff : 0x0
3869 //   r2 := (a2 <= b2) ? 0xffffffff : 0x0
3870 //   r3 := (a3 <= b3) ? 0xffffffff : 0x0
3871 //
3872 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
3873 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
3874 {
3875     return vreinterpretq_m128_u32(
3876         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3877 }
3878 
3879 // Compares for less than or equal.
3880 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
3881 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
3882 {
3883     return _mm_move_ss(a, _mm_cmple_ps(a, b));
3884 }
3885 
3886 // Compares for equality.
3887 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
3888 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
3889 {
3890     return vreinterpretq_m128_u32(
3891         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3892 }
3893 
3894 // Compares for equality.
3895 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
3896 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
3897 {
3898     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
3899 }
3900 
3901 // Compares for inequality.
3902 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
3903 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
3904 {
3905     return vreinterpretq_m128_u32(vmvnq_u32(
3906         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
3907 }
3908 
3909 // Compares for inequality.
3910 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
3911 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
3912 {
3913     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
3914 }
3915 
3916 // Compares for not greater than or equal.
3917 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
3918 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
3919 {
3920     return _mm_cmplt_ps(a, b);
3921 }
3922 
3923 // Compares for not greater than or equal.
3924 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
3925 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
3926 {
3927     return _mm_cmplt_ss(a, b);
3928 }
3929 
3930 // Compares for not greater than.
3931 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
3932 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
3933 {
3934     return _mm_cmple_ps(a, b);
3935 }
3936 
3937 // Compares for not greater than.
3938 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
3939 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
3940 {
3941     return _mm_cmple_ss(a, b);
3942 }
3943 
3944 // Compares for not less than or equal.
3945 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
3946 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
3947 {
3948     return _mm_cmpgt_ps(a, b);
3949 }
3950 
3951 // Compares for not less than or equal.
3952 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
3953 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
3954 {
3955     return _mm_cmpgt_ss(a, b);
3956 }
3957 
3958 // Compares for not less than.
3959 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
3960 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
3961 {
3962     return _mm_cmpge_ps(a, b);
3963 }
3964 
3965 // Compares for not less than.
3966 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
3967 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
3968 {
3969     return _mm_cmpge_ss(a, b);
3970 }
3971 
3972 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3973 // unsigned 8-bit integers in b for equality.
3974 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3975 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3976 {
3977     return vreinterpretq_m128i_u8(
3978         vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3979 }
3980 
3981 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3982 // unsigned 16-bit integers in b for equality.
3983 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3984 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3985 {
3986     return vreinterpretq_m128i_u16(
3987         vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3988 }
3989 
3990 // Compare packed 32-bit integers in a and b for equality, and store the results
3991 // in dst
3992 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3993 {
3994     return vreinterpretq_m128i_u32(
3995         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3996 }
3997 
3998 // Compare packed 64-bit integers in a and b for equality, and store the results
3999 // in dst
4000 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4001 {
4002 #if defined(__aarch64__)
4003     return vreinterpretq_m128i_u64(
4004         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4005 #else
4006     // ARMv7 lacks vceqq_u64
4007     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4008     uint32x4_t cmp =
4009         vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4010     uint32x4_t swapped = vrev64q_u32(cmp);
4011     return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4012 #endif
4013 }
4014 
4015 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4016 // in b for less than.
4017 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
4018 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4019 {
4020     return vreinterpretq_m128i_u8(
4021         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4022 }
4023 
4024 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4025 // in b for greater than.
4026 //
4027 //   r0 := (a0 > b0) ? 0xff : 0x0
4028 //   r1 := (a1 > b1) ? 0xff : 0x0
4029 //   ...
4030 //   r15 := (a15 > b15) ? 0xff : 0x0
4031 //
4032 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
4033 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4034 {
4035     return vreinterpretq_m128i_u8(
4036         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4037 }
4038 
4039 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4040 // in b for less than.
4041 //
4042 //   r0 := (a0 < b0) ? 0xffff : 0x0
4043 //   r1 := (a1 < b1) ? 0xffff : 0x0
4044 //   ...
4045 //   r7 := (a7 < b7) ? 0xffff : 0x0
4046 //
4047 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
4048 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4049 {
4050     return vreinterpretq_m128i_u16(
4051         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4052 }
4053 
4054 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4055 // in b for greater than.
4056 //
4057 //   r0 := (a0 > b0) ? 0xffff : 0x0
4058 //   r1 := (a1 > b1) ? 0xffff : 0x0
4059 //   ...
4060 //   r7 := (a7 > b7) ? 0xffff : 0x0
4061 //
4062 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
4063 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4064 {
4065     return vreinterpretq_m128i_u16(
4066         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4067 }
4068 
4069 
4070 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4071 // in b for less than.
4072 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
4073 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4074 {
4075     return vreinterpretq_m128i_u32(
4076         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4077 }
4078 
4079 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4080 // in b for greater than.
4081 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
4082 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4083 {
4084     return vreinterpretq_m128i_u32(
4085         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4086 }
4087 
4088 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4089 // in b for greater than.
4090 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4091 {
4092 #if defined(__aarch64__)
4093     return vreinterpretq_m128i_u64(
4094         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4095 #else
4096     // ARMv7 lacks vcgtq_s64.
4097     // This is based off of Clang's SSE2 polyfill:
4098     // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4099 
4100     // Mask the sign bit out since we need a signed AND an unsigned comparison
4101     // and it is ugly to try and split them.
4102     int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4103     int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4104     int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4105     // Check if a > b
4106     int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4107     // Copy upper mask to lower mask
4108     // a_hi > b_hi
4109     int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4110     // Copy lower mask to upper mask
4111     // a_lo > b_lo
4112     int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4113     // Compare for equality
4114     int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4115     // Copy upper mask to lower mask
4116     // a_hi == b_hi
4117     int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4118     // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4119     int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4120     return vreinterpretq_m128i_s64(ret);
4121 #endif
4122 }
4123 
4124 // Compares the four 32-bit floats in a and b to check if any values are NaN.
4125 // Ordered compare between each value returns true for "orderable" and false for
4126 // "not orderable" (NaN).
4127 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4128 // also:
4129 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4130 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
4131 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4132 {
4133     // Note: NEON does not have ordered compare builtin
4134     // Need to compare a eq a and b eq b to check for NaN
4135     // Do AND of results to get final
4136     uint32x4_t ceqaa =
4137         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4138     uint32x4_t ceqbb =
4139         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4140     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4141 }
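
// Illustrative sketch (hypothetical values): _mm_cmpord_ps can be used to
// detect NaN lanes before further processing (NAN comes from <math.h>).
//
//   __m128 v    = _mm_set_ps(4.0f, NAN, 2.0f, 1.0f);
//   __m128 mask = _mm_cmpord_ps(v, v);  // all-ones in every lane except the NaN lane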
4142 
4143 // Compares for ordered.
4144 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
4145 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4146 {
4147     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4148 }
4149 
4150 // Compares for unordered.
4151 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
4152 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4153 {
4154     uint32x4_t f32a =
4155         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4156     uint32x4_t f32b =
4157         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4158     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4159 }
4160 
4161 // Compares for unordered.
4162 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
4163 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4164 {
4165     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4166 }
4167 
4168 // Compares the lower single-precision floating point scalar values of a and b
4169 // using a less than operation.
4170 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
4171 // Important note: the MSDN documentation is incorrect. If either value is a
4172 // NaN, the docs say the result is one, but this intrinsic returns zero.
4173 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4174 {
4175     uint32x4_t a_not_nan =
4176         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4177     uint32x4_t b_not_nan =
4178         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4179     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4180     uint32x4_t a_lt_b =
4181         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4182     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4183 }
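
// Illustrative sketch (hypothetical values) of the NaN behavior described
// above (NAN comes from <math.h>):
//
//   _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f));  // -> 1
//   _mm_comilt_ss(_mm_set_ss(NAN), _mm_set_ss(2.0f));   // -> 0, despite the MSDN docs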
4184 
4185 // Compares the lower single-precision floating point scalar values of a and b
4186 // using a greater than operation.
4187 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
4188 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4189 {
4190     // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4191     // vreinterpretq_f32_m128(b)), 0);
4192     uint32x4_t a_not_nan =
4193         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4194     uint32x4_t b_not_nan =
4195         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4196     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4197     uint32x4_t a_gt_b =
4198         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4199     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4200 }
4201 
4202 // Compares the lower single-precision floating point scalar values of a and b
4203 // using a less than or equal operation.
4204 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
4205 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4206 {
4207     // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4208     // vreinterpretq_f32_m128(b)), 0);
4209     uint32x4_t a_not_nan =
4210         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4211     uint32x4_t b_not_nan =
4212         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4213     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4214     uint32x4_t a_le_b =
4215         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4216     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4217 }
4218 
4219 // Compares the lower single-precision floating point scalar values of a and b
4220 // using a greater than or equal operation.
4221 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
4222 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4223 {
4224     // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4225     // vreinterpretq_f32_m128(b)), 0);
4226     uint32x4_t a_not_nan =
4227         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4228     uint32x4_t b_not_nan =
4229         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4230     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4231     uint32x4_t a_ge_b =
4232         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4233     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4234 }
4235 
4236 // Compares the lower single-precision floating point scalar values of a and b
4237 // using an equality operation.
4238 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
4239 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4240 {
4241     // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4242     // vreinterpretq_f32_m128(b)), 0);
4243     uint32x4_t a_not_nan =
4244         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4245     uint32x4_t b_not_nan =
4246         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4247     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4248     uint32x4_t a_eq_b =
4249         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4250     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4251 }
4252 
4253 // Compares the lower single-precision floating point scalar values of a and b
4254 // using an inequality operation.
4255 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
4256 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4257 {
4258     // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4259     // vreinterpretq_f32_m128(b)), 0);
4260     uint32x4_t a_not_nan =
4261         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4262     uint32x4_t b_not_nan =
4263         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4264     uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4265     uint32x4_t a_neq_b = vmvnq_u32(
4266         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4267     return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4268 }
4269 
4270 // According to the documentation, these intrinsics behave the same as the
4271 // non-'u' versions.  We'll just alias them here.
4272 #define _mm_ucomilt_ss _mm_comilt_ss
4273 #define _mm_ucomile_ss _mm_comile_ss
4274 #define _mm_ucomigt_ss _mm_comigt_ss
4275 #define _mm_ucomige_ss _mm_comige_ss
4276 #define _mm_ucomieq_ss _mm_comieq_ss
4277 #define _mm_ucomineq_ss _mm_comineq_ss
4278 
4279 /* Conversions */
4280 
4281 // Convert packed signed 32-bit integers in b to packed single-precision
4282 // (32-bit) floating-point elements, store the results in the lower 2 elements
4283 // of dst, and copy the upper 2 packed elements from a to the upper elements of
4284 // dst.
4285 //
4286 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4287 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4288 //   dst[95:64] := a[95:64]
4289 //   dst[127:96] := a[127:96]
4290 //
4291 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
4292 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4293 {
4294     return vreinterpretq_m128_f32(
4295         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4296                      vget_high_f32(vreinterpretq_f32_m128(a))));
4297 }
4298 
4299 // Convert the signed 32-bit integer b to a single-precision (32-bit)
4300 // floating-point element, store the result in the lower element of dst, and
4301 // copy the upper 3 packed elements from a to the upper elements of dst.
4302 //
4303 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4304 //   dst[127:32] := a[127:32]
4305 //
4306 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
4307 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4308 {
4309     __m128 ret = a;
4310     return vreinterpretq_m128_f32(
4311         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0));
4312 }
4313 
4314 // Convert the lower single-precision (32-bit) floating-point element in a to a
4315 // 32-bit integer, and store the result in dst.
4316 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
4317 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4318 {
4319 #if defined(__aarch64__)
4320     return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4321 #else
4322     float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4323     float32_t diff = data - floor(data);
4324     if (diff > 0.5)
4325         return (int32_t) ceil(data);
4326     if (diff == 0.5) {
4327         int32_t f = (int32_t) floor(data);
4328         int32_t c = (int32_t) ceil(data);
4329         return c & 1 ? f : c;
4330     }
4331     return (int32_t) floor(data);
4332 #endif
4333 }
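
// Illustrative sketch (hypothetical values): _mm_cvt_ss2si rounds half-way
// cases to the nearest even integer, matching the default SSE rounding mode.
//
//   _mm_cvt_ss2si(_mm_set_ss(2.5f));  // -> 2 (ties to even)
//   _mm_cvt_ss2si(_mm_set_ss(3.5f));  // -> 4 (ties to even)
//   _mm_cvt_ss2si(_mm_set_ss(2.4f));  // -> 2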
4334 
4335 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
4336 // floating-point elements, and store the results in dst.
4337 //
4338 //   FOR j := 0 to 3
4339 //      i := j*16
4340 //      m := j*32
4341 //      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4342 //   ENDFOR
4343 //
4344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
4345 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4346 {
4347     return vreinterpretq_m128_f32(
4348         vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4349 }
4350 
4351 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
4352 // floating-point elements, store the results in the lower 2 elements of dst,
4353 // and copy the upper 2 packed elements from a to the upper elements of dst.
4354 //
4355 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4356 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4357 //   dst[95:64] := a[95:64]
4358 //   dst[127:96] := a[127:96]
4359 //
4360 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
4361 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4362 {
4363     return vreinterpretq_m128_f32(
4364         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4365                      vget_high_f32(vreinterpretq_f32_m128(a))));
4366 }
4367 
4368 // Convert packed signed 32-bit integers in a to packed single-precision
4369 // (32-bit) floating-point elements, store the results in the lower 2 elements
4370 // of dst, then convert the packed signed 32-bit integers in b to
4371 // single-precision (32-bit) floating-point element, and store the results in
4372 // the upper 2 elements of dst.
4373 //
4374 //   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4375 //   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4376 //   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4377 //   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4378 //
4379 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
4380 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4381 {
4382     return vreinterpretq_m128_f32(vcvtq_f32_s32(
4383         vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4384 }
4385 
4386 // Convert the lower packed 8-bit integers in a to packed single-precision
4387 // (32-bit) floating-point elements, and store the results in dst.
4388 //
4389 //   FOR j := 0 to 3
4390 //      i := j*8
4391 //      m := j*32
4392 //      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4393 //   ENDFOR
4394 //
4395 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
4396 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4397 {
4398     return vreinterpretq_m128_f32(vcvtq_f32_s32(
4399         vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4400 }
4401 
4402 // Convert packed unsigned 16-bit integers in a to packed single-precision
4403 // (32-bit) floating-point elements, and store the results in dst.
4404 //
4405 //   FOR j := 0 to 3
4406 //      i := j*16
4407 //      m := j*32
4408 //      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4409 //   ENDFOR
4410 //
4411 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
4412 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4413 {
4414     return vreinterpretq_m128_f32(
4415         vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4416 }
4417 
4418 // Convert the lower packed unsigned 8-bit integers in a to packed
4419 // single-precision (32-bit) floating-point elements, and store the results in
4420 // dst.
4421 //
4422 //   FOR j := 0 to 3
4423 //      i := j*8
4424 //      m := j*32
4425 //      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4426 //   ENDFOR
4427 //
4428 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
4429 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4430 {
4431     return vreinterpretq_m128_f32(vcvtq_f32_u32(
4432         vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4433 }
4434 
4435 // Converts the four single-precision, floating-point values of a to signed
4436 // 32-bit integer values using truncate.
4437 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4438 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4439 {
4440     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4441 }
4442 
4443 // Converts the four signed 32-bit integer values of a to single-precision,
4444 // floating-point values
4445 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
4446 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4447 {
4448     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4449 }
4450 
4451 // Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
4452 // unsigned 16-bit integers.
4453 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4454 {
4455     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
4456     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4457     return vreinterpretq_m128i_u16(u16x8);
4458 }
4459 
4460 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4461 // unsigned 32-bit integers.
4462 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
4463 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4464 {
4465     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
4466     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
4467     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4468     return vreinterpretq_m128i_u32(u32x4);
4469 }
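
// Illustrative sketch (hypothetical values): zero-extending the four lowest
// bytes of a vector to 32-bit lanes (_mm_cvtsi32_si128 is defined later in
// this file).
//
//   __m128i v = _mm_cvtsi32_si128(0x04030201);  // bytes 01 02 03 04 in the low 32 bits
//   __m128i w = _mm_cvtepu8_epi32(v);           // lanes {1, 2, 3, 4}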
4470 
4471 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
4472 // unsigned 64-bit integers.
4473 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4474 {
4475     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
4476     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
4477     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4478     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4479     return vreinterpretq_m128i_u64(u64x2);
4480 }
4481 
4482 // Converts the eight signed 8-bit integers in the lower 64 bits to eight
4483 // signed 16-bit integers.
4484 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4485 {
4486     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
4487     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4488     return vreinterpretq_m128i_s16(s16x8);
4489 }
4490 
4491 // Converts the four signed 8-bit integers in the lower 32 bits to four signed
4492 // 32-bit integers.
4493 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4494 {
4495     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
4496     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
4497     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4498     return vreinterpretq_m128i_s32(s32x4);
4499 }
4500 
4501 // Converts the two signed 8-bit integers in the lower 16 bits to two signed
4502 // 64-bit integers.
4503 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4504 {
4505     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
4506     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
4507     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4508     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4509     return vreinterpretq_m128i_s64(s64x2);
4510 }
4511 
4512 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
4513 // 32-bit integers.
4514 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4515 {
4516     return vreinterpretq_m128i_s32(
4517         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4518 }
4519 
4520 // Converts the two signed 16-bit integers in the lower 32 bits to two signed
4521 // 64-bit integers.
4522 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4523 {
4524     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
4525     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4526     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4527     return vreinterpretq_m128i_s64(s64x2);
4528 }
4529 
4530 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
4531 // unsigned 32-bit integers.
4532 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4533 {
4534     return vreinterpretq_m128i_u32(
4535         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4536 }
4537 
4538 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
4539 // unsigned 64-bit integers.
4540 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4541 {
4542     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
4543     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4544     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4545     return vreinterpretq_m128i_u64(u64x2);
4546 }
4547 
4548 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
4549 // unsigned 64-bit integers.
4550 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4551 {
4552     return vreinterpretq_m128i_u64(
4553         vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4554 }
4555 
4556 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
4557 // 64-bit integers.
4558 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4559 {
4560     return vreinterpretq_m128i_s64(
4561         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4562 }
4563 
4564 // Converts the four single-precision, floating-point values of a to signed
4565 // 32-bit integer values.
4566 //
4567 //   r0 := (int) a0
4568 //   r1 := (int) a1
4569 //   r2 := (int) a2
4570 //   r3 := (int) a3
4571 //
4572 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4573 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4574 // does not support! It is supported on ARMv8-A however.
4575 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4576 {
4577 #if defined(__aarch64__)
4578     return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4579 #else
4580     uint32x4_t signmask = vdupq_n_u32(0x80000000);
4581     float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4582                                  vdupq_n_f32(0.5f)); /* +/- 0.5 */
4583     int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4584         vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4585     int32x4_t r_trunc =
4586         vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4587     int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4588         vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4589     int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4590                                  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4591     float32x4_t delta = vsubq_f32(
4592         vreinterpretq_f32_m128(a),
4593         vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4594     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4595     return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4596 #endif
4597 }
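
// Illustrative sketch (hypothetical values): with the default
// round-to-nearest-even mode, half-way cases round toward the even integer.
//
//   __m128  v = _mm_set_ps(3.5f, 2.5f, 1.5f, 0.5f);
//   __m128i r = _mm_cvtps_epi32(v);  // lanes {0, 2, 2, 4}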
4598 
4599 // Copy the lower 32-bit integer in a to dst.
4600 //
4601 //   dst[31:0] := a[31:0]
4602 //
4603 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4604 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4605 {
4606     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4607 }
4608 
4609 // Copy the lower 64-bit integer in a to dst.
4610 //
4611 //   dst[63:0] := a[63:0]
4612 //
4613 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4614 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4615 {
4616     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4617 }
4618 
4619 // Copy the lower 64-bit integer in a to dst.
4620 //
4621 //   dst[63:0] := a[63:0]
4622 //
4623 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4624 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4625 
4626 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4627 // zero extending the upper bits.
4628 //
4629 //   r0 := a
4630 //   r1 := 0x0
4631 //   r2 := 0x0
4632 //   r3 := 0x0
4633 //
4634 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4635 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4636 {
4637     return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4638 }
4639 
4640 // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4641 // zero extending the upper bits.
4642 //
4643 //   r0 := a
4644 //   r1 := 0x0
4645 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4646 {
4647     return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4648 }
4649 
4650 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4651 // compilation and does not generate any instructions, thus it has zero latency.
4652 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
4653 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4654 {
4655     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4656 }
4657 
4658 // Applies a type cast to reinterpret four 32-bit floating point values passed
4659 // in as a 128-bit parameter as packed 32-bit integers.
4660 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
4661 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4662 {
4663     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4664 }
4665 
4666 // Applies a type cast to reinterpret four 32-bit integers passed in as a
4667 // 128-bit parameter as packed 32-bit floating point values.
4668 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
4669 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4670 {
4671     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4672 }
4673 
4674 // Loads 128-bit value.
4675 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4676 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4677 {
4678     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4679 }
4680 
4681 // Load a double-precision (64-bit) floating-point element from memory into both
4682 // elements of dst.
4683 //
4684 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4685 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4686 //
4687 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4688 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4689 {
4690 #if defined(__aarch64__)
4691     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4692 #else
4693     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4694 #endif
4695 }
4696 
4697 // Load a double-precision (64-bit) floating-point element from memory into the
4698 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4699 // not need to be aligned on any particular boundary.
4700 //
4701 //   dst[63:0] := a[63:0]
4702 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4703 //
4704 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4705 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4706 {
4707 #if defined(__aarch64__)
4708     return vreinterpretq_m128d_f64(
4709         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4710 #else
4711     return vreinterpretq_m128d_f32(vcombine_f32(
4712         vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4713 #endif
4714 }
4715 
4716 // Load a double-precision (64-bit) floating-point element from memory into both
4717 // elements of dst.
4718 //
4719 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4720 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4721 //
4722 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4723 #define _mm_load_pd1 _mm_load1_pd
4724 
4725 // Load a double-precision (64-bit) floating-point element from memory into both
4726 // elements of dst.
4727 //
4728 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4729 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4730 //
4731 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
4732 #define _mm_loaddup_pd _mm_load1_pd
4733 
4734 // Loads 128-bit value.
4735 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4736 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4737 {
4738     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4739 }
4740 
4741 // Load unaligned 32-bit integer from memory into the first element of dst.
4742 //
4743 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
4744 //   dst[MAX:32] := 0
4745 //
4746 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4747 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4748 {
4749     return vreinterpretq_m128i_s32(
4750         vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4751 }
4752 
4753 // Convert packed double-precision (64-bit) floating-point elements in a to
4754 // packed single-precision (32-bit) floating-point elements, and store the
4755 // results in dst.
4756 //
4757 //   FOR j := 0 to 1
4758 //     i := 32*j
4759 //     k := 64*j
4760 //     dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
4761 //   ENDFOR
4762 //   dst[127:64] := 0
4763 //
4764 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
4765 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4766 {
4767 #if defined(__aarch64__)
4768     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4769     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4770 #else
4771     float a0 = (float) ((double *) &a)[0];
4772     float a1 = (float) ((double *) &a)[1];
4773     return _mm_set_ps(0, 0, a1, a0);
4774 #endif
4775 }
4776 
4777 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4778 //
4779 //   dst[63:0] := a[63:0]
4780 //
4781 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4782 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4783 {
4784 #if defined(__aarch64__)
4785     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4786 #else
4787     return ((double *) &a)[0];
4788 #endif
4789 }
4790 
4791 // Convert packed single-precision (32-bit) floating-point elements in a to
4792 // packed double-precision (64-bit) floating-point elements, and store the
4793 // results in dst.
4794 //
4795 //   FOR j := 0 to 1
4796 //     i := 64*j
4797 //     k := 32*j
4798 //     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4799 //   ENDFOR
4800 //
4801 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4802 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4803 {
4804 #if defined(__aarch64__)
4805     return vreinterpretq_m128d_f64(
4806         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4807 #else
4808     double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4809     double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4810     return _mm_set_pd(a1, a0);
4811 #endif
4812 }
4813 
4814 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
4815 // compilation and does not generate any instructions, thus it has zero latency.
4816 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
4817 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
4818 {
4819     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
4820 }
4821 
4822 // Blend packed single-precision (32-bit) floating-point elements from a and b
4823 // using mask, and store the results in dst.
4824 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
4825 FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
4826 {
4827     return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
4828                                             vreinterpretq_f32_m128(b),
4829                                             vreinterpretq_f32_m128(a)));
4830 }
4831 
4832 // Round the packed single-precision (32-bit) floating-point elements in a using
4833 // the rounding parameter, and store the results as packed single-precision
4834 // floating-point elements in dst.
4835 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
4836 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
4837 {
4838 #if defined(__aarch64__)
4839     switch (rounding) {
4840     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4841         return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
4842     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4843         return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
4844     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4845         return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
4846     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4847         return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
4848     default:  //_MM_FROUND_CUR_DIRECTION
4849         return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
4850     }
4851 #else
4852     float *v_float = (float *) &a;
4853     __m128 zero, neg_inf, pos_inf;
4854 
4855     switch (rounding) {
4856     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4857         return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
4858     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4859         return (__m128){floorf(v_float[0]), floorf(v_float[1]),
4860                         floorf(v_float[2]), floorf(v_float[3])};
4861     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4862         return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
4863                         ceilf(v_float[3])};
4864     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4865         zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
4866         neg_inf = _mm_set_ps(floorf(v_float[3]), floorf(v_float[2]),
4867                              floorf(v_float[1]), floorf(v_float[0]));
4868         pos_inf = _mm_set_ps(ceilf(v_float[3]), ceilf(v_float[2]),
4869                              ceilf(v_float[1]), ceilf(v_float[0]));
4870         return _mm_blendv_ps(neg_inf, pos_inf, _mm_cmple_ps(a, zero));
4871     default:  //_MM_FROUND_CUR_DIRECTION
4872         return (__m128){roundf(v_float[0]), roundf(v_float[1]),
4873                         roundf(v_float[2]), roundf(v_float[3])};
4874     }
4875 #endif
4876 }
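
// Illustrative sketch (hypothetical values): selecting a rounding mode
// explicitly with _mm_round_ps.
//
//   __m128 v     = _mm_set_ps(2.5f, -1.5f, 1.5f, 1.2f);
//   __m128 down  = _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);  // {1, 1, -2, 2}
//   __m128 trunc = _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);     // {1, 1, -1, 2}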
4877 
4878 // Round the packed single-precision (32-bit) floating-point elements in a up to
4879 // an integer value, and store the results as packed single-precision
4880 // floating-point elements in dst.
4881 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
4882 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
4883 {
4884     return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4885 }
4886 
4887 // Round the packed single-precision (32-bit) floating-point elements in a down
4888 // to an integer value, and store the results as packed single-precision
4889 // floating-point elements in dst.
4890 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
4891 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
4892 {
4893     return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4894 }
4895 
4896 
4897 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
4898 // may perform better than _mm_loadu_si128 when the data crosses a cache line
4899 // boundary.
4900 //
4901 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
4902 //
4903 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
4904 #define _mm_lddqu_si128 _mm_loadu_si128
4905 
4906 /* Miscellaneous Operations */
4907 
4908 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
4909 // in the sign bit.
4910 //
4911 //   r0 := a0 >> count
4912 //   r1 := a1 >> count
4913 //   ...
4914 //   r7 := a7 >> count
4915 //
4916 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
4917 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
4918 {
4919     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4920     if (c > 15)
4921         return _mm_cmplt_epi16(a, _mm_setzero_si128());
4922     return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
4923 }
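
// Illustrative sketch (hypothetical values): arithmetic shifts replicate the
// sign bit, so negative elements stay negative.
//
//   __m128i v = _mm_set1_epi16(-16);
//   __m128i r = _mm_sra_epi16(v, _mm_cvtsi32_si128(2));  // every lane becomes -4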
4924 
4925 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
4926 // in the sign bit.
4927 //
4928 //   r0 := a0 >> count
4929 //   r1 := a1 >> count
4930 //   r2 := a2 >> count
4931 //   r3 := a3 >> count
4932 //
4933 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
4934 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
4935 {
4936     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4937     if (c > 31)
4938         return _mm_cmplt_epi32(a, _mm_setzero_si128());
4939     return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
4940 }
4941 
4942 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4943 // saturates.
4944 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4945 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4946 {
4947     return vreinterpretq_m128i_s8(
4948         vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4949                     vqmovn_s16(vreinterpretq_s16_m128i(b))));
4950 }
4951 
4952 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
4953 // integers and saturates.
4954 //
4955 //   r0 := UnsignedSaturate(a0)
4956 //   r1 := UnsignedSaturate(a1)
4957 //   ...
4958 //   r7 := UnsignedSaturate(a7)
4959 //   r8 := UnsignedSaturate(b0)
4960 //   r9 := UnsignedSaturate(b1)
4961 //   ...
4962 //   r15 := UnsignedSaturate(b7)
4963 //
4964 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
4965 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4966 {
4967     return vreinterpretq_m128i_u8(
4968         vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4969                     vqmovun_s16(vreinterpretq_s16_m128i(b))));
4970 }
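
// Illustrative sketch (hypothetical values): elements outside the unsigned
// 8-bit range are clamped while packing.
//
//   __m128i a = _mm_set1_epi16(300);
//   __m128i b = _mm_set1_epi16(-5);
//   __m128i r = _mm_packus_epi16(a, b);  // low 8 bytes = 255, high 8 bytes = 0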
4971 
4972 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4973 // and saturates.
4974 //
4975 //   r0 := SignedSaturate(a0)
4976 //   r1 := SignedSaturate(a1)
4977 //   r2 := SignedSaturate(a2)
4978 //   r3 := SignedSaturate(a3)
4979 //   r4 := SignedSaturate(b0)
4980 //   r5 := SignedSaturate(b1)
4981 //   r6 := SignedSaturate(b2)
4982 //   r7 := SignedSaturate(b3)
4983 //
4984 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
4985 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4986 {
4987     return vreinterpretq_m128i_s16(
4988         vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4989                      vqmovn_s32(vreinterpretq_s32_m128i(b))));
4990 }
4991 
4992 // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
4993 // integers and saturates.
4994 //
4995 //   r0 := UnsignedSaturate(a0)
4996 //   r1 := UnsignedSaturate(a1)
4997 //   r2 := UnsignedSaturate(a2)
4998 //   r3 := UnsignedSaturate(a3)
4999 //   r4 := UnsignedSaturate(b0)
5000 //   r5 := UnsignedSaturate(b1)
5001 //   r6 := UnsignedSaturate(b2)
5002 //   r7 := UnsignedSaturate(b3)
5003 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5004 {
5005     return vreinterpretq_m128i_u16(
5006         vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5007                      vqmovun_s32(vreinterpretq_s32_m128i(b))));
5008 }
5009 
5010 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5011 // 8 signed or unsigned 8-bit integers in b.
5012 //
5013 //   r0 := a0
5014 //   r1 := b0
5015 //   r2 := a1
5016 //   r3 := b1
5017 //   ...
5018 //   r14 := a7
5019 //   r15 := b7
5020 //
5021 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
5022 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5023 {
5024 #if defined(__aarch64__)
5025     return vreinterpretq_m128i_s8(
5026         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5027 #else
5028     int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5029     int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5030     int8x8x2_t result = vzip_s8(a1, b1);
5031     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5032 #endif
5033 }
5034 
5035 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5036 // lower 4 signed or unsigned 16-bit integers in b.
5037 //
5038 //   r0 := a0
5039 //   r1 := b0
5040 //   r2 := a1
5041 //   r3 := b1
5042 //   r4 := a2
5043 //   r5 := b2
5044 //   r6 := a3
5045 //   r7 := b3
5046 //
5047 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
5048 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5049 {
5050 #if defined(__aarch64__)
5051     return vreinterpretq_m128i_s16(
5052         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5053 #else
5054     int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5055     int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5056     int16x4x2_t result = vzip_s16(a1, b1);
5057     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5058 #endif
5059 }
5060 
5061 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
5062 // lower 2 signed or unsigned 32-bit integers in b.
5063 //
5064 //   r0 := a0
5065 //   r1 := b0
5066 //   r2 := a1
5067 //   r3 := b1
5068 //
5069 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
5070 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5071 {
5072 #if defined(__aarch64__)
5073     return vreinterpretq_m128i_s32(
5074         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5075 #else
5076     int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5077     int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5078     int32x2x2_t result = vzip_s32(a1, b1);
5079     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5080 #endif
5081 }
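
// Usage sketch (illustrative only): interleaving the low halves of two
// vectors of 32-bit integers.
//
//   __m128i a = _mm_set_epi32(3, 2, 1, 0);  // lanes (low..high): 0, 1, 2, 3
//   __m128i b = _mm_set_epi32(7, 6, 5, 4);  // lanes (low..high): 4, 5, 6, 7
//   __m128i r = _mm_unpacklo_epi32(a, b);   // lanes (low..high): 0, 4, 1, 5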
5082 
5083 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5084 {
5085     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5086     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5087     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5088 }
5089 
5090 // Selects and interleaves the lower two single-precision, floating-point values
5091 // from a and b.
5092 //
5093 //   r0 := a0
5094 //   r1 := b0
5095 //   r2 := a1
5096 //   r3 := b1
5097 //
5098 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
5099 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5100 {
5101 #if defined(__aarch64__)
5102     return vreinterpretq_m128_f32(
5103         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5104 #else
5105     float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5106     float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5107     float32x2x2_t result = vzip_f32(a1, b1);
5108     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5109 #endif
5110 }
5111 
5112 // Selects and interleaves the upper two single-precision, floating-point values
5113 // from a and b.
5114 //
5115 //   r0 := a2
5116 //   r1 := b2
5117 //   r2 := a3
5118 //   r3 := b3
5119 //
5120 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
5121 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5122 {
5123 #if defined(__aarch64__)
5124     return vreinterpretq_m128_f32(
5125         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5126 #else
5127     float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5128     float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5129     float32x2x2_t result = vzip_f32(a1, b1);
5130     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5131 #endif
5132 }
5133 
5134 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5135 // 8 signed or unsigned 8-bit integers in b.
5136 //
5137 //   r0 := a8
5138 //   r1 := b8
5139 //   r2 := a9
5140 //   r3 := b9
5141 //   ...
5142 //   r14 := a15
5143 //   r15 := b15
5144 //
5145 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
5146 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5147 {
5148 #if defined(__aarch64__)
5149     return vreinterpretq_m128i_s8(
5150         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5151 #else
5152     int8x8_t a1 =
5153         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5154     int8x8_t b1 =
5155         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5156     int8x8x2_t result = vzip_s8(a1, b1);
5157     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5158 #endif
5159 }
5160 
5161 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5162 // upper 4 signed or unsigned 16-bit integers in b.
5163 //
5164 //   r0 := a4
5165 //   r1 := b4
5166 //   r2 := a5
5167 //   r3 := b5
5168 //   r4 := a6
5169 //   r5 := b6
5170 //   r6 := a7
5171 //   r7 := b7
5172 //
5173 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5174 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5175 {
5176 #if defined(__aarch64__)
5177     return vreinterpretq_m128i_s16(
5178         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5179 #else
5180     int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5181     int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5182     int16x4x2_t result = vzip_s16(a1, b1);
5183     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5184 #endif
5185 }
5186 
5187 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5188 // upper 2 signed or unsigned 32-bit integers in b.
5189 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
5190 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5191 {
5192 #if defined(__aarch64__)
5193     return vreinterpretq_m128i_s32(
5194         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5195 #else
5196     int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5197     int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5198     int32x2x2_t result = vzip_s32(a1, b1);
5199     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5200 #endif
5201 }
5202 
5203 // Interleaves the upper signed or unsigned 64-bit integer in a with the
5204 // upper signed or unsigned 64-bit integer in b.
5205 //
5206 //   r0 := a1
5207 //   r1 := b1
5208 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5209 {
5210     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5211     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5212     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5213 }
5214 
5215 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5216 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
5217 //
5218 //   index[2:0] := 0
5219 //   min[15:0] := a[15:0]
5220 //   FOR j := 0 to 7
5221 //       i := j*16
5222 //       IF a[i+15:i] < min[15:0]
5223 //           index[2:0] := j
5224 //           min[15:0] := a[i+15:i]
5225 //       FI
5226 //   ENDFOR
5227 //   dst[15:0] := min[15:0]
5228 //   dst[18:16] := index[2:0]
5229 //   dst[127:19] := 0
5230 //
5231 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
5232 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5233 {
5234     __m128i dst;
5235     uint16_t min, idx = 0;
5236     // Find the minimum value
5237 #if defined(__aarch64__)
5238     min = vminvq_u16(vreinterpretq_u16_m128i(a));
5239 #else
5240     __m64 tmp;
5241     tmp = vreinterpret_m64_u16(
5242         vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5243                  vget_high_u16(vreinterpretq_u16_m128i(a))));
5244     tmp = vreinterpret_m64_u16(
5245         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5246     tmp = vreinterpret_m64_u16(
5247         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5248     min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5249 #endif
5250     // Get the index of the minimum value
5251     int i;
5252     for (i = 0; i < 8; i++) {
5253         if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5254             idx = (uint16_t) i;
5255             break;
5256         }
5257         a = _mm_srli_si128(a, 2);
5258     }
5259     // Generate result
5260     dst = _mm_setzero_si128();
5261     dst = vreinterpretq_m128i_u16(
5262         vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5263     dst = vreinterpretq_m128i_u16(
5264         vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5265     return dst;
5266 }
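
// Usage sketch (illustrative only): reading the minimum value and its lane
// index back out of the result.
//
//   __m128i v = _mm_set_epi16(9, 8, 7, 3, 7, 8, 9, 10);
//   __m128i r = _mm_minpos_epu16(v);
//   int min = _mm_extract_epi16(r, 0);  // 3
//   int idx = _mm_extract_epi16(r, 1);  // 4 (fifth 16-bit lane of v)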
5267 
5268 // Concatenates a (upper) and b (lower), shifts the result right by c bytes, and returns the low 16 bytes.
5269 // https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
5270 // http://blog.csdn.net/hemmingway/article/details/44828303
5271 // Clang requires a macro here, as it is extremely picky about c being a
5272 // literal.
5273 #define _mm_alignr_epi8(a, b, c) \
5274     ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
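
// Usage sketch (illustrative only): with c = 4, the result holds bytes 4..19
// of the 32-byte concatenation b (lower half) : a (upper half).
//
//   __m128i a = _mm_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
//                            23, 22, 21, 20, 19, 18, 17, 16);
//   __m128i b = _mm_set_epi8(15, 14, 13, 12, 11, 10,  9,  8,
//                             7,  6,  5,  4,  3,  2,  1,  0);
//   __m128i r = _mm_alignr_epi8(a, b, 4);  // bytes (low..high): 4, 5, ..., 19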
5275 
5276 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5277 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5278 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5279 // otherwise set CF to 0. Return the CF value.
5280 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
5281 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5282 {
5283     int64x2_t s64 =
5284         vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5285                   vreinterpretq_s64_m128i(b));
5286     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5287 }
5288 
5289 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5290 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5291 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5292 // otherwise set CF to 0. Return the ZF value.
5293 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
5294 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5295 {
5296     int64x2_t s64 =
5297         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5298     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5299 }
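
// Usage sketch (illustrative only): _mm_testz_si128 returns 1 when a & b is
// all zeros; _mm_testc_si128 returns 1 when ~a & b is all zeros, i.e. every
// bit set in b is also set in a.
//
//   __m128i mask = _mm_set_epi32(0, 0, 0, 0x0f);
//   int z = _mm_testz_si128(mask, _mm_set_epi32(0, 0, 0, 0x30));  // 1
//   int c = _mm_testc_si128(mask, _mm_set_epi32(0, 0, 0, 0x03));  // 1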
5300 
5301 // Extracts the selected signed or unsigned 8-bit integer from a and zero
5302 // extends.
5303 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
5304 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
5305 
5306 // Inserts the least significant 8 bits of b into the selected 8-bit integer
5307 // of a.
5308 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
5309 //                                      __constrange(0,16) int imm)
5310 #define _mm_insert_epi8(a, b, imm)                                 \
5311     __extension__({                                                \
5312         vreinterpretq_m128i_s8(                                    \
5313             vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
5314     })
5315 
5316 // Extracts the selected signed or unsigned 16-bit integer from a and zero
5317 // extends.
5318 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
5319 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
5320 #define _mm_extract_epi16(a, imm) \
5321     vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
5322 
5323 // Inserts the least significant 16 bits of b into the selected 16-bit integer
5324 // of a.
5325 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
5326 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
5327 //                                       __constrange(0,8) int imm)
5328 #define _mm_insert_epi16(a, b, imm)                                  \
5329     __extension__({                                                  \
5330         vreinterpretq_m128i_s16(                                     \
5331             vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
5332     })
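
// Usage sketch (illustrative only): reading and replacing a single 16-bit
// lane. The lane index must be an integer constant expression.
//
//   __m128i v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
//   int lane3 = _mm_extract_epi16(v, 3);     // 3
//   __m128i w = _mm_insert_epi16(v, 42, 3);  // lane 3 becomes 42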
5333 
5334 // Extracts the selected signed or unsigned 32-bit integer from a and zero
5335 // extends.
5336 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
5337 #define _mm_extract_epi32(a, imm) \
5338     vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
5339 
5340 // Extracts the selected single-precision (32-bit) floating-point value from a as a 32-bit integer bit pattern.
5341 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
5342 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
5343 
5344 // Inserts the least significant 32 bits of b into the selected 32-bit integer
5345 // of a.
5346 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
5347 //                                       __constrange(0,4) int imm)
5348 #define _mm_insert_epi32(a, b, imm)                                  \
5349     __extension__({                                                  \
5350         vreinterpretq_m128i_s32(                                     \
5351             vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
5352     })
5353 
5354 // Extracts the selected signed or unsigned 64-bit integer from a and zero
5355 // extends.
5356 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
5357 #define _mm_extract_epi64(a, imm) \
5358     vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
5359 
5360 // Inserts the least significant 64 bits of b into the selected 64-bit integer
5361 // of a.
5362 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
5363 //                                       __constrange(0,2) int imm)
5364 #define _mm_insert_epi64(a, b, imm)                                  \
5365     __extension__({                                                  \
5366         vreinterpretq_m128i_s64(                                     \
5367             vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
5368     })
5369 
5370 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
5371 // return that count in dst.
5372 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
5373 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5374 {
5375 #if defined(__aarch64__)
5376 #if __has_builtin(__builtin_popcount)
5377     return __builtin_popcount(a);
5378 #else
5379     return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5380 #endif
5381 #else
5382     uint32_t count = 0;
5383     uint8x8_t input_val, count8x8_val;
5384     uint16x4_t count16x4_val;
5385     uint32x2_t count32x2_val;
5386 
5387     input_val = vcreate_u8((uint64_t) a);  // zero-extend a; avoids an 8-byte read from a 4-byte object
5388     count8x8_val = vcnt_u8(input_val);
5389     count16x4_val = vpaddl_u8(count8x8_val);
5390     count32x2_val = vpaddl_u16(count16x4_val);
5391 
5392     vst1_lane_u32(&count, count32x2_val, 0);  // store only lane 0 into the 32-bit count
5393     return count;
5394 #endif
5395 }
5396 
5397 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
5398 // return that count in dst.
5399 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
5400 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5401 {
5402 #if defined(__aarch64__)
5403 #if __has_builtin(__builtin_popcountll)
5404     return __builtin_popcountll(a);
5405 #else
5406     return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5407 #endif
5408 #else
5409     uint64_t count = 0;
5410     uint8x8_t input_val, count8x8_val;
5411     uint16x4_t count16x4_val;
5412     uint32x2_t count32x2_val;
5413     uint64x1_t count64x1_val;
5414 
5415     input_val = vld1_u8((uint8_t *) &a);
5416     count8x8_val = vcnt_u8(input_val);
5417     count16x4_val = vpaddl_u8(count8x8_val);
5418     count32x2_val = vpaddl_u16(count16x4_val);
5419     count64x1_val = vpaddl_u32(count32x2_val);
5420     vst1_u64(&count, count64x1_val);
5421     return count;
5422 #endif
5423 }
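
// Usage sketch (illustrative only):
//
//   int bits32     = _mm_popcnt_u32(0xF0F0F0F0u);            // 16
//   int64_t bits64 = _mm_popcnt_u64(0xFFFFFFFFFFFFFFFFull);  // 64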
5424 
5425 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
5426 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
5427 // transposed matrix in these vectors (row0 now contains column 0, etc.).
5428 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
5429 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
5430     do {                                                  \
5431         float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
5432         float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
5433         row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
5434                             vget_low_f32(ROW23.val[0]));  \
5435         row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
5436                             vget_low_f32(ROW23.val[1]));  \
5437         row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
5438                             vget_high_f32(ROW23.val[0])); \
5439         row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
5440                             vget_high_f32(ROW23.val[1])); \
5441     } while (0)
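
// Usage sketch (illustrative only): transposing a 4x4 row-major matrix; the
// four row variables are updated in place and end up holding the columns.
//
//   float m[16] = { /* ... 4x4 matrix, row-major ... */ };
//   __m128 r0 = _mm_loadu_ps(m + 0);
//   __m128 r1 = _mm_loadu_ps(m + 4);
//   __m128 r2 = _mm_loadu_ps(m + 8);
//   __m128 r3 = _mm_loadu_ps(m + 12);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0..r3 now hold the columns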
5442 
5443 /* Crypto Extensions */
5444 
5445 #if defined(__ARM_FEATURE_CRYPTO)
5446 // Wraps vmull_p64
5447 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5448 {
5449     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5450     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5451     return vreinterpretq_u64_p128(vmull_p64(a, b));
5452 }
5453 #else  // ARMv7 polyfill
5454 // ARMv7 and some AArch64 configurations lack vmull_p64, but they do have vmull_p8.
5455 //
5456 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5457 // 64-bit->128-bit polynomial multiply.
5458 //
5459 // It needs some work and is somewhat slow, but it is still faster than all
5460 // known scalar methods.
5461 //
5462 // Algorithm adapted to C from
5463 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5464 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
5465 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5466 // (https://hal.inria.fr/hal-01506572)
5467 static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5468 {
5469     poly8x8_t a = vreinterpret_p8_u64(_a);
5470     poly8x8_t b = vreinterpret_p8_u64(_b);
5471 
5472     // Masks
5473     uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5474                                     vcreate_u8(0x00000000ffffffff));
5475     uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5476                                     vcreate_u8(0x0000000000000000));
5477 
5478     // Do the multiplies, rotating with vext to get all combinations
5479     uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
5480     uint8x16_t e =
5481         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
5482     uint8x16_t f =
5483         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
5484     uint8x16_t g =
5485         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
5486     uint8x16_t h =
5487         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
5488     uint8x16_t i =
5489         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
5490     uint8x16_t j =
5491         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
5492     uint8x16_t k =
5493         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
5494 
5495     // Add cross products
5496     uint8x16_t l = veorq_u8(e, f);  // L = E + F
5497     uint8x16_t m = veorq_u8(g, h);  // M = G + H
5498     uint8x16_t n = veorq_u8(i, j);  // N = I + J
5499 
5500     // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5501     // instructions.
5502 #if defined(__aarch64__)
5503     uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5504         vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5505     uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5506         vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5507     uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5508         vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5509     uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5510         vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5511 #else
5512     uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5513     uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5514     uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5515     uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5516 #endif
5517     // t0 = (L) (P0 + P1) << 8
5518     // t1 = (M) (P2 + P3) << 16
5519     uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5520     uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5521     uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5522 
5523     // t2 = (N) (P4 + P5) << 24
5524     // t3 = (K) (P6 + P7) << 32
5525     uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5526     uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5527     uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5528 
5529     // De-interleave
5530 #if defined(__aarch64__)
5531     uint8x16_t t0 = vreinterpretq_u8_u64(
5532         vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5533     uint8x16_t t1 = vreinterpretq_u8_u64(
5534         vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5535     uint8x16_t t2 = vreinterpretq_u8_u64(
5536         vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5537     uint8x16_t t3 = vreinterpretq_u8_u64(
5538         vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5539 #else
5540     uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5541     uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5542     uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5543     uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5544 #endif
5545     // Shift the cross products
5546     uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
5547     uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
5548     uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
5549     uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
5550 
5551     // Accumulate the products
5552     uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5553     uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5554     uint8x16_t mix = veorq_u8(d, cross1);
5555     uint8x16_t r = veorq_u8(mix, cross2);
5556     return vreinterpretq_u64_u8(r);
5557 }
5558 #endif  // ARMv7 polyfill
5559 
5560 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5561 {
5562     uint64x2_t a = vreinterpretq_u64_m128i(_a);
5563     uint64x2_t b = vreinterpretq_u64_m128i(_b);
5564     switch (imm & 0x11) {
5565     case 0x00:
5566         return vreinterpretq_m128i_u64(
5567             _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5568     case 0x01:
5569         return vreinterpretq_m128i_u64(
5570             _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5571     case 0x10:
5572         return vreinterpretq_m128i_u64(
5573             _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5574     case 0x11:
5575         return vreinterpretq_m128i_u64(
5576             _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5577     default:
5578         abort();
5579     }
5580 }
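
// Usage sketch (illustrative only): bit 0 of imm selects the 64-bit half of
// _a (0 = low, 1 = high) and bit 4 selects the half of _b, matching the
// switch above. Carry-less multiplication of 3 (x + 1) by 5 (x^2 + 1) gives
// x^3 + x^2 + x + 1 = 15.
//
//   __m128i x = _mm_set_epi64x(0, 3);
//   __m128i y = _mm_set_epi64x(0, 5);
//   __m128i p = _mm_clmulepi64_si128(x, y, 0x00);  // low 64 bits: 15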
5581 
5582 #if !defined(__ARM_FEATURE_CRYPTO)
5583 /* clang-format off */
5584 #define SSE2NEON_AES_DATA(w)                                           \
5585     {                                                                  \
5586         w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
5587         w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
5588         w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
5589         w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
5590         w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
5591         w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
5592         w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
5593         w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
5594         w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
5595         w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
5596         w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
5597         w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
5598         w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
5599         w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
5600         w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
5601         w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
5602         w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
5603         w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
5604         w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
5605         w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
5606         w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
5607         w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
5608         w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
5609         w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
5610         w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
5611         w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
5612         w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
5613         w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
5614         w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
5615         w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
5616         w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
5617         w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
5618         w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
5619         w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
5620         w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
5621         w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
5622         w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
5623     }
5624 /* clang-format on */
5625 
5626 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
5627 #define SSE2NEON_AES_H0(x) (x)
5628 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
5629 #undef SSE2NEON_AES_H0
5630 
5631 // In the absence of crypto extensions, implement aesenc using regular neon
5632 // intrinsics instead. See:
5633 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5634 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5635 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
5636 // for more information. Reproduced with permission of the author.
5637 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5638 {
5639 #if defined(__aarch64__)
5640     static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5641                                          0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5642                                          0xc, 0x1, 0x6, 0xb};
5643     static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5644                                        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5645 
5646     uint8x16_t v;
5647     uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5648 
5649     // shift rows
5650     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5651 
5652     // sub bytes
5653     v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5654     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5655     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5656     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5657 
5658     // mix columns
5659     w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5660     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5661     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5662 
5663     //  add round key
5664     return vreinterpretq_m128i_u8(w) ^ RoundKey;
5665 
5666 #else /* ARMv7-A NEON implementation */
5667 #define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
5668     (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5669      (b0))
5670 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5671 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5672 #define SSE2NEON_AES_U0(p) \
5673     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5674 #define SSE2NEON_AES_U1(p) \
5675     SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5676 #define SSE2NEON_AES_U2(p) \
5677     SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5678 #define SSE2NEON_AES_U3(p) \
5679     SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5680     static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5681         SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5682         SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5683         SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5684         SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5685     };
5686 #undef SSE2NEON_AES_B2W
5687 #undef SSE2NEON_AES_F2
5688 #undef SSE2NEON_AES_F3
5689 #undef SSE2NEON_AES_U0
5690 #undef SSE2NEON_AES_U1
5691 #undef SSE2NEON_AES_U2
5692 #undef SSE2NEON_AES_U3
5693 
5694     uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5695     uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5696     uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5697     uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5698 
5699     __m128i out = _mm_set_epi32(
5700         (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5701          aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5702         (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5703          aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5704         (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5705          aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5706         (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5707          aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5708 
5709     return _mm_xor_si128(out, RoundKey);
5710 #endif
5711 }
5712 
5713 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5714 {
5715     /* FIXME: optimized for NEON */
5716     uint8_t v[4][4] = {
5717         [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5718                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5719                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5720                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5721         [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5722                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5723                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5724                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5725         [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5726                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5727                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5728                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5729         [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5730                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5731                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5732                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5733     };
5734     for (int i = 0; i < 16; i++)
5735         vreinterpretq_nth_u8_m128i(a, i) =
5736             v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5737     return a;
5738 }
5739 
5740 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5741 // This instruction generates a round key for AES encryption. See
5742 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5743 // for details.
5744 //
5745 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
5746 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5747 {
5748     uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5749     uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5750     for (int i = 0; i < 4; ++i) {
5751         ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5752         ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
5753     }
5754     return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
5755                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
5756 }
5757 #undef SSE2NEON_AES_DATA
5758 
5759 #else /* __ARM_FEATURE_CRYPTO */
5760 // Implements an equivalent of 'aesenc' by combining AESE (with an empty key) and
5761 // AESMC and then manually applying the real key as an xor operation. This
5762 // unfortunately means an additional xor op; the compiler should be able to
5763 // optimize this away for repeated calls however. See
5764 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
5765 // for more details.
5766 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
5767 {
5768     return vreinterpretq_m128i_u8(
5769         vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
5770         vreinterpretq_u8_m128i(b));
5771 }
5772 
5773 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
5774 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5775 {
5776     return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
5777                              vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
5778                          RoundKey);
5779 }
5780 
5781 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
5782 {
5783     // AESE does ShiftRows and SubBytes on A
5784     uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
5785 
5786     uint8x16_t dest = {
5787         // Undo ShiftRows step from AESE and extract X1 and X3
5788         u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
5789         u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
5790         u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
5791         u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
5792     };
5793     uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
5794     return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
5795 }
5796 #endif
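
// Usage sketch (illustrative only, with hypothetical variables): encrypting
// one block with AES-128 given a caller-provided plaintext block and an
// already expanded key schedule rk[0..10], following the usual AES-NI
// pattern.
//
//   __m128i block = _mm_xor_si128(plaintext, rk[0]);
//   for (int round = 1; round < 10; round++)
//       block = _mm_aesenc_si128(block, rk[round]);
//   block = _mm_aesenclast_si128(block, rk[10]);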
5797 
5798 /* Streaming Extensions */
5799 
5800 // Guarantees that every preceding store is globally visible before any
5801 // subsequent store.
5802 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
5803 FORCE_INLINE void _mm_sfence(void)
5804 {
5805     __sync_synchronize();
5806 }
5807 
5808 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
5809 // point elements) from a into memory using a non-temporal memory hint.
5810 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
5811 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
5812 {
5813 #if __has_builtin(__builtin_nontemporal_store)
5814     __builtin_nontemporal_store(a, (float32x4_t *) p);
5815 #else
5816     vst1q_f32(p, vreinterpretq_f32_m128(a));
5817 #endif
5818 }
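
// Usage sketch (illustrative only, hypothetical helper): streaming a buffer
// of floats to a 16-byte-aligned destination, then fencing so the
// non-temporal stores become globally visible.
//
//   void copy_stream(float *dst, const float *src, size_t n /* multiple of 4 */)
//   {
//       for (size_t i = 0; i < n; i += 4)
//           _mm_stream_ps(dst + i, _mm_loadu_ps(src + i));
//       _mm_sfence();
//   }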
5819 
5820 // Stores the data in a to the address p without polluting the caches.  If the
5821 // cache line containing address p is already in the cache, the cache will be
5822 // updated.
5823 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
5824 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5825 {
5826 #if __has_builtin(__builtin_nontemporal_store)
5827     __builtin_nontemporal_store(a, p);
5828 #else
5829     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5830 #endif
5831 }
5832 
5833 // Load 128-bits of integer data from memory into dst using a non-temporal
5834 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
5835 // general-protection exception may be generated.
5836 //
5837 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
5838 //
5839 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
5840 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
5841 {
5842 #if __has_builtin(__builtin_nontemporal_load)
5843     return __builtin_nontemporal_load(p);
5844 #else
5845     return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
5846 #endif
5847 }
5848 
5849 // Cache line containing p is flushed and invalidated from all caches in the
5850 // coherency domain.
5851 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
5852 FORCE_INLINE void _mm_clflush(void const *p)
5853 {
5854     (void) p;
5855     // no corresponding NEON/ARM operation is used here; this is a no-op
5856 }
5857 
5858 // Allocate aligned blocks of memory.
5859 // https://software.intel.com/en-us/
5860 //         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
5861 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
5862 {
5863     void *ptr;
5864     if (align == 1)
5865         return malloc(size);
5866     if (align == 2 || (sizeof(void *) == 8 && align == 4))
5867         align = sizeof(void *);
5868     if (!posix_memalign(&ptr, align, size))
5869         return ptr;
5870     return NULL;
5871 }
5872 
5873 FORCE_INLINE void _mm_free(void *addr)
5874 {
5875     free(addr);
5876 }
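
// Usage sketch (illustrative only): allocating a 16-byte-aligned scratch
// buffer suitable for the aligned load/store intrinsics.
//
//   float *buf = (float *) _mm_malloc(256 * sizeof(float), 16);
//   if (buf) {
//       /* ... use buf with _mm_load_ps / _mm_store_ps ... */
//       _mm_free(buf);
//   }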
5877 
5878 // Starting with the initial value in crc, accumulates a CRC32 value for
5879 // unsigned 8-bit integer v.
5880 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
5881 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
5882 {
5883 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5884     __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
5885                          : [c] "+r"(crc)
5886                          : [v] "r"(v));
5887 #else
5888     crc ^= v;
5889     for (int bit = 0; bit < 8; bit++) {
5890         if (crc & 1)
5891             crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
5892         else
5893             crc = (crc >> 1);
5894     }
5895 #endif
5896     return crc;
5897 }
5898 
5899 // Starting with the initial value in crc, accumulates a CRC32 value for
5900 // unsigned 16-bit integer v.
5901 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
5902 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
5903 {
5904 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5905     __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
5906                          : [c] "+r"(crc)
5907                          : [v] "r"(v));
5908 #else
5909     crc = _mm_crc32_u8(crc, v & 0xff);
5910     crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
5911 #endif
5912     return crc;
5913 }
5914 
5915 // Starting with the initial value in crc, accumulates a CRC32 value for
5916 // unsigned 32-bit integer v.
5917 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
5918 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
5919 {
5920 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5921     __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
5922                          : [c] "+r"(crc)
5923                          : [v] "r"(v));
5924 #else
5925     crc = _mm_crc32_u16(crc, v & 0xffff);
5926     crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
5927 #endif
5928     return crc;
5929 }
5930 
5931 // Starting with the initial value in crc, accumulates a CRC32 value for
5932 // unsigned 64-bit integer v.
5933 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
5934 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
5935 {
5936 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5937     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
5938                          : [c] "+r"(crc)
5939                          : [v] "r"(v));
5940 #else
5941     crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
5942     crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
5943 #endif
5944     return crc;
5945 }
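
// Usage sketch (illustrative only, hypothetical helper): accumulating CRC32-C
// over a byte buffer one byte at a time (the wider variants can be used for
// the aligned middle portion).
//
//   uint32_t crc32c_bytes(const uint8_t *p, size_t len)
//   {
//       uint32_t crc = 0xFFFFFFFF;
//       for (size_t i = 0; i < len; i++)
//           crc = _mm_crc32_u8(crc, p[i]);
//       return ~crc;
//   }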
5946 
5947 #if defined(__GNUC__) || defined(__clang__)
5948 #pragma pop_macro("ALIGN_STRUCT")
5949 #pragma pop_macro("FORCE_INLINE")
5950 #endif
5951 
5952 #if defined(__GNUC__)
5953 #pragma GCC pop_options
5954 #endif
5955 
5956 #endif
5957