1 //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina@intel.com
2 
3 //*** Copyright (C) 2012-2018 Intel Corporation.  All rights reserved.
4 
5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
6 
7 //By downloading, copying, installing or using the software you agree to this license.
8 //If you do not agree to this license, do not download, install, copy or use the software.
9 
10 //                              License Agreement
11 //Redistribution and use in source and binary forms, with or without modification,
12 //are permitted provided that the following conditions are met:
13 
14 //  * Redistributions of source code must retain the above copyright notice,
15 //    this list of conditions and the following disclaimer.
16 
17 //  * The name of the copyright holders may not be used to endorse or promote products
18 //    derived from this software without specific prior written permission.
19 
20 //This software is provided by the copyright holders and contributors "as is" and
21 //any express or implied warranties, including, but not limited to, the implied
22 //warranties of merchantability and fitness for a particular purpose are disclaimed.
23 //In no event shall the Intel Corporation or contributors be liable for any direct,
24 //indirect, incidental, special, exemplary, or consequential damages
25 //(including, but not limited to, procurement of substitute goods or services;
26 //loss of use, data, or profits; or business interruption) however caused
27 //and on any theory of liability, whether in contract, strict liability,
28 //or tort (including negligence or otherwise) arising in any way out of
29 //the use of this software, even if advised of the possibility of such damage.
30 
31 //*****************************************************************************************
32 // This file is intended to simplify ARM->IA32 porting
33 // It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
34 // and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below
35 //The MMX instruction set is not used due to its unavailability on x64 systems,
36 //its performance overhead and the need to use the EMMS instruction (_mm_empty()) for MMX-x87 floating point switching
37 //*****************************************************************************************
38 
39 //!!!!!!!!!!!!!!  To use this file just include it instead of "arm_neon.h" in your project that uses ARM NEON intrinsics and compile it as usual (see the usage sketch below),
40 //!!!!!!!!!!!!!!  but please pay attention to #define USE_SSE4 below - you might need to define it manually for the newest Intel Atom or any Intel Core platform for greater performance.
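//
//A minimal usage sketch (illustrative only; the file name "NEON_2_SSE.h" and the helper function are assumptions, not part of the original header):
//
//    #define USE_SSE4                         // optional, for SSE4.x capable CPUs
//    #include "NEON_2_SSE.h"                  // instead of #include "arm_neon.h"
//
//    static float32x4_t add4(float32x4_t a, float32x4_t b)
//    {
//        return vaddq_f32(a, b);              // the NEON intrinsic compiles to SSE code on x86
//    }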
41 
42 #ifndef NEON2SSE_H
43 #define NEON2SSE_H
44 
45 /*********************************************************************************************************************/
46 //!!!!!!!!!!!!!!
47 //if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, only SIMD instructions up to SSSE3 are used
48 //For older devices without SSE4 support it should be left undefined; for newer devices it should be defined, possibly manually if your compiler doesn't set the __SSE4_2__ predefine
49 #ifndef USE_SSE4
50 #   if defined(__SSE4_2__)
51 #       define USE_SSE4
52 #   endif
53 #endif
54 /*********************************************************************************************************************/
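//Illustrative build commands (the source file name app.c is an assumption, not part of the original header):
//    gcc -msse4.2 app.c            // gcc/clang predefine __SSE4_2__ here, so USE_SSE4 is picked up automatically above
//    cl  /DUSE_SSE4 app.c          // MSVC does not predefine __SSE4_2__, so USE_SSE4 may be defined manually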
55 
56 #include <xmmintrin.h>     //SSE
57 #include <emmintrin.h>     //SSE2
58 #include <pmmintrin.h>     //SSE3
59 #include <tmmintrin.h>     //SSSE3
60 #ifdef USE_SSE4
61 #   include <smmintrin.h> //SSE4.1
62 #   include <nmmintrin.h> //SSE4.2
63 #endif
64 
65 #include <math.h>
66 
67 //***************  functions and data attributes, compiler dependent  *********************************
68 //***********************************************************************************
69 #ifdef __GNUC__
70 #   define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
71 #   define _NEON2SSESTORAGE static
72 #   define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
73 #   define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
74 #   ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
75 #       if _GCC_VERSION <  40500
76 #           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
77 #       else
78 #           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
79 #       endif
80 #   else
81 #       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
82 #   endif
83 #   if defined(__x86_64__)
84 #       define _NEON2SSE_64BIT  __x86_64__
85 #   endif
86 #else
87 #   define _NEON2SSESTORAGE static
88 #   define _NEON2SSE_ALIGN_16  __declspec(align(16))
89 #   define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
90 #   if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
91 #       define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
92 #       if defined(_M_X64)
93 #           define _NEON2SSE_64BIT  _M_X64
94 #       endif
95 #   else
96 #       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
97 #   endif
98 #endif
99 
100 #if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
101 #   define _NEON2SSE_64BIT_SSE4
102 #endif
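//Illustrative expansion (hypothetical function "foo", not part of the header): wrapping a declaration as
//    _NEON2SSE_PERFORMANCE_WARNING(int foo(int x), "foo is slow")
//expands to "__attribute__((deprecated("foo is slow"))) int foo(int x)" for gcc >= 4.5
//and to "__declspec(deprecated("foo is slow")) int foo(int x)" for MSVC/Intel compilers,
//so every call site gets a compile-time warning carrying the given explanation.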
103 
104 /*********************************************************************************************************************/
105 //    data types conversion
106 /*********************************************************************************************************************/
107 #if defined(_MSC_VER) && (_MSC_VER < 1300)
108     typedef signed char int8_t;
109     typedef unsigned char uint8_t;
110     typedef signed short int16_t;
111     typedef unsigned short uint16_t;
112     typedef signed int int32_t;
113     typedef unsigned int uint32_t;
114     typedef signed long long int64_t;
115     typedef unsigned long long uint64_t;
116 #elif defined(_MSC_VER)
117     typedef signed __int8 int8_t;
118     typedef unsigned __int8 uint8_t;
119     typedef signed __int16 int16_t;
120     typedef unsigned __int16 uint16_t;
121     typedef signed __int32 int32_t;
122     typedef unsigned __int32 uint32_t;
123 
124     typedef signed long long int64_t;
125     typedef unsigned long long uint64_t;
126 #else
127 #   include <stdint.h>
128 #   include <limits.h>
129 #endif
130 
131 typedef union   __m64_128 {
132     uint64_t m64_u64[1];
133     float m64_f32[2];
134     int8_t m64_i8[8];
135     int16_t m64_i16[4];
136     int32_t m64_i32[2];
137     int64_t m64_i64[1];
138     uint8_t m64_u8[8];
139     uint16_t m64_u16[4];
140     uint32_t m64_u32[2];
141 } __m64_128;
142 
143 typedef __m64_128 int8x8_t;
144 typedef __m64_128 uint8x8_t;
145 typedef __m64_128 int16x4_t;
146 typedef __m64_128 uint16x4_t;
147 typedef __m64_128 int32x2_t;
148 typedef __m64_128 uint32x2_t;
149 typedef __m64_128 int64x1_t;
150 typedef __m64_128 uint64x1_t;
151 typedef __m64_128 poly8x8_t;
152 typedef __m64_128 poly16x4_t;
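//Lane-access sketch (illustrative; the variable name "d" is an assumption): all 64-bit "D register" types alias the same 8 bytes of the __m64_128 union, e.g.
//    int32x2_t d;
//    d.m64_i32[0] = 1;  d.m64_i32[1] = 2;      // the two 32-bit lanes
//    int16_t low = d.m64_i16[0];               // the same bytes viewed as four 16-bit lanes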
153 
154 typedef __m64_128 float32x2_t;
155 typedef __m128 float32x4_t;
156 
157 typedef __m128 float16x4_t; //not supported by IA, for compatibility
158 typedef __m128 float16x8_t; //not supported by IA, for compatibility
159 
160 typedef __m64_128 float64x1_t;
161 typedef __m128d float64x2_t;
162 
163 typedef __m128i int8x16_t;
164 typedef __m128i int16x8_t;
165 typedef __m128i int32x4_t;
166 typedef __m128i int64x2_t;
167 typedef __m128i uint8x16_t;
168 typedef __m128i uint16x8_t;
169 typedef __m128i uint32x4_t;
170 typedef __m128i uint64x2_t;
171 typedef __m128i poly8x16_t;
172 typedef __m128i poly16x8_t;
173 
174 #if defined(_MSC_VER)
175 #   define SINT_MIN     (-2147483647 - 1) /* min signed int value */
176 #   define SINT_MAX       2147483647 /* max signed int value */
177 #else
178 #   define SINT_MIN     INT_MIN /* min signed int value */
179 #   define SINT_MAX     INT_MAX /* max signed int value */
180 #endif
181 
182 typedef   float float32_t;
183 #if !defined(__clang__)
184 typedef   float __fp16;
185 #endif
186 
187 typedef   double float64_t;
188 
189 
190 typedef  uint8_t poly8_t;
191 typedef  uint16_t poly16_t;
192 
193 
194 //MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x types as function arguments, resulting in
195 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need a special trick for the functions that use these types
196 struct int8x16x2_t {
197     int8x16_t val[2];
198 };
199 struct int16x8x2_t {
200     int16x8_t val[2];
201 };
202 struct int32x4x2_t {
203     int32x4_t val[2];
204 };
205 struct int64x2x2_t {
206     int64x2_t val[2];
207 };
208 //Unfortunately we are unable to merge the two 64-bit values into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
209 struct int8x8x2_t {
210     int8x8_t val[2];
211 };
212 struct int16x4x2_t {
213     int16x4_t val[2];
214 };
215 struct int32x2x2_t {
216     int32x2_t val[2];
217 };
218 struct int64x1x2_t {
219     int64x1_t val[2];
220 };
221 
222 typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
223 typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
224 typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
225 typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
226 
227 typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
228 typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
229 typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
230 typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
231 
232 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
233 typedef struct int8x16x2_t uint8x16x2_t;
234 typedef struct int16x8x2_t uint16x8x2_t;
235 typedef struct int32x4x2_t uint32x4x2_t;
236 typedef struct int64x2x2_t uint64x2x2_t;
237 typedef struct int8x16x2_t poly8x16x2_t;
238 typedef struct int16x8x2_t poly16x8x2_t;
239 
240 typedef struct int8x8x2_t uint8x8x2_t;
241 typedef struct int16x4x2_t uint16x4x2_t;
242 typedef struct int32x2x2_t uint32x2x2_t;
243 typedef struct int64x1x2_t uint64x1x2_t;
244 typedef struct int8x8x2_t poly8x8x2_t;
245 typedef struct int16x4x2_t poly16x4x2_t;
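//Access sketch (illustrative; in NEON code such pairs usually come from vld2-style loads):
//    uint8x16x2_t pair;
//    uint8x16_t first  = pair.val[0];          // first 128-bit vector of the pair
//    uint8x16_t second = pair.val[1];          // second 128-bit vector of the pair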
246 
247 //float
248 struct float32x4x2_t {
249     float32x4_t val[2];
250 };
251 struct float16x8x2_t {
252     float16x8_t val[2];
253 };
254 struct float32x2x2_t {
255     float32x2_t val[2];
256 };
257 
258 typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
259 typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
260 typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
261 typedef  float16x8x2_t float16x4x2_t;
262 
263 //4
264 struct int8x16x4_t {
265     int8x16_t val[4];
266 };
267 struct int16x8x4_t {
268     int16x8_t val[4];
269 };
270 struct int32x4x4_t {
271     int32x4_t val[4];
272 };
273 struct int64x2x4_t {
274     int64x2_t val[4];
275 };
276 
277 struct int8x8x4_t {
278     int8x8_t val[4];
279 };
280 struct int16x4x4_t {
281     int16x4_t val[4];
282 };
283 struct int32x2x4_t {
284     int32x2_t val[4];
285 };
286 struct int64x1x4_t {
287     int64x1_t val[4];
288 };
289 
290 typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
291 typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
292 typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
293 typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
294 
295 typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
296 typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
297 typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
298 typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
299 
300 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
301 typedef struct int8x8x4_t uint8x8x4_t;
302 typedef struct int16x4x4_t uint16x4x4_t;
303 typedef struct int32x2x4_t uint32x2x4_t;
304 typedef struct int64x1x4_t uint64x1x4_t;
305 typedef struct int8x8x4_t poly8x8x4_t;
306 typedef struct int16x4x4_t poly16x4x4_t;
307 
308 typedef struct int8x16x4_t uint8x16x4_t;
309 typedef struct int16x8x4_t uint16x8x4_t;
310 typedef struct int32x4x4_t uint32x4x4_t;
311 typedef struct int64x2x4_t uint64x2x4_t;
312 typedef struct int8x16x4_t poly8x16x4_t;
313 typedef struct int16x8x4_t poly16x8x4_t;
314 
315 struct float32x4x4_t {
316     float32x4_t val[4];
317 };
318 struct float16x8x4_t {
319     float16x8_t val[4];
320 };
321 struct float32x2x4_t {
322     float32x2_t val[4];
323 };
324 
325 typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
326 typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
327 typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
328 typedef  float16x8x4_t float16x4x4_t;
329 
330 //3
331 struct int16x8x3_t {
332     int16x8_t val[3];
333 };
334 struct int32x4x3_t {
335     int32x4_t val[3];
336 };
337 struct int64x2x3_t {
338     int64x2_t val[3];
339 };
340 struct int8x16x3_t {
341     int8x16_t val[3];
342 };
343 
344 struct int16x4x3_t {
345     int16x4_t val[3];
346 };
347 struct int32x2x3_t {
348     int32x2_t val[3];
349 };
350 struct int64x1x3_t {
351     int64x1_t val[3];
352 };
353 struct int8x8x3_t {
354     int8x8_t val[3];
355 };
356 typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
357 typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
358 typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
359 typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
360 
361 typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
362 typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
363 typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
364 typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
365 
366 
367 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
368 typedef struct int8x16x3_t uint8x16x3_t;
369 typedef struct int16x8x3_t uint16x8x3_t;
370 typedef struct int32x4x3_t uint32x4x3_t;
371 typedef struct int64x2x3_t uint64x2x3_t;
372 typedef struct int8x16x3_t poly8x16x3_t;
373 typedef struct int16x8x3_t poly16x8x3_t;
374 typedef struct  int8x8x3_t uint8x8x3_t;
375 typedef struct  int16x4x3_t uint16x4x3_t;
376 typedef struct  int32x2x3_t uint32x2x3_t;
377 typedef struct  int64x1x3_t uint64x1x3_t;
378 typedef struct  int8x8x3_t poly8x8x3_t;
379 typedef struct  int16x4x3_t poly16x4x3_t;
380 
381 //float
382 struct float32x4x3_t {
383     float32x4_t val[3];
384 };
385 struct float32x2x3_t {
386     float32x2_t val[3];
387 };
388 struct float16x8x3_t {
389     float16x8_t val[3];
390 };
391 
392 typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
393 typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
394 typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
395 typedef  float16x8x3_t float16x4x3_t;
396 
397 
398 //****************************************************************************
399 //****** Porting auxiliary macros ********************************************
400 
401 //** floating point related macros **
402 #define _M128i(a) _mm_castps_si128(a)
403 #define _M128(a) _mm_castsi128_ps(a)
404 //here the most performance-effective implementation depends on the compiler and on the 32/64-bit build
405 #if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
406 #   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
407 #   define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
408 #   define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
409 #else
410    //for 32-bit gcc and Microsoft compiler builds
411 #   define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
412 #   define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
413 #   define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
414 #endif
415 #define _pM128(a) _mm_castsi128_ps(_pM128i(a))
416 
417 #define return64(a)  _M64(res64,a); return res64;
418 #define return64f(a)  _M64f(res64,a); return res64;
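//Illustrative pattern (a sketch of how a 64-bit ("D register") operation can be built from these macros;
//the body below is an assumption, not the actual library implementation):
//    _NEON2SSE_INLINE int32x2_t vadd_s32_sketch(int32x2_t a, int32x2_t b)
//    {
//        int32x2_t res64;                                      // return64 expects a local named res64
//        return64(_mm_add_epi32(_pM128i(a), _pM128i(b)));      // widen to __m128i, add, copy the low 64 bits back
//    }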
419 
420 #define _Ui64(a) (*(uint64_t*)&(a))
421 #define _UNSIGNED_T(a) u ## a
422 
423 #define _SIGNBIT64 ((uint64_t)1 << 63)
424 #define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
425 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
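//Examples (illustrative): _SWAP_HI_LOW32 is the immediate 0x4E selecting lanes {2,3,0,1}, so
//_mm_shuffle_epi32(v, _SWAP_HI_LOW32) swaps the high and low 64-bit halves of v; likewise
//_mm_insert_ps(a, b, _INSERTPS_NDX(s, d)) copies lane s of b into lane d of a (no lanes zeroed).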
426 
427 #define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
428 #define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
429 
430 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
431 #define __constrange(min,max)  const
432 #define __transfersize(size)
433 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
434 
435 //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
436 _NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,  9, 11, 13, 15 };
437 _NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8,  9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
438 //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
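//Usage sketch (illustrative): with SSSE3 pshufb these masks de-interleave even/odd lanes, e.g.
//    __m128i deint = _mm_shuffle_epi8(v, _mm_load_si128((const __m128i*)mask8_16_even_odd));
//collects the even-numbered bytes of v into the low 8 bytes and the odd-numbered bytes into the high 8 bytes.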
439 
440 //*************************************************************************
441 //*************************************************************************
442 //*********  Functions declarations as declared in original arm_neon.h *****
443 //*************************************************************************
444 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
445 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
446 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
447 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
448 _NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
449 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
450 _NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
451 _NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
452 _NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
453 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
454 _NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
455 _NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
456 _NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
457 _NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
458 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
459 _NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
460 _NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
461 _NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
462 _NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
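//Mapping sketch (illustrative, not the library's actual bodies): the 128-bit adds correspond one-to-one to SSE2,
//e.g. vaddq_s32(a,b) can be expressed as _mm_add_epi32(a,b) and vaddq_f32(a,b) as _mm_add_ps(a,b);
//the 64-bit ("d register") forms go through the _pM128i/return64 pattern shown above.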
463 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
464 _NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
465 _NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
466 _NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
467 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
468 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
469 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
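//Widening sketch (illustrative, assumes USE_SSE4; not the library's actual body): one possible x86 expression is
//    vaddl_s16(a,b) ~ _mm_add_epi32(_mm_cvtepi16_epi32(_pM128i(a)), _mm_cvtepi16_epi32(_pM128i(b)))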
470 //Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
471 _NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
472 _NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
473 _NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
474 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
475 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
476 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
477 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
478 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
479 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
480 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
481 _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
482 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
483 _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
484 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
485 _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
486 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
487 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
488 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
489 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
490 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
491 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
492 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
493 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
494 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
495 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
496 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
497 _NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
498 _NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
499 _NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
500 _NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
501 _NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
502 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
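//Note (illustrative): the unsigned 8/16-bit rounding halving adds match the SSE2 averages exactly,
//e.g. vrhaddq_u8(a,b) ~ _mm_avg_epu8(a,b) and vrhaddq_u16(a,b) ~ _mm_avg_epu16(a,b);
//the signed and 32-bit variants need additional correction steps.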
503 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
504 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
505 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
506 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
507 _NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
508 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
509 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
510 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
511 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
512 _NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
513 _NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
514 _NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
515 _NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
516 _NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
517 _NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
518 _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
519 _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
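//Mapping sketch (illustrative): the 8/16-bit saturating adds map directly to SSE2,
//e.g. vqaddq_s16(a,b) ~ _mm_adds_epi16(a,b) and vqaddq_u8(a,b) ~ _mm_adds_epu8(a,b);
//the 32- and 64-bit variants have no single SSE instruction and need explicit overflow handling.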
520 //Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
521 _NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
522 _NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
523 _NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
524 _NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
525 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
526 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
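//Worked lane example (illustrative) for vaddhn_s16: each 16-bit sum keeps only its high 8 bits,
//e.g. 0x0123 + 0x0012 = 0x0135 -> result byte 0x01.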
527 //Vector rounding add high half: vraddhn
528 _NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
529 _NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
530 _NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
531 _NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
532 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
533 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
534 //Multiplication
535 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
536 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
537 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
539 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
540 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
541 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
542 _NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
543 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
544 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
545 _NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
546 _NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
547 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
548 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
549 _NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
550 _NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
551 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
552 //multiply lane
553 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
554 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
555 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
556 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
557 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
558 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
559 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
560 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
561 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
562 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
563 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
564 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
565 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
566 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
567 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
568 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
569 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
570 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
571 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
572 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
573 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
574 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
575 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
576 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
577 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
578 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
579 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
580 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
581 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
582 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
583 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
584 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
585 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
586 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
587 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
588 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
589 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
590 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
591 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
592 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
593 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
594 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
595 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
596 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
597 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
598 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
599 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
600 //Vector multiply subtract long
601 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
602 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
603 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
604 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
605 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
606 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
607 //Vector saturating doubling multiply high
608 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
609 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
610 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
611 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
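//Worked lane example (illustrative) for vqdmulh_s16: result = saturate((2*a*b) >> 16),
//e.g. a = 0x4000, b = 0x2000 gives (2*16384*8192) >> 16 = 0x1000;
//only a = b = 0x8000 overflows and saturates to 0x7FFF.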
612 //Vector saturating rounding doubling multiply high
613 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
614 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
615 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
616 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
617 //Vector saturating doubling multiply accumulate long
618 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
619 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
620 //Vector saturating doubling multiply subtract long
621 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
622 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
623 //Vector long multiply
624 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
625 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
626 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
627 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
628 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
629 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
630 _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
631 //Vector saturating doubling long multiply
632 _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
633 _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
634 //Subtraction
635 //Vector subtract
636 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
637 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
638 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
639 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
640 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
641 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
642 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
643 _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
644 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
645 _NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
646 _NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
647 _NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
648 _NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
649 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
650 _NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
651 _NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
652 _NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
653 _NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
654 //Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
655 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
656 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
657 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
658 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
659 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
660 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
661 //Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
662 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
663 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
664 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
665 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
666 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
667 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
668 //Vector saturating subtract
669 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
670 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
671 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
672 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
673 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
674 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
675 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
676 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
677 _NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
678 _NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
679 _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
680 _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
681 _NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
682 _NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
683 _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
684 _NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
685 //Vector halving subtract
686 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
687 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
688 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
689 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
690 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
691 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
692 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
693 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
694 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
695 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
696 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
697 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
698 //Vector subtract high half
699 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
700 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
701 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
702 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
703 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
704 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
705 //Vector rounding subtract high half
706 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
707 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
708 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
709 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
710 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
711 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
712 //Comparison
713 //Vector compare equal
714 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
715 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
716 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
717 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
718 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
719 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
720 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
721 _NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
722 _NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
723 _NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
724 _NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
725 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
726 _NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
727 _NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
728 _NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
729 _NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
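//Result convention (illustrative): each comparison lane is all ones (e.g. 0xFFFFFFFF) where the condition
//holds and all zeros where it does not, matching the _mm_cmpeq_epi32 / _mm_cmpeq_ps SSE convention.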
730 //Vector compare greater-than or equal
731 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
732 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
733 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
734 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
735 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
736 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
737 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
738 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
739 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
740 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
741 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
742 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
743 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
744 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
745 //Vector compare less-than or equal
746 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
747 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
748 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
749 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
750 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
751 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
752 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
753 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
754 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
755 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
756 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
757 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
758 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
759 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
760 //Vector compare greater-than
761 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
762 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
763 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
764 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
765 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
766 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
767 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
768 _NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
769 _NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
770 _NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
771 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
772 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
773 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
774 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
775 //Vector compare less-than
776 _NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
777 _NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
778 _NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
779 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
780 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
781 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
782 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
783 _NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
784 _NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
785 _NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
786 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
787 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
788 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
789 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
790 //Vector compare absolute greater-than or equal
791 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
792 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
793 //Vector compare absolute less-than or equal
794 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
795 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
796 //Vector compare absolute greater-than
797 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
798 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
799 //Vector compare absolute less-than
800 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
801 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
802 //Vector test bits
803 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
804 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
805 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
806 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
807 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
808 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
809 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
810 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
811 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
812 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
813 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
814 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
815 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
816 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
817 //Absolute difference
818 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
819 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
820 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
821 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
822 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
823 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
824 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
825 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
826 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
827 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
828 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
829 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
830 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
831 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
832 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
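//Mapping sketch (illustrative, not the library's actual body): the unsigned byte case can be built from
//saturating subtracts, e.g. vabdq_u8(a,b) ~ _mm_or_si128(_mm_subs_epu8(a,b), _mm_subs_epu8(b,a)).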
833 //Absolute difference - long
834 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
835 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
836 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
837 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
838 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
839 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
840 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
841 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
842 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
843 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
844 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
845 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
846 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
847 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
848 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
849 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
850 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
851 _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
852 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
853 //Absolute difference and accumulate - long
854 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
855 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
856 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
857 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
858 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
859 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
860 //Max/Min
861 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
862 _NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
863 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
864 _NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
865 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
866 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
867 _NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
868 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
869 _NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
870 _NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
871 _NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
872 _NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
873 _NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
874 _NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
875 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
876 
877 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
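//Mapping sketch (illustrative): vmaxq_f32 ~ _mm_max_ps, vmaxq_s16 ~ _mm_max_epi16 and vmaxq_u8 ~ _mm_max_epu8 (SSE2);
//the s8/u16/s32/u32 cases map to _mm_max_epi8/_mm_max_epu16/_mm_max_epi32/_mm_max_epu32 only with SSE4.1.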
878 
879 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
880 _NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
881 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
882 _NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
883 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
884 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
885 _NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
886 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
887 _NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
888 _NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
889 _NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
890 _NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
891 _NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
892 _NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
893 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
894 
895 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
896 
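//Usage sketch (illustrative only; names are hypothetical): a per-lane clamp of four floats to [0,1]
//built from vmaxq_f32/vminq_f32; vld1q_f32 and vdupq_n_f32 are declared elsewhere in this header.
//    float32x4_t x = vld1q_f32(src);                                     // 4 input values
//    x = vminq_f32(vmaxq_f32(x, vdupq_n_f32(0.0f)), vdupq_n_f32(1.0f));  // clamp each lane to [0,1]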
//Pairwise addition
//Pairwise add
_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
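//Result layout note and sketch (illustrative only; a and b are hypothetical): vpadd sums adjacent
//pairs of each operand and concatenates the two halves, e.g. for 16-bit lanes:
//    int16x4_t r = vpadd_s16(a, b);            // r = {a0+a1, a2+a3, b0+b1, b2+b3}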
//Long pairwise add
_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
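//Usage sketch (illustrative only; 'bytes' is hypothetical): vpaddl widens while summing adjacent
//pairs, so chaining it reduces a whole vector to a few wide partial sums without overflow.
//    uint16x8_t s16 = vpaddlq_u8(bytes);       // 8 sums of adjacent byte pairs
//    uint32x4_t s32 = vpaddlq_u16(s16);        // 4 sums of adjacent 16-bit pairs
//    uint64x2_t s64 = vpaddlq_u32(s32);        // 2 partial sums covering all 16 input bytes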
//Long pairwise add and accumulate
_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
//Folding maximum vpmax -> takes maximum of adjacent pairs
_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
//Folding minimum vpmin -> takes minimum of adjacent pairs
_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
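//Usage sketch (illustrative only; 'v' is hypothetical): repeated folding with vpmax performs a
//horizontal maximum; after log2(lanes) steps every lane holds the maximum of the original vector.
//    uint8x8_t m = vpmax_u8(v, v);             // maxima of adjacent pairs
//    m = vpmax_u8(m, m);                       // maxima of quadruples
//    m = vpmax_u8(m, m);                       // every lane now holds the max of all 8 input bytes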
//Reciprocal/Sqrt: Newton-Raphson refinement steps (vrecps, vrsqrts)
_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
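//Usage sketch (illustrative only; 'd' and 'x' are hypothetical): vrecps/vrsqrts are the Newton-Raphson
//step operations that refine the rough estimates produced by vrecpe/vrsqrte (declared elsewhere in
//this header). One or two steps refine an approximate reciprocal of d:
//    float32x4_t x = vrecpeq_f32(d);           // rough initial estimate of 1/d
//    x = vmulq_f32(vrecpsq_f32(d, x), x);      // x = x * (2 - d*x), one refinement step
//    x = vmulq_f32(vrecpsq_f32(d, x), x);      // second step for higher precision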
//Shifts by signed variable
//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
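//Usage sketch (illustrative only; 'v' and 'counts' are hypothetical): the shift counts come from a
//signed vector, so a single call can mix directions, e.g. counts = {1,-1,4,-4} shifts lanes 0 and 2
//left and lanes 1 and 3 right.
//    int32x4_t r = vshlq_s32(v, counts);       // r[i] = counts[i] >= 0 ? v[i] << counts[i] : v[i] >> -counts[i]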
//Vector saturating shift left: (negative values shift right)
_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
//Vector rounding shift left: (negative values shift right)
_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
//Vector saturating rounding shift left: (negative values shift right)
_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
//Shifts by a constant
//Vector shift right by constant
_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
//Vector shift left by constant
_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
_NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
_NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
_NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
//Vector rounding shift right by constant
_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
//Vector shift right by constant and accumulate
_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
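//Usage sketch (illustrative only; 'acc' and 'v' are hypothetical): shift-right-and-accumulate adds a
//scaled term to a running sum in one call.
//    uint16x8_t r = vsraq_n_u16(acc, v, 2);    // r[i] = acc[i] + (v[i] >> 2)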
//Vector rounding shift right by constant and accumulate
_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
//Vector saturating shift left by constant
_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
//Vector signed->unsigned saturating shift left by constant
_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
//Vector narrowing shift right by constant
_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
//Vector signed->unsigned narrowing saturating shift right by constant
_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
//Vector signed->unsigned rounding narrowing saturating shift right by constant
_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
//Vector narrowing saturating shift right by constant
_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
//Vector rounding narrowing shift right by constant
_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
//Vector rounding narrowing saturating shift right by constant
_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
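//Usage sketch (illustrative only; 'sum16' is hypothetical): the narrowing shifts are the usual way
//back from a widened intermediate, e.g. a rounded, saturated divide of 16-bit sums by 256 into bytes.
//    uint8x8_t out = vqrshrun_n_s16(sum16, 8); // out[i] = saturate_u8((sum16[i] + 128) >> 8)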
//Vector widening shift left by constant
_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
//Shifts with insert
//Vector shift right and insert
_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
_NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
_NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
_NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
_NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
_NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
//Vector shift left and insert
_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
_NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
_NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
_NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
_NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
_NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
_NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
_NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
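//Semantics note and usage sketch (illustrative only; r8, g8, b8 are hypothetical uint8x8_t channels):
//VSRI shifts the second operand right and inserts it into the first, leaving the top 'c' bits of each
//destination lane untouched; VSLI is the mirror image and preserves the low 'c' bits. A classic use is
//packing RGB565 pixels from 8-bit channels with vshll_n_u8 (declared above) and vsriq_n_u16:
//    uint16x8_t px = vshll_n_u8(r8, 8);            // red in bits 15..8 of each 16-bit lane
//    px = vsriq_n_u16(px, vshll_n_u8(g8, 8), 5);   // keep red[7:3], insert green below it
//    px = vsriq_n_u16(px, vshll_n_u8(b8, 8), 11);  // keep red/green, insert blue[7:3] in bits 4..0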
//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
//Load a single vector from memory
_NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]

_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]

//Load a single lane from memory
_NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
//Load all lanes of vector with same value from memory
_NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
//Store a single vector or lane. Stores all lanes or a single lane of a vector.
//Store a single vector into memory
_NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
_NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
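//Usage sketch (illustrative only; src_a, src_b and dst are hypothetical pointers): a typical
//load / compute / store round trip over 16 bytes, using vaddq_u8 declared earlier in this header.
//    uint8x16_t va = vld1q_u8(src_a);          // 16 bytes from the first source
//    uint8x16_t vb = vld1q_u8(src_b);          // 16 bytes from the second source
//    vst1q_u8(dst, vaddq_u8(va, vb));          // store the 16 per-byte sums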
//Store a lane of a vector into memory
//Loads of an N-element structure
//Load N-element structure from memory
_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
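//Usage sketch (illustrative only; rgba_ptr and gray_ptr are hypothetical): the structure loads
//de-interleave on the fly, e.g. vld4q_u8 splits 16 packed RGBA pixels into four channel vectors
//exposed through the .val[] members of the x4 struct type.
//    uint8x16x4_t px = vld4q_u8(rgba_ptr);     // px.val[0]=R, px.val[1]=G, px.val[2]=B, px.val[3]=A
//    vst1q_u8(gray_ptr, px.val[1]);            // e.g. keep just the green channel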
//Load all lanes of N-element structure with same value from memory
_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1427 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1428 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1429 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1430 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
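//A minimal usage sketch for the _dup loads above (illustrative only; the data and names are made up):
//    static const int16_t coeff_pair[2] = { 3, -5 };
//    int16x4x2_t c = vld2_dup_s16(coeff_pair);
//    //c.val[0] holds {3,3,3,3} and c.val[1] holds {-5,-5,-5,-5}: each loaded element is broadcast to a whole vector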
1431 //Load a single lane of N-element structure from memory
1432 //The functions below whose names end in _ptr take 'src' by pointer to work around MSVC error C2719 ('src': formal parameter with __declspec(align('16')) won't be aligned); a usage sketch follows this group
1433 _NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1434 _NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1435 _NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1436 _NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1437 _NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1438 _NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1439 _NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1440 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1441 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1442 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1443 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1444 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
1445 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
1446 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1447 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1448 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t  src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1449 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t  src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1450 _NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1451 _NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1452 _NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1453 _NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1454 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1455 _NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1456 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1457 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1458 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1459 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1460 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1461 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1462 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1463 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1464 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1465 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1466 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1467 _NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1468 _NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1469 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1470 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1471 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1472 _NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1473 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1474 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1475 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1476 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1477 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1478 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1479 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1480 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1481 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1482 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1483 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
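//Usage sketch for the lane loads above (illustrative only; arrays and values are made up).
//Note the &q in the q-form call: the _ptr variants take 'src' by pointer, while the d-forms keep the NEON by-value signature.
//    uint16_t interleaved[16] = { 0 };                //8 element pairs, zero filled
//    uint16_t two_vals[2] = { 10, 20 };
//    uint16x8x2_t q = vld2q_u16(interleaved);         //plain de-interleaving load
//    q = vld2q_lane_u16_ptr(two_vals, &q, 3);         //lane 3 of q.val[0] becomes 10, lane 3 of q.val[1] becomes 20
//    uint16x4x2_t d = vld2_u16(interleaved);
//    d = vld2_lane_u16(two_vals, d, 1);               //same idea with the standard by-value 'src'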
1484 //Store N-element structure to memory
1485 _NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1486 _NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1487 _NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1488 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1489 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1490 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1491 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1492 _NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1493 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1494 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1495 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1496 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1497 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
1498 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
1499 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1500 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1501 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
1502 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
1503 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1504 _NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
1505 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1506 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1507 _NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1508 _NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1509 _NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1510 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1511 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1512 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1513 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1514 _NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1515 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1516 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1517 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1518 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1519 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1520 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
1521 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1522 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1523 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1524 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
1525 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1526 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1527 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1528 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1529 _NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1530 _NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1531 _NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1532 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1533 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1534 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1535 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1536 _NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1537 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1538 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1539 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1540 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
1541 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1542 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
1543 _NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1544 _NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
1545 _NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1546 _NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
1547 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
1548 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1549 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1550 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
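//Usage sketch for the interleaving stores above (illustrative only; array names are made up):
//    uint8_t plane_a[8] = {0,1,2,3,4,5,6,7};
//    uint8_t plane_b[8] = {10,11,12,13,14,15,16,17};
//    uint8_t interleaved[16];
//    uint8x8x2_t pair;
//    pair.val[0] = vld1_u8(plane_a);
//    pair.val[1] = vld1_u8(plane_b);
//    vst2_u8(interleaved, pair);                      //interleaved becomes {0,10,1,11,2,12,...}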
1551 //Store a single lane of N-element structure to memory
1552 _NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1553 _NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1554 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1555 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1556 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1557 _NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
1558 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1559 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1560 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1561 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1562 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
1563 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1564 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1565 _NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1566 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1567 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1568 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1569 _NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1570 _NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1571 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1572 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1573 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1574 _NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
1575 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1576 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1577 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1578 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1579 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
1580 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1581 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1582 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1583 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1584 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1585 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1586 _NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1587 _NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1588 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1589 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1590 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1591 _NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1592 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1593 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1594 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1595 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1596 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
1597 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1598 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1599 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1600 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1601 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1602 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
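//Usage sketch for the lane stores above (illustrative only; data is made up):
//    uint16_t src16[8] = {1,2,3,4,5,6,7,8};
//    uint16_t out2[2];
//    uint16x4x2_t v = vld2_u16(src16);                //v.val[0] = {1,3,5,7}, v.val[1] = {2,4,6,8}
//    vst2_lane_u16(out2, v, 2);                       //out2 becomes {5, 6}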
1603 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
1604 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1605 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1606 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1607 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
1608 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
1609 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1610 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1611 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1612 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1613 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1614 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1615 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1616 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
1617 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
1618 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1619 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1620 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1621 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1622 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1623 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1624 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
1625 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
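//Usage sketch for the lane extracts above (illustrative only; the lane index must be a compile-time constant, see __constrange):
//    int32_t four[4] = { 7, 8, 9, 10 };
//    int32x4_t v = vld1q_s32(four);
//    int32_t third = vgetq_lane_s32(v, 2);            //third == 9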
1626 //Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
1627 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1628 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1629 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1630 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1631 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1632 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1633 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1634 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1635 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1636 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1637 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1638 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1639 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1640 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1641 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1642 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1643 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1644 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1645 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1646 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1647 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
1648 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
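//Usage sketch for the lane sets above (illustrative only; values are made up):
//    float32_t vals[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//    float32x4_t v = vld1q_f32(vals);
//    v = vsetq_lane_f32(9.0f, v, 1);                  //v becomes {1.0f, 9.0f, 3.0f, 4.0f}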
1649 //Initialize a vector from a literal bit pattern.
1650 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
1651 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
1652 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
1653 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
1654 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
1655 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
1656 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
1657 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
1658 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
1659 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
1660 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
1661 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
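//Usage sketch for vcreate (illustrative only): bits 0..7 of the literal land in lane 0, bits 8..15 in lane 1, and so on.
//    uint8x8_t ramp = vcreate_u8(0x0706050403020100ULL);   //lanes are {0,1,2,3,4,5,6,7}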
1662 //Set all lanes to the same value
1663 //Load all lanes of the vector to the same literal value
1664 _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
1665 _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
1666 _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
1667 _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
1668 _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
1669 _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
1670 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
1671 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
1672 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
1673 _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
1674 _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
1675 _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
1676 _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
1677 _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
1678 _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
1679 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
1680 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
1681 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
1682 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
1683 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
1684 _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
1685 _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
1686 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
1687 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
1688 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
1689 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
1690 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
1691 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
1692 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
1693 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
1694 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
1695 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
1696 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
1697 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
1698 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
1699 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
1700 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
1701 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
1702 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
1703 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
1704 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
1705 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
1706 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
1707 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
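//Usage sketch for the splat operations above (illustrative only):
//    uint8x16_t all_ones = vdupq_n_u8(0xFF);          //every byte lane set to 0xFF
//    float32x4_t halves = vdupq_n_f32(0.5f);          //every float lane set to 0.5f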
1708 //Load all lanes of the vector to the value of a lane of a vector
1709 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1710 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1711 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1712 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1713 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1714 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1715 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1716 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1717 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1718 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1719 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1720 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1721 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1722 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1723 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1724 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1725 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1726 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1727 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1728 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1729 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
1730 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
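//Usage sketch for the lane broadcasts above (illustrative only; data is made up):
//    float32_t two[2] = { 1.5f, 2.5f };
//    float32x2_t d = vld1_f32(two);
//    float32x4_t q = vdupq_lane_f32(d, 1);            //q becomes {2.5f, 2.5f, 2.5f, 2.5f}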
1731 //Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
1732 _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
1733 _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
1734 _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
1735 _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
1736 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
1737 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
1738 _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
1739 _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
1740 _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
1741 _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
1742 _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
1743 _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
1744 //Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors.
1745 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
1746 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
1747 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
1748 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
1749 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
1750 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
1751 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
1752 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
1753 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
1754 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
1755 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
1756 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
1757 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
1758 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
1759 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
1760 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
1761 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
1762 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
1763 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
1764 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
1765 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
1766 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
1767 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
1768 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
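//Usage sketch for combining and splitting (illustrative only; data is made up):
//    int16_t eight[8] = {0,1,2,3,4,5,6,7};
//    int16x8_t q = vld1q_s16(eight);
//    int16x4_t lo = vget_low_s16(q);                  //{0,1,2,3}
//    int16x4_t hi = vget_high_s16(q);                 //{4,5,6,7}
//    int16x8_t q2 = vcombine_s16(lo, hi);             //same contents as q again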
1769 //Converting vectors. These intrinsics are used to convert vectors.
1770 //Convert from float
1771 _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
1772 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
1773 _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
1774 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
1775 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
1776 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
1777 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
1778 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
1779 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
1780 //Convert to float
1781 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
1782 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
1783 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
1784 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
1785 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
1786 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
1787 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
1788 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
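//Usage sketch for the _n_ (fixed-point) conversions above (illustrative only): 'b' is the number of fraction bits, i.e. a scale of 2^b.
//    float32_t in[4] = { 0.25f, 0.5f, 1.0f, 2.0f };
//    float32x4_t f = vld1q_f32(in);
//    int32x4_t fx = vcvtq_n_s32_f32(f, 8);            //Q24.8 values {64, 128, 256, 512}
//    float32x4_t back = vcvtq_n_f32_s32(fx, 8);       //back to {0.25f, 0.5f, 1.0f, 2.0f}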
1789 //Convert between floats
1790 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
1791 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
1792 //Vector narrow integer
1793 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
1794 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
1795 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
1796 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
1797 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
1798 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
1799 //Vector long move
1800 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
1801 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
1802 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
1803 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
1804 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
1805 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
1806 //Vector saturating narrow integer
1807 _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
1808 _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
1809 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
1810 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
1811 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
1812 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
1813 //Vector saturating narrow integer signed->unsigned
1814 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
1815 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
1816 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
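//Usage sketch contrasting the narrowing variants above (illustrative only; data is made up):
//    int16_t wide[8] = { 300, -200, 0, 0, 0, 0, 0, 0 };
//    int16x8_t w = vld1q_s16(wide);
//    int8x8_t t = vmovn_s16(w);                       //plain truncation: lanes 0,1 become 44, 56 (low bytes)
//    int8x8_t s = vqmovn_s16(w);                      //signed saturation: lanes 0,1 become 127, -128
//    uint8x8_t u = vqmovun_s16(w);                    //unsigned saturation: lanes 0,1 become 255, 0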
1817 //Table look up
1818 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1819 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
1820 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1821 //Extended table look up intrinsics
1822 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1823 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
1824 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1825 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1826 _NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1827 _NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1828 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1829 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1830 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1831 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1832 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1833 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
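//Usage sketch for the table lookups above (illustrative only; data is made up). vtbl1 returns 0 for indices >= 8, vtbx1 keeps the destination lane instead.
//    uint8_t table[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
//    uint8_t idx[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
//    uint8x8_t rev = vtbl1_u8(vld1_u8(table), vld1_u8(idx));   //rev == {17,16,15,14,13,12,11,10}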
1834 //Operations with a scalar value
1835 //Vector multiply accumulate with scalar
1836 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1837 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1838 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1839 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1840 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
1841 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
1842 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
1843 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
1844 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
1845 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
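//Usage sketch for the lane multiply-accumulate above (illustrative only; data is made up): result = a + b * v[l], with l a compile-time constant.
//    int16_t av[4] = {1,1,1,1}, bv[4] = {2,2,2,2}, vv[4] = {0,5,0,0};
//    int16x4_t acc = vmla_lane_s16(vld1_s16(av), vld1_s16(bv), vld1_s16(vv), 1);   //acc == {11,11,11,11}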
1846 //Vector widening multiply accumulate with scalar
1847 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
1848 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
1849 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
1850 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
1851 //Vector widening saturating doubling multiply accumulate with scalar
1852 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
1853 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
1854 //Vector multiply subtract with scalar
1855 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1856 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1857 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1858 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1859 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
1860 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
1861 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
1862 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
1863 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
1864 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
1865 //Vector widening multiply subtract with scalar
1866 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
1867 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
1868 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
1869 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
1870 //Vector widening saturating doubling multiply subtract with scalar
1871 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
1872 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
1873 //Vector multiply by scalar
1874 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
1875 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
1876 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
1877 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
1878 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
1879 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
1880 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
1881 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
1882 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
1883 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
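//Usage sketch for the multiply-by-scalar operations above (illustrative only):
//    float32_t in4[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//    float32x4_t v4 = vld1q_f32(in4);
//    float32x4_t v4x3 = vmulq_n_f32(v4, 3.0f);        //{3.0f, 6.0f, 9.0f, 12.0f}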
1884 //Vector long multiply with scalar
1885 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
1886 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
1887 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
1888 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
1889 //Vector long multiply by scalar
1890 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
1891 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
1892 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
1893 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
1894 //Vector saturating doubling long multiply with scalar
1895 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
1896 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
1897 //Vector saturating doubling long multiply by scalar
1898 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
1899 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
1900 //Vector saturating doubling multiply high with scalar
1901 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
1902 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
1903 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
1904 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
1905 //Vector saturating doubling multiply high by scalar
1906 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
1907 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
1908 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
1909 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
1910 //Vector saturating rounding doubling multiply high with scalar
1911 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
1912 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
1913 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
1914 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
1915 //Vector rounding saturating doubling multiply high by scalar
1916 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
1917 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
1918 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
1919 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
1920 //Vector multiply accumulate with scalar
1921 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
1922 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
1923 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
1924 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
1925 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
1926 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
1927 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
1928 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
1929 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
1930 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
1931 //Vector widening multiply accumulate with scalar
1932 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
1933 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
1934 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
1935 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
1936 //Vector widening saturating doubling multiply accumulate with scalar
1937 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
1938 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
1939 //Vector multiply subtract with scalar
1940 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
1941 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
1942 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
1943 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
1944 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
1945 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
1946 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
1947 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
1948 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
1949 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
1950 //Vector widening multiply subtract with scalar
1951 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
1952 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
1953 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
1954 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
1955 //Vector widening saturating doubling multiply subtract with scalar
1956 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
1957 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
1958 //Vector extract
1959 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1960 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1961 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1962 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1963 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1964 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1965 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1966 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1967 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1968 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1969 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1970 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1971 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1972 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1973 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1974 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1975 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1976 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1977 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1978 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1979 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1980 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1981 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
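//For example (illustrative): vrev64_u8 applied to the bytes {0,1,2,3,4,5,6,7} yields {7,6,5,4,3,2,1,0} - the 8-bit lanes are reversed within the 64-bit doubleword.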
1982 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1983 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1984 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1985 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1986 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1987 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
1988 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
1989 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
1990 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
1991 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
1992 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
1993 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
1994 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
1995 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
1996 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
1997 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
1998 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
1999 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
2000 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
2001 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
2002 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
2003 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
2004 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
2005 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
2006 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
2007 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
2008 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
2009 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
2010 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
2011 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
2012 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
2013 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
2014 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
2015 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
2016 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
2017 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
2018 //Other single operand arithmetic
2019 //Absolute: Vd[i] = |Va[i]|
2020 _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
2021 _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
2022 _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
2023 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
2024 _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
2025 _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
2026 _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
2027 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
2028 
2029 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
2030 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
2031 
2032 //Saturating absolute: Vd[i] = sat(|Va[i]|)
2033 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
2034 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
2035 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
2036 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
2037 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
2038 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
2039 //Negate: Vd[i] = - Va[i]
2040 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
2041 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
2042 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
2043 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
2044 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
2045 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
2046 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
2047 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
2048 //Saturating Negate: sat(Vd[i] = - Va[i])
2049 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
2050 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
2051 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
2052 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
2053 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
2054 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2055 //Count leading sign bits
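//For example (illustrative): vcls_s8 on the value 0xF0 (1111 0000b) returns 3 - three bits following the sign bit are equal to it.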
2056 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
2057 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
2058 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
2059 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
2060 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
2061 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
2062 //Count leading zeros
2063 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
2064 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
2065 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
2066 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
2067 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
2068 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
2069 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
2070 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
2071 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
2072 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
2073 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
2074 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
2075 //Count number of set bits
2076 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
2077 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
2078 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
2079 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
2080 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
2081 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
2082 //Reciprocal estimate
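//Note: VRECPE only produces a rough approximation of 1/x; it is normally refined with VRECPS (Newton-Raphson) iterations when more precision is needed.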
2083 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
2084 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
2085 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
2086 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
2087 //Reciprocal square root estimate
2088 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
2089 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
2090 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
2091 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
2092 //Logical operations
2093 //Bitwise not
2094 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
2095 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
2096 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
2097 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
2098 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
2099 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
2100 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
2101 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
2102 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
2103 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
2104 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
2105 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
2106 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
2107 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
2108 //Bitwise and
2109 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
2110 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
2111 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
2112 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
2113 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
2114 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
2115 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
2116 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
2117 _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
2118 _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
2119 _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
2120 _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
2121 _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
2122 _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
2123 _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
2124 _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
2125 //Bitwise or
2126 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
2127 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
2128 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
2129 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
2130 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
2131 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
2132 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
2133 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
2134 _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
2135 _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
2136 _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
2137 _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
2138 _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
2139 _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
2140 _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
2141 _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
2142 //Bitwise exclusive or (EOR or XOR)
2143 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
2144 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
2145 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
2146 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
2147 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
2148 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
2149 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
2150 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
2151 _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
2152 _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
2153 _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
2154 _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
2155 _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
2156 _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
2157 _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
2158 _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
2159 //Bit Clear
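//VBIC computes a & (~b): every bit that is set in b is cleared in a.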
2160 _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
2161 _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
2162 _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
2163 _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
2164 _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
2165 _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
2166 _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
2167 _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
2168 _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
2169 _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
2170 _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
2171 _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
2172 _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
2173 _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
2174 _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
2175 _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
2176 //Bitwise OR complement
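//VORN computes a | (~b).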
2177 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
2178 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
2179 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
2180 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
2181 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
2182 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
2183 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
2184 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
2185 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
2186 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
2187 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
2188 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
2189 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
2190 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
2191 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
2192 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
2193 //Bitwise Select
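//VBSL computes (a & b) | (~a & c): bits of b are selected where the mask a has ones, bits of c where it has zeros.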
2194 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
2195 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
2196 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
2197 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
2198 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
2199 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
2200 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
2201 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
2202 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
2203 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
2204 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
2205 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
2206 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
2207 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
2208 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
2209 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
2210 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
2211 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
2212 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
2213 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
2214 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
2215 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
2216 //Transposition operations
2217 //Transpose elements
2218 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
2219 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
2220 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
2221 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
2222 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
2223 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
2224 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
2225 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
2226 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
2227 _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
2228 _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
2229 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
2230 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
2231 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
2232 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
2233 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
2234 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
2235 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
2236 //Interleave elements
2237 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
2238 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
2239 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
2240 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
2241 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
2242 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
2243 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
2244 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
2245 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
2246 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
2247 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
2248 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
2249 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
2250 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
2251 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
2252 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
2253 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
2254 _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
2255 //De-Interleave elements
2256 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
2257 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
2258 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
2259 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
2260 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
2261 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
2262 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
2263 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
2264 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
2265 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
2266 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
2267 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
2268 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
2269 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
2270 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
2271 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
2272 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
2273 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
2274 
2275 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
2276 
2277 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
2278 
2279 //Sqrt
2280 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
2281 
2282 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
2283 
2284 
2285 
2286 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2287 // The following macros solve the problem of the "immediate parameter" requirement of some x86 intrinsics.
2288 // Without them the code would not compile, failing with an "Intrinsic parameter must be an immediate value" error.
2289 //
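// Illustrative sketch (not part of the original header), assuming a run-time lane index:
//     int lane = ...;                          /* lane is NOT a compile-time constant */
//     x = _mm_extract_epi16(vec, lane);        /* may fail to compile: the intrinsic expects an immediate */
//     x = _MM_EXTRACT_EPI16(vec, lane);        /* OK here: the wrapper below expands to a switch over all lanes */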
2290 #if  ( defined (__INTEL_COMPILER)  || defined (__GNUC__) && !defined(__llvm__) )
2291 #   define _MM_ALIGNR_EPI8 _mm_alignr_epi8
2292 #   define _MM_EXTRACT_EPI16  (int16_t) _mm_extract_epi16
2293 #   define _MM_INSERT_EPI16 _mm_insert_epi16
2294 #   ifdef USE_SSE4
2295 #       define _MM_EXTRACT_EPI8  _mm_extract_epi8
2296 #       define _MM_EXTRACT_EPI32  _mm_extract_epi32
2297 #       define _MM_EXTRACT_PS  _mm_extract_ps
2298 #       define _MM_INSERT_EPI8  _mm_insert_epi8
2299 #       define _MM_INSERT_EPI32 _mm_insert_epi32
2300 #       define _MM_INSERT_PS    _mm_insert_ps
2301 #       ifdef  _NEON2SSE_64BIT
2302 #           define _MM_INSERT_EPI64 _mm_insert_epi64
2303 #           define _MM_EXTRACT_EPI64 _mm_extract_epi64
2304 #       endif
2305 #   endif //SSE4
2306 #else
2307 #   define _NEON2SSE_COMMA ,
2308 #   define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
2309         switch(LANE)         \
2310         {                \
2311         case 0:     return NAME(a b, 0); \
2312         case 1:     return NAME(a b, 1); \
2313         case 2:     return NAME(a b, 2); \
2314         case 3:     return NAME(a b, 3); \
2315         case 4:     return NAME(a b, 4); \
2316         case 5:     return NAME(a b, 5); \
2317         case 6:     return NAME(a b, 6); \
2318         case 7:     return NAME(a b, 7); \
2319         case 8:     return NAME(a b, 8); \
2320         case 9:     return NAME(a b, 9); \
2321         case 10:    return NAME(a b, 10); \
2322         case 11:    return NAME(a b, 11); \
2323         case 12:    return NAME(a b, 12); \
2324         case 13:    return NAME(a b, 13); \
2325         case 14:    return NAME(a b, 14); \
2326         case 15:    return NAME(a b, 15); \
2327         default:    return NAME(a b, 0); \
2328         }
2329 
2330 #   define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
2331         switch(LANE)              \
2332         {                          \
2333         case 0:  return NAME(vec p,0); \
2334         case 1:  return NAME(vec p,1); \
2335         case 2:  return NAME(vec p,2); \
2336         case 3:  return NAME(vec p,3); \
2337         case 4:  return NAME(vec p,4); \
2338         case 5:  return NAME(vec p,5); \
2339         case 6:  return NAME(vec p,6); \
2340         case 7:  return NAME(vec p,7); \
2341         default: return NAME(vec p,0); \
2342         }
2343 
2344 #   define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
2345         switch(LANE)              \
2346         {                          \
2347         case case0:  return NAME(vec p,case0); \
2348         case case1:  return NAME(vec p,case1); \
2349         case case2:  return NAME(vec p,case2); \
2350         case case3:  return NAME(vec p,case3); \
2351         default:     return NAME(vec p,case0); \
2352         }
2353 
2354     _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
2355     {
2356         _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
2357     }
2358 
2359     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
2360     {
2361         _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
2362     }
2363 
2364     _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
2365     {
2366         _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
2367     }
2368 
2369 #ifdef USE_SSE4
2370         _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2371         {
2372             _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
2373         }
2374 
2375         _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2376         {
2377             _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
2378         }
2379 
2380         _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2381         {
2382             _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
2383         }
2384 
2385         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2386         {
2387             _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
2388         }
2389 
2390         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2391         {
2392             _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
2393         }
2394 
2395 #ifdef  _NEON2SSE_64BIT
2396             //the special case of functions available only for SSE4 and 64-bit build.
2397             _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
2398             {
2399                 switch(LANE) {
2400                 case 0:
2401                     return _mm_insert_epi64(vec,  p, 0);
2402                 case 1:
2403                     return _mm_insert_epi64(vec,  p, 1);
2404                 default:
2405                     return _mm_insert_epi64(vec,  p, 0);
2406                 }
2407             }
2408 
2409             _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
2410             {
2411                 if (LANE ==0) return _mm_extract_epi64(val, 0);
2412                 else return _mm_extract_epi64(val, 1);
2413             }
2414 #endif
2415 
2416         _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2417         {
2418             _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
2419         }
2420 
2421 #endif //USE_SSE4
2422 
2423 #endif     //defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__)
2424 
2425 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2426 // Below are some helper functions used either to "emulate" SSE4 intrinsics on devices limited to SSSE3
2427 // or to implement some commonly used operations that have no direct SSE equivalent
2428 #ifdef USE_SSE4
2429 #   define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
2430 #   define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
2431 #   define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
2432 
2433 #   define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
2434 #   define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
2435 #   define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
2436 
2437 #   define _MM_MAX_EPI8  _mm_max_epi8
2438 #   define _MM_MAX_EPI32 _mm_max_epi32
2439 #   define _MM_MAX_EPU16 _mm_max_epu16
2440 #   define _MM_MAX_EPU32 _mm_max_epu32
2441 
2442 #   define _MM_MIN_EPI8  _mm_min_epi8
2443 #   define _MM_MIN_EPI32 _mm_min_epi32
2444 #   define _MM_MIN_EPU16 _mm_min_epu16
2445 #   define _MM_MIN_EPU32 _mm_min_epu32
2446 
2447 #   define _MM_BLENDV_EPI8 _mm_blendv_epi8
2448 #   define _MM_PACKUS_EPI32 _mm_packus_epi32
2449 #   define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
2450 
2451 #   define _MM_MULLO_EPI32 _mm_mullo_epi32
2452 #   define _MM_MUL_EPI32  _mm_mul_epi32
2453 
2454 #   define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
2455 #else     //no SSE4 !!!!!!
2456     _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
2457     {
2458         __m128i zero = _mm_setzero_si128();
2459         return _mm_unpacklo_epi8(a, zero);
2460     }
2461 
2462     _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
2463     {
2464         __m128i zero = _mm_setzero_si128();
2465         return _mm_unpacklo_epi16(a, zero);
2466     }
2467 
2468     _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
2469     {
2470         __m128i zero = _mm_setzero_si128();
2471         return _mm_unpacklo_epi32(a, zero);
2472     }
2473 
2474     _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
2475     {
2476         __m128i zero = _mm_setzero_si128();
2477         __m128i sign = _mm_cmpgt_epi8(zero, a);
2478         return _mm_unpacklo_epi8(a, sign);
2479     }
2480 
2481     _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
2482     {
2483         __m128i zero = _mm_setzero_si128();
2484         __m128i sign = _mm_cmpgt_epi16(zero, a);
2485         return _mm_unpacklo_epi16(a, sign);
2486     }
2487 
2488     _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
2489     {
2490         __m128i zero = _mm_setzero_si128();
2491         __m128i sign = _mm_cmpgt_epi32(zero, a);
2492         return _mm_unpacklo_epi32(a, sign);
2493     }
2494 
2495     _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2496     {
2497         _NEON2SSE_ALIGN_16 int32_t tmp[4];
2498         _mm_store_si128((__m128i*)tmp, vec);
2499         return tmp[LANE];
2500     }
2501 
2502     _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2503     {
2504         _NEON2SSE_ALIGN_16 int8_t tmp[16];
2505         _mm_store_si128((__m128i*)tmp, vec);
2506         return (int)tmp[LANE];
2507     }
2508 
2509     _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2510     {
2511         _NEON2SSE_ALIGN_16 int32_t tmp[4];
2512         _mm_store_si128((__m128i*)tmp, _M128i(vec));
2513         return tmp[LANE];
2514     }
2515 
2516     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2517     {
2518         _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
2519         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2520         __m128i vec_masked, p_masked;
2521         pvec[LANE] = p;
2522         mask[LANE] = 0x0;
2523         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2524         p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2525         return _mm_or_si128(vec_masked, p_masked);
2526     }
2527 
2528     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2529     {
2530         _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
2531         _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
2532         __m128i vec_masked, p_masked;
2533         pvec[LANE] = (int8_t)p;
2534         mask[LANE] = 0x0;
2535         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2536         p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2537         return _mm_or_si128(vec_masked, p_masked);
2538     }
2539 
2540     _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2541     {
2542         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2543         __m128 tmp, vec_masked, p_masked;
2544         mask[LANE >> 4] = 0x0; //here LANE is the _mm_insert_ps immediate (lane index << 4), not the lane itself, hence the shift
2545         vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
2546         p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
2547         tmp = _mm_or_ps(vec_masked, p_masked);
2548         return tmp;
2549     }
2550 
2551     _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
2552     {
2553         __m128i cmp, resa, resb;
2554         cmp = _mm_cmpgt_epi8 (a, b);
2555         resa = _mm_and_si128 (cmp, a);
2556         resb = _mm_andnot_si128 (cmp,b);
2557         return _mm_or_si128(resa, resb);
2558     }
2559 
2560     _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
2561     {
2562         __m128i cmp, resa, resb;
2563         cmp = _mm_cmpgt_epi32(a, b);
2564         resa = _mm_and_si128 (cmp, a);
2565         resb = _mm_andnot_si128 (cmp,b);
2566         return _mm_or_si128(resa, resb);
2567     }
2568 
2569     _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
2570     {
2571         __m128i c8000, b_s, a_s, cmp;
2572         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2573         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2574         b_s = _mm_sub_epi16 (b, c8000);
2575         a_s = _mm_sub_epi16 (a, c8000);
2576         cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
2577         a_s = _mm_and_si128 (cmp,a);
2578         b_s = _mm_andnot_si128 (cmp,b);
2579         return _mm_or_si128(a_s, b_s);
2580     }
2581 
2582     _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
2583     {
2584         __m128i c80000000, b_s, a_s, cmp;
2585         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2586         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2587         b_s = _mm_sub_epi32 (b, c80000000);
2588         a_s = _mm_sub_epi32 (a, c80000000);
2589         cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
2590         a_s = _mm_and_si128 (cmp,a);
2591         b_s = _mm_andnot_si128 (cmp,b);
2592         return _mm_or_si128(a_s, b_s);
2593     }
2594 
2595     _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
2596     {
2597         __m128i cmp, resa, resb;
2598         cmp = _mm_cmpgt_epi8 (b, a);
2599         resa = _mm_and_si128 (cmp, a);
2600         resb = _mm_andnot_si128 (cmp,b);
2601         return _mm_or_si128(resa, resb);
2602     }
2603 
2604     _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
2605     {
2606         __m128i cmp, resa, resb;
2607         cmp = _mm_cmpgt_epi32(b, a);
2608         resa = _mm_and_si128 (cmp, a);
2609         resb = _mm_andnot_si128 (cmp,b);
2610         return _mm_or_si128(resa, resb);
2611     }
2612 
2613     _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
2614     {
2615         __m128i c8000, b_s, a_s, cmp;
2616         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2617         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2618         b_s = _mm_sub_epi16 (b, c8000);
2619         a_s = _mm_sub_epi16 (a, c8000);
2620         cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
2621         a_s = _mm_and_si128 (cmp,a);
2622         b_s = _mm_andnot_si128 (cmp,b);
2623         return _mm_or_si128(a_s, b_s);
2624     }
2625 
2626     _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
2627     {
2628         __m128i c80000000, b_s, a_s, cmp;
2629         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2630         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2631         b_s = _mm_sub_epi32 (b, c80000000);
2632         a_s = _mm_sub_epi32 (a, c80000000);
2633         cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
2634         a_s = _mm_and_si128 (cmp,a);
2635         b_s = _mm_andnot_si128 (cmp,b);
2636         return _mm_or_si128(a_s, b_s);
2637     }
2638 
2639     _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 !!!!! - please see the note below
2640     {
2641         //it assumes every mask byte is either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
2642         __m128i a_masked, b_masked;
2643         b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
2644         a_masked = _mm_andnot_si128 (mask,a);
2645         return _mm_or_si128(a_masked, b_masked);
2646     }
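    //Usage note (illustrative, assumed typical call pattern): the mask is normally produced by a compare,
    //e.g. mask = _mm_cmpgt_epi8(x, y), so every byte is either 0x00 or 0xff and the and/andnot/or
    //emulation above then matches _mm_blendv_epi8 exactly for such masks.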
2647 
2648     _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
2649     {
2650         __m128i a16, b16, res, reshi,cmp, zero;
2651         zero = _mm_setzero_si128();
2652         a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
2653         b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
2654         res = _mm_unpacklo_epi64(a16, b16); //result without saturation
2655         reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
2656         cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2657         res = _mm_andnot_si128(cmp,res); //if cmp is zero do nothing, otherwise cmp < 0 and the result is forced to 0
2658         cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp is positive
2659         return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2660     }
2661 
2662     _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
2663     {
2664         __m128i a16, res, reshi,cmp, zero;
2665         zero = _mm_setzero_si128();
2666         a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
2667         reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
2668         cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2669         res = _mm_andnot_si128(cmp, a16); //if cmp is zero do nothing, otherwise cmp < 0 and the result is forced to 0
2670         cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp is positive
2671         return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2672     }
2673 
2674 
2675     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
2676     {
2677         _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
2678         int64_t res64;
2679         int i;
2680         _mm_store_si128((__m128i*)atmp, a);
2681         _mm_store_si128((__m128i*)btmp, b);
2682         for (i = 0; i<4; i++) {
2683             res64 = (int64_t)atmp[i] * btmp[i]; //widen before multiplying to avoid signed 32-bit overflow
2684             res[i] = (int)(res64 & 0xffffffff);
2685         }
2686         return _mm_load_si128((__m128i*)res);
2687     }
2688 
2689     _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
2690     {
2691         __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
2692         sign = _mm_xor_si128 (a, b);
2693         sign =  _mm_srai_epi32 (sign, 31); //spread the sign bit across the lane: all ones if negative, all zeros if positive
2694         sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //copy each product's sign into both 32-bit halves of its 64-bit result lane
2695         zero = _mm_setzero_si128();
2696         a_neg = _mm_abs_epi32 (a); //absolute value of a
2697         b_neg = _mm_abs_epi32 (b); //absolute value of b
2698         mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
2699         mul_us_neg = _mm_sub_epi64(zero, mul_us);
2700         mul_us_neg = _mm_and_si128(sign, mul_us_neg);
2701         mul_us = _mm_andnot_si128(sign, mul_us);
2702         return _mm_or_si128 (mul_us, mul_us_neg);
2703     }
2704 
2705     _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
2706     {
2707         __m128i res;
2708         res = _mm_cmpeq_epi32 (a, b);
2709         return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
2710     }
2711 #endif     //SSE4
2712 
2713 //the special case of functions used when the SSE4 64-bit intrinsics are not available (32-bit builds or no SSE4)
2714 _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
2715 {
2716     _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
2717     _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
2718     __m128i vec_masked, p_masked;
2719     pvec[LANE] = p;
2720     mask[LANE] = 0x0;
2721     vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2722     p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2723     return _mm_or_si128(vec_masked, p_masked);
2724 }
2725 
2726 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
2727 {
2728     _NEON2SSE_ALIGN_16 int64_t tmp[2];
2729     _mm_store_si128((__m128i*)tmp, val);
2730     return tmp[LANE];
2731 }
2732 
2733 #ifndef _NEON2SSE_64BIT_SSE4
2734 #   define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
2735 #   define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
2736 #endif
2737 
2738 _NEON2SSESTORAGE int32x4_t  vqd_s32(int32x4_t a); //Saturating doubling (2*a) for signed ints
2739 _NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
2740 {
2741     //Overflow happens only if a and the doubled result a*2 have opposite signs
2742     __m128i c7fffffff, res, res_sat, res_xor_a;
2743     c7fffffff = _mm_set1_epi32(0x7fffffff);
2744     res = _mm_slli_epi32 (a, 1); // res = a*2
2745     res_sat = _mm_srli_epi32(a, 31);
2746     res_sat = _mm_add_epi32(res_sat, c7fffffff);
2747     res_xor_a = _mm_xor_si128(res, a);
2748     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if a*2 and a have different signs (overflow), all zeros otherwise
2749     res_sat = _mm_and_si128(res_xor_a, res_sat);
2750     res = _mm_andnot_si128(res_xor_a, res);
2751     return _mm_or_si128(res, res_sat);
2752 }
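//Worked example (illustrative): for a = 0x60000000 the exact double 0xC0000000 overflows int32,
//so vqd_s32 returns the saturated value 0x7fffffff; for a = (int32_t)0xA0000000 it returns 0x80000000.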
2753 
2754 
2755 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2756 //*************************************************************************
2757 //*************************************************************************
2758 //*****************  Functions redefinition/implementation starts here ****
2759 //*************************************************************************
2760 //*************************************************************************
2761 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2762 
2763 /*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here, as in the following sample:
2764 #ifdef ARM
2765 #define vector_addq_s32 vaddq_s32
2766 #else //if we have IA
2767 #define vector_addq_s32 _mm_add_epi32
2768 #endif
2769 
2770 ********************************************************************************************
2771 The functions below are organised in the following way:
2772 
2773 Each NEON intrinsic function is covered by one of the following options:
2774 1.  its full x86 SSE intrinsic equivalent - in this case the x86 version simply follows the NEON one under the corresponding #define statement
2775 2.  an x86 implementation using more than one x86 intrinsic - in this case it is shaped as an inlined C function with a return statement
2776 3.  a reference to another NEON function that returns the same result and is implemented in x86 as above - in this case it is shaped as a matching NEON function definition
2777 4.  for about 5% of the functions, where the corresponding x86 SIMD support is unavailable or too slow,
2778 a serial implementation is provided along with a compiler warning. If such functions are on your application's critical path,
2779 please consider removing them from your code.
2780 */
2781 
2782 //***********************************************************************
2783 //************************      Vector add   *****************************
2784 //***********************************************************************
2785 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
2786 _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
2787 {
2788     int8x8_t res64;
2789     return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
2790 }
2791 
2792 
2793 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
2794 _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
2795 {
2796     int16x4_t res64;
2797     return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
2798 }
2799 
2800 
2801 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
2802 _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
2803 {
2804     int32x2_t res64;
2805     return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
2806 }
2807 
2808 
2809 _NEON2SSESTORAGE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
2810 _NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
2811 {
2812     int64x1_t res64;
2813     res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
2814     return res64;
2815 }
2816 
2817 
2818 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
2819 _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
2820 {
2821     __m128 res;
2822     __m64_128 res64;
2823     res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
2824     _M64f(res64, res);
2825     return res64;
2826 }
2827 
2828 _NEON2SSESTORAGE uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
2829 #define vadd_u8 vadd_s8
2830 
2831 _NEON2SSESTORAGE uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
2832 #define vadd_u16 vadd_s16
2833 
2834 _NEON2SSESTORAGE uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
2835 #define vadd_u32 vadd_s32
2836 
2837 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
2838 _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
2839 {
2840     uint64x1_t res64;
2841     res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
2842     return res64;
2843 }
2844 
2845 
2846 _NEON2SSESTORAGE int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
2847 #define vaddq_s8 _mm_add_epi8
2848 
2849 _NEON2SSESTORAGE int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
2850 #define vaddq_s16 _mm_add_epi16
2851 
2852 _NEON2SSESTORAGE int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
2853 #define vaddq_s32 _mm_add_epi32
2854 
2855 _NEON2SSESTORAGE int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
2856 #define vaddq_s64 _mm_add_epi64
2857 
2858 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
2859 #define vaddq_f32 _mm_add_ps
2860 
2861 _NEON2SSESTORAGE uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
2862 #define vaddq_u8 _mm_add_epi8
2863 
2864 _NEON2SSESTORAGE uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
2865 #define vaddq_u16 _mm_add_epi16
2866 
2867 _NEON2SSESTORAGE uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
2868 #define vaddq_u32 _mm_add_epi32
2869 
2870 _NEON2SSESTORAGE uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
2871 #define vaddq_u64 _mm_add_epi64
2872 
2873 //**************************** Vector long add *****************************:
2874 //***********************************************************************
2875 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
2876 _NEON2SSESTORAGE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
2877 _NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
2878 {
2879     __m128i a16, b16;
2880     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
2881     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2882     return _mm_add_epi16 (a16, b16);
2883 }
2884 
2885 _NEON2SSESTORAGE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
2886 _NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
2887 {
2888     __m128i a32, b32;
2889     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
2890     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
2891     return _mm_add_epi32 (a32, b32);
2892 }
2893 
2894 _NEON2SSESTORAGE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
2895 _NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
2896 {
2897     //may not be optimal
2898     __m128i a64, b64;
2899     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
2900     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2901     return _mm_add_epi64 ( a64, b64);
2902 }
2903 
2904 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
2905 _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
2906 {
2907     __m128i a16, b16;
2908     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
2909     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2910     return _mm_add_epi16 (a16, b16);
2911 }
2912 
2913 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
2914 _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
2915 {
2916     __m128i a32, b32;
2917     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
2918     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2919     return _mm_add_epi32 (a32, b32);
2920 }
2921 
2922 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
2923 _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
2924 {
2925     //may not be optimal
2926     __m128i a64, b64;
2927     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
2928     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2929     return _mm_add_epi64 (a64, b64);
2930 }
2931 
2932 //***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
2933 //*************** *********************************************************************
2934 _NEON2SSESTORAGE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
2935 _NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
2936 {
2937     __m128i b16;
2938     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2939     return _mm_add_epi16 (a, b16);
2940 }
2941 
2942 _NEON2SSESTORAGE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
2943 _NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
2944 {
2945     __m128i b32;
2946     b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
2947     return _mm_add_epi32 (a, b32);
2948 }
2949 
2950 _NEON2SSESTORAGE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
2951 _NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
2952 {
2953     __m128i b64;
2954     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2955     return _mm_add_epi64 (a, b64);
2956 }
2957 
2958 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
2959 _NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
2960 {
2961     __m128i b16;
2962     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2963     return _mm_add_epi16 (a, b16);
2964 }
2965 
2966 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
2967 _NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
2968 {
2969     __m128i b32;
2970     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2971     return _mm_add_epi32 (a, b32);
2972 }
2973 
2974 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
2975 _NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
2976 {
2977     __m128i b64;
2978     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2979     return _mm_add_epi64 (a, b64);
2980 }
2981 
2982 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
2983 //*************************************************************************************************************************
2984 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
2985 _NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
2986 {
2987     int8x8_t res64;
2988     return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
2989 }
2990 
2991 
2992 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
2993 _NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
2994 {
2995     int16x4_t res64;
2996     return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
2997 }
2998 
2999 
3000 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
3001 _NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
3002 {
3003     int32x2_t res64;
3004     return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
3005 }
3006 
3007 
3008 _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.U8 d0,d0,d0
3009 _NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
3010 {
3011     uint8x8_t res64;
3012     return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
3013 }
3014 
3015 
3016 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.U16 d0,d0,d0
3017 _NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
3018 {
3019     uint16x4_t res64;
3020     return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
3021 }
3022 
3023 
3024 _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
3025 _NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
3026 {
3027     uint32x2_t res64;
3028     return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
3029 }
3030 
3031 
3032 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
3033 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
3034 {
3035     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
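    // identity: a + b == 2*(a & b) + (a ^ b), so the truncated halving sum equals (a & b) + ((a ^ b) >> 1) with no intermediate overflow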
3036     __m128i tmp1, tmp2;
3037     tmp1 = _mm_and_si128(a,b);
3038     tmp2 = _mm_xor_si128(a,b);
3039     tmp2 = vshrq_n_s8(tmp2,1);
3040     return _mm_add_epi8(tmp1,tmp2);
3041 }
3042 
3043 _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
3044 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
3045 {
3046     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3047     __m128i tmp1, tmp2;
3048     tmp1 = _mm_and_si128(a,b);
3049     tmp2 = _mm_xor_si128(a,b);
3050     tmp2 = _mm_srai_epi16(tmp2,1);
3051     return _mm_add_epi16(tmp1,tmp2);
3052 }
3053 
3054 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
3055 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
3056 {
3057     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3058     __m128i tmp1, tmp2;
3059     tmp1 = _mm_and_si128(a,b);
3060     tmp2 = _mm_xor_si128(a,b);
3061     tmp2 = _mm_srai_epi32(tmp2,1);
3062     return _mm_add_epi32(tmp1,tmp2);
3063 }
3064 
3065 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
3066 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
3067 {
3068     __m128i c1, sum, res;
3069     c1 = _mm_set1_epi8(1);
3070     sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
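    // _mm_avg_epu8 returns (a + b + 1) >> 1; it exceeds the truncated halving sum by 1 exactly in the lanes where a and b have different low bits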
3071     res = _mm_xor_si128(a, b); //for rounding compensation
3072     res = _mm_and_si128(res,c1); //for rounding compensation
3073     return _mm_sub_epi8 (sum, res); //actual rounding compensation
3074 }
3075 
3076 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
3077 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
3078 {
3079     __m128i sum, res;
3080     sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
3081     res = _mm_xor_si128(a, b); //for rounding compensation
3082     res = _mm_slli_epi16 (res,15); //shift left  then back right to
3083     res = _mm_srli_epi16 (res,15); //get 1 or zero
3084     return _mm_sub_epi16 (sum, res); //actual rounding compensation
3085 }
3086 
3087 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
3088 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
3089 {
3090     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3091     __m128i tmp1, tmp2;
3092     tmp1 = _mm_and_si128(a,b);
3093     tmp2 = _mm_xor_si128(a,b);
3094     tmp2 = _mm_srli_epi32(tmp2,1);
3095     return _mm_add_epi32(tmp1,tmp2);
3096 }
3097 
3098 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
3099 //*****************************************************************************************************************************
3100 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
3101 _NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
3102 {
3103     int8x8_t res64;
3104     return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
3105 }
3106 
3107 
3108 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
3109 _NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
3110 {
3111     int16x4_t res64;
3112     return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
3113 }
3114 
3115 
3116 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
3117 _NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
3118 {
3119     int32x2_t res64;
3120     return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
3121 }
3122 
3123 
3124 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
3125 _NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
3126 {
3127     uint8x8_t res64;
3128     return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3129 }
3130 
3131 
3132 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
3133 _NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
3134 {
3135     uint16x4_t res64;
3136     return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3137 }
3138 
3139 
3140 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
3141 _NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
3142 {
3143     uint32x2_t res64;
3144     return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
3145 }
3146 
3147 
3148 _NEON2SSESTORAGE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
3149 _NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
3150 {
3151     //no signed average in x86 SIMD, go to unsigned
3152     __m128i c128, au, bu, sum;
3153     c128 = _mm_set1_epi8((int8_t)0x80); //-128
3154     au = _mm_sub_epi8(a, c128); //add 128
3155     bu = _mm_sub_epi8(b, c128); //add 128
3156     sum = _mm_avg_epu8(au, bu);
3157     return _mm_add_epi8 (sum, c128); //sub 128
3158 }
3159 
3160 _NEON2SSESTORAGE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
3161 _NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
3162 {
3163     //no signed average in x86 SIMD, go to unsigned
3164     __m128i cx8000, au, bu, sum;
3165     cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768
3166     au = _mm_sub_epi16(a, cx8000); //add 32768
3167     bu = _mm_sub_epi16(b, cx8000); //add 32768
3168     sum = _mm_avg_epu16(au, bu);
3169     return _mm_add_epi16 (sum, cx8000); //sub 32768
3170 }
3171 
3172 _NEON2SSESTORAGE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
3173 _NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
3174 {
3175     //need to avoid overflow
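    // identity: (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1), which avoids any intermediate overflow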
3176     __m128i a2, b2, res, sum;
3177     a2 = _mm_srai_epi32(a,1); //a2=a/2;
3178     b2 = _mm_srai_epi32(b,1); // b2=b/2;
3179     res = _mm_or_si128(a,b); //for rounding
3180     res = _mm_slli_epi32 (res,31); //shift left  then back right to
3181     res = _mm_srli_epi32 (res,31); //get 1 or zero
3182     sum = _mm_add_epi32(a2,b2);
3183     return _mm_add_epi32(sum,res);
3184 }
3185 
3186 _NEON2SSESTORAGE uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
3187 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
3188 
3189 _NEON2SSESTORAGE uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
3190 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
3191 
3192 
3193 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
3194 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
3195 {
3196     //need to avoid overflow
3197     __m128i a2, b2, res, sum;
3198     a2 = _mm_srli_epi32(a,1); //a2=a/2;
3199     b2 = _mm_srli_epi32(b,1); // b2=b/2;
3200     res = _mm_or_si128(a,b); //for rounding
3201     res = _mm_slli_epi32 (res,31); //shift left  then back right to
3202     res = _mm_srli_epi32 (res,31); //get 1 or zero
3203     sum = _mm_add_epi32(a2,b2);
3204     return _mm_add_epi32(sum,res);
3205 }
3206 
3207 //****************** VQADD: Vector saturating add ************************
3208 //************************************************************************
3209 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
3210 _NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
3211 {
3212     int8x8_t res64;
3213     return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
3214 }
3215 
3216 
3217 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
3218 _NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
3219 {
3220     int16x4_t res64;
3221     return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
3222 }
3223 
3224 
3225 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
3226 _NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
3227 {
3228     int32x2_t res64;
3229     return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
3230 }
3231 
3232 
3233 _NEON2SSESTORAGE int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
3234 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3235 {
3236     int64x1_t res;
3237     uint64_t a64, b64;
3238     a64 = a.m64_u64[0];
3239     b64 = b.m64_u64[0];
3240     res.m64_u64[0] = a64 + b64;
3241     a64 = (a64 >> 63) + (~_SIGNBIT64);
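    // a64 now holds the saturation value: INT64_MAX if a was non-negative, INT64_MIN if a was negative; the test below substitutes it when signed overflow is detected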
3242     if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
3243         res.m64_u64[0] = a64;
3244     }
3245     return res;
3246 }
3247 
3248 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
3249 _NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
3250 {
3251     uint8x8_t res64;
3252     return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
3253 }
3254 
3255 
3256 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
3257 _NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
3258 {
3259     uint16x4_t res64;
3260     return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
3261 }
3262 
3263 
3264 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
3265 _NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
3266 {
3267     uint32x2_t res64;
3268     return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
3269 }
3270 
3271 
3272 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
3273 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3274 {
3275     _NEON2SSE_ALIGN_16 uint64_t a64, b64;
3276     uint64x1_t res;
3277     a64 = a.m64_u64[0];
3278     b64 = b.m64_u64[0];
3279     res.m64_u64[0] = a64 + b64;
3280     if (res.m64_u64[0] < a64) {
3281         res.m64_u64[0] = ~(uint64_t)0;
3282     }
3283     return res;
3284 }
3285 
3286 _NEON2SSESTORAGE int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
3287 #define vqaddq_s8 _mm_adds_epi8
3288 
3289 _NEON2SSESTORAGE int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
3290 #define vqaddq_s16 _mm_adds_epi16
3291 
3292 _NEON2SSESTORAGE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
3293 _NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
3294 {
3295     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
3296     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
3297     c7fffffff = _mm_set1_epi32(0x7fffffff);
3298     res = _mm_add_epi32(a, b);
3299     res_sat = _mm_srli_epi32(a, 31);
3300     res_sat = _mm_add_epi32(res_sat, c7fffffff);
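    // res_sat now holds the per-lane saturation value: 0x7fffffff where a >= 0 and 0x80000000 where a < 0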
3301     res_xor_a = _mm_xor_si128(res, a);
3302     b_xor_a_ = _mm_xor_si128(b, a);
3303     res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
3304     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative (overflowed lane), all zeros otherwise
3305     res_sat = _mm_and_si128(res_xor_a, res_sat);
3306     res = _mm_andnot_si128(res_xor_a, res);
3307     return _mm_or_si128(res, res_sat);
3308 }
3309 
3310 _NEON2SSESTORAGE int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
3311 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3312 {
3313     _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3314     _mm_store_si128((__m128i*)atmp, a);
3315     _mm_store_si128((__m128i*)btmp, b);
3316     res[0] = atmp[0] + btmp[0];
3317     res[1] = atmp[1] + btmp[1];
3318 
3319     atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
3320     atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
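    // atmp[] now holds the per-lane saturation value (INT64_MAX for an originally non-negative a lane, INT64_MIN otherwise), the same trick as in vqadd_s64 above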
3321 
3322     if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
3323         res[0] = atmp[0];
3324     }
3325     if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
3326         res[1] = atmp[1];
3327     }
3328     return _mm_load_si128((__m128i*)res);
3329 }
3330 
3331 _NEON2SSESTORAGE uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
3332 #define vqaddq_u8 _mm_adds_epu8
3333 
3334 _NEON2SSESTORAGE uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
3335 #define vqaddq_u16 _mm_adds_epu16
3336 
3337 _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
3338 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
3339 {
3340     __m128i c80000000, cmp, subsum, suba, sum;
3341     c80000000 = _mm_set1_epi32 (0x80000000);
3342     sum = _mm_add_epi32 (a, b);
3343     subsum = _mm_sub_epi32 (sum, c80000000);
3344     suba = _mm_sub_epi32 (a, c80000000);
3345     cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
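    // cmp is all ones in the lanes where the addition wrapped around (sum < a in unsigned terms); OR-ing it with the sum forces those lanes to 0xffffffff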
3346     return _mm_or_si128 (sum, cmp); //saturation
3347 }
3348 
3349 _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
3350 #ifdef USE_SSE4
3351     _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
3352     {
3353         __m128i c80000000, sum, cmp, suba, subsum;
3354         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
3355         sum = _mm_add_epi64 (a, b);
3356         subsum = _mm_sub_epi64 (sum, c80000000);
3357         suba = _mm_sub_epi64 (a, c80000000);
3358         cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
3359         return _mm_or_si128 (sum, cmp); //saturation
3360     }
3361 #else
3362     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3363     {
3364         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3365         _mm_store_si128((__m128i*)atmp, a);
3366         _mm_store_si128((__m128i*)btmp, b);
3367         res[0] = atmp[0] + btmp[0];
3368         res[1] = atmp[1] + btmp[1];
3369         if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
3370         if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
3371         return _mm_load_si128((__m128i*)(res));
3372     }
3373 #endif
3374 
3375 
3376 //******************* Vector add high half (truncated)  ******************
3377 //************************************************************************
3378 _NEON2SSESTORAGE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
3379 _NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
3380 {
3381     int8x8_t res64;
3382     __m128i sum;
3383     sum = _mm_add_epi16 (a, b);
3384     sum = _mm_srai_epi16 (sum, 8);
3385     sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
3386     return64(sum);
3387 }
3388 
3389 _NEON2SSESTORAGE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
3390 _NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
3391 {
3392     int16x4_t res64;
3393     __m128i sum;
3394     sum = _mm_add_epi32 (a, b);
3395     sum = _mm_srai_epi32(sum, 16);
3396     sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
3397     return64(sum);
3398 }
3399 
3400 _NEON2SSESTORAGE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
3401 _NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
3402 {
3403     int32x2_t res64;
3404     __m128i sum;
3405     sum = _mm_add_epi64 (a, b);
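    // the shuffle below moves dwords 1 and 3 (the high 32-bit halves of the two 64-bit sums) into the low 64 bits of the register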
3406     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
3407     return64(sum);
3408 }
3409 
3410 _NEON2SSESTORAGE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
3411 _NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
3412 {
3413     uint8x8_t res64;
3414     __m128i sum;
3415     sum = _mm_add_epi16 (a, b);
3416     sum = _mm_srli_epi16 (sum, 8);
3417     sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
3418     return64(sum);
3419 }
3420 
3421 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
3422 _NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
3423 {
3424     uint16x4_t res64;
3425      __m128i sum;
3426     sum = _mm_add_epi32 (a, b);
3427     sum = _mm_srli_epi32 (sum, 16);
3428 #ifdef USE_SSE4
3429     sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
3430 #else
3431     sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
3432 #endif
3433     return64(sum);
3434 }
3435 
3436 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
3437 #define vaddhn_u64 vaddhn_s64
3438 
3439 //*********** Vector rounding add high half: vraddhn_<type> ******************.
3440 //***************************************************************************
3441 _NEON2SSESTORAGE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
3442 _NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
3443 {
3444     int8x8_t res64;
3445     __m128i sum, mask1;
3446     sum = _mm_add_epi16 (a, b);
3447     mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3448     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
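    // rounded narrowing equals (sum + 0x80) >> 8, i.e. the truncated high half plus bit 7 of the sum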
3449     sum = _mm_srai_epi16 (sum, 8); //get high half
3450     sum = _mm_add_epi16 (sum, mask1); //actual rounding
3451     sum = _mm_packs_epi16 (sum, sum);
3452     return64(sum);
3453 }
3454 
3455 _NEON2SSESTORAGE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
3456 _NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
3457 {
3458     //SIMD may not be optimal here, a serial version may be faster
3459     int16x4_t res64;
3460     __m128i sum, mask1;
3461     sum = _mm_add_epi32 (a, b);
3462     mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3463     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
3464     sum = _mm_srai_epi32 (sum, 16); //get high half
3465     sum = _mm_add_epi32 (sum, mask1); //actual rounding
3466     sum = _mm_packs_epi32 (sum, sum);
3467     return64(sum);
3468 }
3469 
3470 _NEON2SSESTORAGE int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
3471 _NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
3472 {
3473     //SIMD may not be optimal here, a serial version may be faster
3474     int32x2_t res64;
3475     __m128i sum, mask1;
3476     sum = _mm_add_epi64 (a, b);
3477     mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
3478     mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
3479     sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
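    // the shuffle below gathers dwords 1 and 3 (the rounded high halves of the two 64-bit sums) into the low 64 bits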
3480     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
3481     return64(sum);
3482 }
3483 
3484 _NEON2SSESTORAGE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
3485 _NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
3486 {
3487     uint8x8_t res64;
3488     __m128i sum, mask1;
3489     sum = _mm_add_epi16 (a, b);
3490     mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3491     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
3492     sum = _mm_srai_epi16 (sum, 8); //get high half
3493     sum = _mm_add_epi16 (sum, mask1); //actual rounding
3494     sum = _mm_packus_epi16 (sum, sum);
3495     return64(sum);
3496 }
3497 
3498 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
3499 _NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
3500 {
3501     //SIMD may not be optimal here, a serial version may be faster
3502     uint16x4_t res64;
3503     __m128i sum, mask1;
3504     sum = _mm_add_epi32 (a, b);
3505     mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3506     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
3507     sum = _mm_srai_epi32 (sum, 16); //get high half
3508     sum = _mm_add_epi32 (sum, mask1); //actual rounding
3509     sum = _MM_PACKUS1_EPI32 (sum);
3510     return64(sum);
3511 }
3512 
3513 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
3514 #define vraddhn_u64 vraddhn_s64
3515 
3516 //**********************************************************************************
3517 //*********             Multiplication            *************************************
3518 //**************************************************************************************
3519 
3520 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
3521 //As we do not widen the result, these functions are equivalent to "multiply low" in x86
3522 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
3523 _NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
3524 {
3525     // no 8 bit simd multiply, need to go to 16 bits in SSE
3526     int8x8_t res64;
3527     __m128i a128, b128, res;
3528     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
3529     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3530     res = _mm_mullo_epi16 (a128, b128);
3531     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
3532     return64(res);
3533 }
3534 
3535 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
3536 #define vmul_s16 vmul_u16
3537 
3538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
3539 #define vmul_s32 vmul_u32
3540 
3541 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
3542 _NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
3543 {
3544     float32x4_t tmp;
3545     __m64_128 res64;
3546     tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
3547     _M64f(res64, tmp); //use low 64 bits
3548     return res64;
3549 }
3550 
3551 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
3552 _NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
3553 {
3554     // no 8 bit simd multiply, need to go to 16 bits in SSE
3555     uint8x8_t res64;
3556     __m128i mask, a128, b128, res;
3557     mask = _mm_set1_epi16(0xff);
3558     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
3559     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
3560     res = _mm_mullo_epi16 (a128, b128);
3561     res = _mm_and_si128(res, mask); //to avoid saturation
3562     res = _mm_packus_epi16 (res,res); //use only low 64 bits
3563     return64(res);
3564 }
3565 
3566 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
3567 _NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
3568 {
3569     uint16x4_t res64;
3570     return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
3571 }
3572 
3573 _NEON2SSESTORAGE uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
3574 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3575 {
3576     uint32x2_t res;
3577     res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
3578     res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
3579     return res;
3580 }
3581 
3582 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
3583 _NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
3584 {
3585     //may be optimized
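    // polynomial (carry-less) multiplication over GF(2): each set bit i of b contributes a << i as a partial product, and the partial products are combined with XOR instead of addition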
3586     poly8x8_t res64;
3587     __m128i a64, b64, c1, res, tmp, bmasked;
3588     int i;
3589     a64 = _pM128i(a);
3590     b64 = _pM128i(b);
3591     c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
3592     c1 = vshrq_n_u8(c1,7); //0x1
3593     bmasked = _mm_and_si128(b64, c1); //0x1
3594     res = vmulq_u8(a64, bmasked);
3595     for(i = 1; i<8; i++) {
3596         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3597         bmasked = _mm_and_si128(b64, c1); //0x1
3598         tmp = vmulq_u8(a64, bmasked);
3599         res = _mm_xor_si128(res, tmp);
3600     }
3601     return64 (res);
3602 }
3603 
3604 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
3605 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
3606 {
3607     // no 8 bit simd multiply, need to go to 16 bits
3608     //this solution may not be optimal
3609     __m128i a16, b16, r16_1, r16_2;
3610     a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
3611     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3612     r16_1 = _mm_mullo_epi16 (a16, b16);
3613     //swap hi and low part of a and b to process the remaining data
3614     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3615     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3616     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3617     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3618 
3619     r16_2 = _mm_mullo_epi16 (a16, b16);
3620     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3621     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3622 
3623     return _mm_unpacklo_epi64(r16_1,  r16_2);
3624 }
3625 
3626 _NEON2SSESTORAGE int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
3627 #define vmulq_s16 _mm_mullo_epi16
3628 
3629 _NEON2SSESTORAGE int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
3630 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
3631 
3632 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
3633 #define vmulq_f32 _mm_mul_ps
3634 
3635 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
3636 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
3637 {
3638     // no 8 bit simd multiply, need to go to 16 bits
3639     //this solution may not be optimal
3640     __m128i maskff, a16, b16, r16_1, r16_2;
3641     maskff = _mm_set1_epi16(0xff);
3642     a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
3643     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3644     r16_1 = _mm_mullo_epi16 (a16, b16);
3645     r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
3646     //swap hi and low part of a and b to process the remaining data
3647     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3648     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3649     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3650     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
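    // note: sign extension (instead of zero extension) is harmless here because only the low 8 bits of each product survive the 0xff mask below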
3651 
3652     r16_2 = _mm_mullo_epi16 (a16, b16);
3653     r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
3654     return _mm_packus_epi16 (r16_1,  r16_2);
3655 }
3656 
3657 _NEON2SSESTORAGE uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
3658 #define vmulq_u16 _mm_mullo_epi16
3659 
3660 _NEON2SSESTORAGE uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
3661 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
3662 
3663 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
3664 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
3665 {
3666     //may be optimized
3667     __m128i c1, res, tmp, bmasked;
3668     int i;
3669     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
3670     c1 = vshrq_n_u8(c1,7); //0x1
3671     bmasked = _mm_and_si128(b, c1); //0x1
3672     res = vmulq_u8(a, bmasked);
3673     for(i = 1; i<8; i++) {
3674         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3675         bmasked = _mm_and_si128(b, c1); //0x1
3676         tmp = vmulq_u8(a, bmasked);
3677         res = _mm_xor_si128(res, tmp);
3678     }
3679     return res;
3680 }
3681 
3682 //************************* Vector long multiply ***********************************
3683 //****************************************************************************
3684 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
3685 _NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
3686 {
3687     //no 8 bit simd multiply, need to go to 16 bits
3688     __m128i a16, b16;
3689     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
3690     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
3691     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3692 }
3693 
3694 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
3695 _NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
3696 {
3697 #ifdef USE_SSE4
3698     __m128i a16, b16;
3699     a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
3700     b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
3701     return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3702 #else
3703     __m128i low, hi, a128,b128;
3704     a128 = _pM128i(a);
3705     b128 = _pM128i(b);
3706     low =  _mm_mullo_epi16(a128,b128);
3707     hi =   _mm_mulhi_epi16(a128,b128);
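    // interleaving the low and high 16-bit halves of each product reconstructs the full 32-bit products for the four low lanes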
3708     return _mm_unpacklo_epi16(low,hi);
3709 #endif
3710 }
3711 
3712 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
3713 _NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
3714 {
3715     __m128i ab, ba, a128, b128;
3716     a128 = _pM128i(a);
3717     b128 = _pM128i(b);
3718     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3719     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
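    // after the two interleaves, dwords 0 and 2 of ab hold a0,a1 and dwords 0 and 2 of ba hold b0,b1, so a single packed 32x32->64 multiply yields both products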
3720     return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
3721 }
3722 
3723 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
3724 _NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
3725 {
3726     //no 8 bit simd multiply, need to go to 16 bits
3727     __m128i a16, b16;
3728     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
3729     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
3730     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3731 }
3732 
3733 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
3734 _NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.U16 q0,d0,d0
3735 {
3736 #ifdef USE_SSE4
3737     __m128i a16, b16;
3738     a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
3739     b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
3740     return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3741 #else
3742     __m128i a128,b128,low, hi;
3743     a128 = _pM128i(a);
3744     b128 = _pM128i(b);
3745     low =  _mm_mullo_epi16(a128,b128);
3746     hi =   _mm_mulhi_epu16(a128,b128);
3747     return _mm_unpacklo_epi16(low,hi);
3748 #endif
3749 }
3750 
3751 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
3752 _NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
3753 {
3754     //may not be optimal compared with a serial implementation
3755     __m128i ab, ba, a128, b128;
3756     a128 = _pM128i(a);
3757     b128 = _pM128i(b);
3758     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3759     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
3760     return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
3761 }
3762 
3763 _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
3764 _NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
3765 {
3766     //may be optimized
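    // widening carry-less multiply: bit i of b selects a << i as a 16-bit partial product, and the partial products are accumulated with XOR (same scheme as vmul_p8 above)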
3767     __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
3768     int i;
3769     a128 = _pM128i(a);
3770     b128 = _pM128i(b);
3771     c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
3772     c1 = vshrq_n_u8(c1,7); //0x1
3773     bmasked = _mm_and_si128(b128, c1); //0x1
3774 
3775     a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
3776     bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3777     res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
3778     for(i = 1; i<8; i++) {
3779         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3780         bmasked = _mm_and_si128(b128, c1); //0x1
3781         bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3782         tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
3783         res = _mm_xor_si128(res, tmp);
3784     }
3785     return res;
3786 }
3787 
3788 //****************Vector saturating doubling long multiply **************************
3789 //*****************************************************************
3790 _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
3791 _NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
3792 {
3793     //the serial solution may be faster due to saturation
3794     __m128i res;
3795     res = vmull_s16(a, b);
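    // vqd_s32 (a helper defined earlier in this file) is assumed to double each 32-bit product with saturation, completing the saturating doubling multiply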
3796     return vqd_s32(res);
3797 }
3798 
3799 _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
3800 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
3801 {
3802     //the serial solution may be faster due to saturation
3803     __m128i res;
3804     res = vmull_s32(a,b);
3805     return vqaddq_s64(res,res); //slow serial function!!!!
3806 }
3807 
3808 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
3809 //******************************************************************************************
3810 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
3811 _NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
3812 {
3813     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
3814     int8x8_t res64;
3815     __m128i b128, c128, res;
3816     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3817     c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3818     res = _mm_mullo_epi16 (c128, b128);
3819     res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
3820     res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3821     return64(res);
3822 }
3823 
3824 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
3825 _NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
3826 {
3827     int16x4_t res64;
3828     return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3829 }
3830 
3831 
3832 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
3833 _NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
3834 {
3835     int32x2_t res64;
3836     __m128i res;
3837     res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
3838     res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
3839     return64(res);
3840 }
3841 
3842 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
3843 _NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
3844 {
3845     //fma is coming soon, but right now:
3846     __m128 res;
3847     __m64_128 res64;
3848     res = _mm_mul_ps (_pM128(c), _pM128(b));
3849     res = _mm_add_ps (_pM128(a), res);
3850     _M64f(res64, res);
3851     return res64;
3852 }
3853 
3854 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
3855 _NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
3856 {
3857     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
3858     uint8x8_t res64;
3859     __m128i mask, b128, c128, res;
3860     mask = _mm_set1_epi16(0xff);
3861     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3862     c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3863     res = _mm_mullo_epi16 (c128, b128);
3864     res = _mm_and_si128(res, mask); //to avoid saturation
3865     res = _mm_packus_epi16 (res, res);
3866     res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3867     return64(res);
3868 }
3869 
3870 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
3871 #define vmla_u16 vmla_s16
3872 
3873 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
3874 #define vmla_u32 vmla_s32
3875 
3876 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
3877 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
3878 {
3879     //this solution may not be optimal
3880     // no 8 bit simd multiply, need to go to 16 bits
3881     __m128i b16, c16, r16_1, a_2,r16_2;
3882     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3883     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
3884     r16_1 = _mm_mullo_epi16 (b16, c16);
3885     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3886     r16_1 = _mm_add_epi8 (r16_1, a);
3887     //swap hi and low part of a, b and c to process the remaining data
3888     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3889     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3890     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3891     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3892     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
3893 
3894     r16_2 = _mm_mullo_epi16 (b16, c16);
3895     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3896     r16_2 = _mm_add_epi8(r16_2, a_2);
3897     return _mm_unpacklo_epi64(r16_1,r16_2);
3898 }
3899 
3900 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
3901 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
3902 {
3903     __m128i res;
3904     res = _mm_mullo_epi16 (c, b);
3905     return _mm_add_epi16 (res, a);
3906 }
3907 
3908 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
3909 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
3910 {
3911     __m128i res;
3912     res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
3913     return _mm_add_epi32 (res, a);
3914 }
3915 
3916 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
3917 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
3918 {
3919     //fma is coming soon, but right now:
3920     __m128 res;
3921     res = _mm_mul_ps (c, b);
3922     return _mm_add_ps (a, res);
3923 }
3924 
3925 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
3926 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
3927 {
3928     //this solution may not be optimal
3929     // no 8 bit simd multiply, need to go to 16 bits
3930     __m128i b16, c16, r16_1, a_2, r16_2;
3931     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3932     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
3933     r16_1 = _mm_mullo_epi16 (b16, c16);
3934     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3935     r16_1 = _mm_add_epi8 (r16_1, a);
3936     //swap hi and low part of a, b and c to process the remaining data
3937     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3938     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3939     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3940     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
3941     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
3942 
3943     r16_2 = _mm_mullo_epi16 (b16, c16);
3944     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3945     r16_2 = _mm_add_epi8(r16_2, a_2);
3946     return _mm_unpacklo_epi64(r16_1,r16_2);
3947 }
3948 
3949 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
3950 #define vmlaq_u16 vmlaq_s16
3951 
3952 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
3953 #define vmlaq_u32 vmlaq_s32
3954 
3955 //**********************  Vector widening multiply accumulate (long multiply accumulate):
3956 //                          vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
3957 //********************************************************************************************
3958 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
3959 _NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
3960 {
3961     int16x8_t res;
3962     res = vmull_s8(b, c);
3963     return _mm_add_epi16 (res, a);
3964 }
3965 
3966 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
3967 _NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
3968 {
3969     //may not be optimal compared with a serial implementation
3970     int32x4_t res;
3971     res = vmull_s16(b,  c);
3972     return _mm_add_epi32 (res, a);
3973 }
3974 
3975 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
3976 _NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
3977 {
3978     //may not be optimal compared with a serial implementation
3979     int64x2_t res;
3980     res = vmull_s32( b, c);
3981     return _mm_add_epi64 (res, a);
3982 }
3983 
3984 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
3985 _NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
3986 {
3987     uint16x8_t res;
3988     res = vmull_u8(b, c);
3989     return _mm_add_epi16 (res, a);
3990 }
3991 
3992 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
3993 _NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.U16 q0,d0,d0
3994 {
3995     //may not be optimal compared with a serial implementation
3996     uint32x4_t res;
3997     res = vmull_u16(b, c);
3998     return _mm_add_epi32 (res, a);
3999 }
4000 
4001 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
4002 _NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
4003 {
4004     //may not be optimal compared with a serial implementation
4005     int64x2_t res;
4006     res = vmull_u32( b,c);
4007     return _mm_add_epi64 (res, a);
4008 }
4009 
4010 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
4011 //********************************************************************************************
4012 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
4013 _NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
4014 {
4015     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
4016     int8x8_t res64;
4017     __m128i res;
4018     res64 = vmul_s8(b,c);
4019     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4020     return64(res);
4021 }
4022 
4023 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
4024 _NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
4025 {
4026     int16x4_t res64;
4027     return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
4028 }
4029 
4030 
4031 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
4032 _NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
4033 {
4034     int32x2_t res64;
4035     __m128i res;
4036     res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
4037     res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
4038     return64(res);
4039 }
4040 
4041 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
4042 _NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
4043 {
4044     __m128 res;
4045     __m64_128 res64;
4046     res = _mm_mul_ps (_pM128(c), _pM128(b));
4047     res = _mm_sub_ps (_pM128(a), res);
4048     _M64f(res64, res);
4049     return res64;
4050 }
4051 
4052 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
4053 _NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
4054 {
4055     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
4056     uint8x8_t res64;
4057     __m128i res;
4058     res64 = vmul_u8(b,c);
4059     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4060     return64(res);
4061 }
4062 
4063 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
4064 #define vmls_u16 vmls_s16
4065 
4066 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
4067 #define vmls_u32 vmls_s32
4068 
4069 
4070 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
4071 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
4072 {
4073     //the solution may not be optimal
4074     // no 8 bit simd multiply, need to go to 16 bits
4075     __m128i b16, c16, r16_1, a_2, r16_2;
4076     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
4077     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
4078     r16_1 = _mm_mullo_epi16 (b16, c16);
4079     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
4080     r16_1 = _mm_sub_epi8 (a, r16_1);
4081     //swap hi and low part of a, b, c to process the remaining data
4082     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4083     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4084     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4085     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
4086     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
4087 
4088     r16_2 = _mm_mullo_epi16 (b16, c16);
4089     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4090     r16_2 = _mm_sub_epi8 (a_2, r16_2);
4091     return _mm_unpacklo_epi64(r16_1,r16_2);
4092 }
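//Illustrative note (not part of the original code): after _mm_mullo_epi16 each 16-bit lane holds b[i]*c[i],
//and on little-endian x86 its low byte sits at the even byte position, so gathering the even bytes with the
//mask8_16_even_odd shuffle is the same 8-bit truncation NEON performs, e.g. (int16_t)(-3 * 50) == -150 == 0xFF6A
//and its low byte 0x6A == 106 == (uint8_t)(-150).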
4093 
4094 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
4095 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
4096 {
4097     __m128i res;
4098     res = _mm_mullo_epi16 (c, b);
4099     return _mm_sub_epi16 (a, res);
4100 }
4101 
4102 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
4103 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
4104 {
4105     __m128i res;
4106     res = _MM_MULLO_EPI32 (c, b); //SSE4.1
4107     return _mm_sub_epi32 (a, res);
4108 }
4109 
4110 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
4111 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
4112 {
4113     __m128 res;
4114     res = _mm_mul_ps (c, b);
4115     return _mm_sub_ps (a, res);
4116 }
4117 
4118 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
4119 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
4120 {
4121     //the solution may not be optimal
4122     // no 8 bit simd multiply, need to go to 16 bits
4123     __m128i b16, c16, r16_1, a_2, r16_2;
4124     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
4125     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
4126     r16_1 = _mm_mullo_epi16 (b16, c16);
4127     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
4128     r16_1 = _mm_sub_epi8 (a, r16_1);
4129     //swap hi and low part of a, b and c to process the remaining data
4130     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4131     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4132     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4133     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
4134     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
4135 
4136     r16_2 = _mm_mullo_epi16 (b16, c16);
4137     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4138     r16_2 = _mm_sub_epi8(a_2, r16_2);
4139     return _mm_unpacklo_epi64(r16_1,r16_2);
4140 }
4141 
4142 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
4143 #define vmlsq_u16 vmlsq_s16
4144 
4145 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
4146 #define vmlsq_u32 vmlsq_s32
4147 
4148 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
4149 //*************************************************************************************************************
4150 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
4151 _NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
4152 {
4153     int16x8_t res;
4154     res = vmull_s8(b, c);
4155     return _mm_sub_epi16 (a, res);
4156 }
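//Usage sketch (acc_minus_weighted is a hypothetical helper, shown for illustration only): the widening form
//keeps 16-bit precision while the inputs stay 8-bit, e.g. subtracting a weighted byte vector from an accumulator:
/*
    int16x8_t acc_minus_weighted(int16x8_t acc, int8x8_t v, int8x8_t w)
    {
        return vmlsl_s8(acc, v, w); //acc[i] - (int16_t)v[i] * w[i], no intermediate overflow
    }
*/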
4157 
4158 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
4159 _NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
4160 {
4161     //may not be optimal compared with a serial implementation
4162     int32x4_t res;
4163     res = vmull_s16(b,  c);
4164     return _mm_sub_epi32 (a, res);
4165 }
4166 
4167 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
4168 _NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
4169 {
4170     //may not be optimal compared with a serial implementation
4171     int64x2_t res;
4172     res = vmull_s32( b,c);
4173     return _mm_sub_epi64 (a, res);
4174 }
4175 
4176 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
4177 _NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
4178 {
4179     uint16x8_t res;
4180     res = vmull_u8(b, c);
4181     return _mm_sub_epi16 (a, res);
4182 }
4183 
4184 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
4185 _NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
4186 {
4187     //may not be optimal compared with a serial implementation
4188     uint32x4_t res;
4189     res = vmull_u16(b, c);
4190     return _mm_sub_epi32 (a, res);
4191 }
4192 
4193 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
4194 _NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
4195 {
4196     //may not be optimal compared with a serial implementation
4197     uint64x2_t res;
4198     res = vmull_u32( b,c);
4199     return _mm_sub_epi64 (a, res);
4200 }
4201 
4202 //******  Vector saturating doubling multiply high **********************
4203 //*************************************************************************
4204 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
4205 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4206 {
4207     int16x4_t res;
4208     int32_t a32, b32, i;
4209     for (i = 0; i<4; i++) {
4210         a32 = (int32_t) a.m64_i16[i];
4211         b32 = (int32_t) b.m64_i16[i];
4212         a32 = (a32 * b32) >> 15;
4213         res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
4214     }
4215     return res;
4216 }
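//Worked example for the saturation check above (illustration): a == b == -32768 gives
//(-32768 * -32768) >> 15 == 0x8000 == 32768, which does not fit int16_t, so the code returns 0x7fff;
//every other input pair already lies in the int16_t range (from -32767 up to 32766).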
4217 
4218 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
4219 _NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so some tricks are needed; a serial solution may be faster
4220 {
4221     //may not be optimal compared with a serial solution
4222     int32x2_t res64;
4223     __m128i mask;
4224     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4225     int64x2_t mul;
4226     mul = vmull_s32(a,b);
4227     mul = _mm_slli_epi64(mul,1); //double the result
4228     //at this point start treating 2 64-bit numbers as 4 32-bit
4229     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4230     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4231     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4232     return64(mul);
4233 }
4234 
4235 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
4236 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
4237 {
4238     __m128i res, res_lo, mask;
4239     _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4240     res = _mm_mulhi_epi16 (a, b);
4241     res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
4242     res_lo = _mm_mullo_epi16 (a, b);
4243     res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
4244     res = _mm_add_epi16(res, res_lo); //combine results
4245     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4246     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
4247 }
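//Why the two partial products are combined this way (illustrative note): the NEON result is bits [30:15]
//of a*b. _mm_mulhi_epi16 supplies bits [31:16]; shifted left by one it covers bits [30:16], and the missing
//bit 15 is exactly _mm_mullo_epi16(a,b) >> 15, e.g. a == 0x4000, b == 0x6000: a*b == 0x18000000,
//(0x1800 << 1) + 0 == 0x3000 == (a*b) >> 15.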
4248 
4249 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
4250 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4251 {
4252     // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
4253     __m128i ab, ba, mask, mul, mul1;
4254     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4255     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4256     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
4257     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4258     mul = _mm_slli_epi64(mul,1); //double the result
4259     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4260     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
4261     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4262     mul1 = _mm_slli_epi64(mul1,1); //double the result
4263     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4264     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4265     mul = _mm_unpacklo_epi64(mul, mul1);
4266     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4267     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4268 }
4269 
4270 //********* Vector saturating rounding doubling multiply high ****************
4271 //****************************************************************************
4272 //If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order
4273 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
4274 _NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
4275 {
4276     int16x4_t res64;
4277     return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
4278 }
4279 
4280 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
4281 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4282 {
4283     //may not be optimal compared with a serial solution
4284     int32x2_t res64;
4285     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4286     __m128i res_sat, mask, mask1;
4287     int64x2_t mul;
4288     mul = vmull_s32(a,b);
4289     res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4290     mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
4291     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
4292     mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
4293     //at this point start treating 2 64-bit numbers as 4 32-bit
4294     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4295     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4296     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4297     return64(mul);
4298 }
4299 
4300 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
4301 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
4302 {
4303     __m128i mask, res;
4304     _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4305     res = _mm_mulhrs_epi16 (a, b);
4306     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4307     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
4308 }
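//Saturation trick above (illustrative note): _mm_mulhrs_epi16 produces the bit pattern 0x8000 only for the
//single overflowing pair a == b == -32768; _mm_cmpeq_epi16 turns exactly those lanes into 0xffff and the
//final xor flips 0x8000 into the saturated 0x7fff while leaving every other lane unchanged (x ^ 0 == x).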
4309 
4310 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
4311 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4312 {
4313     // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
4314     __m128i ab, ba,  mask, mul, mul1, mask1;
4315     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4316     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4317     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
4318     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4319     mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4320     mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
4321     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
4322     mul = _mm_add_epi32 (mul, mask1); //actual rounding
4323 
4324     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4325     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
4326     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4327     mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
4328     mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
4329     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
4330     mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
4331     //at this point start treating 2 64-bit numbers as 4 32-bit
4332     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4333     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4334     mul = _mm_unpacklo_epi64(mul, mul1);
4335     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4336     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4337 }
4338 
4339 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
4340 //*************************************************************************************************************************
4341 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
4342 _NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
4343 {
4344     //not an optimal SIMD solution, serial may be faster
4345     __m128i res32;
4346     res32 = vmull_s16(b,  c);
4347     res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
4348     return vqaddq_s32(res32, a); //saturation
4349 }
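//Worked example (illustration): b[i] == c[i] == -32768 gives the product 0x40000000; doubling it would be
//0x80000000, which vqd_s32 saturates to 0x7fffffff before the final saturating add with a[i], matching the
//two independent saturation points of the NEON VQDMLAL.S16 instruction.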
4350 
4351 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
4352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
4353 {
4354     __m128i res64;
4355     res64 = vmull_s32(b,c);
4356     res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
4357     return vqaddq_s64(res64, a); //saturation
4358 }
4359 
4360 //************************************************************************************
4361 //******************  Vector subtract ***********************************************
4362 //************************************************************************************
4363 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
4364 _NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
4365 {
4366     int8x8_t res64;
4367     return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
4368 }
4369 
4370 
4371 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
4372 _NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
4373 {
4374     int16x4_t res64;
4375     return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
4376 }
4377 
4378 
4379 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
4380 _NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
4381 {
4382     int32x2_t res64;
4383     return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
4384 }
4385 
4386 
4387 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
4388 _NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
4389 {
4390     int64x1_t res64;
4391     res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
4392     return res64;
4393 }
4394 
4395 
4396 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
4397 _NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
4398 {
4399     float32x2_t res;
4400     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
4401     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
4402     return res;
4403 }
4404 
4405 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
4406 #define vsub_u8 vsub_s8
4407 
4408 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
4409 #define vsub_u16 vsub_s16
4410 
4411 _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
4412 #define vsub_u32 vsub_s32
4413 
4414 
4415 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
4416 _NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
4417 {
4418     uint64x1_t res64;
4419     res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
4420     return res64;
4421 }
4422 
4423 
4424 _NEON2SSESTORAGE int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
4425 #define vsubq_s8 _mm_sub_epi8
4426 
4427 _NEON2SSESTORAGE int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
4428 #define vsubq_s16 _mm_sub_epi16
4429 
4430 _NEON2SSESTORAGE int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
4431 #define vsubq_s32 _mm_sub_epi32
4432 
4433 _NEON2SSESTORAGE int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
4434 #define vsubq_s64 _mm_sub_epi64
4435 
4436 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
4437 #define vsubq_f32 _mm_sub_ps
4438 
4439 _NEON2SSESTORAGE uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
4440 #define vsubq_u8 _mm_sub_epi8
4441 
4442 _NEON2SSESTORAGE uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
4443 #define vsubq_u16 _mm_sub_epi16
4444 
4445 _NEON2SSESTORAGE uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
4446 #define vsubq_u32 _mm_sub_epi32
4447 
4448 _NEON2SSESTORAGE uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
4449 #define vsubq_u64 _mm_sub_epi64
4450 
4451 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
4452 //***********************************************************************************
4453 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
4454 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
4455 _NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
4456 {
4457     __m128i a16, b16;
4458     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
4459     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4460     return _mm_sub_epi16 (a16, b16);
4461 }
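//Scalar view of the widening subtract above (illustration): both operands are sign-extended first, so the
//difference never wraps, e.g. a[i] == -128, b[i] == 127 gives (int16_t)(-128 - 127) == -255, whereas the
//plain 8-bit vsub_s8 would wrap the same inputs to +1.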
4462 
4463 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
4464 _NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
4465 {
4466     __m128i a32, b32;
4467     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
4468     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4469     return _mm_sub_epi32 (a32, b32);
4470 }
4471 
4472 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
4473 _NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
4474 {
4475     //may not be optimal
4476     __m128i a64, b64;
4477     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
4478     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
4479     return _mm_sub_epi64 (a64, b64);
4480 }
4481 
4482 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
4483 _NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
4484 {
4485     __m128i a16, b16;
4486     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
4487     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4488     return _mm_sub_epi16 (a16, b16);
4489 }
4490 
4491 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
4492 _NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
4493 {
4494     __m128i a32, b32;
4495     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
4496     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4497     return _mm_sub_epi32 (a32, b32);
4498 }
4499 
4500 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
4501 _NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
4502 {
4503     //may not be optimal
4504     __m128i a64, b64;
4505     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
4506     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
4507     return _mm_sub_epi64 (a64, b64);
4508 }
4509 
4510 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
4511 //*****************************************************************************************************
4512 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
4513 _NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
4514 {
4515     __m128i b16;
4516     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4517     return _mm_sub_epi16 (a, b16);
4518 }
4519 
4520 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
4521 _NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
4522 {
4523     __m128i b32;
4524     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4525     return _mm_sub_epi32 (a, b32);
4526 }
4527 
4528 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
4529 _NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
4530 {
4531     __m128i b64;
4532     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
4533     return _mm_sub_epi64 (a, b64);
4534 }
4535 
4536 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
4537 _NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
4538 {
4539     __m128i b16;
4540     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4541     return _mm_sub_epi16 (a, b16);
4542 }
4543 
4544 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
4545 _NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
4546 {
4547     __m128i b32;
4548     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4549     return _mm_sub_epi32 (a, b32);
4550 }
4551 
4552 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
4553 _NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
4554 {
4555     __m128i b64;
4556     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
4557     return _mm_sub_epi64 (a, b64);
4558 }
4559 
4560 //************************Vector saturating subtract *********************************
4561 //*************************************************************************************
4562 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
4563 _NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
4564 {
4565     int8x8_t res64;
4566     return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
4567 }
4568 
4569 
4570 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
4571 _NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
4572 {
4573     int16x4_t res64;
4574     return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
4575 }
4576 
4577 
4578 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
4579 _NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
4580 {
4581     int32x2_t res64;
4582     return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
4583 }
4584 
4585 
4586 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
4587 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4588 {
4589     uint64x1_t res;
4590     uint64_t a64,b64;
4591     a64 = a.m64_u64[0];
4592     b64 = b.m64_u64[0];
4593     res.m64_u64[0] = a64 - b64;
4594 
4595     a64 =  (a64 >> 63) + (~_SIGNBIT64);
4596     if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
4597         res.m64_u64[0] = a64;
4598     }
4599     return res;
4600 }
4601 
4602 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
4603 _NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
4604 {
4605     uint8x8_t res64;
4606     return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
4607 }
4608 
4609 
4610 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
4611 _NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
4612 {
4613     uint16x4_t res64;
4614     return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
4615 }
4616 
4617 
4618 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
4619 _NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
4620 {
4621     uint32x2_t res64;
4622     return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
4623 }
4624 
4625 
4626 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
4627 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4628 {
4629     uint64x1_t res;
4630     uint64_t a64, b64;
4631     a64 = _Ui64(a);
4632     b64 = _Ui64(b);
4633     if (a64 > b64) {
4634         res.m64_u64[0] = a64 - b64;
4635     } else {
4636         res.m64_u64[0] = 0;
4637     }
4638     return res;
4639 }
4640 
4641 _NEON2SSESTORAGE int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
4642 #define vqsubq_s8 _mm_subs_epi8
4643 
4644 _NEON2SSESTORAGE int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
4645 #define vqsubq_s16 _mm_subs_epi16
4646 
4647 _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
4648 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
4649 {
4650     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the difference has the sign opposite to a
4651     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
4652     c7fffffff = _mm_set1_epi32(0x7fffffff);
4653     res = _mm_sub_epi32(a, b);
4654     res_sat = _mm_srli_epi32(a, 31);
4655     res_sat = _mm_add_epi32(res_sat, c7fffffff);
4656     res_xor_a = _mm_xor_si128(res, a);
4657     b_xor_a = _mm_xor_si128(b, a);
4658     res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
4659     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if the value is negative, all zeros otherwise
4660     res_sat = _mm_and_si128(res_xor_a, res_sat);
4661     res = _mm_andnot_si128(res_xor_a, res);
4662     return _mm_or_si128(res, res_sat);
4663 }
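//How the masks above saturate (illustrative walk-through): overflow needs a and b of opposite signs with the
//raw difference taking the sign opposite to a, which is what the (b ^ a) & (res ^ a) sign test detects per lane.
//res_sat pre-computes the right boundary from a's sign: ((uint32_t)a >> 31) + 0x7fffffff is 0x7fffffff for
//a >= 0 and 0x80000000 for a < 0, e.g. a == INT32_MIN, b == 1 overflows and the lane becomes 0x80000000.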
4664 
4665 _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
4666 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4667 {
4668     _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
4669     _NEON2SSE_ALIGN_16 uint64_t res[2];
4670     _mm_store_si128((__m128i*)atmp, a);
4671     _mm_store_si128((__m128i*)btmp, b);
4672     res[0] = atmp[0] - btmp[0];
4673     res[1] = atmp[1] - btmp[1];
4674     if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
4675         res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
4676     }
4677     if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
4678         res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
4679     }
4680     return _mm_load_si128((__m128i*)res);
4681 }
4682 
4683 _NEON2SSESTORAGE uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
4684 #define vqsubq_u8 _mm_subs_epu8
4685 
4686 _NEON2SSESTORAGE uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
4687 #define vqsubq_u16 _mm_subs_epu16
4688 
4689 _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
4690 _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
4691 {
4692     __m128i min, mask, sub;
4693     min = _MM_MIN_EPU32(a, b); //SSE4.1
4694     mask = _mm_cmpeq_epi32 (min,  b);
4695     sub = _mm_sub_epi32 (a, b);
4696     return _mm_and_si128 ( sub, mask);
4697 }
4698 
4699 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
4700 #ifdef USE_SSE4
4701     _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
4702     {
4703         __m128i c80000000, subb, suba, cmp, sub;
4704         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
4705         sub  = _mm_sub_epi64 (a, b);
4706         suba = _mm_sub_epi64 (a, c80000000);
4707         subb = _mm_sub_epi64 (b, c80000000);
4708         cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
4709         return _mm_and_si128 (sub, cmp); //saturation
4710     }
4711 #else
4712     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4713     {
4714         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
4715         _mm_store_si128((__m128i*)atmp, a);
4716         _mm_store_si128((__m128i*)btmp, b);
4717         res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
4718         res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
4719         return _mm_load_si128((__m128i*)(res));
4720     }
4721 #endif
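//Note on the SSE4.2 branch above (illustration): subtracting 2^63 from both operands maps unsigned order onto
//signed order (a < b exactly when a - 2^63 < b - 2^63 as signed), so _mm_cmpgt_epi64 yields a valid "a > b"
//mask that zeroes the wrapped lanes of the plain subtraction, e.g. a == 1, b == 2: the subtraction wraps to
//0xffffffffffffffff, the mask is 0, and the saturated result is 0.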
4722 
4723 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
4724 //****************************************************************
4725 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
4726 _NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
4727 {
4728     //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
4729     int8x8_t res64;
4730     __m128i r16;
4731     int8x8_t r;
4732     r = vsub_s8 (a, b);
4733     r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
4734     r16 = _mm_srai_epi16 (r16, 1); //SSE2
4735     r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
4736     return64(r16);
4737 }
4738 
4739 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
4740 _NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
4741 {
4742     int16x4_t res64;
4743     return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
4744 }
4745 
4746 
4747 
4748 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
4749 _NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
4750 {
4751     int32x2_t res64;
4752     return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
4753 }
4754 
4755 
4756 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
4757 _NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
4758 {
4759     uint8x8_t res64;
4760     return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
4761 }
4762 
4763 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
4764 _NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
4765 {
4766     uint16x4_t res64;
4767     return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
4768 }
4769 
4770 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
4771 _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
4772 {
4773     uint32x2_t res64;
4774     return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
4775 }
4776 
4777 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
4778 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
4779 {
4780     //need to deal with the possibility of internal overflow
4781     __m128i c128, au,bu;
4782     c128 = _mm_set1_epi8((int8_t)128);
4783     au = _mm_add_epi8( a, c128);
4784     bu = _mm_add_epi8( b, c128);
4785     return vhsubq_u8(au,bu);
4786 }
4787 
4788 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
4789 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
4790 {
4791     //need to deal with the possibility of internal overflow
4792     __m128i c8000, au,bu;
4793     c8000 = _mm_set1_epi16((int16_t)0x8000);
4794     au = _mm_add_epi16( a, c8000);
4795     bu = _mm_add_epi16( b, c8000);
4796     return vhsubq_u16(au,bu);
4797 }
4798 
4799 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
4800 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
4801 {
4802     //need to deal with the possibility of internal overflow
4803     __m128i a2, b2,r, b_1;
4804     a2 = _mm_srai_epi32 (a,1);
4805     b2 = _mm_srai_epi32 (b,1);
4806     r = _mm_sub_epi32 (a2, b2);
4807     b_1 = _mm_andnot_si128(a, b); //!a and b
4808     b_1 = _mm_slli_epi32 (b_1,31);
4809     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
4810     return _mm_sub_epi32(r,b_1);
4811 }
4812 
4813 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
4814 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
4815 {
4816     __m128i avg;
4817     avg = _mm_avg_epu8 (a, b);
4818     return _mm_sub_epi8(a, avg);
4819 }
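//Identity used above (illustration): _mm_avg_epu8 computes (a + b + 1) >> 1 without overflow, and
//a - ((a + b + 1) >> 1) equals (a - b) >> 1 for the halving subtract, e.g. a == 10, b == 7:
//avg == 9 and 10 - 9 == 1 == (10 - 7) >> 1; the identity also holds for the wrapped case a < b.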
4820 
4821 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
4822 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
4823 {
4824     __m128i avg;
4825     avg = _mm_avg_epu16 (a, b);
4826     return _mm_sub_epi16(a, avg);
4827 }
4828 
4829 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
4830 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
4831 {
4832     //need to deal with the possibility of internal overflow
4833     __m128i a2, b2,r, b_1;
4834     a2 = _mm_srli_epi32 (a,1);
4835     b2 = _mm_srli_epi32 (b,1);
4836     r = _mm_sub_epi32 (a2, b2);
4837     b_1 = _mm_andnot_si128(a, b); //!a and b
4838     b_1 = _mm_slli_epi32 (b_1,31);
4839     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
4840     return _mm_sub_epi32(r,b_1);
4841 }
4842 
4843 //******* Vector subtract high half (truncated) ** ************
4844 //************************************************************
4845 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
4846 _NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
4847 {
4848     int8x8_t res64;
4849     __m128i sum, sum8;
4850     sum = _mm_sub_epi16 (a, b);
4851     sum8 = _mm_srai_epi16 (sum, 8);
4852     sum8 = _mm_packs_epi16(sum8,sum8);
4853     return64(sum8);
4854 }
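//Illustration: the truncating narrow keeps bits [15:8] of each difference, e.g. a[i] - b[i] == 0x1234 gives
//0x12; the arithmetic shift keeps the sign, the shifted values all fit int8_t, so _mm_packs_epi16 never
//saturates and merely lays the 8 results out in the low 64 bits consumed by return64.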
4855 
4856 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
4857 _NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
4858 {
4859     int16x4_t res64;
4860     __m128i sum, sum16;
4861     sum = _mm_sub_epi32 (a, b);
4862     sum16 = _mm_srai_epi32 (sum, 16);
4863     sum16 = _mm_packs_epi32(sum16,sum16);
4864     return64(sum16);
4865 }
4866 
4867 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
4868 _NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
4869 {
4870     int32x2_t res64;
4871     __m128i sub;
4872     sub = _mm_sub_epi64 (a, b);
4873     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
4874     return64(sub);
4875 }
4876 
4877 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
4878 _NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
4879 {
4880     uint8x8_t res64;
4881     __m128i sum, sum8;
4882     sum = _mm_sub_epi16 (a, b);
4883     sum8 = _mm_srli_epi16 (sum, 8);
4884     sum8 =  _mm_packus_epi16(sum8,sum8);
4885     return64(sum8);
4886 }
4887 
4888 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
4889 _NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
4890 {
4891     uint16x4_t res64;
4892      __m128i sum, sum16;
4893     sum = _mm_sub_epi32 (a, b);
4894     sum16 = _mm_srli_epi32 (sum, 16);
4895 #ifdef USE_SSE4
4896     sum16 =  _MM_PACKUS1_EPI32(sum16);
4897 #else
4898     sum16  = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4899 #endif
4900     return64(sum16);
4901 }
4902 
4903 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
4904 #define vsubhn_u64 vsubhn_s64
4905 
4906 //************ Vector rounding subtract high half *********************
4907 //*********************************************************************
4908 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
4909 _NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
4910 {
4911     int8x8_t res64;
4912     __m128i sub, mask1;
4913     sub = _mm_sub_epi16 (a, b);
4914     mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4915     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
4916     sub = _mm_srai_epi16 (sub, 8); //get high half
4917     sub = _mm_add_epi16 (sub, mask1); //actual rounding
4918     sub =  _mm_packs_epi16 (sub, sub);
4919     return64(sub);
4920 }
4921 
4922 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
4923 _NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
4924 {
4925     //SIMD may not be optimal, serial may be faster
4926     int16x4_t res64;
4927     __m128i sub, mask1;
4928     sub = _mm_sub_epi32 (a, b);
4929     mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4930     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
4931     sub = _mm_srai_epi32 (sub, 16); //get high half
4932     sub = _mm_add_epi32 (sub, mask1); //actual rounding
4933     sub = _mm_packs_epi32 (sub, sub);
4934     return64(sub);
4935 }
4936 
4937 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
4938 _NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
4939 {
4940     //SIMD may not be optimal, serial may be faster
4941     int32x2_t res64;
4942     __m128i sub, mask1;
4943     sub = _mm_sub_epi64 (a, b);
4944     mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
4945     mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
4946     sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
4947     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
4948     return64(sub);
4949 }
4950 
4951 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
4952 _NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
4953 {
4954     uint8x8_t res64;
4955     __m128i sub, mask1;
4956     sub = _mm_sub_epi16 (a, b);
4957     mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4958     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
4959     sub = _mm_srai_epi16 (sub, 8); //get high half
4960     sub = _mm_add_epi16 (sub, mask1); //actual rounding
4961     sub = _mm_packus_epi16 (sub, sub);
4962     return64(sub);
4963 }
4964 
4965 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
4966 _NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
4967 {
4968     //SIMD may not be optimal, serial may be faster
4969     uint16x4_t res64;
4970     __m128i sub, mask1;
4971     sub = _mm_sub_epi32 (a, b);
4972     mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4973     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
4974     sub = _mm_srai_epi32 (sub, 16); //get high half
4975     sub = _mm_add_epi32 (sub, mask1); //actual rounding
4976 #ifdef USE_SSE4
4977     sub =  _MM_PACKUS1_EPI32 (sub);
4978 #else
4979     sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4980 #endif
4981     return64(sub);
4982 }
4983 
4984 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
4985 #define vrsubhn_u64 vrsubhn_s64
4986 
4987 //*********** Vector saturating doubling multiply subtract long ********************
4988 //************************************************************************************
4989 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
4990 _NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
4991 {
4992     //not an optimal SIMD solution, serial may be faster
4993     __m128i res32, mask;
4994     int32x4_t res;
4995     _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4996     res = vmull_s16(b,  c);
4997     res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
4998     mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
4999     res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
5000     return vqsubq_s32(a, res32); //saturation
5001 }
5002 
5003 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
5004 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
5005 {
5006     __m128i res64, mask;
5007     int64x2_t res;
5008     _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
5009     res = vmull_s32(b,  c);
5010     res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
5011     mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
5012     res64 = _mm_xor_si128 (res64,  mask); //res64 saturated for 0x8000000000000000
5013     return vqsubq_s64(a, res64); //saturation
5014 }
5015 
5016 //******************  COMPARISON ***************************************
5017 //******************* Vector compare equal *************************************
5018 //****************************************************************************
5019 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
5020 _NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
5021 {
5022     int8x8_t res64;
5023     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5024 }
5025 
5026 
5027 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
5028 _NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
5029 {
5030     int16x4_t res64;
5031     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5032 }
5033 
5034 
5035 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
5036 _NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
5037 {
5038     int32x2_t res64;
5039     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5040 }
5041 
5042 
5043 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
5044 _NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
5045 {
5046     uint32x2_t res64;
5047     __m128 res;
5048     res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
5049     return64f(res);
5050 }
5051 
5052 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
5053 _NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
5054 {
5055     uint8x8_t res64;
5056     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5057 }
5058 
5059 
5060 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
5061 _NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
5062 {
5063     uint16x4_t res64;
5064     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5065 }
5066 
5067 
5068 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
5069 _NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
5070 {
5071     uint32x2_t res64;
5072     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5073 }
5074 
5075 
5076 _NEON2SSESTORAGE uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
5077 #define vceq_p8 vceq_u8
5078 
5079 
5080 _NEON2SSESTORAGE uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
5081 #define vceqq_s8 _mm_cmpeq_epi8
5082 
5083 _NEON2SSESTORAGE uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
5084 #define vceqq_s16 _mm_cmpeq_epi16
5085 
5086 _NEON2SSESTORAGE uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
5087 #define vceqq_s32 _mm_cmpeq_epi32
5088 
5089 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
5090 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
5091 {
5092     __m128 res;
5093     res = _mm_cmpeq_ps(a,b);
5094     return _M128i(res);
5095 }
5096 
5097 _NEON2SSESTORAGE uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
5098 #define vceqq_u8 _mm_cmpeq_epi8
5099 
5100 _NEON2SSESTORAGE uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
5101 #define vceqq_u16 _mm_cmpeq_epi16
5102 
5103 _NEON2SSESTORAGE uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
5104 #define vceqq_u32 _mm_cmpeq_epi32
5105 
5106 _NEON2SSESTORAGE uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
5107 #define vceqq_p8 _mm_cmpeq_epi8
5108 
5109 //******************Vector compare greater-than or equal*************************
5110 //*******************************************************************************
5111 //IA32 SIMD has no greater-than-or-equal comparison for integers,
5112 //only greater-than is available, so we need the following tricks
5113 
5114 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
5115 _NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
5116 {
5117     int8x8_t res64;
5118     return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
5119 }
5120 
5121 
5122 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
5123 _NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
5124 {
5125     int16x4_t res64;
5126     return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
5127 }
5128 
5129 
5130 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
5131 _NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
5132 {
5133     int32x2_t res64;
5134     return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
5135 }
5136 
5137 
5138 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
5139 _NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
5140 {
5141     uint32x2_t res64;
5142     __m128 res;
5143     res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
5144     return64f(res);
5145 }
5146 
5147 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
5148 _NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
5149 {
5150     uint8x8_t res64;
5151     return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
5152 }
5153 
5154 
5155 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
5156 _NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
5157 {
5158     uint16x4_t res64;
5159     return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
5160 }
5161 
5162 
5163 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
5164 _NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
5165 {
5166     //serial solution looks faster
5167     uint32x2_t res64;
5168     return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
5169 }
5170 
5171 
5172 
5173 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5174 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5175 {
5176     __m128i m1, m2;
5177     m1 = _mm_cmpgt_epi8 ( a, b);
5178     m2 = _mm_cmpeq_epi8 ( a, b);
5179     return _mm_or_si128  ( m1, m2);
5180 }
5181 
5182 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5183 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5184 {
5185     __m128i m1, m2;
5186     m1 = _mm_cmpgt_epi16 ( a, b);
5187     m2 = _mm_cmpeq_epi16 ( a, b);
5188     return _mm_or_si128   ( m1,m2);
5189 }
5190 
5191 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5192 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5193 {
5194     __m128i m1, m2;
5195     m1 = _mm_cmpgt_epi32 (a, b);
5196     m2 = _mm_cmpeq_epi32 (a, b);
5197     return _mm_or_si128   (m1, m2);
5198 }
5199 
5200 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5201 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
5202 {
5203     __m128 res;
5204     res = _mm_cmpge_ps(a,b);
5205     return *(__m128i*)&res;
5206 }
5207 
5208 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5209 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5210 {
5211     //no unsigned byte comparison, only signed is available, so we need a trick
5212     __m128i cmp;
5213     cmp = _mm_max_epu8(a, b);
5214     return _mm_cmpeq_epi8(cmp, a); //a>=b
5215 }
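//Identity behind the unsigned comparison above (illustrative note): for unsigned lanes max(a, b) == a holds
//exactly when a >= b, so the max / compare-equal pair builds the 0xff / 0x00 mask without an unsigned
//greater-than instruction, e.g. a == 200, b == 200: max == 200 == a, so the lane becomes 0xff.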
5216 
5217 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5218 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5219 {
5220     //no unsigned short comparison, only signed is available, so we need a trick
5221 #ifdef USE_SSE4
5222     __m128i cmp;
5223     cmp = _mm_max_epu16(a, b);
5224     return _mm_cmpeq_epi16(cmp, a); //a>=b
5225 #else
5226     __m128i as, mask;
5227     __m128i zero = _mm_setzero_si128();
5228     __m128i cffff = _mm_set1_epi16(0xffff);
5229     as = _mm_subs_epu16(b,a);
5230     mask = _mm_cmpgt_epi16(as, zero);
5231     return _mm_xor_si128 ( mask, cffff);
5232 #endif
5233 }
5234 
5235 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5236 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5237 {
5238     //no unsigned int comparison, only signed is available, so we need a trick
5239 #ifdef USE_SSE4
5240     __m128i cmp;
5241     cmp = _mm_max_epu32(a, b);
5242     return _mm_cmpeq_epi32(cmp, a); //a>=b
5243 #else
5244     //serial solution may be faster
5245     __m128i c80000000, as, bs, m1, m2;
5246     c80000000 = _mm_set1_epi32 (0x80000000);
5247     as = _mm_sub_epi32(a,c80000000);
5248     bs = _mm_sub_epi32(b,c80000000);
5249     m1 = _mm_cmpgt_epi32 (as, bs);
5250     m2 = _mm_cmpeq_epi32 (as, bs);
5251     return _mm_or_si128 ( m1,  m2);
5252 #endif
5253 }
5254 
5255 //**********************Vector compare less-than or equal******************************
5256 //***************************************************************************************
5257 //IA SIMD has no less-than-or-equal comparison for integers, so tricks are needed (a minimal sketch follows below)
5258 
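//A minimal sketch of the trick used below (illustrative only, not part of the original API;
//the helper name example_le_mask_s32 is hypothetical): since SSE offers only equal and
//greater-than compares for integers, a <= b is computed as the inverse of the a > b mask.
_NEON2SSE_INLINE __m128i example_le_mask_s32(__m128i a, __m128i b)
{
    __m128i ones = _mm_cmpeq_epi32(a, a); //all bits set
    __m128i gt   = _mm_cmpgt_epi32(a, b); //0xffffffff where a > b
    return _mm_andnot_si128(gt, ones);    //~gt & ones: 0xffffffff where a <= b
}
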
5259 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
5260 _NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
5261 {
5262     uint8x8_t res64;
5263     return64(vcleq_s8(_pM128i(a), _pM128i(b)));
5264 }
5265 
5266 
5267 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
5268 _NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
5269 {
5270     uint16x4_t res64;
5271     return64(vcleq_s16(_pM128i(a), _pM128i(b)));
5272 }
5273 
5274 
5275 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
5276 _NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
5277 {
5278     uint32x2_t res64;
5279     return64(vcleq_s32(_pM128i(a), _pM128i(b)));
5280 }
5281 
5282 
5283 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
vcle_f32(float32x2_t a,float32x2_t b)5284 _NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
5285 {
5286     uint32x2_t res64;
5287     __m128 res;
5288     res = _mm_cmple_ps(_pM128(a),_pM128(b));
5289     return64f(res);
5290 }
5291 
5292 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
5293 #define vcle_u8(a,b) vcge_u8(b,a)
5294 
5295 
5296 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
5297 #define vcle_u16(a,b) vcge_u16(b,a)
5298 
5299 
5300 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
5301 #define vcle_u32(a,b) vcge_u32(b,a)
5302 
5303 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
vcleq_s8(int8x16_t a,int8x16_t b)5304 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5305 {
5306     __m128i c1, res;
5307     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
5308     res = _mm_cmpgt_epi8 ( a,  b);
5309     return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
5310 }
5311 
5312 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
vcleq_s16(int16x8_t a,int16x8_t b)5313 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5314 {
5315     __m128i c1, res;
5316     c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
5317     res = _mm_cmpgt_epi16 ( a,  b);
5318     return _mm_andnot_si128 (res, c1);
5319 }
5320 
5321 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
vcleq_s32(int32x4_t a,int32x4_t b)5322 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5323 {
5324     __m128i c1, res;
5325     c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
5326     res = _mm_cmpgt_epi32 ( a,  b);
5327     return _mm_andnot_si128 (res, c1);
5328 }
5329 
5330 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
vcleq_f32(float32x4_t a,float32x4_t b)5331 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
5332 {
5333     __m128 res;
5334     res = _mm_cmple_ps(a,b);
5335     return *(__m128i*)&res;
5336 }
5337 
5338 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5339 #ifdef USE_SSE4
vcleq_u8(uint8x16_t a,uint8x16_t b)5340     _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5341     {
5342         //no unsigned char comparison in SSE, only signed is available, so a trick is needed
5343         __m128i cmp;
5344         cmp = _mm_min_epu8(a, b);
5345         return _mm_cmpeq_epi8(cmp, a); //a<=b
5346     }
5347 #else
5348 #   define vcleq_u8(a,b) vcgeq_u8(b,a)
5349 #endif
5350 
5351 
5352 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5353 #ifdef USE_SSE4
vcleq_u16(uint16x8_t a,uint16x8_t b)5354     _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5355     {
5356         //no unsigned short comparison in SSE, only signed is available, so a trick is needed
5357         __m128i cmp;
5358         cmp = _mm_min_epu16(a, b);
5359         return _mm_cmpeq_epi16(cmp, a); //a<=b
5360     }
5361 #else
5362 #   define vcleq_u16(a,b) vcgeq_u16(b,a)
5363 #endif
5364 
5365 
5366 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5367 #ifdef USE_SSE4
vcleq_u32(uint32x4_t a,uint32x4_t b)5368     _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5369     {
5370         //no unsigned int comparison in SSE, only signed is available, so a trick is needed
5371         __m128i cmp;
5372         cmp = _mm_min_epu32(a, b);
5373         return _mm_cmpeq_epi32(cmp, a); //a<=b
5374     }
5375 #else
5376 //this solution may not be optimal; a serial one may be faster
5377 #   define vcleq_u32(a,b) vcgeq_u32(b,a)
5378 #endif
5379 
5380 
5381 //****** Vector compare greater-than ******************************************
5382 //**************************************************************************
5383 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5384 _NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
5385 {
5386     uint8x8_t res64;
5387     return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
5388 }
5389 
5390 
5391 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5392 _NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
5393 {
5394     uint16x4_t res64;
5395     return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
5396 }
5397 
5398 
5399 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5400 _NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
5401 {
5402     uint32x2_t res64;
5403     return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
5404 }
5405 
5406 
5407 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
vcgt_f32(float32x2_t a,float32x2_t b)5408 _NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
5409 {
5410     uint32x2_t res64;
5411     __m128 res;
5412     res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the first 2 entries are used
5413     return64f(res);
5414 }
5415 
5416 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
vcgt_u8(uint8x8_t a,uint8x8_t b)5417 _NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
5418 {
5419     uint8x8_t res64;
5420     return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
5421 }
5422 
5423 
5424 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
vcgt_u16(uint16x4_t a,uint16x4_t b)5425 _NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
5426 {
5427     uint16x4_t res64;
5428     return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
5429 }
5430 
5431 
5432 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
vcgt_u32(uint32x2_t a,uint32x2_t b)5433 _NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
5434 {
5435     uint32x2_t res64;
5436     return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
5437 }
5438 
5439 
5440 _NEON2SSESTORAGE uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5441 #define vcgtq_s8 _mm_cmpgt_epi8
5442 
5443 _NEON2SSESTORAGE uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5444 #define vcgtq_s16 _mm_cmpgt_epi16
5445 
5446 _NEON2SSESTORAGE uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5447 #define vcgtq_s32 _mm_cmpgt_epi32
5448 
5449 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
vcgtq_f32(float32x4_t a,float32x4_t b)5450 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
5451 {
5452     __m128 res;
5453     res = _mm_cmpgt_ps(a,b);
5454     return *(__m128i*)&res;
5455 }
5456 
5457 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
vcgtq_u8(uint8x16_t a,uint8x16_t b)5458 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
5459 {
5460     //no unsigned char comparison in SSE, only signed is available, so bias both operands by 0x80
5461     __m128i c128 = _mm_set1_epi8((int8_t)0x80);
5462     __m128i as = _mm_sub_epi8(a, c128);
5463     __m128i bs = _mm_sub_epi8(b, c128);
5464     return _mm_cmpgt_epi8(as, bs);
5465 }
5466 
5467 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
vcgtq_u16(uint16x8_t a,uint16x8_t b)5468 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
5469 {
5470     //no unsigned short comparison in SSE, only signed is available, so bias both operands by 0x8000
5471     __m128i c8000 = _mm_set1_epi16((int16_t)0x8000);
5472     __m128i as = _mm_sub_epi16(a, c8000);
5473     __m128i bs = _mm_sub_epi16(b, c8000);
5474     return _mm_cmpgt_epi16(as, bs);
5475 }
5476 
5477 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
vcgtq_u32(uint32x4_t a,uint32x4_t b)5478 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
5479 {
5480     //no unsigned int comparison in SSE, only signed is available, so bias both operands by 0x80000000
5481     __m128i c80000000, as, bs;
5482     c80000000 = _mm_set1_epi32 (0x80000000);
5483     as = _mm_sub_epi32(a,c80000000);
5484     bs = _mm_sub_epi32(b,c80000000);
5485     return _mm_cmpgt_epi32 ( as, bs);
5486 }
5487 
5488 //********************* Vector compare less-than **************************
5489 //*************************************************************************
5490 _NEON2SSESTORAGE uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5491 #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
5492 
5493 
5494 _NEON2SSESTORAGE uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5495 #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
5496 
5497 
5498 _NEON2SSESTORAGE uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5499 #define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
5500 
5501 
5502 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5503 #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
5504 
5505 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5506 #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
5507 
5508 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5509 #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
5510 
5511 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5512 #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
5513 
5514 _NEON2SSESTORAGE uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5515 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
5516 
5517 _NEON2SSESTORAGE uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5518 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
5519 
5520 _NEON2SSESTORAGE uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5521 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
5522 
5523 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5524 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
5525 
5526 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5527 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
5528 
5529 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5530 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
5531 
5532 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5533 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
5534 
5535 //*****************Vector compare absolute greater-than or equal ************
5536 //***************************************************************************
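//The vca* implementations below clear the sign bit of every float (AND with 0x7fffffff) to obtain |a| and |b|,
//then apply the ordinary SSE floating point compare.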
5537 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
vcage_f32(float32x2_t a,float32x2_t b)5538 _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
5539 {
5540     uint32x2_t res64;
5541     __m128i c7fffffff;
5542     __m128 a0, b0;
5543     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5544     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5545     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5546     a0 = _mm_cmpge_ps ( a0, b0);
5547     return64f(a0);
5548 }
5549 
5550 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
vcageq_f32(float32x4_t a,float32x4_t b)5551 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5552 {
5553     __m128i c7fffffff;
5554     __m128 a0, b0;
5555     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5556     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5557     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5558     a0 = _mm_cmpge_ps ( a0, b0);
5559     return (*(__m128i*)&a0);
5560 }
5561 
5562 //********Vector compare absolute less-than or equal ******************
5563 //********************************************************************
5564 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
vcale_f32(float32x2_t a,float32x2_t b)5565 _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
5566 {
5567     uint32x2_t res64;
5568     __m128i c7fffffff;
5569     __m128 a0, b0;
5570     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5571     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5572     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5573     a0 = _mm_cmple_ps (a0, b0);
5574     return64f(a0);
5575 }
5576 
5577 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
vcaleq_f32(float32x4_t a,float32x4_t b)5578 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5579 {
5580     __m128i c7fffffff;
5581     __m128 a0, b0;
5582     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5583     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5584     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5585     a0 = _mm_cmple_ps (a0, b0);
5586     return (*(__m128i*)&a0);
5587 }
5588 
5589 //********  Vector compare absolute greater-than    ******************
5590 //******************************************************************
5591 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
vcagt_f32(float32x2_t a,float32x2_t b)5592 _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
5593 {
5594     uint32x2_t res64;
5595     __m128i c7fffffff;
5596     __m128 a0, b0;
5597     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5598     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5599     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5600     a0 = _mm_cmpgt_ps (a0, b0);
5601     return64f(a0);
5602 }
5603 
5604 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
vcagtq_f32(float32x4_t a,float32x4_t b)5605 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5606 {
5607     __m128i c7fffffff;
5608     __m128 a0, b0;
5609     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5610     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5611     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5612     a0 = _mm_cmpgt_ps (a0, b0);
5613     return (*(__m128i*)&a0);
5614 }
5615 
5616 //***************Vector compare absolute less-than  ***********************
5617 //*************************************************************************
5618 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
vcalt_f32(float32x2_t a,float32x2_t b)5619 _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
5620 {
5621     uint32x2_t res64;
5622     __m128i c7fffffff;
5623     __m128 a0, b0;
5624     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5625     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5626     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5627     a0 = _mm_cmplt_ps (a0, b0);
5628     return64f(a0);
5629 }
5630 
5631 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
vcaltq_f32(float32x4_t a,float32x4_t b)5632 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5633 {
5634     __m128i c7fffffff;
5635     __m128 a0, b0;
5636     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5637     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5638     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5639     a0 = _mm_cmplt_ps (a0, b0);
5640     return (*(__m128i*)&a0);
5641 }
5642 
5643 //*************************Vector test bits************************************
5644 //*****************************************************************************
5645 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
5646 with the corresponding element of a second vector. If the result is not zero, the
5647 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
5648 all zeros. */
5649 
5650 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
vtst_s8(int8x8_t a,int8x8_t b)5651 _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
5652 {
5653     int8x8_t res64;
5654     return64(vtstq_s8(_pM128i(a), _pM128i(b)));
5655 }
5656 
5657 
5658 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
vtst_s16(int16x4_t a,int16x4_t b)5659 _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
5660 {
5661     int16x4_t res64;
5662     return64(vtstq_s16(_pM128i(a), _pM128i(b)));
5663 }
5664 
5665 
5666 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
vtst_s32(int32x2_t a,int32x2_t b)5667 _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
5668 {
5669     int32x2_t res64;
5670     return64(vtstq_s32(_pM128i(a), _pM128i(b)));
5671 }
5672 
5673 
5674 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
5675 #define vtst_u8 vtst_s8
5676 
5677 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
5678 #define vtst_u16 vtst_s16
5679 
5680 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
5681 #define vtst_u32 vtst_s32
5682 
5683 
5684 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
5685 #define vtst_p8 vtst_u8
5686 
5687 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
vtstq_s8(int8x16_t a,int8x16_t b)5688 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
5689 {
5690     __m128i zero, one, res;
5691     zero = _mm_setzero_si128 ();
5692     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5693     res = _mm_and_si128 (a, b);
5694     res =  _mm_cmpeq_epi8 (res, zero);
5695     return _mm_xor_si128(res, one); //invert result
5696 }
5697 
5698 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
vtstq_s16(int16x8_t a,int16x8_t b)5699 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
5700 {
5701     __m128i zero, one, res;
5702     zero = _mm_setzero_si128 ();
5703     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5704     res = _mm_and_si128 (a, b);
5705     res =  _mm_cmpeq_epi16 (res, zero);
5706     return _mm_xor_si128(res, one); //invert result
5707 }
5708 
5709 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
vtstq_s32(int32x4_t a,int32x4_t b)5710 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
5711 {
5712     __m128i zero, one, res;
5713     zero = _mm_setzero_si128 ();
5714     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5715     res = _mm_and_si128 (a, b);
5716     res =  _mm_cmpeq_epi32 (res, zero);
5717     return _mm_xor_si128(res, one); //invert result
5718 }
5719 
5720 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
5721 #define vtstq_u8 vtstq_s8
5722 
5723 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
5724 #define vtstq_u16 vtstq_s16
5725 
5726 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
5727 #define vtstq_u32 vtstq_s32
5728 
5729 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
5730 #define vtstq_p8 vtstq_u8
5731 
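//Illustrative usage sketch (not part of the original API; the name example_flag_mask_u8 is hypothetical):
//building a per-byte mask of the elements that have a given flag bit set, as described in the VTST comment above.
_NEON2SSE_INLINE uint8x16_t example_flag_mask_u8(uint8x16_t flags)
{
    uint8x16_t bit0 = _mm_set1_epi8(1); //the flag bit under test
    return vtstq_u8(flags, bit0); //0xff where (flags & 1) != 0, 0x00 elsewhere
}
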
5732 //****************** Absolute difference ********************
5733 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
5734 //************************************************************
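//For example (illustrative): with uint8 lanes a = {1, 200, ...} and b = {250, 100, ...},
//vabd_u8(a, b) yields {249, 100, ...}.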
5735 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
vabd_s8(int8x8_t a,int8x8_t b)5736 _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
5737 {
5738     int8x8_t res64;
5739     return64(vabdq_s8(_pM128i(a), _pM128i(b)));
5740 }
5741 
5742 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
vabd_s16(int16x4_t a,int16x4_t b)5743 _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
5744 {
5745     int16x4_t res64;
5746     return64(vabdq_s16(_pM128i(a), _pM128i(b)));
5747 }
5748 
5749 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
vabd_s32(int32x2_t a,int32x2_t b)5750 _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
5751 {//need to deal with an intermediate overflow
5752     int32x2_t res;
5753     res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] -  b.m64_i32[0]: b.m64_i32[0] -  a.m64_i32[0];
5754     res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] -  b.m64_i32[1]: b.m64_i32[1] -  a.m64_i32[1];
5755     return res;
5756 }
5757 
5758 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
vabd_u8(uint8x8_t a,uint8x8_t b)5759 _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
5760 {
5761     uint8x8_t res64;
5762     return64(vabdq_u8(_pM128i(a), _pM128i(b)));
5763 }
5764 
5765 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
vabd_u16(uint16x4_t a,uint16x4_t b)5766 _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
5767 {
5768     uint16x4_t res64;
5769     return64(vabdq_u16(_pM128i(a), _pM128i(b)));
5770 }
5771 
5772 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
vabd_u32(uint32x2_t a,uint32x2_t b)5773 _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
5774 {
5775     uint32x2_t res64;
5776     return64(vabdq_u32(_pM128i(a), _pM128i(b)));
5777 }
5778 
5779 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
vabd_f32(float32x2_t a,float32x2_t b)5780 _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
5781 {
5782     float32x4_t res;
5783     __m64_128 res64;
5784     res = vabdq_f32(_pM128(a), _pM128(b));
5785     _M64f(res64, res);
5786     return res64;
5787 }
5788 
5789 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
vabdq_s8(int8x16_t a,int8x16_t b)5790 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
5791 { //need to deal with an intermediate overflow
5792    __m128i cmp, difab, difba;
5793    cmp = vcgtq_s8(a,b);
5794    difab = _mm_sub_epi8(a,b);
5795    difba = _mm_sub_epi8(b,a);
5796    difab = _mm_and_si128(cmp, difab);
5797    difba = _mm_andnot_si128(cmp, difba);
5798    return _mm_or_si128(difab, difba);
5799 }
5800 
5801 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
vabdq_s16(int16x8_t a,int16x8_t b)5802 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
5803 {//need to deal with an intermediate overflow
5804     __m128i cmp, difab, difba;
5805     cmp = vcgtq_s16(a,b);
5806     difab = _mm_sub_epi16(a,b);
5807     difba = _mm_sub_epi16 (b,a);
5808     difab = _mm_and_si128(cmp, difab);
5809     difba = _mm_andnot_si128(cmp, difba);
5810     return _mm_or_si128(difab, difba);
5811 }
5812 
5813 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
vabdq_s32(int32x4_t a,int32x4_t b)5814 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
5815 {//need to deal with an intermediate overflow
5816     __m128i cmp, difab, difba;
5817     cmp = vcgtq_s32(a,b);
5818     difab = _mm_sub_epi32(a,b);
5819     difba = _mm_sub_epi32(b,a);
5820     difab = _mm_and_si128(cmp, difab);
5821     difba = _mm_andnot_si128(cmp, difba);
5822     return _mm_or_si128(difab, difba);
5823 }
5824 
5825 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
vabdq_u8(uint8x16_t a,uint8x16_t b)5826 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
5827 {
5828     __m128i  difab, difba;
5829     difab = _mm_subs_epu8(a,b);
5830     difba = _mm_subs_epu8 (b,a);
5831     return _mm_or_si128(difab, difba);
5832 }
5833 
5834 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
vabdq_u16(uint16x8_t a,uint16x8_t b)5835 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
5836 {
5837     __m128i difab, difba;
5838     difab = _mm_subs_epu16(a,b);
5839     difba = _mm_subs_epu16 (b,a);
5840     return _mm_or_si128(difab, difba);
5841 }
5842 
5843 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
vabdq_u32(uint32x4_t a,uint32x4_t b)5844 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
5845 {
5846     __m128i cmp, difab, difba;
5847     cmp = vcgtq_u32(a,b);
5848     difab = _mm_sub_epi32(a,b);
5849     difba = _mm_sub_epi32 (b,a);
5850     difab = _mm_and_si128(cmp, difab);
5851     difba = _mm_andnot_si128(cmp, difba);
5852     return _mm_or_si128(difab, difba);
5853 }
5854 
5855 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
vabdq_f32(float32x4_t a,float32x4_t b)5856 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
5857 {
5858     __m128i c1;
5859     __m128 res;
5860     c1 =  _mm_set1_epi32(0x7fffffff);
5861     res = _mm_sub_ps (a, b);
5862     return _mm_and_ps (res, *(__m128*)&c1);
5863 }
5864 
5865 //************  Absolute difference - long **************************
5866 //********************************************************************
5867 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
vabdl_s8(int8x8_t a,int8x8_t b)5868 _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
5869 {
5870     __m128i a16, b16;
5871     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
5872     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5873     return vabdq_s16(a16, b16);
5874 
5875 }
5876 
5877 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
vabdl_s16(int16x4_t a,int16x4_t b)5878 _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
5879 {
5880     __m128i a32, b32;
5881     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
5882     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
5883     return vabdq_s32(a32, b32);
5884 }
5885 
5886 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSE_PERFORMANCE_WARNING(int64x2_t vabdl_s32 (int32x2_t a,int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)5887 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
5888 {
5889     //no optimal SIMD solution, serial looks faster
5890     _NEON2SSE_ALIGN_16 int64_t res[2];
5891     if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
5892     else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
5893     if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
5894     else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
5895     return _mm_load_si128((__m128i*)res);
5896 }
5897 
5898 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
vabdl_u8(uint8x8_t a,uint8x8_t b)5899 _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
5900 {
5901     __m128i res;
5902     res = vsubl_u8(a,b);
5903     return _mm_abs_epi16(res);
5904 }
5905 
5906 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
vabdl_u16(uint16x4_t a,uint16x4_t b)5907 _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
5908 {
5909     __m128i res;
5910     res = vsubl_u16(a,b);
5911     return _mm_abs_epi32(res);
5912 }
5913 
5914 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
_NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vabdl_u32 (uint32x2_t a,uint32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)5915 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
5916 {
5917     _NEON2SSE_ALIGN_16 uint64_t res[2];
5918     if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
5919     else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
5920     if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
5921     else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
5922     return _mm_load_si128((__m128i*)res);
5923 }
5924 
5925 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
5926 //*********************************************************************************************
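//For example (illustrative): with lanes a = {10, ...}, b = {3, ...}, c = {7, ...},
//vaba_s8(a, b, c) yields {10 + |3 - 7|, ...} = {14, ...}.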
5927 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
vaba_s8(int8x8_t a,int8x8_t b,int8x8_t c)5928 _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
5929 {
5930     int8x8_t res64;
5931     return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
5932 }
5933 
5934 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
vaba_s16(int16x4_t a,int16x4_t b,int16x4_t c)5935 _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
5936 {
5937     int16x4_t res64;
5938     return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
5939 }
5940 
5941 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
vaba_s32(int32x2_t a,int32x2_t b,int32x2_t c)5942 _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
5943 {
5944     int32x2_t res64;
5945     return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
5946 }
5947 
5948 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
vaba_u8(uint8x8_t a,uint8x8_t b,uint8x8_t c)5949 _NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c)
5950 {
5951     int8x8_t res64;
5952     return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
5953 }
5954 
5955 
5956 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
vaba_u16(uint16x4_t a,uint16x4_t b,uint16x4_t c)5957 _NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c)
5958 {
5959     int16x4_t res64;
5960     return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
5961 }
5962 
5963 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
vaba_u32(uint32x2_t a,uint32x2_t b,uint32x2_t c)5964 _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
5965 {
5966     uint32x2_t res64;
5967     return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
5968 }
5969 
5970 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
vabaq_s8(int8x16_t a,int8x16_t b,int8x16_t c)5971 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
5972 {
5973     int8x16_t sub;
5974     sub = vabdq_s8(b, c);
5975     return vaddq_s8( a, sub);
5976 }
5977 
5978 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
vabaq_s16(int16x8_t a,int16x8_t b,int16x8_t c)5979 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
5980 {
5981     int16x8_t sub;
5982     sub = vabdq_s16(b, c);
5983     return vaddq_s16( a, sub);
5984 }
5985 
5986 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
vabaq_s32(int32x4_t a,int32x4_t b,int32x4_t c)5987 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
5988 {
5989     int32x4_t sub;
5990     sub = vabdq_s32(b, c);
5991     return vaddq_s32( a, sub);
5992 }
5993 
5994 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
vabaq_u8(uint8x16_t a,uint8x16_t b,uint8x16_t c)5995 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
5996 {
5997     uint8x16_t sub;
5998     sub = vabdq_u8(b, c);
5999     return vaddq_u8( a, sub);
6000 }
6001 
6002 _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
vabaq_u16(uint16x8_t a,uint16x8_t b,uint16x8_t c)6003 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
6004 {
6005     uint16x8_t sub;
6006     sub = vabdq_u16(b, c);
6007     return vaddq_u16( a, sub);
6008 }
6009 
6010 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
vabaq_u32(uint32x4_t a,uint32x4_t b,uint32x4_t c)6011 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
6012 {
6013     uint32x4_t sub;
6014     sub = vabdq_u32(b, c);
6015     return vaddq_u32( a, sub);
6016 }
6017 
6018 //************** Absolute difference and accumulate - long ********************************
6019 //*************************************************************************************
6020 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
vabal_s8(int16x8_t a,int8x8_t b,int8x8_t c)6021 _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
6022 {
6023     __m128i b16, c16, res;
6024     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
6025     c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
6026     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6027     return _mm_add_epi16 (a, res);
6028 }
6029 
6030 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
vabal_s16(int32x4_t a,int16x4_t b,int16x4_t c)6031 _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
6032 {
6033     __m128i b32, c32, res;
6034     b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
6035     c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
6036     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6037     return _mm_add_epi32 (a, res);
6038 }
6039 
6040 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSE_PERFORMANCE_WARNING(int64x2_t vabal_s32 (int64x2_t a,int32x2_t b,int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)6041 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6042 {
6043     __m128i res;
6044     res = vabdl_s32(b,c);
6045     return _mm_add_epi64(a, res);
6046 }
6047 
6048 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
vabal_u8(uint16x8_t a,uint8x8_t b,uint8x8_t c)6049 _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
6050 {
6051     __m128i b16, c16, res;
6052     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
6053     c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
6054     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6055     return _mm_add_epi16 (a, res);
6056 }
6057 
6058 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
vabal_u16(uint32x4_t a,uint16x4_t b,uint16x4_t c)6059 _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
6060 {
6061     __m128i b32, c32, res;
6062     b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
6063     c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
6064     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6065     return _mm_add_epi32 (a, res);
6066 }
6067 
6068 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
_NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vabal_u32 (uint64x2_t a,uint32x2_t b,uint32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)6069 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6070 {
6071     __m128i res;
6072     res = vabdl_u32(b,c);
6073     return _mm_add_epi64(a, res);
6074 }
6075 
6076 //***********************************************************************************
6077 //****************  Maximum and minimum operations **********************************
6078 //***********************************************************************************
6079 //************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
6080 //***********************************************************************************
6081 _NEON2SSESTORAGE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
vmax_s8(int8x8_t a,int8x8_t b)6082 _NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
6083 {
6084     int8x8_t res64;
6085     __m128i res;
6086     res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6087     return64(res);
6088 }
6089 
6090 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
vmax_s16(int16x4_t a,int16x4_t b)6091 _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
6092 {
6093     int16x4_t res64;
6094     return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
6095 }
6096 
6097 _NEON2SSESTORAGE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
vmax_s32(int32x2_t a,int32x2_t b)6098 _NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
6099 {
6100     int32x2_t res64;
6101     __m128i res;
6102     res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6103     return64(res);
6104 }
6105 
6106 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
vmax_u8(uint8x8_t a,uint8x8_t b)6107 _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
6108 {
6109     uint8x8_t res64;
6110     return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
6111 }
6112 
6113 
6114 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
vmax_u16(uint16x4_t a,uint16x4_t b)6115 _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
6116 {
6117     uint16x4_t res64;
6118     return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
6119 }
6120 
6121 
6122 _NEON2SSESTORAGE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
vmax_u32(uint32x2_t a,uint32x2_t b)6123 _NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
6124 {
6125     uint32x2_t res64;
6126     __m128i res;
6127     res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, only the lower 64 bits are used; may be less efficient than a serial version
6128     return64(res);
6129 }
6130 
6131 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
vmax_f32(float32x2_t a,float32x2_t b)6132 _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
6133 {
6134     //the serial solution looks faster than the SIMD one
6135     float32x2_t res;
6136     res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6137     res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6138     return res;
6139 }
6140 
6141 _NEON2SSESTORAGE int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
6142 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
6143 
6144 _NEON2SSESTORAGE int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
6145 #define vmaxq_s16 _mm_max_epi16
6146 
6147 _NEON2SSESTORAGE int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
6148 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
6149 
6150 _NEON2SSESTORAGE uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
6151 #define vmaxq_u8 _mm_max_epu8
6152 
6153 _NEON2SSESTORAGE uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
6154 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
6155 
6156 _NEON2SSESTORAGE uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
6157 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
6158 
6159 
6160 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
6161 #define vmaxq_f32 _mm_max_ps
6162 
6163 
6164 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
6165 #define vmaxq_f64 _mm_max_pd
6166 
6167 
6168 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
6169 //***********************************************************************************************************
6170 _NEON2SSESTORAGE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
vmin_s8(int8x8_t a,int8x8_t b)6171 _NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
6172 {
6173     int8x8_t res64;
6174     __m128i res;
6175     res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6176     return64(res);
6177 }
6178 
6179 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
vmin_s16(int16x4_t a,int16x4_t b)6180 _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
6181 {
6182     int16x4_t res64;
6183     return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
6184 }
6185 
6186 
6187 _NEON2SSESTORAGE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
vmin_s32(int32x2_t a,int32x2_t b)6188 _NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
6189 {
6190     int32x2_t res64;
6191     __m128i res;
6192     res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6193     return64(res);
6194 }
6195 
6196 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
vmin_u8(uint8x8_t a,uint8x8_t b)6197 _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
6198 {
6199     uint8x8_t res64;
6200     return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
6201 }
6202 
6203 
6204 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
vmin_u16(uint16x4_t a,uint16x4_t b)6205 _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
6206 {
6207     uint16x4_t res64;
6208     return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
6209 }
6210 
6211 
6212 _NEON2SSESTORAGE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
vmin_u32(uint32x2_t a,uint32x2_t b)6213 _NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
6214 {
6215     uint32x2_t res64;
6216     __m128i res;
6217     res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, only the lower 64 bits are used; may be less efficient than a serial version
6218     return64(res);
6219 }
6220 
6221 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
vmin_f32(float32x2_t a,float32x2_t b)6222 _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
6223 {
6224     //the serial solution looks faster than the SIMD one
6225     float32x2_t res;
6226     res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6227     res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6228     return res;
6229 }
6230 
6231 _NEON2SSESTORAGE int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
6232 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1
6233 
6234 _NEON2SSESTORAGE int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
6235 #define vminq_s16 _mm_min_epi16
6236 
6237 _NEON2SSESTORAGE int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
6238 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1
6239 
6240 _NEON2SSESTORAGE uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
6241 #define vminq_u8 _mm_min_epu8
6242 
6243 _NEON2SSESTORAGE uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
6244 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1
6245 
6246 _NEON2SSESTORAGE uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
6247 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1
6248 
6249 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
6250 #define vminq_f32 _mm_min_ps
6251 
6252 
6253 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
6254 #define vminq_f64 _mm_min_pd
6255 
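//Illustrative usage sketch (not part of the original API; the name example_clamp_u8 is hypothetical):
//clamping every byte of a vector to a [lo, hi] range by combining the min and max mappings above.
_NEON2SSE_INLINE uint8x16_t example_clamp_u8(uint8x16_t x, uint8x16_t lo, uint8x16_t hi)
{
    return vminq_u8(vmaxq_u8(x, lo), hi); //expands to _mm_min_epu8(_mm_max_epu8(x, lo), hi)
}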
6256 
6257 //*************  Pairwise addition operations. **************************************
6258 //************************************************************************************
6259 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
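//For example (illustrative): with int16 lanes a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3},
//vpadd_s16(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3}.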
6260 _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
vpadd_s8(int8x8_t a,int8x8_t b)6261 _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
6262 {
6263     //no 8 bit hadd in IA32, need to go to 16 bit and then pack
6264     int8x8_t res64;
6265     __m128i a16, b16, res;
6266     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6267     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
6268     res = _mm_hadd_epi16 (a16, b16);
6269     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
6270     return64(res);
6271 }
6272 
6273 _NEON2SSESTORAGE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
vpadd_s16(int16x4_t a,int16x4_t b)6274 _NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
6275 {
6276     int16x4_t res64;
6277     __m128i hadd128;
6278     hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
6279     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6280     return64(hadd128);
6281 }
6282 
6283 
6284 _NEON2SSESTORAGE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
vpadd_s32(int32x2_t a,int32x2_t b)6285 _NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
6286 {
6287     int32x2_t res64;
6288     __m128i hadd128;
6289     hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
6290     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6291     return64(hadd128);
6292 }
6293 
6294 
6295 _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
vpadd_u8(uint8x8_t a,uint8x8_t b)6296 _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
6297 {
6298     //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
6299     uint8x8_t res64;
6300 //  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit into signed 16-bit, so it works
6301     __m128i mask8, a16, b16, res;
6302     mask8 = _mm_set1_epi16(0xff);
6303     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
6304     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
6305     res = _mm_hadd_epi16 (a16, b16);
6306     res = _mm_and_si128(res, mask8); //to avoid saturation
6307     res = _mm_packus_epi16 (res,res); //use low 64 bits
6308     return64(res);
6309 }
6310 
6311 _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
vpadd_u16(uint16x4_t a,uint16x4_t b)6312 _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
6313 {
6314     // this solution may not be optimal; serial execution may be faster
6315     // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
6316     uint16x4_t res64;
6317     __m128i c32767,  cfffe, as, bs, res;
6318     c32767 = _mm_set1_epi16 (32767);
6319     cfffe = _mm_set1_epi16 ((int16_t)0xfffe);
6320     as = _mm_sub_epi16 (_pM128i(a), c32767);
6321     bs = _mm_sub_epi16 (_pM128i(b), c32767);
6322     res = _mm_hadd_epi16 (as, bs);
6323     res = _mm_add_epi16 (res, cfffe);
6324     res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6325     return64(res);
6326 }
6327 
6328 _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
vpadd_u32(uint32x2_t a,uint32x2_t b)6329 _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
6330 {
6331     //hadd doesn't work for unsigned values
6332     uint32x2_t res64;
6333     __m128i ab, ab_sh, res;
6334     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
6335     ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
6336     res = _mm_add_epi32(ab, ab_sh);
6337     res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6338     return64(res);
6339 }
6340 
6341 _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
vpadd_f32(float32x2_t a,float32x2_t b)6342 _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
6343 {
6344     __m128 hadd128;
6345     __m64_128 res64;
6346     hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
6347     hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
6348     _M64f(res64, hadd128);
6349     return res64;
6350 }
6351 
6352 
6353 //**************************  Long pairwise add  **********************************
6354 //*********************************************************************************
6355 //Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
6356 // and places the final results in the destination vector.
6357 
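//For example (illustrative): with int8 lanes a = {a0,a1,a2,a3,a4,a5,a6,a7},
//vpaddl_s8(a) = {a0+a1, a2+a3, a4+a5, a6+a7} widened to int16.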
6358 _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
vpaddl_s8(int8x8_t a)6359 _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
6360 {
6361     //no 8 bit hadd in IA32, need to go to 16 bit anyway
6362     __m128i a16;
6363     int16x4_t res64;
6364     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6365     a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
6366     return64(a16);
6367 }
6368 
6369 _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
vpaddl_s16(int16x4_t a)6370 _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
6371 {
6372     // this solution may not be optimal; serial execution may be faster
6373     int32x2_t res64;
6374     __m128i r32_1;
6375     r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
6376     r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
6377     return64(r32_1);
6378 }
6379 
6380 _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32 (int32x2_t a),_NEON2SSE_REASON_SLOW_SERIAL)6381 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6382 {
6383     int64x1_t res;
6384     res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
6385     return res;
6386 }
6387 
6388 _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
vpaddl_u8(uint8x8_t a)6389 _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
6390 {
6391     //  no 8 bit hadd in IA32, need to go to 16 bit
6392 //  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit into signed 16-bit, so it works
6393     uint16x4_t res64;
6394     __m128i a16;
6395     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
6396     a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6397     return64(a16);
6398 }
6399 
6400 _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
_NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16 (uint16x4_t a),_NEON2SSE_REASON_SLOW_SERIAL)6401 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
6402 {
6403     //serial solution looks faster than a SIMD one
6404     uint32x2_t res;
6405     res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
6406     res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
6407     return res;
6408 }
6409 
6410 _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32 (uint32x2_t a),_NEON2SSE_REASON_SLOW_SERIAL)6411 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6412 {
6413     uint64x1_t res;
6414     res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
6415     return res;
6416 }
6417 
6418 _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
vpaddlq_s8(int8x16_t a)6419 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
6420 {
6421     //no 8 bit hadd in IA32, need to go to 16 bit
6422     __m128i r16_1, r16_2;
6423     r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
6424     //swap hi and low part of r to process the remaining data
6425     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6426     r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
6427     return _mm_hadd_epi16 (r16_1, r16_2);
6428 }
6429 
6430 _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
vpaddlq_s16(int16x8_t a)6431 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
6432 {
6433     //sign extend to 32 bit first, then use the 32 bit hadd
6434     __m128i r32_1, r32_2;
6435     r32_1 = _MM_CVTEPI16_EPI32(a);
6436     //swap hi and low part of r to process the remaining data
6437     r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6438     r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
6439     return _mm_hadd_epi32 (r32_1, r32_2);
6440 }
6441 
6442 _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32 (int32x4_t a),_NEON2SSE_REASON_SLOW_SERIAL)6443 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
6444 {
6445     _NEON2SSE_ALIGN_16 int32_t atmp[4];
6446     _NEON2SSE_ALIGN_16 int64_t res[2];
6447     _mm_store_si128((__m128i*)atmp, a);
6448     res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
6449     res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
6450     return _mm_load_si128((__m128i*)res);
6451 }
6452 
6453 _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
vpaddlq_u8(uint8x16_t a)6454 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
6455 {
6456     //no 8 bit hadd in IA32, need to go to 16 bit
6457     __m128i r16_1, r16_2;
6458     r16_1 = _MM_CVTEPU8_EPI16(a);
6459     //swap hi and low part of r to process the remaining data
6460     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6461     r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
6462     return _mm_hadd_epi16 (r16_1, r16_2);
6463 }
6464 
6465 _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
_NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16 (uint16x8_t a),_NEON2SSE_REASON_SLOW_SERIAL)6466 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
6467 {
6468     //serial solution looks faster than a SIMD one
6469     _NEON2SSE_ALIGN_16 uint16_t atmp[8];
6470     _NEON2SSE_ALIGN_16 uint32_t res[4];
6471     _mm_store_si128((__m128i*)atmp, a);
6472     res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
6473     res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
6474     res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
6475     res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
6476     return _mm_load_si128((__m128i*)res);
6477 }
6478 
6479 _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6480 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6481 {
6482     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6483     _NEON2SSE_ALIGN_16 uint64_t res[2];
6484     _mm_store_si128((__m128i*)atmp, a);
6485     res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
6486     res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
6487     return _mm_load_si128((__m128i*)res);
6488 }
6489 
6490 //************************  Long pairwise add and accumulate **************************
6491 //****************************************************************************************
6492 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
6493 // and accumulates the  values of the results into the elements of the destination (wide) vector
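//Illustrative usage sketch (comment only; lane values are hypothetical and the vdup_n_* intrinsics defined elsewhere in this file are assumed):
//    int16x4_t acc = vdup_n_s16(10);            //acc = {10, 10, 10, 10}
//    int8x8_t  v   = vdup_n_s8(3);              //v   = {3, 3, 3, 3, 3, 3, 3, 3}
//    acc = vpadal_s8(acc, v);                   //every 16-bit lane becomes 10 + (3 + 3) = 16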
6494 _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
6495 _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
6496 {
6497     int16x4_t res64;
6498     return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
6499 }
6500 
6501 _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
6502 _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
6503 {
6504     int32x2_t res64;
6505     return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
6506 }
6507 
6508 
6509 _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
6510 _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
6511 {
6512     int64x1_t res;
6513     res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
6514     return res;
6515 }
6516 
6517 _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
6518 _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
6519 {
6520     uint16x4_t res64;
6521     return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
6522 }
6523 
6524 
6525 _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.s16 d0,d0
6526 _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
6527 {
6528     uint32x2_t res64;
6529     return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
6530 }
6531 
6532 _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
6533 _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
6534 {
6535     uint64x1_t res;
6536     res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
6537     return res;
6538 }
6539 
6540 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
6541 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
6542 {
6543     int16x8_t pad;
6544     pad = vpaddlq_s8(b);
6545     return _mm_add_epi16 (a, pad);
6546 }
6547 
6548 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
6549 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
6550 {
6551     int32x4_t pad;
6552     pad = vpaddlq_s16(b);
6553     return _mm_add_epi32(a, pad);
6554 }
6555 
6556 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
6557 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
6558 {
6559     int64x2_t pad;
6560     pad = vpaddlq_s32(b);
6561     return _mm_add_epi64 (a, pad);
6562 }
6563 
6564 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
6565 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
6566 {
6567     uint16x8_t pad;
6568     pad = vpaddlq_u8(b);
6569     return _mm_add_epi16 (a, pad);
6570 }
6571 
6572 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
6573 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6574 {
6575     uint32x4_t pad;
6576     pad = vpaddlq_u16(b);
6577     return _mm_add_epi32(a, pad);
6578 } //no optimal SIMD solution, serial is faster
6579 
6580 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
6581 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6582 {
6583     //no optimal SIMD solution, serial is faster
6584     uint64x2_t pad;
6585     pad = vpaddlq_u32(b);
6586     return _mm_add_epi64(a, pad);
6587 } //no optimal SIMD solution, serial is faster
6588 
6589 //**********  Folding maximum   *************************************
6590 //*******************************************************************
6591 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
6592 //and copies the larger of each pair into the corresponding element in the destination
6593 //    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
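//Illustrative sketch of the expected semantics (comment only, hypothetical lane values):
//    for a = {1, 7, 2, 9} and b = {5, 3, 8, 4}, vpmax_s16(a, b) -> {7, 9, 5, 8}:
//    the low half holds the pairwise maxima of a, the high half the pairwise maxima of b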
6594 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
6595 _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
6596 {
6597     int8x8_t res64;
6598     __m128i ab, ab1, max;
6599     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
6600     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6601     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6602     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6603     max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
6604     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6605     return64(max); //we need 64 bits only
6606 }
6607 
6608 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
6609 _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
6610 {
6611     //this solution may not be optimal compared with the serial one
6612     int16x4_t res64;
6613     __m128i ab, ab1, max;
6614     _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
6615     ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
6616     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
6617     max = _mm_max_epi16 (ab, ab1);
6618     max =  _mm_shuffle_epi8 (max, *(__m128i*)  mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6619     return64(max);
6620 }
6621 
6622 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
6623 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6624 {
6625     //serial solution looks faster than SIMD one
6626     int32x2_t res;
6627     res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6628     res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6629     return res;
6630 }
6631 
6632 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
6633 _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
6634 {
6635     uint8x8_t res64;
6636     __m128i ab, ab1, max;
6637     _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6638     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6639     ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
6640     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6641     max = _mm_max_epu8 (ab, ab1); // SSE4.1
6642     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6643     return64(max);
6644 }
6645 
6646 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
6647 _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
6648 {
6649     //this solution may not be optimal compared with the serial one
6650     uint16x4_t res64;
6651     __m128i ab, ab1, max;
6652     _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
6653     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6654     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
6655     max = _MM_MAX_EPU16 (ab, ab1);
6656     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6657     return64(max);
6658 }
6659 
6660 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
6661 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6662 {
6663     //serial solution looks faster than SIMD one
6664     uint32x2_t res;
6665     res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6666     res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6667     return res;
6668 }
6669 
6670 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
6671 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6672 {
6673     //serial solution looks faster than  SIMD one
6674     float32x2_t res;
6675     res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6676     res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6677     return res;
6678 }
6679 
6680 // ***************** Folding minimum  ****************************
6681 // **************************************************************
6682 //vpmin -> takes minimum of adjacent pairs
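//Illustrative sketch (comment only, hypothetical lane values):
//    for a = {1.0f, 7.0f} and b = {5.0f, 3.0f}, vpmin_f32(a, b) -> {1.0f, 3.0f}
//    (lane 0 = min of the pair from a, lane 1 = min of the pair from b)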
6683 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
6684 _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
6685 {
6686     int8x8_t res64;
6687     __m128i ab, ab1, min;
6688     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
6689     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6690     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6691     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6692     min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
6693     min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6694     return64(min);
6695 }
6696 
6697 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
6698 _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
6699 {
6700     //this solution may not be optimal compared with the serial one
6701     int16x4_t res64;
6702     __m128i ab, ab1, min;
6703     _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
6704     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
6705     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
6706     min = _mm_min_epi16 (ab, ab1);
6707     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6708     return64(min);
6709 }
6710 
6711 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
6712 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6713 {
6714     //serial solution looks faster than SIMD one
6715     int32x2_t res;
6716     res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6717     res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6718     return res;
6719 }
6720 
6721 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
6722 _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
6723 {
6724     uint8x8_t res64;
6725     __m128i ab, ab1, min;
6726     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
6727     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6728     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
6729     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6730     min = _mm_min_epu8 (ab, ab1); // SSE4.1
6731     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6732     return64(min);
6733 }
6734 
6735 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
6736 _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
6737 {
6738     //this solution may not be optimal compared with the serial one
6739     uint16x4_t res64;
6740     __m128i ab, ab1, min;
6741     _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
6742     ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
6743     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
6744     min = _MM_MIN_EPU16 (ab, ab1);
6745     min =    _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6746     return64(min);
6747 }
6748 
6749 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
6750 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6751 {
6752     //serial solution looks faster than SIMD one
6753     uint32x2_t res;
6754     res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6755     res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6756     return res;
6757 }
6758 
6759 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
6760 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6761 {
6762     //serial solution looks faster than SIMD one
6763     float32x2_t res;
6764     res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6765     res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6766     return res;
6767 }
6768 
6769 //***************************************************************
6770 //***********  Reciprocal/Sqrt ************************************
6771 //***************************************************************
6772 //****************** Reciprocal estimate *******************************
6773 //the ARM NEON and x86 SIMD results may be slightly different
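//Illustrative sketch (comment only, hypothetical values; assumes the vdup_n_f32 intrinsic defined elsewhere in this file):
//    float32x2_t x = vdup_n_f32(4.0f);
//    float32x2_t e = vrecpe_f32(x);             //each lane holds an estimate of 1.0f/4.0f, i.e. approximately 0.25f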
6774 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
6775 _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
6776 {
6777     float32x4_t res;
6778     __m64_128 res64;
6779     res = _mm_rcp_ps(_pM128(a));
6780     _M64f(res64, res);
6781     return res64;
6782 }
6783 
6784 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
6785 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6786 {
6787     //Input is  fixed point number!!! No reciprocal for ints in IA32 available
6788     uint32x2_t res;
6789     float resf, r;
6790     int i, q, s;
6791     for (i =0; i<2; i++){
6792         if((a.m64_u32[i] & 0x80000000) == 0) {
6793             res.m64_u32[i] = 0xffffffff;
6794         }else{
6795             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
6796             q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6797             r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */
6798             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6799             r =  (float)s / 256.0;
6800             res.m64_u32[i] = r * (uint32_t)(1 << 31);
6801         }
6802     }
6803     return res;
6804 }
6805 
6806 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
6807 #define vrecpeq_f32 _mm_rcp_ps
6808 
6809 
6810 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
6811 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6812 {
6813     //Input is  fixed point number!!!
6814     //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
6815     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6816     _NEON2SSE_ALIGN_16 uint32_t res[4];
6817     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
6818     float resf, r;
6819     int i, q, s;
6820     __m128i res128, mask, zero;
6821     _mm_store_si128((__m128i*)atmp, a);
6822     zero = _mm_setzero_si128();
6823     for (i =0; i<4; i++){
6824         resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
6825         q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6826         r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
6827         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6828         r =  (float)s / 256.0;
6829         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6830     }
6831     res128 = _mm_load_si128((__m128i*)res);
6832     mask = _mm_and_si128(a, *(__m128i*)c80000000);
6833     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
6834     return _mm_or_si128(res128, mask);
6835 }
6836 
6837 //**********Reciprocal square root estimate ****************
6838 //**********************************************************
6839 //no reciprocal square root for integers is available in IA32, nor an unsigned int to float32x4 per-lane conversion, so a serial solution looks faster
6840 //but the particular implementation of vrsqrte_u32 may vary across ARM compilers
6841 //the ARM NEON and x86 SIMD results may be slightly different
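//Illustrative sketch (comment only, hypothetical values; assumes the vdup_n_f32 intrinsic defined elsewhere in this file):
//    float32x2_t x = vdup_n_f32(4.0f);
//    float32x2_t e = vrsqrte_f32(x);            //each lane is approximately 0.5f, an estimate of 1/sqrt(4.0f)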
6842 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
6843 _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
6844 {
6845     float32x4_t res;
6846     __m64_128 res64;
6847     res = _mm_rsqrt_ps(_pM128(a));
6848     _M64f(res64, res);
6849     return res64;
6850 }
6851 
6852 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
6853 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6854 {
6855     //Input is  fixed point number!!!
6856     //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6857    uint32x2_t res;
6858    __m128 tmp;
6859     float r, resf, coeff;
6860     int i,q0, s;
6861     for (i =0; i<2; i++){
6862         if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
6863             res.m64_u32[i] = 0xffffffff;
6864         }else{
6865             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
6866             coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
6867             q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6868             r = ((float)q0 + 0.5) / coeff;
6869             tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6870             _mm_store_ss(&r, tmp);
6871             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6872             r = (float)(s / 256.0);
6873             res.m64_u32[i] = r * (((uint32_t)1) << 31);
6874         }
6875     }
6876     return res;
6877 }
6878 
6879 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
6880 #define vrsqrteq_f32 _mm_rsqrt_ps
6881 
6882 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
6883 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6884 {
6885     //Input is  fixed point number!!!
6886     //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6887    _NEON2SSE_ALIGN_16 uint32_t  atmp[4], res[4];
6888    _NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
6889    __m128 tmp;
6890    __m128i res128, mask, zero;
6891     float r, resf, coeff;
6892     int i,q0, s;
6893     _mm_store_si128((__m128i*)atmp, a);
6894     zero = _mm_setzero_si128();
6895     for (i =0; i<4; i++){
6896         resf =  (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
6897         coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
6898         q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6899         r = ((float)q0 + 0.5) / coeff;
6900         tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6901         _mm_store_ss(&r, tmp);
6902         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6903         r = (float)s / 256.0;
6904         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6905     }
6906     res128 = _mm_load_si128((__m128i*)res);
6907     mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
6908     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x3fffffff
6909     return _mm_or_si128(res128, mask);
6910 }
6911 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
6912 //******************************************************************************************
6913 //******VRECPS (Vector Reciprocal Step) ***************************************************
6914 //multiplies the elements of one vector by the corresponding elements of another vector,
6915 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
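//Illustrative sketch of the usual Newton-Raphson refinement built on top of these functions
//(comment only; d and x are hypothetical caller variables, vmulq_f32 is assumed from elsewhere in this file):
//    float32x4_t x = vrecpeq_f32(d);            //initial estimate of 1/d
//    x = vmulq_f32(x, vrecpsq_f32(d, x));       //one step: x = x * (2 - d*x)
//    x = vmulq_f32(x, vrecpsq_f32(d, x));       //second step for nearly full single precision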
6916 
6917 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
6918 _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
6919 {
6920     float32x4_t res;
6921     __m64_128 res64;
6922     res = vrecpsq_f32(_pM128(a), _pM128(b));
6923     _M64f(res64, res);
6924     return res64;
6925 }
6926 
6927 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
6928 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
6929 {
6930     __m128 f2, mul;
6931     f2 =  _mm_set1_ps(2.);
6932     mul = _mm_mul_ps(a,b);
6933     return _mm_sub_ps(f2,mul);
6934 }
6935 
6936 //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
6937 //multiplies the elements of one vector by the corresponding elements of another vector,
6938 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
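//Illustrative sketch of the usual refinement of a 1/sqrt estimate (comment only; d and x are hypothetical caller variables):
//    float32x4_t x = vrsqrteq_f32(d);                       //initial estimate of 1/sqrt(d)
//    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(d, x), x));    //one step: x = x * (3 - d*x*x) / 2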
6939 
6940 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
6941 _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
6942 {
6943     float32x2_t res;
6944     res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
6945     res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
6946     return res;
6947 }
6948 
6949 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
6950 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
6951 {
6952     __m128 f3, f05, mul;
6953     f3 =  _mm_set1_ps(3.);
6954     f05 =  _mm_set1_ps(0.5);
6955     mul = _mm_mul_ps(a,b);
6956     f3 = _mm_sub_ps(f3,mul);
6957     return _mm_mul_ps (f3, f05);
6958 }
6959 //********************************************************************************************
6960 //***************************** Shifts by signed variable ***********************************
6961 //********************************************************************************************
6962 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
6963 //********************************************************************************************
6964 //No such operations exist in IA32 SIMD; only shifts by a constant are available, so a serial solution is needed
6965 //helper macro. It matches ARM implementation for big shifts
6966 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
6967         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
6968         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
6969         for (i = 0; i<LEN; i++) { \
6970         if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
6971         else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
6972         return _mm_load_si128((__m128i*)res);
6973 
6974 #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
6975         int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
6976         for (i = 0; i<LEN; i++) { \
6977         if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
6978         else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
6979         return res;
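//Illustrative sketch of the semantics implemented by the macros above (comment only, hypothetical lane values):
//    for a = {16, 16, 16, 16} (int16x4_t) and shift counts b = {2, -2, 3, -3},
//    vshl_s16(a, b) -> {64, 4, 128, 2}: positive counts shift left, negative counts shift right,
//    and counts whose magnitude reaches the lane size produce 0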
6980 
6981 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
6982 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6983 {
6984     SERIAL_SHIFT_64(8, i, 8)
6985 }
6986 
6987 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
6988 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6989 {
6990     SERIAL_SHIFT_64(16, i, 4)
6991 }
6992 
6993 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
6994 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6995 {
6996     SERIAL_SHIFT_64(32, i, 2)
6997 }
6998 
6999 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
7000 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7001 {
7002     SERIAL_SHIFT_64(64, i, 1)
7003 }
7004 
7005 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
7006 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7007 {
7008     SERIAL_SHIFT_64(8, u, 8)
7009 }
7010 
7011 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
7012 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7013 {
7014     SERIAL_SHIFT_64(16, u, 4)
7015 }
7016 
7017 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
7018 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7019 {
7020     SERIAL_SHIFT_64(32, u, 2)
7021 }
7022 
7023 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
7024 _NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if the SERIAL_SHIFT macro were used here, special processing for large shift values would be needed
7025 {
7026     SERIAL_SHIFT_64(64, u, 1)
7027 }
7028 
7029 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
7030 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7031 {
7032     SERIAL_SHIFT(int8_t, int8_t, 16, 16)
7033 }
7034 
7035 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
7036 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7037 {
7038     SERIAL_SHIFT(int16_t, int16_t, 8, 8)
7039 }
7040 
7041 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
7042 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7043 {
7044     SERIAL_SHIFT(int32_t, int32_t, 4, 4)
7045 }
7046 
7047 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
7048 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7049 {
7050     SERIAL_SHIFT(int64_t, int64_t, 2, 2)
7051 }
7052 
7053 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
7054 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7055 {
7056     SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
7057 }
7058 
7059 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
7060 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7061 {
7062     SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
7063 }
7064 
7065 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
7066 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7067 {
7068     SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
7069 }
7070 
7071 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
7072 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7073 {
7074     SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
7075 }
7076 
7077 
7078 //*********** Vector saturating shift left: (negative values shift right) **********************
7079 //********************************************************************************************
7080 //No such operations are available in IA32 SIMD; only shifts by a constant are available, so a serial solution is needed
7081 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7082         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7083         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7084         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7085         for (i = 0; i<LEN; i++) { \
7086         if (atmp[i] ==0) res[i] = 0; \
7087         else{ \
7088             if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
7089             else{ \
7090                 if (btmp[i]>lanesize_1) { \
7091                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7092                 }else{ \
7093                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7094                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7095                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7096                     else res[i] = atmp[i] << btmp[i]; }}}} \
7097         return _mm_load_si128((__m128i*)res);
7098 
7099 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7100         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7101         TYPE lanesize = (sizeof(TYPE) << 3); \
7102         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7103         for (i = 0; i<LEN; i++) { \
7104         if (atmp[i] ==0) {res[i] = 0; \
7105         }else{ \
7106             if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
7107             else{ \
7108                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7109                 else{ \
7110                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
7111                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7112         return _mm_load_si128((__m128i*)res);
7113 
7114 #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
7115         int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
7116         int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
7117         for (i = 0; i<LEN; i++) { \
7118         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7119         else{ \
7120             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7121             else{ \
7122                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7123                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7124                 }else{ \
7125                     limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7126                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7127                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7128                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7129         return res;
7130 
7131 #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7132         int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7133         int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7134         for (i = 0; i<LEN; i++) { \
7135         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7136         }else{ \
7137             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7138             else{ \
7139                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7140                 else{ \
7141                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7142                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
7143         return res;
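//Illustrative sketch of the saturating behaviour (comment only, hypothetical int8 lane values):
//    vqshl_s8: 100 << 1 saturates to 127, -100 << 1 saturates to -128, while 3 << 2 = 12 fits and is returned as is;
//    negative shift counts shift right and cannot overflow, so they are not saturated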
7144 
7145 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
7146 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7147 {
7148     SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
7149 }
7150 
7151 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
7152 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7153 {
7154     SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
7155 }
7156 
7157 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
7158 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7159 {
7160     SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
7161 }
7162 
7163 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
7164 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7165 {
7166     SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
7167 }
7168 
7169 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
7170 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7171 {
7172     SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
7173 }
7174 
7175 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
7176 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7177 {
7178     SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
7179 }
7180 
7181 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
7182 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7183 {
7184     SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
7185 }
7186 
7187 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
7188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7189 {
7190     SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
7191 }
7192 
7193 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
7194 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7195 {
7196     SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
7197 }
7198 
7199 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
7200 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7201 {
7202     SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
7203 }
7204 
7205 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
7206 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7207 {
7208     SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
7209 }
7210 
7211 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
7212 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7213 {
7214     SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
7215 }
7216 
7217 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
7218 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7219 {
7220     SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
7221 }
7222 
7223 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
7224 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7225 {
7226     SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
7227 }
7228 
7229 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
7230 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7231 {
7232     SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
7233 }
7234 
7235 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
7236 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7237 {
7238     SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
7239 }
7240 
7241 
7242 //******** Vector rounding shift left: (negative values shift right) **********
7243 //****************************************************************************
7244 //No such operations are available in IA32 SIMD; only shifts by a constant are available, so a serial solution is needed
7245 //rounding makes sense for right shifts only.
7246 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7247         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7248         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7249         for (i = 0; i<LEN; i++) { \
7250         if( btmp[i] >= 0) { \
7251             if(btmp[i] >= lanesize) res[i] = 0; \
7252             else res[i] = (atmp[i] << btmp[i]); \
7253         }else{ \
7254             res[i] = (btmp[i] < -lanesize) ? 0 : \
7255                             (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
7256                             (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
7257         return _mm_load_si128((__m128i*)res);
7258 
7259 
7260 #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
7261         int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
7262         for (i = 0; i<LEN; i++) { \
7263         if( b.m64_i ## TYPE[i] >= 0) { \
7264             if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
7265             else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
7266         }else{ \
7267             res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
7268                             (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
7269                             (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
7270         return res;
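//Illustrative sketch of the rounding behaviour (comment only, hypothetical int8 lane values):
//    vrshl_s8 with a shift count of -1: 5 becomes (5 >> 1) + 1 = 3 because the discarded bit is set,
//    while 4 becomes 2; positive shift counts behave like a plain left shift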
7271 
7272 
7273 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
7274 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7275 {
7276     SERIAL_ROUNDING_SHIFT_64(8,i,8)
7277 }
7278 
7279 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
7280 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7281 {
7282     SERIAL_ROUNDING_SHIFT_64(16,i,4)
7283 }
7284 
7285 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
7286 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7287 {
7288     SERIAL_ROUNDING_SHIFT_64(32,i,2)
7289 }
7290 
7291 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
7292 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7293 {
7294     SERIAL_ROUNDING_SHIFT_64(64,i,1)
7295 }
7296 
7297 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
7298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7299 {
7300     SERIAL_ROUNDING_SHIFT_64(8,u,8)
7301 }
7302 
7303 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
7304 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7305 {
7306     SERIAL_ROUNDING_SHIFT_64(16,u,4)
7307 }
7308 
7309 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
7310 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7311 {
7312     SERIAL_ROUNDING_SHIFT_64(32,u,2)
7313 }
7314 
7315 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
7316 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7317 {
7318     SERIAL_ROUNDING_SHIFT_64(64,u,1)
7319 }
7320 
7321 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
7322 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7323 {
7324     SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
7325 }
7326 
7327 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
7328 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7329 {
7330     SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
7331 }
7332 
7333 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
7334 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7335 {
7336     SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
7337 }
7338 
7339 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
7340 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7341 {
7342     SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
7343 }
7344 
7345 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
7346 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7347 {
7348     SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
7349 }
7350 
7351 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
7352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7353 {
7354     SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
7355 }
7356 
7357 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
7358 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7359 {
7360     SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
7361 }
7362 
7363 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
7364 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7365 {
7366     SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
7367 }
7368 
7369 
7370 //********** Vector saturating rounding shift left: (negative values shift right) ****************
7371 //*************************************************************************************************
7372 //No such operations exist in IA32 SIMD; only shifts by a constant are available, so a serial solution is needed
7373 //Saturation happens for left shifts only while rounding makes sense for right shifts only.
7374 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7375         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7376         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7377         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7378         for (i = 0; i<LEN; i++) { \
7379         if (atmp[i] ==0) res[i] = 0; \
7380         else{ \
7381             if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7382             else{ \
7383                 if (btmp[i]>lanesize_1) { \
7384                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7385                 }else{ \
7386                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7387                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7388                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7389                     else res[i] = atmp[i] << btmp[i]; }}}} \
7390         return _mm_load_si128((__m128i*)res);
7391 
7392 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7393         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7394         int lanesize = (sizeof(TYPE) << 3); \
7395         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7396         for (i = 0; i<LEN; i++) { \
7397         if (atmp[i] ==0) {res[i] = 0; \
7398         }else{ \
7399             if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7400             else{ \
7401                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7402                 else{ \
7403                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
7404                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7405         return _mm_load_si128((__m128i*)res);
7406 
7407 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
7408         __m64_128 res; int ## TYPE ## _t limit; int i; \
7409         int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
7410         for (i = 0; i<LEN; i++) { \
7411         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7412         else{ \
7413             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7414             else{ \
7415                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7416                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7417                 }else{ \
7418                     limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7419                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7420                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7421                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7422         return res;
7423 
7424 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7425         __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7426         int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7427         for (i = 0; i<LEN; i++) { \
7428         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7429         }else{ \
7430             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7431             else{ \
7432                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7433                 else{ \
7434                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7435                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7436         return res;
7437 
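//Illustrative sketch (added for clarity, not part of the original header): a plain scalar model of
//what the serial macros above compute for one signed 16-bit lane (as used e.g. by vqrshlq_s16).
//The function name ref_qrshl_s16 is hypothetical and the block is excluded from compilation.
#if 0
#include <stdint.h>
static int16_t ref_qrshl_s16(int16_t a, int8_t b)
{
    if (a == 0) return 0;
    if (b < 0) { //negative count: rounding shift right, saturation cannot happen
        if (b < -15) return 0;
        return (int16_t)((a >> (-b)) + ((a >> (-b - 1)) & 1)); //add the last bit shifted out
    }
    if (b > 15) return (a > 0) ? INT16_MAX : INT16_MIN; //shifted out completely -> saturate
    {
        int32_t limit = 1 << (15 - b); //smallest magnitude that overflows a left shift by b
        if (a >= limit || a <= -limit) return (a > 0) ? INT16_MAX : INT16_MIN;
        return (int16_t)(a * (1 << b)); //fits; written as a multiply to stay well defined
    }
}
//e.g. ref_qrshl_s16(5, -1) == 3 (rounded up), ref_qrshl_s16(0x4000, 1) == INT16_MAX (saturated)
#endif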
7438 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
7439 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7440 {
7441     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
7442 }
7443 
7444 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
7445 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7446 {
7447     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
7448 }
7449 
7450 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
7451 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7452 {
7453     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
7454 }
7455 
7456 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
7457 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7458 {
7459     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
7460 }
7461 
7462 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
7463 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7464 {
7465     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
7466 }
7467 
7468 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
7469 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7470 {
7471     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
7472 }
7473 
7474 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
7475 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7476 {
7477     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
7478 }
7479 
7480 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
7481 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7482 {
7483     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
7484 }
7485 
7486 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
7487 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7488 {
7489     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
7490 }
7491 
7492 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
7493 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7494 {
7495     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
7496 }
7497 
7498 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
7499 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7500 {
7501     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
7502 }
7503 
7504 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
7505 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7506 {
7507     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
7508 }
7509 
7510 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
7511 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7512 {
7513     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
7514 }
7515 
7516 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
7517 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7518 {
7519     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
7520 }
7521 
7522 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
7523 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7524 {
7525     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
7526 }
7527 
7528 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
7529 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7530 {
7531     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
7532 }
7533 
7534 // *********************************************************************************
7535 // *****************************  Shifts by a constant *****************************
7536 // *********************************************************************************
7537 //**************** Vector shift right by constant*************************************
7538 //************************************************************************************
7539 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
7540 _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
7541 {
7542     //no 8 bit shift available, go to 16 bit
7543     int8x8_t res64;
7544     __m128i r;
7545     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7546     r = _mm_srai_epi16 (r, b); //SSE2
7547     r = _mm_packs_epi16 (r,r); //we need 64 bits only
7548     return64(r);
7549 }
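//Illustrative sketch (added for clarity, not part of the original header): per lane the
//widen-shift-pack sequence above is equivalent to a plain scalar arithmetic shift, because the
//shifted 16-bit value always fits back into 8 bits. The name ref_shr_s8 is hypothetical.
#if 0
#include <stdint.h>
static int8_t ref_shr_s8(int8_t a, int b) //1 <= b <= 8
{
    int16_t wide = (int16_t)a;   //sign-extend, as _MM_CVTEPI8_EPI16 does
    return (int8_t)(wide >> b);  //no saturation can occur in the pack step
}
//e.g. ref_shr_s8(-5, 1) == -3, ref_shr_s8(100, 8) == 0
#endif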
7550 
7551 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
7552 _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
7553 {
7554     int16x4_t res64;
7555     return64(_mm_srai_epi16(_pM128i(a), b));
7556 }
7557 
7558 
7559 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
7560 _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
7561 {
7562     int32x2_t res64;
7563     return64(_mm_srai_epi32(_pM128i(a), b));
7564 }
7565 
7566 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
7567 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7568 {
7569     //no arithmetic shift for 64bit values, serial solution used
7570     int64x1_t res;
7571     if(b>=64) res.m64_i64[0] = 0;
7572     else res.m64_i64[0] = (*(int64_t*)&a) >> b;
7573     return res;
7574 }
7575 
7576 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
7577 _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
7578 {
7579     //no 8 bit shift available, go to 16 bit
7580     uint8x8_t res64;
7581     __m128i r;
7582     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7583     r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
7584     r = _mm_packus_epi16 (r,r); //we need 64 bits only
7585     return64(r);
7586 }
7587 
7588 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
7589 _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
7590 {
7591     uint16x4_t res64;
7592     return64(_mm_srli_epi16(_pM128i(a), b));
7593 }
7594 
7595 
7596 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
7597 _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
7598 {
7599     uint32x2_t res64;
7600     return64(_mm_srli_epi32(_pM128i(a), b));
7601 }
7602 
7603 
7604 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
7605 _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
7606 {
7607     uint64x1_t res64;
7608     return64(_mm_srli_epi64(_pM128i(a), b));
7609 }
7610 
7611 
7612 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
7613 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
7614 {
7615     //no 8 bit shift available, go to 16 bit trick
7616     __m128i zero, mask0, a_sign, r, a_sign_mask;
7617     _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
7618     zero = _mm_setzero_si128();
7619     mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
7620     a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
7621     r = _mm_srai_epi16 (a, b);
7622     a_sign_mask =  _mm_and_si128 (mask0, a_sign);
7623     r =  _mm_andnot_si128 (mask0, r);
7624     return _mm_or_si128 (r, a_sign_mask);
7625 }
7626 
7627 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
7628 #define vshrq_n_s16 _mm_srai_epi16
7629 
7630 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
7631 #define vshrq_n_s32 _mm_srai_epi32
7632 
7633 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
7634 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7635 {
7636     //SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
7637     __m128i c1, signmask,a0,  res64;
7638     _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
7639     c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
7640     signmask  =  _mm_slli_epi64 (c1, (64 - b));
7641     a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
7642     a0 = _MM_CMPEQ_EPI64 (a, a0);
7643     signmask = _mm_and_si128(a0, signmask);
7644     res64 = _mm_srli_epi64 (a, b);
7645     return _mm_or_si128(res64, signmask);
7646 }
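//Illustrative sketch (added for clarity, not part of the original header): the code above builds a
//64-bit arithmetic shift from a logical shift plus a sign mask; per lane it matches the hypothetical
//scalar model below (b == 64 is omitted here to keep the scalar shifts well defined).
#if 0
#include <stdint.h>
static int64_t ref_sra_s64(int64_t a, int b) //1 <= b <= 63
{
    uint64_t logical  = (uint64_t)a >> b;                  //zeros shifted in from the top
    uint64_t signmask = (a < 0) ? (~0ULL << (64 - b)) : 0; //b copies of the sign bit
    return (int64_t)(logical | signmask);
}
#endif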
7647 
7648 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
7649 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
7650 {
7651     //no 8 bit shift available, need the special trick
7652     __m128i mask0, r;
7653     _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
7654     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
7655     r = _mm_srli_epi16 ( a, b);
7656     return _mm_and_si128 (r,  mask0);
7657 }
7658 
7659 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
7660 #define vshrq_n_u16 _mm_srli_epi16
7661 
7662 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
7663 #define vshrq_n_u32 _mm_srli_epi32
7664 
7665 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
7666 #define vshrq_n_u64 _mm_srli_epi64
7667 
7668 //*************************** Vector shift left by constant *************************
7669 //*********************************************************************************
7670 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7671 _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
7672 {
7673     //no 8 bit shift available, go to 16 bit
7674     int8x8_t res64;
7675     __m128i r;
7676     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7677     r = _mm_slli_epi16 (r, b); //SSE2
7678     r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
7679     return64(r);
7680 }
7681 
7682 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7683 _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
7684 {
7685     int16x4_t res64;
7686     return64(_mm_slli_epi16(_pM128i(a), b));
7687 }
7688 
7689 
7690 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7691 _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
7692 {
7693     int32x2_t res64;
7694     return64(_mm_slli_epi32(_pM128i(a), b));
7695 }
7696 
7697 
7698 _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7699 _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
7700 {
7701     int64x1_t res64;
7702     return64(_mm_slli_epi64(_pM128i(a), b));
7703 }
7704 
7705 
7706 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7707 _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
7708 {
7709     //no 8 bit shift available, go to 16 bit
7710     uint8x8_t res64;
7711     __m128i mask8;
7712     __m128i r;
7713     mask8 = _mm_set1_epi16(0xff);
7714     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7715     r = _mm_slli_epi16 (r, b); //SSE2
7716     r = _mm_and_si128(r, mask8); //to avoid saturation
7717     r = _mm_packus_epi16 (r,r); //we need 64 bits only
7718     return64(r);
7719 }
7720 
7721 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7722 #define vshl_n_u16 vshl_n_s16
7723 
7724 
7725 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7726 #define vshl_n_u32 vshl_n_s32
7727 
7728 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7729 #define vshl_n_u64 vshl_n_s64
7730 
7731 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7732 #define vshlq_n_s8 vshlq_n_u8
7733 
7734 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7735 #define vshlq_n_s16 _mm_slli_epi16
7736 
7737 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7738 #define vshlq_n_s32 _mm_slli_epi32
7739 
7740 _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7741 #define vshlq_n_s64 _mm_slli_epi64
7742 
7743 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7744 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
7745 {
7746     //no 8 bit shift available, need the special trick
7747     __m128i mask0, r;
7748     _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
7749     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
7750     r = _mm_slli_epi16 ( a, b);
7751     return _mm_and_si128 (r,  mask0);
7752 }
7753 
7754 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7755 #define vshlq_n_u16 vshlq_n_s16
7756 
7757 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7758 #define vshlq_n_u32 vshlq_n_s32
7759 
7760 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7761 #define vshlq_n_u64 vshlq_n_s64
7762 
7763 //************* Vector rounding shift right by constant ******************
7764 //*************************************************************************
7765 //No corresponding x86 intrinsics exist, so some tricks are needed
7766 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
7767 _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
7768 {
7769     //no 8 bit shift available, go to 16 bit
7770     int8x8_t res64;
7771     __m128i r, maskb;
7772     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7773     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7774     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7775     r = _mm_srai_epi16 (r, b);
7776     r = _mm_add_epi16 (r, maskb); //actual rounding
7777     r = _mm_packs_epi16 (r,r); ////we need 64 bits only
7778     return64(r);
7779 }
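//Illustrative sketch (added for clarity, not part of the original header): the rounding trick above
//relies on the identity (a + 2^(b-1)) >> b == (a >> b) + ((a >> (b-1)) & 1); a hypothetical scalar
//model for one lane:
#if 0
#include <stdint.h>
static int8_t ref_rshr_s8(int8_t a, int b) //1 <= b <= 8
{
    int16_t wide = (int16_t)a; //widen, as the SIMD code does
    return (int8_t)((wide >> b) + ((wide >> (b - 1)) & 1)); //shift, then add the rounding bit
}
//e.g. ref_rshr_s8(5, 1) == 3 while the non-rounding vshr_n_s8(5, 1) gives 2
#endif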
7780 
7781 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
7782 _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
7783 {
7784     int16x4_t res64;
7785     return64(vrshrq_n_s16(_pM128i(a), b));
7786 }
7787 
7788 
7789 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
7790 _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
7791 {
7792     int32x2_t res64;
7793     return64(vrshrq_n_s32(_pM128i(a), b));
7794 }
7795 
7796 
7797 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
7798 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7799 {
7800     //serial solution is faster
7801     int64x1_t res;
7802     int64_t a_i64 = *( int64_t*)&a;
7803     if(b==64) {
7804         res.m64_i64[0] = 0; //for some compilers rounding happens and we would need to use (a_i64 & _SIGNBIT64) >> 63;
7805     } else {
7806         int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
7807         res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
7808     }
7809     return res;
7810 }
7811 
7812 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
7813 _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
7814 {
7815     //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
7816     uint8x8_t res64;
7817     __m128i r, maskb;
7818     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7819     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7820     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7821     r = _mm_srli_epi16 (r, b);
7822     r = _mm_add_epi16 (r, maskb); //actual rounding
7823     r =  _mm_packus_epi16 (r,r); ////we need 64 bits only
7824     return64(r);
7825 }
7826 
7827 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
7828 _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
7829 {
7830     uint16x4_t res64;
7831     return64(vrshrq_n_u16(_pM128i(a), b));
7832 }
7833 
7834 
7835 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
7836 _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
7837 {
7838     uint32x2_t res64;
7839     return64(vrshrq_n_u32(_pM128i(a), b));
7840 }
7841 
7842 
7843 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
7844 _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7845 {
7846     uint64x1_t res64;
7847     return64(vrshrq_n_u64(_pM128i(a), b));
7848 }
7849 
7850 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
7851 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
7852 {
7853     //no 8 bit shift available, go to 16 bit trick
7854     __m128i r, mask1, maskb;
7855     _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in each byte
7856     r = vshrq_n_s8 (a, b);
7857     mask1 = _mm_set1_epi16(mask2b[b]); // 2^(b-1) replicated in each byte, needed for rounding
7858     maskb = _mm_and_si128(a, mask1); //get 2^(b-1) or 0 per byte for rounding
7859     maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
7860     return _mm_add_epi8(r, maskb); //actual rounding
7861 }
7862 
7863 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
7864 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7865 {
7866     __m128i maskb, r;
7867     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7868     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7869     r = _mm_srai_epi16 (a, b);
7870     return _mm_add_epi16 (r, maskb); //actual rounding
7871 }
7872 
7873 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
7874 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7875 {
7876     __m128i maskb,  r;
7877     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7878     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7879     r = _mm_srai_epi32(a, b);
7880     return _mm_add_epi32 (r, maskb); //actual rounding
7881 }
7882 
7883 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
7884 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7885 {
7886     //solution may not be optimal compared with a serial one
7887     __m128i maskb;
7888     int64x2_t r;
7889     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7890     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7891     r = vshrq_n_s64(a, b);
7892     return _mm_add_epi64 (r, maskb); //actual rounding
7893 }
7894 
7895 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
7896 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
7897 {
7898     //no 8 bit shift available, go to 16 bit trick
7899     __m128i r, mask1, maskb;
7900     _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in each byte
7901     r = vshrq_n_u8 (a, b);
7902     mask1 = _mm_set1_epi16(mask2b[b]); // 2^(b-1) replicated in each byte, needed for rounding
7903     maskb = _mm_and_si128(a, mask1); //get 2^(b-1) or 0 per byte for rounding
7904     maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
7905     return _mm_add_epi8(r, maskb); //actual rounding
7906 }
7907 
7908 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
7909 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7910 {
7911     __m128i maskb, r;
7912     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7913     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7914     r = _mm_srli_epi16 (a, b);
7915     return _mm_add_epi16 (r, maskb); //actual rounding
7916 }
7917 
7918 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
7919 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7920 {
7921     __m128i maskb,  r;
7922     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7923     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7924     r = _mm_srli_epi32(a, b);
7925     return _mm_add_epi32 (r, maskb); //actual rounding
7926 }
7927 
7928 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
7929 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
7930 {
7931     //solution may not be optimal compared with a serial one
7932     __m128i maskb,  r;
7933     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7934     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7935     r = _mm_srli_epi64(a, b);
7936     return _mm_add_epi64 (r, maskb); //actual rounding
7937 }
7938 
7939 //************* Vector shift right by constant and accumulate *********
7940 //*********************************************************************
7941 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
7942 _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
7943 {
7944     int8x8_t shift;
7945     shift = vshr_n_s8(b, c);
7946     return vadd_s8( a, shift);
7947 }
7948 
7949 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
7950 _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
7951 {
7952     int16x4_t shift;
7953     shift = vshr_n_s16( b, c);
7954     return vadd_s16(a, shift);
7955 }
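//Illustrative usage sketch (added for clarity, not part of the original header): vsra_n_* simply
//adds the shifted second operand to the first; the values below are made up.
#if 0
int16x4_t acc  = vdup_n_s16(10);
int16x4_t step = vdup_n_s16(8);
int16x4_t res  = vsra_n_s16(acc, step, 2); //each lane: 10 + (8 >> 2) == 12
#endif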
7956 
7957 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
7958 _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
7959 {
7960     //may not be optimal compared with the serial execution
7961     int32x2_t shift;
7962     shift = vshr_n_s32(b, c);
7963     return vadd_s32( a, shift);
7964 }
7965 
7966 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
7967 _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
7968 {
7969     //may not be optimal compared with a serial solution
7970     int64x1_t shift;
7971     shift = vshr_n_s64(b, c);
7972     return vadd_s64( a, shift);
7973 }
7974 
7975 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
7976 _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
7977 {
7978     uint8x8_t shift;
7979     shift = vshr_n_u8(b, c);
7980     return vadd_u8(a, shift);
7981 }
7982 
7983 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
7984 _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
7985 {
7986     uint16x4_t shift;
7987     shift = vshr_n_u16(b, c);
7988     return vadd_u16(a,shift);
7989 }
7990 
7991 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
7992 _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
7993 {
7994     //may not be optimal compared with the serial execution
7995     uint32x2_t shift;
7996     shift = vshr_n_u32(b, c);
7997     return vadd_u32( a, shift);
7998 }
7999 
8000 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
8001 _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
8002 {
8003     //may not be optimal compared with the serial execution
8004     uint64x1_t shift;
8005     shift = vshr_n_u64(b, c);
8006     return vadd_u64(a, shift);
8007 }
8008 
8009 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
8010 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
8011 {
8012     int8x16_t shift;
8013     shift = vshrq_n_s8(b, c);
8014     return vaddq_s8(a, shift);
8015 }
8016 
8017 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
8018 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
8019 {
8020     int16x8_t shift;
8021     shift = vshrq_n_s16(b, c);
8022     return vaddq_s16(a, shift);
8023 }
8024 
8025 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
8026 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
8027 {
8028     int32x4_t shift;
8029     shift = vshrq_n_s32(b, c);
8030     return vaddq_s32(a, shift);
8031 }
8032 
8033 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
8034 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
8035 {
8036     int64x2_t shift;
8037     shift = vshrq_n_s64(b, c);
8038     return vaddq_s64( a, shift);
8039 }
8040 
8041 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
8042 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
8043 {
8044     uint8x16_t shift;
8045     shift = vshrq_n_u8(b, c);
8046     return vaddq_u8(a, shift);
8047 }
8048 
8049 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
8050 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
8051 {
8052     uint16x8_t shift;
8053     shift = vshrq_n_u16(b, c);
8054     return vaddq_u16(a,  shift);
8055 }
8056 
8057 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
8058 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
8059 {
8060     uint32x4_t shift;
8061     shift = vshrq_n_u32(b, c);
8062     return vaddq_u32(a, shift);
8063 }
8064 
8065 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
8066 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
8067 {
8068     uint64x2_t shift;
8069     shift = vshrq_n_u64(b, c);
8070     return vaddq_u64(a, shift);
8071 }
8072 
8073 //************* Vector rounding shift right by constant and accumulate ****************************
8074 //************************************************************************************************
8075 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
8076 _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
8077 {
8078     int8x8_t shift;
8079     shift = vrshr_n_s8(b, c);
8080     return vadd_s8( a, shift);
8081 }
8082 
8083 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
8084 _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
8085 {
8086     int16x4_t shift;
8087     shift = vrshr_n_s16( b, c);
8088     return vadd_s16(a, shift);
8089 }
8090 
8091 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
8092 _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
8093 {
8094     //may not be optimal compared with the serial execution
8095     int32x2_t shift;
8096     shift = vrshr_n_s32(b, c);
8097     return vadd_s32( a, shift);
8098 }
8099 
8100 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
8101 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8102 {
8103     int64x1_t shift;
8104     shift = vrshr_n_s64(b, c);
8105     return vadd_s64( a, shift);
8106 }
8107 
8108 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
8109 _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
8110 {
8111     uint8x8_t shift;
8112     shift = vrshr_n_u8(b, c);
8113     return vadd_u8(a, shift);
8114 }
8115 
8116 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
8117 _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
8118 {
8119     uint16x4_t shift;
8120     shift = vrshr_n_u16(b, c);
8121     return vadd_u16(a,shift);
8122 }
8123 
8124 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
8125 _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
8126 {
8127     //may not be optimal compared with the serial execution
8128     uint32x2_t shift;
8129     shift = vrshr_n_u32(b, c);
8130     return vadd_u32( a, shift);
8131 }
8132 
8133 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
8134 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8135 {
8136     //may not be optimal compared with the serial execution
8137     uint64x1_t shift;
8138     shift = vrshr_n_u64(b, c);
8139     return vadd_u64( a, shift);
8140 }
8141 
8142 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
8143 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
8144 {
8145     int8x16_t shift;
8146     shift = vrshrq_n_s8(b, c);
8147     return vaddq_s8(a, shift);
8148 }
8149 
8150 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
8151 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
8152 {
8153     int16x8_t shift;
8154     shift = vrshrq_n_s16(b, c);
8155     return vaddq_s16(a, shift);
8156 }
8157 
8158 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
8159 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
8160 {
8161     int32x4_t shift;
8162     shift = vrshrq_n_s32(b, c);
8163     return vaddq_s32(a, shift);
8164 }
8165 
8166 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
8167 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
8168 {
8169     int64x2_t shift;
8170     shift = vrshrq_n_s64(b, c);
8171     return vaddq_s64(a, shift);
8172 }
8173 
8174 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
8175 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
8176 {
8177     uint8x16_t shift;
8178     shift = vrshrq_n_u8(b, c);
8179     return vaddq_u8(a, shift);
8180 }
8181 
8182 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
8183 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
8184 {
8185     uint16x8_t shift;
8186     shift = vrshrq_n_u16(b, c);
8187     return vaddq_u16(a,  shift);
8188 }
8189 
8190 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
8191 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
8192 {
8193     uint32x4_t shift;
8194     shift = vrshrq_n_u32(b, c);
8195     return vaddq_u32(a, shift);
8196 }
8197 
8198 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
8199 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
8200 {
8201     uint64x2_t shift;
8202     shift = vrshrq_n_u64(b, c);
8203     return vaddq_u64(a, shift);
8204 }
8205 
8206 //**********************Vector saturating shift left by constant *****************************
8207 //********************************************************************************************
8208 //we don't check const ranges, assuming they are met
8209 _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
8210 _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
8211 {
8212     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8213     int8x8_t res64;
8214     __m128i a128, r128;
8215     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8216     r128 = _mm_slli_epi16 (a128, b);
8217     r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
8218     return64(r128);
8219 }
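//Illustrative sketch (added for clarity, not part of the original header): per lane the
//widen-shift-pack sequence above behaves like the hypothetical scalar model below, the clamping
//being exactly what _mm_packs_epi16 provides.
#if 0
#include <stdint.h>
static int8_t ref_qshl_s8(int8_t a, int b) //0 <= b <= 7
{
    int16_t shifted = (int16_t)(a * (1 << b)); //cannot overflow 16 bits for b <= 7
    if (shifted > INT8_MAX) return INT8_MAX;   //saturate upwards
    if (shifted < INT8_MIN) return INT8_MIN;   //saturate downwards
    return (int8_t)shifted;
}
//e.g. ref_qshl_s8(100, 1) == 127, ref_qshl_s8(-100, 1) == -128, ref_qshl_s8(3, 2) == 12
#endif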
8220 
8221 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
8222 _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
8223 {
8224     // go to 32 bit to get the auto saturation (in packs function)
8225     int16x4_t res64;
8226     __m128i a128, r128;
8227     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8228     r128 = _mm_slli_epi32 (a128, b); //shift_res
8229     r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
8230     return64(r128);
8231 }
8232 
8233 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
8234 _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
8235 {
8236     //serial execution may be faster
8237     int32x2_t res64;
8238     return64(vqshlq_n_s32 (_pM128i(a), b));
8239 }
8240 
8241 
8242 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
8243 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8244 {
8245     // no effective SIMD solution here
8246     int64x1_t res;
8247     int64_t bmask;
8248     int64_t a_i64 = *( int64_t*)&a;
8249     bmask = ( int64_t)1 << (63 - b); //positive
8250     if (a_i64 >= bmask) {
8251         res.m64_i64[0] = ~(_SIGNBIT64);
8252     } else {
8253         res.m64_i64[0]  = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
8254     }
8255     return res;
8256 }
8257 
8258 
8259 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
8260 _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
8261 {
8262     //no 8 bit shift available in IA32 SIMD, go to 16 bit
8263     uint8x8_t res64;
8264     __m128i a128, r128;
8265     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8266     r128 = _mm_slli_epi16 (a128, b); //shift_res
8267     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8268     return64(r128);
8269 }
8270 
8271 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
8272 _NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
8273 {
8274     // go to 32 bit to get the auto saturation (in packus function)
8275     uint16x4_t res64;
8276     __m128i a128, r128;
8277     a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8278     r128 = _mm_slli_epi32 (a128, b); //shift_res
8279     r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
8280     return64(r128);
8281 }
8282 
8283 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
8284 _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
8285 {
8286     uint32x2_t res64;
8287     return64(vqshlq_n_u32(_pM128i(a), b));
8288 }
8289 
8290 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
8291 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8292 {
8293     // no effective SIMD solution here
8294     uint64x1_t res;
8295     uint64_t bmask;
8296     uint64_t a_i64 = *(uint64_t*)&a;
8297     bmask = ( uint64_t)1 << (64 - b);
8298     res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
8299     return res;
8300 }
8301 
8302 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
8303 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
8304 {
8305     // go to 16 bit to get the auto saturation (in packs function)
8306     __m128i a128, r128_1, r128_2;
8307     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8308     r128_1 = _mm_slli_epi16 (a128, b);
8309     //swap hi and low part of a128 to process the remaining data
8310     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8311     a128 = _MM_CVTEPI8_EPI16 (a128);
8312     r128_2 = _mm_slli_epi16 (a128, b);
8313     return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
8314 }
8315 
8316 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
8317 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
8318 {
8319     // manual saturation solution looks LESS optimal than 32 bits conversion one
8320     // go to 32 bit to get the auto saturation (in packs function)
8321     __m128i a128, r128_1, r128_2;
8322     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8323     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8324     //swap hi and low part of a128 to process the remaining data
8325     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8326     a128 = _MM_CVTEPI16_EPI32 (a128);
8327     r128_2 = _mm_slli_epi32 (a128, b);
8328     return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
8329 }
8330 
8331 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
8332 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
8333 {
8334     // no 64 bit saturation option available, special tricks necessary
8335     __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
8336     c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
8337     maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones, i.e. the largest value that survives a shift left by b
8338     saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
8339     c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
8340     shift_res = _mm_slli_epi32 (a, b);
8341     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8342     //result with positive numbers saturated
8343     shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8344     //treat negative numbers
8345     maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) zeros, i.e. the most negative value that survives a shift left by b
8346     saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
8347     c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
8348     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8349     return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8350 }
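//Illustrative sketch (added for clarity, not part of the original header): the mask juggling above
//implements, per lane, the hypothetical scalar model of a signed saturating left shift below.
#if 0
#include <stdint.h>
static int32_t ref_qshl_s32(int32_t a, int b) //0 <= b <= 31
{
    if (a > (INT32_MAX >> b)) return INT32_MAX;    //would overflow upwards -> saturate
    if (a < (INT32_MIN >> b)) return INT32_MIN;    //would overflow downwards -> saturate
    return (int32_t)((uint32_t)a << b);            //fits; shift in the unsigned domain
}
#endif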
8351 
8352 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
8353 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8354 {
8355     // no effective SIMD solution here
8356     _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
8357     int64_t bmask;
8358     int i;
8359     bmask = ( int64_t)1 << (63 - b); //positive
8360     _mm_store_si128((__m128i*)atmp, a);
8361     for (i = 0; i<2; i++) {
8362         if (atmp[i] >= bmask) {
8363             res[i] = ~(_SIGNBIT64);
8364         } else {
8365             res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
8366         }
8367     }
8368     return _mm_load_si128((__m128i*)res);
8369 }
8370 
8371 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
8372 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
8373 {
8374     // go to 16 bit to get the auto saturation (in packs function)
8375     __m128i a128, r128_1, r128_2;
8376     a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
8377     r128_1 = _mm_slli_epi16 (a128, b);
8378     //swap hi and low part of a128 to process the remaining data
8379     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8380     a128 = _MM_CVTEPU8_EPI16 (a128);
8381     r128_2 = _mm_slli_epi16 (a128, b);
8382     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8383 }
8384 
8385 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
8386 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
8387 {
8388     // manual saturation solution looks more optimal than 32 bits conversion one
8389     __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
8390     cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
8391     c8000 = _mm_set1_epi16 ((int16_t)0x8000);
8392 //no unsigned 16-bit comparison in SSE, only the signed one is available, hence the bias trick below
8393     a_signed = _mm_sub_epi16(a, c8000); //go to signed
8394     saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
8395     shift_res = _mm_slli_epi16 (a, b);
8396     return _mm_or_si128 (shift_res, saturation_mask);
8397 }
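//Illustrative sketch (added for clarity, not part of the original header): per lane the bias trick
//above computes the hypothetical scalar model below; subtracting 0x8000 from both sides turns the
//missing unsigned comparison into a signed one.
#if 0
#include <stdint.h>
static uint16_t ref_qshl_u16(uint16_t a, int b) //0 <= b <= 15
{
    int32_t biased_a  = (int32_t)a - 0x8000;
    int32_t biased_cb = (int32_t)(0xFFFF >> b) - 0x8000; //largest value that does not overflow
    if (biased_a > biased_cb) return 0xFFFF;             //overflow -> saturate to all ones
    return (uint16_t)(a << b);
}
#endif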
8398 
8399 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
8400 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
8401 {
8402     // manual saturation solution, no 64 bit saturation option, the serial version may be faster
8403     __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
8404     cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
8405     c80000000 = _mm_set1_epi32 (0x80000000);
8406 //no unsigned 32-bit comparison in SSE, only the signed one is available, hence the bias trick below
8407     a_signed = _mm_sub_epi32(a, c80000000); //go to signed
8408     saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
8409     shift_res = _mm_slli_epi32 (a, b);
8410     return _mm_or_si128 (shift_res, saturation_mask);
8411 }
8412 
8413 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
8414 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8415 {
8416     // no effective SIMD solution here
8417     _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
8418     uint64_t bmask;
8419     int i;
8420     bmask = ( uint64_t)1 << (64 - b);
8421     _mm_store_si128((__m128i*)atmp, a);
8422     for (i = 0; i<2; i++) {
8423         res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
8424     }
8425     return _mm_load_si128((__m128i*)res);
8426 }
8427 
8428 //**************Vector signed->unsigned saturating shift left by constant *************
8429 //*************************************************************************************
8430 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
8431 _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
8432 {
8433     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8434     uint8x8_t res64;
8435     __m128i a128, r128;
8436     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8437     r128 = _mm_slli_epi16 (a128, b);
8438     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8439     return64(r128);
8440 }
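//Illustrative sketch (added for clarity, not part of the original header): per lane vqshlu_n_s8
//behaves like the hypothetical scalar model below - negative inputs clamp to 0 and overflowing
//ones to 0xff, which is exactly what _mm_packus_epi16 provides.
#if 0
#include <stdint.h>
static uint8_t ref_qshlu_s8(int8_t a, int b) //0 <= b <= 7
{
    int16_t shifted = (int16_t)(a * (1 << b)); //widened product cannot overflow 16 bits
    if (shifted < 0)   return 0;               //negative -> 0
    if (shifted > 255) return 255;             //too large -> 0xff
    return (uint8_t)shifted;
}
//e.g. ref_qshlu_s8(-3, 2) == 0, ref_qshlu_s8(100, 2) == 255, ref_qshlu_s8(20, 2) == 80
#endif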
8441 
8442 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
8443 _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
8444 {
8445     uint16x4_t res64;
8446     __m128i a128, r128;
8447     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8448     r128 = _mm_slli_epi32 (a128, b); //shift_res
8449     r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
8450     return64(r128);
8451 }
8452 
8453 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
8454 _NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
8455 {
8456     uint32x2_t res64;
8457     return64( vqshluq_n_s32(_pM128i(a), b));
8458 }
8459 
8460 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
8461 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
8462 {
8463     uint64x1_t res;
8464     uint64_t limit;
8465     if (a.m64_i64[0]<=0) {
8466         res.m64_u64[0] = 0;
8467     } else {
8468         limit = (uint64_t) 1 << (64 - b);
8469         res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : a.m64_i64[0] << b;
8470     }
8471     return res;
8472 }
8473 
8474 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
8475 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
8476 {
8477     __m128i a128, r128_1, r128_2;
8478     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8479     r128_1 = _mm_slli_epi16 (a128, b);
8480     //swap hi and low part of a128 to process the remaining data
8481     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8482     a128 = _MM_CVTEPI8_EPI16 (a128);
8483     r128_2 = _mm_slli_epi16 (a128, b);
8484     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8485 }
8486 
8487 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
8488 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
8489 {
8490     // manual saturation solution looks LESS optimal than 32 bits conversion one
8491     __m128i a128, r128_1, r128_2;
8492     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8493     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8494     //swap hi and low part of a128 to process the remaining data
8495     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8496     a128 = _MM_CVTEPI16_EPI32 (a128);
8497     r128_2 = _mm_slli_epi32 (a128, b);
8498     return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
8499 }
8500 
8501 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
8502 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
8503 {
8504     //the solution may not be optimal compared with the serial one
8505     __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
8506     zero = _mm_setzero_si128();
8507     maskA = _mm_cmpeq_epi32(a, a);
8508     maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
8509     //saturate negative numbers to zero
8510     maskGT0   = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
8511     a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
8512     //saturate positive to 0xffffffff
8513     a_masked = _mm_and_si128 (a0, maskA);
8514     a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
8515     a_shift = _mm_slli_epi32 (a0, b);
8516     return _mm_or_si128 (a_shift, a_masked); //actual saturation
8517 }
8518 
8519 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
8520 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
8521 {
8522     // no effective SIMD solution here, serial execution looks faster
8523     _NEON2SSE_ALIGN_16 int64_t atmp[2];
8524     _NEON2SSE_ALIGN_16 uint64_t res[2];
8525     uint64_t limit;
8526     int i;
8527     _mm_store_si128((__m128i*)atmp, a);
8528     for (i = 0; i<2; i++) {
8529         if (atmp[i]<=0) {
8530             res[i] = 0;
8531         } else {
8532             limit = ~((uint64_t)0) >> b; //max value that still fits after a left shift by b (avoids the undefined shift by 64 when b==0)
8533             res[i] = ( ((uint64_t)atmp[i]) > limit) ? ~((uint64_t)0) : ((uint64_t)atmp[i]) << b;
8534         }
8535     }
8536     return _mm_load_si128((__m128i*)res);
8537 }
8538 
8539 //************** Vector narrowing  shift right by constant **************
8540 //**********************************************************************
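//A worked example of the plain (non-saturating) narrowing (illustrative, with made-up lane values):
//each lane is shifted right and then simply truncated to the narrower type. For vshrn_n_s16 with b = 4,
//an int16 lane holding 0x7F80 (32640) becomes 0x07F8 after the shift and 0xF8 (-8) after truncation to int8,
//i.e. no saturation is applied, unlike the VQSHRN family further below:
//    int16x8_t a  = vdupq_n_s16(0x7F80);
//    int8x8_t  lo = vshrn_n_s16(a, 4); //every lane is 0xF8 (-8)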
8541 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8542 _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8543 {
8544     int8x8_t res64;
8545     __m128i r16;
8546     r16  = vshrq_n_s16(a,b);
8547     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8548     return64(r16);
8549 }
8550 
8551 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8552 _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8553 {
8554     int16x4_t res64;
8555     __m128i r32;
8556     r32  = vshrq_n_s32(a,b);
8557     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8558     return64(r32);
8559 }
8560 
8561 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8562 _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8563 {
8564     int32x2_t res64;
8565     __m128i r64;
8566     r64  = vshrq_n_s64(a,b);
8567     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8568     return64(r64);
8569 }
8570 
8571 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8572 _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8573 {
8574     uint8x8_t res64;
8575     __m128i mask, r16;
8576     mask = _mm_set1_epi16(0xff);
8577     r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8578     r16 = _mm_and_si128(r16, mask); //to avoid saturation
8579     r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
8580     return64(r16);
8581 }
8582 
8583 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8584 _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8585 {
8586     uint16x4_t res64;
8587     __m128i mask, r32;
8588     mask = _mm_set1_epi32(0xffff);
8589     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8590     r32 = _mm_and_si128(r32, mask); //to avoid saturation
8591     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8592     return64(r32);
8593 }
8594 
8595 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8596 _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8597 {
8598     uint32x2_t res64;
8599     __m128i r64;
8600     r64  = vshrq_n_u64(a,b);
8601     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8602     return64(r64);
8603 }
8604 
8605 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
8606 //*********************************************************************************************
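//Illustrative behaviour (made-up lane values): negative lanes saturate to 0 and results above the unsigned
//maximum saturate to it. For vqshrun_n_s16 with b = 4 an int16 lane of -100 gives 0,
//while 32767 >> 4 = 2047 exceeds the uint8 range and gives 255.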
8607 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
8608 _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
8609 {
8610     uint8x8_t res64;
8611     __m128i r16;
8612     r16  = vshrq_n_s16(a,b);
8613     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
8614     return64(r16);
8615 }
8616 
8617 _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
8618 _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
8619 {
8620     uint16x4_t res64;
8621     __m128i r32;
8622     r32  = vshrq_n_s32(a,b);
8623     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
8624     return64(r32);
8625 }
8626 
8627 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
8628 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8629 {
8630     _NEON2SSE_ALIGN_16 int64_t atmp[2];
8631     uint32x2_t res;
8632     int64_t res64;
8633     _mm_store_si128((__m128i*)atmp, a);
8634     if (atmp[0] < 0) {
8635         res.m64_u32[0] = 0;
8636     } else {
8637         res64 = (atmp[0] >> b);
8638         res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
8639     }
8640     if (atmp[1] < 0) {
8641         res.m64_u32[1] = 0;
8642     } else {
8643         res64 = (atmp[1] >> b);
8644         res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
8645     }
8646     return res;
8647 }
8648 
8649 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
8650 _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
8651 _NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
8652 {
8653     //this solution may not be optimal compared with the serial one
8654     __m128i r16;
8655     uint8x8_t res64;
8656     r16 = vrshrq_n_s16(a,b);
8657     r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
8658     return64(r16);
8659 }
8660 
8661 _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
8662 _NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
8663 {
8664     //this solution may not be optimal compared with the serial one
8665     __m128i r32;
8666     uint16x4_t res64;
8667     r32 = vrshrq_n_s32(a,b);
8668     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
8669     return64(r32);
8670 }
8671 
8672 _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
8673 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8674 {
8675     _NEON2SSE_ALIGN_16 int64_t atmp[2];
8676     uint32x2_t res;
8677     int64_t res64;
8678     _mm_store_si128((__m128i*)atmp, a);
8679     if (atmp[0] < 0) {
8680         res.m64_u32[0] = 0;
8681     } else {
8682         res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
8683         res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8684     }
8685     if (atmp[1] < 0) {
8686         res.m64_u32[1] = 0;
8687     } else {
8688         res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
8689         res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8690     }
8691     return res;
8692 }
8693 
8694 //***** Vector narrowing saturating shift right by constant ******
8695 //*****************************************************************
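//Illustrative behaviour of the signed variants (made-up lane values): the shifted value is clamped to the
//narrower signed range. For vqshrn_n_s16 with b = 4, 32767 >> 4 = 2047 saturates to 127
//and -32768 >> 4 = -2048 saturates to -128.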
8696 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
8697 _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
8698 {
8699     int8x8_t res64;
8700     __m128i r16;
8701     r16  = vshrq_n_s16(a,b);
8702     r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8703     return64(r16);
8704 }
8705 
8706 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
8707 _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
8708 {
8709     int16x4_t res64;
8710     __m128i r32;
8711     r32  = vshrq_n_s32(a,b);
8712     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
8713     return64(r32);
8714 }
8715 
8716 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
8717 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8718 {
8719     //no optimal SIMD solution found
8720     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
8721     int32x2_t res;
8722     _mm_store_si128((__m128i*)atmp, a);
8723     res64[0] = (atmp[0] >> b);
8724     res64[1] = (atmp[1] >> b);
8725     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8726     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8727     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8728     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8729     res.m64_i32[0] = (int32_t)res64[0];
8730     res.m64_i32[1] = (int32_t)res64[1];
8731     return res;
8732 }
8733 
8734 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
8735 _NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
8736 {
8737     uint8x8_t res64;
8738     __m128i r16;
8739     r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8740     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8741     return64(r16);
8742 }
8743 
8744 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
8745 _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
8746 {
8747     uint16x4_t res64;
8748     __m128i r32;
8749     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8750     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8751     return64(r32);
8752 }
8753 
8754 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
8755 _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8756 {
8757     //serial solution may be faster
8758     uint32x2_t res64;
8759     __m128i r64, res_hi, zero;
8760     zero = _mm_setzero_si128();
8761     r64  = vshrq_n_u64(a,b);
8762     res_hi = _mm_srli_epi64(r64,  32);
8763     res_hi = _mm_cmpgt_epi32(res_hi, zero);
8764     r64 = _mm_or_si128(r64, res_hi);
8765     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8766     return64(r64);
8767 }
8768 
8769 
8770 //********* Vector rounding narrowing shift right by constant *************************
8771 //****************************************************************************************
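//The rounding variants add the "round to nearest" bias before shifting: each lane becomes
//(a + (1 << (b - 1))) >> b and then the low half is taken. Illustrative arithmetic: with b = 2
//a lane of 7 gives (7 + 2) >> 2 = 2, whereas the plain vshrn_n_* above would give 7 >> 2 = 1.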
8772 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8773 _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8774 {
8775     int8x8_t res64;
8776     __m128i r16;
8777      r16  = vrshrq_n_s16(a,b);
8778     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8779     return64(r16);
8780 }
8781 
8782 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8783 _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8784 {
8785     int16x4_t res64;
8786     __m128i r32;
8787     r32  = vrshrq_n_s32(a,b);
8788     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8789     return64(r32);
8790 }
8791 
8792 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8793 _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8794 {
8795     int32x2_t res64;
8796     __m128i r64;
8797     r64  = vrshrq_n_s64(a,b);
8798     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8799     return64(r64);
8800 }
8801 
8802 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8803 _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8804 {
8805     uint8x8_t res64;
8806     __m128i mask, r16;
8807     mask = _mm_set1_epi16(0xff);
8808     r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8809     r16 = _mm_and_si128(r16, mask); //to avoid saturation
8810     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8811     return64(r16);
8812 }
8813 
8814 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8815 _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8816 {
8817     uint16x4_t res64;
8818     __m128i mask, r32;
8819     mask = _mm_set1_epi32(0xffff);
8820     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8821     r32 = _mm_and_si128(r32, mask); //to avoid saturation
8822     r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8823     return64(r32);
8824 }
8825 
8826 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8827 _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
8828 {
8829     uint32x2_t res64;
8830     __m128i r64;
8831     r64  = vrshrq_n_u64(a,b);
8832     r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8833     return64(r64);
8834 }
8835 
8836 //************* Vector rounding narrowing saturating shift right by constant ************
8837 //****************************************************************************************
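//These combine rounding and saturation: each lane is rounded as (a + (1 << (b - 1))) >> b and the result
//is then clamped to the narrower range. Illustrative arithmetic: vqrshrn_n_s16 with b = 1 turns an int16
//lane of 255 into (255 + 1) >> 1 = 128, which exceeds the int8 maximum and saturates to 127.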
8838 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
8839 _NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
8840 {
8841     int8x8_t res64;
8842     __m128i r16;
8843     r16  = vrshrq_n_s16(a,b);
8844     r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8845     return64(r16);
8846 }
8847 
8848 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
8849 _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
8850 {
8851     int16x4_t res64;
8852     __m128i r32;
8853     r32  = vrshrq_n_s32(a,b);
8854     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
8855     return64(r32);
8856 }
8857 
8858 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
8859 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8860 {
8861     //no optimal SIMD solution found
8862     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
8863     int32x2_t res;
8864     _mm_store_si128((__m128i*)atmp, a);
8865     maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
8866     res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
8867     maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
8868     res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
8869     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8870     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8871     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8872     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8873     res.m64_i32[0] = (int32_t)res64[0];
8874     res.m64_i32[1] = (int32_t)res64[1];
8875     return res;
8876 }
8877 
8878 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
8879 _NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
8880 {
8881     uint8x8_t res64;
8882     __m128i r16;
8883     r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8884     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8885     return64(r16);
8886 }
8887 
8888 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
8889 _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
8890 {
8891     uint16x4_t res64;
8892     __m128i r32;
8893     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8894     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8895     return64(r32);
8896 }
8897 
8898 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
8899 _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8900 {
8901     //serial solution may be faster
8902     uint32x2_t res64;
8903     __m128i r64, res_hi, zero;
8904     zero = _mm_setzero_si128();
8905     r64  = vrshrq_n_u64(a,b);
8906     res_hi = _mm_srli_epi64(r64,  32);
8907     res_hi = _mm_cmpgt_epi32(res_hi, zero);
8908     r64 = _mm_or_si128(r64, res_hi);
8909     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8910     return64(r64);
8911 }
8912 
8913 //************** Vector widening shift left by constant ****************
8914 //************************************************************************
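//Widening first extends each lane to the doubled width and only then shifts, so no bits are lost even for
//the maximal shift amount. Illustrative arithmetic: vshll_n_u8 with b = 3 turns a uint8 lane of 200 into
//the uint16 value 1600, and vshll_n_s8 with b = 4 turns -3 into the int16 value -48.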
8915 _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
8916 _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
8917 {
8918     __m128i r;
8919     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8920     return _mm_slli_epi16 (r, b);
8921 }
8922 
8923 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
8924 _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
8925 {
8926     __m128i r;
8927     r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
8928     return _mm_slli_epi32 (r, b);
8929 }
8930 
8931 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
8932 _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
8933 {
8934     __m128i r;
8935     r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
8936     return _mm_slli_epi64 (r, b);
8937 }
8938 
8939 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
8940 _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
8941 {
8942     //no uint8 to uint16 conversion available, manual conversion used
8943     __m128i zero,  r;
8944     zero = _mm_setzero_si128 ();
8945     r = _mm_unpacklo_epi8(_pM128i(a), zero);
8946     return _mm_slli_epi16 (r, b);
8947 }
8948 
8949 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
8950 _NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
8951 {
8952     //no uint16 to uint32 conversion available, manual conversion used
8953     __m128i zero,  r;
8954     zero = _mm_setzero_si128 ();
8955     r = _mm_unpacklo_epi16(_pM128i(a), zero);
8956     return _mm_slli_epi32 (r, b);
8957 }
8958 
8959 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
8960 _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
8961 {
8962     //no uint32 to uint64 conversion available, manual conversion used
8963     __m128i zero,  r;
8964     zero = _mm_setzero_si128 ();
8965     r = _mm_unpacklo_epi32(_pM128i(a), zero);
8966     return _mm_slli_epi64 (r, b);
8967 }
8968 
8969 //************************************************************************************
8970 //**************************** Shifts with insert ************************************
8971 //************************************************************************************
8972 //takes each element in a vector, shifts it by an immediate value,
8973 //and inserts the result into the destination vector. Bits shifted out of each element are lost.
8974 
8975 //**************** Vector shift right and insert ************************************
8976 //Actually the "c" leftmost bits of "a" are the only bits remaining from "a" after the shift.
8977 //All other bits are taken from the shifted "b".
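//Bit-level illustration (made-up lane values) for VSRI.8 with c = 4: for a lane pair a = 0xAB, b = 0xCD
//the result keeps the top 4 bits of "a" (0xA0) and takes the rest from "b" shifted right (0xCD >> 4 = 0x0C),
//giving 0xAC.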
8978 _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
8979 _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
8980 {
8981     int8x8_t res64;
8982     return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
8983 }
8984 
8985 
8986 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
8987 _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
8988 {
8989     int16x4_t res64;
8990     return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
8991 }
8992 
8993 
8994 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
8995 _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
8996 {
8997     int32x2_t res64;
8998     return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
8999 }
9000 
9001 
9002 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9003 _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
9004 {
9005     int64x1_t res;
9006     if (c ==64)
9007         res = a;
9008     else{
9009         res.m64_u64[0] = (b.m64_u64[0] >> c) | ((a.m64_u64[0] >> (64 - c)) << (64 - c)); //treat a and b as unsigned: the logical shifts keep only the top "c" bits of a and give leading zeros in b
9010     }
9011     return res;
9012 }
9013 
9014 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9015 #define vsri_n_u8 vsri_n_s8
9016 
9017 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9018 #define vsri_n_u16 vsri_n_s16
9019 
9020 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9021 #define vsri_n_u32 vsri_n_s32
9022 
9023 
9024 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9025 #define vsri_n_u64 vsri_n_s64
9026 
9027 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9028 #define vsri_n_p8 vsri_n_u8
9029 
9030 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9031 #define vsri_n_p16 vsri_n_u16
9032 
9033 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9034 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
9035 {
9036     __m128i maskA, a_masked;
9037     uint8x16_t b_shift;
9038     _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
9039     maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
9040     a_masked = _mm_and_si128 (a, maskA);
9041     b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
9042     return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
9043 }
9044 
9045 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9046 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
9047 {
9048     //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
9049     uint16x8_t b_shift;
9050     uint16x8_t a_c;
9051     b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
9052     a_c = vshrq_n_u16( a, (16 - c));
9053     a_c  = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
9054     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9055 }
9056 
9057 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9058 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
9059 {
9060     //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
9061     uint32x4_t b_shift;
9062     uint32x4_t a_c;
9063     b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
9064     a_c = vshrq_n_u32( a, (32 - c));
9065     a_c  = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
9066     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9067 }
9068 
9069 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9070 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
9071 {
9072     //serial solution may be faster
9073     uint64x2_t b_shift;
9074     uint64x2_t a_c;
9075     b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
9076     a_c = _mm_srli_epi64(a, (64 - c));
9077     a_c  = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
9078     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9079 }
9080 
9081 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9082 #define vsriq_n_u8 vsriq_n_s8
9083 
9084 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9085 #define vsriq_n_u16 vsriq_n_s16
9086 
9087 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9088 #define vsriq_n_u32 vsriq_n_s32
9089 
9090 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9091 #define vsriq_n_u64 vsriq_n_s64
9092 
9093 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9094 #define vsriq_n_p8 vsriq_n_u8
9095 
9096 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9097 #define vsriq_n_p16 vsriq_n_u16
9098 
9099 //***** Vector shift left and insert *********************************************
9100 //*********************************************************************************
9101 //Actually the "c" rightmost bits of "a" are the only bits remaining from "a" after the shift.
9102 //All other bits are taken from the shifted "b". Trailing zeros are inserted into "b" during the shift, so we need to combine "a" and the shifted "b".
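//Bit-level illustration (made-up lane values) for VSLI.8 with c = 4: for a lane pair a = 0xAB, b = 0xCD
//the result keeps the low 4 bits of "a" (0x0B) and takes the rest from "b" shifted left (0xCD << 4 = 0xD0),
//giving 0xDB.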
9103 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9104 _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
9105 {
9106     int8x8_t res64;
9107     return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9108 }
9109 
9110 
9111 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9112 _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
9113 {
9114     int16x4_t res64;
9115     return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9116 }
9117 
9118 
9119 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9120 _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
9121 {
9122     int32x2_t res64;
9123     return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
9124 }
9125 
9126 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9127 _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
9128 {
9129     int64x1_t res;
9130     res.m64_u64[0] = (c == 0) ? b.m64_u64[0] : ((b.m64_u64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c))); //c == 0 keeps all bits of b (a shift by 64 would be undefined); treat a as unsigned to get leading zeros
9131     return res;
9132 }
9133 
9134 
9135 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9136 #define vsli_n_u8 vsli_n_s8
9137 
9138 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9139 #define vsli_n_u16 vsli_n_s16
9140 
9141 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9142 #define vsli_n_u32 vsli_n_s32
9143 
9144 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9145 #define vsli_n_u64 vsli_n_s64
9146 
9147 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9148 #define vsli_n_p8 vsli_n_u8
9149 
9150 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9151 #define vsli_n_p16 vsli_n_u16
9152 
9153 _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9154 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
9155 {
9156     __m128i maskA, a_masked;
9157     int8x16_t b_shift;
9158     _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
9159     maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
9160     b_shift = vshlq_n_s8( b, c);
9161     a_masked = _mm_and_si128 (a, maskA);
9162     return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
9163 }
9164 
9165 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9166 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
9167 {
9168     //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
9169     int16x8_t b_shift;
9170     int16x8_t a_c;
9171     b_shift = vshlq_n_s16( b, c);
9172     a_c = vshlq_n_s16( a, (16 - c));
9173     a_c  = _mm_srli_epi16(a_c, (16 - c));
9174     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9175 }
9176 
9177 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9178 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
9179 {
9180     //this solution may not be optimal compared with the serial one
9181     //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
9182     int32x4_t b_shift;
9183     int32x4_t a_c;
9184     b_shift = vshlq_n_s32( b, c);
9185     a_c = vshlq_n_s32( a, (32 - c));
9186     a_c  = _mm_srli_epi32(a_c, (32 - c));
9187     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9188 }
9189 
9190 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9191 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
9192 {
9193     //this solution may not be optimal compared with the serial one
9194     //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
9195     int64x2_t b_shift;
9196     int64x2_t a_c;
9197     b_shift = vshlq_n_s64( b, c);
9198     a_c = vshlq_n_s64( a, (64 - c));
9199     a_c  = _mm_srli_epi64(a_c, (64 - c));
9200     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9201 }
9202 
9203 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9204 #define vsliq_n_u8 vsliq_n_s8
9205 
9206 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9207 #define vsliq_n_u16 vsliq_n_s16
9208 
9209 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9210 #define vsliq_n_u32 vsliq_n_s32
9211 
9212 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9213 #define vsliq_n_u64 vsliq_n_s64
9214 
9215 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9216 #define vsliq_n_p8 vsliq_n_u8
9217 
9218 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9219 #define vsliq_n_p16 vsliq_n_u16
9220 
9221 // ***********************************************************************************************
9222 // ****************** Loads and stores of a single vector ***************************************
9223 // ***********************************************************************************************
9224 //Performs loads and stores of a single vector of some type.
9225 //*******************************  Loads ********************************************************
9226 // ***********************************************************************************************
9227 //We assume ptr is NOT aligned in the general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
9228 //Also, for SSE3-supporting systems, using __m128i _mm_lddqu_si128 (__m128i const* p) for unaligned access may be advantageous:
9229 //it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
9230 //If ptr is aligned then __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead;
9231 #define LOAD_SI128(ptr) \
9232         ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
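//Illustrative use of the macro above (the "buf" array is just an example name, not part of this header):
//    uint8_t buf[32];
//    uint8x16_t v = vld1q_u8(buf); //the address is checked at run time: _mm_load_si128 is used if buf
//                                  //happens to be 16-byte aligned, _mm_loadu_si128 otherwise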
9233 
9234 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9235 #define vld1q_u8 LOAD_SI128
9236 
9237 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9238 #define vld1q_u16 LOAD_SI128
9239 
9240 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9241 #define vld1q_u32 LOAD_SI128
9242 
9243 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9244 #define vld1q_u64 LOAD_SI128
9245 
9246 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9247 #define vld1q_s8 LOAD_SI128
9248 
9249 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9250 #define vld1q_s16 LOAD_SI128
9251 
9252 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9253 #define vld1q_s32 LOAD_SI128
9254 
9255 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9256 #define vld1q_s64 LOAD_SI128
9257 
9258 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
9259 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
9260 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
9261 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9262 __m128 f2;
9263 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
9264 }*/
9265 
9266 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9267 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
9268 {
9269     if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
9270         return _mm_load_ps(ptr);
9271     else
9272         return _mm_loadu_ps(ptr);
9273 }
9274 
9275 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9276 #define vld1q_p8  LOAD_SI128
9277 
9278 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9279 #define vld1q_p16 LOAD_SI128
9280 
9281 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
9282 #define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
9283 
9284 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
9285 #define vld1_u16 vld1_u8
9286 
9287 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
9288 #define vld1_u32 vld1_u8
9289 
9290 
9291 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9292 #define vld1_u64 vld1_u8
9293 
9294 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
9295 #define vld1_s8 vld1_u8
9296 
9297 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
9298 #define vld1_s16 vld1_u16
9299 
9300 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
9301 #define vld1_s32 vld1_u32
9302 
9303 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9304 #define vld1_s64 vld1_u64
9305 
9306 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
9307 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9308 
9309 _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
9310 _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
9311 {
9312     float32x2_t res;
9313     res.m64_f32[0] = *(ptr);
9314     res.m64_f32[1] = *(ptr + 1);
9315     return res;
9316 }
9317 
9318 _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
9319 #define vld1_p8 vld1_u8
9320 
9321 _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
9322 #define vld1_p16 vld1_u16
9323 
9324 
9325 _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9326 _NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr)
9327 {
9328     if ((((uintptr_t)(ptr)) & 15) == 0) //16-byte aligned
9329         return _mm_load_pd(ptr);
9330     else
9331         return _mm_loadu_pd(ptr);
9332 }
9333 
9334 
9335 //***********************************************************************************************************
9336 //******* Lane load functions - insert the data at  vector's given position (lane) *************************
9337 //***********************************************************************************************************
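//Typical use (the "p" and "v" names are just examples): vld1q_lane_s32(p, v, 2) returns "v" with
//lane 2 replaced by *p while all other lanes are kept unchanged.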
9338 _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9339 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9340 
9341 _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9342 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9343 
9344 _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9345 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9346 
9347 _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9348 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9349 
9350 
9351 _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9352 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9353 
9354 _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9355 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9356 
9357 _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9358 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9359 
9360 _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9361 //current IA SIMD doesn't support float16
9362 
9363 _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9364 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
9365 {
9366     //we need to deal with the case when ptr is NOT 16-byte aligned
9367     __m128 p;
9368     p = _mm_set1_ps(*(ptr));
9369     return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
9370 }
9371 
9372 _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9373 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9374 
9375 _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9376 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9377 
9378 _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9379 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9380 
9381 _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9382 _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
9383 {
9384     uint8x8_t res;
9385     res = vec;
9386     res.m64_u8[lane] = *(ptr);
9387     return res;
9388 }
9389 
9390 _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9391 _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
9392 {
9393     uint16x4_t res;
9394     res = vec;
9395     res.m64_u16[lane] = *(ptr);
9396     return res;
9397 }
9398 
9399 _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9400 _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
9401 {
9402     uint32x2_t res;
9403     res = vec;
9404     res.m64_u32[lane] = *(ptr);
9405     return res;
9406 }
9407 
9408 _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9409 _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
9410 {
9411     uint64x1_t res;
9412     res.m64_u64[0] = *(ptr);
9413     return res;
9414 }
9415 
9416 
9417 _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9418 #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
9419 
9420 _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9421 #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
9422 
9423 _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9424 #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
9425 
9426 _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9427 //current IA SIMD doesn't support float16
9428 
9429 _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9430 _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
9431 {
9432     float32x2_t res;
9433     res = vec;
9434     res.m64_f32[lane] = *(ptr);
9435     return res;
9436 }
9437 
9438 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9439 #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
9440 
9441 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9442 #define vld1_lane_p8 vld1_lane_u8
9443 
9444 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9445 #define vld1_lane_p16 vld1_lane_s16
9446 
9447 // ****************** Load single value (set all lanes of a vector with the same value from memory) **********************
9448 // ******************************************************************************************************************
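//Typical use (the "p" name is just an example): vld1q_dup_u16(p) reads the single value *p and
//broadcasts it to all 8 lanes of the result; the q-forms simply map to _mm_set1_* as below.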
9449 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9450 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
9451 
9452 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9453 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9454 
9455 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9456 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9457 
9458 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9459 _NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
9460 {
9461     _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
9462     return LOAD_SI128(val);
9463 }
9464 
9465 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9466 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9467 
9468 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9469 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9470 
9471 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9472 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9473 
9474 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9475 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9476 
9477 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9478 //current IA SIMD doesn't support float16, need to go to 32 bits
9479 
9480 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9481 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9482 
9483 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9484 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9485 
9486 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9487 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
9488 
9489 _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9490 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9491 {
9492     uint8x8_t res;
9493     int i;
9494     for(i = 0; i<8; i++) {
9495         res.m64_u8[i] =  *(ptr);
9496     }
9497     return res;
9498 }
9499 
9500 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9501 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9502 {
9503     uint16x4_t res;
9504     int i;
9505     for(i = 0; i<4; i++) {
9506         res.m64_u16[i] =  *(ptr);
9507     }
9508     return res;
9509 }
9510 
9511 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9512 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9513 {
9514     uint32x2_t res;
9515     res.m64_u32[0] = *(ptr);
9516     res.m64_u32[1] = *(ptr);
9517     return res;
9518 }
9519 
9520 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9521 _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
9522 {
9523     uint64x1_t res;
9524     res.m64_u64[0] = *(ptr);
9525     return res;
9526 }
9527 
9528 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9529 #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
9530 
9531 
9532 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9533 #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
9534 
9535 
9536 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9537 #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
9538 
9539 
9540 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9541 #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
9542 
9543 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9544 //current IA SIMD doesn't support float16
9545 
9546 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9547 _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
9548 {
9549     float32x2_t res;
9550     res.m64_f32[0] = *(ptr);
9551     res.m64_f32[1] = res.m64_f32[0];
9552     return res; // use last 64bits only
9553 }
9554 
9555 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9556 #define vld1_dup_p8 vld1_dup_u8
9557 
9558 
9559 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9560 #define vld1_dup_p16 vld1_dup_u16
9561 
9562 
9563 //*************************************************************************************
9564 //********************************* Store **********************************************
9565 //*************************************************************************************
9566 // If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
9567 //here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro
9568 #define STORE_SI128(ptr, val) \
9569         (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
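//Illustrative use of the macro above (the "dst" and "val" names are just examples, not part of this header):
//    vst1q_u32(dst, val); //aligned and unaligned dst are both handled by the run-time check
//    //_mm_stream_si128((__m128i*)dst, val); //possible alternative for a 16-byte aligned dst when bypassing the cache is desired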
9570 
9571 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
9572 #define vst1q_u8 STORE_SI128
9573 
9574 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
9575 #define vst1q_u16 STORE_SI128
9576 
9577 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
9578 #define vst1q_u32 STORE_SI128
9579 
9580 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
9581 #define vst1q_u64 STORE_SI128
9582 
9583 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
9584 #define vst1q_s8 STORE_SI128
9585 
9586 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
9587 #define vst1q_s16 STORE_SI128
9588 
9589 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
9590 #define vst1q_s32 STORE_SI128
9591 
9592 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
9593 #define vst1q_s64 STORE_SI128
9594 
9595 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
9596 // IA32 SIMD doesn't work with 16bit floats currently
9597 
9598 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
9599 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
9600 {
9601     if( ((uintptr_t)(ptr) & 15)  == 0 ) //16-byte aligned
9602         _mm_store_ps (ptr, val);
9603     else
9604         _mm_storeu_ps (ptr, val);
9605 }
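
//Usage sketch (illustrative only, not compiled): the same run-time alignment dispatch applied to floats
#if 0
    float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float32x4_t q = vld1q_f32(buf);      //load intrinsic defined earlier in this file
    vst1q_f32(buf, q);                   //stores via _mm_store_ps or _mm_storeu_ps depending on the address of buf
#endif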
9606 
9607 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
9608 #define vst1q_p8  vst1q_u8
9609 
9610 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
9611 #define vst1q_p16 vst1q_u16
9612 
9613 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
9614 _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
9615 {
9616     int i;
9617     for (i = 0; i<8; i++) {
9618         *(ptr + i) = ((uint8_t*)&val)[i];
9619     }
9620     //_mm_storel_epi64((__m128i*)ptr, val);
9621     return;
9622 }
9623 
9624 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
9625 _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
9626 {
9627     int i;
9628     for (i = 0; i<4; i++) {
9629         *(ptr + i) = ((uint16_t*)&val)[i];
9630     }
9631     //_mm_storel_epi64((__m128i*)ptr, val);
9632     return;
9633 }
9634 
9635 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
9636 _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
9637 {
9638     int i;
9639     for (i = 0; i<2; i++) {
9640         *(ptr + i) = ((uint32_t*)&val)[i];
9641     }
9642     //_mm_storel_epi64((__m128i*)ptr, val);
9643     return;
9644 }
9645 
9646 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
9647 _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
9648 {
9649     *(ptr) = *((uint64_t*)&val);
9650     //_mm_storel_epi64((__m128i*)ptr, val);
9651     return;
9652 }
9653 
9654 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
9655 #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
9656 
9657 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
9658 #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
9659 
9660 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
9661 #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
9662 
9663 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
9664 #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
9665 
9666 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
9667 //current IA SIMD doesn't support float16
9668 
9669 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
9670 _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
9671 {
9672     *(ptr) =   val.m64_f32[0];
9673     *(ptr + 1) = val.m64_f32[1];
9674     return;
9675 }
9676 
9677 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
9678 #define vst1_p8 vst1_u8
9679 
9680 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
9681 #define vst1_p16 vst1_u16
9682 
9683 //***********Store a lane of a vector into memory (extract given lane) *********************
9684 //******************************************************************************************
9685 _NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9686 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
9687 
9688 _NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9689 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
9690 
9691 _NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9692 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
9693 
9694 _NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9695 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
9696 
9697 _NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9698 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
9699 
9700 _NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9701 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
9702 
9703 _NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9704 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9705 
9706 _NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9707 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9708 
9709 _NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9710 //current IA SIMD doesn't support float16
9711 
9712 _NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9713 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
9714 {
9715     int32_t ilane;
9716     ilane = _MM_EXTRACT_PS(val,lane);
9717     *(ptr) =  *((float*)&ilane);
9718 }
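
//Usage sketch (illustrative only, not compiled): writing a single lane back to memory
#if 0
    float32x4_t q = vdupq_n_f32(3.5f);
    float one_value;
    vst1q_lane_f32(&one_value, q, 2);    //one_value == 3.5f; the lane index must be a compile-time constant in 0..3
#endif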
9719 
9720 _NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9721 #define vst1q_lane_p8   vst1q_lane_u8
9722 
9723 _NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9724 #define vst1q_lane_p16   vst1q_lane_s16
9725 
9726 _NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9727 _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
9728 {
9729     *(ptr) = val.m64_u8[lane];
9730 }
9731 
9732 _NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9733 _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
9734 {
9735     *(ptr) = val.m64_u16[lane];
9736 }
9737 
9738 _NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9739 _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
9740 {
9741     *(ptr) = val.m64_u32[lane];
9742 }
9743 
9744 _NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9745 _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
9746 {
9747     *(ptr) = val.m64_u64[0];
9748 }
9749 
9750 _NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9751 #define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
9752 
9753 _NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9754 #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
9755 
9756 _NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9757 #define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
9758 
9759 
9760 _NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9761 #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
9762 
9763 
9764 _NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9765 //current IA SIMD doesn't support float16
9766 
9767 _NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9768 _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
9769 {
9770     *(ptr) = val.m64_f32[lane];
9771 }
9772 
9773 _NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9774 #define vst1_lane_p8 vst1_lane_u8
9775 
9776 _NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9777 #define vst1_lane_p16 vst1_lane_s16
9778 
9779 //***********************************************************************************************
9780 //**************** Loads and stores of an N-element structure **********************************
9781 //***********************************************************************************************
9782 //These intrinsics load or store an n-element structure. The array structures are defined at the beginning of this file.
9783 //We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions above
9784 //****************** 2 elements load  *********************************************
9785 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9786 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
9787 {
9788     uint8x16x2_t v;
9789     v.val[0] = vld1q_u8(ptr);
9790     v.val[1] = vld1q_u8((ptr + 16));
9791     v = vuzpq_s8(v.val[0], v.val[1]);
9792     return v;
9793 }
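
//Usage sketch (illustrative only, not compiled): de-interleaving 16 interleaved {u,v} byte pairs
#if 0
    uint8_t uv[32];                      //u0,v0, u1,v1, ... u15,v15
    uint8x16x2_t d = vld2q_u8(uv);       //d.val[0] = u0..u15,  d.val[1] = v0..v15
#endif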
9794 
9795 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9796 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
9797 {
9798     uint16x8x2_t v;
9799     v.val[0] = vld1q_u16( ptr);
9800     v.val[1] = vld1q_u16( (ptr + 8));
9801     v = vuzpq_s16(v.val[0], v.val[1]);
9802     return v;
9803 }
9804 
9805 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9806 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9807 {
9808     uint32x4x2_t v;
9809     v.val[0] = vld1q_u32 ( ptr);
9810     v.val[1] = vld1q_u32 ( (ptr + 4));
9811     v = vuzpq_s32(v.val[0], v.val[1]);
9812     return v;
9813 }
9814 
9815 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
9816 #define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
9817 
9818 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9819 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
9820 
9821 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9822 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
9823 
9824 
9825 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
9826 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
9827 
9828 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9829 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9830 {
9831     float32x4x2_t v;
9832     v.val[0] =  vld1q_f32 (ptr);
9833     v.val[1] =  vld1q_f32 ((ptr + 4));
9834     v = vuzpq_f32(v.val[0], v.val[1]);
9835     return v;
9836 }
9837 
9838 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9839 #define  vld2q_p8 vld2q_u8
9840 
9841 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9842 #define vld2q_p16 vld2q_u16
9843 
9844 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9845 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
9846 {
9847     uint8x8x2_t v;
9848     __m128i ld128;
9849     ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
9850     ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
9851     vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
9852     return v;
9853 }
9854 
9855 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9856 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
9857 {
9858     _NEON2SSE_ALIGN_16 uint16x4x2_t v;
9859     __m128i ld128;
9860     ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
9861     ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
9862     vst1q_u16((v.val), ld128);
9863     return v;
9864 }
9865 
9866 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9867 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
9868 {
9869     _NEON2SSE_ALIGN_16 uint32x2x2_t v;
9870     __m128i ld128;
9871     ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
9872     ld128 = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
9873     vst1q_u32((v.val), ld128);
9874     return v;
9875 }
9876 
9877 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9878 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
9879 {
9880     uint64x1x2_t v;
9881     v.val[0].m64_u64[0] = *(ptr);
9882     v.val[1].m64_u64[0] = *(ptr + 1);
9883     return v;
9884 }
9885 
9886 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9887 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
9888 
9889 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9890 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
9891 
9892 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9893 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
9894 
9895 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9896 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
9897 
9898 _NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
9899 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
9900 
9901 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9902 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
9903 {
9904     float32x2x2_t v;
9905     v.val[0].m64_f32[0] = *(ptr);
9906     v.val[0].m64_f32[1] = *(ptr + 2);
9907     v.val[1].m64_f32[0] = *(ptr + 1);
9908     v.val[1].m64_f32[1] = *(ptr + 3);
9909     return v;
9910 }
9911 
9912 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9913 #define vld2_p8 vld2_u8
9914 
9915 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9916 #define vld2_p16 vld2_u16
9917 
9918 //******************** Triplets ***************************************
9919 //*********************************************************************
9920 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
9921 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
9922 {
9923     //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
9924     //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
9925     //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
9926     //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
9927     uint8x16x3_t v;
9928     __m128i tmp0, tmp1,tmp2, tmp3;
9929     _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
9930     _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
9931     _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
9932 
9933     v.val[0] =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
9934     v.val[1] =  vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
9935     v.val[2] =  vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
9936 
9937     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
9938     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
9939     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
9940 
9941     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
9942     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
9943     tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
9944     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
9945     v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
9946     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
9947 
9948     tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
9949     tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
9950     v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
9951     v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
9952     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
9953     v.val[1] =  _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
9954     v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
9955     tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
9956     tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
9957     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
9958 
9959     tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
9960     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
9961     v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
9962     v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
9963     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
9964     tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
9965     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
9966     return v;
9967 }
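
//Usage sketch (illustrative only, not compiled): splitting 16 packed RGB pixels into R, G and B planes
#if 0
    uint8_t rgb[48];                     //r0,g0,b0, r1,g1,b1, ... r15,g15,b15
    uint8x16x3_t pix = vld3q_u8(rgb);    //pix.val[0] = r0..r15, pix.val[1] = g0..g15, pix.val[2] = b0..b15
#endif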
9968 
9969 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
9970 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
9971 {
9972     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
9973     uint16x8x3_t v;
9974     __m128i tmp0, tmp1,tmp2, tmp3;
9975     _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
9976     _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
9977     _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
9978 
9979     v.val[0] =  vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
9980     v.val[1] =  vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
9981     v.val[2] =  vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
9982 
9983     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
9984     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
9985     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
9986 
9987     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
9988     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
9989     tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
9990     tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
9991     v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
9992     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
9993 
9994     tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
9995     tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
9996     v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
9997     v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
9998     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
9999     v.val[1] =  _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
10000     v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
10001     tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
10002     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
10003     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
10004 
10005     tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
10006     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
10007     v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
10008     v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
10009     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
10010     tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
10011     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
10012     return v;
10013 }
10014 
10015 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10016 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10017 {
10018     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
10019     uint32x4x3_t v;
10020     __m128i tmp0, tmp1,tmp2, tmp3;
10021     v.val[0] =  vld1q_u32 (ptr); //a0,a1,a2,a3,
10022     v.val[1] =  vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
10023     v.val[2] =  vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
10024 
10025     tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
10026     tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
10027     tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
10028 
10029     tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
10030     v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
10031     tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
10032     v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
10033     v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
10034     v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
10035     return v;
10036 }
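
//Usage sketch (illustrative only, not compiled): de-interleaving four packed {x,y,z} 32-bit triplets
#if 0
    uint32_t xyz[12];                    //x0,y0,z0, x1,y1,z1, x2,y2,z2, x3,y3,z3
    uint32x4x3_t c = vld3q_u32(xyz);     //c.val[0] = x0..x3, c.val[1] = y0..y3, c.val[2] = z0..z3
#endif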
10037 
10038 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10039 #define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
10040 
10041 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10042 #define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
10043 
10044 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10045 #define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
10046 
10047 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10048 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10049 
10050 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10051 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10052 {
10053     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
10054     float32x4x3_t v;
10055     __m128 tmp0, tmp1,tmp2, tmp3;
10056     v.val[0] =  vld1q_f32 (ptr); //a0,a1,a2,a3,
10057     v.val[1] =  vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
10058     v.val[2] =  vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
10059 
10060     tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
10061     tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
10062     tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
10063     tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
10064 
10065     v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
10066     tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
10067     v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
10068     v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
10069     v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
10070     return v;
10071 }
10072 
10073 _NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10074 #define vld3q_p8 vld3q_u8
10075 
10076 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10077 #define vld3q_p16 vld3q_u16
10078 
10079 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10080 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
10081 {
10082     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10083     uint8x8x3_t v;
10084     __m128i val0, val1, val2, tmp0, tmp1;
10085     _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
10086     _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
10087     val0 =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
10088     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
10089 
10090     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
10091     tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
10092     val0 = _mm_slli_si128(tmp0,10);
10093     val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
10094     val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
10095     val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
10096     _M64(v.val[0], val0);
10097     val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
10098     val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
10099     val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
10100     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
10101     val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
10102     _M64(v.val[1], val1);
10103 
10104     tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
10105     val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
10106     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
10107     val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
10108     _M64(v.val[2], val2);
10109     return v;
10110 }
10111 
10112 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10113 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
10114 {
10115     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
10116     uint16x4x3_t v;
10117     __m128i val0, val1, val2, tmp0, tmp1;
10118     _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10119     val0 =  vld1q_u16 (ptr); //a0,a1,a2,a3,  b0,b1,b2,b3
10120     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
10121 
10122     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
10123     tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01     : c1, c2, c0, c3,
10124     val0 = _mm_slli_si128(tmp0,10);
10125     val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
10126     val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
10127     val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
10128     val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
10129     _M64(v.val[0], val0);
10130 
10131     val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
10132     val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
10133     val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
10134     val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
10135     val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
10136     _M64(v.val[1], val1);
10137 
10138     tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
10139     tmp1 = _mm_srli_si128(tmp1,4);
10140     tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
10141     val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
10142     _M64(v.val[2], val2);
10143     return v;
10144 }
10145 
10146 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10147 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
10148 {
10149     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
10150     uint32x2x3_t v;
10151     __m128i val0, val1, val2;
10152     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
10153     val2 =   _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
10154 
10155     val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
10156     _M64(v.val[0], val0);
10157     val2 =  _mm_slli_si128(val2, 8); //x, x,c0,c1,
10158     val1 =  _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
10159     _M64(v.val[1], val1);
10160     val2 =  _mm_srli_si128(val1, 8); //b0, c1, x, x,
10161     _M64(v.val[2], val2);
10162     return v;
10163 }
10164 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10165 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10166 {
10167     uint64x1x3_t v;
10168     v.val[0].m64_u64[0] = *(ptr);
10169     v.val[1].m64_u64[0] = *(ptr + 1);
10170     v.val[2].m64_u64[0] = *(ptr + 2);
10171     return v;
10172 }
10173 
10174 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10175 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
10176 
10177 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10178 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
10179 
10180 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10181 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
10182 
10183 _NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10184 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
10185 
10186 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10187 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10188 
10189 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10190 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
10191 {
10192     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
10193     float32x2x3_t v;
10194     v.val[0].m64_f32[0] = *(ptr);
10195     v.val[0].m64_f32[1] = *(ptr + 3);
10196 
10197     v.val[1].m64_f32[0] = *(ptr + 1);
10198     v.val[1].m64_f32[1] = *(ptr + 4);
10199 
10200     v.val[2].m64_f32[0] = *(ptr + 2);
10201     v.val[2].m64_f32[1] = *(ptr + 5);
10202     return v;
10203 }
10204 
10205 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10206 #define vld3_p8 vld3_u8
10207 
10208 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10209 #define vld3_p16 vld3_u16
10210 
10211 //***************  Quadruples load ********************************
10212 //*****************************************************************
10213 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10214 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
10215 {
10216     uint8x16x4_t v;
10217     __m128i tmp3, tmp2, tmp1, tmp0;
10218 
10219     v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
10220     v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
10221     v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
10222     v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
10223 
10224     tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
10225     tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
10226     tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
10227     tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
10228 
10229     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
10230     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
10231     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
10232     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
10233 
10234     tmp0 =  _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
10235     tmp1 =  _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
10236     tmp2 =  _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
10237     tmp3 =  _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
10238 
10239     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
10240     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
10241     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
10242     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
10243     return v;
10244 }
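
//Usage sketch (illustrative only, not compiled): splitting 16 packed RGBA pixels into four colour planes
#if 0
    uint8_t rgba[64];                    //r0,g0,b0,a0, r1,g1,b1,a1, ...
    uint8x16x4_t pix = vld4q_u8(rgba);   //pix.val[0] = r0..r15, ... pix.val[3] = a0..a15
#endif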
10245 
10246 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10247 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
10248 {
10249     uint16x8x4_t v;
10250     __m128i tmp3, tmp2, tmp1, tmp0;
10251     tmp0  =  vld1q_u16 (ptr); //a0,a1,a2,...a7
10252     tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
10253     tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
10254     tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
10255     v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
10256     v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
10257     v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
10258     v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
10259     tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
10260     tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
10261     tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
10262     tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
10263     v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
10264     v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
10265     v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
10266     v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
10267     return v;
10268 }
10269 
10270 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10271 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10272 {
10273     uint32x4x4_t v;
10274     __m128i tmp3, tmp2, tmp1, tmp0;
10275     v.val[0] =  vld1q_u32 (ptr);
10276     v.val[1] =  vld1q_u32 ((ptr + 4));
10277     v.val[2] =  vld1q_u32 ((ptr + 8));
10278     v.val[3] =  vld1q_u32 ((ptr + 12));
10279     tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
10280     tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
10281     tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
10282     tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
10283     v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
10284     v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
10285     v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
10286     v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
10287     return v;
10288 }
10289 
10290 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10291 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
10292 
10293 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10294 #define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
10295 
10296 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10297 #define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
10298 
10299 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10300 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10301 
10302 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10303 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10304 {
10305     float32x4x4_t v;
10306     __m128 tmp3, tmp2, tmp1, tmp0;
10307 
10308     v.val[0] =  vld1q_f32 ((float*) ptr);
10309     v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
10310     v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
10311     v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
10312     tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
10313     tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
10314     tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
10315     tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
10316     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
10317     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
10318     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
10319     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
10320     return v;
10321 }
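
//The sequence above is the classic 4x4 float transpose; a roughly equivalent sketch (illustrative only,
//not compiled, assumes ptr points to 16 floats) using the standard SSE helper macro from xmmintrin.h:
#if 0
    __m128 r0 = _mm_loadu_ps(ptr);
    __m128 r1 = _mm_loadu_ps(ptr + 4);
    __m128 r2 = _mm_loadu_ps(ptr + 8);
    __m128 r3 = _mm_loadu_ps(ptr + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   //r0..r3 now hold the de-interleaved 1st..4th components
#endif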
10322 
10323 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10324 #define vld4q_p8 vld4q_u8
10325 
10326 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10327 #define vld4q_p16 vld4q_s16
10328 
10329 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10330 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
10331 {
10332     uint8x8x4_t v;
10333     __m128i sh0, sh1;
10334     __m128i val0,  val2;
10335     _NEON2SSE_ALIGN_16 static const int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
10336 
10337     val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
10338     val2 = vld1q_u8(( ptr + 16)); //load third and fourth 64-bits in val[2], val[3]
10339 
10340     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
10341     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
10342     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
10343     vst1q_u8(&v.val[0], val0 );
10344     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
10345     vst1q_u8(&v.val[2], val2 );
10346     return v;
10347 }
10348 
10349 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10350 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
10351 {
10352     uint16x4x4_t v;
10353     __m128i sh0, sh1;
10354     __m128i val0, val2;
10355     _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
10356     val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
10357     val2 = vld1q_u16 ( (ptr + 8)); //load third and fourth 64-bits in val[2], val[3]
10358     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
10359     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
10360     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
10361     vst1q_u16(&v.val[0], val0 );
10362     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
10363     vst1q_u16(&v.val[2], val2 );
10364     return v;
10365 }
10366 
10367 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10368 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
10369 {
10370     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10371     uint32x2x4_t v;
10372     __m128i val0, val01, val2;
10373     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
10374     val2 =  vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
10375     val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
10376     val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
10377     vst1q_u32(&v.val[0], val01);
10378     vst1q_u32(&v.val[2], val2 );
10379     return v;
10380 }
10381 
10382 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10383 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10384 {
10385     uint64x1x4_t v;
10386     v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit element into val[0]
10387     v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit element into val[1]
10388     v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit element into val[2]
10389     v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit element into val[3]
10390     return v;
10391 }
10392 
10393 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10394 #define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
10395 
10396 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10397 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
10398 
10399 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10400 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
10401 
10402 _NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10403 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
10404 
10405 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10407 
10408 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10409 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
10410 {
10411     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10412     float32x2x4_t res;
10413     res.val[0].m64_f32[0] = *(ptr);
10414     res.val[0].m64_f32[1] = *(ptr + 4);
10415     res.val[1].m64_f32[0] = *(ptr + 1);
10416     res.val[1].m64_f32[1] = *(ptr + 5);
10417     res.val[2].m64_f32[0] = *(ptr + 2);
10418     res.val[2].m64_f32[1] = *(ptr + 6);
10419     res.val[3].m64_f32[0] = *(ptr + 3);
10420     res.val[3].m64_f32[1] = *(ptr + 7);
10421     return res;
10422 }
10423 
10424 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10425 #define vld4_p8 vld4_u8
10426 
10427 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10428 #define vld4_p16 vld4_u16
10429 
10430 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
10431 //*******************************************************************************************************************
10432 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10433 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
10434 {
10435     uint8x8x2_t v;
10436     __m128i val0, val1;
10437     val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
10438     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
10439     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
10440     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10441     vst1q_u8(v.val, val0);
10442     return v;
10443 }
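
//Usage sketch (illustrative only, not compiled): broadcasting an {even, odd} coefficient pair to whole vectors
#if 0
    uint8_t coeff[2] = {3, 5};
    uint8x8x2_t c = vld2_dup_u8(coeff);  //c.val[0] = 3,3,3,3,3,3,3,3   c.val[1] = 5,5,5,5,5,5,5,5
#endif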
10444 
10445 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10446 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
10447 {
10448     uint16x4x2_t v;
10449     __m128i val0, val1;
10450     val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
10451     val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
10452     _M64(v.val[0], val0);
10453     val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
10454     _M64(v.val[1], val1);
10455     return v;
10456 }
10457 
10458 _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10459 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10460 {
10461     uint32x2x2_t v;
10462     __m128i val0;
10463     val0 = LOAD_SI128(ptr); //0,1,x,x
10464     val0 = _mm_shuffle_epi32(val0,   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
10465     vst1q_u32(v.val, val0);
10466     return v;
10467 }
10468 
10469 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10470 #define vld2_dup_u64 vld2_u64
10471 
10472 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10473 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
10474 
10475 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10476 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
10477 
10478 _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10479 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
10480 
10481 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10482 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
10483 
10484 _NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10485 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10486 
10487 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10488 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10489 {
10490     float32x2x2_t v;
10491     v.val[0].m64_f32[0] = *(ptr); //0,0
10492     v.val[0].m64_f32[1] = *(ptr); //0,0
10493     v.val[1].m64_f32[0] = *(ptr + 1); //1,1
10494     v.val[1].m64_f32[1] = *(ptr + 1); //1,1
10495     return v;
10496 }
10497 
10498 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10499 #define vld2_dup_p8 vld2_dup_u8
10500 
10501 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10502 #define vld2_dup_p16 vld2_dup_s16
10503 
10504 //************* Duplicate (or propagate) triplets: *******************
10505 //********************************************************************
10506 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
10507 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10508 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
10509 {
10510     uint8x8x3_t v;
10511     __m128i val0, val1, val2;
10512     val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
10513     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
10514     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
10515     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10516     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
10517     vst1q_u8(v.val, val0);
10518     _M64(v.val[2], val2);
10519     return v;
10520 }
10521 
10522 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10523 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
10524 {
10525     uint16x4x3_t v;
10526     __m128i val0, val1, val2;
10527     val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
10528     val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
10529     val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
10530     val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
10531     _M64(v.val[0], val0);
10532     _M64(v.val[1], val1);
10533     _M64(v.val[2], val2);
10534     return v;
10535 }
10536 
10537 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10538 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10539 {
10540     uint32x2x3_t v;
10541     __m128i val0, val1, val2;
10542     val2 = LOAD_SI128(ptr); //0,1,2,x
10543     val0 = _mm_shuffle_epi32(val2,   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
10544     val1 = _mm_shuffle_epi32(val2,   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
10545     val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
10546     _M64(v.val[0], val0);
10547     _M64(v.val[1], val1);
10548     _M64(v.val[2], val2);
10549     return v;
10550 }
10551 
10552 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10553 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10554 {
10555     uint64x1x3_t v;
10556     v.val[0].m64_u64[0] = *(ptr);
10557     v.val[1].m64_u64[0] = *(ptr + 1);
10558     v.val[2].m64_u64[0] = *(ptr + 2);
10559     return v;
10560 }
10561 
10562 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10563 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
10564 
10565 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10566 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
10567 
10568 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10569 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
10570 
10571 _NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10572 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
10573 
10574 
10575 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10576 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10577 
10578 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10579 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10580 {
10581     float32x2x3_t v;
10582     int i;
10583     for (i = 0; i<3; i++) {
10584         v.val[i].m64_f32[0] = *(ptr + i);
10585         v.val[i].m64_f32[1] = *(ptr + i);
10586     }
10587     return v;
10588 }
10589 
10590 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10591 #define vld3_dup_p8 vld3_dup_u8
10592 
10593 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10594 #define vld3_dup_p16 vld3_dup_s16
10595 
10596 
10597 //************* Duplicate (or propagate) quadruples: *******************
10598 //***********************************************************************
10599 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
10600 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10601 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10602 {
10603     uint8x8x4_t v;
10604     __m128i val0, val1, val2;
10605     val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
10606     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
10607     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
10608     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10609     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
10610     vst1q_u8(&v.val[0], val0);
10611     vst1q_u8(&v.val[2], val2);
10612     return v;
10613 }
10614 
10615 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10616 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10617 {
10618     uint16x4x4_t v;
10619     __m128i val0, val1, val2, val3;
10620     val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
10621     val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
10622     val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
10623     val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
10624     val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
10625     _M64(v.val[0], val0);
10626     _M64(v.val[1], val1);
10627     _M64(v.val[2], val2);
10628     _M64(v.val[3], val3);
10629     return v;
10630 }
10631 
10632 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10633 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10634 {
10635     uint32x2x4_t v;
10636     __m128i val0, val1, val2, val3;
10637     val3 = LOAD_SI128(ptr); //0,1,2,3
10638     val0 = _mm_shuffle_epi32(val3,   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
10639     val1 = _mm_shuffle_epi32(val3,   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
10640     val2 = _mm_shuffle_epi32(val3,   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
10641     val3 = _mm_shuffle_epi32(val3,   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
10642     _M64(v.val[0], val0);
10643     _M64(v.val[1], val1);
10644     _M64(v.val[2], val2);
10645     _M64(v.val[3], val3);
10646     return v;
10647 }
10648 
10649 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10650 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10651 {
10652     uint64x1x4_t v;
10653     v.val[0].m64_u64[0] = *(ptr);
10654     v.val[1].m64_u64[0] = *(ptr + 1);
10655     v.val[2].m64_u64[0] = *(ptr + 2);
10656     v.val[3].m64_u64[0] = *(ptr + 3);
10657     return v;
10658 }
10659 
10660 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10661 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
10662 
10663 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10664 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
10665 
10666 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10667 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
10668 
10669 _NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10670 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
10671 
10672 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10673 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10674 
10675 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10676 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10677 {
10678     float32x2x4_t v;
10679     int i;
10680     for (i = 0; i<4; i++) {
10681         v.val[i].m64_f32[0] = *(ptr + i);
10682         v.val[i].m64_f32[1] = *(ptr + i);
10683     }
10684     return v;
10685 }
10686 
10687 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const  * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10688 #define vld4_dup_p8 vld4_dup_u8
10689 
10690 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10691 #define vld4_dup_p16 vld4_dup_u16
10692 
10693 
10694 //**********************************************************************************
10695 //*******************Lane loads for  an N-element structures ***********************
10696 //**********************************************************************************
10697 //********************** Lane pairs  ************************************************
10698 //does vld1_lane_xx: ptr[0] goes to src->val[0] at the lane position and ptr[1] to src->val[1] at the lane position
10699 //we assume src is 16-byte aligned
10700 
10701 //!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, producing a "formal parameter with __declspec(align('16')) won't be aligned" error
10702 //To work around this, all the functions below take pointers to xxxxxx_2t structures, and the corresponding original functions are redefined as macros that pass &src
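//Usage sketch (illustrative only; "buf" and "pair" are hypothetical buffers):
//    uint16x8x2_t d = vld2q_u16(buf);               //de-interleave 16 uint16_t values into two vectors
//    d = vld2q_lane_u16(pair, d, 3);                //lane 3 of d.val[0] <- pair[0], lane 3 of d.val[1] <- pair[1]
//thanks to the macro redefinition the call site keeps the ARM NEON signature, while the *_ptr
//function actually receives &d to keep the Microsoft compiler happy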
10703 
10704 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10705 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
10706 {
10707     uint16x8x2_t v;
10708     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
10709     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
10710     return v;
10711 }
10712 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
10713 
10714 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10715 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10716 {
10717     uint32x4x2_t v;
10718     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
10719     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
10720     return v;
10721 }
10722 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
10723 
10724 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10725 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
10726 {
10727     int16x8x2_t v;
10728     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
10729     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
10730     return v;
10731 }
10732 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
10733 
10734 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10735 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
10736 {
10737     int32x4x2_t v;
10738     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
10739     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
10740     return v;
10741 }
10742 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
10743 
10744 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10745 //current IA SIMD doesn't support float16
10746 
10747 //float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10748 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10749 {
10750     float32x4x2_t v;
10751     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
10752     v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
10753     return v;
10754 }
10755 #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
10756 
10757 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10758 #define vld2q_lane_p16 vld2q_lane_u16
10759 
10760 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10761 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
10762 {
10763     uint8x8x2_t v;
10764     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10765     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10766     return v;
10767 }
10768 
10769 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10770 _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
10771 {
10772     uint16x4x2_t v;
10773     v.val[0]  =  vld1_lane_u16(ptr, src.val[0], lane);
10774     v.val[1]  = vld1_lane_u16((ptr + 1), src.val[1], lane);
10775     return v;
10776 }
10777 
10778 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10779 _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
10780 {
10781     uint32x2x2_t v;
10782     v.val[0]  =  vld1_lane_u32(ptr, src.val[0], lane);
10783     v.val[1]  = vld1_lane_u32((ptr + 1), src.val[1], lane);
10784     return v;
10785 }
10786 
10787 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10788 #define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
10789 
10790 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10791 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
10792 
10793 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10794 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
10795 
10796 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10797 //current IA SIMD doesn't support float16
10798 
10799 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10800 _NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src,__constrange(0,1) int lane)
10801 {
10802     float32x2x2_t v;
10803     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10804     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10805     return v;
10806 }
10807 
10808 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10809 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10810 #define vld2_lane_p8 vld2_lane_u8
10811 
10812 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10813 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10814 #define vld2_lane_p16 vld2_lane_u16
10815 
10816 //*********** Lane triplets **********************
10817 //*************************************************
10818 //does vld1_lane_xx: ptr[0] goes to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at the lane position
10819 //we assume src is 16-byte aligned
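//Usage sketch (illustrative only; "rgb8" and "pix" are hypothetical buffers):
//    uint8x8x3_t ch = vld3_u8(rgb8);                //de-interleave 8 RGB pixels into 3 vectors
//    ch = vld3_lane_u8(pix, ch, 5);                 //replace pixel 5: pix[0]->R, pix[1]->G, pix[2]->B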
10820 
10821 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10822 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10823 {
10824     uint16x8x3_t v;
10825     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
10826     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
10827     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
10828     return v;
10829 }
10830 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
10831 
10832 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10833 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10834 {
10835     uint32x4x3_t v;
10836     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
10837     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
10838     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
10839     return v;
10840 }
10841 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
10842 
10843 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10844 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10845 {
10846     int16x8x3_t v;
10847     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
10848     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
10849     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
10850     return v;
10851 }
10852 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
10853 
10854 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10855 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10856 {
10857     int32x4x3_t v;
10858     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
10859     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
10860     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
10861     return v;
10862 }
10863 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
10864 
10865 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10866 //current IA SIMD doesn't support float16
10867 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
10868 
10869 
10870 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10871 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10872 {
10873     float32x4x3_t v;
10874     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10875     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10876     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10877     return v;
10878 }
10879 #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
10880 
10881 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10882 #define vld3q_lane_p16 vld3q_lane_u16
10883 
10884 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10885 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10886 {
10887     uint8x8x3_t v;
10888     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10889     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10890     v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
10891     return v;
10892 }
10893 
10894 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10895 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10896 {
10897     uint16x4x3_t v;
10898     v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10899     v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10900     v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
10901     return v;
10902 }
10903 
10904 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10905 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10906 {
10907     //need to merge into 128 bit anyway
10908     uint32x2x3_t v;
10909     v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
10910     v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
10911     v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
10912     return v;
10913 }
10914 
10915 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t  src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10916 #define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8(( uint8_t*) ptr, src, lane)
10917 
10918 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t  src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10919 #define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16(( uint16_t*) ptr, src, lane)
10920 
10921 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t  src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10922 #define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32(( uint32_t*) ptr, src, lane)
10923 
10924 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10925 //current IA SIMD doesn't support float16
10926 
10927 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10928 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10929 {
10930     float32x2x3_t v;
10931     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10932     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10933     v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
10934     return v;
10935 }
10936 
10937 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10938 #define vld3_lane_p8 vld3_lane_u8
10939 
10940 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10941 #define vld3_lane_p16 vld3_lane_u16
10942 
10943 //******************* Lane Quadruples  load ***************************
10944 //*********************************************************************
10945 //does vld1_lane_xx: ptr[0] goes to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at the lane position
10946 //we assume src is 16-byte aligned
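//Usage sketch (illustrative only; "base" and "sample" are hypothetical buffers):
//    uint16x4x4_t q = vld4_u16(base);               //de-interleave 4 four-channel samples
//    q = vld4_lane_u16(sample, q, 0);               //lane 0 of q.val[i] <- sample[i], i = 0..3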
10947 
10948 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10949 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
10950 {
10951     uint16x8x4_t v;
10952     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
10953     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
10954     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
10955     v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
10956     return v;
10957 }
10958 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
10959 
10960 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10961 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
10962 {
10963     uint32x4x4_t v;
10964     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
10965     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
10966     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
10967     v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
10968     return v;
10969 }
10970 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
10971 
10972 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10973 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10974 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
10975 
10976 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10977 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10978 #define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
10979 
10980 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10981 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10982 //current IA SIMD doesn't support float16
10983 
10984 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10985 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
10986 {
10987     float32x4x4_t v;
10988     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10989     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10990     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10991     v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
10992     return v;
10993 }
10994 #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
10995 
10996 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10997 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10998 #define vld4q_lane_p16 vld4q_lane_u16
10999 
11000 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11001 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
11002 {
11003     uint8x8x4_t v;
11004     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
11005     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
11006     v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
11007     v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
11008     return v;
11009 }
11010 
11011 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11012 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
11013 {
11014     uint16x4x4_t v;
11015     v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
11016     v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
11017     v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
11018     v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
11019     return v;
11020 }
11021 
11022 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11023 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
11024 {
11025     uint32x2x4_t v;
11026     v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
11027     v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
11028     v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
11029     v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
11030     return v;
11031 }
11032 
11033 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11034 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
11035 
11036 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11037 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
11038 
11039 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11040 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
11041 
11042 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11043 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
11044 //current IA SIMD doesn't support float16
11045 
11046 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11047 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
11048 {
11049     //serial solution may be faster
11050     float32x2x4_t v;
11051     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11052     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11053     v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11054     v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
11055     return v;
11056 }
11057 
11058 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11059 #define vld4_lane_p8 vld4_lane_u8
11060 
11061 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11062 #define vld4_lane_p16 vld4_lane_u16
11063 
11064 //******************* Store duplets *********************************************
11065 //********************************************************************************
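//Usage sketch (illustrative only; "re", "im" and "out" are hypothetical buffers):
//    float32x4x2_t c;
//    c.val[0] = vld1q_f32(re);                      //4 real parts
//    c.val[1] = vld1q_f32(im);                      //4 imaginary parts
//    vst2q_f32(out, c);                             //out: re0,im0,re1,im1, ... ,re3,im3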
11066 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
11067 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
11068 {
11069     uint8x16x2_t v;
11070     v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
11071     v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
11072     vst1q_u8 (ptr, v.val[0]);
11073     vst1q_u8 ((ptr + 16),  v.val[1]);
11074 }
11075 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
11076 
11077 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
11078 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
11079 {
11080     uint16x8x2_t v;
11081     v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
11082     v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
11083     vst1q_u16 (ptr, v.val[0]);
11084     vst1q_u16 ((ptr + 8),  v.val[1]);
11085 }
11086 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
11087 
11088 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11089 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
11090 {
11091     uint32x4x2_t v;
11092     v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
11093     v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
11094     vst1q_u32 (ptr, v.val[0]);
11095     vst1q_u32 ((ptr + 4),  v.val[1]);
11096 }
11097 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
11098 
11099 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
11100 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
11101 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
11102 
11103 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11104 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
11105 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
11106 
11107 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
11108 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
11109 #define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
11110 
11111 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11112 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
11113 // IA32 SIMD doesn't work with 16bit floats currently
11114 
11115 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11116 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
11117 {
11118     float32x4x2_t v;
11119     v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
11120     v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
11121     vst1q_f32 (ptr, v.val[0]);
11122     vst1q_f32 ((ptr + 4),  v.val[1]);
11123 }
11124 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
11125 
11126 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
11127 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
11128 #define vst2q_p8 vst2q_u8
11129 
11130 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11131 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
11132 #define vst2q_p16 vst2q_u16
11133 
11134 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11135 _NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
11136 {
11137     __m128i v0;
11138     v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
11139     vst1q_u8 (ptr, v0);
11140 }
11141 
11142 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
11143 _NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
11144 {
11145     __m128i v0;
11146     v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
11147     vst1q_u16 (ptr, v0);
11148 }
11149 
11150 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
11151 _NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
11152 {
11153     __m128i v0;
11154     v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
11155     vst1q_u32 (ptr, v0);
11156 }
11157 
11158 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
11159 _NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
11160 {
11161     *(ptr) = val.val[0].m64_u64[0];
11162     *(ptr + 1) = val.val[1].m64_u64[0];
11163 }
11164 
11165 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11166 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
11167 
11168 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11169 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
11170 
11171 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11172 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
11173 
11174 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
11175 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
11176 
11177 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11178 //current IA SIMD doesn't support float16
11179 
11180 _NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11181 _NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
11182 {
11183     *(ptr) =   val.val[0].m64_f32[0];
11184     *(ptr + 1) = val.val[1].m64_f32[0];
11185     *(ptr + 2) = val.val[0].m64_f32[1];
11186     *(ptr + 3) = val.val[1].m64_f32[1];
11187 }
11188 
11189 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t  val); // VST2.8 {d0, d1}, [r0]
11190 #define vst2_p8 vst2_u8
11191 
11192 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t  val); // VST2.16 {d0, d1}, [r0]
11193 #define vst2_p16 vst2_u16
11194 
11195 //******************** Triplets store  *****************************************
11196 //******************************************************************************
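//Usage sketch (illustrative only; "r", "g", "b" and "dst" are hypothetical buffers):
//    uint8x16x3_t px;
//    px.val[0] = vld1q_u8(r);  px.val[1] = vld1q_u8(g);  px.val[2] = vld1q_u8(b);
//    vst3q_u8(dst, px);                             //dst (48 bytes): r0,g0,b0, r1,g1,b1, ...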
11197 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
11198 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
11199 {
11200     uint8x16x3_t v;
11201     __m128i v0,v1,v2, cff, bldmask;
11202     _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
11203     _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
11204     _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
11205     _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
11206     _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
11207     _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
11208 
11209     v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
11210     v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
11211     v1 =  _mm_alignr_epi8(v2, v0, 11); //16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34, 36,37, 39
11212     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11213     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11214     cff = _mm_cmpeq_epi8(v0, v0); //all ff
11215     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
11216     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11217     vst1q_u8(ptr,   v.val[0]);
11218     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11219     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11220     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
11221     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11222     vst1q_u8((ptr + 16),  v.val[1]);
11223     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11224     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11225     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
11226     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11227     vst1q_u8((ptr + 32),  v.val[2]);
11228 }
11229 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
11230 
11231 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
11232 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
11233 {
11234     uint16x8x3_t v;
11235     __m128i v0,v1,v2, cff, bldmask;
11236     _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
11237     _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
11238     _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
11239     _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
11240     _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
11241     _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
11242 
11243     v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
11244     v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
11245     v1 =  _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
11246     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11247     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11248     cff = _mm_cmpeq_epi16(v0, v0); //all ff
11249     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
11250     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11251     vst1q_u16(ptr,      v.val[0]);
11252     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11253     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11254     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
11255     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11256     vst1q_u16((ptr + 8),  v.val[1]);
11257     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11258     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11259     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
11260     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11261     vst1q_u16((ptr + 16), v.val[2]);
11262 }
11263 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
11264 
11265 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11266 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
11267 {
11268     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
11269     uint32x4x3_t v;
11270     __m128i tmp0, tmp1,tmp2;
11271     tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
11272     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
11273     tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
11274     v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
11275     v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
11276     v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
11277     tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
11278     v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
11279 
11280     vst1q_u32(ptr,      v.val[0]);
11281     vst1q_u32((ptr + 4),  v.val[1]);
11282     vst1q_u32((ptr + 8),  v.val[2]);
11283 }
11284 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
11285 
11286 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
11287 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
11288 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
11289 
11290 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
11291 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
11292 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
11293 
11294 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
11295 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
11296 #define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
11297 
11298 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11299 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
11300 // IA32 SIMD doesn't work with 16bit floats currently
11301 
11302 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11303 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
11304 {
11305     float32x4x3_t v;
11306     __m128 tmp0, tmp1,tmp2;
11307     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
11308     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
11309     tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
11310     v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
11311     v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
11312     v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
11313     tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
11314     v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
11315 
11316     vst1q_f32( ptr,    v.val[0]);
11317     vst1q_f32( (ptr + 4),  v.val[1]);
11318     vst1q_f32( (ptr + 8),  v.val[2]);
11319 }
11320 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
11321 
11322 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
11323 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
11324 #define vst3q_p8 vst3q_u8
11325 
11326 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11327 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
11328 #define vst3q_p16 vst3q_u16
11329 
11330 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11331 _NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
11332 {
11333     __m128i tmp, sh0, sh1, val0, val2;
11334     _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
11335     _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
11336     _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
11337     _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
11338     tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
11339     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //mask entries bi>15 wrap to bi-16 (PSHUFB uses the low 4 bits only)
11340     val2 = _pM128i(val.val[2]);
11341     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11342     val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
11343     vst1q_u8(ptr,   val0); //store as 128 bit structure
11344     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //mask entries bi>15 wrap to bi-16 (PSHUFB uses the low 4 bits only)
11345     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11346     val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
11347     _M64((*(__m64_128*)(ptr + 16)),  val2); //need it to fit into *ptr memory
11348 }
11349 
11350 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11351 _NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
11352 {
11353     __m128i tmp, val0, val1, val2;
11354     _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
11355     _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
11356     _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //where ones, the blend takes the interleaved val[0]/val[1] data (val0), otherwise the val[2] data (val1)
11357     _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //where ones, the blend takes the val[2] data (val1), otherwise the interleaved val[0]/val[1] data (val0)
11358     tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
11359     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
11360     val2 = _pM128i(val.val[2]);
11361     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11362     val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
11363     vst1q_u16(ptr,    val0); //store as 128 bit structure
11364     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
11365     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11366     val1 = _MM_BLENDV_EPI8(val0, val1,  *(__m128i*)mask1f); //change the operands order
11367     _M64((*(__m64_128*)(ptr + 8)),  val1); //need it to fit into *ptr memory
11368 }
11369 
11370 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11371 _NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
11372 {
11373     //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x;
11374     __m128i val0, val1;
11375     val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
11376     val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
11377     val1 = _mm_srli_si128(val0, 8); //4,5, x,x
11378     _M64((*(__m64_128*)(ptr + 4)),  val1);
11379     val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
11380     val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
11381     vst1q_u32(ptr, val0); //store as 128 bit structure
11382 }
11383 
11384 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
11385 _NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
11386 {
11387     *(ptr) = val.val[0].m64_u64[0];
11388     *(ptr + 1) = val.val[1].m64_u64[0];
11389     *(ptr + 2) = val.val[2].m64_u64[0];
11390 }
11391 
11392 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val);  // VST3.8 {d0, d1, d2}, [r0]
11393 #define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
11394 
11395 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val);  // VST3.16 {d0, d1, d2}, [r0]
11396 #define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
11397 
11398 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
11399 #define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
11400 
11401 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
11402 #define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
11403 
11404 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11405 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
11406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
11407 
11408 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11409 _NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
11410 {
11411     //val.val[0]:0,3; val.val[1]:1,4; val.val[2]:2,5  -> memory order 0,1,2,3,4,5
11412     *(ptr) =   val.val[0].m64_f32[0];
11413     *(ptr + 1) = val.val[1].m64_f32[0];
11414     *(ptr + 2) = val.val[2].m64_f32[0];
11415     *(ptr + 3) = val.val[0].m64_f32[1];
11416     *(ptr + 4) = val.val[1].m64_f32[1];
11417     *(ptr + 5) = val.val[2].m64_f32[1];
11418 }
11419 
11420 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11421 #define vst3_p8 vst3_u8
11422 
11423 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11424 #define vst3_p16 vst3_u16
11425 
11426 //***************  Quadruples store ********************************
11427 //*********************************************************************
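//Usage sketch (illustrative only; "r", "g", "b", "a" and "dst" are hypothetical buffers):
//    uint8x16x4_t px;
//    px.val[0] = vld1q_u8(r);  px.val[1] = vld1q_u8(g);
//    px.val[2] = vld1q_u8(b);  px.val[3] = vld1q_u8(a);
//    vst4q_u8(dst, px);                             //dst (64 bytes): r0,g0,b0,a0, r1,g1,b1,a1, ...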
11428 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
11429 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
11430 {
11431     __m128i tmp1, tmp2, res;
11432     tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
11433     tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
11434     res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
11435     vst1q_u8(ptr,  res);
11436     res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
11437     vst1q_u8((ptr + 16), res);
11438     tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //32,33, 36,37, 40,41, 44,45, 48,49, 52,53, 56,57, 60,61
11439     tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //34,35, 38,39, 42,43, 46,47, 50,51, 54,55, 58,59, 62,63
11440     res = _mm_unpacklo_epi16(tmp1, tmp2); //32,33, 34,35, 36,37, 38,39, 40,41, 42,43, 44,45, 46,47
11441     vst1q_u8((ptr + 32), res);
11442     res = _mm_unpackhi_epi16(tmp1, tmp2); //48,49, 50,51, 52,53, 54,55, 56,57, 58,59, 60,61, 62,63
11443     vst1q_u8((ptr + 48), res);
11444 }
11445 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
11446 
11447 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
11448 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
11449 {
11450     uint16x8x4_t v;
11451     __m128i tmp1, tmp2;
11452     tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
11453     tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
11454     v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
11455     v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
11456     tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
11457     tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
11458     v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
11459     v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
11460     vst1q_u16(ptr,     v.val[0]);
11461     vst1q_u16((ptr + 8), v.val[1]);
11462     vst1q_u16((ptr + 16),v.val[2]);
11463     vst1q_u16((ptr + 24), v.val[3]);
11464 }
11465 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
11466 
11467 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11468 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
11469 {
11470     uint32x4x4_t v;
11471     __m128i tmp1, tmp2;
11472     tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
11473     tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
11474     v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
11475     v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
11476     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
11477     tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
11478     v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
11479     v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
11480     vst1q_u32(ptr,      v.val[0]);
11481     vst1q_u32((ptr + 4),  v.val[1]);
11482     vst1q_u32((ptr + 8),  v.val[2]);
11483     vst1q_u32((ptr + 12), v.val[3]);
11484 }
11485 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
11486 
11487 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
11488 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
11489 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
11490 
11491 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
11492 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
11493 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
11494 
11495 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
11496 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
11497 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
11498 
11499 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11500 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
11501 // IA32 SIMD doesn't work with 16bit floats currently
11502 
11503 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
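//the body below is the classic 4x4 float transpose (the same unpacklo/unpackhi + movelh/movehl pattern as _MM_TRANSPOSE4_PS)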
11504 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
11505 {
11506     __m128 tmp3, tmp2, tmp1, tmp0;
11507     float32x4x4_t v;
11508     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
11509     tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
11510     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
11511     tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
11512     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
11513     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
11514     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
11515     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
11516     vst1q_f32(ptr,   v.val[0]);
11517     vst1q_f32((ptr + 4), v.val[1]);
11518     vst1q_f32((ptr + 8), v.val[2]);
11519     vst1q_f32((ptr + 12), v.val[3]);
11520 }
11521 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
11522 
11523 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
11524 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
11525 #define vst4q_p8 vst4q_u8
11526 
11527 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11528 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
11529 #define vst4q_p16 vst4q_s16
11530 
11531 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11532 _NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
11533 {
11534     __m128i sh0, sh1, val0, val2;
11535     sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
11536     sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
11537     val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
11538     val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
11539     vst1q_u8(ptr,    val0);
11540     vst1q_u8((ptr + 16),  val2);
11541 }
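//Illustrative usage sketch (not part of the original header): vst4_u8 stores four de-interleaved
//8-element planes as 32 interleaved bytes, e.g. separate r,g,b,a channel planes into packed RGBA.
//The r, g, b, a and out pointers below are assumed to be caller-provided uint8_t arrays.
//    uint8x8x4_t rgba;
//    rgba.val[0] = vld1_u8(r);   rgba.val[1] = vld1_u8(g);
//    rgba.val[2] = vld1_u8(b);   rgba.val[3] = vld1_u8(a);
//    vst4_u8(out, rgba);  //out[0]=r[0], out[1]=g[0], out[2]=b[0], out[3]=a[0], out[4]=r[1], ...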
11542 
11543 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11544 _NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
11545 {
11546     __m128i sh0, sh1, val0, val2;
11547     sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
11548     sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
11549     val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
11550     val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
11551     vst1q_u16(ptr,      val0); //store as 128 bit structure
11552     vst1q_u16((ptr + 8),  val2);
11553 }
11554 
11555 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11556 _NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
11557 {
11558     //0,4,   1,5,  2,6,  3,7
11559     __m128i sh0, sh1, val0, val1;
11560     sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
11561     sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
11562     val0 = _mm_unpacklo_epi64(sh0,sh1); //
11563     val1 = _mm_unpackhi_epi64(sh0,sh1); //
11564     vst1q_u32(ptr,     val0); //store as 128 bit structure
11565     vst1q_u32((ptr + 4),  val1);
11566 }
11567 
11568 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
11569 _NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
11570 {
11571     *(ptr) =  val.val[0].m64_u64[0];
11572     *(ptr + 1) =  val.val[1].m64_u64[0];
11573     *(ptr + 2) =  val.val[2].m64_u64[0];
11574     *(ptr + 3) =  val.val[3].m64_u64[0];
11575 }
11576 
11577 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
11578 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11579 
11580 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
11581 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11582 
11583 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11584 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11585 
11586 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11587 _NEON2SSESTORAGE void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
11588 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11589 
11590 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11591 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
11592 // IA32 SIMD doesn't work with 16-bit floats currently, so the data needs to be converted to 32 bits and then processed in two 128-bit registers. See vld1q_f16 for an example
11593 
11594 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11595 _NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
11596 {
11597     //0,4,   1,5,  2,6,  3,7 -> 0,1, 2,3, 4,5, 6,7
11598     *(ptr) =   val.val[0].m64_f32[0];
11599     *(ptr + 1) = val.val[1].m64_f32[0];
11600     *(ptr + 2) = val.val[2].m64_f32[0];
11601     *(ptr + 3) = val.val[3].m64_f32[0];
11602     *(ptr + 4) = val.val[0].m64_f32[1];
11603     *(ptr + 5) = val.val[1].m64_f32[1];
11604     *(ptr + 6) = val.val[2].m64_f32[1];
11605     *(ptr + 7) = val.val[3].m64_f32[1];
11606 }
11607 
11608 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11609 #define vst4_p8 vst4_u8
11610 
11611 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11612 #define vst4_p16 vst4_u16
11613 
11614 //*********** Store a lane of a vector into memory (extract the given lane) for several vectors at once  *********************
11615 //********************************************************************************************************************
11616 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
11617 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
11618 {
11619     vst1q_lane_s16(ptr, val->val[0], lane);
11620     vst1q_lane_s16((ptr + 1), val->val[1], lane);
11621 }
11622 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
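//Illustrative usage sketch (src and dst are assumed to be caller-provided arrays): a lane store
//writes the same lane of each vector in the structure to consecutive memory locations.
//    uint16x8x2_t pair;
//    pair.val[0] = vld1q_u16(src);        //elements 0..7
//    pair.val[1] = vld1q_u16(src + 8);    //elements 8..15
//    vst2q_lane_u16(dst, pair, 3);        //dst[0] = src[3], dst[1] = src[11]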
11623 
11624 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11625 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
11626 {
11627     vst1q_lane_u32(ptr, val->val[0], lane);
11628     vst1q_lane_u32((ptr + 1), val->val[1], lane);
11629 }
11630 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
11631 
11632 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11633 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
11634 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
11635 
11636 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
11637 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
11638 #define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
11639 
11640 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11641 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
11642 //current IA SIMD doesn't support float16
11643 
11644 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11645 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
11646 {
11647     vst1q_lane_f32(ptr, val->val[0], lane);
11648     vst1q_lane_f32((ptr + 1), val->val[1], lane);
11649 }
11650 #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
11651 
11652 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11653 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
11654 #define vst2q_lane_p16 vst2q_lane_s16
11655 
11656 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11657 _NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
11658 {
11659     *(ptr) = val.val[0].m64_u8[lane];
11660     *(ptr + 1) = val.val[1].m64_u8[lane];
11661 }
11662 
11663 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11664 _NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
11665 {
11666     *(ptr) = val.val[0].m64_u16[lane];
11667     *(ptr + 1) = val.val[1].m64_u16[lane];
11668 }
11669 
11670 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11671 _NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
11672 {
11673     *(ptr) = val.val[0].m64_u32[lane];
11674     *(ptr + 1) = val.val[1].m64_u32[lane];
11675 }
11676 
11677 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11678 #define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
11679 
11680 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11681 #define vst2_lane_s16(ptr, val, lane)  vst2_lane_u16((uint16_t*)ptr, val, lane)
11682 
11683 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11684 #define vst2_lane_s32(ptr, val, lane)  vst2_lane_u32((uint32_t*)ptr, val, lane)
11685 
11686 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11687 //current IA SIMD doesn't support float16
11688 
11689 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11690 _NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
11691 {
11692     *(ptr) = val.val[0].m64_f32[lane];
11693     *(ptr + 1) = val.val[1].m64_f32[lane];
11694 }
11695 
11696 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11697 #define vst2_lane_p8 vst2_lane_u8
11698 
11699 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11700 #define vst2_lane_p16 vst2_lane_u16
11701 
11702 //************************* Triple lanes  stores *******************************************************
11703 //*******************************************************************************************************
11704 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11705 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
11706 {
11707     vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
11708     vst1q_lane_u16((ptr + 2), val->val[2], lane);
11709 }
11710 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
11711 
11712 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11713 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
11714 {
11715     vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
11716     vst1q_lane_u32((ptr + 2), val->val[2], lane);
11717 }
11718 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
11719 
11720 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11721 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
11722 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
11723 
11724 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11725 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
11726 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
11727 
11728 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11729 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
11730 //current IA SIMD doesn't support float16
11731 
11732 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11733 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
11734 {
11735     vst1q_lane_f32(ptr,   val->val[0], lane);
11736     vst1q_lane_f32((ptr + 1),   val->val[1], lane);
11737     vst1q_lane_f32((ptr + 2), val->val[2], lane);
11738 }
11739 #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
11740 
11741 //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11742 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
11743 #define vst3q_lane_p16 vst3q_lane_s16
11744 
11745 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11746 _NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
11747 {
11748     *(ptr) =     val.val[0].m64_u8[lane];
11749     *(ptr + 1) = val.val[1].m64_u8[lane];
11750     *(ptr + 2) = val.val[2].m64_u8[lane];
11751 }
11752 
11753 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11754 _NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
11755 {
11756     *(ptr) =     val.val[0].m64_u16[lane];
11757     *(ptr + 1) = val.val[1].m64_u16[lane];
11758     *(ptr + 2) = val.val[2].m64_u16[lane];
11759 }
11760 
11761 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11762 _NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
11763 {
11764     *(ptr) =     val.val[0].m64_u32[lane];
11765     *(ptr + 1) = val.val[1].m64_u32[lane];
11766     *(ptr + 2) = val.val[2].m64_u32[lane];
11767 }
11768 
11769 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11770 #define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
11771 
11772 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11773 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
11774 
11775 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11776 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
11777 
11778 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11779 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
11780 //current IA SIMD doesn't support float16
11781 
11782 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11783 _NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
11784 {
11785     *(ptr) = val.val[0].m64_f32[lane];
11786     *(ptr + 1) = val.val[1].m64_f32[lane];
11787     *(ptr + 2) = val.val[2].m64_f32[lane];
11788 }
11789 
11790 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11791 #define vst3_lane_p8 vst3_lane_u8
11792 
11793 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11794 #define vst3_lane_p16 vst3_lane_u16
11795 
11796 //******************************** Quadruple lanes stores ***********************************************
11797 //*******************************************************************************************************
11798 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11799 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
11800 {
11801     vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
11802     vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
11803 }
11804 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
11805 
11806 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11807 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
11808 {
11809     vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
11810     vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
11811 }
11812 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
11813 
11814 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11815 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
11816 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
11817 
11818 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11819 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
11820 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
11821 
11822 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11823 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
11824 //current IA SIMD doesn't support float16
11825 
11826 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11827 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
11828 {
11829     vst1q_lane_f32(ptr,   val->val[0], lane);
11830     vst1q_lane_f32((ptr + 1), val->val[1], lane);
11831     vst1q_lane_f32((ptr + 2), val->val[2], lane);
11832     vst1q_lane_f32((ptr + 3), val->val[3], lane);
11833 }
11834 #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
11835 
11836 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11837 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
11838 #define vst4q_lane_p16 vst4q_lane_u16
11839 
11840 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11841 _NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
11842 {
11843     *(ptr) =     val.val[0].m64_u8[lane];
11844     *(ptr + 1) = val.val[1].m64_u8[lane];
11845     *(ptr + 2) = val.val[2].m64_u8[lane];
11846     *(ptr + 3) = val.val[3].m64_u8[lane];
11847 }
11848 
11849 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11850 _NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
11851 {
11852     *(ptr) =     val.val[0].m64_u16[lane];
11853     *(ptr + 1) = val.val[1].m64_u16[lane];
11854     *(ptr + 2) = val.val[2].m64_u16[lane];
11855     *(ptr + 3) = val.val[3].m64_u16[lane];
11856 }
11857 
11858 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11859 _NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
11860 {
11861     *(ptr) =     val.val[0].m64_u32[lane];
11862     *(ptr + 1) = val.val[1].m64_u32[lane];
11863     *(ptr + 2) = val.val[2].m64_u32[lane];
11864     *(ptr + 3) = val.val[3].m64_u32[lane];
11865 }
11866 
11867 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11868 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
11869 
11870 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11871 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
11872 
11873 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11874 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
11875 
11876 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11877 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
11878 //current IA SIMD doesn't support float16
11879 
11880 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t  val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11881 _NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
11882 {
11883     *(ptr) = val.val[0].m64_f32[lane];
11884     *(ptr + 1) = val.val[1].m64_f32[lane];
11885     *(ptr + 2) = val.val[2].m64_f32[lane];
11886     *(ptr + 3) = val.val[3].m64_f32[lane];
11887 }
11888 
11889 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11890 #define vst4_lane_p8 vst4_lane_u8
11891 
11892 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11893 #define vst4_lane_p16 vst4_lane_u16
11894 
11895 //**************************************************************************************************
11896 //************************ Extract lanes from a vector ********************************************
11897 //**************************************************************************************************
11898 //These intrinsics extract a single lane (element) from a vector.
11899 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11900 #define vget_lane_u8(vec, lane) vec.m64_u8[lane]
11901 
11902 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11903 #define vget_lane_u16(vec, lane) vec.m64_u16[lane]
11904 
11905 
11906 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11907 #define vget_lane_u32(vec, lane) vec.m64_u32[lane]
11908 
11909 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
11910 #define vget_lane_s8(vec, lane) vec.m64_i8[lane]
11911 
11912 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
11913 #define vget_lane_s16(vec, lane) vec.m64_i16[lane]
11914 
11915 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11916 #define vget_lane_s32(vec, lane) vec.m64_i32[lane]
11917 
11918 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11919 #define vget_lane_p8 vget_lane_u8
11920 
11921 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11922 #define vget_lane_p16 vget_lane_u16
11923 
11924 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11925 #define vget_lane_f32(vec, lane) vec.m64_f32[lane]
11926 
11927 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11928 #define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
11929 
11930 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11931 #define  vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
11932 
11933 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11934 #define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
11935 
11936 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
11937 #define vgetq_lane_s8 _MM_EXTRACT_EPI8
11938 
11939 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
11940 #define vgetq_lane_s16 _MM_EXTRACT_EPI16
11941 
11942 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11943 #define vgetq_lane_s32 _MM_EXTRACT_EPI32
11944 
11945 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11946 #define vgetq_lane_p8 vgetq_lane_u8
11947 
11948 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11949 #define vgetq_lane_p16 vgetq_lane_u16
11950 
11951 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11952 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
11953 {
11954     int32_t ilane;
11955     ilane = _MM_EXTRACT_PS(vec,lane);
11956     return *(float*)&ilane;
11957 }
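//Illustrative usage sketch: the lane argument must be a compile-time constant, as in ARM NEON and
//in the SSE extract intrinsics used above.
//    float32x4_t q = vdupq_n_f32(3.5f);     //broadcast, defined further below
//    float32_t   x = vgetq_lane_f32(q, 2);  //x == 3.5f
//    uint8x8_t   d = vdup_n_u8(7);
//    uint8_t     y = vget_lane_u8(d, 0);    //y == 7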
11958 
11959 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11960 #define vget_lane_s64(vec, lane) vec.m64_i64[0]
11961 
11962 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11963 #define vget_lane_u64(vec, lane) vec.m64_u64[0]
11964 
11965 
11966 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11967 #define vgetq_lane_s64 _MM_EXTRACT_EPI64
11968 
11969 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11970 #define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
11971 
11972 // ***************** Set lanes within a vector ********************************************
11973 // **************************************************************************************
11974 //These intrinsics set a single lane (element) within a vector.
11975 //Same as the corresponding vld1_lane_xx functions, but they take the value to be set directly.
11976 
11977 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
11978 _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
11979 {
11980     uint8_t val;
11981     val = value;
11982     return vld1_lane_u8(&val, vec,  lane);
11983 }
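//Illustrative usage sketch: only the selected lane changes, the other lanes keep their values.
//    uint8x8_t v = vdup_n_u8(0);       //defined further below
//    v = vset_lane_u8(0xff, v, 5);     //lane 5 becomes 0xff, lanes 0..4 and 6..7 stay 0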
11984 
11985 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
11986 _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
11987 {
11988     uint16_t val;
11989     val = value;
11990     return vld1_lane_u16(&val, vec,  lane);
11991 }
11992 
11993 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
11994 _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
11995 {
11996     uint32_t val;
11997     val = value;
11998     return vld1_lane_u32(&val, vec,  lane);
11999 }
12000 
12001 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12002 _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
12003 {
12004     int8_t val;
12005     val = value;
12006     return vld1_lane_s8(&val, vec,  lane);
12007 }
12008 
12009 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12010 _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
12011 {
12012     int16_t val;
12013     val = value;
12014     return vld1_lane_s16(&val, vec,  lane);
12015 }
12016 
12017 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12018 _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
12019 {
12020     int32_t val;
12021     val = value;
12022     return vld1_lane_s32(&val, vec,  lane);
12023 }
12024 
12025 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12026 #define vset_lane_p8  vset_lane_u8
12027 
12028 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12029 #define vset_lane_p16  vset_lane_u16
12030 
12031 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12032 _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
12033 {
12034     float32_t val;
12035     val = value;
12036     return vld1_lane_f32(&val, vec,  lane);
12037 }
12038 
12039 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12040 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
12041 {
12042     uint8_t val;
12043     val = value;
12044     return vld1q_lane_u8(&val, vec,  lane);
12045 }
12046 
12047 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12048 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
12049 {
12050     uint16_t val;
12051     val = value;
12052     return vld1q_lane_u16(&val, vec,  lane);
12053 }
12054 
12055 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12056 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
12057 {
12058     uint32_t val;
12059     val = value;
12060     return vld1q_lane_u32(&val, vec,  lane);
12061 }
12062 
12063 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12064 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
12065 {
12066     int8_t val;
12067     val = value;
12068     return vld1q_lane_s8(&val, vec,  lane);
12069 }
12070 
12071 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12072 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
12073 {
12074     int16_t val;
12075     val = value;
12076     return vld1q_lane_s16(&val, vec,  lane);
12077 }
12078 
12079 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12080 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
12081 {
12082     int32_t val;
12083     val = value;
12084     return vld1q_lane_s32(&val, vec,  lane);
12085 }
12086 
12087 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12088 #define vsetq_lane_p8 vsetq_lane_u8
12089 
12090 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12091 #define vsetq_lane_p16 vsetq_lane_u16
12092 
12093 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12094 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
12095 {
12096     float32_t val;
12097     val = value;
12098     return vld1q_lane_f32(&val, vec,  lane);
12099 }
12100 
12101 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12102 _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
12103 {
12104     int64_t val;
12105     val = value;
12106     return vld1_lane_s64(&val, vec,  lane);
12107 }
12108 
12109 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12110 _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
12111 {
12112     uint64_t val;
12113     val = value;
12114     return vld1_lane_u64(&val, vec,  lane);
12115 }
12116 
12117 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12118 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
12119 {
12120     int64_t val;
12121     val = value;
12122     return vld1q_lane_s64(&val, vec,  lane);
12123 }
12124 
12125 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12126 #define vsetq_lane_u64 vsetq_lane_s64
12127 
12128 // *******************************************************************************
12129 // **************** Initialize a vector from bit pattern ***************************
12130 // *******************************************************************************
12131 //These intrinsics create a vector from a literal bit pattern.
12132 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
12133 _NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
12134 {
12135     return (*(__m64_128*)&(a)); //a macro can't be used here because the argument may be an immediate value
12136 }
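//Illustrative usage sketch: the 64-bit literal supplies the whole bit pattern; the least
//significant byte of the argument becomes lane 0, as on little-endian ARM.
//    int8x8_t v = vcreate_s8(0x0807060504030201ULL);   //lane 0 == 0x01, lane 7 == 0x08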
12137 
12138 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
12139 #define vcreate_s16  vcreate_s8
12140 
12141 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
12142 #define vcreate_s32  vcreate_s8
12143 
12144 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
12145 //no IA32 SIMD available
12146 
12147 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
12148 _NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
12149 {
12150     return (*(__m64_128*)&(a)); //a macro can't be used here because the argument may be an immediate value
12151 }
12152 
12153 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
12154 #define vcreate_u8 vcreate_s8
12155 
12156 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
12157 #define vcreate_u16 vcreate_s16
12158 
12159 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
12160 #define vcreate_u32 vcreate_s32
12161 
12162 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
12163 #define vcreate_u64  vcreate_s8
12164 
12165 
12166 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
12167 #define vcreate_p8 vcreate_u8
12168 
12169 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
12170 #define vcreate_p16 vcreate_u16
12171 
12172 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
12173 #define vcreate_s64 vcreate_u64
12174 
12175 //********************* Set all lanes to same value ********************************
12176 //*********************************************************************************
12177 //These intrinsics set all lanes to the same value.
12178 _NEON2SSESTORAGE uint8x8_t   vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
12179 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t  vdup_n_u8(uint8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12180 {
12181     uint8x8_t res;
12182     int i;
12183     for (i = 0; i<8; i++) {
12184         res.m64_u8[i] = value;
12185     }
12186     return res;
12187 }
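//Illustrative usage sketch: the 64-bit vdup_n_xx variants here are serial (see the performance
//warning), so the 128-bit vdupq_n_xx versions below are preferable in performance-critical code.
//    uint8x8_t  d = vdup_n_u8(0x80);    //serial fill of 8 lanes
//    uint8x16_t q = vdupq_n_u8(0x80);   //maps to a single _mm_set1_epi8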
12188 
12189 _NEON2SSESTORAGE uint16x4_t   vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
12190 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t  vdup_n_u16(uint16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12191 {
12192     uint16x4_t res;
12193     int i;
12194     for (i = 0; i<4; i++) {
12195         res.m64_u16[i] = value;
12196     }
12197     return res;
12198 }
12199 
12200 _NEON2SSESTORAGE uint32x2_t   vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
12201 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t  vdup_n_u32(uint32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12202 {
12203     uint32x2_t res;
12204     res.m64_u32[0] = value;
12205     res.m64_u32[1] = value;
12206     return res;
12207 }
12208 
12209 _NEON2SSESTORAGE int8x8_t   vdup_n_s8(int8_t value); // VDUP.8 d0,r0
12210 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t  vdup_n_s8(int8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12211 {
12212     int8x8_t res;
12213     int i;
12214     for (i = 0; i<8; i++) {
12215         res.m64_i8[i] = value;
12216     }
12217     return res;
12218 }
12219 
12220 _NEON2SSESTORAGE int16x4_t   vdup_n_s16(int16_t value); // VDUP.16 d0,r0
12221 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t  vdup_n_s16(int16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12222 {
12223     int16x4_t res;
12224     int i;
12225     for (i = 0; i<4; i++) {
12226         res.m64_i16[i] = value;
12227     }
12228     return res;
12229 }
12230 
12231 _NEON2SSESTORAGE int32x2_t   vdup_n_s32(int32_t value); // VDUP.32 d0,r0
12232 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t  vdup_n_s32(int32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12233 {
12234     int32x2_t res;
12235     res.m64_i32[0] = value;
12236     res.m64_i32[1] = value;
12237     return res;
12238 }
12239 
12240 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
12241 #define vdup_n_p8 vdup_n_u8
12242 
12243 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
12244 #define vdup_n_p16 vdup_n_s16
12245 
12246 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
12247 _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
12248 {
12249     float32x2_t res;
12250     res.m64_f32[0] = value;
12251     res.m64_f32[1] = value;
12252     return res;
12253 }
12254 
12255 _NEON2SSESTORAGE uint8x16_t   vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
12256 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
12257 
12258 _NEON2SSESTORAGE uint16x8_t   vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
12259 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
12260 
12261 _NEON2SSESTORAGE uint32x4_t   vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
12262 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
12263 
12264 _NEON2SSESTORAGE int8x16_t   vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
12265 #define vdupq_n_s8 _mm_set1_epi8
12266 
12267 _NEON2SSESTORAGE int16x8_t   vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
12268 #define vdupq_n_s16 _mm_set1_epi16
12269 
12270 _NEON2SSESTORAGE int32x4_t   vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
12271 #define vdupq_n_s32 _mm_set1_epi32
12272 
12273 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
12274 #define  vdupq_n_p8 vdupq_n_u8
12275 
12276 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
12277 #define  vdupq_n_p16 vdupq_n_u16
12278 
12279 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
12280 #define vdupq_n_f32 _mm_set1_ps
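//Illustrative usage sketch: the 128-bit duplicates map directly to the SSE set1 intrinsics.
//    float32x4_t ones = vdupq_n_f32(1.0f);   //same as _mm_set1_ps(1.0f)
//    int32x4_t   twos = vdupq_n_s32(2);      //same as _mm_set1_epi32(2)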
12281 
12282 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
12283 _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
12284 {
12285     int64x1_t res;
12286     res.m64_i64[0] = value;
12287     return res;
12288 }
12289 
12290 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
12291 _NEON2SSE_INLINE uint64x1_t  vdup_n_u64(uint64_t value)
12292 {
12293     uint64x1_t res;
12294     res.m64_u64[0] = value;
12295     return res;
12296 }
12297 
12298 _NEON2SSESTORAGE int64x2_t   vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
12299 _NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
12300 {
12301     _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
12302     return LOAD_SI128(value2);
12303 }
12304 
12305 _NEON2SSESTORAGE uint64x2_t   vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
12306 _NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
12307 {
12308     _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
12309     return LOAD_SI128(val);
12310 }
12311 
12312 //****  Set all lanes to same value  ************************
12313 //Same functions as above - just aliases.********************
12314 //They probably reflect the fact that the 128-bit function versions use the VMOV instruction **********
12315 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
12316 #define vmov_n_u8 vdup_n_s8
12317 
12318 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
12319 #define vmov_n_u16 vdup_n_s16
12320 
12321 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
12322 #define vmov_n_u32 vdup_n_u32
12323 
12324 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
12325 #define vmov_n_s8 vdup_n_s8
12326 
12327 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
12328 #define vmov_n_s16 vdup_n_s16
12329 
12330 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
12331 #define vmov_n_s32 vdup_n_s32
12332 
12333 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
12334 #define vmov_n_p8 vdup_n_u8
12335 
12336 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
12337 #define vmov_n_p16 vdup_n_s16
12338 
12339 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
12340 #define vmov_n_f32 vdup_n_f32
12341 
12342 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
12343 #define vmovq_n_u8 vdupq_n_u8
12344 
12345 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
12346 #define vmovq_n_u16 vdupq_n_s16
12347 
12348 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
12349 #define vmovq_n_u32 vdupq_n_u32
12350 
12351 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
12352 #define vmovq_n_s8 vdupq_n_s8
12353 
12354 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
12355 #define vmovq_n_s16 vdupq_n_s16
12356 
12357 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
12358 #define vmovq_n_s32 vdupq_n_s32
12359 
12360 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
12361 #define vmovq_n_p8 vdupq_n_u8
12362 
12363 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
12364 #define vmovq_n_p16 vdupq_n_s16
12365 
12366 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
12367 #define vmovq_n_f32 vdupq_n_f32
12368 
12369 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
12370 #define vmov_n_s64 vdup_n_s64
12371 
12372 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
12373 #define vmov_n_u64 vdup_n_u64
12374 
12375 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
12376 #define vmovq_n_s64 vdupq_n_s64
12377 
12378 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
12379 #define vmovq_n_u64 vdupq_n_u64
12380 
12381 //**************Set all lanes to the value of one lane of a vector *************
12382 //****************************************************************************
12383 //here a shuffle is a better solution than lane extraction followed by a set1 function
12384 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12385 _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
12386 {
12387     uint8x8_t res;
12388     uint8_t valane;
12389     int i = 0;
12390     valane = vec.m64_u8[lane];
12391     for (i = 0; i<8; i++) {
12392         res.m64_u8[i] = valane;
12393     }
12394     return res;
12395 }
12396 
12397 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12398 _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
12399 {
12400     uint16x4_t res;
12401     uint16_t valane;
12402     valane = vec.m64_u16[lane];
12403     res.m64_u16[0] = valane;
12404     res.m64_u16[1] = valane;
12405     res.m64_u16[2] = valane;
12406     res.m64_u16[3] = valane;
12407     return res;
12408 }
12409 
12410 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12411 _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12412 {
12413     uint32x2_t res;
12414     res.m64_u32[0] = vec.m64_u32[lane];
12415     res.m64_u32[1] = res.m64_u32[0];
12416     return res;
12417 }
12418 
12419 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec,  __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12420 #define vdup_lane_s8 vdup_lane_u8
12421 
12422 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec,  __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12423 #define vdup_lane_s16 vdup_lane_u16
12424 
12425 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec,  __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12426 #define vdup_lane_s32 vdup_lane_u32
12427 
12428 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12429 #define vdup_lane_p8 vdup_lane_u8
12430 
12431 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12432 #define vdup_lane_p16 vdup_lane_s16
12433 
12434 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12435 _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
12436 {
12437     float32x2_t res;
12438     res.m64_f32[0] = vec.m64_f32[lane];
12439     res.m64_f32[1] = res.m64_f32[0];
12440     return res;
12441 }
12442 
12443 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12444 _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
12445 {
12446     const int8_t lane8 = (int8_t) lane;
12447     _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8};
12448     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12449 }
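//Illustrative sketch of the shuffle approach above: _mm_shuffle_epi8 with a mask whose 16 bytes
//all hold the lane index broadcasts that byte to every position. src is assumed caller-provided.
//    uint8x8_t  d = vld1_u8(src);
//    uint8x16_t q = vdupq_lane_u8(d, 3);   //all 16 bytes of q equal src[3]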
12450 
12451 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12452 _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
12453 {
12454     //we could use 8bit shuffle for 16 bit as well
12455     const int8_t lane16 = ((int8_t) lane) << 1;
12456     const int8_t lane16_1 = lane16 + 1;
12457     _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1,
12458                                                 lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1};
12459     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12460 }
12461 
12462 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12463 _NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12464 {
12465     //need to use a function, not a macro, to be gcc friendly and to meet the immediate-constant requirement of _mm_shuffle_epi32
12466     if (lane == 1)
12467         return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
12468     else
12469         return _mm_shuffle_epi32 (_pM128i(vec), 0);
12470 }
12471 
12472 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12473 #define vdupq_lane_s8 vdupq_lane_u8
12474 
12475 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12476 #define vdupq_lane_s16 vdupq_lane_u16
12477 
12478 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12479 #define vdupq_lane_s32 vdupq_lane_u32
12480 
12481 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12482 #define vdupq_lane_p8 vdupq_lane_u8
12483 
12484 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12485 #define vdupq_lane_p16 vdupq_lane_s16
12486 
12487 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12488 #define  vdupq_lane_f32(vec, lane)  _mm_load1_ps((vec.m64_f32 + lane))
12489 
12490 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12491 #define vdup_lane_s64(vec,lane) vec
12492 
12493 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12494 #define vdup_lane_u64(vec,lane) vec
12495 
12496 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12497 _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
12498 {
12499     __m128i vec128;
12500     vec128 = _pM128i(vec);
12501     return _mm_unpacklo_epi64(vec128,vec128);
12502 }
12503 
12504 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12505 #define vdupq_lane_u64 vdupq_lane_s64
12506 
12507 // ********************************************************************
12508 // ********************  Combining vectors *****************************
12509 // ********************************************************************
12510 //These intrinsics join two 64 bit vectors into a single 128bit vector.
12511 _NEON2SSESTORAGE int8x16_t   vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
12512 _NEON2SSE_INLINE int8x16_t  vcombine_s8(int8x8_t low, int8x8_t high)
12513 {
12514    return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
12515 }
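//Illustrative usage sketch: the first argument becomes the low half of the result and the second
//the high half, matching ARM NEON semantics.
//    int8x8_t  lo = vdup_n_s8(1), hi = vdup_n_s8(2);
//    int8x16_t q  = vcombine_s8(lo, hi);   //bytes 0..7 == 1, bytes 8..15 == 2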
12516 
12517 _NEON2SSESTORAGE int16x8_t   vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
12518 #define vcombine_s16 vcombine_s8
12519 
12520 _NEON2SSESTORAGE int32x4_t   vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
12521 #define vcombine_s32 vcombine_s8
12522 
12523 _NEON2SSESTORAGE int64x2_t   vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
12524 #define vcombine_s64 vcombine_s8
12525 
12526 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
12527 //current IA SIMD doesn't support float16
12528 
12529 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
12530 _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
12531 {
12532     __m128i res;
12533     res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
12534     return _M128(res);
12535 }
12536 
12537 _NEON2SSESTORAGE uint8x16_t   vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
12538 #define vcombine_u8 vcombine_s8
12539 
12540 _NEON2SSESTORAGE uint16x8_t   vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
12541 #define vcombine_u16 vcombine_s16
12542 
12543 _NEON2SSESTORAGE uint32x4_t   vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
12544 #define vcombine_u32 vcombine_s32
12545 
12546 _NEON2SSESTORAGE uint64x2_t   vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
12547 #define vcombine_u64 vcombine_s64
12548 
12549 _NEON2SSESTORAGE poly8x16_t   vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
12550 #define vcombine_p8 vcombine_u8
12551 
12552 _NEON2SSESTORAGE poly16x8_t   vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
12553 #define vcombine_p16 vcombine_u16
12554 
12555 //**********************************************************************
12556 //************************* Splitting vectors **************************
12557 //**********************************************************************
12558 //**************** Get high part ******************************************
12559 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
12560 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
12561 _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
12562 {
12563     int8x8_t res64;
12564     __m128i res;
12565     res = _mm_unpackhi_epi64(a,a); //SSE2
12566     return64(res);
12567 }
12568 
12569 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
12570 _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
12571 {
12572     int16x4_t res64;
12573     __m128i res;
12574     res =  _mm_unpackhi_epi64(a,a); //SSE2
12575     return64(res);
12576 }
12577 
12578 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
12579 _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
12580 {
12581     int32x2_t res64;
12582     __m128i res;
12583     res =  _mm_unpackhi_epi64(a,a); //SSE2
12584     return64(res);
12585 }
12586 
12587 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
12588 _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
12589 {
12590     int64x1_t res64;
12591     __m128i res;
12592     res =  _mm_unpackhi_epi64(a,a); //SSE2
12593     return64(res);
12594 }
12595 
12596 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
12597 // IA32 SIMD doesn't work with 16bit floats currently
12598 
12599 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
12600 _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
12601 {
12602     __m128i res;
12603     __m64_128 res64;
12604     res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
12605     return64(res);
12606 }
12607 
12608 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
12609 #define vget_high_u8 vget_high_s8
12610 
12611 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
12612 #define vget_high_u16 vget_high_s16
12613 
12614 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
12615 #define vget_high_u32 vget_high_s32
12616 
12617 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
12618 #define vget_high_u64 vget_high_s64
12619 
12620 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
12621 #define vget_high_p8 vget_high_u8
12622 
12623 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
12624 #define vget_high_p16 vget_high_u16
12625 
12626 //********************** Get low part **********************
12627 //**********************************************************
12628 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
12629 _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
12630 {
12631     int8x8_t res64;
12632     return64(a);
12633 }
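
//The splitting and combining intrinsics are exact inverses of each other, so a low/high split
//followed by vcombine restores the original 128-bit register. The sketch below is an illustration
//added for clarity only; the helper name is invented for the example and is not part of the NEON API.
_NEON2SSE_INLINE int8x16_t neon2sse_sketch_split_and_combine(int8x16_t v)
{
    int8x8_t lo = vget_low_s8(v);   //low 64 bits of the register
    int8x8_t hi = vget_high_s8(v);  //high 64 bits, produced by _mm_unpackhi_epi64 above
    return vcombine_s8(lo, hi);     //re-joins the halves, giving back the original vector
}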
12634 
12635 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
12636 _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
12637 {
12638     int16x4_t res64;
12639     return64(a);
12640 }
12641 
12642 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
12643 _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
12644 {
12645     int32x2_t res64;
12646     return64(a);
12647 }
12648 
12649 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
12650 _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
12651 {
12652     int64x1_t res64;
12653     return64 (a);
12654 }
12655 
12656 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
12657 // IA32 SIMD doesn't work with 16bit floats currently
12658 
12659 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
12660 _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
12661 {
12662     float32x2_t res64;
12663     _M64f(res64, a);
12664     return res64;
12665 }
12666 
12667 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
12668 #define vget_low_u8 vget_low_s8
12669 
12670 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
12671 #define vget_low_u16 vget_low_s16
12672 
12673 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
12674 #define vget_low_u32 vget_low_s32
12675 
12676 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
12677 #define vget_low_u64 vget_low_s64
12678 
12679 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
12680 #define vget_low_p8 vget_low_u8
12681 
12682 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
12683 #define vget_low_p16 vget_low_s16
12684 
12685 //**************************************************************************
12686 //************************ Converting vectors **********************************
12687 //**************************************************************************
12688 //************* Convert from float ***************************************
12689 // need to set _MM_SET_ROUNDING_MODE(x) accordingly
12690 _NEON2SSESTORAGE int32x2_t   vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
12691 _NEON2SSE_INLINE int32x2_t   vcvt_s32_f32(float32x2_t a)
12692 {
12693     int32x2_t res64;
12694     __m128i res;
12695     res =  _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
12696     return64(res);
12697 }
12698 
12699 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
12700 _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
12701 {
12702     uint32x2_t res64;
12703     __m128i res;
12704     res = vcvtq_u32_f32(_pM128(a));
12705     return64(res);
12706 }
12707 
12708 _NEON2SSESTORAGE int32x4_t  vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
12709 _NEON2SSE_INLINE int32x4_t  vcvtq_s32_f32(float32x4_t a)
12710 {
12711     __m128 dif;
12712     __m128i res;
12713     //_mm_cvttps_epi32 handles the case a >= 2.14748364e+009 incorrectly, therefore special processing is necessary
12714     _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12715     dif = _mm_cmpge_ps(a, *(__m128*)fmax);
12716     res = _mm_cvttps_epi32(a);
12717     return _mm_xor_si128(res, _M128i(dif));
12718 }
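
//A scalar sketch of the fix-up above, added for illustration only (the helper name is invented):
//for in-range inputs the comparison mask is zero and the XOR is a no-op, while for a >= 2^31
//_mm_cvttps_epi32 returns 0x80000000 and the all-ones mask flips it to 0x7fffffff, which is the
//saturated value VCVT.S32.F32 would produce on ARM.
_NEON2SSE_INLINE int32_t neon2sse_sketch_cvt_s32_overflow(float a)
{
    int32_t raw  = (a >= 2147483648.0f) ? (int32_t)0x80000000 : (int32_t)a; //what _mm_cvttps_epi32 yields per lane
    int32_t mask = (a >= 2147483648.0f) ? -1 : 0;                           //what _mm_cmpge_ps yields per lane
    return raw ^ mask;                                                      //0x7fffffff for the overflow case
}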
12719 
12720 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
12721 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
12722 {
12723     //no single-instruction SSE solution, but it could be implemented as follows:
12724     __m128i res1, res2, zero, mask;
12725     __m128  max, min, dif;
12726     _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12727     _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f };
12728     zero = _mm_setzero_si128();
12729     mask = _mm_cmpgt_epi32(_M128i(a), zero);
12730     min = _mm_and_ps(_M128(mask), a);
12731     max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped in 0 - 4.29496729+009
12732 
12733     dif = _mm_sub_ps(max, *(__m128*)fmax);
12734     mask = _mm_cmpgt_epi32(_M128i(dif),zero);
12735     dif = _mm_and_ps(_M128(mask), dif);
12736 
12737     res1 = _mm_cvttps_epi32(dif);
12738     res2 = vcvtq_s32_f32(max);
12739     return _mm_add_epi32(res1, res2);
12740 }
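
//A scalar model of the VCVT.U32.F32 behaviour emulated above, added for illustration only (the
//helper name is invented): negative inputs give 0, values above the unsigned 32-bit range
//saturate to 0xffffffff, everything else is truncated toward zero. The vector code approximates
//this by clamping the input and splitting it so that the signed converter can be reused.
_NEON2SSE_INLINE uint32_t neon2sse_ref_cvt_u32_f32(float x)
{
    if (!(x > 0.0f)) return 0;                 //negative lanes (and NaN) map to zero in this model
    if (x >= 4294967296.0f) return 0xffffffff; //clamp to the unsigned 32-bit range
    return (uint32_t)x;                        //C truncates toward zero, like VCVT
}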
12741 
12742 // ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
12743 //*************************************************************************************************
12744 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
12745 _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
12746 {
12747     int32x2_t res64;
12748     return64(vcvtq_n_s32_f32(_pM128(a),b));
12749 }
12750 
12751 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
12752 _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
12753 {
12754     uint32x2_t res;
12755     float convconst;
12756     convconst = (float)((uint32_t)1 << b);
12757     res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
12758     res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
12759     return res;
12760 }
12761 
12762 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
12763 _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
12764 {
12765     float convconst;
12766     _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
12767     __m128 cconst128;
12768     __m128i mask, res;
12769     convconst = (float)((uint32_t)1 << b);
12770     cconst128 = vdupq_n_f32(convconst);
12771     res =  _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
12772     mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
12773     return _mm_xor_si128 (res,  mask); //res saturated for 0x80000000
12774 }
12775 
12776 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
12777 _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
12778 {
12779     float convconst;
12780     __m128 cconst128;
12781     convconst = (float)(1 << b);
12782     cconst128 = vdupq_n_f32(convconst);
12783     return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
12784 }
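
//Usage sketch for the fixed-point converters above, added for illustration only (the helper name
//is invented): converting to fixed point with b fraction bits is just a multiply by 2^b followed
//by a truncating conversion, e.g. with b = 16 the value 1.5f becomes 0x00018000 (1.5 * 65536).
_NEON2SSE_INLINE int32x4_t neon2sse_sketch_float_to_q16_16(float32x4_t v)
{
    return vcvtq_n_s32_f32(v, 16); //16 fraction bits, i.e. "16.16" fixed point
}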
12785 
12786 
12787 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
12788 _NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
12789 {
12790   return _mm_cvtps_epi32(a);
12791 }
12792 
12793 //***************** Convert to float *************************
12794 //*************************************************************
12795 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
12796 _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
12797 {
12798     float32x2_t res;
12799     res.m64_f32[0] = (float) a.m64_i32[0];
12800     res.m64_f32[1] = (float) a.m64_i32[1];
12801     return res;
12802 }
12803 
12804 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
12805 _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
12806 {
12807     float32x2_t res;
12808     res.m64_f32[0] = (float) a.m64_u32[0];
12809     res.m64_f32[1] = (float) a.m64_u32[1];
12810     return res;
12811 }
12812 
12813 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
12814 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
12815 
12816 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
12817 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
12818 {
12819     //solution may not be optimal
12820     __m128 two16, fHi, fLo;
12821     __m128i hi, lo;
12822     two16 = _mm_set1_ps((float)0x10000); //2^16
12823     // Avoid double rounding by doing two exact conversions
12824     // of high and low 16-bit segments
12825     hi = _mm_srli_epi32(a, 16);
12826     lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
12827     fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
12828     fLo = _mm_cvtepi32_ps(lo);
12829     // do single rounding according to current rounding mode
12830     return _mm_add_ps(fHi, fLo);
12831 }
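
//Scalar sketch of the split used above, added for illustration only (the helper name is
//invented): an unsigned 32-bit value x is rebuilt as (x >> 16) * 2^16 + (x & 0xffff). Both halves
//are below 2^16 and therefore convert to float exactly, so the only rounding happens in the final
//add, which follows the current rounding mode just like the vector version.
_NEON2SSE_INLINE float neon2sse_sketch_u32_to_f32(uint32_t x)
{
    float fhi = (float)(x >> 16);    //exact conversion, value < 2^16
    float flo = (float)(x & 0xffff); //exact conversion, value < 2^16
    return fhi * 65536.0f + flo;     //single rounding happens here
}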
12832 
12833 // ***** Convert to the float from fixed point  with   the number of fraction bits specified by b ***********
12834 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
12835 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
12836 {
12837     float32x2_t res;
12838     float convconst;
12839     convconst = (float)(1. / ((uint32_t)1 << b));
12840     res.m64_f32[0] =  a.m64_i32[0] * convconst;
12841     res.m64_f32[1] = a.m64_i32[1] * convconst;
12842     return res;
12843 }
12844 
12845 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
12846 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
12847 {
12848     float32x2_t res;
12849     float convconst;
12850     convconst = (float)(1. / ((uint32_t)1 << b));
12851     res.m64_f32[0] =  a.m64_u32[0] * convconst;
12852     res.m64_f32[1] = a.m64_u32[1] * convconst;
12853     return res;
12854 }
12855 
12856 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
12857 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
12858 {
12859     float convconst;
12860     __m128 cconst128, af;
12861     convconst = (float)(1. / ((uint32_t)1 << b));
12862     af = _mm_cvtepi32_ps(a);
12863     cconst128 = vdupq_n_f32(convconst);
12864     return _mm_mul_ps(af,cconst128);
12865 }
12866 
12867 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
12868 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
12869 {
12870     float convconst;
12871     __m128 cconst128, af;
12872     convconst = (float)(1. / ((uint32_t)1 << b));
12873     af = vcvtq_f32_u32(a);
12874     cconst128 = vdupq_n_f32(convconst);
12875     return _mm_mul_ps(af,cconst128);
12876 }
12877 
12878 //**************Convert between floats ***********************
12879 //************************************************************
12880 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
12881 //Intel SIMD doesn't support 16-bit floats currently
12882 
12883 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
12884 //Intel SIMD doesn't support 16-bit floats currently, the only solution is to store the 16-bit floats and load them as 32-bit ones
12885 
12886 //************Vector narrow integer conversion (truncation) ******************
12887 //****************************************************************************
12888 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
12889 _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
12890 {
12891     int8x8_t res64;
12892     __m128i res;
12893     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
12894     return64(res);
12895 }
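
//Scalar model of VMOVN for one lane, added for illustration only (the helper name is invented):
//the narrowing move simply keeps the low half of each element with no saturation, which is what
//the even-byte shuffle above extracts; compare with the saturating vqmovn_* versions further below.
_NEON2SSE_INLINE uint8_t neon2sse_ref_movn_16(uint16_t x)
{
    return (uint8_t)(x & 0xff); //low byte only, the high byte is discarded
}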
12896 
12897 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
12898 _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
12899 {
12900     int16x4_t res64;
12901     __m128i res;
12902     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
12903     return64(res);
12904 }
12905 
12906 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
12907 _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
12908 {
12909     //may be less efficient than a serial implementation
12910     int32x2_t res64;
12911     __m128i res;
12912     res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
12913     return64(res);
12914 }
12915 
12916 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
12917 #define vmovn_u16 vmovn_s16
12918 
12919 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
12920 #define vmovn_u32 vmovn_s32
12921 
12922 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
12923 #define vmovn_u64 vmovn_s64
12924 
12925 //**************** Vector long move   ***********************
12926 //***********************************************************
12927 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
12928 _NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
12929 {
12930     return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
12931 }
12932 
12933 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
12934 _NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
12935 {
12936     return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
12937 }
12938 
12939 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
12940 _NEON2SSE_INLINE int64x2_t  vmovl_s32(int32x2_t a)
12941 {
12942     return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
12943 }
12944 
12945 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
12946 _NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
12947 {
12948     return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
12949 }
12950 
12951 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
12952 _NEON2SSE_INLINE uint32x4_t  vmovl_u16(uint16x4_t a)
12953 {
12954     return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
12955 }
12956 
12957 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
12958 _NEON2SSE_INLINE uint64x2_t  vmovl_u32(uint32x2_t a)
12959 {
12960     return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
12961 }
12962 
12963 //*************Vector saturating narrow integer*****************
12964 //**************************************************************
12965 _NEON2SSESTORAGE int8x8_t   vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
12966 _NEON2SSE_INLINE int8x8_t   vqmovn_s16(int16x8_t a)
12967 {
12968     int8x8_t res64;
12969     __m128i res;
12970     res = _mm_packs_epi16(a, a);
12971     return64(res);
12972 }
12973 
12974 _NEON2SSESTORAGE int16x4_t   vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
12975 _NEON2SSE_INLINE int16x4_t   vqmovn_s32(int32x4_t a)
12976 {
12977     int16x4_t res64;
12978     __m128i res;
12979     res = _mm_packs_epi32(a, a);
12980     return64(res);
12981 }
12982 
12983 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
12984 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
12985 {
12986     int32x2_t res;
12987     _NEON2SSE_ALIGN_16 int64_t atmp[2];
12988     _mm_store_si128((__m128i*)atmp, a);
12989     if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
12990     if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
12991     if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
12992     if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
12993     res.m64_i32[0] = (int32_t)atmp[0];
12994     res.m64_i32[1] = (int32_t)atmp[1];
12995     return res;
12996 }
12997 
12998 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
12999 _NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
13000 {
13001     //no uint16 to uint8 saturating conversion in SSE, need to truncate to the max signed value first. Also trying to avoid _mm_shuffle_epi8 because of its big latency on old Atom CPUs
13002     uint8x8_t res64;
13003     __m128i c7fff, a_trunc, mask_trunc;
13004     c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
13005     a_trunc =  _mm_and_si128(a,  c7fff); // a truncated to max signed
13006     mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares bigger than the original one then the 15th bit had been set initially
13007     mask_trunc =  _mm_and_si128(mask_trunc,  c7fff);  //zero or c7fff if the 15th bit had been set initially
13008     a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
13009     a_trunc =  _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
13010     return64(a_trunc);
13011 }
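
//Scalar model of VQMOVN.U16 for one lane, added for illustration only (the helper name is
//invented): values above 0xff saturate to 0xff. _mm_packus_epi16 treats its input as signed, so
//inputs with the 15th bit set would otherwise look negative and be packed to 0; the
//truncate-and-restore trick above turns them into 0x7fff so that they still saturate to 0xff.
_NEON2SSE_INLINE uint8_t neon2sse_ref_qmovn_u16(uint16_t x)
{
    return (x > 0xff) ? 0xff : (uint8_t)x; //unsigned saturating narrow
}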
13012 
13013 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
13014 _NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
13015 {
13016      #ifdef USE_SSE4
13017         //no uint32 to uint16 saturating conversion in SSE, need to truncate to the max signed value first
13018         uint16x4_t res64;
13019         __m128i c7fffffff, a_trunc, mask_trunc;
13020         c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31st bit set to zero
13021         a_trunc =  _mm_and_si128(a,  c7fffffff); // a truncated to max signed
13022         mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares bigger than the original one then the 31st bit had been set initially
13023         mask_trunc =  _mm_and_si128(mask_trunc,  c7fffffff);  //nonzero only where the 31st bit had been set initially
13024         a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
13025         a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
13026         return64(a_trunc);
13027     #else
13028         uint16x4_t res64;
13029        __m128i res_hi, mask;
13030         mask = _mm_setzero_si128();
13031         res_hi = _mm_srli_epi32(a, 16);
13032         res_hi = _mm_cmpeq_epi16(res_hi, mask);
13033         mask = _mm_cmpeq_epi16(mask,mask); //all fff
13034         mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the mask of numbers that do not fit into 16 bits
13035         res_hi = _mm_or_si128(a, mask); //saturated res
13036         res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
13037         return64(res_hi);
13038     #endif
13039 }
13040 
13041 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
13042 _NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
13043 {
13044     //serial solution may be faster
13045     uint32x2_t res64;
13046     __m128i res_hi, mask;
13047     mask = _mm_setzero_si128();
13048     res_hi = _mm_srli_epi64(a, 32);
13049     res_hi = _mm_cmpeq_epi32(res_hi, mask);
13050     mask = _mm_cmpeq_epi32(mask,mask); //all fff
13051     mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the mask of numbers that do not fit into 32 bits
13052     res_hi = _mm_or_si128(a, mask);
13053     res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13054     return64(res_hi);
13055 }
13056 //************* Vector saturating narrow integer signed->unsigned **************
13057 //*****************************************************************************
13058 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
13059 _NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
13060 {
13061     uint8x8_t res64;
13062     __m128i res;
13063     res = _mm_packus_epi16(a, a); //use low 64bits only
13064     return64(res);
13065 }
13066 
13067 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
13068 _NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
13069 {
13070     uint16x4_t res64;
13071     __m128i res;
13072     res = _MM_PACKUS1_EPI32(a); //use low 64bits only
13073     return64(res);
13074 }
13075 
13076 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
13077 _NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
13078 {
13079     uint32x2_t res64;
13080     __m128i res_hi,res_lo, zero, cmp;
13081     zero = _mm_setzero_si128();
13082     res_hi = _mm_srli_epi64(a,  32);
13083     cmp = _mm_cmpgt_epi32(zero, res_hi); //if the high part is negative the result should be zero
13084     res_lo = _mm_andnot_si128(cmp,a); //keep a where the high part is non-negative, otherwise zero the result
13085     cmp = _mm_cmpgt_epi32(res_hi,zero); //if the high part is positive
13086     res_lo =  _mm_or_si128(res_lo, cmp); //if the high part is positive the value does not fit into 32 bits, need to saturate to 0xffffffff
13087     res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13088     return64(res_lo);
13089 }
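
//Scalar model of VQMOVUN (signed to unsigned saturating narrow) for one lane, added for
//illustration only (the helper name is invented): negative inputs clamp to 0 and inputs above the
//unsigned 32-bit range clamp to 0xffffffff, which is what the sign test and the OR with the
//"positive high part" mask implement above.
_NEON2SSE_INLINE uint32_t neon2sse_ref_qmovun_s64(int64_t x)
{
    if (x < 0) return 0;                            //negative values saturate to zero
    if (x > (int64_t)0xffffffff) return 0xffffffff; //too big for 32 bits, saturate to the max
    return (uint32_t)x;                             //fits, keep the low 32 bits
}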
13090 
13091 // ********************************************************
13092 // **************** Table look up **************************
13093 // ********************************************************
13094 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
13095 //in a table and generate a new vector. Indexes out of range return 0.
13096 //for Intel SIMD we need to set the MSB to 1 for zero return
13097 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13098 _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
13099 {
13100     uint8x8_t res64;
13101     __m128i c7, maskgt, bmask, b128;
13102     c7 = _mm_set1_epi8 (7);
13103     b128 = _pM128i(b);
13104     maskgt = _mm_cmpgt_epi8(b128,c7);
13105     bmask = _mm_or_si128(b128,maskgt);
13106     bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
13107     return64(bmask);
13108 }
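
//Scalar model of VTBL1 for one result byte, added for illustration only (the helper name is
//invented): indexes 0..7 pick a byte from the table and any larger index yields 0.
//_mm_shuffle_epi8 zeroes a result byte whenever the index MSB is set, so OR-ing the out-of-range
//mask into the indexes above reproduces the NEON "return 0" behaviour.
_NEON2SSE_INLINE uint8_t neon2sse_ref_vtbl1(const uint8_t table[8], uint8_t index)
{
    return (index < 8) ? table[index] : 0; //out-of-range lookups return zero
}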
13109 
13110 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a,  int8x8_t b); // VTBL.8 d0, {d0}, d0
13111 #define vtbl1_s8 vtbl1_u8
13112 
13113 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13114 #define vtbl1_p8 vtbl1_u8
13115 
13116 _NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13117 _NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
13118 {
13119     uint8x8_t res64;
13120     __m128i c15, a01, maskgt15, bmask, b128;
13121     c15 = _mm_set1_epi8 (15);
13122     b128 = _pM128i(b);
13123     maskgt15 = _mm_cmpgt_epi8(b128,c15);
13124     bmask = _mm_or_si128(b128, maskgt15);
13125     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
13126     a01 =  _mm_shuffle_epi8(a01, bmask);
13127     return64(a01);
13128 }
13129 
13130 //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13131 #define vtbl2_s8 vtbl2_u8
13132 
13133 //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13134 #define vtbl2_p8 vtbl2_u8
13135 
13136 _NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13137 _NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
13138 {
13139     //solution may not be optimal
13140     uint8x8_t res64;
13141     __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
13142     c15 = _mm_set1_epi8 (15);
13143     c23 = _mm_set1_epi8 (23);
13144     b128 = _pM128i(b);
13145     maskgt23 = _mm_cmpgt_epi8(b128,c23);
13146     bmask = _mm_or_si128(b128, maskgt23);
13147     maskgt15 = _mm_cmpgt_epi8(b128,c15);
13148     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13149     sh0 =  _mm_shuffle_epi8(a01, bmask);
13150     sh1 =  _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 the shuffle itself wraps the index modulo 16 (bi-=16)
13151     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
13152     return64(sh0);
13153 }
13154 
13155 _NEON2SSESTORAGE int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13156 #define vtbl3_s8 vtbl3_u8
13157 
13158 _NEON2SSESTORAGE poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13159 #define vtbl3_p8 vtbl3_u8
13160 
13161 _NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13162 _NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
13163 {
13164     //solution may not be optimal
13165     uint8x8_t res64;
13166     __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
13167     c15 = _mm_set1_epi8 (15);
13168     c31 = _mm_set1_epi8 (31);
13169     b128 = _pM128i(b);
13170     maskgt31 = _mm_cmpgt_epi8(b128,c31);
13171     bmask = _mm_or_si128(b128, maskgt31);
13172     maskgt15 = _mm_cmpgt_epi8(b128,c15);
13173     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13174     a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
13175     sh0 =  _mm_shuffle_epi8(a01, bmask);
13176     sh1 =  _mm_shuffle_epi8(a23, bmask); //for bi>15 the shuffle itself wraps the index modulo 16 (bi-=16)
13177     sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
13178     return64(sh0);
13179 }
13180 
13181 _NEON2SSESTORAGE int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13182 #define vtbl4_s8 vtbl4_u8
13183 
13184 _NEON2SSESTORAGE poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13185 #define vtbl4_p8 vtbl4_u8
13186 
13187 //****************** Extended table look up intrinsics ***************************
13188 //**********************************************************************************
13189 //VTBX (Vector Table Extension) works in the same way as VTBL do,
13190 // except that indexes out of range leave the destination element unchanged.
13191 
13192 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13193 _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
13194 {
13195     uint8x8_t res64;
13196     __m128i c7, maskgt, sh, c128;
13197     c7 = _mm_set1_epi8 (7);
13198     c128 = _pM128i(c);
13199     maskgt = _mm_cmpgt_epi8(c128,c7);
13200     c7 = _mm_and_si128(maskgt,_pM128i(a));
13201     sh = _mm_shuffle_epi8(_pM128i(b),c128);
13202     sh = _mm_andnot_si128(maskgt,sh);
13203     sh =  _mm_or_si128(sh,c7);
13204     return64(sh);
13205 }
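
//Scalar model of VTBX1 for one result byte, added for illustration only (the helper name is
//invented): unlike VTBL, an out-of-range index leaves the destination byte unchanged. The
//andnot/or sequence above selects between the shuffled table byte and the original byte of a
//using the "index > 7" mask.
_NEON2SSE_INLINE uint8_t neon2sse_ref_vtbx1(uint8_t dst, const uint8_t table[8], uint8_t index)
{
    return (index < 8) ? table[index] : dst; //out-of-range lookups keep the destination byte
}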
13206 
13207 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
13208 #define vtbx1_s8 vtbx1_u8
13209 
13210 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13211 #define vtbx1_p8 vtbx1_u8
13212 
13213 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13214 _NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
13215 {
13216     uint8x8_t res64;
13217     __m128i c15, b01, maskgt15, sh, c128;
13218     c15 = _mm_set1_epi8 (15);
13219     c128 = _pM128i(c);
13220     maskgt15 = _mm_cmpgt_epi8(c128, c15);
13221     c15 = _mm_and_si128(maskgt15, _pM128i(a));
13222     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
13223     sh =  _mm_shuffle_epi8(b01, c128);
13224     sh = _mm_andnot_si128(maskgt15, sh);
13225     sh =  _mm_or_si128(sh,c15);
13226     return64(sh);
13227 }
13228 
13229 //int8x8_t vtbx2_s8(int8x8_t a,  int8x8x2_t b, int8x8_t c);  // VTBX.8 d0, {d0, d1}, d0
13230 #define vtbx2_s8 vtbx2_u8
13231 
13232 //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13233 #define vtbx2_p8 vtbx2_u8
13234 
13235 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13236 _NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
13237 {
13238     //solution may not be optimal
13239     uint8x8_t res64;
13240     __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
13241     c15 = _mm_set1_epi8 (15);
13242     c23 = _mm_set1_epi8 (23);
13243     c128 = _pM128i(c);
13244     maskgt15 = _mm_cmpgt_epi8(c128,c15);
13245     maskgt23 = _mm_cmpgt_epi8(c128,c23);
13246     c23 = _mm_and_si128(maskgt23, _pM128i(a));
13247     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13248     sh0 =  _mm_shuffle_epi8(b01, c128);
13249     sh1 =  _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 the shuffle itself wraps the index modulo 16 (bi-=16)
13250     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13251     sh0 = _mm_andnot_si128(maskgt23,sh0);
13252     sh0 = _mm_or_si128(sh0,c23);
13253     return64(sh0);
13254 }
13255 
13256 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13257 #define vtbx3_s8 vtbx3_u8
13258 
13259 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13260 #define vtbx3_p8 vtbx3_u8
13261 
13262 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13263 _NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
13264 {
13265     //solution may not be optimal
13266     uint8x8_t res64;
13267     __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
13268     c15 = _mm_set1_epi8 (15);
13269     c31 = _mm_set1_epi8 (31);
13270     c128 = _pM128i(c);
13271     maskgt15 = _mm_cmpgt_epi8(c128,c15);
13272     maskgt31 = _mm_cmpgt_epi8(c128,c31);
13273     c31 = _mm_and_si128(maskgt31, _pM128i(a));
13274 
13275     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13276     b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
13277     sh0 =  _mm_shuffle_epi8(b01, c128);
13278     sh1 =  _mm_shuffle_epi8(b23, c128); //for bi>15 the shuffle itself wraps the index modulo 16 (bi-=16)
13279     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13280     sh0 = _mm_andnot_si128(maskgt31,sh0);
13281     sh0 =  _mm_or_si128(sh0,c31);
13282     return64(sh0);
13283 }
13284 
13285 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13286 #define vtbx4_s8 vtbx4_u8
13287 
13288 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13289 #define vtbx4_p8 vtbx4_u8
13290 
13291 //*************************************************************************************************
13292 // *************************** Operations with a scalar value *********************************
13293 //*************************************************************************************************
13294 
13295 //******* Vector multiply accumulate by scalar *************************************************
13296 //**********************************************************************************************
13297 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13298 _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
13299 {
13300     int16_t c;
13301     int16x4_t scalar;
13302     c = vget_lane_s16(v, l);
13303     scalar = vdup_n_s16(c);
13304     return vmla_s16(a, b, scalar);
13305 }
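
//Scalar model of the lane multiply-accumulate pattern above, added for illustration only (the
//helper name is invented): every lane of the result is a[i] + b[i] * v[l], i.e. one lane of v is
//broadcast and used as the multiplier, which is exactly what the vget_lane/vdup/vmla sequence does.
_NEON2SSE_INLINE void neon2sse_ref_mla_lane_s16(int16_t a[4], const int16_t b[4], const int16_t v[4], int l)
{
    int i;
    for (i = 0; i < 4; i++) a[i] = (int16_t)(a[i] + b[i] * v[l]); //wraps modulo 2^16, like VMLA.I16
}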
13306 
13307 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13308 _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
13309 {
13310     int32_t c;
13311     int32x2_t scalar;
13312     c = vget_lane_s32(v, l);
13313     scalar = vdup_n_s32(c);
13314     return vmla_s32(a, b, scalar);
13315 }
13316 
13317 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13318 #define vmla_lane_u16 vmla_lane_s16
13319 
13320 
13321 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13322 #define vmla_lane_u32 vmla_lane_s32
13323 
13324 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
13325 _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13326 {
13327     float32_t vlane;
13328     float32x2_t c;
13329     vlane = vget_lane_f32(v, l);
13330     c = vdup_n_f32(vlane);
13331     return vmla_f32(a,b,c);
13332 }
13333 
13334 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13335 _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
13336 {
13337     int16_t vlane;
13338     int16x8_t c;
13339     vlane = vget_lane_s16(v, l);
13340     c = vdupq_n_s16(vlane);
13341     return vmlaq_s16(a,b,c);
13342 }
13343 
13344 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13345 _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
13346 {
13347     int32_t vlane;
13348     int32x4_t c;
13349     vlane = vget_lane_s32(v, l);
13350     c = vdupq_n_s32(vlane);
13351     return vmlaq_s32(a,b,c);
13352 }
13353 
13354 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13355 #define vmlaq_lane_u16 vmlaq_lane_s16
13356 
13357 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13358 #define vmlaq_lane_u32 vmlaq_lane_s32
13359 
13360 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
13361 _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
13362 {
13363     float32_t vlane;
13364     float32x4_t c;
13365     vlane = vget_lane_f32(v, l);
13366     c = vdupq_n_f32(vlane);
13367     return vmlaq_f32(a,b,c);
13368 }
13369 
13370 //***************** Vector widening multiply accumulate by scalar **********************
13371 //***************************************************************************************
13372 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
13373 _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
13374 {
13375     int16_t vlane;
13376     int16x4_t c;
13377     vlane = vget_lane_s16(v, l);
13378     c = vdup_n_s16(vlane);
13379     return vmlal_s16(a, b, c);
13380 }
13381 
13382 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
13383 _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
13384 {
13385     int32_t vlane;
13386     int32x2_t c;
13387     vlane = vget_lane_s32(v, l);
13388     c = vdup_n_s32(vlane);
13389     return vmlal_s32(a, b, c);
13390 }
13391 
13392 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
13393 _NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
13394 {
13395     uint16_t vlane;
13396     uint16x4_t c;
13397     vlane = vget_lane_u16(v, l);
13398     c = vdup_n_u16(vlane);
13399     return vmlal_u16(a, b, c);
13400 }
13401 
13402 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
13403 _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
13404 {
13405     uint32_t vlane;
13406     uint32x2_t c;
13407     vlane = vget_lane_u32(v, l);
13408     c = vdup_n_u32(vlane);
13409     return vmlal_u32(a, b, c);
13410 }
13411 
13412 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
13413 // ************************************************************************************************
13414 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
13415 _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13416 {
13417     int16_t vlane;
13418     int16x4_t c;
13419     vlane = vget_lane_s16(v, l);
13420     c = vdup_n_s16(vlane);
13421     return vqdmlal_s16(a, b, c);
13422 }
13423 
13424 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
13425 _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
13426 {
13427     int32_t vlane;
13428     int32x2_t c;
13429     vlane = vget_lane_s32(v, l);
13430     c = vdup_n_s32(vlane);
13431     return vqdmlal_s32(a, b, c);
13432 }
13433 
13434 // ****** Vector multiply subtract by scalar *****************
13435 // *************************************************************
13436 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13437 _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13438 {
13439     int16_t vlane;
13440     int16x4_t c;
13441     vlane = vget_lane_s16(v, l);
13442     c = vdup_n_s16(vlane);
13443     return vmls_s16(a, b, c);
13444 }
13445 
13446 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13447 _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13448 {
13449     int32_t vlane;
13450     int32x2_t c;
13451     vlane = vget_lane_s32(v, l);
13452     c = vdup_n_s32(vlane);
13453     return vmls_s32(a, b, c);
13454 }
13455 
13456 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13457 _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13458 {
13459     uint16_t vlane;
13460     uint16x4_t c;
13461     vlane = vget_lane_s16(v, l);
13462     c = vdup_n_s16(vlane);
13463     return vmls_s16(a, b, c);
13464 }
13465 
13466 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13467 _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13468 {
13469     uint32_t vlane;
13470     uint32x2_t c;
13471     vlane = vget_lane_u32(v, l);
13472     c = vdup_n_u32(vlane);
13473     return vmls_u32(a, b, c);
13474 }
13475 
13476 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
13477 _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13478 {
13479     float32_t vlane;
13480     float32x2_t c;
13481     vlane = (float) vget_lane_f32(v, l);
13482     c = vdup_n_f32(vlane);
13483     return vmls_f32(a,b,c);
13484 }
13485 
13486 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13487 _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13488 {
13489     int16_t vlane;
13490     int16x8_t c;
13491     vlane = vget_lane_s16(v, l);
13492     c = vdupq_n_s16(vlane);
13493     return vmlsq_s16(a, b,c);
13494 }
13495 
13496 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13497 _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13498 {
13499     int32_t vlane;
13500     int32x4_t c;
13501     vlane = vget_lane_s32(v, l);
13502     c = vdupq_n_s32(vlane);
13503     return vmlsq_s32(a,b,c);
13504 }
13505 
13506 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13507 _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13508 {
13509     uint16_t vlane;
13510     uint16x8_t c;
13511     vlane = vget_lane_u16(v, l);
13512     c = vdupq_n_u16(vlane);
13513     return vmlsq_u16(a,b,c);
13514 }
13515 
13516 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13517 _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13518 {
13519     uint32_t vlane;
13520     uint32x4_t c;
13521     vlane = vget_lane_u32(v, l);
13522     c = vdupq_n_u32(vlane);
13523     return vmlsq_u32(a,b,c);
13524 }
13525 
13526 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
13527 _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
13528 {
13529     float32_t vlane;
13530     float32x4_t c;
13531     vlane = (float) vget_lane_f32(v, l);
13532     c = vdupq_n_f32(vlane);
13533     return vmlsq_f32(a,b,c);
13534 }
13535 
13536 // **** Vector widening multiply subtract by scalar ****
13537 // ****************************************************
13538 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
13539 _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
13540 {
13541     int16_t vlane;
13542     int16x4_t c;
13543     vlane = vget_lane_s16(v, l);
13544     c = vdup_n_s16(vlane);
13545     return vmlsl_s16(a, b, c);
13546 }
13547 
13548 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
13549 _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
13550 {
13551     int32_t vlane;
13552     int32x2_t c;
13553     vlane = vget_lane_s32(v, l);
13554     c = vdup_n_s32(vlane);
13555     return vmlsl_s32(a, b, c);
13556 }
13557 
13558 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.s16 q0, d0, d0[0]
13559 _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.s16 q0, d0, d0[0]
13560 {
13561     uint16_t vlane;
13562     uint16x4_t c;
13563     vlane = vget_lane_s16(v, l);
13564     c = vdup_n_s16(vlane);
13565     return vmlsl_s16(a, b, c);
13566 }
13567 
13568 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
13569 _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
13570 {
13571     uint32_t vlane;
13572     uint32x2_t c;
13573     vlane = vget_lane_u32(v, l);
13574     c = vdup_n_u32(vlane);
13575     return vmlsl_u32(a, b, c);
13576 }
13577 
13578 //********* Vector widening saturating doubling multiply subtract by scalar **************************
13579 //******************************************************************************************************
13580 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
13581 _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13582 {
13583     int16_t vlane;
13584     int16x4_t c;
13585     vlane = vget_lane_s16(v, l);
13586     c = vdup_n_s16(vlane);
13587     return vqdmlsl_s16(a, b, c);
13588 }
13589 
13590 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
13591 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
13592 {
13593     int32_t vlane;
13594     int32x2_t c;
13595     vlane = vget_lane_s32(v, l);
13596     c = vdup_n_s32(vlane);
13597     return vqdmlsl_s32(a, b, c);
13598 }
13599 //********** Vector multiply with scalar *****************************
13600 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
13601 _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
13602 {
13603     int16x4_t b16x4;
13604     b16x4 = vdup_n_s16(b);
13605     return vmul_s16(a, b16x4);
13606 }
13607 
13608 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
13609 _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
13610 {
13611     //serial solution looks faster
13612     int32x2_t b32x2;
13613     b32x2 = vdup_n_s32(b);
13614     return vmul_s32(a, b32x2);
13615 }
13616 
13617 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
13618 _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
13619 {
13620     float32x2_t b32x2;
13621     b32x2 = vdup_n_f32(b);
13622     return vmul_f32(a, b32x2);
13623 }
13624 
13625 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
13626 _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
13627 {
13628     uint16x4_t b16x4;
13629     b16x4 = vdup_n_s16(b);
13630     return vmul_s16(a, b16x4);
13631 }
13632 
13633 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
13634 _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
13635 {
13636     //serial solution looks faster
13637     uint32x2_t b32x2;
13638     b32x2 = vdup_n_u32(b);
13639     return vmul_u32(a, b32x2);
13640 }
13641 
13642 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
13643 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
13644 {
13645     int16x8_t b16x8;
13646     b16x8 = vdupq_n_s16(b);
13647     return vmulq_s16(a, b16x8);
13648 }
13649 
13650 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
13651 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
13652 {
13653     int32x4_t b32x4;
13654     b32x4 = vdupq_n_s32(b);
13655     return vmulq_s32(a, b32x4);
13656 }
13657 
13658 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
13659 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
13660 {
13661     float32x4_t b32x4;
13662     b32x4 = vdupq_n_f32(b);
13663     return vmulq_f32(a, b32x4);
13664 }
13665 
13666 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
13667 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
13668 {
13669     uint16x8_t b16x8;
13670     b16x8 = vdupq_n_s16(b);
13671     return vmulq_s16(a, b16x8);
13672 }
13673 
13674 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
13675 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
13676 {
13677     uint32x4_t b32x4;
13678     b32x4 = vdupq_n_u32(b);
13679     return vmulq_u32(a, b32x4);
13680 }
13681 
13682 //********** Vector multiply lane *****************************
13683 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
13684 _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
13685 {
13686     int16x4_t b16x4;
13687     int16_t vlane;
13688     vlane = vget_lane_s16(b, c);
13689     b16x4 = vdup_n_s16(vlane);
13690     return vmul_s16(a, b16x4);
13691 }
13692 
13693 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
13694 _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
13695 {
13696     int32x2_t b32x2;
13697     int32_t vlane;
13698     vlane = vget_lane_s32(b, c);
13699     b32x2 = vdup_n_s32(vlane);
13700     return vmul_s32(a, b32x2);
13701 }
13702 
13703 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
13704 _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
13705 {
13706     float32x2_t b32x2;
13707     float32_t vlane;
13708     vlane = vget_lane_f32(b, c);
13709     b32x2 = vdup_n_f32(vlane);
13710     return vmul_f32(a, b32x2);
13711 }
13712 
13713 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
13714 #define vmul_lane_u16 vmul_lane_s16
13715 
13716 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
13717 #define vmul_lane_u32 vmul_lane_s32
13718 
13719 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
13720 _NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
13721 {
13722     int16x8_t b16x8;
13723     int16_t vlane;
13724     vlane = vget_lane_s16(b, c);
13725     b16x8 = vdupq_n_s16(vlane);
13726     return vmulq_s16(a, b16x8);
13727 }
13728 
13729 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
13730 _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
13731 {
13732     int32x4_t b32x4;
13733     int32_t vlane;
13734     vlane = vget_lane_s32(b, c);
13735     b32x4 = vdupq_n_s32(vlane);
13736     return vmulq_s32(a, b32x4);
13737 }
13738 
13739 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
13740 _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
13741 {
13742     float32x4_t b32x4;
13743     float32_t vlane;
13744     vlane = vget_lane_f32(b, c);
13745     b32x4 = vdupq_n_f32(vlane);
13746     return vmulq_f32(a, b32x4);
13747 }
13748 
13749 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
13750 #define vmulq_lane_u16 vmulq_lane_s16
13751 
13752 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
13753 #define vmulq_lane_u32 vmulq_lane_s32
13754 
13755 //**** Vector long multiply with scalar ************
13756 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
13757 _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
13758 {
13759     int16x4_t b16x4;
13760     b16x4 = vdup_n_s16(val2);
13761     return vmull_s16(vec1, b16x4);
13762 }
13763 
13764 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
13765 _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
13766 {
13767     int32x2_t b32x2;
13768     b32x2 = vdup_n_s32(val2);
13769     return vmull_s32(vec1, b32x2);
13770 }
13771 
13772 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
13773 _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
13774 {
13775     uint16x4_t b16x4;
13776     b16x4 = vdup_n_s16(val2);
13777     return vmull_s16(vec1, b16x4);
13778 }
13779 
13780 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
13781 _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
13782 {
13783     uint32x2_t b32x2;
13784     b32x2 = vdup_n_u32(val2);
13785     return vmull_u32(vec1, b32x2);
13786 }
13787 
13788 //**** Vector long multiply by scalar ****
13789 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
13790 _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
13791 {
13792     int16_t vlane;
13793     int16x4_t b;
13794     vlane = vget_lane_s16(val2, val3);
13795     b = vdup_n_s16(vlane);
13796     return vmull_s16(vec1, b);
13797 }
13798 
13799 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
13800 _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
13801 {
13802     int32_t vlane;
13803     int32x2_t b;
13804     vlane = vget_lane_s32(val2, val3);
13805     b = vdup_n_s32(vlane);
13806     return vmull_s32(vec1, b);
13807 }
13808 
13809 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
13810 _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
13811 {
13812     uint16_t vlane;
13813     uint16x4_t b;
13814     vlane = vget_lane_s16(val2, val3);
13815     b = vdup_n_s16(vlane);
13816     return vmull_s16(vec1, b);
13817 }
13818 
13819 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
13820 _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
13821 {
13822     uint32_t vlane;
13823     uint32x2_t b;
13824     vlane = vget_lane_u32(val2, val3);
13825     b = vdup_n_u32(vlane);
13826     return vmull_u32(vec1, b);
13827 }
13828 
13829 //********* Vector saturating doubling long multiply with scalar  *******************
13830 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
13831 _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
13832 {
13833     //the serial solution may be faster due to saturation
13834     int16x4_t b;
13835     b = vdup_n_s16(val2);
13836     return vqdmull_s16(vec1, b);
13837 }
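//A worked example (illustrative, not from the original comments): vqdmull doubles the widened product and
//saturates it, so vqdmull_n_s16 with both operands equal to -32768 would produce 2*(-32768)*(-32768) = 2^31,
//which does not fit into int32 and saturates to 0x7fffffff; every other input pair fits without saturation.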
13838 
13839 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
13840 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
13841 {
13842     int32x2_t b;
13843     b = vdup_n_s32(val2);
13844     return vqdmull_s32(vec1,b); //slow serial function!!!!
13845 }
13846 
13847 //************* Vector saturating doubling long multiply by scalar ***********************************************
13848 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
13849 _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
13850 {
13851     int16_t c;
13852     int16x4_t scalar;
13853     c = vget_lane_s16(val2, val3);
13854     scalar = vdup_n_s16(c);
13855     return vqdmull_s16(vec1, scalar);
13856 }
13857 
13858 
13859 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULL.S32 q0,d0,d0[0]
13860 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
13861 {
13862     int32_t c;
13863     int32x2_t scalar;
13864     c = vget_lane_s32(val2, val3);
13865     scalar = vdup_n_s32(c);
13866     return vqdmull_s32(vec1,scalar); //slow serial function!!!!
13867 }
13868 
13869 // *****Vector saturating doubling multiply high with scalar *****
13870 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2); //  VQDMULH.S16 d0,d0,d0[0]
13871 _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2)
13872 {
13873     int16x4_t res64;
13874     return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
13875 }
13876 
13877 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2); //  VQDMULH.S32 d0,d0,d0[0]
13878 _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2)
13879 {
13880     int32x2_t res64;
13881     return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
13882 }
13883 
13884 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); //  VQDMULH.S16 q0,q0,d0[0]
13885 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) //  VQDMULH.S16 q0,q0,d0[0]
13886 {
13887     //the solution may not be optimal
13888     int16x8_t scalar;
13889     scalar = vdupq_n_s16(val2);
13890     return vqdmulhq_s16(vec1, scalar);
13891 }
13892 
13893 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); //  VQDMULH.S32 q0,q0,d0[0]
13894 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13895 {
13896     int32x4_t scalar;
13897     scalar = vdupq_n_s32(val2);
13898     return vqdmulhq_s32(vec1, scalar);
13899 }
13900 
13901 //***** Vector saturating doubling multiply high by scalar ****************
13902 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 d0,d0,d0[0]
13903 _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 d0,d0,d0[0]
13904 {
13905     //the solution may not be optimal
13906     int16_t vlane;
13907     int16x4_t scalar;
13908     vlane = vget_lane_s16(val2, val3);
13909     scalar = vdup_n_s16(vlane);
13910     return vqdmulh_s16(vec1, scalar);
13911 }
13912 
13913 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 d0,d0,d0[0]
13914 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13915 {
13916     int32_t vlane;
13917     int32x2_t scalar;
13918     vlane = vget_lane_s32(val2, val3);
13919     scalar = vdup_n_s32(vlane);
13920     return vqdmulh_s32(vec1, scalar);
13921 }
13922 
13923 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 q0,q0,d0[0]
13924 _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 q0,q0,d0[0]
13925 {
13926     //the solution may not be optimal
13927     int16_t vlane;
13928     int16x8_t scalar;
13929     vlane = vget_lane_s16(val2, val3);
13930     scalar = vdupq_n_s16(vlane );
13931     return vqdmulhq_s16(vec1, scalar);
13932 }
13933 
13934 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 q0,q0,d0[0]
13935 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13936 {
13937     //the solution may not be optimal
13938     int32_t vlane;
13939     int32x4_t scalar;
13940     vlane = vgetq_lane_s32(_pM128i(val2), val3);
13941     scalar = vdupq_n_s32(vlane );
13942     return vqdmulhq_s32(vec1, scalar);
13943 }
13944 
13945 //******** Vector saturating rounding doubling multiply high with scalar ***
13946 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
13947 _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
13948 {
13949     //the solution may not be optimal
13950     int16x4_t scalar;
13951     scalar = vdup_n_s16(val2);
13952     return vqrdmulh_s16(vec1, scalar);
13953 }
13954 
13955 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
13956 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13957 {
13958     int32x2_t scalar;
13959     scalar = vdup_n_s32(val2);
13960     return vqrdmulh_s32(vec1, scalar);
13961 }
13962 
13963 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
13964 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
13965 {
13966     //the solution may not be optimal
13967     int16x8_t scalar;
13968     scalar = vdupq_n_s16(val2);
13969     return vqrdmulhq_s16(vec1, scalar);
13970 }
13971 
13972 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
13973 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13974 {
13975     int32x4_t scalar;
13976     scalar = vdupq_n_s32(val2);
13977     return vqrdmulhq_s32(vec1, scalar);
13978 }
13979 
13980 //********* Vector rounding saturating doubling multiply high by scalar  ****
13981 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
13982 _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
13983 {
13984     //the solution may not be optimal
13985     int16_t vlane;
13986     int16x4_t scalar;
13987     vlane = vget_lane_s16(val2, val3);
13988     scalar = vdup_n_s16(vlane);
13989     return vqrdmulh_s16(vec1, scalar);
13990 }
13991 
13992 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
13993 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13994 {
13995     int32_t vlane;
13996     int32x2_t scalar;
13997     vlane = vget_lane_s32(val2, val3);
13998     scalar = vdup_n_s32(vlane);
13999     return vqrdmulh_s32(vec1, scalar);
14000 }
14001 
14002 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
14003 _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
14004 {
14005     //the solution may not be optimal
14006     int16_t vlane;
14007     int16x8_t scalar;
14008     vlane = vget_lane_s16(val2, val3);
14009     scalar = vdupq_n_s16(vlane);
14010     return vqrdmulhq_s16(vec1, scalar);
14011 }
14012 
14013 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
14014 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14015 {
14016     //the solution may not be optimal
14017     int32_t vlane;
14018     int32x4_t scalar;
14019     vlane = vgetq_lane_s32(_pM128i(val2), val3);
14020     scalar = vdupq_n_s32(vlane );
14021     return vqrdmulhq_s32(vec1, scalar);
14022 }
14023 
14024 //**************Vector multiply accumulate with scalar *******************
14025 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
14026 _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
14027 {
14028     int16x4_t scalar;
14029     scalar = vdup_n_s16(c);
14030     return vmla_s16(a, b, scalar);
14031 }
14032 
14033 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
14034 _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
14035 {
14036     int32x2_t scalar;
14037     scalar = vdup_n_s32(c);
14038     return vmla_s32(a, b, scalar);
14039 }
14040 
14041 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a,  uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
14042 #define vmla_n_u16 vmla_n_s16
14043 
14044 
14045 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a,  uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
14046 #define vmla_n_u32 vmla_n_s32
14047 
14048 
14049 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
14050 _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
14051 {
14052     float32x2_t scalar;
14053     scalar = vdup_n_f32(c);
14054     return vmla_f32(a, b, scalar);
14055 }
14056 
14057 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
14058 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
14059 {
14060     int16x8_t scalar;
14061     scalar = vdupq_n_s16(c);
14062     return vmlaq_s16(a,b,scalar);
14063 }
14064 
14065 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
14066 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
14067 {
14068     int32x4_t scalar;
14069     scalar = vdupq_n_s32(c);
14070     return vmlaq_s32(a,b,scalar);
14071 }
14072 
14073 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
14074 #define vmlaq_n_u16 vmlaq_n_s16
14075 
14076 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
14077 #define vmlaq_n_u32 vmlaq_n_s32
14078 
14079 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
14080 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
14081 {
14082     float32x4_t scalar;
14083     scalar = vdupq_n_f32(c);
14084     return vmlaq_f32(a,b,scalar);
14085 }
14086 
14087 //************Vector widening multiply accumulate with scalar****************************
14088 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
14089 _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
14090 {
14091     int16x4_t vc;
14092     vc = vdup_n_s16(c);
14093     return vmlal_s16(a, b, vc);
14094 }
14095 
14096 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
14097 _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
14098 {
14099     int32x2_t vc;
14100     vc = vdup_n_s32(c);
14101     return vmlal_s32(a, b, vc);
14102 }
14103 
14104 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
14105 _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
14106 {
14107     uint16x4_t vc;
14108     vc = vdup_n_u16(c);
14109     return vmlal_u16(a, b, vc);
14110 }
14111 
14112 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
14113 _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
14114 {
14115     uint32x2_t vc;
14116     vc = vdup_n_u32(c);
14117     return vmlal_u32(a, b, vc);
14118 }
14119 
14120 //************ Vector widening saturating doubling multiply accumulate with scalar **************
14121 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
14122 _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14123 {
14124     //not an optimal SIMD solution; a serial one may be faster
14125     int16x4_t vc;
14126     vc = vdup_n_s16(c);
14127     return vqdmlal_s16(a, b, vc);
14128 }
14129 
14130 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
14131 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14132 {
14133     int32x2_t vc;
14134     vc = vdup_n_s32(c);
14135     return vqdmlal_s32(a, b, vc);
14136 }
14137 
14138 //******** Vector multiply subtract with scalar **************
14139 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
14140 _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
14141 {
14142     int16x4_t vc;
14143     vc = vdup_n_s16(c);
14144     return vmls_s16(a, b, vc);
14145 }
14146 
14147 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
14148 _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
14149 {
14150     int32x2_t vc;
14151     vc = vdup_n_s32(c);
14152     return vmls_s32(a, b, vc);
14153 }
14154 
14155 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
14156 _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
14157 {
14158     uint16x4_t vc;
14159     vc = vdup_n_s16(c);
14160     return vmls_s16(a, b, vc);
14161 }
14162 
14163 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
14164 _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
14165 {
14166     uint32x2_t vc;
14167     vc = vdup_n_u32(c);
14168     return vmls_u32(a, b, vc);
14169 }
14170 
14171 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
14172 _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
14173 {
14174     float32x2_t res;
14175     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
14176     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
14177     return res;
14178 }
14179 
14180 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
14181 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
14182 {
14183     int16x8_t vc;
14184     vc = vdupq_n_s16(c);
14185     return vmlsq_s16(a, b,vc);
14186 }
14187 
14188 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
14189 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
14190 {
14191     int32x4_t vc;
14192     vc = vdupq_n_s32(c);
14193     return vmlsq_s32(a,b,vc);
14194 }
14195 
14196 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
14197 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
14198 {
14199     uint16x8_t vc;
14200     vc = vdupq_n_u16(c);
14201     return vmlsq_u16(a,b,vc);
14202 }
14203 
14204 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
14205 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
14206 {
14207     uint32x4_t vc;
14208     vc = vdupq_n_u32(c);
14209     return vmlsq_u32(a,b,vc);
14210 }
14211 
14212 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
14213 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
14214 {
14215     float32x4_t vc;
14216     vc = vdupq_n_f32(c);
14217     return vmlsq_f32(a,b,vc);
14218 }
14219 
14220 //**** Vector widening multiply subtract with scalar ******
14221 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
14222 _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
14223 {
14224     int16x4_t vc;
14225     vc = vdup_n_s16(c);
14226     return vmlsl_s16(a, b, vc);
14227 }
14228 
14229 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
14230 _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
14231 {
14232     int32x2_t vc;
14233     vc = vdup_n_s32(c);
14234     return vmlsl_s32(a, b, vc);
14235 }
14236 
14237 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
14238 _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
14239 {
14240     uint16x4_t vc;
14241     vc = vdup_n_u16(c);
14242     return vmlsl_u16(a, b, vc);
14243 }
14244 
14245 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
14246 _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
14247 {
14248     uint32x2_t vc;
14249     vc = vdup_n_u32(c);
14250     return vmlsl_u32(a, b, vc);
14251 }
14252 
14253 //***** Vector widening saturating doubling multiply subtract with scalar *********
14254 //**********************************************************************************
14255 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
14256 _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14257 {
14258     int16x4_t vc;
14259     vc = vdup_n_s16(c);
14260     return vqdmlsl_s16(a, b, vc);
14261 }
14262 
14263 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
14264 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14265 {
14266     int32x2_t vc;
14267     vc = vdup_n_s32(c);
14268     return vqdmlsl_s32(a, b, vc);
14269 }
14270 
14271 //*******************  Vector extract ***********************************************
14272 //*************************************************************************************
14273 //VEXT (Vector Extract) extracts elements from the bottom end of the second operand
14274 //vector and the top end of the first, concatenates them, and places the result in the destination vector:
14275 //c elements come from the bottom end of the second operand and the remaining ones (8-c for an 8-element vector) from the top end of the first
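//A worked example (illustrative, not part of the original comments): for 8-bit d-registers,
//vext_s8(a, b, 3) with a = {0,1,2,3,4,5,6,7} and b = {8,9,10,11,12,13,14,15}
//returns {3,4,5,6,7,8,9,10}, i.e. lanes a[3..7] followed by b[0..2].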
14276 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14277 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
14278 {
14279     int8x8_t res;
14280     int i;
14281     for (i = 0; i<8 - c; i++) {
14282         res.m64_i8[i] = a.m64_i8[i + c];
14283     }
14284     for(i = 0; i<c; i++) {
14285         res.m64_i8[8 - c + i] = b.m64_i8[i];
14286     }
14287     return res;
14288 }
14289 
14290 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14291 #define vext_u8 vext_s8
14292 //the signed variant gives the same result (verified by testing)
14293 
14294 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14295 #define vext_p8 vext_u8
14296 
14297 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14299 {
14300     int16x4_t res;
14301     int i;
14302     for (i = 0; i<4 - c; i++) {
14303         res.m64_i16[i] = a.m64_i16[i + c];
14304     }
14305     for(i = 0; i<c; i++) {
14306         res.m64_i16[4 - c + i] = b.m64_i16[i];
14307     }
14308     return res;
14309 }
14310 
14311 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14312 #define vext_u16 vext_s16
14313 
14314 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14315 #define vext_p16 vext_s16
14316 
14317 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14318 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14319 {
14320     int32x2_t res;
14321     if (c==0) {
14322         res.m64_i32[0] = a.m64_i32[0];
14323         res.m64_i32[1] = a.m64_i32[1];
14324     } else {
14325         res.m64_i32[0] = a.m64_i32[1];
14326         res.m64_i32[1] = b.m64_i32[0];
14327     }
14328     return res;
14329 }
14330 
14331 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14332 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14333 {
14334     float32x2_t res;
14335     if (c==0) {
14336         res.m64_f32[0] = a.m64_f32[0];
14337         res.m64_f32[1] = a.m64_f32[1];
14338     } else {
14339         res.m64_f32[0] = a.m64_f32[1];
14340         res.m64_f32[1] = b.m64_f32[0];
14341     }
14342     return res;
14343 }
14344 
14345 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14346 #define vext_u32 vext_s32
14347 
14348 
14349 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14350 #define vext_s64(a,b,c) a
14351 
14352 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14353 #define vext_u64(a,b,c) a
14354 
14355 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14356 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14357 
14358 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14359 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14360 
14361 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14362 #define vextq_p8 vextq_s8
14363 
14364 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14365 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14366 
14367 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14368 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14369 
14370 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14371 #define vextq_p16 vextq_s16
14372 
14373 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14374 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14375 
14376 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14377 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14378 
14379 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14380 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14381 
14382 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14383 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14384 
14385 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14386 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14387 
14388 //************ Reverse vector elements (swap endianness)*****************
14389 //*************************************************************************
14390 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
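//A worked example (illustrative): with 16-bit lanes {0,1,2,3} in a d-register, vrev64_s16 reverses the lanes
//within the whole 64-bit set and gives {3,2,1,0}, while vrev32_s16 reverses within each 32-bit half and gives {1,0,3,2}.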
14391 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
14392 _NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
14393 {
14394     int8x8_t res64;
14395     __m128i res;
14396     res = vrev64q_s8(_pM128i(vec));
14397     return64(res);
14398 }
14399 
14400 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
14401 _NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
14402 {
14403     int16x4_t res64;
14404     __m128i res;
14405     res = vrev64q_s16(_pM128i(vec));
14406     return64(res);
14407 }
14408 
14409 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
14410 _NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
14411 {
14412     int32x2_t res;
14413     res.m64_i32[0] = vec.m64_i32[1];
14414     res.m64_i32[1] = vec.m64_i32[0];
14415     return res;
14416 }
14417 
14418 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
14419 #define vrev64_u8 vrev64_s8
14420 
14421 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
14422 #define vrev64_u16 vrev64_s16
14423 
14424 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
14425 #define vrev64_u32 vrev64_s32
14426 
14427 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
14428 #define vrev64_p8 vrev64_u8
14429 
14430 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
14431 #define vrev64_p16 vrev64_u16
14432 
14433 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
14434 _NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
14435 {
14436     float32x2_t res;
14437     res.m64_f32[0] = vec.m64_f32[1];
14438     res.m64_f32[1] = vec.m64_f32[0];
14439     return res;
14440 }
14441 
14442 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
14443 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
14444 {
14445     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
14446     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
14447 }
14448 
14449 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
14450 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
14451 {
14452     //there is no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
14453     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
14454     return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
14455 }
14456 
14457 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
14458 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
14459 {
14460     return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
14461 }
14462 
14463 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
14464 #define vrev64q_u8 vrev64q_s8
14465 
14466 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
14467 #define vrev64q_u16 vrev64q_s16
14468 
14469 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
14470 #define vrev64q_u32 vrev64q_s32
14471 
14472 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
14473 #define vrev64q_p8 vrev64q_u8
14474 
14475 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
14476 #define vrev64q_p16 vrev64q_u16
14477 
14478 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
14479 #define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
14480 
14481 //********************  32 bit shuffles **********************
14482 //************************************************************
14483 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
14484 _NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
14485 {
14486     int8x8_t res64;
14487     __m128i res;
14488     res = vrev32q_s8(_pM128i(vec));
14489     return64(res);
14490 }
14491 
14492 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
14493 _NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
14494 {
14495     int16x4_t res64;
14496     __m128i res;
14497     res = vrev32q_s16(_pM128i(vec));
14498     return64(res);
14499 }
14500 
14501 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
14502 #define vrev32_u8 vrev32_s8
14503 
14504 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
14505 #define vrev32_u16 vrev32_s16
14506 
14507 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
14508 #define vrev32_p8 vrev32_u8
14509 
14510 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
14511 #define vrev32_p16 vrev32_u16
14512 
14513 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
14514 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
14515 {
14516     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
14517     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
14518 }
14519 
14520 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
14521 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
14522 {
14523     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
14524     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
14525 }
14526 
14527 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
14528 #define vrev32q_u8 vrev32q_s8
14529 
14530 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
14531 #define vrev32q_u16 vrev32q_s16
14532 
14533 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
14534 #define vrev32q_p8 vrev32q_u8
14535 
14536 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
14537 #define vrev32q_p16 vrev32q_u16
14538 
14539 //*************  16 bit shuffles **********************
14540 //******************************************************
14541 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
14542 _NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
14543 {
14544     int8x8_t res64;
14545     __m128i res;
14546     res = vrev16q_s8(_pM128i(vec));
14547     return64(res);
14548 }
14549 
14550 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
14551 #define vrev16_u8 vrev16_s8
14552 
14553 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
14554 #define vrev16_p8 vrev16_u8
14555 
14556 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
14557 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
14558 {
14559     _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
14560     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
14561 }
14562 
14563 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
14564 #define vrev16q_u8 vrev16q_s8
14565 
14566 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
14567 #define vrev16q_p8 vrev16q_u8
14568 
14569 //*********************************************************************
14570 //**************** Other single operand arithmetic *******************
14571 //*********************************************************************
14572 
14573 //*********** Absolute: Vd[i] = |Va[i]| **********************************
14574 //************************************************************************
14575 _NEON2SSESTORAGE int8x8_t   vabs_s8(int8x8_t a); // VABS.S8 d0,d0
14576 _NEON2SSE_INLINE int8x8_t   vabs_s8(int8x8_t a)
14577 {
14578     int8x8_t res64;
14579     __m128i res;
14580     res = _mm_abs_epi8(_pM128i(a));
14581     return64(res);
14582 }
14583 
14584 
14585 _NEON2SSESTORAGE int16x4_t   vabs_s16(int16x4_t a); // VABS.S16 d0,d0
14586 _NEON2SSE_INLINE int16x4_t   vabs_s16(int16x4_t a)
14587 {
14588     int16x4_t res64;
14589     __m128i res;
14590     res = _mm_abs_epi16(_pM128i(a));
14591     return64(res);
14592 }
14593 
14594 _NEON2SSESTORAGE int32x2_t   vabs_s32(int32x2_t a); // VABS.S32 d0,d0
14595 _NEON2SSE_INLINE int32x2_t   vabs_s32(int32x2_t a)
14596 {
14597     int32x2_t res64;
14598     __m128i res;
14599     res = _mm_abs_epi32(_pM128i(a));
14600     return64(res);
14601 }
14602 
14603 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
14604 _NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
14605 {
14606     float32x4_t res;
14607     __m64_128 res64;
14608     _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14609     res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
14610     _M64f(res64, res);
14611     return res64;
14612 }
14613 
14614 _NEON2SSESTORAGE int8x16_t   vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
14615 #define vabsq_s8 _mm_abs_epi8
14616 
14617 _NEON2SSESTORAGE int16x8_t   vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
14618 #define vabsq_s16 _mm_abs_epi16
14619 
14620 _NEON2SSESTORAGE int32x4_t   vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
14621 #define vabsq_s32 _mm_abs_epi32
14622 
14623 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
14624 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
14625 {
14626     _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14627     return _mm_and_ps (a, *(__m128*)c7fffffff);
14628 }
14629 
14630 #ifdef _NEON2SSE_64BIT
14631 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
14632 _NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
14633 {
14634     __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
14635     return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
14636 }
14637 
14638 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
14639 _NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
14640 {
14641     _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
14642     return _mm_and_pd (a, *(__m128d*)mask);
14643 }
14644 #endif
14645 
14646 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
14647 //**********************************************************************
14648 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
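//A worked example (illustrative): vqabs_s8 maps -128 to +127 (saturated), whereas the plain vabs_s8 wraps -128 back to -128;
//all other inputs are returned as the ordinary absolute value.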
14649 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
14650 _NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
14651 {
14652     int8x8_t res64;
14653     __m128i res;
14654     res = vqabsq_s8(_pM128i(a));
14655     return64(res);
14656 }
14657 
14658 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
14659 _NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
14660 {
14661     int16x4_t res64;
14662     __m128i res;
14663     res = vqabsq_s16(_pM128i(a));
14664     return64(res);
14665 }
14666 
14667 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
14668 _NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
14669 {
14670     int32x2_t res64;
14671     __m128i res;
14672     res = vqabsq_s32(_pM128i(a));
14673     return64(res);
14674 }
14675 
14676 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
14677 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
14678 {
14679     __m128i c_128, abs, abs_cmp;
14680     c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128
14681     abs = _mm_abs_epi8 (a);
14682     abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
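    //note: abs(-128) is still 0x80, so abs_cmp is all ones exactly for those bytes and the xor below flips them to 0x7f (saturation);
    //for all other bytes abs_cmp is zero and the xor is a no-op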
14683     return _mm_xor_si128 (abs,  abs_cmp);
14684 }
14685 
14686 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
14687 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
14688 {
14689     __m128i c_32768, abs, abs_cmp;
14690     c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768
14691     abs = _mm_abs_epi16 (a);
14692     abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
14693     return _mm_xor_si128 (abs,  abs_cmp);
14694 }
14695 
14696 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
14697 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
14698 {
14699     __m128i c80000000, abs, abs_cmp;
14700     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14701     abs = _mm_abs_epi32 (a);
14702     abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
14703     return _mm_xor_si128 (abs,  abs_cmp);
14704 }
14705 
14706 //*************** Negate: Vd[i] = - Va[i] *************************************
14707 //*****************************************************************************
14708 //several Negate implementations are possible for SIMD,
14709 //e.g. the _mm_sign_epi8/16/32 (a, vector of negative values) functions, but the following one gives good performance:
14710 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
14711 _NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14712 {
14713     int8x8_t res64;
14714     __m128i res;
14715     res = vnegq_s8(_pM128i(a));
14716     return64(res);
14717 }
14718 
14719 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
14720 _NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14721 {
14722     int16x4_t res64;
14723     __m128i res;
14724     res = vnegq_s16(_pM128i(a));
14725     return64(res);
14726 }
14727 
14728 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
14729 _NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14730 {
14731     int32x2_t res64;
14732     __m128i res;
14733     res = vnegq_s32(_pM128i(a));
14734     return64(res);
14735 }
14736 
14737 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
14738 _NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
14739 {
14740     float32x4_t res;
14741     __m64_128 res64;
14742     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14743     res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14744     _M64f(res64, res);
14745     return res64;
14746 }
14747 
14748 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
14749 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
14750 {
14751     __m128i zero;
14752     zero = _mm_setzero_si128 ();
14753     return _mm_sub_epi8 (zero, a);
14754 } //or _mm_sign_epi8 (a, negative numbers vector)
14755 
14756 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
14757 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
14758 {
14759     __m128i zero;
14760     zero = _mm_setzero_si128 ();
14761     return _mm_sub_epi16 (zero, a);
14762 } //or _mm_sign_epi16 (a, negative numbers vector)
14763 
14764 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
14765 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
14766 {
14767     __m128i zero;
14768     zero = _mm_setzero_si128 ();
14769     return _mm_sub_epi32 (zero, a);
14770 } //or _mm_sign_epi32 (a, negative numbers vector)
14771 
14772 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
14773 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
14774 {
14775     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14776     return _mm_xor_ps (a, *(__m128*) c80000000);
14777 }
14778 
14779 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14780 //***************************************************************************************
14781 //For signed-integer data types, the negation of the most negative value cannot be represented without saturation; with saturation it becomes the maximum positive value
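//A worked example (illustrative): vqneg_s16 maps -32768 to +32767, whereas the non-saturating vneg_s16 would wrap it back to -32768.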
14782 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
14783 _NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14784 {
14785     int8x8_t res64;
14786     __m128i res;
14787     res = vqnegq_s8(_pM128i(a));
14788     return64(res);
14789 }
14790 
14791 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
14792 _NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14793 {
14794     int16x4_t res64;
14795     __m128i res;
14796     res = vqnegq_s16(_pM128i(a));
14797     return64(res);
14798 }
14799 
14800 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
14801 _NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14802 {
14803     int32x2_t res64;
14804     __m128i res;
14805     res = vqnegq_s32(_pM128i(a));
14806     return64(res);
14807 }
14808 
14809 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
14810 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
14811 {
14812     __m128i zero;
14813     zero = _mm_setzero_si128 ();
14814     return _mm_subs_epi8 (zero, a); //saturating subtraction
14815 }
14816 
14817 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
14818 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
14819 {
14820     __m128i zero;
14821     zero = _mm_setzero_si128 ();
14822     return _mm_subs_epi16 (zero, a); //saturating subtraction
14823 }
14824 
14825 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
14826 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
14827 {
14828     //the solution may not be optimal compared with a serial one
14829     __m128i c80000000, zero, sub, cmp;
14830     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14831     zero = _mm_setzero_si128 ();
14832     sub =  _mm_sub_epi32 (zero, a); //subtraction
14833     cmp = _mm_cmpeq_epi32 (a, c80000000);
14834     return _mm_xor_si128 (sub,  cmp);
14835 }
14836 
14837 //****************** Count leading zeros ********************************
14838 //**************************************************************************
14839 //no corresponding vector intrinsics in IA32, so it needs to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
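//A scalar reference of the 8-bit algorithm used below (illustrative, not part of the library):
//clz8(x) = (x >> 4) ? lut4[x >> 4] : 4 + lut4[x & 0xf], where lut4[n] holds the number of leading zeros of the 4-bit value n
//(lut4[0] = 4), e.g. clz8(0x1a) = lut4[1] = 3; the SIMD code performs both nibble lookups with _mm_shuffle_epi8
//and adds the low-nibble count only where the high nibble is zero.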
14840 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
14841 _NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14842 {
14843     int8x8_t res64;
14844     __m128i res;
14845     res = vclzq_s8(_pM128i(a));
14846     return64(res);
14847 }
14848 
14849 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
14850 _NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
14851 {
14852     int16x4_t res64;
14853     __m128i res;
14854     res = vclzq_s16(_pM128i(a));
14855     return64(res);
14856 }
14857 
14858 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
14859 _NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
14860 {
14861     int32x2_t res64;
14862     __m128i res;
14863     res = vclzq_s32(_pM128i(a));
14864     return64(res);
14865 }
14866 
14867 
14868 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
14869 #define vclz_u8 vclz_s8
14870 
14871 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
14872 #define vclz_u16 vclz_s16
14873 
14874 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
14875 #define vclz_u32 vclz_s32
14876 
14877 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
14878 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
14879 {
14880     _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
14881                                                             /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
14882                                                             /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
14883                                                             /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0                          };
14884     __m128i maskLOW, c4, lowclz, mask, hiclz;
14885     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits; the low-nibble lookup below needs no masking - _mm_shuffle_epi8 yields zero automatically for bytes with the MSB set
14886     c4 = _mm_set1_epi8(4);
14887     lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
14888     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
14889     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
14890     hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
14891     mask = _mm_cmpeq_epi8(hiclz, c4); //marks bytes whose high nibble is zero, i.e. where lowclz must be added
14892     lowclz = _mm_and_si128(lowclz,mask);
14893     return _mm_add_epi8(lowclz, hiclz);
14894 }
14895 
14896 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
14897 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
14898 {
14899     __m128i c7, res8x16, res8x16_swap;
14900     _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
14901     _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
14902     c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
14903     res8x16 = vclzq_s8(a);
14904     res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
14905     res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
14906     res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
14907     c7 = _mm_cmpgt_epi16(res8x16_swap, c7); //marks elements whose high byte is all zeros (its clz is 8), i.e. where the low byte clz must be added
14908     res8x16 = _mm_and_si128(res8x16, c7); //lowclz
14909     return _mm_add_epi16(res8x16_swap, res8x16);
14910 }
14911 
14912 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
14913 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
14914 {
14915     __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
14916     c55555555 = _mm_set1_epi32(0x55555555);
14917     c33333333 = _mm_set1_epi32(0x33333333);
14918     c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
14919     c3f = _mm_set1_epi32(0x3f);
14920     c32 = _mm_set1_epi32(32);
14921     tmp = _mm_srli_epi32(a, 1);
14922     res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
14923     tmp = _mm_srli_epi32(res, 2);
14924     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
14925     tmp = _mm_srli_epi32(res, 4);
14926     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
14927     tmp = _mm_srli_epi32(res, 8);
14928     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
14929     tmp = _mm_srli_epi32(res, 16);
14930     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
14931 
14932     tmp = _mm_srli_epi32(res, 1);
14933     tmp = _mm_and_si128(tmp, c55555555);
14934     res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
14935 
14936     tmp = _mm_srli_epi32(res, 2);
14937     tmp = _mm_and_si128(tmp, c33333333);
14938     tmp1 = _mm_and_si128(res, c33333333);
14939     res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
14940 
14941     tmp = _mm_srli_epi32(res, 4);
14942     tmp = _mm_add_epi32(tmp, res);
14943     res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
14944 
14945     tmp = _mm_srli_epi32(res, 8);
14946     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
14947 
14948     tmp = _mm_srli_epi32(res, 16);
14949     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
14950 
14951     res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
14952 
14953     return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
14954 }
14955 
14956 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
14957 #define vclzq_u8 vclzq_s8
14958 
14959 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
14960 #define vclzq_u16 vclzq_s16
14961 
14962 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
14963 #define vclzq_u32 vclzq_s32
14964 
14965 //************** Count leading sign bits **************************
14966 //********************************************************************
14967 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
14968 // the topmost bit that are the same as the topmost bit, in each element of a vector
14969 //No corresponding vector intrinsics in IA32, so it needs to be implemented.
14970 //While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
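//A worked example (illustrative): for the int8 value 0xf2 (11110010b) the top bit is 1 and is followed by three more ones,
//so vcls gives 3; for 0x18 (00011000b) the top bit 0 is followed by two more zeros, so vcls gives 2.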
14971 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
14972 _NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
14973 {
14974     int8x8_t res64;
14975     __m128i res;
14976     res = vclsq_s8(_pM128i(a));
14977     return64(res);
14978 }
14979 
14980 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
14981 _NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
14982 {
14983     int16x4_t res64;
14984     __m128i res;
14985     res = vclsq_s16(_pM128i(a));
14986     return64(res);
14987 }
14988 
14989 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
14990 _NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
14991 {
14992     int32x2_t res64;
14993     __m128i res;
14994     res = vclsq_s32(_pM128i(a));
14995     return64(res);
14996 }
14997 
14998 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
14999 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
15000 {
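    //idea: bitwise-invert the negative elements so that leading sign bits become leading zeros,
    //then count leading zeros of the combined vector and subtract 1, since the sign bit itself is not counted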
15001     __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
15002     cff = _mm_cmpeq_epi8 (a,a); //0xff
15003     c80 = _mm_set1_epi8((int8_t)0x80);
15004     c1 = _mm_set1_epi8(1);
15005     a_mask = _mm_and_si128(a, c80);
15006     a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
15007     a_neg = _mm_xor_si128(a, cff);
15008     a_neg = _mm_and_si128(a_mask, a_neg);
15009     a_pos = _mm_andnot_si128(a_mask, a);
15010     a_comb = _mm_or_si128(a_pos, a_neg);
15011     a_comb = vclzq_s8(a_comb);
15012     return _mm_sub_epi8(a_comb, c1);
15013 }
15014 
15015 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
15016 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
15017 {
15018     __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
15019     cffff = _mm_cmpeq_epi16(a,a);
15020     c8000 =  _mm_slli_epi16(cffff, 15); //0x8000
15021     c1 = _mm_srli_epi16(cffff,15); //0x1
15022     a_mask = _mm_and_si128(a, c8000);
15023     a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
15024     a_neg = _mm_xor_si128(a, cffff);
15025     a_neg = _mm_and_si128(a_mask, a_neg);
15026     a_pos = _mm_andnot_si128(a_mask, a);
15027     a_comb = _mm_or_si128(a_pos, a_neg);
15028     a_comb = vclzq_s16(a_comb);
15029     return _mm_sub_epi16(a_comb, c1);
15030 }
15031 
15032 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
15033 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
15034 {
15035     __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
15036     cffffffff = _mm_cmpeq_epi32(a,a);
15037     c80000000 =  _mm_slli_epi32(cffffffff, 31); //0x80000000
15038     c1 = _mm_srli_epi32(cffffffff,31); //0x1
15039     a_mask = _mm_and_si128(a, c80000000);
15040     a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
15041     a_neg = _mm_xor_si128(a, cffffffff);
15042     a_neg = _mm_and_si128(a_mask, a_neg);
15043     a_pos = _mm_andnot_si128(a_mask, a);
15044     a_comb = _mm_or_si128(a_pos, a_neg);
15045     a_comb = vclzq_s32(a_comb);
15046     return _mm_sub_epi32(a_comb, c1);
15047 }
15048 
15049 //************************* Count number of set bits   ********************************
15050 //*************************************************************************************
15051 //No direct SIMD solution. One option is to extract the elements, widen each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) on each element;
15052 //another option is the following in-register algorithm:
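//The in-register variant below splits every byte into its low and high nibble, looks each nibble up in a
//16-entry popcount table via PSHUFB and adds the two results. A scalar sketch of the same idea
//(illustrative only; popcnt4/popcnt8_ref are hypothetical names):
//    static const uint8_t popcnt4[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//    static uint8_t popcnt8_ref(uint8_t x)
//    {
//        return (uint8_t)(popcnt4[x & 0x0f] + popcnt4[x >> 4]); //e.g. popcnt8_ref(0xF1) == 1 + 4 == 5
//    }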
15053 
15054 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
15055 _NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
15056 {
15057     uint8x8_t res64;
15058     __m128i res;
15059     res = vcntq_u8(_pM128i(a));
15060     return64(res);
15061 }
15062 
15063 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
15064 #define vcnt_s8 vcnt_u8
15065 
15066 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
15067 #define vcnt_p8 vcnt_u8
15068 
15069 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
15070 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
15071 {
15072     _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
15073                                                                  /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
15074                                                                  /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
15075                                                                  /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
15076     __m128i maskLOW, mask, lowpopcnt, hipopcnt;
15077     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
15078     mask = _mm_and_si128(a, maskLOW);
15079     lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
15080     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15081     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15082     hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
15083     return _mm_add_epi8(lowpopcnt, hipopcnt);
15084 }
15085 
15086 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
15087 #define vcntq_s8 vcntq_u8
15088 
15089 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
15090 #define vcntq_p8 vcntq_u8
15091 
15092 //**************************************************************************************
15093 //*********************** Logical operations ****************************************
15094 //**************************************************************************************
15095 //************************** Bitwise not ***********************************
15096 //Several bitwise-not implementations are possible for SIMD, e.g. XOR with all ones; the following one (ANDNOT with an all-ones constant) gives good performance
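//A scalar view of the trick used below (illustrative only): the all-ones constant is produced by comparing a
//register with itself, and ANDNOT then computes (~a) & ones == ~a, i.e. the same result as a ^ 0xff...ff.
//    static uint8_t mvn8_ref(uint8_t a) { return (uint8_t)((~a) & 0xff); } //hypothetical scalar reference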
15097 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
15098 _NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
15099 {
15100     int8x8_t res64;
15101     __m128i res;
15102     res = vmvnq_s8(_pM128i(a));
15103     return64(res);
15104 }
15105 
15106 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
15107 _NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
15108 {
15109     int16x4_t res64;
15110     __m128i res;
15111     res = vmvnq_s16(_pM128i(a));
15112     return64(res);
15113 }
15114 
15115 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
15116 _NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
15117 {
15118     int32x2_t res64;
15119     __m128i res;
15120     res = vmvnq_s32(_pM128i(a));
15121     return64(res);
15122 }
15123 
15124 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
15125 #define vmvn_u8 vmvn_s8
15126 
15127 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
15128 #define vmvn_u16 vmvn_s16
15129 
15130 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
15131 #define vmvn_u32 vmvn_s32
15132 
15133 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
15134 #define vmvn_p8 vmvn_u8
15135 
15136 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
15137 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
15138 {
15139     __m128i c1;
15140     c1 = _mm_cmpeq_epi8 (a,a); //0xff
15141     return _mm_andnot_si128 (a, c1);
15142 }
15143 
15144 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
15145 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
15146 {
15147     __m128i c1;
15148     c1 = _mm_cmpeq_epi16 (a,a); //0xffff
15149     return _mm_andnot_si128 (a, c1);
15150 }
15151 
15152 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
15153 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
15154 {
15155     __m128i c1;
15156     c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
15157     return _mm_andnot_si128 (a, c1);
15158 }
15159 
15160 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
15161 #define vmvnq_u8 vmvnq_s8
15162 
15163 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
15164 #define vmvnq_u16 vmvnq_s16
15165 
15166 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
15167 #define vmvnq_u32 vmvnq_s32
15168 
15169 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
15170 #define vmvnq_p8 vmvnq_u8
15171 
15172 //****************** Bitwise and ***********************
15173 //******************************************************
15174 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
15175 _NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
15176 {
15177     int8x8_t res64;
15178     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15179 }
15180 
15181 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
15182 _NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
15183 {
15184     int16x4_t res64;
15185     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15186 }
15187 
15188 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
15189 _NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
15190 {
15191     int32x2_t res64;
15192     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15193 }
15194 
15195 
15196 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a,  int64x1_t b); // VAND d0,d0,d0
15197 _NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a,  int64x1_t b)
15198 {
15199     int64x1_t res;
15200     res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
15201     return res;
15202 }
15203 
15204 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
15205 #define vand_u8 vand_s8
15206 
15207 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
15208 #define vand_u16 vand_s16
15209 
15210 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
15211 #define vand_u32 vand_s32
15212 
15213 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a,  uint64x1_t b); // VAND d0,d0,d0
15214 #define vand_u64 vand_s64
15215 
15216 
15217 _NEON2SSESTORAGE int8x16_t   vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
15218 #define vandq_s8 _mm_and_si128
15219 
15220 _NEON2SSESTORAGE int16x8_t   vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
15221 #define vandq_s16 _mm_and_si128
15222 
15223 _NEON2SSESTORAGE int32x4_t   vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
15224 #define vandq_s32 _mm_and_si128
15225 
15226 _NEON2SSESTORAGE int64x2_t   vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
15227 #define vandq_s64 _mm_and_si128
15228 
15229 _NEON2SSESTORAGE uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
15230 #define vandq_u8 _mm_and_si128
15231 
15232 _NEON2SSESTORAGE uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
15233 #define vandq_u16 _mm_and_si128
15234 
15235 _NEON2SSESTORAGE uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
15236 #define vandq_u32 _mm_and_si128
15237 
15238 _NEON2SSESTORAGE uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
15239 #define vandq_u64 _mm_and_si128
15240 
15241 //******************** Bitwise or *********************************
15242 //******************************************************************
15243 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
15244 _NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
15245 {
15246     int8x8_t res64;
15247     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15248 }
15249 
15250 
15251 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
15252 _NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
15253 {
15254     int16x4_t res64;
15255     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15256 }
15257 
15258 
15259 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
15260 _NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
15261 {
15262     int32x2_t res64;
15263     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15264 }
15265 
15266 
15267 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b); // VORR d0,d0,d0
15268 _NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b)
15269 {
15270     int64x1_t res;
15271     res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
15272     return res;
15273 }
15274 
15275 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
15276 #define vorr_u8 vorr_s8
15277 
15278 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
15279 #define vorr_u16 vorr_s16
15280 
15281 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
15282 #define vorr_u32 vorr_s32
15283 
15284 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a,  uint64x1_t b); // VORR d0,d0,d0
15285 #define vorr_u64 vorr_s64
15286 
15287 _NEON2SSESTORAGE int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
15288 #define vorrq_s8 _mm_or_si128
15289 
15290 _NEON2SSESTORAGE int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
15291 #define vorrq_s16 _mm_or_si128
15292 
15293 _NEON2SSESTORAGE int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
15294 #define vorrq_s32 _mm_or_si128
15295 
15296 _NEON2SSESTORAGE int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
15297 #define vorrq_s64 _mm_or_si128
15298 
15299 _NEON2SSESTORAGE uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
15300 #define vorrq_u8 _mm_or_si128
15301 
15302 _NEON2SSESTORAGE uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
15303 #define vorrq_u16 _mm_or_si128
15304 
15305 _NEON2SSESTORAGE uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
15306 #define vorrq_u32 _mm_or_si128
15307 
15308 _NEON2SSESTORAGE uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
15309 #define vorrq_u64 _mm_or_si128
15310 
15311 //************* Bitwise exclusive or (EOR or XOR) ******************
15312 //*******************************************************************
15313 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
15314 _NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
15315 {
15316     int8x8_t res64;
15317     return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
15318 }
15319 
15320 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
15321 #define veor_s16 veor_s8
15322 
15323 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
15324 #define veor_s32 veor_s8
15325 
15326 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a,  int64x1_t b); // VEOR d0,d0,d0
15327 _NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a,  int64x1_t b)
15328 {
15329     int64x1_t res;
15330     res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
15331     return res;
15332 }
15333 
15334 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
15335 #define veor_u8 veor_s8
15336 
15337 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
15338 #define veor_u16 veor_s16
15339 
15340 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
15341 #define veor_u32 veor_s32
15342 
15343 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a,  uint64x1_t b); // VEOR d0,d0,d0
15344 #define veor_u64 veor_s64
15345 
15346 _NEON2SSESTORAGE int8x16_t   veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
15347 #define veorq_s8 _mm_xor_si128
15348 
15349 _NEON2SSESTORAGE int16x8_t   veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
15350 #define veorq_s16 _mm_xor_si128
15351 
15352 _NEON2SSESTORAGE int32x4_t   veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
15353 #define veorq_s32 _mm_xor_si128
15354 
15355 _NEON2SSESTORAGE int64x2_t   veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
15356 #define veorq_s64 _mm_xor_si128
15357 
15358 _NEON2SSESTORAGE uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
15359 #define veorq_u8 _mm_xor_si128
15360 
15361 _NEON2SSESTORAGE uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
15362 #define veorq_u16 _mm_xor_si128
15363 
15364 _NEON2SSESTORAGE uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
15365 #define veorq_u32 _mm_xor_si128
15366 
15367 _NEON2SSESTORAGE uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
15368 #define veorq_u64 _mm_xor_si128
15369 
15370 //********************** Bit Clear **********************************
15371 //*******************************************************************
15372 //Logical AND complement (AND negation or AND NOT)
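//In other words vbic(a, b) computes a & (~b). The SSE ANDNOT intrinsic negates its FIRST argument,
//i.e. _mm_andnot_si128(x, y) == (~x) & y, hence the swapped arguments in the implementations below.
//A minimal usage sketch (illustrative values only):
//    uint8x16_t a     = vdupq_n_u8(0xff);
//    uint8x16_t b     = vdupq_n_u8(0x0f);
//    uint8x16_t clear = vbicq_u8(a, b); //every byte becomes 0xf0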
15373 _NEON2SSESTORAGE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
15374 _NEON2SSE_INLINE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b)
15375 {
15376     int8x8_t res64;
15377     return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
15378 }
15379 
15380 _NEON2SSESTORAGE int16x4_t   vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
15381 #define vbic_s16 vbic_s8
15382 
15383 _NEON2SSESTORAGE int32x2_t   vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
15384 #define vbic_s32 vbic_s8
15385 
15386 _NEON2SSESTORAGE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
15387 _NEON2SSE_INLINE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b)
15388 {
15389     int64x1_t res;
15390     res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
15391     return res;
15392 }
15393 
15394 _NEON2SSESTORAGE uint8x8_t   vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
15395 #define vbic_u8 vbic_s8
15396 
15397 _NEON2SSESTORAGE uint16x4_t   vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
15398 #define vbic_u16 vbic_s16
15399 
15400 _NEON2SSESTORAGE uint32x2_t   vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
15401 #define vbic_u32 vbic_s32
15402 
15403 _NEON2SSESTORAGE uint64x1_t   vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
15404 #define vbic_u64 vbic_s64
15405 
15406 _NEON2SSESTORAGE int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
15407 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15408 
15409 _NEON2SSESTORAGE int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
15410 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15411 
15412 _NEON2SSESTORAGE int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
15413 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15414 
15415 _NEON2SSESTORAGE int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
15416 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15417 
15418 _NEON2SSESTORAGE uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
15419 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15420 
15421 _NEON2SSESTORAGE uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
15422 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15423 
15424 _NEON2SSESTORAGE uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
15425 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15426 
15427 _NEON2SSESTORAGE uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
15428 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15429 
15430 //**************** Bitwise OR complement ********************************
15431 //***********************************************************************
15432 //no exact IA32 match, so it is implemented as follows
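//i.e. vorn(a, b) computes a | (~b): b is inverted first (VMVN / ANDNOT with all ones) and then ORed with a.
//A minimal usage sketch (illustrative values only):
//    uint8x16_t r = vornq_u8(vdupq_n_u8(0x0f), vdupq_n_u8(0x33)); //each byte of r == 0x0f | ~0x33 == 0xcf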
15433 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b); // VORN d0,d0,d0
15434 _NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b)
15435 {
15436     int8x8_t res64;
15437     return64(vornq_s8(_pM128i(a), _pM128i(b)));
15438 }
15439 
15440 
15441 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b); // VORN d0,d0,d0
15442 _NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b)
15443 {
15444     int16x4_t res64;
15445     return64(vornq_s16(_pM128i(a), _pM128i(b)));
15446 }
15447 
15448 
15449 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b); // VORN d0,d0,d0
15450 _NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b)
15451 {
15452     int32x2_t res64;
15453     return64(vornq_s32(_pM128i(a), _pM128i(b)));
15454 }
15455 
15456 
15457 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
15458 _NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
15459 {
15460     int64x1_t res;
15461     res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
15462     return res;
15463 }
15464 
15465 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a,  uint8x8_t b); // VORN d0,d0,d0
15466 #define vorn_u8 vorn_s8
15467 
15468 
15469 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a,  uint16x4_t b); // VORN d0,d0,d0
15470 #define vorn_u16 vorn_s16
15471 
15472 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a,  uint32x2_t b); // VORN d0,d0,d0
15473 #define vorn_u32 vorn_s32
15474 
15475 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
15476 #define vorn_u64 vorn_s64
15477 
15478 
15479 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
15480 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
15481 {
15482     __m128i b1;
15483     b1 = vmvnq_s8( b); //bitwise not for b
15484     return _mm_or_si128 (a, b1);
15485 }
15486 
15487 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
15488 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
15489 {
15490     __m128i b1;
15491     b1 = vmvnq_s16( b); //bitwise not for b
15492     return _mm_or_si128 (a, b1);
15493 }
15494 
15495 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
15496 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
15497 {
15498     __m128i b1;
15499     b1 = vmvnq_s32( b); //bitwise not for b
15500     return _mm_or_si128 (a, b1);
15501 }
15502 
15503 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
15504 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
15505 {
15506     __m128i c1, b1;
15507     c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
15508     b1 = _mm_andnot_si128 (b, c1);
15509     return _mm_or_si128 (a, b1);
15510 }
15511 
15512 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
15513 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
15514 {
15515     __m128i b1;
15516     b1 = vmvnq_u8( b); //bitwise not for b
15517     return _mm_or_si128 (a, b1);
15518 }
15519 
15520 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
15521 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
15522 {
15523     __m128i b1;
15524     b1 = vmvnq_s16( b); //bitwise not for b
15525     return _mm_or_si128 (a, b1);
15526 }
15527 
15528 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
15529 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
15530 {
15531     __m128i b1;
15532     b1 = vmvnq_u32( b); //bitwise not for b
15533     return _mm_or_si128 (a, b1);
15534 }
15535 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
15536 #define vornq_u64 vornq_s64
15537 
15538 //********************* Bitwise Select *****************************
15539 //******************************************************************
15540 //Note: this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
15541 
15542 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
15543 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
15544 
15545 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
15546 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
15547 
15548 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
15549 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
15550 
15551 //Only VBSL is implemented for SIMD here
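//The intrinsics below all compute result = (a & b) | (~a & c) using AND/ANDNOT/OR.
//A minimal usage sketch (illustrative values only), selecting lanes by a comparison mask:
//    int32x4_t  x    = vdupq_n_s32(5);
//    int32x4_t  y    = vdupq_n_s32(9);
//    uint32x4_t mask = vcgtq_s32(x, y);       //all lanes zero, since 5 > 9 is false
//    int32x4_t  m    = vbslq_s32(mask, x, y); //picks y where the mask is 0, so every lane == 9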
15552 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
15553 _NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
15554 {
15555     int8x8_t res64;
15556     __m128i res;
15557     res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
15558     return64(res);
15559 }
15560 
15561 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
15562 #define vbsl_s16 vbsl_s8
15563 
15564 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
15565 #define vbsl_s32 vbsl_s8
15566 
15567 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
15568 _NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
15569 {
15570     int64x1_t res;
15571     res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
15572     return res;
15573 }
15574 
15575 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
15576 #define vbsl_u8 vbsl_s8
15577 
15578 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
15579 #define vbsl_u16 vbsl_s8
15580 
15581 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
15582 #define vbsl_u32 vbsl_s8
15583 
15584 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
15585 #define vbsl_u64 vbsl_s64
15586 
15587 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
15588 _NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
15589 {
15590     __m128 sel1, sel2;
15591     __m64_128 res64;
15592     sel1 = _mm_and_ps   (_pM128(a), _pM128(b));
15593     sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
15594     sel1 = _mm_or_ps (sel1, sel2);
15595     _M64f(res64, sel1);
15596     return res64;
15597 }
15598 
15599 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
15600 #define  vbsl_p8 vbsl_s8
15601 
15602 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
15603 #define  vbsl_p16 vbsl_s8
15604 
15605 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
15606 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
15607 {
15608     __m128i sel1, sel2;
15609     sel1 = _mm_and_si128   (a, b);
15610     sel2 = _mm_andnot_si128 (a, c);
15611     return _mm_or_si128 (sel1, sel2);
15612 }
15613 
15614 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
15615 #define vbslq_s16 vbslq_s8
15616 
15617 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
15618 #define vbslq_s32 vbslq_s8
15619 
15620 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
15621 #define vbslq_s64 vbslq_s8
15622 
15623 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
15624 #define vbslq_u8 vbslq_s8
15625 
15626 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
15627 #define vbslq_u16 vbslq_s8
15628 
15629 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
15630 #define vbslq_u32 vbslq_s8
15631 
15632 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
15633 #define vbslq_u64 vbslq_s8
15634 
15635 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
15636 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
15637 {
15638     __m128 sel1, sel2;
15639     sel1 = _mm_and_ps   (*(__m128*)&a, b);
15640     sel2 = _mm_andnot_ps (*(__m128*)&a, c);
15641     return _mm_or_ps (sel1, sel2);
15642 }
15643 
15644 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
15645 #define vbslq_p8 vbslq_u8
15646 
15647 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
15648 #define vbslq_p16 vbslq_s8
15649 
15650 //************************************************************************************
15651 //**************** Transposition operations ****************************************
15652 //************************************************************************************
15653 //*****************  Vector Transpose ************************************************
15654 //************************************************************************************
15655 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
15656 // making the result look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
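//For example (illustrative lane values), with 32-bit lanes a = (a0, a1, a2, a3) and b = (b0, b1, b2, b3):
//    int32x4x2_t t = vtrnq_s32(a, b); //t.val[0] = (a0, b0, a2, b2), t.val[1] = (a1, b1, a3, b3)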
15657 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
15658 _NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
15659 {
15660     int8x8x2_t val;
15661     __m128i tmp, val0;
15662     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15663     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
15664     vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
15665     return val;
15666 }
15667 
15668 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
15669 _NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
15670 {
15671     int16x4x2_t val;
15672     __m128i tmp, val0;
15673     _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
15674     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15675     val0 =  _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
15676     vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
15677     return val;
15678 }
15679 
15680 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
15681 _NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
15682 {
15683     int32x2x2_t val;
15684     __m128i val0;
15685     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
15686     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
15687     return val;
15688 }
15689 
15690 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
15691 #define vtrn_u8 vtrn_s8
15692 
15693 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
15694 #define vtrn_u16 vtrn_s16
15695 
15696 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
15697 #define vtrn_u32 vtrn_s32
15698 
15699 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
15700 _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
15701 {
15702     float32x2x2_t val;
15703     val.val[0].m64_f32[0] = a.m64_f32[0];
15704     val.val[0].m64_f32[1] = b.m64_f32[0];
15705     val.val[1].m64_f32[0] = a.m64_f32[1];
15706     val.val[1].m64_f32[1] = b.m64_f32[1];
15707     return val; //a0,b0,a1,b1
15708 }
15709 
15710 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
15711 #define  vtrn_p8 vtrn_u8
15712 
15713 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
15714 #define  vtrn_p16 vtrn_s16
15715 
15716 _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
15717 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
15718 {
15719     int8x16x2_t r8x16;
15720     __m128i a_sh, b_sh;
15721     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15722     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15723 
15724     r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
15725     r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
15726     return r8x16;
15727 }
15728 
15729 _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
15730 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
15731 {
15732     int16x8x2_t v16x8;
15733     __m128i a_sh, b_sh;
15734     a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
15735     b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
15736     v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
15737     v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
15738     return v16x8;
15739 }
15740 
15741 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
15742 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
15743 {
15744     //may not be an optimal solution compared with a serial one
15745     int32x4x2_t v32x4;
15746     __m128i a_sh, b_sh;
15747     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15748     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15749 
15750     v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
15751     v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3,  b3
15752     return v32x4;
15753 }
15754 
15755 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
15756 #define vtrnq_u8 vtrnq_s8
15757 
15758 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
15759 #define vtrnq_u16 vtrnq_s16
15760 
15761 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
15762 #define vtrnq_u32 vtrnq_s32
15763 
15764 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
15765 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
15766 {
15767     //may not be an optimal solution compared with a serial one
15768     float32x4x2_t f32x4;
15769     __m128 a_sh, b_sh;
15770     a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
15771     b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
15772 
15773     f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
15774     f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3,  b3
15775     return f32x4;
15776 }
15777 
15778 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
15779 #define vtrnq_p8 vtrnq_s8
15780 
15781 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
15782 #define vtrnq_p16 vtrnq_s16
15783 
15784 //***************** Interleave elements ***************************
15785 //*****************************************************************
15786 //output has (a0,b0,a1,b1, a2,b2,.....)
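//For example (illustrative lane values), with 16-bit lanes a = (a0,...,a7) and b = (b0,...,b7):
//    int16x8x2_t z = vzipq_s16(a, b); //z.val[0] = (a0, b0, a1, b1, a2, b2, a3, b3), z.val[1] = (a4, b4, ..., a7, b7)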
15787 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
15788 _NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
15789 {
15790     int8x8x2_t val;
15791     __m128i val0;
15792     val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
15793     vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15794     return val;
15795 }
15796 
15797 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
15798 _NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
15799 {
15800     int16x4x2_t val;
15801     __m128i val0;
15802     val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
15803     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15804     return val;
15805 }
15806 
15807 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
15808 #define vzip_s32 vtrn_s32
15809 
15810 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
15811 #define vzip_u8 vzip_s8
15812 
15813 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
15814 #define vzip_u16 vzip_s16
15815 
15816 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
15817 #define vzip_u32 vzip_s32
15818 
15819 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
15820 #define vzip_f32 vtrn_f32
15821 
15822 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
15823 #define vzip_p8 vzip_u8
15824 
15825 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
15826 #define vzip_p16 vzip_u16
15827 
15828 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
15829 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
15830 {
15831     int8x16x2_t r8x16;
15832     r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
15833     r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
15834     return r8x16;
15835 }
15836 
15837 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
15838 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
15839 {
15840     int16x8x2_t r16x8;
15841     r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
15842     r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
15843     return r16x8;
15844 }
15845 
15846 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
15847 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
15848 {
15849     int32x4x2_t r32x4;
15850     r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
15851     r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
15852     return r32x4;
15853 }
15854 
15855 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
15856 #define vzipq_u8 vzipq_s8
15857 
15858 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
15859 #define vzipq_u16 vzipq_s16
15860 
15861 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
15862 #define vzipq_u32 vzipq_s32
15863 
15864 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
15865 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
15866 {
15867     float32x4x2_t f32x4;
15868     f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
15869     f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
15870     return f32x4;
15871 }
15872 
15873 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
15874 #define vzipq_p8 vzipq_u8
15875 
15876 _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
15877 #define vzipq_p16 vzipq_u16
15878 
15879 //*********************** De-Interleave elements *************************
15880 //*************************************************************************
15881 //As the result of these functions, the first val contains (a0,a2,a4,...,b0,b2,b4,...) and the second val contains (a1,a3,a5,...,b1,b3,b5,...)
15882 //no such functions exist in IA32 SIMD, so a shuffle is required
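//For example (illustrative lane values), with 16-bit lanes a = (a0,...,a7) and b = (b0,...,b7):
//    int16x8x2_t u = vuzpq_s16(a, b); //u.val[0] = (a0, a2, a4, a6, b0, b2, b4, b6), u.val[1] = (a1, a3, a5, a7, b1, b3, b5, b7)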
15883 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
15884 _NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
15885 {
15886     int8x8x2_t val;
15887     __m128i tmp, val0;
15888     _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13,  2, 6, 10, 14, 3, 7, 11,15};
15889     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15890     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6),  (a1, a3, a5, a7, b1,b3, b5, b7)
15891     vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15892     return val;
15893 }
15894 
15895 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
15896 _NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
15897 {
15898     int16x4x2_t val;
15899     __m128i tmp, val0;
15900     _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1,  8,9,  2,3, 10,11,  4,5, 12,13, 6,7, 14,15};
15901     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15902     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
15903     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15904     return val;
15905 }
15906 
15907 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
15908 _NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
15909 {
15910     int32x2x2_t val;
15911     __m128i val0;
15912     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
15913     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15914     return val;
15915 }
15916 
15917 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
15918 #define vuzp_u8 vuzp_s8
15919 
15920 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
15921 #define vuzp_u16 vuzp_s16
15922 
15923 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
15924 #define vuzp_u32 vuzp_s32
15925 
15926 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
15927 #define vuzp_f32 vzip_f32
15928 
15929 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
15930 #define vuzp_p8 vuzp_u8
15931 
15932 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
15933 #define vuzp_p16 vuzp_u16
15934 
15935 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
15936 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
15937 {
15938     int8x16x2_t v8x16;
15939     __m128i a_sh, b_sh;
15940     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15941     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15942     //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
15943     v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
15944     v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
15945     return v8x16;
15946 }
15947 
15948 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
15949 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
15950 {
15951     int16x8x2_t v16x8;
15952     __m128i a_sh, b_sh;
15953      a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
15954     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
15955     v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
15956     v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
15957     return v16x8;
15958 }
15959 
15960 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
15961 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
15962 {
15963     //may not be an optimal solution compared with a serial one
15964     int32x4x2_t v32x4;
15965     __m128i a_sh, b_sh;
15966     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15967     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15968 
15969     v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
15970     v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
15971     return v32x4;
15972 }
15973 
15974 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
15975 #define vuzpq_u8 vuzpq_s8
15976 
15977 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
15978 #define vuzpq_u16 vuzpq_s16
15979 
15980 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
15981 #define vuzpq_u32 vuzpq_s32
15982 
15983 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
15984 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
15985 {
15986     float32x4x2_t v32x4;
15987     v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
15988     v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
15989     return v32x4;
15990 }
15991 
15992 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
15993 #define vuzpq_p8 vuzpq_u8
15994 
15995 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
15996 #define vuzpq_p16 vuzpq_u16
15997 
15998 //##############################################################################################
15999 //*********************** Reinterpret cast intrinsics.******************************************
16000 //##############################################################################################
16001 // Not a part of the official NEON instruction set but available in the gcc compiler *********************
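//These casts only relabel the type of a value; the underlying 64- or 128-bit pattern is left untouched,
//so most of them expand to nothing or to a simple _M128/_M128i re-typing.
//A minimal usage sketch (illustrative only):
//    float32x4_t f = vdupq_n_f32(1.0f);
//    uint64x2_t  u = vreinterpretq_u64_f32(f); //each 64-bit lane == 0x3f8000003f800000, the raw IEEE-754 bits of two 1.0f values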
16002 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
16003 #define vreinterpret_p8_u32
16004 
16005 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
16006 #define vreinterpret_p8_u16
16007 
16008 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
16009 #define vreinterpret_p8_u8
16010 
16011 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
16012 #define vreinterpret_p8_s32
16013 
16014 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
16015 #define vreinterpret_p8_s16
16016 
16017 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
16018 #define vreinterpret_p8_s8
16019 
16020 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
16021 #define vreinterpret_p8_u64
16022 
16023 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
16024 #define vreinterpret_p8_s64
16025 
16026 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
16027 #define vreinterpret_p8_f32
16028 
16029 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
16030 #define vreinterpret_p8_p16
16031 
16032 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
16033 #define vreinterpretq_p8_u32
16034 
16035 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
16036 #define vreinterpretq_p8_u16
16037 
16038 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
16039 #define vreinterpretq_p8_u8
16040 
16041 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
16042 #define vreinterpretq_p8_s32
16043 
16044 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
16045 #define vreinterpretq_p8_s16
16046 
16047 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
16048 #define vreinterpretq_p8_s8
16049 
16050 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
16051 #define vreinterpretq_p8_u64
16052 
16053 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
16054 #define vreinterpretq_p8_s64
16055 
16056 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
16057 #define vreinterpretq_p8_f32(t) _M128i(t)
16058 
16059 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
16060 #define vreinterpretq_p8_p16
16061 
16062 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
16063 #define vreinterpret_p16_u32
16064 
16065 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
16066 #define vreinterpret_p16_u16
16067 
16068 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
16069 #define vreinterpret_p16_u8
16070 
16071 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
16072 #define vreinterpret_p16_s32
16073 
16074 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
16075 #define vreinterpret_p16_s16
16076 
16077 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
16078 #define vreinterpret_p16_s8
16079 
16080 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
16081 #define vreinterpret_p16_u64
16082 
16083 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
16084 #define vreinterpret_p16_s64
16085 
16086 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
16087 #define vreinterpret_p16_f32
16088 
16089 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
16090 #define vreinterpret_p16_p8
16091 
16092 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
16093 #define vreinterpretq_p16_u32
16094 
16095 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
16096 #define vreinterpretq_p16_u16
16097 
16098 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
16099 #define vreinterpretq_p16_s32
16100 
16101 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
16102 #define vreinterpretq_p16_s16
16103 
16104 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
16105 #define vreinterpretq_p16_s8
16106 
16107 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
16108 #define vreinterpretq_p16_u64
16109 
16110 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
16111 #define vreinterpretq_p16_s64
16112 
16113 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
16114 #define vreinterpretq_p16_f32(t) _M128i(t)
16115 
16116 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
16117 #define vreinterpretq_p16_p8  vreinterpretq_s16_p8
16118 
16119 //****  Integer to float  ******
16120 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
16121 _NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
16122 {
16123     return (*(__m64_128*)&(t));
16124 }
16125 
16126 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
16127 #define vreinterpret_f32_u16 vreinterpret_f32_u32
16128 
16129 
16130 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
16131 #define vreinterpret_f32_u8 vreinterpret_f32_u32
16132 
16133 
16134 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s32 (int32x2_t t);
16135 #define vreinterpret_f32_s32 vreinterpret_f32_u32
16136 
16137 
16138 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s16 (int16x4_t t);
16139 #define vreinterpret_f32_s16 vreinterpret_f32_u32
16140 
16141 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s8 (int8x8_t t);
16142 #define vreinterpret_f32_s8 vreinterpret_f32_u32
16143 
16144 
16145 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u64(uint64x1_t t);
16146 #define vreinterpret_f32_u64 vreinterpret_f32_u32
16147 
16148 
16149 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s64 (int64x1_t t);
16150 #define vreinterpret_f32_s64 vreinterpret_f32_u32
16151 
16152 
16153 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
16154 #define vreinterpret_f32_p16 vreinterpret_f32_u32
16155 
16156 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
16157 #define vreinterpret_f32_p8 vreinterpret_f32_u32
16158 
16159 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
16160 #define  vreinterpretq_f32_u32(t) _M128(t)
16161 
16162 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
16163 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32
16164 
16165 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
16166 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32
16167 
16168 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
16169 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32
16170 
16171 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
16172 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32
16173 
16174 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
16175 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32
16176 
16177 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
16178 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32
16179 
16180 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
16181 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32
16182 
16183 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
16184 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32
16185 
16186 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
16187 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32
16188 
16189 //*** Integer type conversions ******************
16190 //no conversion is necessary for the following functions because the types share the same underlying data representation
16191 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
16192 #define vreinterpret_s64_u32
16193 
16194 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
16195 #define vreinterpret_s64_u16
16196 
16197 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
16198 #define vreinterpret_s64_u8
16199 
16200 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s32 (int32x2_t t);
16201 #define  vreinterpret_s64_s32
16202 
16203 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s16 (int16x4_t t);
16204 #define vreinterpret_s64_s16
16205 
16206 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s8 (int8x8_t t);
16207 #define  vreinterpret_s64_s8
16208 
16209 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
16210 #define  vreinterpret_s64_u64
16211 
16212 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_f32 (float32x2_t t);
16213 #define  vreinterpret_s64_f32
16214 
16215 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
16216 #define vreinterpret_s64_p16
16217 
16218 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
16219 #define vreinterpret_s64_p8
16220 
16221 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
16222 #define vreinterpretq_s64_u32
16223 
16224 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
16225 #define vreinterpretq_s64_s16
16226 
16227 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
16228 #define vreinterpretq_s64_u8
16229 
16230 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
16231 #define vreinterpretq_s64_s32
16232 
16233 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
16234 #define vreinterpretq_s64_u16
16235 
16236 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
16237 #define vreinterpretq_s64_s8
16238 
16239 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
16240 #define vreinterpretq_s64_u64
16241 
16242 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
16243 #define vreinterpretq_s64_f32(t) _M128i(t)
16244 
16245 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
16246 #define vreinterpretq_s64_p16
16247 
16248 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
16249 #define vreinterpretq_s64_p8
16250 
16251 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
16252 #define vreinterpret_u64_u32
16253 
16254 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
16255 #define vreinterpret_u64_u16
16256 
16257 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
16258 #define vreinterpret_u64_u8
16259 
16260 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
16261 #define vreinterpret_u64_s32
16262 
16263 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
16264 #define vreinterpret_u64_s16
16265 
16266 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
16267 #define vreinterpret_u64_s8
16268 
16269 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
16270 #define vreinterpret_u64_s64
16271 
16272 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
16273 #define vreinterpret_u64_f32
16274 
16275 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
16276 #define vreinterpret_u64_p16
16277 
16278 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
16279 #define vreinterpret_u64_p8
16280 
16281 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
16282 #define vreinterpretq_u64_u32
16283 
16284 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
16285 #define vreinterpretq_u64_u16
16286 
16287 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
16288 #define vreinterpretq_u64_u8
16289 
16290 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
16291 #define vreinterpretq_u64_s32
16292 
16293 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
16294 #define vreinterpretq_u64_s16
16295 
16296 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
16297 #define vreinterpretq_u64_s8
16298 
16299 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
16300 #define vreinterpretq_u64_s64
16301 
16302 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
16303 #define vreinterpretq_u64_f32(t) _M128i(t)
16304 
16305 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
16306 #define vreinterpretq_u64_p16
16307 
16308 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
16309 #define vreinterpretq_u64_p8
16310 
16311 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
16312 #define vreinterpret_s8_u32
16313 
16314 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
16315 #define vreinterpret_s8_u16
16316 
16317 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
16318 #define vreinterpret_s8_u8
16319 
16320 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s32 (int32x2_t t);
16321 #define vreinterpret_s8_s32
16322 
16323 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s16 (int16x4_t t);
16324 #define vreinterpret_s8_s16
16325 
16326 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
16327 #define vreinterpret_s8_u64
16328 
16329 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s64 (int64x1_t t);
16330 #define vreinterpret_s8_s64
16331 
16332 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_f32 (float32x2_t t);
16333 #define vreinterpret_s8_f32
16334 
16335 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
16336 #define vreinterpret_s8_p16
16337 
16338 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
16339 #define vreinterpret_s8_p8
16340 
16341 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
16342 #define vreinterpretq_s8_u32
16343 
16344 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
16345 #define vreinterpretq_s8_u16
16346 
16347 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
16348 #define vreinterpretq_s8_u8
16349 
16350 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
16351 #define vreinterpretq_s8_s32
16352 
16353 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
16354 #define vreinterpretq_s8_s16
16355 
16356 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
16357 #define vreinterpretq_s8_u64
16358 
16359 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
16360 #define vreinterpretq_s8_s64
16361 
16362 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
16363 #define vreinterpretq_s8_f32(t) _M128i(t)
16364 
16365 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
16366 #define vreinterpretq_s8_p16
16367 
16368 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
16369 #define vreinterpretq_s8_p8
16370 
16371 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
16372 #define vreinterpret_s16_u32
16373 
16374 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
16375 #define vreinterpret_s16_u16
16376 
16377 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
16378 #define vreinterpret_s16_u8
16379 
16380 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s32 (int32x2_t t);
16381 #define vreinterpret_s16_s32
16382 
16383 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s8 (int8x8_t t);
16384 #define vreinterpret_s16_s8
16385 
16386 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
16387 #define vreinterpret_s16_u64
16388 
16389 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s64 (int64x1_t t);
16390 #define vreinterpret_s16_s64
16391 
16392 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_f32 (float32x2_t t);
16393 #define vreinterpret_s16_f32
16394 
16395 
16396 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
16397 #define vreinterpret_s16_p16
16398 
16399 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
16400 #define vreinterpret_s16_p8
16401 
16402 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
16403 #define vreinterpretq_s16_u32
16404 
16405 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
16406 #define vreinterpretq_s16_u16
16407 
16408 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
16409 #define vreinterpretq_s16_u8
16410 
16411 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
16412 #define vreinterpretq_s16_s32
16413 
16414 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
16415 #define vreinterpretq_s16_s8
16416 
16417 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
16418 #define vreinterpretq_s16_u64
16419 
16420 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
16421 #define vreinterpretq_s16_s64
16422 
16423 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
16424 #define vreinterpretq_s16_f32(t) _M128i(t)
16425 
16426 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
16427 #define vreinterpretq_s16_p16
16428 
16429 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
16430 #define vreinterpretq_s16_p8
16431 
16432 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
16433 #define vreinterpret_s32_u32
16434 
16435 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
16436 #define vreinterpret_s32_u16
16437 
16438 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
16439 #define vreinterpret_s32_u8
16440 
16441 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s16 (int16x4_t t);
16442 #define vreinterpret_s32_s16
16443 
16444 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s8 (int8x8_t t);
16445 #define vreinterpret_s32_s8
16446 
16447 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
16448 #define vreinterpret_s32_u64
16449 
16450 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s64 (int64x1_t t);
16451 #define vreinterpret_s32_s64
16452 
16453 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_f32 (float32x2_t t);
16454 #define vreinterpret_s32_f32
16455 
16456 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
16457 #define vreinterpret_s32_p16
16458 
16459 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
16460 #define vreinterpret_s32_p8
16461 
16462 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
16463 #define vreinterpretq_s32_u32
16464 
16465 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
16466 #define vreinterpretq_s32_u16
16467 
16468 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
16469 #define vreinterpretq_s32_u8
16470 
16471 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
16472 #define vreinterpretq_s32_s16
16473 
16474 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
16475 #define vreinterpretq_s32_s8
16476 
16477 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
16478 #define vreinterpretq_s32_u64
16479 
16480 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
16481 #define vreinterpretq_s32_s64
16482 
16483 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
16484 #define vreinterpretq_s32_f32(t) _M128i(t)
16485 
16486 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
16487 #define vreinterpretq_s32_p16
16488 
16489 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
16490 #define vreinterpretq_s32_p8
16491 
16492 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
16493 #define vreinterpret_u8_u32
16494 
16495 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
16496 #define vreinterpret_u8_u16
16497 
16498 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
16499 #define vreinterpret_u8_s32
16500 
16501 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
16502 #define vreinterpret_u8_s16
16503 
16504 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
16505 #define vreinterpret_u8_s8
16506 
16507 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
16508 #define vreinterpret_u8_u64
16509 
16510 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
16511 #define vreinterpret_u8_s64
16512 
16513 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
16514 #define vreinterpret_u8_f32
16515 
16516 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
16517 #define vreinterpret_u8_p16
16518 
16519 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
16520 #define vreinterpret_u8_p8
16521 
16522 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
16523 #define vreinterpretq_u8_u32
16524 
16525 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
16526 #define vreinterpretq_u8_u16
16527 
16528 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
16529 #define vreinterpretq_u8_s32
16530 
16531 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
16532 #define vreinterpretq_u8_s16
16533 
16534 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
16535 #define vreinterpretq_u8_s8
16536 
16537 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
16538 #define vreinterpretq_u8_u64
16539 
16540 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
16541 #define vreinterpretq_u8_s64
16542 
16543 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
16544 #define vreinterpretq_u8_f32(t) _M128i(t)
16545 
16546 
16547 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
16548 #define vreinterpretq_u8_p16
16549 
16550 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
16551 #define vreinterpretq_u8_p8
16552 
16553 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
16554 #define vreinterpret_u16_u32
16555 
16556 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
16557 #define vreinterpret_u16_u8
16558 
16559 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
16560 #define vreinterpret_u16_s32
16561 
16562 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
16563 #define vreinterpret_u16_s16
16564 
16565 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
16566 #define vreinterpret_u16_s8
16567 
16568 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
16569 #define vreinterpret_u16_u64
16570 
16571 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
16572 #define vreinterpret_u16_s64
16573 
16574 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
16575 #define vreinterpret_u16_f32
16576 
16577 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
16578 #define vreinterpret_u16_p16
16579 
16580 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
16581 #define vreinterpret_u16_p8
16582 
16583 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
16584 #define vreinterpretq_u16_u32
16585 
16586 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
16587 #define vreinterpretq_u16_u8
16588 
16589 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
16590 #define vreinterpretq_u16_s32
16591 
16592 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
16593 #define vreinterpretq_u16_s16
16594 
16595 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
16596 #define vreinterpretq_u16_s8
16597 
16598 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
16599 #define vreinterpretq_u16_u64
16600 
16601 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
16602 #define vreinterpretq_u16_s64
16603 
16604 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
16605 #define vreinterpretq_u16_f32(t) _M128i(t)
16606 
16607 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
16608 #define vreinterpretq_u16_p16
16609 
16610 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
16611 #define vreinterpretq_u16_p8
16612 
16613 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
16614 #define vreinterpret_u32_u16
16615 
16616 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
16617 #define vreinterpret_u32_u8
16618 
16619 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
16620 #define vreinterpret_u32_s32
16621 
16622 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
16623 #define vreinterpret_u32_s16
16624 
16625 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
16626 #define vreinterpret_u32_s8
16627 
16628 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
16629 #define vreinterpret_u32_u64
16630 
16631 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
16632 #define vreinterpret_u32_s64
16633 
16634 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
16635 #define vreinterpret_u32_f32
16636 
16637 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
16638 #define vreinterpret_u32_p16
16639 
16640 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
16641 #define vreinterpret_u32_p8
16642 
16643 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
16644 #define vreinterpretq_u32_u16
16645 
16646 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
16647 #define vreinterpretq_u32_u8
16648 
16649 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
16650 #define vreinterpretq_u32_s32
16651 
16652 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
16653 #define vreinterpretq_u32_s16
16654 
16655 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
16656 #define vreinterpretq_u32_s8
16657 
16658 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
16659 #define vreinterpretq_u32_u64
16660 
16661 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
16662 #define vreinterpretq_u32_s64
16663 
16664 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
16665 #define vreinterpretq_u32_f32(t) _M128i(t)
16666 
16667 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
16668 #define vreinterpretq_u32_p16
16669 
16670 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
16671 #define vreinterpretq_u32_p8
16672 
16673 //*************  Round ******************
16674 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
16675 #ifdef USE_SSE4
16676 #   define vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
16677 #else
16678 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
16679 {
16680     int i;
16681     _NEON2SSE_ALIGN_16 float32_t res[4];
16682     _mm_store_ps(res, a);
16683     for(i = 0; i<4; i++) {
16684         res[i] = nearbyintf(res[i]); //round each lane to nearest using the current rounding mode (ties to even by default)
16685     }
16686     return _mm_load_ps(res);
16687 }
16688 #endif
16689 
16690 
16691 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
16692 #ifdef USE_SSE4
16693 #   define  vrndnq_f64(a)  _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
16694 #else
16695 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
16696 {
16697     _NEON2SSE_ALIGN_16 float64_t res[2];
16698     _mm_store_pd(res, a);
16699     res[0] = nearbyint(res[0]); //use the double-precision nearbyint: nearbyintf would round through float and lose precision
16700     res[1] = nearbyint(res[1]);
16701     return _mm_load_pd(res);
16702 }
16703 #endif
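//Note on semantics: NEON vrndn always rounds to nearest with ties to even; the SSE4.1 _mm_round_* path above forces
//that mode explicitly, while the serial nearbyint fallback follows the current rounding mode (nearest-even by default).
//A small illustrative check (hypothetical test code, not part of this header):
//
//    _NEON2SSE_ALIGN_16 float in[4] = {0.5f, 1.5f, 2.5f, -2.5f};
//    _NEON2SSE_ALIGN_16 float out[4];
//    _mm_store_ps(out, vrndnq_f32(_mm_load_ps(in)));   //expected 0, 2, 2, -2: ties go to the even neighbour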
16704 
16705 
16706 
16707 //************* Sqrt ******************
16708 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a);
16709 #define vsqrtq_f32 _mm_sqrt_ps
16710 
16711 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a);
16712 #define vsqrtq_f64 _mm_sqrt_pd
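//Both sqrt mappings are direct one-to-one translations to the correctly rounded SSE square root, e.g.
//(hypothetical caller code): float32x4_t r = vsqrtq_f32(vdupq_n_f32(2.0f)); //every lane holds sqrtf(2.0f)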
16713 
16714 
16715 #endif /* NEON2SSE_H */
16716