/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23 
24 #ifndef __EMMINTRIN_H
25 #define __EMMINTRIN_H
26 
27 #ifndef __SSE2__
28 #error "SSE2 instruction set not enabled"
29 #else
30 
31 #include <xmmintrin.h>
32 
33 typedef double __m128d __attribute__((__vector_size__(16)));
34 typedef long long __m128i __attribute__((__vector_size__(16)));
35 
36 /* Type defines.  */
37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39 typedef short __v8hi __attribute__((__vector_size__(16)));
40 typedef char __v16qi __attribute__((__vector_size__(16)));
41 
42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a,__m128d __b)43 _mm_add_sd(__m128d __a, __m128d __b)
44 {
45   __a[0] += __b[0];
46   return __a;
47 }
48 
49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a,__m128d __b)50 _mm_add_pd(__m128d __a, __m128d __b)
51 {
52   return __a + __b;
53 }
54 
55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a,__m128d __b)56 _mm_sub_sd(__m128d __a, __m128d __b)
57 {
58   __a[0] -= __b[0];
59   return __a;
60 }
61 
62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a,__m128d __b)63 _mm_sub_pd(__m128d __a, __m128d __b)
64 {
65   return __a - __b;
66 }
67 
68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a,__m128d __b)69 _mm_mul_sd(__m128d __a, __m128d __b)
70 {
71   __a[0] *= __b[0];
72   return __a;
73 }
74 
75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a,__m128d __b)76 _mm_mul_pd(__m128d __a, __m128d __b)
77 {
78   return __a * __b;
79 }
80 
81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a,__m128d __b)82 _mm_div_sd(__m128d __a, __m128d __b)
83 {
84   __a[0] /= __b[0];
85   return __a;
86 }
87 
88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a,__m128d __b)89 _mm_div_pd(__m128d __a, __m128d __b)
90 {
91   return __a / __b;
92 }
93 
94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a,__m128d __b)95 _mm_sqrt_sd(__m128d __a, __m128d __b)
96 {
97   __m128d __c = __builtin_ia32_sqrtsd(__b);
98   return (__m128d) { __c[0], __a[1] };
99 }
100 
101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)102 _mm_sqrt_pd(__m128d __a)
103 {
104   return __builtin_ia32_sqrtpd(__a);
105 }
106 
107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a,__m128d __b)108 _mm_min_sd(__m128d __a, __m128d __b)
109 {
110   return __builtin_ia32_minsd(__a, __b);
111 }
112 
113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a,__m128d __b)114 _mm_min_pd(__m128d __a, __m128d __b)
115 {
116   return __builtin_ia32_minpd(__a, __b);
117 }
118 
119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a,__m128d __b)120 _mm_max_sd(__m128d __a, __m128d __b)
121 {
122   return __builtin_ia32_maxsd(__a, __b);
123 }
124 
125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a,__m128d __b)126 _mm_max_pd(__m128d __a, __m128d __b)
127 {
128   return __builtin_ia32_maxpd(__a, __b);
129 }
130 
131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a,__m128d __b)132 _mm_and_pd(__m128d __a, __m128d __b)
133 {
134   return (__m128d)((__v4si)__a & (__v4si)__b);
135 }
136 
137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a,__m128d __b)138 _mm_andnot_pd(__m128d __a, __m128d __b)
139 {
140   return (__m128d)(~(__v4si)__a & (__v4si)__b);
141 }
142 
143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a,__m128d __b)144 _mm_or_pd(__m128d __a, __m128d __b)
145 {
146   return (__m128d)((__v4si)__a | (__v4si)__b);
147 }
148 
149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a,__m128d __b)150 _mm_xor_pd(__m128d __a, __m128d __b)
151 {
152   return (__m128d)((__v4si)__a ^ (__v4si)__b);
153 }
154 
155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a,__m128d __b)156 _mm_cmpeq_pd(__m128d __a, __m128d __b)
157 {
158   return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
159 }
160 
161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a,__m128d __b)162 _mm_cmplt_pd(__m128d __a, __m128d __b)
163 {
164   return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
165 }
166 
167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a,__m128d __b)168 _mm_cmple_pd(__m128d __a, __m128d __b)
169 {
170   return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
171 }
172 
173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a,__m128d __b)174 _mm_cmpgt_pd(__m128d __a, __m128d __b)
175 {
176   return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
177 }
178 
179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a,__m128d __b)180 _mm_cmpge_pd(__m128d __a, __m128d __b)
181 {
182   return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
183 }
184 
185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a,__m128d __b)186 _mm_cmpord_pd(__m128d __a, __m128d __b)
187 {
188   return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
189 }
190 
191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a,__m128d __b)192 _mm_cmpunord_pd(__m128d __a, __m128d __b)
193 {
194   return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
195 }
196 
197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a,__m128d __b)198 _mm_cmpneq_pd(__m128d __a, __m128d __b)
199 {
200   return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
201 }
202 
203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a,__m128d __b)204 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
205 {
206   return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
207 }
208 
209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a,__m128d __b)210 _mm_cmpnle_pd(__m128d __a, __m128d __b)
211 {
212   return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
213 }
214 
215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a,__m128d __b)216 _mm_cmpngt_pd(__m128d __a, __m128d __b)
217 {
218   return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
219 }
220 
221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a,__m128d __b)222 _mm_cmpnge_pd(__m128d __a, __m128d __b)
223 {
224   return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
225 }
226 
227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a,__m128d __b)228 _mm_cmpeq_sd(__m128d __a, __m128d __b)
229 {
230   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
231 }
232 
233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a,__m128d __b)234 _mm_cmplt_sd(__m128d __a, __m128d __b)
235 {
236   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
237 }
238 
239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a,__m128d __b)240 _mm_cmple_sd(__m128d __a, __m128d __b)
241 {
242   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
243 }
244 
245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a,__m128d __b)246 _mm_cmpgt_sd(__m128d __a, __m128d __b)
247 {
248   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 1);
249   return (__m128d) { __c[0], __a[1] };
250 }
251 
252 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a,__m128d __b)253 _mm_cmpge_sd(__m128d __a, __m128d __b)
254 {
255   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 2);
256   return (__m128d) { __c[0], __a[1] };
257 }
258 
259 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a,__m128d __b)260 _mm_cmpord_sd(__m128d __a, __m128d __b)
261 {
262   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
263 }
264 
265 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a,__m128d __b)266 _mm_cmpunord_sd(__m128d __a, __m128d __b)
267 {
268   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
269 }
270 
271 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a,__m128d __b)272 _mm_cmpneq_sd(__m128d __a, __m128d __b)
273 {
274   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
275 }
276 
277 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a,__m128d __b)278 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
279 {
280   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
281 }
282 
283 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a,__m128d __b)284 _mm_cmpnle_sd(__m128d __a, __m128d __b)
285 {
286   return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
287 }
288 
289 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a,__m128d __b)290 _mm_cmpngt_sd(__m128d __a, __m128d __b)
291 {
292   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 5);
293   return (__m128d) { __c[0], __a[1] };
294 }
295 
296 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a,__m128d __b)297 _mm_cmpnge_sd(__m128d __a, __m128d __b)
298 {
299   __m128d __c = __builtin_ia32_cmpsd(__b, __a, 6);
300   return (__m128d) { __c[0], __a[1] };
301 }
302 
303 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a,__m128d __b)304 _mm_comieq_sd(__m128d __a, __m128d __b)
305 {
306   return __builtin_ia32_comisdeq(__a, __b);
307 }
308 
309 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a,__m128d __b)310 _mm_comilt_sd(__m128d __a, __m128d __b)
311 {
312   return __builtin_ia32_comisdlt(__a, __b);
313 }
314 
315 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a,__m128d __b)316 _mm_comile_sd(__m128d __a, __m128d __b)
317 {
318   return __builtin_ia32_comisdle(__a, __b);
319 }
320 
321 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a,__m128d __b)322 _mm_comigt_sd(__m128d __a, __m128d __b)
323 {
324   return __builtin_ia32_comisdgt(__a, __b);
325 }
326 
327 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a,__m128d __b)328 _mm_comige_sd(__m128d __a, __m128d __b)
329 {
330   return __builtin_ia32_comisdge(__a, __b);
331 }
332 
333 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a,__m128d __b)334 _mm_comineq_sd(__m128d __a, __m128d __b)
335 {
336   return __builtin_ia32_comisdneq(__a, __b);
337 }
338 
339 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a,__m128d __b)340 _mm_ucomieq_sd(__m128d __a, __m128d __b)
341 {
342   return __builtin_ia32_ucomisdeq(__a, __b);
343 }
344 
345 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a,__m128d __b)346 _mm_ucomilt_sd(__m128d __a, __m128d __b)
347 {
348   return __builtin_ia32_ucomisdlt(__a, __b);
349 }
350 
351 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a,__m128d __b)352 _mm_ucomile_sd(__m128d __a, __m128d __b)
353 {
354   return __builtin_ia32_ucomisdle(__a, __b);
355 }
356 
357 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a,__m128d __b)358 _mm_ucomigt_sd(__m128d __a, __m128d __b)
359 {
360   return __builtin_ia32_ucomisdgt(__a, __b);
361 }
362 
363 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a,__m128d __b)364 _mm_ucomige_sd(__m128d __a, __m128d __b)
365 {
366   return __builtin_ia32_ucomisdge(__a, __b);
367 }
368 
369 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a,__m128d __b)370 _mm_ucomineq_sd(__m128d __a, __m128d __b)
371 {
372   return __builtin_ia32_ucomisdneq(__a, __b);
373 }
374 
375 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)376 _mm_cvtpd_ps(__m128d __a)
377 {
378   return __builtin_ia32_cvtpd2ps(__a);
379 }
380 
381 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)382 _mm_cvtps_pd(__m128 __a)
383 {
384   return __builtin_ia32_cvtps2pd(__a);
385 }
386 
387 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)388 _mm_cvtepi32_pd(__m128i __a)
389 {
390   return __builtin_ia32_cvtdq2pd((__v4si)__a);
391 }
392 
393 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)394 _mm_cvtpd_epi32(__m128d __a)
395 {
396   return __builtin_ia32_cvtpd2dq(__a);
397 }
398 
399 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)400 _mm_cvtsd_si32(__m128d __a)
401 {
402   return __builtin_ia32_cvtsd2si(__a);
403 }
404 
405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a,__m128d __b)406 _mm_cvtsd_ss(__m128 __a, __m128d __b)
407 {
408   __a[0] = __b[0];
409   return __a;
410 }
411 
412 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a,int __b)413 _mm_cvtsi32_sd(__m128d __a, int __b)
414 {
415   __a[0] = __b;
416   return __a;
417 }
418 
419 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a,__m128 __b)420 _mm_cvtss_sd(__m128d __a, __m128 __b)
421 {
422   __a[0] = __b[0];
423   return __a;
424 }
425 
426 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)427 _mm_cvttpd_epi32(__m128d __a)
428 {
429   return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430 }
431 
432 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)433 _mm_cvttsd_si32(__m128d __a)
434 {
435   return __a[0];
436 }
437 
438 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d __a)439 _mm_cvtpd_pi32(__m128d __a)
440 {
441   return (__m64)__builtin_ia32_cvtpd2pi(__a);
442 }
443 
444 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d __a)445 _mm_cvttpd_pi32(__m128d __a)
446 {
447   return (__m64)__builtin_ia32_cvttpd2pi(__a);
448 }
449 
450 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 __a)451 _mm_cvtpi32_pd(__m64 __a)
452 {
453   return __builtin_ia32_cvtpi2pd((__v2si)__a);
454 }
455 
456 static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)457 _mm_cvtsd_f64(__m128d __a)
458 {
459   return __a[0];
460 }
461 
462 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const * __dp)463 _mm_load_pd(double const *__dp)
464 {
465   return *(__m128d*)__dp;
466 }
467 
468 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const * __dp)469 _mm_load1_pd(double const *__dp)
470 {
471   struct __mm_load1_pd_struct {
472     double __u;
473   } __attribute__((__packed__, __may_alias__));
474   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475   return (__m128d){ __u, __u };
476 }
477 
/* Alternate spelling that expands to _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
479 
480 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const * __dp)481 _mm_loadr_pd(double const *__dp)
482 {
483   __m128d __u = *(__m128d*)__dp;
484   return __builtin_shufflevector(__u, __u, 1, 0);
485 }
486 
487 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const * __dp)488 _mm_loadu_pd(double const *__dp)
489 {
490   struct __loadu_pd {
491     __m128d __v;
492   } __attribute__((packed, may_alias));
493   return ((struct __loadu_pd*)__dp)->__v;
494 }
495 
496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const * __dp)497 _mm_load_sd(double const *__dp)
498 {
499   struct __mm_load_sd_struct {
500     double __u;
501   } __attribute__((__packed__, __may_alias__));
502   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503   return (__m128d){ __u, 0 };
504 }
505 
506 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a,double const * __dp)507 _mm_loadh_pd(__m128d __a, double const *__dp)
508 {
509   struct __mm_loadh_pd_struct {
510     double __u;
511   } __attribute__((__packed__, __may_alias__));
512   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513   return (__m128d){ __a[0], __u };
514 }
515 
516 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a,double const * __dp)517 _mm_loadl_pd(__m128d __a, double const *__dp)
518 {
519   struct __mm_loadl_pd_struct {
520     double __u;
521   } __attribute__((__packed__, __may_alias__));
522   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523   return (__m128d){ __u, __a[1] };
524 }
525 
526 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)527 _mm_set_sd(double __w)
528 {
529   return (__m128d){ __w, 0 };
530 }
531 
532 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)533 _mm_set1_pd(double __w)
534 {
535   return (__m128d){ __w, __w };
536 }
537 
538 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __w,double __x)539 _mm_set_pd(double __w, double __x)
540 {
541   return (__m128d){ __x, __w };
542 }
543 
544 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __w,double __x)545 _mm_setr_pd(double __w, double __x)
546 {
547   return (__m128d){ __w, __x };
548 }
549 
550 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)551 _mm_setzero_pd(void)
552 {
553   return (__m128d){ 0, 0 };
554 }
555 
556 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a,__m128d __b)557 _mm_move_sd(__m128d __a, __m128d __b)
558 {
559   return (__m128d){ __b[0], __a[1] };
560 }
561 
562 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double * __dp,__m128d __a)563 _mm_store_sd(double *__dp, __m128d __a)
564 {
565   struct __mm_store_sd_struct {
566     double __u;
567   } __attribute__((__packed__, __may_alias__));
568   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569 }
570 
571 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double * __dp,__m128d __a)572 _mm_store1_pd(double *__dp, __m128d __a)
573 {
574   struct __mm_store1_pd_struct {
575     double __u[2];
576   } __attribute__((__packed__, __may_alias__));
577   ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578   ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579 }
580 
581 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double * __dp,__m128d __a)582 _mm_store_pd(double *__dp, __m128d __a)
583 {
584   *(__m128d *)__dp = __a;
585 }
586 
587 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double * __dp,__m128d __a)588 _mm_storeu_pd(double *__dp, __m128d __a)
589 {
590   __builtin_ia32_storeupd(__dp, __a);
591 }
592 
593 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double * __dp,__m128d __a)594 _mm_storer_pd(double *__dp, __m128d __a)
595 {
596   __a = __builtin_shufflevector(__a, __a, 1, 0);
597   *(__m128d *)__dp = __a;
598 }
599 
600 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double * __dp,__m128d __a)601 _mm_storeh_pd(double *__dp, __m128d __a)
602 {
603   struct __mm_storeh_pd_struct {
604     double __u;
605   } __attribute__((__packed__, __may_alias__));
606   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607 }
608 
609 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double * __dp,__m128d __a)610 _mm_storel_pd(double *__dp, __m128d __a)
611 {
612   struct __mm_storeh_pd_struct {
613     double __u;
614   } __attribute__((__packed__, __may_alias__));
615   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616 }
617 
618 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a,__m128i __b)619 _mm_add_epi8(__m128i __a, __m128i __b)
620 {
621   return (__m128i)((__v16qi)__a + (__v16qi)__b);
622 }
623 
624 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a,__m128i __b)625 _mm_add_epi16(__m128i __a, __m128i __b)
626 {
627   return (__m128i)((__v8hi)__a + (__v8hi)__b);
628 }
629 
630 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i __a,__m128i __b)631 _mm_add_epi32(__m128i __a, __m128i __b)
632 {
633   return (__m128i)((__v4si)__a + (__v4si)__b);
634 }
635 
636 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 __a,__m64 __b)637 _mm_add_si64(__m64 __a, __m64 __b)
638 {
639   return __a + __b;
640 }
641 
642 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a,__m128i __b)643 _mm_add_epi64(__m128i __a, __m128i __b)
644 {
645   return __a + __b;
646 }
647 
648 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a,__m128i __b)649 _mm_adds_epi8(__m128i __a, __m128i __b)
650 {
651   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652 }
653 
654 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a,__m128i __b)655 _mm_adds_epi16(__m128i __a, __m128i __b)
656 {
657   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658 }
659 
660 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a,__m128i __b)661 _mm_adds_epu8(__m128i __a, __m128i __b)
662 {
663   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664 }
665 
666 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a,__m128i __b)667 _mm_adds_epu16(__m128i __a, __m128i __b)
668 {
669   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670 }
671 
672 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a,__m128i __b)673 _mm_avg_epu8(__m128i __a, __m128i __b)
674 {
675   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676 }
677 
678 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a,__m128i __b)679 _mm_avg_epu16(__m128i __a, __m128i __b)
680 {
681   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682 }
683 
684 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a,__m128i __b)685 _mm_madd_epi16(__m128i __a, __m128i __b)
686 {
687   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688 }
689 
690 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a,__m128i __b)691 _mm_max_epi16(__m128i __a, __m128i __b)
692 {
693   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694 }
695 
696 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a,__m128i __b)697 _mm_max_epu8(__m128i __a, __m128i __b)
698 {
699   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700 }
701 
702 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a,__m128i __b)703 _mm_min_epi16(__m128i __a, __m128i __b)
704 {
705   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706 }
707 
708 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a,__m128i __b)709 _mm_min_epu8(__m128i __a, __m128i __b)
710 {
711   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712 }
713 
714 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a,__m128i __b)715 _mm_mulhi_epi16(__m128i __a, __m128i __b)
716 {
717   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718 }
719 
720 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a,__m128i __b)721 _mm_mulhi_epu16(__m128i __a, __m128i __b)
722 {
723   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724 }
725 
726 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a,__m128i __b)727 _mm_mullo_epi16(__m128i __a, __m128i __b)
728 {
729   return (__m128i)((__v8hi)__a * (__v8hi)__b);
730 }
731 
732 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 __a,__m64 __b)733 _mm_mul_su32(__m64 __a, __m64 __b)
734 {
735   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736 }
737 
738 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a,__m128i __b)739 _mm_mul_epu32(__m128i __a, __m128i __b)
740 {
741   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742 }
743 
744 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a,__m128i __b)745 _mm_sad_epu8(__m128i __a, __m128i __b)
746 {
747   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748 }
749 
750 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i __a,__m128i __b)751 _mm_sub_epi8(__m128i __a, __m128i __b)
752 {
753   return (__m128i)((__v16qi)__a - (__v16qi)__b);
754 }
755 
756 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a,__m128i __b)757 _mm_sub_epi16(__m128i __a, __m128i __b)
758 {
759   return (__m128i)((__v8hi)__a - (__v8hi)__b);
760 }
761 
762 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a,__m128i __b)763 _mm_sub_epi32(__m128i __a, __m128i __b)
764 {
765   return (__m128i)((__v4si)__a - (__v4si)__b);
766 }
767 
768 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 __a,__m64 __b)769 _mm_sub_si64(__m64 __a, __m64 __b)
770 {
771   return __a - __b;
772 }
773 
774 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a,__m128i __b)775 _mm_sub_epi64(__m128i __a, __m128i __b)
776 {
777   return __a - __b;
778 }
779 
780 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a,__m128i __b)781 _mm_subs_epi8(__m128i __a, __m128i __b)
782 {
783   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784 }
785 
786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a,__m128i __b)787 _mm_subs_epi16(__m128i __a, __m128i __b)
788 {
789   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790 }
791 
792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a,__m128i __b)793 _mm_subs_epu8(__m128i __a, __m128i __b)
794 {
795   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796 }
797 
798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a,__m128i __b)799 _mm_subs_epu16(__m128i __a, __m128i __b)
800 {
801   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802 }
803 
804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a,__m128i __b)805 _mm_and_si128(__m128i __a, __m128i __b)
806 {
807   return __a & __b;
808 }
809 
810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a,__m128i __b)811 _mm_andnot_si128(__m128i __a, __m128i __b)
812 {
813   return ~__a & __b;
814 }
815 
816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a,__m128i __b)817 _mm_or_si128(__m128i __a, __m128i __b)
818 {
819   return __a | __b;
820 }
821 
822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a,__m128i __b)823 _mm_xor_si128(__m128i __a, __m128i __b)
824 {
825   return __a ^ __b;
826 }
827 
/* Expands to __builtin_ia32_pslldqi128 on `a`, with `count` multiplied by 8
   before it reaches the builtin (presumably a byte count turned into bits;
   confirm against the builtin's definition).
   The argument is cast and passed directly, not copied into a local named
   __a: such a local would shadow a caller's own __a, so a call like
   _mm_slli_si128(__a, n) would initialise the local from itself. */
#define _mm_slli_si128(a, count) \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8))
831 
832 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a,int __count)833 _mm_slli_epi16(__m128i __a, int __count)
834 {
835   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
836 }
837 
838 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a,__m128i __count)839 _mm_sll_epi16(__m128i __a, __m128i __count)
840 {
841   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
842 }
843 
844 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a,int __count)845 _mm_slli_epi32(__m128i __a, int __count)
846 {
847   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
848 }
849 
850 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a,__m128i __count)851 _mm_sll_epi32(__m128i __a, __m128i __count)
852 {
853   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
854 }
855 
856 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a,int __count)857 _mm_slli_epi64(__m128i __a, int __count)
858 {
859   return __builtin_ia32_psllqi128(__a, __count);
860 }
861 
862 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a,__m128i __count)863 _mm_sll_epi64(__m128i __a, __m128i __count)
864 {
865   return __builtin_ia32_psllq128(__a, __count);
866 }
867 
868 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a,int __count)869 _mm_srai_epi16(__m128i __a, int __count)
870 {
871   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
872 }
873 
874 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a,__m128i __count)875 _mm_sra_epi16(__m128i __a, __m128i __count)
876 {
877   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
878 }
879 
880 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a,int __count)881 _mm_srai_epi32(__m128i __a, int __count)
882 {
883   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
884 }
885 
886 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a,__m128i __count)887 _mm_sra_epi32(__m128i __a, __m128i __count)
888 {
889   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
890 }
891 
892 
/* Shift the whole 128-bit value `a` right by `count` bytes.  The builtin
   takes a bit count, hence the multiplication by 8.  `count` must be a
   compile-time constant.  The argument is cast in place rather than copied
   into a local named __a: such a local shadows a caller variable of the same
   name and turns `_mm_srli_si128(__a, n)` into a self-initialisation. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
896 
897 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a,int __count)898 _mm_srli_epi16(__m128i __a, int __count)
899 {
900   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
901 }
902 
903 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a,__m128i __count)904 _mm_srl_epi16(__m128i __a, __m128i __count)
905 {
906   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
907 }
908 
909 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a,int __count)910 _mm_srli_epi32(__m128i __a, int __count)
911 {
912   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
913 }
914 
915 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a,__m128i __count)916 _mm_srl_epi32(__m128i __a, __m128i __count)
917 {
918   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
919 }
920 
921 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a,int __count)922 _mm_srli_epi64(__m128i __a, int __count)
923 {
924   return __builtin_ia32_psrlqi128(__a, __count);
925 }
926 
927 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a,__m128i __count)928 _mm_srl_epi64(__m128i __a, __m128i __count)
929 {
930   return __builtin_ia32_psrlq128(__a, __count);
931 }
932 
933 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a,__m128i __b)934 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
935 {
936   return (__m128i)((__v16qi)__a == (__v16qi)__b);
937 }
938 
939 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a,__m128i __b)940 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
941 {
942   return (__m128i)((__v8hi)__a == (__v8hi)__b);
943 }
944 
945 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a,__m128i __b)946 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
947 {
948   return (__m128i)((__v4si)__a == (__v4si)__b);
949 }
950 
951 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a,__m128i __b)952 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
953 {
954   /* This function always performs a signed comparison, but __v16qi is a char
955      which may be signed or unsigned. */
956   typedef signed char __v16qs __attribute__((__vector_size__(16)));
957   return (__m128i)((__v16qs)__a > (__v16qs)__b);
958 }
959 
960 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a,__m128i __b)961 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
962 {
963   return (__m128i)((__v8hi)__a > (__v8hi)__b);
964 }
965 
966 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a,__m128i __b)967 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
968 {
969   return (__m128i)((__v4si)__a > (__v4si)__b);
970 }
971 
972 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a,__m128i __b)973 _mm_cmplt_epi8(__m128i __a, __m128i __b)
974 {
975   return _mm_cmpgt_epi8(__b, __a);
976 }
977 
978 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a,__m128i __b)979 _mm_cmplt_epi16(__m128i __a, __m128i __b)
980 {
981   return _mm_cmpgt_epi16(__b, __a);
982 }
983 
984 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a,__m128i __b)985 _mm_cmplt_epi32(__m128i __a, __m128i __b)
986 {
987   return _mm_cmpgt_epi32(__b, __a);
988 }
989 
990 #ifdef __x86_64__
991 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a,long long __b)992 _mm_cvtsi64_sd(__m128d __a, long long __b)
993 {
994   __a[0] = __b;
995   return __a;
996 }
997 
998 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d __a)999 _mm_cvtsd_si64(__m128d __a)
1000 {
1001   return __builtin_ia32_cvtsd2si64(__a);
1002 }
1003 
1004 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d __a)1005 _mm_cvttsd_si64(__m128d __a)
1006 {
1007   return __a[0];
1008 }
1009 #endif
1010 
1011 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i __a)1012 _mm_cvtepi32_ps(__m128i __a)
1013 {
1014   return __builtin_ia32_cvtdq2ps((__v4si)__a);
1015 }
1016 
1017 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 __a)1018 _mm_cvtps_epi32(__m128 __a)
1019 {
1020   return (__m128i)__builtin_ia32_cvtps2dq(__a);
1021 }
1022 
1023 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 __a)1024 _mm_cvttps_epi32(__m128 __a)
1025 {
1026   return (__m128i)__builtin_ia32_cvttps2dq(__a);
1027 }
1028 
1029 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int __a)1030 _mm_cvtsi32_si128(int __a)
1031 {
1032   return (__m128i)(__v4si){ __a, 0, 0, 0 };
1033 }
1034 
1035 #ifdef __x86_64__
1036 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long __a)1037 _mm_cvtsi64_si128(long long __a)
1038 {
1039   return (__m128i){ __a, 0 };
1040 }
1041 #endif
1042 
1043 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i __a)1044 _mm_cvtsi128_si32(__m128i __a)
1045 {
1046   __v4si __b = (__v4si)__a;
1047   return __b[0];
1048 }
1049 
1050 #ifdef __x86_64__
1051 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i __a)1052 _mm_cvtsi128_si64(__m128i __a)
1053 {
1054   return __a[0];
1055 }
1056 #endif
1057 
1058 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const * __p)1059 _mm_load_si128(__m128i const *__p)
1060 {
1061   return *__p;
1062 }
1063 
1064 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const * __p)1065 _mm_loadu_si128(__m128i const *__p)
1066 {
1067   struct __loadu_si128 {
1068     __m128i __v;
1069   } __attribute__((packed, may_alias));
1070   return ((struct __loadu_si128*)__p)->__v;
1071 }
1072 
1073 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const * __p)1074 _mm_loadl_epi64(__m128i const *__p)
1075 {
1076   struct __mm_loadl_epi64_struct {
1077     long long __u;
1078   } __attribute__((__packed__, __may_alias__));
1079   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1080 }
1081 
1082 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1,long long q0)1083 _mm_set_epi64x(long long q1, long long q0)
1084 {
1085   return (__m128i){ q0, q1 };
1086 }
1087 
1088 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1,__m64 q0)1089 _mm_set_epi64(__m64 q1, __m64 q0)
1090 {
1091   return (__m128i){ (long long)q0, (long long)q1 };
1092 }
1093 
1094 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3,int i2,int i1,int i0)1095 _mm_set_epi32(int i3, int i2, int i1, int i0)
1096 {
1097   return (__m128i)(__v4si){ i0, i1, i2, i3};
1098 }
1099 
1100 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7,short w6,short w5,short w4,short w3,short w2,short w1,short w0)1101 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1102 {
1103   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1104 }
1105 
1106 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15,char b14,char b13,char b12,char b11,char b10,char b9,char b8,char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0)1107 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1108 {
1109   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1110 }
1111 
1112 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long __q)1113 _mm_set1_epi64x(long long __q)
1114 {
1115   return (__m128i){ __q, __q };
1116 }
1117 
1118 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 __q)1119 _mm_set1_epi64(__m64 __q)
1120 {
1121   return (__m128i){ (long long)__q, (long long)__q };
1122 }
1123 
1124 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int __i)1125 _mm_set1_epi32(int __i)
1126 {
1127   return (__m128i)(__v4si){ __i, __i, __i, __i };
1128 }
1129 
1130 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short __w)1131 _mm_set1_epi16(short __w)
1132 {
1133   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1134 }
1135 
1136 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char __b)1137 _mm_set1_epi8(char __b)
1138 {
1139   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1140 }
1141 
1142 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0,__m64 q1)1143 _mm_setr_epi64(__m64 q0, __m64 q1)
1144 {
1145   return (__m128i){ (long long)q0, (long long)q1 };
1146 }
1147 
1148 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0,int i1,int i2,int i3)1149 _mm_setr_epi32(int i0, int i1, int i2, int i3)
1150 {
1151   return (__m128i)(__v4si){ i0, i1, i2, i3};
1152 }
1153 
1154 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0,short w1,short w2,short w3,short w4,short w5,short w6,short w7)1155 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1156 {
1157   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1158 }
1159 
1160 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0,char b1,char b2,char b3,char b4,char b5,char b6,char b7,char b8,char b9,char b10,char b11,char b12,char b13,char b14,char b15)1161 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1162 {
1163   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1164 }
1165 
1166 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)1167 _mm_setzero_si128(void)
1168 {
1169   return (__m128i){ 0LL, 0LL };
1170 }
1171 
1172 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i * __p,__m128i __b)1173 _mm_store_si128(__m128i *__p, __m128i __b)
1174 {
1175   *__p = __b;
1176 }
1177 
1178 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i * __p,__m128i __b)1179 _mm_storeu_si128(__m128i *__p, __m128i __b)
1180 {
1181   __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1182 }
1183 
1184 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)1185 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1186 {
1187   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1188 }
1189 
1190 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i * __p,__m128i __a)1191 _mm_storel_epi64(__m128i *__p, __m128i __a)
1192 {
1193   struct __mm_storel_epi64_struct {
1194     long long __u;
1195   } __attribute__((__packed__, __may_alias__));
1196   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1197 }
1198 
1199 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double * __p,__m128d __a)1200 _mm_stream_pd(double *__p, __m128d __a)
1201 {
1202   __builtin_ia32_movntpd(__p, __a);
1203 }
1204 
1205 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i * __p,__m128i __a)1206 _mm_stream_si128(__m128i *__p, __m128i __a)
1207 {
1208   __builtin_ia32_movntdq(__p, __a);
1209 }
1210 
/* Non-temporal store of the 32-bit integer __a to __p (MOVNTI). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1216 
/* Flush the cache line containing the address __p (CLFLUSH). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__line = __p;
  __builtin_ia32_clflush(__line);
}
1222 
/* Issue a load fence (LFENCE). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0;
  __builtin_ia32_lfence();
}
1228 
/* Issue a full memory fence (MFENCE). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0;
  __builtin_ia32_mfence();
}
1234 
1235 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a,__m128i __b)1236 _mm_packs_epi16(__m128i __a, __m128i __b)
1237 {
1238   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1239 }
1240 
1241 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a,__m128i __b)1242 _mm_packs_epi32(__m128i __a, __m128i __b)
1243 {
1244   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1245 }
1246 
1247 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a,__m128i __b)1248 _mm_packus_epi16(__m128i __a, __m128i __b)
1249 {
1250   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1251 }
1252 
1253 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i __a,int __imm)1254 _mm_extract_epi16(__m128i __a, int __imm)
1255 {
1256   __v8hi __b = (__v8hi)__a;
1257   return (unsigned short)__b[__imm];
1258 }
1259 
1260 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i __a,int __b,int __imm)1261 _mm_insert_epi16(__m128i __a, int __b, int __imm)
1262 {
1263   __v8hi __c = (__v8hi)__a;
1264   __c[__imm & 7] = __b;
1265   return (__m128i)__c;
1266 }
1267 
1268 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i __a)1269 _mm_movemask_epi8(__m128i __a)
1270 {
1271   return __builtin_ia32_pmovmskb128((__v16qi)__a);
1272 }
1273 
/* Permute the four 32-bit lanes of `a`: result lane k is the source lane
   selected by bits [2k+1:2k] of the constant `imm`.  The second shuffle
   operand is never selected (all indices are < 4).  The argument is cast in
   place rather than copied into a local named __a, which would shadow a
   caller variable of the same name. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
1279 
/* Permute the four low 16-bit lanes of `a` according to the constant `imm`
   (two selector bits per result lane); lanes 4..7 are copied through
   unchanged.  The argument is cast in place rather than copied into a local
   named __a, which would shadow a caller variable of the same name. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })
1286 
/* Permute the four high 16-bit lanes of `a` according to the constant `imm`
   (two selector bits per result lane, offset by 4); lanes 0..3 are copied
   through unchanged.  The argument is cast in place rather than copied into
   a local named __a, which would shadow a caller variable of the same
   name. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
1295 
1296 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i __a,__m128i __b)1297 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
1298 {
1299   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1300 }
1301 
1302 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i __a,__m128i __b)1303 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
1304 {
1305   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1306 }
1307 
1308 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i __a,__m128i __b)1309 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
1310 {
1311   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1312 }
1313 
1314 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i __a,__m128i __b)1315 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
1316 {
1317   return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1318 }
1319 
1320 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i __a,__m128i __b)1321 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
1322 {
1323   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1324 }
1325 
1326 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i __a,__m128i __b)1327 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
1328 {
1329   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1330 }
1331 
1332 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i __a,__m128i __b)1333 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
1334 {
1335   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1336 }
1337 
1338 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i __a,__m128i __b)1339 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
1340 {
1341   return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1342 }
1343 
1344 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i __a)1345 _mm_movepi64_pi64(__m128i __a)
1346 {
1347   return (__m64)__a[0];
1348 }
1349 
1350 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 __a)1351 _mm_movpi64_pi64(__m64 __a)
1352 {
1353   return (__m128i){ (long long)__a, 0 };
1354 }
1355 
1356 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i __a)1357 _mm_move_epi64(__m128i __a)
1358 {
1359   return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1360 }
1361 
1362 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d __a,__m128d __b)1363 _mm_unpackhi_pd(__m128d __a, __m128d __b)
1364 {
1365   return __builtin_shufflevector(__a, __b, 1, 2+1);
1366 }
1367 
1368 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d __a,__m128d __b)1369 _mm_unpacklo_pd(__m128d __a, __m128d __b)
1370 {
1371   return __builtin_shufflevector(__a, __b, 0, 2+0);
1372 }
1373 
1374 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d __a)1375 _mm_movemask_pd(__m128d __a)
1376 {
1377   return __builtin_ia32_movmskpd(__a);
1378 }
1379 
/* Select one double from each operand: bit 0 of the constant `i` picks the
   element of `a` that becomes the low result element, and bit 1 picks the
   element of `b` that becomes the high result element.  The arguments are
   cast in place rather than copied into locals named __a/__b, which would
   shadow caller variables of the same names. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                                   (i) & 1, (((i) & 2) >> 1) + 2); })
1384 
1385 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __a)1386 _mm_castpd_ps(__m128d __a)
1387 {
1388   return (__m128)__a;
1389 }
1390 
1391 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d __a)1392 _mm_castpd_si128(__m128d __a)
1393 {
1394   return (__m128i)__a;
1395 }
1396 
1397 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 __a)1398 _mm_castps_pd(__m128 __a)
1399 {
1400   return (__m128d)__a;
1401 }
1402 
1403 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 __a)1404 _mm_castps_si128(__m128 __a)
1405 {
1406   return (__m128i)__a;
1407 }
1408 
1409 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i __a)1410 _mm_castsi128_ps(__m128i __a)
1411 {
1412   return (__m128)__a;
1413 }
1414 
1415 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i __a)1416 _mm_castsi128_pd(__m128i __a)
1417 {
1418   return (__m128d)__a;
1419 }
1420 
/* Emit a PAUSE instruction via inline assembly. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1426 
/* Build the 2-bit selector used by _mm_shuffle_pd: x goes to bit 1 and y to
   bit 0. */
#define _MM_SHUFFLE2(x, y) \
  (((x) << 1) | (y))
1428 
1429 #endif /* __SSE2__ */
1430 
1431 #endif /* __EMMINTRIN_H */
1432