/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

24 #ifndef __EMMINTRIN_H
25 #define __EMMINTRIN_H
26
27 #ifndef __SSE2__
28 #error "SSE2 instruction set not enabled"
29 #else
30
31 #include <xmmintrin.h>
32
/* Public 16-byte vector types: a vector of doubles and a vector of
 * long longs. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines. */
/* Element-typed 16-byte vectors used as cast targets by the intrinsics
 * below (double, long long, short and char elements respectively). */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a,__m128d __b)43 _mm_add_sd(__m128d __a, __m128d __b)
44 {
45 __a[0] += __b[0];
46 return __a;
47 }
48
49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a,__m128d __b)50 _mm_add_pd(__m128d __a, __m128d __b)
51 {
52 return __a + __b;
53 }
54
55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a,__m128d __b)56 _mm_sub_sd(__m128d __a, __m128d __b)
57 {
58 __a[0] -= __b[0];
59 return __a;
60 }
61
62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a,__m128d __b)63 _mm_sub_pd(__m128d __a, __m128d __b)
64 {
65 return __a - __b;
66 }
67
68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a,__m128d __b)69 _mm_mul_sd(__m128d __a, __m128d __b)
70 {
71 __a[0] *= __b[0];
72 return __a;
73 }
74
75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a,__m128d __b)76 _mm_mul_pd(__m128d __a, __m128d __b)
77 {
78 return __a * __b;
79 }
80
81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a,__m128d __b)82 _mm_div_sd(__m128d __a, __m128d __b)
83 {
84 __a[0] /= __b[0];
85 return __a;
86 }
87
88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a,__m128d __b)89 _mm_div_pd(__m128d __a, __m128d __b)
90 {
91 return __a / __b;
92 }
93
94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a,__m128d __b)95 _mm_sqrt_sd(__m128d __a, __m128d __b)
96 {
97 __m128d __c = __builtin_ia32_sqrtsd(__b);
98 return (__m128d) { __c[0], __a[1] };
99 }
100
101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)102 _mm_sqrt_pd(__m128d __a)
103 {
104 return __builtin_ia32_sqrtpd(__a);
105 }
106
107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a,__m128d __b)108 _mm_min_sd(__m128d __a, __m128d __b)
109 {
110 return __builtin_ia32_minsd(__a, __b);
111 }
112
113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a,__m128d __b)114 _mm_min_pd(__m128d __a, __m128d __b)
115 {
116 return __builtin_ia32_minpd(__a, __b);
117 }
118
119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a,__m128d __b)120 _mm_max_sd(__m128d __a, __m128d __b)
121 {
122 return __builtin_ia32_maxsd(__a, __b);
123 }
124
125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a,__m128d __b)126 _mm_max_pd(__m128d __a, __m128d __b)
127 {
128 return __builtin_ia32_maxpd(__a, __b);
129 }
130
131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a,__m128d __b)132 _mm_and_pd(__m128d __a, __m128d __b)
133 {
134 return (__m128d)((__v4si)__a & (__v4si)__b);
135 }
136
137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a,__m128d __b)138 _mm_andnot_pd(__m128d __a, __m128d __b)
139 {
140 return (__m128d)(~(__v4si)__a & (__v4si)__b);
141 }
142
143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a,__m128d __b)144 _mm_or_pd(__m128d __a, __m128d __b)
145 {
146 return (__m128d)((__v4si)__a | (__v4si)__b);
147 }
148
149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a,__m128d __b)150 _mm_xor_pd(__m128d __a, __m128d __b)
151 {
152 return (__m128d)((__v4si)__a ^ (__v4si)__b);
153 }
154
155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a,__m128d __b)156 _mm_cmpeq_pd(__m128d __a, __m128d __b)
157 {
158 return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
159 }
160
161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a,__m128d __b)162 _mm_cmplt_pd(__m128d __a, __m128d __b)
163 {
164 return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
165 }
166
167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a,__m128d __b)168 _mm_cmple_pd(__m128d __a, __m128d __b)
169 {
170 return (__m128d)__builtin_ia32_cmplepd(__a, __b);
171 }
172
173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a,__m128d __b)174 _mm_cmpgt_pd(__m128d __a, __m128d __b)
175 {
176 return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
177 }
178
179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a,__m128d __b)180 _mm_cmpge_pd(__m128d __a, __m128d __b)
181 {
182 return (__m128d)__builtin_ia32_cmplepd(__b, __a);
183 }
184
185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a,__m128d __b)186 _mm_cmpord_pd(__m128d __a, __m128d __b)
187 {
188 return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
189 }
190
191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a,__m128d __b)192 _mm_cmpunord_pd(__m128d __a, __m128d __b)
193 {
194 return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
195 }
196
197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a,__m128d __b)198 _mm_cmpneq_pd(__m128d __a, __m128d __b)
199 {
200 return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
201 }
202
203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a,__m128d __b)204 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
205 {
206 return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
207 }
208
209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a,__m128d __b)210 _mm_cmpnle_pd(__m128d __a, __m128d __b)
211 {
212 return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
213 }
214
215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a,__m128d __b)216 _mm_cmpngt_pd(__m128d __a, __m128d __b)
217 {
218 return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
219 }
220
221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a,__m128d __b)222 _mm_cmpnge_pd(__m128d __a, __m128d __b)
223 {
224 return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
225 }
226
227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a,__m128d __b)228 _mm_cmpeq_sd(__m128d __a, __m128d __b)
229 {
230 return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
231 }
232
233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a,__m128d __b)234 _mm_cmplt_sd(__m128d __a, __m128d __b)
235 {
236 return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
237 }
238
239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a,__m128d __b)240 _mm_cmple_sd(__m128d __a, __m128d __b)
241 {
242 return (__m128d)__builtin_ia32_cmplesd(__a, __b);
243 }
244
245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a,__m128d __b)246 _mm_cmpgt_sd(__m128d __a, __m128d __b)
247 {
248 __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
249 return (__m128d) { __c[0], __a[1] };
250 }
251
252 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a,__m128d __b)253 _mm_cmpge_sd(__m128d __a, __m128d __b)
254 {
255 __m128d __c = __builtin_ia32_cmplesd(__b, __a);
256 return (__m128d) { __c[0], __a[1] };
257 }
258
259 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a,__m128d __b)260 _mm_cmpord_sd(__m128d __a, __m128d __b)
261 {
262 return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
263 }
264
265 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a,__m128d __b)266 _mm_cmpunord_sd(__m128d __a, __m128d __b)
267 {
268 return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
269 }
270
271 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a,__m128d __b)272 _mm_cmpneq_sd(__m128d __a, __m128d __b)
273 {
274 return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
275 }
276
277 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a,__m128d __b)278 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
279 {
280 return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
281 }
282
283 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a,__m128d __b)284 _mm_cmpnle_sd(__m128d __a, __m128d __b)
285 {
286 return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
287 }
288
289 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a,__m128d __b)290 _mm_cmpngt_sd(__m128d __a, __m128d __b)
291 {
292 __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
293 return (__m128d) { __c[0], __a[1] };
294 }
295
296 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a,__m128d __b)297 _mm_cmpnge_sd(__m128d __a, __m128d __b)
298 {
299 __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
300 return (__m128d) { __c[0], __a[1] };
301 }
302
303 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a,__m128d __b)304 _mm_comieq_sd(__m128d __a, __m128d __b)
305 {
306 return __builtin_ia32_comisdeq(__a, __b);
307 }
308
309 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a,__m128d __b)310 _mm_comilt_sd(__m128d __a, __m128d __b)
311 {
312 return __builtin_ia32_comisdlt(__a, __b);
313 }
314
315 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a,__m128d __b)316 _mm_comile_sd(__m128d __a, __m128d __b)
317 {
318 return __builtin_ia32_comisdle(__a, __b);
319 }
320
321 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a,__m128d __b)322 _mm_comigt_sd(__m128d __a, __m128d __b)
323 {
324 return __builtin_ia32_comisdgt(__a, __b);
325 }
326
327 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a,__m128d __b)328 _mm_comige_sd(__m128d __a, __m128d __b)
329 {
330 return __builtin_ia32_comisdge(__a, __b);
331 }
332
333 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a,__m128d __b)334 _mm_comineq_sd(__m128d __a, __m128d __b)
335 {
336 return __builtin_ia32_comisdneq(__a, __b);
337 }
338
339 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a,__m128d __b)340 _mm_ucomieq_sd(__m128d __a, __m128d __b)
341 {
342 return __builtin_ia32_ucomisdeq(__a, __b);
343 }
344
345 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a,__m128d __b)346 _mm_ucomilt_sd(__m128d __a, __m128d __b)
347 {
348 return __builtin_ia32_ucomisdlt(__a, __b);
349 }
350
351 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a,__m128d __b)352 _mm_ucomile_sd(__m128d __a, __m128d __b)
353 {
354 return __builtin_ia32_ucomisdle(__a, __b);
355 }
356
357 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a,__m128d __b)358 _mm_ucomigt_sd(__m128d __a, __m128d __b)
359 {
360 return __builtin_ia32_ucomisdgt(__a, __b);
361 }
362
363 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a,__m128d __b)364 _mm_ucomige_sd(__m128d __a, __m128d __b)
365 {
366 return __builtin_ia32_ucomisdge(__a, __b);
367 }
368
369 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a,__m128d __b)370 _mm_ucomineq_sd(__m128d __a, __m128d __b)
371 {
372 return __builtin_ia32_ucomisdneq(__a, __b);
373 }
374
375 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)376 _mm_cvtpd_ps(__m128d __a)
377 {
378 return __builtin_ia32_cvtpd2ps(__a);
379 }
380
381 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)382 _mm_cvtps_pd(__m128 __a)
383 {
384 return __builtin_ia32_cvtps2pd(__a);
385 }
386
387 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)388 _mm_cvtepi32_pd(__m128i __a)
389 {
390 return __builtin_ia32_cvtdq2pd((__v4si)__a);
391 }
392
393 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)394 _mm_cvtpd_epi32(__m128d __a)
395 {
396 return __builtin_ia32_cvtpd2dq(__a);
397 }
398
399 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)400 _mm_cvtsd_si32(__m128d __a)
401 {
402 return __builtin_ia32_cvtsd2si(__a);
403 }
404
405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a,__m128d __b)406 _mm_cvtsd_ss(__m128 __a, __m128d __b)
407 {
408 __a[0] = __b[0];
409 return __a;
410 }
411
412 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a,int __b)413 _mm_cvtsi32_sd(__m128d __a, int __b)
414 {
415 __a[0] = __b;
416 return __a;
417 }
418
419 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a,__m128 __b)420 _mm_cvtss_sd(__m128d __a, __m128 __b)
421 {
422 __a[0] = __b[0];
423 return __a;
424 }
425
426 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)427 _mm_cvttpd_epi32(__m128d __a)
428 {
429 return (__m128i)__builtin_ia32_cvttpd2dq(__a);
430 }
431
432 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)433 _mm_cvttsd_si32(__m128d __a)
434 {
435 return __a[0];
436 }
437
438 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d __a)439 _mm_cvtpd_pi32(__m128d __a)
440 {
441 return (__m64)__builtin_ia32_cvtpd2pi(__a);
442 }
443
444 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d __a)445 _mm_cvttpd_pi32(__m128d __a)
446 {
447 return (__m64)__builtin_ia32_cvttpd2pi(__a);
448 }
449
450 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 __a)451 _mm_cvtpi32_pd(__m64 __a)
452 {
453 return __builtin_ia32_cvtpi2pd((__v2si)__a);
454 }
455
456 static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)457 _mm_cvtsd_f64(__m128d __a)
458 {
459 return __a[0];
460 }
461
462 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const * __dp)463 _mm_load_pd(double const *__dp)
464 {
465 return *(__m128d*)__dp;
466 }
467
468 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const * __dp)469 _mm_load1_pd(double const *__dp)
470 {
471 struct __mm_load1_pd_struct {
472 double __u;
473 } __attribute__((__packed__, __may_alias__));
474 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
475 return (__m128d){ __u, __u };
476 }
477
/* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)

480 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const * __dp)481 _mm_loadr_pd(double const *__dp)
482 {
483 __m128d __u = *(__m128d*)__dp;
484 return __builtin_shufflevector(__u, __u, 1, 0);
485 }
486
487 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const * __dp)488 _mm_loadu_pd(double const *__dp)
489 {
490 struct __loadu_pd {
491 __m128d __v;
492 } __attribute__((__packed__, __may_alias__));
493 return ((struct __loadu_pd*)__dp)->__v;
494 }
495
496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const * __dp)497 _mm_load_sd(double const *__dp)
498 {
499 struct __mm_load_sd_struct {
500 double __u;
501 } __attribute__((__packed__, __may_alias__));
502 double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
503 return (__m128d){ __u, 0 };
504 }
505
506 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a,double const * __dp)507 _mm_loadh_pd(__m128d __a, double const *__dp)
508 {
509 struct __mm_loadh_pd_struct {
510 double __u;
511 } __attribute__((__packed__, __may_alias__));
512 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
513 return (__m128d){ __a[0], __u };
514 }
515
516 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a,double const * __dp)517 _mm_loadl_pd(__m128d __a, double const *__dp)
518 {
519 struct __mm_loadl_pd_struct {
520 double __u;
521 } __attribute__((__packed__, __may_alias__));
522 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
523 return (__m128d){ __u, __a[1] };
524 }
525
526 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)527 _mm_set_sd(double __w)
528 {
529 return (__m128d){ __w, 0 };
530 }
531
532 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)533 _mm_set1_pd(double __w)
534 {
535 return (__m128d){ __w, __w };
536 }
537
538 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __w,double __x)539 _mm_set_pd(double __w, double __x)
540 {
541 return (__m128d){ __x, __w };
542 }
543
544 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __w,double __x)545 _mm_setr_pd(double __w, double __x)
546 {
547 return (__m128d){ __w, __x };
548 }
549
550 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)551 _mm_setzero_pd(void)
552 {
553 return (__m128d){ 0, 0 };
554 }
555
556 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a,__m128d __b)557 _mm_move_sd(__m128d __a, __m128d __b)
558 {
559 return (__m128d){ __b[0], __a[1] };
560 }
561
562 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double * __dp,__m128d __a)563 _mm_store_sd(double *__dp, __m128d __a)
564 {
565 struct __mm_store_sd_struct {
566 double __u;
567 } __attribute__((__packed__, __may_alias__));
568 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
569 }
570
571 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double * __dp,__m128d __a)572 _mm_store1_pd(double *__dp, __m128d __a)
573 {
574 struct __mm_store1_pd_struct {
575 double __u[2];
576 } __attribute__((__packed__, __may_alias__));
577 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
578 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
579 }
580
581 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double * __dp,__m128d __a)582 _mm_store_pd(double *__dp, __m128d __a)
583 {
584 *(__m128d *)__dp = __a;
585 }
586
587 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double * __dp,__m128d __a)588 _mm_storeu_pd(double *__dp, __m128d __a)
589 {
590 __builtin_ia32_storeupd(__dp, __a);
591 }
592
593 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double * __dp,__m128d __a)594 _mm_storer_pd(double *__dp, __m128d __a)
595 {
596 __a = __builtin_shufflevector(__a, __a, 1, 0);
597 *(__m128d *)__dp = __a;
598 }
599
600 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double * __dp,__m128d __a)601 _mm_storeh_pd(double *__dp, __m128d __a)
602 {
603 struct __mm_storeh_pd_struct {
604 double __u;
605 } __attribute__((__packed__, __may_alias__));
606 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
607 }
608
609 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double * __dp,__m128d __a)610 _mm_storel_pd(double *__dp, __m128d __a)
611 {
612 struct __mm_storeh_pd_struct {
613 double __u;
614 } __attribute__((__packed__, __may_alias__));
615 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
616 }
617
618 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a,__m128i __b)619 _mm_add_epi8(__m128i __a, __m128i __b)
620 {
621 return (__m128i)((__v16qi)__a + (__v16qi)__b);
622 }
623
624 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a,__m128i __b)625 _mm_add_epi16(__m128i __a, __m128i __b)
626 {
627 return (__m128i)((__v8hi)__a + (__v8hi)__b);
628 }
629
630 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i __a,__m128i __b)631 _mm_add_epi32(__m128i __a, __m128i __b)
632 {
633 return (__m128i)((__v4si)__a + (__v4si)__b);
634 }
635
636 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 __a,__m64 __b)637 _mm_add_si64(__m64 __a, __m64 __b)
638 {
639 return __a + __b;
640 }
641
642 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a,__m128i __b)643 _mm_add_epi64(__m128i __a, __m128i __b)
644 {
645 return __a + __b;
646 }
647
648 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a,__m128i __b)649 _mm_adds_epi8(__m128i __a, __m128i __b)
650 {
651 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
652 }
653
654 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a,__m128i __b)655 _mm_adds_epi16(__m128i __a, __m128i __b)
656 {
657 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
658 }
659
660 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a,__m128i __b)661 _mm_adds_epu8(__m128i __a, __m128i __b)
662 {
663 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
664 }
665
666 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a,__m128i __b)667 _mm_adds_epu16(__m128i __a, __m128i __b)
668 {
669 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
670 }
671
672 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a,__m128i __b)673 _mm_avg_epu8(__m128i __a, __m128i __b)
674 {
675 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
676 }
677
678 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a,__m128i __b)679 _mm_avg_epu16(__m128i __a, __m128i __b)
680 {
681 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
682 }
683
684 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a,__m128i __b)685 _mm_madd_epi16(__m128i __a, __m128i __b)
686 {
687 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
688 }
689
690 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a,__m128i __b)691 _mm_max_epi16(__m128i __a, __m128i __b)
692 {
693 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
694 }
695
696 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a,__m128i __b)697 _mm_max_epu8(__m128i __a, __m128i __b)
698 {
699 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
700 }
701
702 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a,__m128i __b)703 _mm_min_epi16(__m128i __a, __m128i __b)
704 {
705 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
706 }
707
708 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a,__m128i __b)709 _mm_min_epu8(__m128i __a, __m128i __b)
710 {
711 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
712 }
713
714 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a,__m128i __b)715 _mm_mulhi_epi16(__m128i __a, __m128i __b)
716 {
717 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
718 }
719
720 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a,__m128i __b)721 _mm_mulhi_epu16(__m128i __a, __m128i __b)
722 {
723 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
724 }
725
726 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a,__m128i __b)727 _mm_mullo_epi16(__m128i __a, __m128i __b)
728 {
729 return (__m128i)((__v8hi)__a * (__v8hi)__b);
730 }
731
732 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 __a,__m64 __b)733 _mm_mul_su32(__m64 __a, __m64 __b)
734 {
735 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
736 }
737
738 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a,__m128i __b)739 _mm_mul_epu32(__m128i __a, __m128i __b)
740 {
741 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
742 }
743
744 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a,__m128i __b)745 _mm_sad_epu8(__m128i __a, __m128i __b)
746 {
747 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
748 }
749
750 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i __a,__m128i __b)751 _mm_sub_epi8(__m128i __a, __m128i __b)
752 {
753 return (__m128i)((__v16qi)__a - (__v16qi)__b);
754 }
755
756 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a,__m128i __b)757 _mm_sub_epi16(__m128i __a, __m128i __b)
758 {
759 return (__m128i)((__v8hi)__a - (__v8hi)__b);
760 }
761
762 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a,__m128i __b)763 _mm_sub_epi32(__m128i __a, __m128i __b)
764 {
765 return (__m128i)((__v4si)__a - (__v4si)__b);
766 }
767
768 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 __a,__m64 __b)769 _mm_sub_si64(__m64 __a, __m64 __b)
770 {
771 return __a - __b;
772 }
773
774 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a,__m128i __b)775 _mm_sub_epi64(__m128i __a, __m128i __b)
776 {
777 return __a - __b;
778 }
779
780 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a,__m128i __b)781 _mm_subs_epi8(__m128i __a, __m128i __b)
782 {
783 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
784 }
785
786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a,__m128i __b)787 _mm_subs_epi16(__m128i __a, __m128i __b)
788 {
789 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
790 }
791
792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a,__m128i __b)793 _mm_subs_epu8(__m128i __a, __m128i __b)
794 {
795 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
796 }
797
798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a,__m128i __b)799 _mm_subs_epu16(__m128i __a, __m128i __b)
800 {
801 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
802 }
803
804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a,__m128i __b)805 _mm_and_si128(__m128i __a, __m128i __b)
806 {
807 return __a & __b;
808 }
809
810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a,__m128i __b)811 _mm_andnot_si128(__m128i __a, __m128i __b)
812 {
813 return ~__a & __b;
814 }
815
816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a,__m128i __b)817 _mm_or_si128(__m128i __a, __m128i __b)
818 {
819 return __a | __b;
820 }
821
822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a,__m128i __b)823 _mm_xor_si128(__m128i __a, __m128i __b)
824 {
825 return __a ^ __b;
826 }
827
/* Shifts the 16 bytes of `a` toward higher indices by (imm & 0xF) bytes,
 * filling with zero bytes. Built as a 16-lane byte shuffle whose first
 * source (indices 0-15) is a zero vector and whose second source (indices
 * 16-31) is `a`: result byte i selects index 16 + i - (imm & 0xF). If any
 * of bits 4-7 of imm is set, every lane selects index 0 (a zero byte), so
 * the whole result is zero.
 * `imm` is expanded once per lane and feeds shuffle indices, so it should
 * be a side-effect-free constant expression. The expansion calls
 * _mm_setzero_si128(), which is not defined in the visible part of this
 * header. */
#define _mm_slli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \
                                   (__v16qi)(__m128i)(a), \
                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })

/* Alias: _mm_bslli_si128(a, imm) expands to _mm_slli_si128((a), (imm)). */
#define _mm_bslli_si128(a, imm) \
  _mm_slli_si128((a), (imm))

851 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a,int __count)852 _mm_slli_epi16(__m128i __a, int __count)
853 {
854 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
855 }
856
857 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a,__m128i __count)858 _mm_sll_epi16(__m128i __a, __m128i __count)
859 {
860 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
861 }
862
863 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a,int __count)864 _mm_slli_epi32(__m128i __a, int __count)
865 {
866 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
867 }
868
869 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a,__m128i __count)870 _mm_sll_epi32(__m128i __a, __m128i __count)
871 {
872 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
873 }
874
875 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a,int __count)876 _mm_slli_epi64(__m128i __a, int __count)
877 {
878 return __builtin_ia32_psllqi128(__a, __count);
879 }
880
881 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a,__m128i __count)882 _mm_sll_epi64(__m128i __a, __m128i __count)
883 {
884 return __builtin_ia32_psllq128(__a, __count);
885 }
886
887 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a,int __count)888 _mm_srai_epi16(__m128i __a, int __count)
889 {
890 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
891 }
892
893 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a,__m128i __count)894 _mm_sra_epi16(__m128i __a, __m128i __count)
895 {
896 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
897 }
898
899 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a,int __count)900 _mm_srai_epi32(__m128i __a, int __count)
901 {
902 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
903 }
904
905 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a,__m128i __count)906 _mm_sra_epi32(__m128i __a, __m128i __count)
907 {
908 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
909 }
910
/* Shift the 128-bit value a right by imm bytes; imm must be a compile-time
   constant because it feeds __builtin_shufflevector indices.  Built as a
   16-byte shuffle over the pair { a, zero vector }: result byte k selects
   index (imm & 0xF) + k, so indices 0..15 read bytes of a and indices 16 and
   above read zero bytes.  When any of bits 4-7 of imm is set, every index is
   16 (a zero byte) and the result is all zero bytes. */
#define _mm_srli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v16qi)(__m128i)(a), \
      (__v16qi)_mm_setzero_si128(), \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 0, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 1, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 2, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 3, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 4, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 5, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 6, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 7, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 8, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 9, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 10, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 11, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 12, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 13, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 14, \
      ((imm) & 0xF0) ? 16 : ((imm) & 0xF) + 15); })
930
/* Alternate spelling of _mm_srli_si128: expands to it with the same
   arguments. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
933
934 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a,int __count)935 _mm_srli_epi16(__m128i __a, int __count)
936 {
937 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
938 }
939
940 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a,__m128i __count)941 _mm_srl_epi16(__m128i __a, __m128i __count)
942 {
943 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
944 }
945
946 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a,int __count)947 _mm_srli_epi32(__m128i __a, int __count)
948 {
949 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
950 }
951
952 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a,__m128i __count)953 _mm_srl_epi32(__m128i __a, __m128i __count)
954 {
955 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
956 }
957
958 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a,int __count)959 _mm_srli_epi64(__m128i __a, int __count)
960 {
961 return __builtin_ia32_psrlqi128(__a, __count);
962 }
963
964 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a,__m128i __count)965 _mm_srl_epi64(__m128i __a, __m128i __count)
966 {
967 return __builtin_ia32_psrlq128(__a, __count);
968 }
969
970 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a,__m128i __b)971 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
972 {
973 return (__m128i)((__v16qi)__a == (__v16qi)__b);
974 }
975
976 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a,__m128i __b)977 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
978 {
979 return (__m128i)((__v8hi)__a == (__v8hi)__b);
980 }
981
982 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a,__m128i __b)983 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
984 {
985 return (__m128i)((__v4si)__a == (__v4si)__b);
986 }
987
988 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a,__m128i __b)989 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
990 {
991 /* This function always performs a signed comparison, but __v16qi is a char
992 which may be signed or unsigned. */
993 typedef signed char __v16qs __attribute__((__vector_size__(16)));
994 return (__m128i)((__v16qs)__a > (__v16qs)__b);
995 }
996
997 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a,__m128i __b)998 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
999 {
1000 return (__m128i)((__v8hi)__a > (__v8hi)__b);
1001 }
1002
1003 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a,__m128i __b)1004 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
1005 {
1006 return (__m128i)((__v4si)__a > (__v4si)__b);
1007 }
1008
1009 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a,__m128i __b)1010 _mm_cmplt_epi8(__m128i __a, __m128i __b)
1011 {
1012 return _mm_cmpgt_epi8(__b, __a);
1013 }
1014
1015 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a,__m128i __b)1016 _mm_cmplt_epi16(__m128i __a, __m128i __b)
1017 {
1018 return _mm_cmpgt_epi16(__b, __a);
1019 }
1020
1021 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a,__m128i __b)1022 _mm_cmplt_epi32(__m128i __a, __m128i __b)
1023 {
1024 return _mm_cmpgt_epi32(__b, __a);
1025 }
1026
1027 #ifdef __x86_64__
1028 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a,long long __b)1029 _mm_cvtsi64_sd(__m128d __a, long long __b)
1030 {
1031 __a[0] = __b;
1032 return __a;
1033 }
1034
1035 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d __a)1036 _mm_cvtsd_si64(__m128d __a)
1037 {
1038 return __builtin_ia32_cvtsd2si64(__a);
1039 }
1040
1041 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d __a)1042 _mm_cvttsd_si64(__m128d __a)
1043 {
1044 return __a[0];
1045 }
1046 #endif
1047
1048 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i __a)1049 _mm_cvtepi32_ps(__m128i __a)
1050 {
1051 return __builtin_ia32_cvtdq2ps((__v4si)__a);
1052 }
1053
1054 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 __a)1055 _mm_cvtps_epi32(__m128 __a)
1056 {
1057 return (__m128i)__builtin_ia32_cvtps2dq(__a);
1058 }
1059
1060 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 __a)1061 _mm_cvttps_epi32(__m128 __a)
1062 {
1063 return (__m128i)__builtin_ia32_cvttps2dq(__a);
1064 }
1065
1066 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int __a)1067 _mm_cvtsi32_si128(int __a)
1068 {
1069 return (__m128i)(__v4si){ __a, 0, 0, 0 };
1070 }
1071
1072 #ifdef __x86_64__
1073 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long __a)1074 _mm_cvtsi64_si128(long long __a)
1075 {
1076 return (__m128i){ __a, 0 };
1077 }
1078 #endif
1079
1080 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i __a)1081 _mm_cvtsi128_si32(__m128i __a)
1082 {
1083 __v4si __b = (__v4si)__a;
1084 return __b[0];
1085 }
1086
1087 #ifdef __x86_64__
1088 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i __a)1089 _mm_cvtsi128_si64(__m128i __a)
1090 {
1091 return __a[0];
1092 }
1093 #endif
1094
1095 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const * __p)1096 _mm_load_si128(__m128i const *__p)
1097 {
1098 return *__p;
1099 }
1100
1101 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const * __p)1102 _mm_loadu_si128(__m128i const *__p)
1103 {
1104 struct __loadu_si128 {
1105 __m128i __v;
1106 } __attribute__((__packed__, __may_alias__));
1107 return ((struct __loadu_si128*)__p)->__v;
1108 }
1109
1110 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const * __p)1111 _mm_loadl_epi64(__m128i const *__p)
1112 {
1113 struct __mm_loadl_epi64_struct {
1114 long long __u;
1115 } __attribute__((__packed__, __may_alias__));
1116 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1117 }
1118
1119 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1,long long q0)1120 _mm_set_epi64x(long long q1, long long q0)
1121 {
1122 return (__m128i){ q0, q1 };
1123 }
1124
1125 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1,__m64 q0)1126 _mm_set_epi64(__m64 q1, __m64 q0)
1127 {
1128 return (__m128i){ (long long)q0, (long long)q1 };
1129 }
1130
1131 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3,int i2,int i1,int i0)1132 _mm_set_epi32(int i3, int i2, int i1, int i0)
1133 {
1134 return (__m128i)(__v4si){ i0, i1, i2, i3};
1135 }
1136
1137 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7,short w6,short w5,short w4,short w3,short w2,short w1,short w0)1138 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1139 {
1140 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1141 }
1142
1143 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15,char b14,char b13,char b12,char b11,char b10,char b9,char b8,char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0)1144 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1145 {
1146 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1147 }
1148
1149 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long __q)1150 _mm_set1_epi64x(long long __q)
1151 {
1152 return (__m128i){ __q, __q };
1153 }
1154
1155 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 __q)1156 _mm_set1_epi64(__m64 __q)
1157 {
1158 return (__m128i){ (long long)__q, (long long)__q };
1159 }
1160
1161 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int __i)1162 _mm_set1_epi32(int __i)
1163 {
1164 return (__m128i)(__v4si){ __i, __i, __i, __i };
1165 }
1166
1167 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short __w)1168 _mm_set1_epi16(short __w)
1169 {
1170 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1171 }
1172
1173 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char __b)1174 _mm_set1_epi8(char __b)
1175 {
1176 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1177 }
1178
1179 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0,__m64 q1)1180 _mm_setr_epi64(__m64 q0, __m64 q1)
1181 {
1182 return (__m128i){ (long long)q0, (long long)q1 };
1183 }
1184
1185 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0,int i1,int i2,int i3)1186 _mm_setr_epi32(int i0, int i1, int i2, int i3)
1187 {
1188 return (__m128i)(__v4si){ i0, i1, i2, i3};
1189 }
1190
1191 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0,short w1,short w2,short w3,short w4,short w5,short w6,short w7)1192 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1193 {
1194 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1195 }
1196
1197 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0,char b1,char b2,char b3,char b4,char b5,char b6,char b7,char b8,char b9,char b10,char b11,char b12,char b13,char b14,char b15)1198 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1199 {
1200 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1201 }
1202
1203 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)1204 _mm_setzero_si128(void)
1205 {
1206 return (__m128i){ 0LL, 0LL };
1207 }
1208
1209 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i * __p,__m128i __b)1210 _mm_store_si128(__m128i *__p, __m128i __b)
1211 {
1212 *__p = __b;
1213 }
1214
1215 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i * __p,__m128i __b)1216 _mm_storeu_si128(__m128i *__p, __m128i __b)
1217 {
1218 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1219 }
1220
1221 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)1222 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1223 {
1224 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
1225 }
1226
1227 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i * __p,__m128i __a)1228 _mm_storel_epi64(__m128i *__p, __m128i __a)
1229 {
1230 struct __mm_storel_epi64_struct {
1231 long long __u;
1232 } __attribute__((__packed__, __may_alias__));
1233 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1234 }
1235
1236 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double * __p,__m128d __a)1237 _mm_stream_pd(double *__p, __m128d __a)
1238 {
1239 __builtin_ia32_movntpd(__p, __a);
1240 }
1241
1242 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i * __p,__m128i __a)1243 _mm_stream_si128(__m128i *__p, __m128i __a)
1244 {
1245 __builtin_ia32_movntdq(__p, __a);
1246 }
1247
/* Store the int __a to *__p through the movnti builtin (presumably a
   non-temporal store, as the "stream" name suggests). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
1253
1254 #ifdef __x86_64__
/* Store the long long __a to *__p through the movnti64 builtin (presumably
   a non-temporal store, as the "stream" name suggests). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
1260 #endif
1261
/* Pass the address __p to the clflush builtin (presumably flushing the
   cache line that contains it).  Nothing is read or written through __p by
   this wrapper itself. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  void const *__line = __p;
  __builtin_ia32_clflush(__line);
}
1267
/* Invoke the lfence builtin; takes no arguments and returns nothing
   (presumably a load fence). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}
1273
/* Invoke the mfence builtin; takes no arguments and returns nothing
   (presumably a full memory fence). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
1279
1280 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a,__m128i __b)1281 _mm_packs_epi16(__m128i __a, __m128i __b)
1282 {
1283 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1284 }
1285
1286 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a,__m128i __b)1287 _mm_packs_epi32(__m128i __a, __m128i __b)
1288 {
1289 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1290 }
1291
1292 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a,__m128i __b)1293 _mm_packus_epi16(__m128i __a, __m128i __b)
1294 {
1295 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1296 }
1297
1298 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i __a,int __imm)1299 _mm_extract_epi16(__m128i __a, int __imm)
1300 {
1301 __v8hi __b = (__v8hi)__a;
1302 return (unsigned short)__b[__imm & 7];
1303 }
1304
1305 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i __a,int __b,int __imm)1306 _mm_insert_epi16(__m128i __a, int __b, int __imm)
1307 {
1308 __v8hi __c = (__v8hi)__a;
1309 __c[__imm & 7] = __b;
1310 return (__m128i)__c;
1311 }
1312
1313 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i __a)1314 _mm_movemask_epi8(__m128i __a)
1315 {
1316 return __builtin_ia32_pmovmskb128((__v16qi)__a);
1317 }
1318
/* Permute the four __v4si elements of a according to the compile-time
   constant imm: result element k is source element number
   (imm >> (2 * k)) & 3.  The second shuffle operand is an all-zero vector;
   every index is at most 3, so it is never selected. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v4si)(__m128i)(a), \
      (__v4si)_mm_set1_epi32(0), \
      (imm) & 0x3, \
      ((imm) & 0xc) >> 2, \
      ((imm) & 0x30) >> 4, \
      ((imm) & 0xc0) >> 6); })
1324
/* Permute the low four __v8hi elements of a according to the compile-time
   constant imm: result element k (k = 0..3) is source element number
   (imm >> (2 * k)) & 3.  Elements 4..7 are copied through unchanged.  The
   second shuffle operand is an all-zero vector that no index selects. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), \
      (__v8hi)_mm_set1_epi16(0), \
      (imm) & 0x3, \
      ((imm) & 0xc) >> 2, \
      ((imm) & 0x30) >> 4, \
      ((imm) & 0xc0) >> 6, \
      4, 5, 6, 7); })
1331
/* Permute the high four __v8hi elements of a according to the compile-time
   constant imm: result element 4 + k (k = 0..3) is source element number
   4 + ((imm >> (2 * k)) & 3).  Elements 0..3 are copied through unchanged.
   The second shuffle operand is an all-zero vector that no index selects. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), \
      (__v8hi)_mm_set1_epi16(0), \
      0, 1, 2, 3, \
      4 + (((imm) & 0x03) >> 0), \
      4 + (((imm) & 0x0c) >> 2), \
      4 + (((imm) & 0x30) >> 4), \
      4 + (((imm) & 0xc0) >> 6)); })
1340
1341 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i __a,__m128i __b)1342 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
1343 {
1344 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1345 }
1346
1347 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i __a,__m128i __b)1348 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
1349 {
1350 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1351 }
1352
1353 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i __a,__m128i __b)1354 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
1355 {
1356 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
1357 }
1358
1359 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i __a,__m128i __b)1360 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
1361 {
1362 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
1363 }
1364
1365 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i __a,__m128i __b)1366 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
1367 {
1368 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1369 }
1370
1371 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i __a,__m128i __b)1372 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
1373 {
1374 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1375 }
1376
1377 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i __a,__m128i __b)1378 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
1379 {
1380 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
1381 }
1382
1383 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i __a,__m128i __b)1384 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
1385 {
1386 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
1387 }
1388
1389 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i __a)1390 _mm_movepi64_pi64(__m128i __a)
1391 {
1392 return (__m64)__a[0];
1393 }
1394
1395 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_epi64(__m64 __a)1396 _mm_movpi64_epi64(__m64 __a)
1397 {
1398 return (__m128i){ (long long)__a, 0 };
1399 }
1400
1401 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i __a)1402 _mm_move_epi64(__m128i __a)
1403 {
1404 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
1405 }
1406
1407 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d __a,__m128d __b)1408 _mm_unpackhi_pd(__m128d __a, __m128d __b)
1409 {
1410 return __builtin_shufflevector(__a, __b, 1, 2+1);
1411 }
1412
1413 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d __a,__m128d __b)1414 _mm_unpacklo_pd(__m128d __a, __m128d __b)
1415 {
1416 return __builtin_shufflevector(__a, __b, 0, 2+0);
1417 }
1418
1419 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d __a)1420 _mm_movemask_pd(__m128d __a)
1421 {
1422 return __builtin_ia32_movmskpd(__a);
1423 }
1424
/* Select one double from each operand according to the compile-time
   constant i: result element 0 is a[i & 1] and result element 1 is
   b[(i >> 1) & 1] (shuffle indices 2 and 3 address b). */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector( \
      (__m128d)(a), \
      (__m128d)(b), \
      (i) & 1, \
      (((i) & 2) >> 1) + 2); })
1428
1429 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __a)1430 _mm_castpd_ps(__m128d __a)
1431 {
1432 return (__m128)__a;
1433 }
1434
1435 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d __a)1436 _mm_castpd_si128(__m128d __a)
1437 {
1438 return (__m128i)__a;
1439 }
1440
1441 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 __a)1442 _mm_castps_pd(__m128 __a)
1443 {
1444 return (__m128d)__a;
1445 }
1446
1447 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 __a)1448 _mm_castps_si128(__m128 __a)
1449 {
1450 return (__m128i)__a;
1451 }
1452
1453 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i __a)1454 _mm_castsi128_ps(__m128i __a)
1455 {
1456 return (__m128)__a;
1457 }
1458
1459 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i __a)1460 _mm_castsi128_pd(__m128i __a)
1461 {
1462 return (__m128d)__a;
1463 }
1464
/* Emit a single "pause" instruction through inline assembly.  The asm is
   marked volatile so the compiler does not drop it even though it has no
   outputs. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1470
/* Pack two selectors into one immediate: x is shifted into bit 1 and OR-ed
   with y in bit 0 (presumably for use as the selector of _mm_shuffle_pd). */
#define _MM_SHUFFLE2(x, y) \
  (((x) << 1) | (y))
1472
1473 #endif /* __SSE2__ */
1474
1475 #endif /* __EMMINTRIN_H */
1476