1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24 #ifndef __EMMINTRIN_H
25 #define __EMMINTRIN_H
26
27 #ifndef __SSE2__
28 #error "SSE2 instruction set not enabled"
29 #else
30
31 #include <xmmintrin.h>
32
/* Public SSE2 vector types: two doubles, and one 128-bit integer lane. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal element-typed views of the 128-bit register, used to select the
   element width of an operation (2 x double, 2 x i64, 8 x i16, 16 x i8). */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
41
/* Scalar add: result[0] = __a[0] + __b[0]; result[1] is copied from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d __a, __m128d __b)
{
  __a[0] += __b[0];
  return __a;
}

/* Packed add of two vectors of two doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d __a, __m128d __b)
{
  return __a + __b;
}

/* Scalar subtract: result[0] = __a[0] - __b[0]; result[1] comes from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d __a, __m128d __b)
{
  __a[0] -= __b[0];
  return __a;
}

/* Packed subtract of two vectors of two doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d __a, __m128d __b)
{
  return __a - __b;
}

/* Scalar multiply: result[0] = __a[0] * __b[0]; result[1] comes from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d __a, __m128d __b)
{
  __a[0] *= __b[0];
  return __a;
}

/* Packed multiply of two vectors of two doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d __a, __m128d __b)
{
  return __a * __b;
}

/* Scalar divide: result[0] = __a[0] / __b[0]; result[1] comes from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d __a, __m128d __b)
{
  __a[0] /= __b[0];
  return __a;
}

/* Packed divide of two vectors of two doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d __a, __m128d __b)
{
  return __a / __b;
}

/* Scalar square root: result[0] = sqrt(__b[0]); result[1] is taken from
   __a explicitly, since the builtin's upper element would come from __b. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_sqrtsd(__b);
  return (__m128d) { __c[0], __a[1] };
}

/* Packed square root of both doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d __a)
{
  return __builtin_ia32_sqrtpd(__a);
}
106
/* Scalar minimum (MINSD): result[0] = min of the low doubles; result[1] is
   passed through from __a.  If an operand is NaN, the instruction returns
   the second source operand. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_minsd(__a, __b);
}

/* Packed element-wise minimum of two vectors of two doubles (MINPD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_minpd(__a, __b);
}

/* Scalar maximum (MAXSD): result[0] = max of the low doubles; result[1] is
   passed through from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_maxsd(__a, __b);
}

/* Packed element-wise maximum of two vectors of two doubles (MAXPD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_maxpd(__a, __b);
}
130
/* Bitwise AND of the 128-bit values.  The casts to an integer vector type
   are required because C does not allow bitwise operators on floating-point
   vectors; the bit pattern is unchanged by the casts. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a & (__v4si)__b);
}

/* Bitwise AND-NOT: (~__a) & __b, matching the ANDNPD operand order. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d __a, __m128d __b)
{
  return (__m128d)(~(__v4si)__a & (__v4si)__b);
}

/* Bitwise OR of the 128-bit values. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a | (__v4si)__b);
}

/* Bitwise XOR of the 128-bit values. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a ^ (__v4si)__b);
}
154
/* Packed double compares (CMPPD).  The third builtin argument is the
   predicate immediate: 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE,
   7=ORD.  Each result element is all-ones when the predicate holds for
   that lane and all-zeros otherwise.  GT/GE/NGT/NGE have no predicate of
   their own and are formed by swapping the operands of LT/LE/NLT/NLE. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 2);
}

/* a > b  ==  b < a (operands swapped). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 1);
}

/* a >= b  ==  b <= a (operands swapped). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 2);
}

/* True where neither element of the pair is NaN. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 7);
}

/* True where at least one element of the pair is NaN. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__a, __b, 6);
}

/* not(a > b)  ==  not(b < a) (operands swapped). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 5);
}

/* not(a >= b)  ==  not(b <= a) (operands swapped). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmppd(__b, __a, 6);
}
226
/* Scalar double compares (CMPSD, same predicate immediates as CMPPD).
   The low element of the result is the all-ones/all-zeros compare mask;
   the high element is passed through from the first operand, __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 2);
}
244
245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d __a,__m128d __b)246 _mm_cmpgt_sd(__m128d __a, __m128d __b)
247 {
248 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 1);
249 }
250
251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d __a,__m128d __b)252 _mm_cmpge_sd(__m128d __a, __m128d __b)
253 {
254 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 2);
255 }
256
/* True in the low element when neither low double is NaN; high element
   passed through from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 7);
}

/* True in the low element when at least one low double is NaN. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpsd(__a, __b, 6);
}
286
287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d __a,__m128d __b)288 _mm_cmpngt_sd(__m128d __a, __m128d __b)
289 {
290 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 5);
291 }
292
293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d __a,__m128d __b)294 _mm_cmpnge_sd(__m128d __a, __m128d __b)
295 {
296 return (__m128d)__builtin_ia32_cmpsd(__b, __a, 6);
297 }
298
/* Ordered scalar compares (COMISD): compare the low doubles and return
   0 or 1.  Unlike the ucomi* variants, COMISD signals invalid for quiet
   NaN operands as well as signaling NaNs. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdeq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdlt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdle(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdgt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdge(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdneq(__a, __b);
}
334
/* Unordered scalar compares (UCOMISD): compare the low doubles and return
   0 or 1; signals invalid only for signaling-NaN operands. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdeq(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdlt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdle(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdgt(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdge(__a, __b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdneq(__a, __b);
}
370
/* Converts both doubles to floats in the low half of the result
   (CVTPD2PS); the upper two floats are zeroed by the instruction. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d __a)
{
  return __builtin_ia32_cvtpd2ps(__a);
}

/* Converts the two low floats of __a to doubles (CVTPS2PD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 __a)
{
  return __builtin_ia32_cvtps2pd(__a);
}

/* Converts the two low 32-bit ints of __a to doubles (CVTDQ2PD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i __a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)__a);
}

/* Converts both doubles to 32-bit ints using the current rounding mode
   (CVTPD2DQ); the upper 64 bits of the result are zeroed. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d __a)
{
  return __builtin_ia32_cvtpd2dq(__a);
}

/* Converts the low double to a 32-bit int using the current rounding
   mode (CVTSD2SI). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d __a)
{
  return __builtin_ia32_cvtsd2si(__a);
}

/* Converts the low double of __b to a float placed in the low element of
   __a; the other three floats of __a are preserved. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
  __a[0] = __b[0];
  return __a;
}

/* Converts the int __b to a double placed in the low element of __a; the
   high double of __a is preserved. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d __a, int __b)
{
  __a[0] = __b;
  return __a;
}

/* Converts the low float of __b to a double placed in the low element of
   __a; the high double of __a is preserved. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
  __a[0] = __b[0];
  return __a;
}

/* Converts both doubles to 32-bit ints with truncation (CVTTPD2DQ); the
   upper 64 bits of the result are zeroed. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d __a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
}
427
428 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d __a)429 _mm_cvttsd_si32(__m128d __a)
430 {
431 return __a[0];
432 }
433
/* Converts both doubles to 32-bit ints in an MMX register using the
   current rounding mode (CVTPD2PI). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d __a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(__a);
}

/* Converts both doubles to 32-bit ints in an MMX register with
   truncation (CVTTPD2PI). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d __a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(__a);
}

/* Converts the two 32-bit ints of an MMX value to doubles (CVTPI2PD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 __a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
}

/* Extracts the low double of __a as a scalar. */
static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d __a)
{
  return __a[0];
}
457
/* Loads two doubles from a 16-byte-aligned address. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *__dp)
{
  return *(__m128d*)__dp;
}

/* Loads one double (no alignment requirement — the packed struct forces
   an unaligned access) and broadcasts it to both elements. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *__dp)
{
  struct __mm_load1_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __u };
}

/* Classic alias for _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)

/* Loads two doubles from an aligned address in reversed element order. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *__dp)
{
  __m128d __u = *(__m128d*)__dp;
  return __builtin_shufflevector(__u, __u, 1, 0);
}

/* Loads two doubles from an unaligned address; the packed struct tells
   the compiler no alignment may be assumed. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *__dp)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)__dp)->__v;
}

/* Loads one double into the low element; the high element is zeroed. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *__dp)
{
  struct __mm_load_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
  return (__m128d){ __u, 0 };
}

/* Loads one double into the high element; the low element comes from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
  return (__m128d){ __a[0], __u };
}

/* Loads one double into the low element; the high element comes from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __a[1] };
}
521
/* Creates { __w, 0 }: the scalar in the low element, high element zero. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double __w)
{
  return (__m128d){ __w, 0 };
}

/* Broadcasts __w to both elements. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double __w)
{
  return (__m128d){ __w, __w };
}

/* Creates { __x, __w }: per Intel convention the FIRST argument is the
   HIGH element. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double __w, double __x)
{
  return (__m128d){ __x, __w };
}

/* Reversed-order set: arguments are given low element first. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double __w, double __x)
{
  return (__m128d){ __w, __x };
}

/* Returns a vector of two zero doubles. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

/* Returns { __b[0], __a[1] }: low element from __b, high from __a. */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d __a, __m128d __b)
{
  return (__m128d){ __b[0], __a[1] };
}
557
/* Stores the low double to __dp; the packed struct makes the access
   alignment-safe. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *__dp, __m128d __a)
{
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}

/* Stores the low double to both __dp[0] and __dp[1]. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *__dp, __m128d __a)
{
  struct __mm_store1_pd_struct {
    double __u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
}

/* Stores both doubles to a 16-byte-aligned address. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *__dp, __m128d __a)
{
  *(__m128d *)__dp = __a;
}

/* Stores both doubles to an unaligned address (MOVUPD). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *__dp, __m128d __a)
{
  __builtin_ia32_storeupd(__dp, __a);
}

/* Stores both doubles to an aligned address in reversed element order. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *__dp, __m128d __a)
{
  __a = __builtin_shufflevector(__a, __a, 1, 0);
  *(__m128d *)__dp = __a;
}

/* Stores the high double to __dp (alignment-safe via packed struct). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}
604
605 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double * __dp,__m128d __a)606 _mm_storel_pd(double *__dp, __m128d __a)
607 {
608 struct __mm_storeh_pd_struct {
609 double __u;
610 } __attribute__((__packed__, __may_alias__));
611 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
612 }
613
/* Packed wrap-around addition of sixteen 8-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a + (__v16qi)__b);
}

/* Packed wrap-around addition of eight 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a + (__v8hi)__b);
}

/* Packed wrap-around addition of four 32-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a + (__v4si)__b);
}

/* 64-bit addition of two MMX values. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 __a, __m64 __b)
{
  return __a + __b;
}

/* Packed wrap-around addition of two 64-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i __a, __m128i __b)
{
  return __a + __b;
}

/* Saturating addition of signed 8-bit lanes (clamped to [-128, 127]). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
}

/* Saturating addition of signed 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
}

/* Saturating addition of unsigned 8-bit lanes (clamped to [0, 255]). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
}

/* Saturating addition of unsigned 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
}
667
/* Rounded average of unsigned 8-bit lanes: (a + b + 1) >> 1 (PAVGB). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}

/* Rounded average of unsigned 16-bit lanes (PAVGW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}

/* Multiply-add (PMADDWD): multiplies corresponding signed 16-bit lanes
   and sums adjacent 32-bit products into four 32-bit results. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}

/* Element-wise maximum of signed 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
}

/* Element-wise maximum of unsigned 8-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
}

/* Element-wise minimum of signed 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
}

/* Element-wise minimum of unsigned 8-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
}
708 }
709
/* High half of the signed 16x16->32 product for each lane (PMULHW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
}

/* High half of the unsigned 16x16->32 product for each lane (PMULHUW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
}

/* Low half of the 16x16->32 product (same bits for signed/unsigned). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a * (__v8hi)__b);
}

/* Unsigned 32x32->64 multiply of the low dwords of two MMX values. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 __a, __m64 __b)
{
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
}

/* Unsigned 32x32->64 multiply of the even-indexed dwords (PMULUDQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i __a, __m128i __b)
{
  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
}

/* Sum of absolute byte differences (PSADBW): two 16-bit sums, one per
   64-bit half, zero-extended into the two 64-bit result lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i __a, __m128i __b)
{
  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
}
745
/* Packed wrap-around subtraction of sixteen 8-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a - (__v16qi)__b);
}

/* Packed wrap-around subtraction of eight 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a - (__v8hi)__b);
}

/* Packed wrap-around subtraction of four 32-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a - (__v4si)__b);
}

/* 64-bit subtraction of two MMX values. */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 __a, __m64 __b)
{
  return __a - __b;
}

/* Packed wrap-around subtraction of two 64-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i __a, __m128i __b)
{
  return __a - __b;
}

/* Saturating subtraction of signed 8-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
}

/* Saturating subtraction of signed 16-bit lanes. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
}

/* Saturating subtraction of unsigned 8-bit lanes (clamped at 0). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
}

/* Saturating subtraction of unsigned 16-bit lanes (clamped at 0). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
}
799
/* Bitwise AND of two 128-bit integer values. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i __a, __m128i __b)
{
  return __a & __b;
}

/* Bitwise AND-NOT: (~__a) & __b, matching the PANDN operand order. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i __a, __m128i __b)
{
  return ~__a & __b;
}

/* Bitwise OR of two 128-bit integer values. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i __a, __m128i __b)
{
  return __a | __b;
}

/* Bitwise XOR of two 128-bit integer values. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i __a, __m128i __b)
{
  return __a ^ __b;
}
823
/* Shifts the whole 128-bit value left by `count` BYTES, shifting in zeros
   (PSLLDQ).  `count` must be an immediate; the builtin takes a bit count,
   hence the *8. */
#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
827
/* Left shifts of each lane by an immediate count (psll*i) or by the count
   held in the low 64 bits of a vector (psll*); bits shifted in are zero. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i __a, int __count)
{
  return __builtin_ia32_psllqi128(__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i __a, __m128i __count)
{
  return __builtin_ia32_psllq128(__a, __count);
}

/* Arithmetic right shifts (psra*): bits shifted in replicate the sign bit. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
}
887
888
/* Shift the whole 128-bit value in (a) right by (count) bytes, shifting in
   zeros (PSRLDQ). count must be an immediate; the builtin takes the shift
   in bits, hence the *8. */
#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })
892
/* Logical (zero-filling) right shift of each 16-bit element by __count bits
   (PSRLW, immediate count). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
}

/* Logical right shift of each 16-bit element by the count in the low
   64 bits of __count (PSRLW, register count). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
}

/* Logical right shift of each 32-bit element by __count bits
   (PSRLD, immediate count). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
}

/* Logical right shift of each 32-bit element by the count in the low
   64 bits of __count (PSRLD, register count). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
}

/* Logical right shift of each 64-bit element by __count bits
   (PSRLQ, immediate count). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i __a, int __count)
{
  return __builtin_ia32_psrlqi128(__a, __count);
}

/* Logical right shift of each 64-bit element by the count in the low
   64 bits of __count (PSRLQ, register count). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i __a, __m128i __count)
{
  return __builtin_ia32_psrlq128(__a, __count);
}
928
929 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i __a,__m128i __b)930 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
931 {
932 return (__m128i)((__v16qi)__a == (__v16qi)__b);
933 }
934
935 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i __a,__m128i __b)936 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
937 {
938 return (__m128i)((__v8hi)__a == (__v8hi)__b);
939 }
940
941 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i __a,__m128i __b)942 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
943 {
944 return (__m128i)((__v4si)__a == (__v4si)__b);
945 }
946
947 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i __a,__m128i __b)948 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
949 {
950 /* This function always performs a signed comparison, but __v16qi is a char
951 which may be signed or unsigned. */
952 typedef signed char __v16qs __attribute__((__vector_size__(16)));
953 return (__m128i)((__v16qs)__a > (__v16qs)__b);
954 }
955
956 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i __a,__m128i __b)957 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
958 {
959 return (__m128i)((__v8hi)__a > (__v8hi)__b);
960 }
961
962 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i __a,__m128i __b)963 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
964 {
965 return (__m128i)((__v4si)__a > (__v4si)__b);
966 }
967
968 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i __a,__m128i __b)969 _mm_cmplt_epi8(__m128i __a, __m128i __b)
970 {
971 return _mm_cmpgt_epi8(__b, __a);
972 }
973
974 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i __a,__m128i __b)975 _mm_cmplt_epi16(__m128i __a, __m128i __b)
976 {
977 return _mm_cmpgt_epi16(__b, __a);
978 }
979
980 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i __a,__m128i __b)981 _mm_cmplt_epi32(__m128i __a, __m128i __b)
982 {
983 return _mm_cmpgt_epi32(__b, __a);
984 }
985
#ifdef __x86_64__
/* Replace the low double of __a with __b converted to double; the high
   double is preserved (CVTSI2SD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

/* Convert the low double of __a to a 64-bit integer using the current
   rounding mode (CVTSD2SI). */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d __a)
{
  return __builtin_ia32_cvtsd2si64(__a);
}

/* Convert the low double of __a to a 64-bit integer with truncation toward
   zero — the C double-to-integer conversion gives exactly CVTTSD2SI
   semantics for in-range values. */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d __a)
{
  return __a[0];
}
#endif
1006
/* Convert four packed 32-bit integers to four floats (CVTDQ2PS). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i __a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)__a);
}

/* Convert four floats to four 32-bit integers using the current rounding
   mode (CVTPS2DQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 __a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(__a);
}

/* Convert four floats to four 32-bit integers, truncating toward zero
   (CVTTPS2DQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 __a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(__a);
}
1024
1025 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int __a)1026 _mm_cvtsi32_si128(int __a)
1027 {
1028 return (__m128i)(__v4si){ __a, 0, 0, 0 };
1029 }
1030
#ifdef __x86_64__
/* Zero-extend the 64-bit integer __a into the low qword of a 128-bit
   vector; the high qword is zero (MOVQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long __a)
{
  return (__m128i){ __a, 0 };
}
#endif
1038
1039 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i __a)1040 _mm_cvtsi128_si32(__m128i __a)
1041 {
1042 __v4si __b = (__v4si)__a;
1043 return __b[0];
1044 }
1045
#ifdef __x86_64__
/* Return the low 64-bit element of __a (MOVQ). */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i __a)
{
  return __a[0];
}
#endif
1053
1054 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const * __p)1055 _mm_load_si128(__m128i const *__p)
1056 {
1057 return *__p;
1058 }
1059
/* Load 128 bits from __p with no alignment requirement (MOVDQU). The
   packed, may_alias wrapper struct tells the compiler the access is
   unaligned and may alias any type. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *__p)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si128*)__p)->__v;
}

/* Load 64 bits from __p into the low qword of the result; the high qword
   is zero (MOVQ). The packed/may_alias struct permits an unaligned,
   type-punning read. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *__p)
{
  struct __mm_loadl_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
}
1077
1078 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1,long long q0)1079 _mm_set_epi64x(long long q1, long long q0)
1080 {
1081 return (__m128i){ q0, q1 };
1082 }
1083
1084 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1,__m64 q0)1085 _mm_set_epi64(__m64 q1, __m64 q0)
1086 {
1087 return (__m128i){ (long long)q0, (long long)q1 };
1088 }
1089
1090 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3,int i2,int i1,int i0)1091 _mm_set_epi32(int i3, int i2, int i1, int i0)
1092 {
1093 return (__m128i)(__v4si){ i0, i1, i2, i3};
1094 }
1095
1096 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7,short w6,short w5,short w4,short w3,short w2,short w1,short w0)1097 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1098 {
1099 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1100 }
1101
1102 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15,char b14,char b13,char b12,char b11,char b10,char b9,char b8,char b7,char b6,char b5,char b4,char b3,char b2,char b1,char b0)1103 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1104 {
1105 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1106 }
1107
1108 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long __q)1109 _mm_set1_epi64x(long long __q)
1110 {
1111 return (__m128i){ __q, __q };
1112 }
1113
1114 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 __q)1115 _mm_set1_epi64(__m64 __q)
1116 {
1117 return (__m128i){ (long long)__q, (long long)__q };
1118 }
1119
1120 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int __i)1121 _mm_set1_epi32(int __i)
1122 {
1123 return (__m128i)(__v4si){ __i, __i, __i, __i };
1124 }
1125
1126 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short __w)1127 _mm_set1_epi16(short __w)
1128 {
1129 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1130 }
1131
1132 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char __b)1133 _mm_set1_epi8(char __b)
1134 {
1135 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1136 }
1137
1138 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0,__m64 q1)1139 _mm_setr_epi64(__m64 q0, __m64 q1)
1140 {
1141 return (__m128i){ (long long)q0, (long long)q1 };
1142 }
1143
1144 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0,int i1,int i2,int i3)1145 _mm_setr_epi32(int i0, int i1, int i2, int i3)
1146 {
1147 return (__m128i)(__v4si){ i0, i1, i2, i3};
1148 }
1149
1150 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0,short w1,short w2,short w3,short w4,short w5,short w6,short w7)1151 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1152 {
1153 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1154 }
1155
1156 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0,char b1,char b2,char b3,char b4,char b5,char b6,char b7,char b8,char b9,char b10,char b11,char b12,char b13,char b14,char b15)1157 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1158 {
1159 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1160 }
1161
1162 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)1163 _mm_setzero_si128(void)
1164 {
1165 return (__m128i){ 0LL, 0LL };
1166 }
1167
1168 static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i * __p,__m128i __b)1169 _mm_store_si128(__m128i *__p, __m128i __b)
1170 {
1171 *__p = __b;
1172 }
1173
/* Store __b to __p with no alignment requirement (MOVDQU). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
}

/* Conditionally store bytes of __d to __p: a byte is written only where
   the corresponding byte of mask __n has its high bit set (MASKMOVDQU). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
}

/* Store the low 64 bits of __a to __p (MOVQ). The packed/may_alias struct
   permits an unaligned, type-punning write. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  struct __mm_storel_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}
1194
/* Non-temporal store of two doubles to 16-byte-aligned __p, bypassing the
   cache (MOVNTPD). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *__p, __m128d __a)
{
  __builtin_ia32_movntpd(__p, __a);
}

/* Non-temporal 128-bit integer store to 16-byte-aligned __p (MOVNTDQ). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  __builtin_ia32_movntdq(__p, __a);
}

/* Non-temporal 32-bit store to __p (MOVNTI). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}

/* Flush the cache line containing __p from every cache level (CLFLUSH). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *__p)
{
  __builtin_ia32_clflush(__p);
}

/* Load fence: order all prior loads before any later loads (LFENCE). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

/* Full memory fence: order all prior loads and stores before any later
   memory operations (MFENCE). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
1230
/* Pack the 16-bit elements of __a (low half of result) and __b (high half)
   into 8-bit elements with signed saturation (PACKSSWB). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}

/* Pack the 32-bit elements of __a and __b into 16-bit elements with signed
   saturation (PACKSSDW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}

/* Pack the 16-bit elements of __a and __b into 8-bit elements with
   unsigned saturation (PACKUSWB). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
1248
1249 static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i __a,int __imm)1250 _mm_extract_epi16(__m128i __a, int __imm)
1251 {
1252 __v8hi __b = (__v8hi)__a;
1253 return (unsigned short)__b[__imm];
1254 }
1255
/* Return __a with the 16-bit element selected by the low three bits of
   __imm replaced by the low 16 bits of __b (PINSRW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
  __v8hi __c = (__v8hi)__a;
  __c[__imm & 7] = __b;
  return (__m128i)__c;
}
1263
/* Build a 16-bit mask from the sign bit of each 8-bit element of __a
   (PMOVMSKB); bit i of the result is the top bit of byte i. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i __a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)__a);
}
1269
/* Shuffle the four 32-bit elements of (a) as selected by the 2-bit fields
   of the immediate (imm) (PSHUFD). The second shufflevector operand is
   never selected (all indices are 0-3); it only satisfies the builtin's
   two-operand form. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })

/* Shuffle the low four 16-bit elements of (a) per (imm); the high four
   elements pass through unchanged (PSHUFLW). */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })

/* Shuffle the high four 16-bit elements of (a) per (imm); the low four
   elements pass through unchanged (PSHUFHW). */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
1291
/* Interleave the high eight 8-bit elements of __a and __b:
   { a8, b8, a9, b9, ..., a15, b15 } (PUNPCKHBW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

/* Interleave the high four 16-bit elements of __a and __b (PUNPCKHWD). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

/* Interleave the high two 32-bit elements of __a and __b (PUNPCKHDQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}

/* Interleave the high 64-bit elements: { a1, b1 } (PUNPCKHQDQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
}

/* Interleave the low eight 8-bit elements of __a and __b:
   { a0, b0, a1, b1, ..., a7, b7 } (PUNPCKLBW). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

/* Interleave the low four 16-bit elements of __a and __b (PUNPCKLWD). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

/* Interleave the low two 32-bit elements of __a and __b (PUNPCKLDQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}

/* Interleave the low 64-bit elements: { a0, b0 } (PUNPCKLQDQ). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
}
1339
/* Return the low 64 bits of __a as an MMX __m64 value (MOVDQ2Q). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i __a)
{
  return (__m64)__a[0];
}

/* Zero-extend the MMX value __a into the low qword of a 128-bit vector
   (MOVQ2DQ). NOTE(review): Intel documents this operation under the name
   _mm_movpi64_epi64; this header spells it _mm_movpi64_pi64 — confirm
   against callers before renaming. */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 __a)
{
  return (__m128i){ (long long)__a, 0 };
}

/* Copy the low qword of __a and zero the high qword (MOVQ xmm, xmm). */
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i __a)
{
  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
}
1357
/* Interleave the high doubles: { a1, b1 } (UNPCKHPD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector(__a, __b, 1, 2+1);
}

/* Interleave the low doubles: { a0, b0 } (UNPCKLPD). */
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector(__a, __b, 0, 2+0);
}

/* Build a 2-bit mask from the sign bits of the two doubles in __a
   (MOVMSKPD). */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d __a)
{
  return __builtin_ia32_movmskpd(__a);
}

/* Select the low double of the result from (a) via bit 0 of (i), and the
   high double from (b) via bit 1 (SHUFPD). i must be an immediate. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })
1380
/* The _mm_cast* functions reinterpret the 128-bit pattern as another vector
   type; no conversion is performed and no instruction is generated. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d __in)
{
  return (__m128)__in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d __in)
{
  return (__m128i)__in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 __in)
{
  return (__m128d)__in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 __in)
{
  return (__m128i)__in;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i __in)
{
  return (__m128)__in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i __in)
{
  return (__m128d)__in;
}
1416
/* Spin-loop hint: tells the CPU the caller is in a busy-wait loop so it can
   reduce power and avoid memory-order mis-speculation (PAUSE). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ volatile ("pause");
}

/* Build the 2-bit immediate for _mm_shuffle_pd: element x selects the high
   double, element y the low double. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1424
1425 #endif /* __SSE2__ */
1426
1427 #endif /* __EMMINTRIN_H */
1428