/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
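/* Element layouts of the vector typedefs above: __v2df holds 2 doubles,
   __v2di holds 2 64-bit integers, __v8hi holds 8 16-bit integers, and
   __v16qi holds 16 8-bit integers, all within a single 128-bit value.  */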

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d a, __m128d b)
{
  a[0] += b[0];
  return a;
}
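/* Note: the _sd ("scalar double") forms operate only on element 0 and carry
   element 1 of the first operand through unchanged, while the _pd ("packed
   double") forms operate on both elements.  Illustrative usage (not part of
   the original header):

     __m128d x = _mm_set_pd(2.0, 1.0);   // x = { 1.0, 2.0 }
     __m128d y = _mm_set_pd(4.0, 3.0);   // y = { 3.0, 4.0 }
     __m128d s = _mm_add_sd(x, y);       // s = { 4.0, 2.0 }
     __m128d p = _mm_add_pd(x, y);       // p = { 4.0, 6.0 }
*/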

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d a, __m128d b)
{
  return a + b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d a, __m128d b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d a, __m128d b)
{
  return a - b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d a, __m128d b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d a, __m128d b)
{
  return a * b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d a, __m128d b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d a, __m128d b)
{
  return a / b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d a, __m128d b)
{
  __m128d c = __builtin_ia32_sqrtsd(b);
  return (__m128d) { c[0], a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d a)
{
  return __builtin_ia32_sqrtpd(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_minsd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_minpd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxsd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxpd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a & (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d a, __m128d b)
{
  return (__m128d)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a | (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a ^ (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdeq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdlt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdle(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdgt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdge(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdneq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdeq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdlt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdle(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdgt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdge(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdneq(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d a)
{
  return __builtin_ia32_cvtpd2ps(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 a)
{
  return __builtin_ia32_cvtps2pd(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d a)
{
  return __builtin_ia32_cvtpd2dq(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d a)
{
  return __builtin_ia32_cvtsd2si(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 a, __m128d b)
{
  a[0] = b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d a, __m128 b)
{
  a[0] = b[0];
  return a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d a)
{
  return a[0];
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *dp)
{
  return *(__m128d*)dp;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *dp)
{
  struct __mm_load1_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_load1_pd_struct*)dp)->u;
  return (__m128d){ u, u };
}

#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *dp)
{
  __m128d u = *(__m128d*)dp;
  return __builtin_shufflevector(u, u, 1, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *dp)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)dp)->v;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *dp)
{
  struct __mm_load_sd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_load_sd_struct*)dp)->u;
  return (__m128d){ u, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d a, double const *dp)
{
  struct __mm_loadh_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_loadh_pd_struct*)dp)->u;
  return (__m128d){ a[0], u };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d a, double const *dp)
{
  struct __mm_loadl_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  double u = ((struct __mm_loadl_pd_struct*)dp)->u;
  return (__m128d){ u, a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double w)
{
  return (__m128d){ w, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double w)
{
  return (__m128d){ w, w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double w, double x)
{
  return (__m128d){ x, w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double w, double x)
{
  return (__m128d){ w, x };
}
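/* Note: _mm_set_pd lists elements from high to low (its first argument
   becomes element 1), while _mm_setr_pd lists them from low to high.  For
   example, _mm_set_pd(2.0, 1.0) and _mm_setr_pd(1.0, 2.0) both produce the
   vector { 1.0, 2.0 }.  */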

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d a, __m128d b)
{
  return (__m128d){ b[0], a[1] };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *dp, __m128d a)
{
  struct __mm_store_sd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)dp)->u = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *dp, __m128d a)
{
  struct __mm_store1_pd_struct {
    double u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
  ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *dp, __m128d a)
{
  *(__m128d *)dp = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *dp, __m128d a)
{
  __builtin_ia32_storeupd(dp, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *dp, __m128d a)
{
  a = __builtin_shufflevector(a, a, 1, 0);
  *(__m128d *)dp = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *dp, __m128d a)
{
  struct __mm_storeh_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *dp, __m128d a)
{
  struct __mm_storeh_pd_struct {
    double u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a + (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a + (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a + (__v4si)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 a, __m64 b)
{
  return a + b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i a, __m128i b)
{
  return a + b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a * (__v8hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 a, __m64 b)
{
  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i a, __m128i b)
{
  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i a, __m128i b)
{
  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a - (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a - (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a - (__v4si)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 a, __m64 b)
{
  return a - b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i a, __m128i b)
{
  return a - b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i a, __m128i b)
{
  return a & b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i a, __m128i b)
{
  return ~a & b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i a, __m128i b)
{
  return a | b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i a, __m128i b)
{
  return a ^ b;
}

#define _mm_slli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); })
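/* Note: the count argument of _mm_slli_si128 (and _mm_srli_si128 below) is a
   byte count; the macro multiplies it by 8 before passing it to the
   underlying builtin, which expects a bit count.  */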

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psllqi128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psllq128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}


#define _mm_srli_si128(a, count) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psrlqi128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psrlq128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a == (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a == (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a == (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i a, __m128i b)
{
  /* This function always performs a signed comparison, but __v16qi is a char
     which may be signed or unsigned. */
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
  return (__m128i)((__v16qs)a > (__v16qs)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a > (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a > (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi8(b,a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi16(b,a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi32(b,a);
}

#ifdef __x86_64__
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d a, long long b)
{
  a[0] = b;
  return a;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d a)
{
  return __builtin_ia32_cvtsd2si64(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d a)
{
  return a[0];
}
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int a)
{
  return (__m128i)(__v4si){ a, 0, 0, 0 };
}

#ifdef __x86_64__
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long a)
{
  return (__m128i){ a, 0 };
}
#endif

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i a)
{
  __v4si b = (__v4si)a;
  return b[0];
}

#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i a)
{
  return a[0];
}
#endif

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *p)
{
  return *p;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *p)
{
  struct __loadu_si128 {
    __m128i v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si128*)p)->v;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *p)
{
  struct __mm_loadl_epi64_struct {
    long long u;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1, long long q0)
{
  return (__m128i){ q0, q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1, __m64 q0)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long q)
{
  return (__m128i){ q, q };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 q)
{
  return (__m128i){ (long long)q, (long long)q };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int i)
{
  return (__m128i)(__v4si){ i, i, i, i };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short w)
{
  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char b)
{
  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0, __m64 q1)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *p, __m128i b)
{
  *p = b;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *p, __m128i b)
{
  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *p, __m128i a)
{
  __builtin_ia32_storelv4si((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *p, __m128d a)
{
  __builtin_ia32_movntpd(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *p, __m128i a)
{
  __builtin_ia32_movntdq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i a, int imm)
{
  __v8hi b = (__v8hi)a;
  return (unsigned short)b[imm];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i a, int b, int imm)
{
  __v8hi c = (__v8hi)a;
  c[imm & 7] = b;
  return (__m128i)c;
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)a);
}

#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
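/* Illustrative usage (not part of the original header): each 2-bit field of
   imm selects a source element, low field first, so for example
   _mm_shuffle_epi32(v, 0x1B) reverses the four 32-bit elements of v
   (0x1B selects elements 3, 2, 1, 0).  */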

#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })

#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  __m128i __a = (a); \
  (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i a)
{
  return (__m64)a[0];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 a)
{
  return (__m128i){ (long long)a, 0 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i a)
{
  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 1, 2+1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 0, 2+0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d a)
{
  return __builtin_ia32_movmskpd(a);
}

#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); })

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d in)
{
  return (__m128)in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d in)
{
  return (__m128i)in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 in)
{
  return (__m128d)in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 in)
{
  return (__m128i)in;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i in)
{
  return (__m128)in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i in)
{
  return (__m128d)in;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ volatile ("pause");
}

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
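/* Illustrative usage (not part of the original header): with _mm_shuffle_pd,
   bit 0 of the immediate selects an element of the first operand and bit 1
   selects an element of the second, so
   _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0)) yields { a[0], b[1] }.  */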

#endif /* __SSE2__ */

#endif /* __EMMINTRIN_H */