/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

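/* Arithmetic on packed and scalar double-precision values. The _sd forms
   operate on the low element and pass the high element of the first operand
   through unchanged. */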
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d a, __m128d b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d a, __m128d b)
{
  return a + b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d a, __m128d b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d a, __m128d b)
{
  return a - b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d a, __m128d b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d a, __m128d b)
{
  return a * b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d a, __m128d b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d a, __m128d b)
{
  return a / b;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d a, __m128d b)
{
  __m128d c = __builtin_ia32_sqrtsd(b);
  return (__m128d) { c[0], a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d a)
{
  return __builtin_ia32_sqrtpd(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_minsd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_minpd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxsd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_maxpd(a, b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a & (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d a, __m128d b)
{
  return (__m128d)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a | (__v4si)b);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d a, __m128d b)
{
  return (__m128d)((__v4si)a ^ (__v4si)b);
}

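/* Packed and scalar double-precision comparisons. The third argument to the
   cmppd/cmpsd builtins selects the predicate: 0 = EQ, 1 = LT, 2 = LE,
   3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD. Each result element is an
   all-ones or all-zeros mask. */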
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d a, __m128d b)
{
  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
}

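/* Scalar double-precision comparisons that return an int. The comi forms map
   to COMISD and the ucomi forms to UCOMISD; they differ only in whether a
   quiet NaN operand raises an invalid-operation exception. */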
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdeq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdlt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdle(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdgt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_comisdneq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdeq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdlt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdle(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdgt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d a, __m128d b)
{
  return __builtin_ia32_ucomisdneq(a, b);
}

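/* Conversions between double-precision, single-precision, and integer
   vectors. The _mm_cvtt* forms truncate toward zero instead of using the
   current rounding mode. */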
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d a)
{
  return __builtin_ia32_cvtpd2ps(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 a)
{
  return __builtin_ia32_cvtps2pd(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d a)
{
  return __builtin_ia32_cvtpd2dq(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d a)
{
  return __builtin_ia32_cvtsd2si(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 a, __m128d b)
{
  a[0] = b[0];
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d a, __m128 b)
{
  a[0] = b[0];
  return a;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(a);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d a)
{
  return a[0];
}

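/* Loads and initializers. _mm_load_pd requires a 16-byte aligned pointer;
   _mm_loadu_pd goes through a packed, may_alias struct so the compiler
   generates an unaligned access. */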
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *dp)
{
  return *(__m128d*)dp;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *dp)
{
  return (__m128d){ dp[0], dp[0] };
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *dp)
{
  return (__m128d){ dp[1], dp[0] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *dp)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_pd*)dp)->v;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *dp)
{
  return (__m128d){ *dp, 0.0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d a, double const *dp)
{
  return (__m128d){ a[0], *dp };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d a, double const *dp)
{
  return (__m128d){ *dp, a[1] };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double w)
{
  return (__m128d){ w, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double w)
{
  return (__m128d){ w, w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double w, double x)
{
  return (__m128d){ x, w };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double w, double x)
{
  return (__m128d){ w, x };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d a, __m128d b)
{
  return (__m128d){ b[0], a[1] };
}

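/* Stores. _mm_store_pd requires a 16-byte aligned pointer; _mm_storeu_pd uses
   the storeupd builtin to perform an unaligned store. */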
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *dp, __m128d a)
{
  dp[0] = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *dp, __m128d a)
{
  dp[0] = a[0];
  dp[1] = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *dp, __m128d a)
{
  *(__m128d *)dp = a;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *dp, __m128d a)
{
  __builtin_ia32_storeupd(dp, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *dp, __m128d a)
{
  dp[0] = a[1];
  dp[1] = a[0];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *dp, __m128d a)
{
  dp[0] = a[1];
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *dp, __m128d a)
{
  dp[0] = a[0];
}

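/* Packed integer arithmetic. The adds/subs forms saturate on overflow; the
   plain forms wrap around. */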
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a + (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a + (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a + (__v4si)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 a, __m64 b)
{
  return a + b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i a, __m128i b)
{
  return a + b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a * (__v8hi)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 a, __m64 b)
{
  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i a, __m128i b)
{
  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i a, __m128i b)
{
  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a - (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a - (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a - (__v4si)b);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 a, __m64 b)
{
  return a - b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i a, __m128i b)
{
  return a - b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i a, __m128i b)
{
  return a & b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i a, __m128i b)
{
  return ~a & b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i a, __m128i b)
{
  return a | b;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i a, __m128i b)
{
  return a ^ b;
}

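/* Shifts. The IMM argument to _mm_slli_si128 and _mm_srli_si128 is a byte
   count; the underlying builtins take a bit count, hence the multiplication
   by 8. */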
#define _mm_slli_si128(VEC, IMM) \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), (IMM)*8))

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psllqi128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psllq128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}


#define _mm_srli_si128(VEC, IMM) \
  ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), (IMM)*8))

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i a, int count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i a, __m128i count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i a, int count)
{
  return __builtin_ia32_psrlqi128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i a, __m128i count)
{
  return __builtin_ia32_psrlq128(a, count);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a == (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a == (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a == (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i a, __m128i b)
{
  return (__m128i)((__v16qi)a > (__v16qi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i a, __m128i b)
{
  return (__m128i)((__v8hi)a > (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i a, __m128i b)
{
  return (__m128i)((__v4si)a > (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi8(b,a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi16(b,a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i a, __m128i b)
{
  return _mm_cmpgt_epi32(b,a);
}

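/* Scalar conversions involving 64-bit integers, only available when
   compiling for x86_64. */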
#ifdef __x86_64__
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d a, long long b)
{
  a[0] = b;
  return a;
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d a)
{
  return __builtin_ia32_cvtsd2si64(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d a)
{
  return a[0];
}
#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(a);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int a)
{
  return (__m128i)(__v4si){ a, 0, 0, 0 };
}

#ifdef __x86_64__
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long a)
{
  return (__m128i){ a, 0 };
}
#endif

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i a)
{
  __v4si b = (__v4si)a;
  return b[0];
}

#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i a)
{
  return a[0];
}
#endif

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *p)
{
  return *p;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *p)
{
  struct __loadu_si128 {
    __m128i v;
  } __attribute__((packed, may_alias));
  return ((struct __loadu_si128*)p)->v;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *p)
{
  return (__m128i) { *(long long*)p, 0};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1, long long q0)
{
  return (__m128i){ q0, q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1, __m64 q0)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long q)
{
  return (__m128i){ q, q };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 q)
{
  return (__m128i){ (long long)q, (long long)q };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int i)
{
  return (__m128i)(__v4si){ i, i, i, i };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short w)
{
  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char b)
{
  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0, __m64 q1)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *p, __m128i b)
{
  *p = b;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *p, __m128i b)
{
  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *p, __m128i a)
{
  __builtin_ia32_storelv4si((__v2si *)p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *p, __m128d a)
{
  __builtin_ia32_movntpd(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *p, __m128i a)
{
  __builtin_ia32_movntdq(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i a, int imm)
{
  __v8hi b = (__v8hi)a;
  return (unsigned short)b[imm];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i a, int b, int imm)
{
  __v8hi c = (__v8hi)a;
  c[imm & 7] = b;
  return (__m128i)c;
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)a);
}

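/* Shuffles and unpacks built on __builtin_shufflevector. For the epi32 and
   epi16 shuffles each two-bit field of the immediate selects a source
   element; _MM_SHUFFLE2 at the end of this file builds the one-bit-per-element
   immediate for _mm_shuffle_pd. */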
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6))


#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7))
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), 0, 1, 2, 3, \
                                    4 + (((imm) & 0x03) >> 0), \
                                    4 + (((imm) & 0x0c) >> 2), \
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)))

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i a, __m128i b)
{
  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i a)
{
  return (__m64)a[0];
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 a)
{
  return (__m128i){ (long long)a, 0 };
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i a)
{
  return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 1, 2+1);
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d a, __m128d b)
{
  return __builtin_shufflevector(a, b, 0, 2+0);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d a)
{
  return __builtin_ia32_movmskpd(a);
}

#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, \
                                                       (((i) & 2) >> 1) + 2))

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d in)
{
  return (__m128)in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d in)
{
  return (__m128i)in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 in)
{
  return (__m128d)in;
}

static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 in)
{
  return (__m128i)in;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i in)
{
  return (__m128)in;
}

static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i in)
{
  return (__m128d)in;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ volatile ("pause");
}

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __SSE2__ */

#endif /* __EMMINTRIN_H */