/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#ifndef __LOONGSON_MMINTRIN_H__
#define __LOONGSON_MMINTRIN_H__

#include <stdint.h>


#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))


/* Vectors are stored in 64-bit floating-point registers. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;


/********** Set Operations **********/

44 extern __inline __m64
_mm_setzero_si64(void)45 _mm_setzero_si64(void)
46 {
47   return 0.0;
48 }
49 
/* Build a vector from eight bytes, __b7 in the most-significant lane.
   The even bytes are packed into "lo", the odd bytes into "hi"; punpcklbh
   then interleaves the two 32-bit halves byte-by-byte to produce the
   final ordering.  Clobbers $f0 as a scratch FP register. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__b6 << 24) |
                ((uint32_t)__b4 << 16) |
                ((uint32_t)__b2 << 8) |
                (uint32_t)__b0;
  uint32_t hi = ((uint32_t)__b7 << 24) |
                ((uint32_t)__b5 << 16) |
                ((uint32_t)__b3 << 8) |
                (uint32_t)__b1;

  asm("mtc1      %1, %0\n\t"
      "mtc1      %2, $f0\n\t"
      "punpcklbh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

/* Build a vector from four halfwords, __h3 in the most-significant lane.
   Same interleave trick as _mm_set_pi8, but at halfword granularity
   (punpcklhw).  Clobbers $f0. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;

  asm("mtc1      %1, %0\n\t"
      "mtc1      %2, $f0\n\t"
      "punpcklhw %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* Build a vector from two 32-bit words, __i1 in the upper half.
   The constant and generic fall-through paths build the value
   identically; only the middle path differs, using pshufh to splat when
   both halves are equal.  NOTE(review): the "*(__m64 *)&val" puns violate
   strict aliasing — presumably the build uses -fno-strict-aliasing;
   confirm. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi32(uint32_t __i1, uint32_t __i0)
{
  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 <<  0);

    return *(__m64 *)&val;
  } else if (__i1 == __i0) {
    /* Splat: shuffle halfwords 1,0 of the low word into both halves. */
    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
    __m64 ret;

    asm("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
       );

    return ret;
  } else {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 <<  0);

    return *(__m64 *)&val;
  }
}

/* Broadcast the byte __b0 into all eight lanes.  GPR $8 is used as a
   scratch to duplicate the byte into a halfword (b | b << 8); pshufh with
   an all-zero selector then replicates that halfword into every lane.
   Clobbers $8 and $f0. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi8(uint8_t __b0)
{
  __m64 ret;

  asm("sll    $8, %1, 8\n\t"
      "or     %1, %1, $8\n\t"
      "mtc1   %1, %0\n\t"
      "mtc1   $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__b0)
      : "$8", "$f0"
     );

  return ret;
}

140 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi16(uint16_t __h0)141 _mm_set1_pi16(uint16_t __h0)
142 {
143   __m64 ret;
144 
145   asm("mtc1   %1, %0\n\t"
146       "mtc1   $0, $f0\n\t"
147       "pshufh %0, %0, $f0\n\t"
148       : "=f" (ret)
149       : "r" (__h0)
150       : "$8", "$f0"
151      );
152 
153   return ret;
154 }
155 
156 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi32(unsigned __i0)157 _mm_set1_pi32(unsigned __i0)
158 {
159   return _mm_set_pi32(__i0, __i0);
160 }
161 
162 extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi8(uint8_t __h0,uint8_t __h1,uint8_t __h2,uint8_t __h3,uint8_t __h4,uint8_t __h5,uint8_t __h6,uint8_t __h7)163 _mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
164              uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
165 {
166   return _mm_set_pi8(__h7, __h6, __h5, __h4,
167                      __h3, __h2, __h1, __h0);
168 }
169 
170 extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi16(uint16_t __w0,uint16_t __w1,uint16_t __w2,uint16_t __w3)171 _mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
172 {
173   return _mm_set_pi16(__w3, __w2, __w1, __w0);
174 }
175 
/* Reversed-order word set: element 0 is the first argument. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
{
  return _mm_set_pi32(__i1, __i0);
}


/********** Arithmetic Operations **********/

/* Wrapping add of eight bytes (MMI paddb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping add of four halfwords (MMI paddh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping add of two 32-bit words (MMI paddw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping 64-bit add (MMI paddd). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Saturating add of eight signed bytes (MMI paddsb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Saturating add of four signed halfwords (MMI paddsh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/* Saturating add of eight unsigned bytes (MMI paddusb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Saturating add of four unsigned halfwords (MMI paddush). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-byte unsigned average (MMI pavgb; analogue of x86 pavgb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-halfword unsigned average (MMI pavgh; analogue of x86 pavgw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Multiply signed halfwords pairwise and add adjacent products to two
   32-bit sums (MMI pmaddhw; analogue of x86 pmaddwd). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaddhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-halfword signed maximum (MMI pmaxsh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-byte unsigned maximum (MMI pmaxub). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-halfword signed minimum (MMI pminsh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-byte unsigned minimum (MMI pminub). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Gather the sign bit of each byte into an 8-bit GPR mask
   (MMI pmovmskb; analogue of x86 pmovmskb).  NOTE(review): this is the
   only intrinsic here using the "y" input constraint rather than "f" —
   confirm it maps to an MMI/FP register on the target toolchain. */
extern __inline int FUNCTION_ATTRIBS
_mm_movemask_pi8(__m64 __m1)
{
  int ret;

  asm("pmovmskb %0, %1\n\t"
      : "=r" (ret)
      : "y" (__m1)
     );

  return ret;
}

/* High 16 bits of the signed halfword products (MMI pmulhh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* High 16 bits of the unsigned halfword products (MMI pmulhuh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhuh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Low 16 bits of the halfword products (MMI pmullh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmullh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Full 64-bit product of the low unsigned 32-bit words
   (MMI pmuluw; analogue of x86 pmuludq). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_mul_pu32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmuluw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Sum of absolute differences of unsigned bytes
   (MMI psadbh; analogue of x86 psadbw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_sad_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psadbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/* Per-byte absolute difference of unsigned bytes (MMI pasubub). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_asub_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pasubub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Horizontal reduction: sum the eight unsigned bytes (MMI biadd).
   NOTE(review): biadd is documented as a one-source reduction; the %2
   operand may be ignored by the instruction — confirm against the MMI
   manual. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_biadd_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("biadd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping subtract of eight bytes (MMI psubb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping subtract of four halfwords (MMI psubh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping subtract of two 32-bit words (MMI psubw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Wrapping 64-bit subtract (MMI psubd). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Saturating subtract of eight signed bytes (MMI psubsb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Saturating subtract of four signed halfwords (MMI psubsh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/* Saturating subtract of eight unsigned bytes (MMI psubusb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Saturating subtract of four unsigned halfwords (MMI psubush). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Logical Operations **********/

/* Bitwise AND of two 64-bit vectors (FP-register "and"). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_and_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("and %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Bitwise AND-NOT (FP-register "andn").  NOTE(review): which operand is
   complemented depends on the andn operand order — presumed to match the
   x86 _mm_andnot_si64 contract ((~__m1) & __m2); confirm against the MMI
   manual. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("andn %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/* Bitwise OR of two 32-bit values held in FP registers.
   NOTE(review): the function is declared to return __m64 while computing
   an __m32 local; the implicit float->double conversion on return would
   not preserve the bit pattern — presumably callers only consume this
   inlined with the value staying in a register, but confirm. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si32(__m32 __m1, __m32 __m2)
{
  __m32 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Bitwise OR of two 64-bit vectors (FP-register "or"). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Bitwise XOR of two 64-bit vectors (FP-register "xor"). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("xor %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Shift Operations **********/

/* Shift each halfword left by __count (MMI psllh).  The count is punned
   into an FP register, as MMI shifts take the count from a vector
   register. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllh  %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Shift each 32-bit word left by __count (MMI psllw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Shift the whole 64-bit vector left by __count (MMI dsll). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsll  %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Logical right shift of each halfword by __count (MMI psrlh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Logical right shift of each 32-bit word by __count (MMI psrlw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Logical right shift of the whole 64-bit vector by __count (MMI dsrl). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsrl  %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Arithmetic right shift of each halfword by __count (MMI psrah). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrah %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Arithmetic right shift of each 32-bit word by __count (MMI psraw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psraw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/* Arithmetic right shift of the whole 64-bit vector by __count (MMI dsra). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsra %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}


/********** Conversion Intrinsics **********/

783 extern __inline __m64 FUNCTION_ATTRIBS
to_m64(uint64_t x)784 to_m64(uint64_t x)
785 {
786   return *(__m64 *)&x;
787 }
788 
789 extern __inline uint64_t FUNCTION_ATTRIBS
to_uint64(__m64 x)790 to_uint64(__m64 x)
791 {
792   return *(uint64_t *)&x;
793 }
794 

/********** Comparison Intrinsics **********/

/* Per-byte equality: 0xFF where equal, 0x00 otherwise (MMI pcmpeqb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-halfword equality mask (MMI pcmpeqh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-word equality mask (MMI pcmpeqw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-byte signed greater-than mask (MMI pcmpgtb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-halfword signed greater-than mask (MMI pcmpgth). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-word signed greater-than mask (MMI pcmpgtw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-byte signed less-than mask (MMI pcmpltb; no direct x86 MMX
   equivalent — x86 code uses pcmpgt with swapped operands). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-halfword signed less-than mask (MMI pcmplth). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmplth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Per-word signed less-than mask (MMI pcmpltw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Miscellaneous Operations **********/

/* Pack eight signed halfwords to signed-saturated bytes
   (MMI packsshb; analogue of x86 packsswb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsshb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Pack four signed words to signed-saturated halfwords
   (MMI packsswh; analogue of x86 packssdw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Identical to _mm_packs_pi32; kept as a separate name so callers that
   conceptually operate on the __m32/float view have a matching "_f"
   entry point. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Pack eight signed halfwords to unsigned-saturated bytes
   (MMI packushb; analogue of x86 packuswb). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packushb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Extract halfword lane __pos into the low bits of the result
   (MMI pextrh).  Unlike x86 _mm_extract_pi16, the result stays in a
   vector register (__m64), and the lane index is passed through an FP
   register via a type pun. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_extract_pi16(__m64 __m, int64_t __pos)
{
  __m64 ret;

  asm("pextrh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__pos)
     );

  return ret;
}

/* Insert the low halfword of __m2 into lane __pos of __m1 using the
   position-specific pinsrh_N opcodes.  Requires 0 <= __pos <= 3.
   NOTE(review): there is no default case, so "ret" is returned
   uninitialized for any other __pos — callers are presumed to pass
   compile-time constants in range.  The "i" (__pos) input operand is
   never referenced in the asm templates (no %3); it appears to be a
   leftover. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
{
  __m64 ret;

  switch (__pos) {
  case 0:

    asm("pinsrh_0 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;

  case 1:

    asm("pinsrh_1 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;
  case 2:

    asm("pinsrh_2 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;

  case 3:

    asm("pinsrh_3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );

    break;
  }

  return ret;
}

/* Shuffle the four halfword lanes of __m according to the 8-bit selector
   __n (build with _MM_SHUFFLE); MMI pshufh.  The selector is passed
   through an FP register via a type pun. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_shuffle_pi16(__m64 __m, int64_t __n)
{
  __m64 ret;

  asm("pshufh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__n)
     );

  return ret;
}

/* Interleave the upper four bytes of __m1 and __m2 (MMI punpckhbh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Identical to _mm_unpackhi_pi8; "_f" alias kept for callers working
   with the float view of the registers. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Interleave the upper two halfwords of __m1 and __m2 (MMI punpckhhw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Identical to _mm_unpackhi_pi16; "_f" alias. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Interleave the upper 32-bit words of __m1 and __m2 (MMI punpckhwd). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Interleave the lower four bytes of __m1 and __m2 (MMI punpcklbh). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
   which preserves the data. */

/* Low-byte interleave taking both operands as full __m64 values. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32,
   datatype, which allows load8888 to use 32-bit loads. */

/* Low-byte interleave whose first operand is only 32 bits wide. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Interleave the lower two halfwords of __m1 and __m2 (MMI punpcklhw). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Identical asm to _mm_unpacklo_pi16; "_f" alias (note callers such as
   _mm_loadlo_pi16_f pass the operands in the opposite order). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Interleave the lower 32-bit words of __m1 and __m2 (MMI punpcklwd). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/* Identical to _mm_unpacklo_pi32; "_f" alias. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Pack the four halfword lanes of src to unsigned-saturated bytes, then
   store the resulting low 32 bits to *dest (swc1).
   NOTE(review): this differs from the x86 _mm_store_pi32 contract (a
   plain low-word store) — presumably callers in this codebase rely on
   the pack-then-store behavior; confirm before reuse. */
extern __inline void FUNCTION_ATTRIBS
_mm_store_pi32(__m32 *dest, __m64 src)
{
  src = _mm_packs_pu16(src, _mm_setzero_si64());

  asm("swc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

/* Store all 64 bits of src to *dest.  The gssdlc1/gssdrc1 pair performs
   an unaligned-safe store (left/right halves), so dest need not be
   8-byte aligned. */
extern __inline void FUNCTION_ATTRIBS
_mm_store_si64(__m64 *dest, __m64 src)
{
  asm("gssdlc1 %1, 7+%0\n\t"
      "gssdrc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

/* Load 32 bits from *src into an FP register (lwc1).
   NOTE(review): the local is __m32 while the declared return type is
   __m64; the implicit float->double conversion on return would not
   preserve the bit pattern — presumably only the in-register inlined
   value is consumed by callers; confirm. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si32(const __m32 *src)
{
  __m32 ret;

  asm("lwc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

/* Load 64 bits from *src (ldc1; requires 8-byte alignment). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si64(const __m64 *src)
{
  __m64 ret;

  asm("ldc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

1253 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8(const uint32_t * src)1254 _mm_loadlo_pi8(const uint32_t *src)
1255 {
1256   return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
1257 }
1258 
/* Widen the low four bytes of src to 16-bit lanes (zero in the high
   byte of each lane). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8_f(__m64 src)
{
  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
}

/* Widen the high four bytes of src to 16-bit lanes. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi8_f(__m64 src)
{
  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
}

/* Widen the low two halfwords of src to 32-bit lanes (zeros in the high
   halves). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16(__m64 src)
{
  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
}

/* As _mm_loadlo_pi16, but with the operand order swapped, so the data
   halfwords land in the high halves of the 32-bit lanes. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16_f(__m64 src)
{
  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
}

/* Widen the high two halfwords of src to 32-bit lanes. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16(__m64 src)
{
  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
}

/* As _mm_loadhi_pi16, but with the operand order swapped (data in the
   high halves of the 32-bit lanes). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16_f(__m64 src)
{
  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
}

/* Broadcast halfword lane 3 (the alpha channel of an a16r16g16b16 pixel)
   into all four lanes. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

/* Broadcast halfword lane 0 into all four lanes (alpha in the reversed
   channel order). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha_rev(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
}

#endif  /* __LOONGSON_MMINTRIN_H__ */