/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
 * All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#ifndef __LOONGSON_MMINTRIN_H__
#define __LOONGSON_MMINTRIN_H__

#include <stdint.h>


#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))


/* Vectors are stored in 64-bit floating-point registers. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;
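
/*
 * Illustrative sketch (not part of this header): __m64 is declared as
 * double purely so that values live in 64-bit floating-point registers.
 * The bits are always reinterpreted through pointer casts, never
 * converted; to_m64()/to_uint64() below wrap the same reinterpretation.
 *
 *   uint64_t bits = 0x0001000200030004;
 *   __m64 v = *(__m64 *)&bits;         // reinterpret; no cvt.d.* is wanted
 *   uint64_t back = *(uint64_t *)&v;   // back == bits
 */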


/********** Set Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setzero_si64(void)
{
  return 0.0;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__b6 << 24) |
                ((uint32_t)__b4 << 16) |
                ((uint32_t)__b2 << 8) |
                (uint32_t)__b0;
  uint32_t hi = ((uint32_t)__b7 << 24) |
                ((uint32_t)__b5 << 16) |
                ((uint32_t)__b3 << 8) |
                (uint32_t)__b1;

  asm("mtc1 %1, %0\n\t"
      "mtc1 %2, $f0\n\t"
      "punpcklbh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;

  asm("mtc1 %1, %0\n\t"
      "mtc1 %2, $f0\n\t"
      "punpcklhw %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
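
/*
 * For example, _MM_SHUFFLE(3, 3, 3, 3) selects halfword 3 for all four
 * destination positions, and _MM_SHUFFLE(0, 1, 2, 3) reverses the four
 * halfwords:
 *
 *   (3 << 6) | (3 << 4) | (3 << 2) | 3 == 0xFF
 *   (0 << 6) | (1 << 4) | (2 << 2) | 3 == 0x1B
 */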

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi32(uint32_t __i1, uint32_t __i0)
{
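  /* Three cases: when both words are compile-time constants, building the
     64-bit value lets the compiler fold it to a literal; when the two
     words happen to be equal at run time, a single pshufh splats the low
     word; otherwise, fall through to composing the value with shifts and
     ors. */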
  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 << 0);

    return *(__m64 *)&val;
  } else if (__i1 == __i0) {
    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
    __m64 ret;

    asm("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
       );

    return ret;
  } else {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 << 0);

    return *(__m64 *)&val;
  }
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi8(uint8_t __b0)
{
  __m64 ret;

  asm("sll $8, %1, 8\n\t"
      "or %1, %1, $8\n\t"
      "mtc1 %1, %0\n\t"
      "mtc1 $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__b0)
      : "$8", "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi16(uint16_t __h0)
{
  __m64 ret;

  asm("mtc1 %1, %0\n\t"
      "mtc1 $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__h0)
      : "$8", "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi32(unsigned __i0)
{
  return _mm_set_pi32(__i0, __i0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
{
  return _mm_set_pi8(__h7, __h6, __h5, __h4,
                     __h3, __h2, __h1, __h0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
{
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
{
  return _mm_set_pi32(__i1, __i0);
}
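
/*
 * The _mm_set_* constructors take arguments from the most significant
 * element down; the _mm_setr_* ("reverse") forms take them in memory
 * order.  A small sketch of the equivalence:
 *
 *   __m64 a = _mm_set_pi16(3, 2, 1, 0);    // halfword 0 holds 0
 *   __m64 b = _mm_setr_pi16(0, 1, 2, 3);   // same value as a
 */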


/********** Arithmetic Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaddhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
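
/*
 * _mm_madd_pi16 multiplies corresponding signed 16-bit elements and sums
 * adjacent 32-bit products, the pmaddwd-style operation that pmaddhw
 * provides here.  A worked sketch:
 *
 *   __m64 a = _mm_set_pi16(4, 3, 2, 1);
 *   __m64 b = _mm_set_pi16(8, 7, 6, 5);
 *   __m64 s = _mm_madd_pi16(a, b);
 *   // low 32-bit lane:  1*5 + 2*6 = 17
 *   // high 32-bit lane: 3*7 + 4*8 = 53
 */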

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline int FUNCTION_ATTRIBS
_mm_movemask_pi8(__m64 __m1)
{
  int ret;

  asm("pmovmskb %0, %1\n\t"
      : "=r" (ret)
      : "y" (__m1)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhuh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmullh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mul_pu32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmuluw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sad_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psadbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
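
/*
 * Assuming psadbh matches the familiar psadbw semantics, the eight
 * absolute byte differences are summed into a single result in the low
 * element:
 *
 *   __m64 x = _mm_set_pi8(0, 0, 0, 0, 0, 0, 10, 3);
 *   __m64 y = _mm_set_pi8(0, 0, 0, 0, 0, 0, 4, 9);
 *   __m64 d = _mm_sad_pu8(x, y);           // |10-4| + |3-9| = 12
 */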

extern __inline __m64 FUNCTION_ATTRIBS
_mm_asub_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pasubub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_biadd_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("biadd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
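
/*
 * The difference between the wrapping and saturating forms, sketched with
 * unsigned bytes:
 *
 *   __m64 a = _mm_set1_pi8(200);
 *   __m64 b = _mm_set1_pi8(100);
 *   __m64 w = _mm_add_pi8(a, b);    // wraps:     (200 + 100) & 0xFF = 44
 *   __m64 s = _mm_adds_pu8(a, b);   // saturates: clamped to 255
 *   __m64 u = _mm_subs_pu8(b, a);   // saturates: clamped to 0
 */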


/********** Logical Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_and_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("and %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("andn %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m32 FUNCTION_ATTRIBS
_mm_or_si32(__m32 __m1, __m32 __m2)
{
  __m32 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("xor %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
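
/*
 * With all-ones/all-zeros masks (such as those produced by the comparison
 * intrinsics later in this header), and/andnot/or combine into a
 * per-element select.  A minimal sketch, assuming _mm_andnot_si64
 * complements its first operand as the Intel intrinsic of the same name
 * does:
 *
 *   __m64 select(__m64 mask, __m64 a, __m64 b)
 *   {
 *     return _mm_or_si64(_mm_and_si64(mask, a),      // a where mask set
 *                        _mm_andnot_si64(mask, b));  // b elsewhere
 *   }
 */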


/********** Shift Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsll %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsrl %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrah %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psraw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsra %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}


/********** Conversion Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
to_m64(uint64_t x)
{
  return *(__m64 *)&x;
}

extern __inline uint64_t FUNCTION_ATTRIBS
to_uint64(__m64 x)
{
  return *(uint64_t *)&x;
}
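
/*
 * These helpers make the bit reinterpretation explicit at call sites,
 * e.g.:
 *
 *   __m64 ones = to_m64(0xFFFFFFFFFFFFFFFFULL);
 *   uint64_t bits = to_uint64(ones);   // bits == 0xFFFFFFFFFFFFFFFF
 */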


/********** Comparison Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmplth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
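
/*
 * Each comparison yields an all-ones element where the predicate holds
 * and an all-zeros element where it does not, so the results plug
 * directly into the and/andnot/or select sketched earlier:
 *
 *   __m64 a = _mm_set_pi16(4, 3, 2, 1);
 *   __m64 b = _mm_set_pi16(1, 3, 2, 4);
 *   __m64 m = _mm_cmpgt_pi16(a, b);
 *   // halfwords, high to low: 0xFFFF, 0x0000, 0x0000, 0x0000
 */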


/********** Miscellaneous Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsshb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packushb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
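
/*
 * Packing narrows with saturation.  A sketch of the unsigned byte pack:
 *
 *   __m64 w = _mm_set_pi16(0xFFFF, 300, 128, 7);   // 0xFFFF is -1 signed
 *   __m64 p = _mm_packs_pu16(w, w);
 *   // each half holds the bytes, low to high: 0x07, 0x80, 0xFF, 0x00
 *   // (values < 0 clamp to 0, values > 255 clamp to 255)
 */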

extern __inline __m64 FUNCTION_ATTRIBS
_mm_extract_pi16(__m64 __m, int64_t __pos)
{
  __m64 ret;

  asm("pextrh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__pos)
     );

  return ret;
}

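/* pinsrh_0 through pinsrh_3 encode the destination halfword position in
   the opcode itself, so each position needs a distinct instruction; hence
   the switch on __pos below rather than a single asm template. */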
extern __inline __m64 FUNCTION_ATTRIBS
_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
{
  __m64 ret;

  switch (__pos) {
  case 0:
    asm("pinsrh_0 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;

  case 1:
    asm("pinsrh_1 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;

  case 2:
    asm("pinsrh_2 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;

  case 3:
    asm("pinsrh_3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  }

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_shuffle_pi16(__m64 __m, int64_t __n)
{
  __m64 ret;

  asm("pshufh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__n)
     );

  return ret;
}
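
/*
 * Combined with _MM_SHUFFLE this gives arbitrary halfword permutations,
 * e.g. reversing the four elements:
 *
 *   __m64 v = _mm_set_pi16(3, 2, 1, 0);
 *   __m64 r = _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
 *   // r holds the halfwords 0, 1, 2, 3 from high to low
 */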

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Here the caller may also need the high 32 bits of the source (e.g. for
   a matching unpackhi), so we use the __m64 datatype, which preserves the
   data. */

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
   datatype, which allows load8888 to use 32-bit loads. */

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline void FUNCTION_ATTRIBS
_mm_store_pi32(__m32 *dest, __m64 src)
{
  src = _mm_packs_pu16(src, _mm_setzero_si64());

  asm("swc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

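/* gssdlc1/gssdrc1 appear to be the Loongson store-left/store-right pair
   for a 64-bit FP store that need not be 8-byte aligned; the 7+%0 operand
   addresses the highest byte of the destination doubleword. */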
extern __inline void FUNCTION_ATTRIBS
_mm_store_si64(__m64 *dest, __m64 src)
{
  asm("gssdlc1 %1, 7+%0\n\t"
      "gssdrc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si32(const __m32 *src)
{
  __m64 ret;

  /* lwc1 defines only the low 32 bits of the destination register.  Use a
     64-bit local so that returning it does not insert a float-to-double
     conversion that would mangle the bit pattern. */
  asm("lwc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si64(const __m64 *src)
{
  __m64 ret;

  asm("ldc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8(const uint32_t *src)
{
  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8_f(__m64 src)
{
  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi8_f(__m64 src)
{
  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16(__m64 src)
{
  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16_f(__m64 src)
{
  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16(__m64 src)
{
  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16_f(__m64 src)
{
  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha_rev(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
}
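
/*
 * With 16-bit-per-channel pixels whose alpha sits in the high element (or
 * the low element for the _rev form), these broadcast the alpha channel
 * across all four elements, e.g.:
 *
 *   __m64 px = _mm_set_pi16(0x00FF, 0x0012, 0x0034, 0x0056); // A, R, G, B
 *   __m64 aa = _mm_expand_alpha(px);   // 0x00FF in all four elements
 */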

#endif /* __LOONGSON_MMINTRIN_H__ */