/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 * All Rights Reserved.
 * Copyright (C) 2019, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#ifndef __LOONGSON_MMINTRIN_H__
#define __LOONGSON_MMINTRIN_H__

#include <stdint.h>


#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))


/* Vectors are stored in 64-bit floating-point registers. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;
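
/* Both types are used purely for their bit patterns; no floating-point
   arithmetic is ever performed on them.  For example (illustrative sketch
   only), a 64-bit integer constant can be viewed as an __m64 like this:

     uint64_t bits = 0x0001000200030004;
     __m64 vec = *(__m64 *)&bits;    (the same reinterpretation that
                                      to_m64() below performs)
 */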


/********** Set Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setzero_si64(void)
{
  return 0.0;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__b6 << 24) |
                ((uint32_t)__b4 << 16) |
                ((uint32_t)__b2 << 8) |
                (uint32_t)__b0;
  uint32_t hi = ((uint32_t)__b7 << 24) |
                ((uint32_t)__b5 << 16) |
                ((uint32_t)__b3 << 8) |
                (uint32_t)__b1;

  asm("mtc1 %1, %0\n\t"
      "mtc1 %2, $f0\n\t"
      "punpcklbh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}
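
/* Example (illustrative): __b0 becomes the least significant byte of the
   result, so

     _mm_set_pi8(0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00)

   produces the 64-bit pattern 0x0706050403020100. */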

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;

  asm("mtc1 %1, %0\n\t"
      "mtc1 %2, $f0\n\t"
      "punpcklhw %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
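
/* Example (illustrative): each two-bit field selects the source halfword
   for one destination halfword, with fp0 controlling the lowest.  Thus
   _MM_SHUFFLE(3, 3, 3, 3) == 0xFF broadcasts halfword 3, and
   _MM_SHUFFLE(1, 0, 1, 0) == 0x44 duplicates the low 32 bits (as used by
   _mm_set_pi32() below). */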

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi32(uint32_t __i1, uint32_t __i0)
{
  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 << 0);

    return *(__m64 *)&val;
  } else if (__i1 == __i0) {
    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
    __m64 ret;

    asm("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
       );

    return ret;
  } else {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 << 0);

    return *(__m64 *)&val;
  }
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi8(uint8_t __b0)
{
  __m64 ret;

  asm("sll $8, %1, 8\n\t"
      "or %1, %1, $8\n\t"
      "mtc1 %1, %0\n\t"
      "mtc1 $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__b0)
      : "$8", "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi16(uint16_t __h0)
{
  __m64 ret;

  asm("mtc1 %1, %0\n\t"
      "mtc1 $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__h0)
      : "$8", "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi32(unsigned __i0)
{
  return _mm_set_pi32(__i0, __i0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
{
  return _mm_set_pi8(__h7, __h6, __h5, __h4,
                     __h3, __h2, __h1, __h0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
{
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
{
  return _mm_set_pi32(__i1, __i0);
}


/********** Arithmetic Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
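
/* Example (illustrative): with unsigned saturation, per-byte sums that
   would overflow clamp to 255; e.g. 200 + 100 yields 255 rather than
   wrapping to 44. */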

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaddhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
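
/* Example (illustrative): pmaddhw multiplies the four signed 16-bit lanes
   pairwise and then adds adjacent products, so for
   __m1 = { a0, a1, a2, a3 } and __m2 = { b0, b1, b2, b3 } the result is
   the two signed 32-bit lanes { a0*b0 + a1*b1, a2*b2 + a3*b3 }. */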

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline int FUNCTION_ATTRIBS
_mm_movemask_pi8(__m64 __m1)
{
  int ret;

  asm("pmovmskb %0, %1\n\t"
      : "=r" (ret)
      : "y" (__m1)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhuh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmullh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mul_pu32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmuluw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sad_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psadbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
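
/* Illustrative note: psadbh is assumed here to behave like MMX PSADBW,
   summing the absolute differences of the eight unsigned byte lanes into
   a single small total in the low element of the result. */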


extern __inline __m64 FUNCTION_ATTRIBS
_mm_asub_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pasubub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_biadd_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("biadd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Logical Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_and_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("and %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("andn %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m32 FUNCTION_ATTRIBS
_mm_or_si32(__m32 __m1, __m32 __m2)
{
  __m32 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("xor %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Shift Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsll %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsrl %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrah %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}
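
/* Example (illustrative): the arithmetic shifts (psrah/psraw/dsra)
   replicate the sign bit, whereas the logical shifts above fill with
   zeros; shifting the halfword 0x8000 right by 4 gives 0xF800 with psrah
   but 0x0800 with psrlh. */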

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psraw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsra %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}


/********** Conversion Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
to_m64(uint64_t x)
{
  return *(__m64 *)&x;
}

extern __inline uint64_t FUNCTION_ATTRIBS
to_uint64(__m64 x)
{
  return *(uint64_t *)&x;
}
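
/* These helpers only reinterpret bits; they compile to no code.  A
   strict-aliasing-safe equivalent (illustrative sketch, not used by this
   header) would be:

     __m64 to_m64_alt(uint64_t x)
     {
       __m64 ret;
       memcpy(&ret, &x, sizeof(ret));    (requires <string.h>)
       return ret;
     }
 */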


/********** Comparison Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmplth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}


/********** Miscellaneous Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsshb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packushb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
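
/* Example (illustrative): packushb narrows eight signed 16-bit inputs to
   eight bytes with unsigned saturation, so 300 becomes 255 and -5 becomes
   0.  _mm_store_pi32() below relies on this clamping before its 32-bit
   store. */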

extern __inline __m64 FUNCTION_ATTRIBS
_mm_extract_pi16(__m64 __m, int64_t __pos)
{
  __m64 ret;

  asm("pextrh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__pos)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
{
  __m64 ret;

  switch (__pos) {
  case 0:
    asm("pinsrh_0 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  case 1:
    asm("pinsrh_1 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  case 2:
    asm("pinsrh_2 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  case 3:
    asm("pinsrh_3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  }

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_shuffle_pi16(__m64 __m, int64_t __n)
{
  __m64 ret;

  asm("pshufh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__n)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
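
/* Example (illustrative): punpcklbh interleaves the low four bytes of its
   operands, so __m1 = 0x0706050403020100 and __m2 = 0 produce
   0x0003000200010000, i.e. the low bytes zero-extended to halfwords.  The
   _mm_loadlo_pi8*() helpers below widen pixel data this way. */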

/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
   which preserves the data. */

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32
   datatype, which allows load8888 to use 32-bit loads. */

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline void FUNCTION_ATTRIBS
_mm_store_pi32(__m32 *dest, __m64 src)
{
  src = _mm_packs_pu16(src, _mm_setzero_si64());

  asm("swc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

extern __inline void FUNCTION_ATTRIBS
_mm_store_si64(__m64 *dest, __m64 src)
{
  asm("gssdlc1 %1, 7+%0\n\t"
      "gssdrc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

extern __inline __m32 FUNCTION_ATTRIBS
_mm_load_si32(const __m32 *src)
{
  __m32 ret;

  asm("lwc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si64(const __m64 *src)
{
  __m64 ret;

  asm("ldc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
      : "memory"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadu_si64(const __m64 *src)
{
  __m64 ret;

  asm("gsldlc1 %0, 7(%1)\n\t"
      "gsldrc1 %0, 0(%1)\n\t"
      : "=f" (ret)
      : "r" (src)
      : "memory"
     );

  return ret;
}
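
/* Illustrative note: the gsldlc1/gsldrc1 pair performs an unaligned 64-bit
   load, so this variant is the safe choice when src may not be 8-byte
   aligned, e.g. reading at a byte offset into a hypothetical uint8_t
   buffer:

     __m64 v = _mm_loadu_si64((const __m64 *)(buf + 3));

   _mm_load_si64() above, by contrast, assumes natural alignment. */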

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8(const uint32_t *src)
{
  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8_f(__m64 src)
{
  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi8_f(__m64 src)
{
  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16(__m64 src)
{
  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16_f(__m64 src)
{
  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16(__m64 src)
{
  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16_f(__m64 src)
{
  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha_rev(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
}
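
/* Illustrative note: assuming a pixel unpacked to four halfwords ordered
   { B, G, R, A } from low to high (as in load8888-style code),
   _mm_expand_alpha() broadcasts the A halfword into all four lanes, ready
   for per-channel multiplication; _mm_expand_alpha_rev() does the same for
   the opposite halfword order. */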

#endif /* __LOONGSON_MMINTRIN_H__ */