/* The gcc-provided loongson intrinsic functions are way too fucking broken
 * to be of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead they
 *   implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *	punpcklwd	$f9,$f9,$f5
 *	    dmtc1	v0,$f8
 *	punpcklwd	$f19,$f19,$f5
 *	    dmfc1	t9,$f9
 *	    dmtc1	v0,$f9
 *	    dmtc1	t9,$f20
 *	    dmfc1	s0,$f19
 *	punpcklbh	$f20,$f20,$f2
 *
 *   where crap just gets copied back and forth between integer and floating-
 *   point registers ad nauseam.
 *
 * Instead of trying to work around the problems with these crap intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 */

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float  __m32;
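
/* Illustrative only (a hypothetical caller, not part of this header): a
 * load8888-style helper can read an a8r8g8b8 pixel with a single 32-bit
 * load by going through __m32, e.g.
 *
 *	static __m64
 *	load8888 (const uint32_t *p)
 *	{
 *		return _mm_unpacklo_pi8_f (*(__m32 *)p, _mm_setzero_si64 ());
 *	}
 */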

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
	return 0.0;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddush %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddusb %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("and %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pcmpeqw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

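/* Vectors live in ordinary floating-point registers here, with no MMX-style
 * aliasing of x87 state to undo, so there is nothing for an EMMS equivalent
 * to do; this is a deliberate no-op kept for source compatibility with the
 * x86 _mm_empty. */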
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmaddhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmulhuh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmullh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("or %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("packushb %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("packsswh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
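/* Each two-bit field selects the source halfword for the corresponding
 * result slot, so _MM_SHUFFLE (0, 0, 0, 0) == 0x00 broadcasts element 0 and
 * _MM_SHUFFLE (3, 2, 1, 0) == 0xe4 is the identity shuffle. */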
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
	if (__builtin_constant_p (__w3) &&
	    __builtin_constant_p (__w2) &&
	    __builtin_constant_p (__w1) &&
	    __builtin_constant_p (__w0))
	{
		uint64_t val = ((uint64_t)__w3 << 48)
			     | ((uint64_t)__w2 << 32)
			     | ((uint64_t)__w1 << 16)
			     | ((uint64_t)__w0 <<  0);
		return *(__m64 *)&val;
	}
	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
	{
		/* TODO: handle other cases */
		uint64_t val = __w3;
		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
		__m64 ret;
		asm("pshufh %0, %1, %2\n\t"
		    : "=f" (ret)
		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
		);
		return ret;
	} else {
		uint64_t val = ((uint64_t)__w3 << 48)
			     | ((uint64_t)__w2 << 32)
			     | ((uint64_t)__w1 << 16)
			     | ((uint64_t)__w0 <<  0);
		return *(__m64 *)&val;
	}
}
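
/* For example, _mm_set_pi16 (1, 2, 3, 4) above folds, with constant
 * arguments, to loading the single 64-bit constant 0x0001000200030004, while
 * four equal run-time arguments are broadcast with one pshufh rather than
 * assembled with shifts and ors. */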

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
	if (__builtin_constant_p (__i1) &&
	    __builtin_constant_p (__i0))
	{
		uint64_t val = ((uint64_t)__i1 << 32)
			     | ((uint64_t)__i0 <<  0);
		return *(__m64 *)&val;
	}
	else if (__i1 == __i0)
	{
		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
		__m64 ret;
		asm("pshufh %0, %1, %2\n\t"
		    : "=f" (ret)
		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
		);
		return ret;
	} else {
		uint64_t val = ((uint64_t)__i1 << 32)
			     | ((uint64_t)__i0 <<  0);
		return *(__m64 *)&val;
	}
}
#undef _MM_SHUFFLE

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
	__m64 ret;
	asm("pshufh %0, %1, %2\n\t"
	    : "=f" (ret)
	    : "f" (__m), "f" (*(__m64 *)&__n)
	);
	return ret;
}
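
/* Unlike the x86 intrinsic, which takes an 8-bit immediate, the selector in
 * _mm_shuffle_pi16 above is reinterpreted into a floating-point register,
 * since pshufh reads it from a register operand rather than an immediate. */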

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psllh  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}
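
/* The 64-bit whole-register shifts below are the ones gcc provides no
 * intrinsics for (see the comment at the top of this file); dsll and dsrl
 * here are presumably the Loongson MMI forms operating on the floating-point
 * registers, as the "f" constraints require. */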
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("dsll  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psrlh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psrlw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("dsrl  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("psubh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpckhbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpckhhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32 datatype which
 * allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("xor %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
	__m64 ret;
	asm("pextrh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__pos)
	);
	return ret;
}

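/* __pos is pasted into the mnemonic itself (pinsrh_0 .. pinsrh_3) through
 * the "i" constraint, so callers must pass a compile-time constant. */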
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
	__m64 ret;
	asm("pinsrh_%3 %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2), "i" (__pos)
	);
	return ret;
}