1 /*
2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
5 *
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
15 *
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * SOFTWARE.
24 *
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28 *
29 * Based on work by Owen Taylor
30 */
31
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
46
47 #ifdef VERBOSE
48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
49 #else
50 #define CHECKPOINT()
51 #endif
52
53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56 _mm_empty (void)
57 {
58
59 }
60 #endif
61
62 #ifdef USE_X86_MMX
63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64 # include <xmmintrin.h>
65 # else
66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67 * instructions to be generated that we don't want. Just duplicate the
68 * functions we want to use. */
69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_movemask_pi8 (__m64 __A)
71 {
72 int ret;
73
74 asm ("pmovmskb %1, %0\n\t"
75 : "=r" (ret)
76 : "y" (__A)
77 );
78
79 return ret;
80 }
81
82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
84 {
85 asm ("pmulhuw %1, %0\n\t"
86 : "+y" (__A)
87 : "y" (__B)
88 );
89 return __A;
90 }
91
92 # ifdef __OPTIMIZE__
93 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
94 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
95 {
96 __m64 ret;
97
98 asm ("pshufw %2, %1, %0\n\t"
99 : "=y" (ret)
100 : "y" (__A), "K" (__N)
101 );
102
103 return ret;
104 }
105 # else
106 # define _mm_shuffle_pi16(A, N) \
107 ({ \
108 __m64 ret; \
109 \
110 asm ("pshufw %2, %1, %0\n\t" \
111 : "=y" (ret) \
112 : "y" (A), "K" ((const int8_t)N) \
113 ); \
114 \
115 ret; \
116 })
117 # endif
118 # endif
119 #endif
120
121 #ifndef _MSC_VER
122 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
123 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
124 #endif
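/* The shuffle constant packs four 2-bit word selectors, highest lane first.
 * For example, _MM_SHUFFLE (3, 2, 1, 0) == 0xe4 (the identity shuffle) and
 * _MM_SHUFFLE (3, 3, 3, 3) == 0xff (broadcast the top word). */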
125
126 /* Notes about writing mmx code
127 *
128 * Give memory operands as the second operand. If you give one as the
129 * first operand, gcc will first load it into a register and then use
130 * that register.
131 *
132 * I.e. use
133 *
134 * _mm_mullo_pi16 (x, mmx_constant);
135 *
136 * not
137 *
138 * _mm_mullo_pi16 (mmx_constant, x);
139 *
140 * Also try to minimize dependencies, i.e. when you need a value, try
141 * to calculate it from a value that was calculated as early as
142 * possible.
143 */
144
145 /* --------------- MMX primitives ------------------------------------- */
146
147 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
148 * the name of the member used to access the data.
149 * If __m64 requires using mm_cvt* intrinsics functions to convert between
150 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
151 * If __m64 and uint64_t values can just be cast to each other directly,
152 * then define USE_M64_CASTS.
153 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
154 */
155 #ifdef _MSC_VER
156 # define M64_MEMBER m64_u64
157 #elif defined(__ICC)
158 # define USE_CVT_INTRINSICS
159 #elif defined(USE_LOONGSON_MMI)
160 # define USE_M64_DOUBLE
161 #elif defined(__GNUC__)
162 # define USE_M64_CASTS
163 #elif defined(__SUNPRO_C)
164 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
165 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
166 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
167 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
168 */
169 # define USE_CVT_INTRINSICS
170 # else
171 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
172 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
173 */
174 # define M64_MEMBER l_
175 # endif
176 #endif
177
178 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
179 typedef uint64_t mmxdatafield;
180 #else
181 typedef __m64 mmxdatafield;
182 #endif
183
184 typedef struct
185 {
186 mmxdatafield mmx_4x00ff;
187 mmxdatafield mmx_4x0080;
188 mmxdatafield mmx_565_rgb;
189 mmxdatafield mmx_565_unpack_multiplier;
190 mmxdatafield mmx_565_pack_multiplier;
191 mmxdatafield mmx_565_r;
192 mmxdatafield mmx_565_g;
193 mmxdatafield mmx_565_b;
194 mmxdatafield mmx_packed_565_rb;
195 mmxdatafield mmx_packed_565_g;
196 mmxdatafield mmx_expand_565_g;
197 mmxdatafield mmx_expand_565_b;
198 mmxdatafield mmx_expand_565_r;
199 #ifndef USE_LOONGSON_MMI
200 mmxdatafield mmx_mask_0;
201 mmxdatafield mmx_mask_1;
202 mmxdatafield mmx_mask_2;
203 mmxdatafield mmx_mask_3;
204 #endif
205 mmxdatafield mmx_full_alpha;
206 mmxdatafield mmx_4x0101;
207 mmxdatafield mmx_ff000000;
208 } mmx_data_t;
209
210 #if defined(_MSC_VER)
211 # define MMXDATA_INIT(field, val) { val ## UI64 }
212 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
213 # define MMXDATA_INIT(field, val) field = { val ## ULL }
214 #else /* mmxdatafield is an integral type */
215 # define MMXDATA_INIT(field, val) field = val ## ULL
216 #endif
217
218 static const mmx_data_t c =
219 {
220 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
221 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
222 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
223 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
224 MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
225 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
226 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
227 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
228 MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
229 MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
230 MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
231 MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
232 MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
233 #ifndef USE_LOONGSON_MMI
234 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
235 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
236 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
237 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
238 #endif
239 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
240 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
241 MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
242 };
243
244 #ifdef USE_CVT_INTRINSICS
245 # define MC(x) to_m64 (c.mmx_ ## x)
246 #elif defined(USE_M64_CASTS)
247 # define MC(x) ((__m64)c.mmx_ ## x)
248 #elif defined(USE_M64_DOUBLE)
249 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
250 #else
251 # define MC(x) c.mmx_ ## x
252 #endif
253
254 static force_inline __m64
255 to_m64 (uint64_t x)
256 {
257 #ifdef USE_CVT_INTRINSICS
258 return _mm_cvtsi64_m64 (x);
259 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
260 __m64 res;
261
262 res.M64_MEMBER = x;
263 return res;
264 #elif defined USE_M64_DOUBLE
265 return *(__m64 *)&x;
266 #else /* USE_M64_CASTS */
267 return (__m64)x;
268 #endif
269 }
270
271 static force_inline uint64_t
272 to_uint64 (__m64 x)
273 {
274 #ifdef USE_CVT_INTRINSICS
275 return _mm_cvtm64_si64 (x);
276 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
277 uint64_t res = x.M64_MEMBER;
278 return res;
279 #elif defined USE_M64_DOUBLE
280 return *(uint64_t *)&x;
281 #else /* USE_M64_CASTS */
282 return (uint64_t)x;
283 #endif
284 }
285
286 static force_inline __m64
287 shift (__m64 v,
288 int s)
289 {
290 if (s > 0)
291 return _mm_slli_si64 (v, s);
292 else if (s < 0)
293 return _mm_srli_si64 (v, -s);
294 else
295 return v;
296 }
297
298 static force_inline __m64
299 negate (__m64 mask)
300 {
301 return _mm_xor_si64 (mask, MC (4x00ff));
302 }
303
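/* Per-channel multiply of two unpacked pixels, i.e. round (a * b / 255) on
 * each 16-bit component: adding 0x0080 and taking the high half of a
 * multiply by 0x0101 is the usual exact divide-by-255 rounding trick. */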
304 static force_inline __m64
305 pix_multiply (__m64 a, __m64 b)
306 {
307 __m64 res;
308
309 res = _mm_mullo_pi16 (a, b);
310 res = _mm_adds_pu16 (res, MC (4x0080));
311 res = _mm_mulhi_pu16 (res, MC (4x0101));
312
313 return res;
314 }
315
316 static force_inline __m64
317 pix_add (__m64 a, __m64 b)
318 {
319 return _mm_adds_pu8 (a, b);
320 }
321
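/* Broadcast the alpha component (word 3 of an unpacked pixel) into all four
 * words; expand_alpha_rev below does the same with word 0. */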
322 static force_inline __m64
323 expand_alpha (__m64 pixel)
324 {
325 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
326 }
327
328 static force_inline __m64
329 expand_alpha_rev (__m64 pixel)
330 {
331 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
332 }
333
334 static force_inline __m64
335 invert_colors (__m64 pixel)
336 {
337 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
338 }
339
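/* Porter-Duff OVER for premultiplied pixels: src + dest * (1 - srca), with
 * srca passed separately so callers can reuse an already-expanded alpha. */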
340 static force_inline __m64
341 over (__m64 src,
342 __m64 srca,
343 __m64 dest)
344 {
345 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
346 }
347
348 static force_inline __m64
349 over_rev_non_pre (__m64 src, __m64 dest)
350 {
351 __m64 srca = expand_alpha (src);
352 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
353
354 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
355 }
356
357 static force_inline __m64
358 in (__m64 src, __m64 mask)
359 {
360 return pix_multiply (src, mask);
361 }
362
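/* in_over computes (src IN mask) OVER dest, i.e. with channels normalized
 * to [0, 1]: dest = src * mask + dest * (1 - srca * mask). */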
363 #ifndef _MSC_VER
364 static force_inline __m64
365 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
366 {
367 return over (in (src, mask), pix_multiply (srca, mask), dest);
368 }
369
370 #else
371
372 #define in_over(src, srca, mask, dest) \
373 over (in (src, mask), pix_multiply (srca, mask), dest)
374
375 #endif
376
377 /* Elemental unaligned loads */
378
379 static force_inline __m64 ldq_u(__m64 *p)
380 {
381 #ifdef USE_X86_MMX
382 /* x86's alignment restrictions are very relaxed. */
383 return *(__m64 *)p;
384 #elif defined USE_ARM_IWMMXT
385 int align = (uintptr_t)p & 7;
386 __m64 *aligned_p;
387 if (align == 0)
388 return *p;
389 aligned_p = (__m64 *)((uintptr_t)p & ~7);
390 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
391 #else
392 struct __una_u64 { __m64 x __attribute__((packed)); };
393 const struct __una_u64 *ptr = (const struct __una_u64 *) p;
394 return (__m64) ptr->x;
395 #endif
396 }
397
398 static force_inline uint32_t ldl_u(const uint32_t *p)
399 {
400 #ifdef USE_X86_MMX
401 /* x86's alignment restrictions are very relaxed. */
402 return *p;
403 #else
404 struct __una_u32 { uint32_t x __attribute__((packed)); };
405 const struct __una_u32 *ptr = (const struct __una_u32 *) p;
406 return ptr->x;
407 #endif
408 }
409
410 static force_inline __m64
411 load (const uint32_t *v)
412 {
413 #ifdef USE_LOONGSON_MMI
414 __m64 ret;
415 asm ("lwc1 %0, %1\n\t"
416 : "=f" (ret)
417 : "m" (*v)
418 );
419 return ret;
420 #else
421 return _mm_cvtsi32_si64 (*v);
422 #endif
423 }
424
425 static force_inline __m64
426 load8888 (const uint32_t *v)
427 {
428 #ifdef USE_LOONGSON_MMI
429 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
430 #else
431 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
432 #endif
433 }
434
435 static force_inline __m64
436 load8888u (const uint32_t *v)
437 {
438 uint32_t l = ldl_u (v);
439 return load8888 (&l);
440 }
441
442 static force_inline __m64
443 pack8888 (__m64 lo, __m64 hi)
444 {
445 return _mm_packs_pu16 (lo, hi);
446 }
447
448 static force_inline void
449 store (uint32_t *dest, __m64 v)
450 {
451 #ifdef USE_LOONGSON_MMI
452 asm ("swc1 %1, %0\n\t"
453 : "=m" (*dest)
454 : "f" (v)
455 : "memory"
456 );
457 #else
458 *dest = _mm_cvtsi64_si32 (v);
459 #endif
460 }
461
462 static force_inline void
463 store8888 (uint32_t *dest, __m64 v)
464 {
465 v = pack8888 (v, _mm_setzero_si64 ());
466 store (dest, v);
467 }
468
469 static force_inline pixman_bool_t
470 is_equal (__m64 a, __m64 b)
471 {
472 #ifdef USE_LOONGSON_MMI
473 /* __m64 is double, we can compare directly. */
474 return a == b;
475 #else
476 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
477 #endif
478 }
479
480 static force_inline pixman_bool_t
481 is_opaque (__m64 v)
482 {
483 #ifdef USE_LOONGSON_MMI
484 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
485 #else
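/* Byte 6 of an unpacked pixel is its alpha byte, so bit 6 of the movemask
 * is set exactly when alpha == 0xff. */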
486 __m64 ffs = _mm_cmpeq_pi8 (v, v);
487 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
488 #endif
489 }
490
491 static force_inline pixman_bool_t
492 is_zero (__m64 v)
493 {
494 return is_equal (v, _mm_setzero_si64 ());
495 }
496
497 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
498 *
499 * 00RR00GG00BB
500 *
501 * --- Expanding 565 in the low word ---
502 *
503 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
504 * m = m & (01f0003f001f);
505 * m = m * (008404100840);
506 * m = m >> 8;
507 *
508 * Note the trick here - the top word is shifted by another nibble to
509 * avoid it bumping into the middle word
510 */
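/* For example, with pos = 0 the 565 pixel 0xf81f (r = 31, g = 0, b = 31)
 * expands to 0x000000ff000000ff: each 5-bit channel v becomes
 * (v << 3) | (v >> 2) and the 6-bit green becomes (g << 2) | (g >> 4). */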
511 static force_inline __m64
512 expand565 (__m64 pixel, int pos)
513 {
514 __m64 p = pixel;
515 __m64 t1, t2;
516
517 /* move pixel to low 16 bit and zero the rest */
518 #ifdef USE_LOONGSON_MMI
519 p = loongson_extract_pi16 (p, pos);
520 #else
521 p = shift (shift (p, (3 - pos) * 16), -48);
522 #endif
523
524 t1 = shift (p, 36 - 11);
525 t2 = shift (p, 16 - 5);
526
527 p = _mm_or_si64 (t1, p);
528 p = _mm_or_si64 (t2, p);
529 p = _mm_and_si64 (p, MC (565_rgb));
530
531 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
532 return _mm_srli_pi16 (pixel, 8);
533 }
534
535 /* Expand 4 packed 565 pixels in an mmx register into two mmx registers of
536 *
537 * AARRGGBB AARRGGBB
538 */
539 static force_inline void
540 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
541 {
542 __m64 t0, t1, alpha = _mm_setzero_si64 ();
543 __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
544 __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
545 __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
546 if (full_alpha)
547 alpha = _mm_cmpeq_pi32 (alpha, alpha);
548
549 /* Replicate high bits into empty low bits. */
550 r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
551 g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
552 b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
553
554 r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
555 g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
556 b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
557
558 t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
559 t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
560
561 *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
562 *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
563 }
564
565 static force_inline __m64
566 expand8888 (__m64 in, int pos)
567 {
568 if (pos == 0)
569 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
570 else
571 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
572 }
573
574 static force_inline __m64
575 expandx888 (__m64 in, int pos)
576 {
577 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
578 }
579
580 static force_inline void
581 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
582 {
583 __m64 v0, v1;
584 expand_4xpacked565 (vin, &v0, &v1, full_alpha);
585 *vout0 = expand8888 (v0, 0);
586 *vout1 = expand8888 (v0, 1);
587 *vout2 = expand8888 (v1, 0);
588 *vout3 = expand8888 (v1, 1);
589 }
590
591 static force_inline __m64
592 pack_565 (__m64 pixel, __m64 target, int pos)
593 {
594 __m64 p = pixel;
595 __m64 t = target;
596 __m64 r, g, b;
597
598 r = _mm_and_si64 (p, MC (565_r));
599 g = _mm_and_si64 (p, MC (565_g));
600 b = _mm_and_si64 (p, MC (565_b));
601
602 #ifdef USE_LOONGSON_MMI
603 r = shift (r, -(32 - 8));
604 g = shift (g, -(16 - 3));
605 b = shift (b, -(0 + 3));
606
607 p = _mm_or_si64 (r, g);
608 p = _mm_or_si64 (p, b);
609 return loongson_insert_pi16 (t, p, pos);
610 #else
611 r = shift (r, -(32 - 8) + pos * 16);
612 g = shift (g, -(16 - 3) + pos * 16);
613 b = shift (b, -(0 + 3) + pos * 16);
614
615 if (pos == 0)
616 t = _mm_and_si64 (t, MC (mask_0));
617 else if (pos == 1)
618 t = _mm_and_si64 (t, MC (mask_1));
619 else if (pos == 2)
620 t = _mm_and_si64 (t, MC (mask_2));
621 else if (pos == 3)
622 t = _mm_and_si64 (t, MC (mask_3));
623
624 p = _mm_or_si64 (r, t);
625 p = _mm_or_si64 (g, p);
626
627 return _mm_or_si64 (b, p);
628 #endif
629 }
630
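/* Pack two registers of two packed 8888 pixels each into four 565 pixels.
 * The madd with 565_pack_multiplier multiplies each pixel's blue byte by 4
 * and its red byte by 0x2000 and sums them, which lands both in their 565
 * bit positions (before the final shift right by 5); green is masked and
 * OR'd in separately. */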
631 static force_inline __m64
632 pack_4xpacked565 (__m64 a, __m64 b)
633 {
634 __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
635 __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
636
637 __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
638 __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
639
640 __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
641 __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
642
643 t0 = _mm_or_si64 (t0, g0);
644 t1 = _mm_or_si64 (t1, g1);
645
646 t0 = shift(t0, -5);
647 #ifdef USE_ARM_IWMMXT
648 t1 = shift(t1, -5);
649 return _mm_packs_pu32 (t0, t1);
650 #else
651 t1 = shift(t1, -5 + 16);
652 return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
653 #endif
654 }
655
656 #ifndef _MSC_VER
657
658 static force_inline __m64
659 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
660 {
661 return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
662 }
663
664 static force_inline __m64
665 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
666 {
667 x = pix_multiply (x, a);
668 y = pix_multiply (y, b);
669
670 return pix_add (x, y);
671 }
672
673 #else
674
675 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
676
677 #define pack_4x565(v0, v1, v2, v3) \
678 pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
679
680 #define pix_add_mul(x, a, y, b) \
681 ( x = pix_multiply (x, a), \
682 y = pix_multiply (y, b), \
683 pix_add (x, y) )
684
685 #endif
686
687 /* --------------- MMX code patch for fbcompose.c --------------------- */
688
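/* Load a source pixel and, when a mask is present, multiply it by the
 * mask's expanded alpha. This is the common entry point for the *_u
 * (unified alpha) combiners below. */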
689 static force_inline __m64
690 combine (const uint32_t *src, const uint32_t *mask)
691 {
692 __m64 vsrc = load8888 (src);
693
694 if (mask)
695 {
696 __m64 m = load8888 (mask);
697
698 m = expand_alpha (m);
699 vsrc = pix_multiply (vsrc, m);
700 }
701
702 return vsrc;
703 }
704
705 static force_inline __m64
706 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
707 {
708 vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
709
710 if (is_opaque (vsrc))
711 {
712 return vsrc;
713 }
714 else if (!is_zero (vsrc))
715 {
716 return over (vsrc, expand_alpha (vsrc),
717 _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
718 }
719
720 return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
721 }
722
723 static void
724 mmx_combine_over_u (pixman_implementation_t *imp,
725 pixman_op_t op,
726 uint32_t * dest,
727 const uint32_t * src,
728 const uint32_t * mask,
729 int width)
730 {
731 const uint32_t *end = dest + width;
732
733 while (dest < end)
734 {
735 __m64 vsrc = combine (src, mask);
736
737 if (is_opaque (vsrc))
738 {
739 store8888 (dest, vsrc);
740 }
741 else if (!is_zero (vsrc))
742 {
743 __m64 sa = expand_alpha (vsrc);
744 store8888 (dest, over (vsrc, sa, load8888 (dest)));
745 }
746
747 ++dest;
748 ++src;
749 if (mask)
750 ++mask;
751 }
752 _mm_empty ();
753 }
754
755 static void
756 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
757 pixman_op_t op,
758 uint32_t * dest,
759 const uint32_t * src,
760 const uint32_t * mask,
761 int width)
762 {
763 const uint32_t *end = dest + width;
764
765 while (dest < end)
766 {
767 __m64 d, da;
768 __m64 s = combine (src, mask);
769
770 d = load8888 (dest);
771 da = expand_alpha (d);
772 store8888 (dest, over (d, da, s));
773
774 ++dest;
775 ++src;
776 if (mask)
777 mask++;
778 }
779 _mm_empty ();
780 }
781
782 static void
783 mmx_combine_in_u (pixman_implementation_t *imp,
784 pixman_op_t op,
785 uint32_t * dest,
786 const uint32_t * src,
787 const uint32_t * mask,
788 int width)
789 {
790 const uint32_t *end = dest + width;
791
792 while (dest < end)
793 {
794 __m64 a;
795 __m64 x = combine (src, mask);
796
797 a = load8888 (dest);
798 a = expand_alpha (a);
799 x = pix_multiply (x, a);
800
801 store8888 (dest, x);
802
803 ++dest;
804 ++src;
805 if (mask)
806 mask++;
807 }
808 _mm_empty ();
809 }
810
811 static void
812 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
813 pixman_op_t op,
814 uint32_t * dest,
815 const uint32_t * src,
816 const uint32_t * mask,
817 int width)
818 {
819 const uint32_t *end = dest + width;
820
821 while (dest < end)
822 {
823 __m64 a = combine (src, mask);
824 __m64 x;
825
826 x = load8888 (dest);
827 a = expand_alpha (a);
828 x = pix_multiply (x, a);
829 store8888 (dest, x);
830
831 ++dest;
832 ++src;
833 if (mask)
834 mask++;
835 }
836 _mm_empty ();
837 }
838
839 static void
840 mmx_combine_out_u (pixman_implementation_t *imp,
841 pixman_op_t op,
842 uint32_t * dest,
843 const uint32_t * src,
844 const uint32_t * mask,
845 int width)
846 {
847 const uint32_t *end = dest + width;
848
849 while (dest < end)
850 {
851 __m64 a;
852 __m64 x = combine (src, mask);
853
854 a = load8888 (dest);
855 a = expand_alpha (a);
856 a = negate (a);
857 x = pix_multiply (x, a);
858 store8888 (dest, x);
859
860 ++dest;
861 ++src;
862 if (mask)
863 mask++;
864 }
865 _mm_empty ();
866 }
867
868 static void
869 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
870 pixman_op_t op,
871 uint32_t * dest,
872 const uint32_t * src,
873 const uint32_t * mask,
874 int width)
875 {
876 const uint32_t *end = dest + width;
877
878 while (dest < end)
879 {
880 __m64 a = combine (src, mask);
881 __m64 x;
882
883 x = load8888 (dest);
884 a = expand_alpha (a);
885 a = negate (a);
886 x = pix_multiply (x, a);
887
888 store8888 (dest, x);
889
890 ++dest;
891 ++src;
892 if (mask)
893 mask++;
894 }
895 _mm_empty ();
896 }
897
898 static void
899 mmx_combine_atop_u (pixman_implementation_t *imp,
900 pixman_op_t op,
901 uint32_t * dest,
902 const uint32_t * src,
903 const uint32_t * mask,
904 int width)
905 {
906 const uint32_t *end = dest + width;
907
908 while (dest < end)
909 {
910 __m64 da, d, sia;
911 __m64 s = combine (src, mask);
912
913 d = load8888 (dest);
914 sia = expand_alpha (s);
915 sia = negate (sia);
916 da = expand_alpha (d);
917 s = pix_add_mul (s, da, d, sia);
918 store8888 (dest, s);
919
920 ++dest;
921 ++src;
922 if (mask)
923 mask++;
924 }
925 _mm_empty ();
926 }
927
928 static void
929 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
930 pixman_op_t op,
931 uint32_t * dest,
932 const uint32_t * src,
933 const uint32_t * mask,
934 int width)
935 {
936 const uint32_t *end;
937
938 end = dest + width;
939
940 while (dest < end)
941 {
942 __m64 dia, d, sa;
943 __m64 s = combine (src, mask);
944
945 d = load8888 (dest);
946 sa = expand_alpha (s);
947 dia = expand_alpha (d);
948 dia = negate (dia);
949 s = pix_add_mul (s, dia, d, sa);
950 store8888 (dest, s);
951
952 ++dest;
953 ++src;
954 if (mask)
955 mask++;
956 }
957 _mm_empty ();
958 }
959
960 static void
961 mmx_combine_xor_u (pixman_implementation_t *imp,
962 pixman_op_t op,
963 uint32_t * dest,
964 const uint32_t * src,
965 const uint32_t * mask,
966 int width)
967 {
968 const uint32_t *end = dest + width;
969
970 while (dest < end)
971 {
972 __m64 dia, d, sia;
973 __m64 s = combine (src, mask);
974
975 d = load8888 (dest);
976 sia = expand_alpha (s);
977 dia = expand_alpha (d);
978 sia = negate (sia);
979 dia = negate (dia);
980 s = pix_add_mul (s, dia, d, sia);
981 store8888 (dest, s);
982
983 ++dest;
984 ++src;
985 if (mask)
986 mask++;
987 }
988 _mm_empty ();
989 }
990
991 static void
992 mmx_combine_add_u (pixman_implementation_t *imp,
993 pixman_op_t op,
994 uint32_t * dest,
995 const uint32_t * src,
996 const uint32_t * mask,
997 int width)
998 {
999 const uint32_t *end = dest + width;
1000
1001 while (dest < end)
1002 {
1003 __m64 d;
1004 __m64 s = combine (src, mask);
1005
1006 d = load8888 (dest);
1007 s = pix_add (s, d);
1008 store8888 (dest, s);
1009
1010 ++dest;
1011 ++src;
1012 if (mask)
1013 mask++;
1014 }
1015 _mm_empty ();
1016 }
1017
1018 static void
1019 mmx_combine_saturate_u (pixman_implementation_t *imp,
1020 pixman_op_t op,
1021 uint32_t * dest,
1022 const uint32_t * src,
1023 const uint32_t * mask,
1024 int width)
1025 {
1026 const uint32_t *end = dest + width;
1027
1028 while (dest < end)
1029 {
1030 uint32_t s, sa, da;
1031 uint32_t d = *dest;
1032 __m64 ms = combine (src, mask);
1033 __m64 md = load8888 (dest);
1034
1035 store8888(&s, ms);
1036 da = ~d >> 24;
1037 sa = s >> 24;
1038
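/* da is the complement of the destination alpha, i.e. the remaining
 * headroom; if the source alpha exceeds it, scale the source down by
 * da / sa before adding. */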
1039 if (sa > da)
1040 {
1041 uint32_t quot = DIV_UN8 (da, sa) << 24;
1042 __m64 msa = load8888 (&quot);
1043 msa = expand_alpha (msa);
1044 ms = pix_multiply (ms, msa);
1045 }
1046
1047 md = pix_add (md, ms);
1048 store8888 (dest, md);
1049
1050 ++src;
1051 ++dest;
1052 if (mask)
1053 mask++;
1054 }
1055 _mm_empty ();
1056 }
1057
1058 static void
1059 mmx_combine_src_ca (pixman_implementation_t *imp,
1060 pixman_op_t op,
1061 uint32_t * dest,
1062 const uint32_t * src,
1063 const uint32_t * mask,
1064 int width)
1065 {
1066 const uint32_t *end = src + width;
1067
1068 while (src < end)
1069 {
1070 __m64 a = load8888 (mask);
1071 __m64 s = load8888 (src);
1072
1073 s = pix_multiply (s, a);
1074 store8888 (dest, s);
1075
1076 ++src;
1077 ++mask;
1078 ++dest;
1079 }
1080 _mm_empty ();
1081 }
1082
1083 static void
1084 mmx_combine_over_ca (pixman_implementation_t *imp,
1085 pixman_op_t op,
1086 uint32_t * dest,
1087 const uint32_t * src,
1088 const uint32_t * mask,
1089 int width)
1090 {
1091 const uint32_t *end = src + width;
1092
1093 while (src < end)
1094 {
1095 __m64 a = load8888 (mask);
1096 __m64 s = load8888 (src);
1097 __m64 d = load8888 (dest);
1098 __m64 sa = expand_alpha (s);
1099
1100 store8888 (dest, in_over (s, sa, a, d));
1101
1102 ++src;
1103 ++dest;
1104 ++mask;
1105 }
1106 _mm_empty ();
1107 }
1108
1109 static void
1110 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1111 pixman_op_t op,
1112 uint32_t * dest,
1113 const uint32_t * src,
1114 const uint32_t * mask,
1115 int width)
1116 {
1117 const uint32_t *end = src + width;
1118
1119 while (src < end)
1120 {
1121 __m64 a = load8888 (mask);
1122 __m64 s = load8888 (src);
1123 __m64 d = load8888 (dest);
1124 __m64 da = expand_alpha (d);
1125
1126 store8888 (dest, over (d, da, in (s, a)));
1127
1128 ++src;
1129 ++dest;
1130 ++mask;
1131 }
1132 _mm_empty ();
1133 }
1134
1135 static void
1136 mmx_combine_in_ca (pixman_implementation_t *imp,
1137 pixman_op_t op,
1138 uint32_t * dest,
1139 const uint32_t * src,
1140 const uint32_t * mask,
1141 int width)
1142 {
1143 const uint32_t *end = src + width;
1144
1145 while (src < end)
1146 {
1147 __m64 a = load8888 (mask);
1148 __m64 s = load8888 (src);
1149 __m64 d = load8888 (dest);
1150 __m64 da = expand_alpha (d);
1151
1152 s = pix_multiply (s, a);
1153 s = pix_multiply (s, da);
1154 store8888 (dest, s);
1155
1156 ++src;
1157 ++dest;
1158 ++mask;
1159 }
1160 _mm_empty ();
1161 }
1162
1163 static void
1164 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1165 pixman_op_t op,
1166 uint32_t * dest,
1167 const uint32_t * src,
1168 const uint32_t * mask,
1169 int width)
1170 {
1171 const uint32_t *end = src + width;
1172
1173 while (src < end)
1174 {
1175 __m64 a = load8888 (mask);
1176 __m64 s = load8888 (src);
1177 __m64 d = load8888 (dest);
1178 __m64 sa = expand_alpha (s);
1179
1180 a = pix_multiply (a, sa);
1181 d = pix_multiply (d, a);
1182 store8888 (dest, d);
1183
1184 ++src;
1185 ++dest;
1186 ++mask;
1187 }
1188 _mm_empty ();
1189 }
1190
1191 static void
1192 mmx_combine_out_ca (pixman_implementation_t *imp,
1193 pixman_op_t op,
1194 uint32_t * dest,
1195 const uint32_t * src,
1196 const uint32_t * mask,
1197 int width)
1198 {
1199 const uint32_t *end = src + width;
1200
1201 while (src < end)
1202 {
1203 __m64 a = load8888 (mask);
1204 __m64 s = load8888 (src);
1205 __m64 d = load8888 (dest);
1206 __m64 da = expand_alpha (d);
1207
1208 da = negate (da);
1209 s = pix_multiply (s, a);
1210 s = pix_multiply (s, da);
1211 store8888 (dest, s);
1212
1213 ++src;
1214 ++dest;
1215 ++mask;
1216 }
1217 _mm_empty ();
1218 }
1219
1220 static void
1221 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1222 pixman_op_t op,
1223 uint32_t * dest,
1224 const uint32_t * src,
1225 const uint32_t * mask,
1226 int width)
1227 {
1228 const uint32_t *end = src + width;
1229
1230 while (src < end)
1231 {
1232 __m64 a = load8888 (mask);
1233 __m64 s = load8888 (src);
1234 __m64 d = load8888 (dest);
1235 __m64 sa = expand_alpha (s);
1236
1237 a = pix_multiply (a, sa);
1238 a = negate (a);
1239 d = pix_multiply (d, a);
1240 store8888 (dest, d);
1241
1242 ++src;
1243 ++dest;
1244 ++mask;
1245 }
1246 _mm_empty ();
1247 }
1248
1249 static void
1250 mmx_combine_atop_ca (pixman_implementation_t *imp,
1251 pixman_op_t op,
1252 uint32_t * dest,
1253 const uint32_t * src,
1254 const uint32_t * mask,
1255 int width)
1256 {
1257 const uint32_t *end = src + width;
1258
1259 while (src < end)
1260 {
1261 __m64 a = load8888 (mask);
1262 __m64 s = load8888 (src);
1263 __m64 d = load8888 (dest);
1264 __m64 da = expand_alpha (d);
1265 __m64 sa = expand_alpha (s);
1266
1267 s = pix_multiply (s, a);
1268 a = pix_multiply (a, sa);
1269 a = negate (a);
1270 d = pix_add_mul (d, a, s, da);
1271 store8888 (dest, d);
1272
1273 ++src;
1274 ++dest;
1275 ++mask;
1276 }
1277 _mm_empty ();
1278 }
1279
1280 static void
1281 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1282 pixman_op_t op,
1283 uint32_t * dest,
1284 const uint32_t * src,
1285 const uint32_t * mask,
1286 int width)
1287 {
1288 const uint32_t *end = src + width;
1289
1290 while (src < end)
1291 {
1292 __m64 a = load8888 (mask);
1293 __m64 s = load8888 (src);
1294 __m64 d = load8888 (dest);
1295 __m64 da = expand_alpha (d);
1296 __m64 sa = expand_alpha (s);
1297
1298 s = pix_multiply (s, a);
1299 a = pix_multiply (a, sa);
1300 da = negate (da);
1301 d = pix_add_mul (d, a, s, da);
1302 store8888 (dest, d);
1303
1304 ++src;
1305 ++dest;
1306 ++mask;
1307 }
1308 _mm_empty ();
1309 }
1310
1311 static void
1312 mmx_combine_xor_ca (pixman_implementation_t *imp,
1313 pixman_op_t op,
1314 uint32_t * dest,
1315 const uint32_t * src,
1316 const uint32_t * mask,
1317 int width)
1318 {
1319 const uint32_t *end = src + width;
1320
1321 while (src < end)
1322 {
1323 __m64 a = load8888 (mask);
1324 __m64 s = load8888 (src);
1325 __m64 d = load8888 (dest);
1326 __m64 da = expand_alpha (d);
1327 __m64 sa = expand_alpha (s);
1328
1329 s = pix_multiply (s, a);
1330 a = pix_multiply (a, sa);
1331 da = negate (da);
1332 a = negate (a);
1333 d = pix_add_mul (d, a, s, da);
1334 store8888 (dest, d);
1335
1336 ++src;
1337 ++dest;
1338 ++mask;
1339 }
1340 _mm_empty ();
1341 }
1342
1343 static void
1344 mmx_combine_add_ca (pixman_implementation_t *imp,
1345 pixman_op_t op,
1346 uint32_t * dest,
1347 const uint32_t * src,
1348 const uint32_t * mask,
1349 int width)
1350 {
1351 const uint32_t *end = src + width;
1352
1353 while (src < end)
1354 {
1355 __m64 a = load8888 (mask);
1356 __m64 s = load8888 (src);
1357 __m64 d = load8888 (dest);
1358
1359 s = pix_multiply (s, a);
1360 d = pix_add (s, d);
1361 store8888 (dest, d);
1362
1363 ++src;
1364 ++dest;
1365 ++mask;
1366 }
1367 _mm_empty ();
1368 }
1369
1370 /* ------------- MMX code paths called from fbpict.c -------------------- */
1371
1372 static void
1373 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1374 pixman_composite_info_t *info)
1375 {
1376 PIXMAN_COMPOSITE_ARGS (info);
1377 uint32_t src;
1378 uint32_t *dst_line, *dst;
1379 int32_t w;
1380 int dst_stride;
1381 __m64 vsrc, vsrca;
1382
1383 CHECKPOINT ();
1384
1385 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1386
1387 if (src == 0)
1388 return;
1389
1390 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1391
1392 vsrc = load8888 (&src);
1393 vsrca = expand_alpha (vsrc);
1394
1395 while (height--)
1396 {
1397 dst = dst_line;
1398 dst_line += dst_stride;
1399 w = width;
1400
1401 CHECKPOINT ();
1402
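/* Composite single pixels until dst is 8-byte aligned, then process two
 * pixels per 64-bit MMX load/store. */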
1403 while (w && (uintptr_t)dst & 7)
1404 {
1405 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1406
1407 w--;
1408 dst++;
1409 }
1410
1411 while (w >= 2)
1412 {
1413 __m64 vdest;
1414 __m64 dest0, dest1;
1415
1416 vdest = *(__m64 *)dst;
1417
1418 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1419 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1420
1421 *(__m64 *)dst = pack8888 (dest0, dest1);
1422
1423 dst += 2;
1424 w -= 2;
1425 }
1426
1427 CHECKPOINT ();
1428
1429 if (w)
1430 {
1431 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1432 }
1433 }
1434
1435 _mm_empty ();
1436 }
1437
1438 static void
1439 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1440 pixman_composite_info_t *info)
1441 {
1442 PIXMAN_COMPOSITE_ARGS (info);
1443 uint32_t src;
1444 uint16_t *dst_line, *dst;
1445 int32_t w;
1446 int dst_stride;
1447 __m64 vsrc, vsrca;
1448
1449 CHECKPOINT ();
1450
1451 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1452
1453 if (src == 0)
1454 return;
1455
1456 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1457
1458 vsrc = load8888 (&src);
1459 vsrca = expand_alpha (vsrc);
1460
1461 while (height--)
1462 {
1463 dst = dst_line;
1464 dst_line += dst_stride;
1465 w = width;
1466
1467 CHECKPOINT ();
1468
1469 while (w && (uintptr_t)dst & 7)
1470 {
1471 uint64_t d = *dst;
1472 __m64 vdest = expand565 (to_m64 (d), 0);
1473
1474 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1475 *dst = to_uint64 (vdest);
1476
1477 w--;
1478 dst++;
1479 }
1480
1481 while (w >= 4)
1482 {
1483 __m64 vdest = *(__m64 *)dst;
1484 __m64 v0, v1, v2, v3;
1485
1486 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1487
1488 v0 = over (vsrc, vsrca, v0);
1489 v1 = over (vsrc, vsrca, v1);
1490 v2 = over (vsrc, vsrca, v2);
1491 v3 = over (vsrc, vsrca, v3);
1492
1493 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1494
1495 dst += 4;
1496 w -= 4;
1497 }
1498
1499 CHECKPOINT ();
1500
1501 while (w)
1502 {
1503 uint64_t d = *dst;
1504 __m64 vdest = expand565 (to_m64 (d), 0);
1505
1506 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1507 *dst = to_uint64 (vdest);
1508
1509 w--;
1510 dst++;
1511 }
1512 }
1513
1514 _mm_empty ();
1515 }
1516
1517 static void
1518 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1519 pixman_composite_info_t *info)
1520 {
1521 PIXMAN_COMPOSITE_ARGS (info);
1522 uint32_t src;
1523 uint32_t *dst_line;
1524 uint32_t *mask_line;
1525 int dst_stride, mask_stride;
1526 __m64 vsrc, vsrca;
1527
1528 CHECKPOINT ();
1529
1530 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1531
1532 if (src == 0)
1533 return;
1534
1535 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1536 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1537
1538 vsrc = load8888 (&src);
1539 vsrca = expand_alpha (vsrc);
1540
1541 while (height--)
1542 {
1543 int twidth = width;
1544 uint32_t *p = (uint32_t *)mask_line;
1545 uint32_t *q = (uint32_t *)dst_line;
1546
1547 while (twidth && (uintptr_t)q & 7)
1548 {
1549 uint32_t m = *(uint32_t *)p;
1550
1551 if (m)
1552 {
1553 __m64 vdest = load8888 (q);
1554 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1555 store8888 (q, vdest);
1556 }
1557
1558 twidth--;
1559 p++;
1560 q++;
1561 }
1562
1563 while (twidth >= 2)
1564 {
1565 uint32_t m0, m1;
1566 m0 = *p;
1567 m1 = *(p + 1);
1568
1569 if (m0 | m1)
1570 {
1571 __m64 dest0, dest1;
1572 __m64 vdest = *(__m64 *)q;
1573
1574 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1575 expand8888 (vdest, 0));
1576 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1577 expand8888 (vdest, 1));
1578
1579 *(__m64 *)q = pack8888 (dest0, dest1);
1580 }
1581
1582 p += 2;
1583 q += 2;
1584 twidth -= 2;
1585 }
1586
1587 if (twidth)
1588 {
1589 uint32_t m = *(uint32_t *)p;
1590
1591 if (m)
1592 {
1593 __m64 vdest = load8888 (q);
1594 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1595 store8888 (q, vdest);
1596 }
1597
1598 twidth--;
1599 p++;
1600 q++;
1601 }
1602
1603 dst_line += dst_stride;
1604 mask_line += mask_stride;
1605 }
1606
1607 _mm_empty ();
1608 }
1609
1610 static void
1611 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1612 pixman_composite_info_t *info)
1613 {
1614 PIXMAN_COMPOSITE_ARGS (info);
1615 uint32_t *dst_line, *dst;
1616 uint32_t *src_line, *src;
1617 uint32_t mask;
1618 __m64 vmask;
1619 int dst_stride, src_stride;
1620 int32_t w;
1621
1622 CHECKPOINT ();
1623
1624 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1625 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1626
1627 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1628 vmask = expand_alpha (load8888 (&mask));
1629
1630 while (height--)
1631 {
1632 dst = dst_line;
1633 dst_line += dst_stride;
1634 src = src_line;
1635 src_line += src_stride;
1636 w = width;
1637
1638 while (w && (uintptr_t)dst & 7)
1639 {
1640 __m64 s = load8888 (src);
1641 __m64 d = load8888 (dst);
1642
1643 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1644
1645 w--;
1646 dst++;
1647 src++;
1648 }
1649
1650 while (w >= 2)
1651 {
1652 __m64 vs = ldq_u ((__m64 *)src);
1653 __m64 vd = *(__m64 *)dst;
1654 __m64 vsrc0 = expand8888 (vs, 0);
1655 __m64 vsrc1 = expand8888 (vs, 1);
1656
1657 *(__m64 *)dst = pack8888 (
1658 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1659 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1660
1661 w -= 2;
1662 dst += 2;
1663 src += 2;
1664 }
1665
1666 if (w)
1667 {
1668 __m64 s = load8888 (src);
1669 __m64 d = load8888 (dst);
1670
1671 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1672 }
1673 }
1674
1675 _mm_empty ();
1676 }
1677
1678 static void
1679 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1680 pixman_composite_info_t *info)
1681 {
1682 PIXMAN_COMPOSITE_ARGS (info);
1683 uint32_t *dst_line, *dst;
1684 uint32_t *src_line, *src;
1685 uint32_t mask;
1686 __m64 vmask;
1687 int dst_stride, src_stride;
1688 int32_t w;
1689 __m64 srca;
1690
1691 CHECKPOINT ();
1692
1693 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1694 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1695 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1696
1697 vmask = expand_alpha (load8888 (&mask));
1698 srca = MC (4x00ff);
1699
1700 while (height--)
1701 {
1702 dst = dst_line;
1703 dst_line += dst_stride;
1704 src = src_line;
1705 src_line += src_stride;
1706 w = width;
1707
1708 while (w && (uintptr_t)dst & 7)
1709 {
1710 uint32_t ssrc = *src | 0xff000000;
1711 __m64 s = load8888 (&ssrc);
1712 __m64 d = load8888 (dst);
1713
1714 store8888 (dst, in_over (s, srca, vmask, d));
1715
1716 w--;
1717 dst++;
1718 src++;
1719 }
1720
1721 while (w >= 16)
1722 {
1723 __m64 vd0 = *(__m64 *)(dst + 0);
1724 __m64 vd1 = *(__m64 *)(dst + 2);
1725 __m64 vd2 = *(__m64 *)(dst + 4);
1726 __m64 vd3 = *(__m64 *)(dst + 6);
1727 __m64 vd4 = *(__m64 *)(dst + 8);
1728 __m64 vd5 = *(__m64 *)(dst + 10);
1729 __m64 vd6 = *(__m64 *)(dst + 12);
1730 __m64 vd7 = *(__m64 *)(dst + 14);
1731
1732 __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1733 __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1734 __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1735 __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1736 __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1737 __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1738 __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1739 __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1740
1741 vd0 = pack8888 (
1742 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1743 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1744
1745 vd1 = pack8888 (
1746 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1747 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1748
1749 vd2 = pack8888 (
1750 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1751 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1752
1753 vd3 = pack8888 (
1754 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1755 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1756
1757 vd4 = pack8888 (
1758 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1759 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1760
1761 vd5 = pack8888 (
1762 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1763 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1764
1765 vd6 = pack8888 (
1766 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1767 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1768
1769 vd7 = pack8888 (
1770 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1771 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1772
1773 *(__m64 *)(dst + 0) = vd0;
1774 *(__m64 *)(dst + 2) = vd1;
1775 *(__m64 *)(dst + 4) = vd2;
1776 *(__m64 *)(dst + 6) = vd3;
1777 *(__m64 *)(dst + 8) = vd4;
1778 *(__m64 *)(dst + 10) = vd5;
1779 *(__m64 *)(dst + 12) = vd6;
1780 *(__m64 *)(dst + 14) = vd7;
1781
1782 w -= 16;
1783 dst += 16;
1784 src += 16;
1785 }
1786
1787 while (w)
1788 {
1789 uint32_t ssrc = *src | 0xff000000;
1790 __m64 s = load8888 (&ssrc);
1791 __m64 d = load8888 (dst);
1792
1793 store8888 (dst, in_over (s, srca, vmask, d));
1794
1795 w--;
1796 dst++;
1797 src++;
1798 }
1799 }
1800
1801 _mm_empty ();
1802 }
1803
1804 static void
1805 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1806 pixman_composite_info_t *info)
1807 {
1808 PIXMAN_COMPOSITE_ARGS (info);
1809 uint32_t *dst_line, *dst;
1810 uint32_t *src_line, *src;
1811 uint32_t s;
1812 int dst_stride, src_stride;
1813 uint8_t a;
1814 int32_t w;
1815
1816 CHECKPOINT ();
1817
1818 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1819 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1820
1821 while (height--)
1822 {
1823 dst = dst_line;
1824 dst_line += dst_stride;
1825 src = src_line;
1826 src_line += src_stride;
1827 w = width;
1828
1829 while (w--)
1830 {
1831 s = *src++;
1832 a = s >> 24;
1833
1834 if (a == 0xff)
1835 {
1836 *dst = s;
1837 }
1838 else if (s)
1839 {
1840 __m64 ms, sa;
1841 ms = load8888 (&s);
1842 sa = expand_alpha (ms);
1843 store8888 (dst, over (ms, sa, load8888 (dst)));
1844 }
1845
1846 dst++;
1847 }
1848 }
1849 _mm_empty ();
1850 }
1851
1852 static void
1853 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1854 pixman_composite_info_t *info)
1855 {
1856 PIXMAN_COMPOSITE_ARGS (info);
1857 uint16_t *dst_line, *dst;
1858 uint32_t *src_line, *src;
1859 int dst_stride, src_stride;
1860 int32_t w;
1861
1862 CHECKPOINT ();
1863
1864 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1865 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1866
1867 #if 0
1868 /* FIXME */
1869 assert (src_image->drawable == mask_image->drawable);
1870 #endif
1871
1872 while (height--)
1873 {
1874 dst = dst_line;
1875 dst_line += dst_stride;
1876 src = src_line;
1877 src_line += src_stride;
1878 w = width;
1879
1880 CHECKPOINT ();
1881
1882 while (w && (uintptr_t)dst & 7)
1883 {
1884 __m64 vsrc = load8888 (src);
1885 uint64_t d = *dst;
1886 __m64 vdest = expand565 (to_m64 (d), 0);
1887
1888 vdest = pack_565 (
1889 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1890
1891 *dst = to_uint64 (vdest);
1892
1893 w--;
1894 dst++;
1895 src++;
1896 }
1897
1898 CHECKPOINT ();
1899
1900 while (w >= 4)
1901 {
1902 __m64 vdest = *(__m64 *)dst;
1903 __m64 v0, v1, v2, v3;
1904 __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1905
1906 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1907
1908 vsrc0 = load8888 ((src + 0));
1909 vsrc1 = load8888 ((src + 1));
1910 vsrc2 = load8888 ((src + 2));
1911 vsrc3 = load8888 ((src + 3));
1912
1913 v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1914 v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1915 v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1916 v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1917
1918 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1919
1920 w -= 4;
1921 dst += 4;
1922 src += 4;
1923 }
1924
1925 CHECKPOINT ();
1926
1927 while (w)
1928 {
1929 __m64 vsrc = load8888 (src);
1930 uint64_t d = *dst;
1931 __m64 vdest = expand565 (to_m64 (d), 0);
1932
1933 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1934
1935 *dst = to_uint64 (vdest);
1936
1937 w--;
1938 dst++;
1939 src++;
1940 }
1941 }
1942
1943 _mm_empty ();
1944 }
1945
1946 static void
1947 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1948 pixman_composite_info_t *info)
1949 {
1950 PIXMAN_COMPOSITE_ARGS (info);
1951 uint32_t src, srca;
1952 uint32_t *dst_line, *dst;
1953 uint8_t *mask_line, *mask;
1954 int dst_stride, mask_stride;
1955 int32_t w;
1956 __m64 vsrc, vsrca;
1957 uint64_t srcsrc;
1958
1959 CHECKPOINT ();
1960
1961 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1962
1963 srca = src >> 24;
1964 if (src == 0)
1965 return;
1966
1967 srcsrc = (uint64_t)src << 32 | src;
1968
1969 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1970 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1971
1972 vsrc = load8888 (&src);
1973 vsrca = expand_alpha (vsrc);
1974
1975 while (height--)
1976 {
1977 dst = dst_line;
1978 dst_line += dst_stride;
1979 mask = mask_line;
1980 mask_line += mask_stride;
1981 w = width;
1982
1983 CHECKPOINT ();
1984
1985 while (w && (uintptr_t)dst & 7)
1986 {
1987 uint64_t m = *mask;
1988
1989 if (m)
1990 {
1991 __m64 vdest = in_over (vsrc, vsrca,
1992 expand_alpha_rev (to_m64 (m)),
1993 load8888 (dst));
1994
1995 store8888 (dst, vdest);
1996 }
1997
1998 w--;
1999 mask++;
2000 dst++;
2001 }
2002
2003 CHECKPOINT ();
2004
2005 while (w >= 2)
2006 {
2007 uint64_t m0, m1;
2008
2009 m0 = *mask;
2010 m1 = *(mask + 1);
2011
2012 if (srca == 0xff && (m0 & m1) == 0xff)
2013 {
2014 *(uint64_t *)dst = srcsrc;
2015 }
2016 else if (m0 | m1)
2017 {
2018 __m64 vdest;
2019 __m64 dest0, dest1;
2020
2021 vdest = *(__m64 *)dst;
2022
2023 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2024 expand8888 (vdest, 0));
2025 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2026 expand8888 (vdest, 1));
2027
2028 *(__m64 *)dst = pack8888 (dest0, dest1);
2029 }
2030
2031 mask += 2;
2032 dst += 2;
2033 w -= 2;
2034 }
2035
2036 CHECKPOINT ();
2037
2038 if (w)
2039 {
2040 uint64_t m = *mask;
2041
2042 if (m)
2043 {
2044 __m64 vdest = load8888 (dst);
2045
2046 vdest = in_over (
2047 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2048 store8888 (dst, vdest);
2049 }
2050 }
2051 }
2052
2053 _mm_empty ();
2054 }
2055
2056 static pixman_bool_t
2057 mmx_fill (pixman_implementation_t *imp,
2058 uint32_t * bits,
2059 int stride,
2060 int bpp,
2061 int x,
2062 int y,
2063 int width,
2064 int height,
2065 uint32_t filler)
2066 {
2067 uint64_t fill;
2068 __m64 vfill;
2069 uint32_t byte_width;
2070 uint8_t *byte_line;
2071
2072 #if defined __GNUC__ && defined USE_X86_MMX
2073 __m64 v1, v2, v3, v4, v5, v6, v7;
2074 #endif
2075
2076 if (bpp != 16 && bpp != 32 && bpp != 8)
2077 return FALSE;
2078
2079 if (bpp == 8)
2080 {
2081 stride = stride * (int) sizeof (uint32_t) / 1;
2082 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2083 byte_width = width;
2084 stride *= 1;
2085 filler = (filler & 0xff) * 0x01010101;
2086 }
2087 else if (bpp == 16)
2088 {
2089 stride = stride * (int) sizeof (uint32_t) / 2;
2090 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2091 byte_width = 2 * width;
2092 stride *= 2;
2093 filler = (filler & 0xffff) * 0x00010001;
2094 }
2095 else
2096 {
2097 stride = stride * (int) sizeof (uint32_t) / 4;
2098 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2099 byte_width = 4 * width;
2100 stride *= 4;
2101 }
2102
2103 fill = ((uint64_t)filler << 32) | filler;
2104 vfill = to_m64 (fill);
2105
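/* Duplicate the fill value into seven more MMX registers so that the
 * 64-byte inner loop below can issue eight independent movq stores per
 * iteration. */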
2106 #if defined __GNUC__ && defined USE_X86_MMX
2107 __asm__ (
2108 "movq %7, %0\n"
2109 "movq %7, %1\n"
2110 "movq %7, %2\n"
2111 "movq %7, %3\n"
2112 "movq %7, %4\n"
2113 "movq %7, %5\n"
2114 "movq %7, %6\n"
2115 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2116 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2117 : "y" (vfill));
2118 #endif
2119
2120 while (height--)
2121 {
2122 int w;
2123 uint8_t *d = byte_line;
2124
2125 byte_line += stride;
2126 w = byte_width;
2127
2128 if (w >= 1 && ((uintptr_t)d & 1))
2129 {
2130 *(uint8_t *)d = (filler & 0xff);
2131 w--;
2132 d++;
2133 }
2134
2135 if (w >= 2 && ((uintptr_t)d & 3))
2136 {
2137 *(uint16_t *)d = filler;
2138 w -= 2;
2139 d += 2;
2140 }
2141
2142 while (w >= 4 && ((uintptr_t)d & 7))
2143 {
2144 *(uint32_t *)d = filler;
2145
2146 w -= 4;
2147 d += 4;
2148 }
2149
2150 while (w >= 64)
2151 {
2152 #if defined __GNUC__ && defined USE_X86_MMX
2153 __asm__ (
2154 "movq %1, (%0)\n"
2155 "movq %2, 8(%0)\n"
2156 "movq %3, 16(%0)\n"
2157 "movq %4, 24(%0)\n"
2158 "movq %5, 32(%0)\n"
2159 "movq %6, 40(%0)\n"
2160 "movq %7, 48(%0)\n"
2161 "movq %8, 56(%0)\n"
2162 :
2163 : "r" (d),
2164 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2165 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2166 : "memory");
2167 #else
2168 *(__m64*) (d + 0) = vfill;
2169 *(__m64*) (d + 8) = vfill;
2170 *(__m64*) (d + 16) = vfill;
2171 *(__m64*) (d + 24) = vfill;
2172 *(__m64*) (d + 32) = vfill;
2173 *(__m64*) (d + 40) = vfill;
2174 *(__m64*) (d + 48) = vfill;
2175 *(__m64*) (d + 56) = vfill;
2176 #endif
2177 w -= 64;
2178 d += 64;
2179 }
2180
2181 while (w >= 4)
2182 {
2183 *(uint32_t *)d = filler;
2184
2185 w -= 4;
2186 d += 4;
2187 }
2188 if (w >= 2)
2189 {
2190 *(uint16_t *)d = filler;
2191 w -= 2;
2192 d += 2;
2193 }
2194 if (w >= 1)
2195 {
2196 *(uint8_t *)d = (filler & 0xff);
2197 w--;
2198 d++;
2199 }
2200
2201 }
2202
2203 _mm_empty ();
2204 return TRUE;
2205 }
2206
2207 static void
2208 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2209 pixman_composite_info_t *info)
2210 {
2211 PIXMAN_COMPOSITE_ARGS (info);
2212 uint16_t *dst_line, *dst;
2213 uint32_t *src_line, *src, s;
2214 int dst_stride, src_stride;
2215 int32_t w;
2216
2217 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2218 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2219
2220 while (height--)
2221 {
2222 dst = dst_line;
2223 dst_line += dst_stride;
2224 src = src_line;
2225 src_line += src_stride;
2226 w = width;
2227
2228 while (w && (uintptr_t)dst & 7)
2229 {
2230 s = *src++;
2231 *dst = convert_8888_to_0565 (s);
2232 dst++;
2233 w--;
2234 }
2235
2236 while (w >= 4)
2237 {
2238 __m64 vdest;
2239 __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2240 __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2241
2242 vdest = pack_4xpacked565 (vsrc0, vsrc1);
2243
2244 *(__m64 *)dst = vdest;
2245
2246 w -= 4;
2247 src += 4;
2248 dst += 4;
2249 }
2250
2251 while (w)
2252 {
2253 s = *src++;
2254 *dst = convert_8888_to_0565 (s);
2255 dst++;
2256 w--;
2257 }
2258 }
2259
2260 _mm_empty ();
2261 }
2262
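/* SRC with an a8 mask and a solid source: dst = src IN mask. A zero source
 * degenerates into mmx_fill () with zero; a fully opaque source under a
 * fully opaque mask pair is stored directly as the precomputed 'srcsrc'. */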
2263 static void
2264 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2265 pixman_composite_info_t *info)
2266 {
2267 PIXMAN_COMPOSITE_ARGS (info);
2268 uint32_t src, srca;
2269 uint32_t *dst_line, *dst;
2270 uint8_t *mask_line, *mask;
2271 int dst_stride, mask_stride;
2272 int32_t w;
2273 __m64 vsrc;
2274 uint64_t srcsrc;
2275
2276 CHECKPOINT ();
2277
2278 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2279
2280 srca = src >> 24;
2281 if (src == 0)
2282 {
2283 mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2284 PIXMAN_FORMAT_BPP (dest_image->bits.format),
2285 dest_x, dest_y, width, height, 0);
2286 return;
2287 }
2288
2289 srcsrc = (uint64_t)src << 32 | src;
2290
2291 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2292 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2293
2294 vsrc = load8888 (&src);
2295
2296 while (height--)
2297 {
2298 dst = dst_line;
2299 dst_line += dst_stride;
2300 mask = mask_line;
2301 mask_line += mask_stride;
2302 w = width;
2303
2304 CHECKPOINT ();
2305
2306 while (w && (uintptr_t)dst & 7)
2307 {
2308 uint64_t m = *mask;
2309
2310 if (m)
2311 {
2312 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2313
2314 store8888 (dst, vdest);
2315 }
2316 else
2317 {
2318 *dst = 0;
2319 }
2320
2321 w--;
2322 mask++;
2323 dst++;
2324 }
2325
2326 CHECKPOINT ();
2327
2328 while (w >= 2)
2329 {
2330 uint64_t m0, m1;
2331 m0 = *mask;
2332 m1 = *(mask + 1);
2333
2334 if (srca == 0xff && (m0 & m1) == 0xff)
2335 {
2336 *(uint64_t *)dst = srcsrc;
2337 }
2338 else if (m0 | m1)
2339 {
2340 __m64 dest0, dest1;
2341
2342 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2343 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2344
2345 *(__m64 *)dst = pack8888 (dest0, dest1);
2346 }
2347 else
2348 {
2349 *(uint64_t *)dst = 0;
2350 }
2351
2352 mask += 2;
2353 dst += 2;
2354 w -= 2;
2355 }
2356
2357 CHECKPOINT ();
2358
2359 if (w)
2360 {
2361 uint64_t m = *mask;
2362
2363 if (m)
2364 {
2365 __m64 vdest;
2366
2367 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2368 store8888 (dst, vdest);
2369 }
2370 else
2371 {
2372 *dst = 0;
2373 }
2374 }
2375 }
2376
2377 _mm_empty ();
2378 }
2379
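/* OVER of a solid source through an a8 mask onto an r5g6b5 destination.
 * The aligned middle loop expands four 565 pixels at once, applies
 * in_over () per pixel and repacks with pack_4x565 (); a fully opaque
 * source under a fully opaque mask stores the precomputed 565 quad. */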
2380 static void
2381 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2382 pixman_composite_info_t *info)
2383 {
2384 PIXMAN_COMPOSITE_ARGS (info);
2385 uint32_t src, srca;
2386 uint16_t *dst_line, *dst;
2387 uint8_t *mask_line, *mask;
2388 int dst_stride, mask_stride;
2389 int32_t w;
2390 __m64 vsrc, vsrca, tmp;
2391 __m64 srcsrcsrcsrc;
2392
2393 CHECKPOINT ();
2394
2395 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2396
2397 srca = src >> 24;
2398 if (src == 0)
2399 return;
2400
2401 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2402 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2403
2404 vsrc = load8888 (&src);
2405 vsrca = expand_alpha (vsrc);
2406
2407 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2408 srcsrcsrcsrc = expand_alpha_rev (tmp);
2409
2410 while (height--)
2411 {
2412 dst = dst_line;
2413 dst_line += dst_stride;
2414 mask = mask_line;
2415 mask_line += mask_stride;
2416 w = width;
2417
2418 CHECKPOINT ();
2419
2420 while (w && (uintptr_t)dst & 7)
2421 {
2422 uint64_t m = *mask;
2423
2424 if (m)
2425 {
2426 uint64_t d = *dst;
2427 __m64 vd = to_m64 (d);
2428 __m64 vdest = in_over (
2429 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2430
2431 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2432 *dst = to_uint64 (vd);
2433 }
2434
2435 w--;
2436 mask++;
2437 dst++;
2438 }
2439
2440 CHECKPOINT ();
2441
2442 while (w >= 4)
2443 {
2444 uint64_t m0, m1, m2, m3;
2445 m0 = *mask;
2446 m1 = *(mask + 1);
2447 m2 = *(mask + 2);
2448 m3 = *(mask + 3);
2449
2450 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2451 {
2452 *(__m64 *)dst = srcsrcsrcsrc;
2453 }
2454 else if (m0 | m1 | m2 | m3)
2455 {
2456 __m64 vdest = *(__m64 *)dst;
2457 __m64 v0, v1, v2, v3;
2458 __m64 vm0, vm1, vm2, vm3;
2459
2460 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2461
2462 vm0 = to_m64 (m0);
2463 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2464
2465 vm1 = to_m64 (m1);
2466 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2467
2468 vm2 = to_m64 (m2);
2469 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2470
2471 vm3 = to_m64 (m3);
2472 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2473
2474 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2475 }
2476
2477 w -= 4;
2478 mask += 4;
2479 dst += 4;
2480 }
2481
2482 CHECKPOINT ();
2483
2484 while (w)
2485 {
2486 uint64_t m = *mask;
2487
2488 if (m)
2489 {
2490 uint64_t d = *dst;
2491 __m64 vd = to_m64 (d);
2492 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2493 expand565 (vd, 0));
2494 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2495 *dst = to_uint64 (vd);
2496 }
2497
2498 w--;
2499 mask++;
2500 dst++;
2501 }
2502 }
2503
2504 _mm_empty ();
2505 }
2506
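/* OVER for the pixbuf fast paths (see the #if 0 assert below: source and
 * mask are expected to be the same non-premultiplied image).
 * over_rev_non_pre () swaps the red/blue channels and premultiplies by
 * alpha before blending; fully opaque groups of four are converted with
 * invert_colors () and packed straight to 565. */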
2507 static void
2508 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2509 pixman_composite_info_t *info)
2510 {
2511 PIXMAN_COMPOSITE_ARGS (info);
2512 uint16_t *dst_line, *dst;
2513 uint32_t *src_line, *src;
2514 int dst_stride, src_stride;
2515 int32_t w;
2516
2517 CHECKPOINT ();
2518
2519 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2520 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2521
2522 #if 0
2523 /* FIXME */
2524 assert (src_image->drawable == mask_image->drawable);
2525 #endif
2526
2527 while (height--)
2528 {
2529 dst = dst_line;
2530 dst_line += dst_stride;
2531 src = src_line;
2532 src_line += src_stride;
2533 w = width;
2534
2535 CHECKPOINT ();
2536
2537 while (w && (uintptr_t)dst & 7)
2538 {
2539 __m64 vsrc = load8888 (src);
2540 uint64_t d = *dst;
2541 __m64 vdest = expand565 (to_m64 (d), 0);
2542
2543 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2544
2545 *dst = to_uint64 (vdest);
2546
2547 w--;
2548 dst++;
2549 src++;
2550 }
2551
2552 CHECKPOINT ();
2553
2554 while (w >= 4)
2555 {
2556 uint32_t s0, s1, s2, s3;
2557 unsigned char a0, a1, a2, a3;
2558
2559 s0 = *src;
2560 s1 = *(src + 1);
2561 s2 = *(src + 2);
2562 s3 = *(src + 3);
2563
2564 a0 = (s0 >> 24);
2565 a1 = (s1 >> 24);
2566 a2 = (s2 >> 24);
2567 a3 = (s3 >> 24);
2568
2569 if ((a0 & a1 & a2 & a3) == 0xFF)
2570 {
2571 __m64 v0 = invert_colors (load8888 (&s0));
2572 __m64 v1 = invert_colors (load8888 (&s1));
2573 __m64 v2 = invert_colors (load8888 (&s2));
2574 __m64 v3 = invert_colors (load8888 (&s3));
2575
2576 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2577 }
2578 else if (s0 | s1 | s2 | s3)
2579 {
2580 __m64 vdest = *(__m64 *)dst;
2581 __m64 v0, v1, v2, v3;
2582
2583 __m64 vsrc0 = load8888 (&s0);
2584 __m64 vsrc1 = load8888 (&s1);
2585 __m64 vsrc2 = load8888 (&s2);
2586 __m64 vsrc3 = load8888 (&s3);
2587
2588 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2589
2590 v0 = over_rev_non_pre (vsrc0, v0);
2591 v1 = over_rev_non_pre (vsrc1, v1);
2592 v2 = over_rev_non_pre (vsrc2, v2);
2593 v3 = over_rev_non_pre (vsrc3, v3);
2594
2595 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2596 }
2597
2598 w -= 4;
2599 dst += 4;
2600 src += 4;
2601 }
2602
2603 CHECKPOINT ();
2604
2605 while (w)
2606 {
2607 __m64 vsrc = load8888 (src);
2608 uint64_t d = *dst;
2609 __m64 vdest = expand565 (to_m64 (d), 0);
2610
2611 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2612
2613 *dst = to_uint64 (vdest);
2614
2615 w--;
2616 dst++;
2617 src++;
2618 }
2619 }
2620
2621 _mm_empty ();
2622 }
2623
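/* Same pixbuf OVER as above, but onto a 32-bit 8888 destination, handling
 * two pixels per iteration in the aligned loop. */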
2624 static void
2625 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2626 pixman_composite_info_t *info)
2627 {
2628 PIXMAN_COMPOSITE_ARGS (info);
2629 uint32_t *dst_line, *dst;
2630 uint32_t *src_line, *src;
2631 int dst_stride, src_stride;
2632 int32_t w;
2633
2634 CHECKPOINT ();
2635
2636 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2637 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2638
2639 #if 0
2640 /* FIXME */
2641 assert (src_image->drawable == mask_image->drawable);
2642 #endif
2643
2644 while (height--)
2645 {
2646 dst = dst_line;
2647 dst_line += dst_stride;
2648 src = src_line;
2649 src_line += src_stride;
2650 w = width;
2651
2652 while (w && (uintptr_t)dst & 7)
2653 {
2654 __m64 s = load8888 (src);
2655 __m64 d = load8888 (dst);
2656
2657 store8888 (dst, over_rev_non_pre (s, d));
2658
2659 w--;
2660 dst++;
2661 src++;
2662 }
2663
2664 while (w >= 2)
2665 {
2666 uint32_t s0, s1;
2667 unsigned char a0, a1;
2668 __m64 d0, d1;
2669
2670 s0 = *src;
2671 s1 = *(src + 1);
2672
2673 a0 = (s0 >> 24);
2674 a1 = (s1 >> 24);
2675
2676 if ((a0 & a1) == 0xFF)
2677 {
2678 d0 = invert_colors (load8888 (&s0));
2679 d1 = invert_colors (load8888 (&s1));
2680
2681 *(__m64 *)dst = pack8888 (d0, d1);
2682 }
2683 else if (s0 | s1)
2684 {
2685 __m64 vdest = *(__m64 *)dst;
2686
2687 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2688 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2689
2690 *(__m64 *)dst = pack8888 (d0, d1);
2691 }
2692
2693 w -= 2;
2694 dst += 2;
2695 src += 2;
2696 }
2697
2698 if (w)
2699 {
2700 __m64 s = load8888 (src);
2701 __m64 d = load8888 (dst);
2702
2703 store8888 (dst, over_rev_non_pre (s, d));
2704 }
2705 }
2706
2707 _mm_empty ();
2708 }
2709
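/* Component-alpha OVER of a solid source onto r5g6b5, driven by a full
 * a8r8g8b8 mask per pixel. The aligned middle loop expands four 565
 * destination pixels, applies in_over () with the per-channel mask and
 * repacks with pack_4x565 (). */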
2710 static void
2711 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2712 pixman_composite_info_t *info)
2713 {
2714 PIXMAN_COMPOSITE_ARGS (info);
2715 uint32_t src;
2716 uint16_t *dst_line;
2717 uint32_t *mask_line;
2718 int dst_stride, mask_stride;
2719 __m64 vsrc, vsrca;
2720
2721 CHECKPOINT ();
2722
2723 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2724
2725 if (src == 0)
2726 return;
2727
2728 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2729 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2730
2731 vsrc = load8888 (&src);
2732 vsrca = expand_alpha (vsrc);
2733
2734 while (height--)
2735 {
2736 int twidth = width;
2737 uint32_t *p = (uint32_t *)mask_line;
2738 uint16_t *q = (uint16_t *)dst_line;
2739
2740 while (twidth && ((uintptr_t)q & 7))
2741 {
2742 uint32_t m = *(uint32_t *)p;
2743
2744 if (m)
2745 {
2746 uint64_t d = *q;
2747 __m64 vdest = expand565 (to_m64 (d), 0);
2748 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2749 *q = to_uint64 (vdest);
2750 }
2751
2752 twidth--;
2753 p++;
2754 q++;
2755 }
2756
2757 while (twidth >= 4)
2758 {
2759 uint32_t m0, m1, m2, m3;
2760
2761 m0 = *p;
2762 m1 = *(p + 1);
2763 m2 = *(p + 2);
2764 m3 = *(p + 3);
2765
2766 if ((m0 | m1 | m2 | m3))
2767 {
2768 __m64 vdest = *(__m64 *)q;
2769 __m64 v0, v1, v2, v3;
2770
2771 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2772
2773 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2774 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2775 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2776 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2777
2778 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2779 }
2780 twidth -= 4;
2781 p += 4;
2782 q += 4;
2783 }
2784
2785 while (twidth)
2786 {
2787 uint32_t m;
2788
2789 m = *(uint32_t *)p;
2790 if (m)
2791 {
2792 uint64_t d = *q;
2793 __m64 vdest = expand565 (to_m64 (d), 0);
2794 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2795 *q = to_uint64 (vdest);
2796 }
2797
2798 twidth--;
2799 p++;
2800 q++;
2801 }
2802
2803 mask_line += mask_stride;
2804 dst_line += dst_stride;
2805 }
2806
2807 _mm_empty ();
2808 }
2809
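/* IN of a solid source through an a8 mask onto an a8 destination:
 * dst = src.a * mask * dst per byte. The vector loop handles four mask
 * and destination bytes at a time; mmx_composite_in_8_8 () below is the
 * unmasked variant. */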
2810 static void
2811 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2812 pixman_composite_info_t *info)
2813 {
2814 PIXMAN_COMPOSITE_ARGS (info);
2815 uint8_t *dst_line, *dst;
2816 uint8_t *mask_line, *mask;
2817 int dst_stride, mask_stride;
2818 int32_t w;
2819 uint32_t src;
2820 uint8_t sa;
2821 __m64 vsrc, vsrca;
2822
2823 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2824 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2825
2826 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2827
2828 sa = src >> 24;
2829
2830 vsrc = load8888 (&src);
2831 vsrca = expand_alpha (vsrc);
2832
2833 while (height--)
2834 {
2835 dst = dst_line;
2836 dst_line += dst_stride;
2837 mask = mask_line;
2838 mask_line += mask_stride;
2839 w = width;
2840
2841 while (w && (uintptr_t)dst & 7)
2842 {
2843 uint16_t tmp;
2844 uint8_t a;
2845 uint32_t m, d;
2846
2847 a = *mask++;
2848 d = *dst;
2849
2850 m = MUL_UN8 (sa, a, tmp);
2851 d = MUL_UN8 (m, d, tmp);
2852
2853 *dst++ = d;
2854 w--;
2855 }
2856
2857 while (w >= 4)
2858 {
2859 __m64 vmask;
2860 __m64 vdest;
2861
2862 vmask = load8888u ((uint32_t *)mask);
2863 vdest = load8888 ((uint32_t *)dst);
2864
2865 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2866
2867 dst += 4;
2868 mask += 4;
2869 w -= 4;
2870 }
2871
2872 while (w--)
2873 {
2874 uint16_t tmp;
2875 uint8_t a;
2876 uint32_t m, d;
2877
2878 a = *mask++;
2879 d = *dst;
2880
2881 m = MUL_UN8 (sa, a, tmp);
2882 d = MUL_UN8 (m, d, tmp);
2883
2884 *dst++ = d;
2885 }
2886 }
2887
2888 _mm_empty ();
2889 }
2890
2891 static void
2892 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2893 pixman_composite_info_t *info)
2894 {
2895 PIXMAN_COMPOSITE_ARGS (info);
2896 uint8_t *dst_line, *dst;
2897 uint8_t *src_line, *src;
2898 int src_stride, dst_stride;
2899 int32_t w;
2900
2901 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2902 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2903
2904 while (height--)
2905 {
2906 dst = dst_line;
2907 dst_line += dst_stride;
2908 src = src_line;
2909 src_line += src_stride;
2910 w = width;
2911
2912 while (w && (uintptr_t)dst & 3)
2913 {
2914 uint8_t s, d;
2915 uint16_t tmp;
2916
2917 s = *src;
2918 d = *dst;
2919
2920 *dst = MUL_UN8 (s, d, tmp);
2921
2922 src++;
2923 dst++;
2924 w--;
2925 }
2926
2927 while (w >= 4)
2928 {
2929 uint32_t *s = (uint32_t *)src;
2930 uint32_t *d = (uint32_t *)dst;
2931
2932 store8888 (d, in (load8888u (s), load8888 (d)));
2933
2934 w -= 4;
2935 dst += 4;
2936 src += 4;
2937 }
2938
2939 while (w--)
2940 {
2941 uint8_t s, d;
2942 uint16_t tmp;
2943
2944 s = *src;
2945 d = *dst;
2946
2947 *dst = MUL_UN8 (s, d, tmp);
2948
2949 src++;
2950 dst++;
2951 }
2952 }
2953
2954 _mm_empty ();
2955 }
2956
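/* ADD of a solid source through an a8 mask onto an a8 destination:
 * dst = saturate (src.a * mask + dst). Scalar MUL_UN8/ADD_UN8 handle the
 * unaligned edges; _mm_adds_pu8 () handles four bytes at a time. */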
2957 static void
2958 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2959 pixman_composite_info_t *info)
2960 {
2961 PIXMAN_COMPOSITE_ARGS (info);
2962 uint8_t *dst_line, *dst;
2963 uint8_t *mask_line, *mask;
2964 int dst_stride, mask_stride;
2965 int32_t w;
2966 uint32_t src;
2967 uint8_t sa;
2968 __m64 vsrc, vsrca;
2969
2970 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2971 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2972
2973 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2974
2975 sa = src >> 24;
2976
2977 if (src == 0)
2978 return;
2979
2980 vsrc = load8888 (&src);
2981 vsrca = expand_alpha (vsrc);
2982
2983 while (height--)
2984 {
2985 dst = dst_line;
2986 dst_line += dst_stride;
2987 mask = mask_line;
2988 mask_line += mask_stride;
2989 w = width;
2990
2991 while (w && (uintptr_t)dst & 3)
2992 {
2993 uint16_t tmp;
2994 uint16_t a;
2995 uint32_t m, d;
2996 uint32_t r;
2997
2998 a = *mask++;
2999 d = *dst;
3000
3001 m = MUL_UN8 (sa, a, tmp);
3002 r = ADD_UN8 (m, d, tmp);
3003
3004 *dst++ = r;
3005 w--;
3006 }
3007
3008 while (w >= 4)
3009 {
3010 __m64 vmask;
3011 __m64 vdest;
3012
3013 vmask = load8888u ((uint32_t *)mask);
3014 vdest = load8888 ((uint32_t *)dst);
3015
3016 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3017
3018 dst += 4;
3019 mask += 4;
3020 w -= 4;
3021 }
3022
3023 while (w--)
3024 {
3025 uint16_t tmp;
3026 uint16_t a;
3027 uint32_t m, d;
3028 uint32_t r;
3029
3030 a = *mask++;
3031 d = *dst;
3032
3033 m = MUL_UN8 (sa, a, tmp);
3034 r = ADD_UN8 (m, d, tmp);
3035
3036 *dst++ = r;
3037 }
3038 }
3039
3040 _mm_empty ();
3041 }
3042
3043 static void
3044 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3045 pixman_composite_info_t *info)
3046 {
3047 PIXMAN_COMPOSITE_ARGS (info);
3048 uint8_t *dst_line, *dst;
3049 uint8_t *src_line, *src;
3050 int dst_stride, src_stride;
3051 int32_t w;
3052 uint8_t s, d;
3053 uint16_t t;
3054
3055 CHECKPOINT ();
3056
3057 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3058 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3059
3060 while (height--)
3061 {
3062 dst = dst_line;
3063 dst_line += dst_stride;
3064 src = src_line;
3065 src_line += src_stride;
3066 w = width;
3067
3068 while (w && (uintptr_t)dst & 7)
3069 {
3070 s = *src;
3071 d = *dst;
3072 t = d + s;
3073 s = t | (0 - (t >> 8));
3074 *dst = s;
3075
3076 dst++;
3077 src++;
3078 w--;
3079 }
3080
3081 while (w >= 8)
3082 {
3083 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3084 dst += 8;
3085 src += 8;
3086 w -= 8;
3087 }
3088
3089 while (w)
3090 {
3091 s = *src;
3092 d = *dst;
3093 t = d + s;
3094 s = t | (0 - (t >> 8));
3095 *dst = s;
3096
3097 dst++;
3098 src++;
3099 w--;
3100 }
3101 }
3102
3103 _mm_empty ();
3104 }
3105
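/* ADD of r5g6b5 onto r5g6b5. The vector loop widens four 565 pixels from
 * source and destination to 8888, adds with unsigned saturation and packs
 * back; the scalar edges do the same via convert_0565_to_8888 () and
 * convert_8888_to_0565 (). */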
3106 static void
3107 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3108 pixman_composite_info_t *info)
3109 {
3110 PIXMAN_COMPOSITE_ARGS (info);
3111 uint16_t *dst_line, *dst;
3112 uint32_t d;
3113 uint16_t *src_line, *src;
3114 uint32_t s;
3115 int dst_stride, src_stride;
3116 int32_t w;
3117
3118 CHECKPOINT ();
3119
3120 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3121 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3122
3123 while (height--)
3124 {
3125 dst = dst_line;
3126 dst_line += dst_stride;
3127 src = src_line;
3128 src_line += src_stride;
3129 w = width;
3130
3131 while (w && (uintptr_t)dst & 7)
3132 {
3133 s = *src++;
3134 if (s)
3135 {
3136 d = *dst;
3137 s = convert_0565_to_8888 (s);
3138 if (d)
3139 {
3140 d = convert_0565_to_8888 (d);
3141 UN8x4_ADD_UN8x4 (s, d);
3142 }
3143 *dst = convert_8888_to_0565 (s);
3144 }
3145 dst++;
3146 w--;
3147 }
3148
3149 while (w >= 4)
3150 {
3151 __m64 vdest = *(__m64 *)dst;
3152 __m64 vsrc = ldq_u ((__m64 *)src);
3153 __m64 vd0, vd1;
3154 __m64 vs0, vs1;
3155
3156 expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3157 expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3158
3159 vd0 = _mm_adds_pu8 (vd0, vs0);
3160 vd1 = _mm_adds_pu8 (vd1, vs1);
3161
3162 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3163
3164 dst += 4;
3165 src += 4;
3166 w -= 4;
3167 }
3168
3169 while (w--)
3170 {
3171 s = *src++;
3172 if (s)
3173 {
3174 d = *dst;
3175 s = convert_0565_to_8888 (s);
3176 if (d)
3177 {
3178 d = convert_0565_to_8888 (d);
3179 UN8x4_ADD_UN8x4 (s, d);
3180 }
3181 *dst = convert_8888_to_0565 (s);
3182 }
3183 dst++;
3184 }
3185 }
3186
3187 _mm_empty ();
3188 }
3189
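/* ADD of a8r8g8b8 onto a8r8g8b8: per-byte saturating add, two pixels per
 * _mm_adds_pu8 () in the aligned loop. */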
3190 static void
3191 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3192 pixman_composite_info_t *info)
3193 {
3194 PIXMAN_COMPOSITE_ARGS (info);
3195 uint32_t *dst_line, *dst;
3196 uint32_t *src_line, *src;
3197 int dst_stride, src_stride;
3198 int32_t w;
3199
3200 CHECKPOINT ();
3201
3202 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3203 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3204
3205 while (height--)
3206 {
3207 dst = dst_line;
3208 dst_line += dst_stride;
3209 src = src_line;
3210 src_line += src_stride;
3211 w = width;
3212
3213 while (w && (uintptr_t)dst & 7)
3214 {
3215 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3216 load ((const uint32_t *)dst)));
3217 dst++;
3218 src++;
3219 w--;
3220 }
3221
3222 while (w >= 2)
3223 {
3224 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3225 dst += 2;
3226 src += 2;
3227 w -= 2;
3228 }
3229
3230 if (w)
3231 {
3232 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3233 load ((const uint32_t *)dst)));
3234
3235 }
3236 }
3237
3238 _mm_empty ();
3239 }
3240
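/* imp->blt backend: copy a rectangle between two 16 or 32 bpp buffers of
 * the same depth. Each scanline is aligned with narrow copies, then moved
 * 64 bytes at a time through the eight MMX registers, then finished with
 * dword/word copies. Returns FALSE for unsupported depths so the caller
 * can fall back. */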
3241 static pixman_bool_t
3242 mmx_blt (pixman_implementation_t *imp,
3243 uint32_t * src_bits,
3244 uint32_t * dst_bits,
3245 int src_stride,
3246 int dst_stride,
3247 int src_bpp,
3248 int dst_bpp,
3249 int src_x,
3250 int src_y,
3251 int dest_x,
3252 int dest_y,
3253 int width,
3254 int height)
3255 {
3256 uint8_t * src_bytes;
3257 uint8_t * dst_bytes;
3258 int byte_width;
3259
3260 if (src_bpp != dst_bpp)
3261 return FALSE;
3262
3263 if (src_bpp == 16)
3264 {
3265 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3266 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3267 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3268 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3269 byte_width = 2 * width;
3270 src_stride *= 2;
3271 dst_stride *= 2;
3272 }
3273 else if (src_bpp == 32)
3274 {
3275 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3276 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3277 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3278 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3279 byte_width = 4 * width;
3280 src_stride *= 4;
3281 dst_stride *= 4;
3282 }
3283 else
3284 {
3285 return FALSE;
3286 }
3287
3288 while (height--)
3289 {
3290 int w;
3291 uint8_t *s = src_bytes;
3292 uint8_t *d = dst_bytes;
3293 src_bytes += src_stride;
3294 dst_bytes += dst_stride;
3295 w = byte_width;
3296
3297 if (w >= 1 && ((uintptr_t)d & 1))
3298 {
3299 *(uint8_t *)d = *(uint8_t *)s;
3300 w -= 1;
3301 s += 1;
3302 d += 1;
3303 }
3304
3305 if (w >= 2 && ((uintptr_t)d & 3))
3306 {
3307 *(uint16_t *)d = *(uint16_t *)s;
3308 w -= 2;
3309 s += 2;
3310 d += 2;
3311 }
3312
3313 while (w >= 4 && ((uintptr_t)d & 7))
3314 {
3315 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3316
3317 w -= 4;
3318 s += 4;
3319 d += 4;
3320 }
3321
3322 while (w >= 64)
3323 {
3324 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3325 __asm__ (
3326 "movq (%1), %%mm0\n"
3327 "movq 8(%1), %%mm1\n"
3328 "movq 16(%1), %%mm2\n"
3329 "movq 24(%1), %%mm3\n"
3330 "movq 32(%1), %%mm4\n"
3331 "movq 40(%1), %%mm5\n"
3332 "movq 48(%1), %%mm6\n"
3333 "movq 56(%1), %%mm7\n"
3334
3335 "movq %%mm0, (%0)\n"
3336 "movq %%mm1, 8(%0)\n"
3337 "movq %%mm2, 16(%0)\n"
3338 "movq %%mm3, 24(%0)\n"
3339 "movq %%mm4, 32(%0)\n"
3340 "movq %%mm5, 40(%0)\n"
3341 "movq %%mm6, 48(%0)\n"
3342 "movq %%mm7, 56(%0)\n"
3343 :
3344 : "r" (d), "r" (s)
3345 : "memory",
3346 "%mm0", "%mm1", "%mm2", "%mm3",
3347 "%mm4", "%mm5", "%mm6", "%mm7");
3348 #else
3349 __m64 v0 = ldq_u ((__m64 *)(s + 0));
3350 __m64 v1 = ldq_u ((__m64 *)(s + 8));
3351 __m64 v2 = ldq_u ((__m64 *)(s + 16));
3352 __m64 v3 = ldq_u ((__m64 *)(s + 24));
3353 __m64 v4 = ldq_u ((__m64 *)(s + 32));
3354 __m64 v5 = ldq_u ((__m64 *)(s + 40));
3355 __m64 v6 = ldq_u ((__m64 *)(s + 48));
3356 __m64 v7 = ldq_u ((__m64 *)(s + 56));
3357 *(__m64 *)(d + 0) = v0;
3358 *(__m64 *)(d + 8) = v1;
3359 *(__m64 *)(d + 16) = v2;
3360 *(__m64 *)(d + 24) = v3;
3361 *(__m64 *)(d + 32) = v4;
3362 *(__m64 *)(d + 40) = v5;
3363 *(__m64 *)(d + 48) = v6;
3364 *(__m64 *)(d + 56) = v7;
3365 #endif
3366
3367 w -= 64;
3368 s += 64;
3369 d += 64;
3370 }
3371 while (w >= 4)
3372 {
3373 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3374
3375 w -= 4;
3376 s += 4;
3377 d += 4;
3378 }
3379 if (w >= 2)
3380 {
3381 *(uint16_t *)d = *(uint16_t *)s;
3382 w -= 2;
3383 s += 2;
3384 d += 2;
3385 }
3386 }
3387
3388 _mm_empty ();
3389
3390 return TRUE;
3391 }
3392
3393 static void
3394 mmx_composite_copy_area (pixman_implementation_t *imp,
3395 pixman_composite_info_t *info)
3396 {
3397 PIXMAN_COMPOSITE_ARGS (info);
3398
3399 mmx_blt (imp, src_image->bits.bits,
3400 dest_image->bits.bits,
3401 src_image->bits.rowstride,
3402 dest_image->bits.rowstride,
3403 PIXMAN_FORMAT_BPP (src_image->bits.format),
3404 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3405 src_x, src_y, dest_x, dest_y, width, height);
3406 }
3407
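/* OVER of x8r8g8b8 (treated as opaque by forcing alpha to 0xff) through an
 * a8 mask onto an 8888 destination. Purely per-pixel: an 0xff mask stores
 * the source directly, anything else goes through in_over (). */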
3408 static void
3409 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3410 pixman_composite_info_t *info)
3411 {
3412 PIXMAN_COMPOSITE_ARGS (info);
3413 uint32_t *src, *src_line;
3414 uint32_t *dst, *dst_line;
3415 uint8_t *mask, *mask_line;
3416 int src_stride, mask_stride, dst_stride;
3417 int32_t w;
3418
3419 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3420 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3421 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3422
3423 while (height--)
3424 {
3425 src = src_line;
3426 src_line += src_stride;
3427 dst = dst_line;
3428 dst_line += dst_stride;
3429 mask = mask_line;
3430 mask_line += mask_stride;
3431
3432 w = width;
3433
3434 while (w--)
3435 {
3436 uint64_t m = *mask;
3437
3438 if (m)
3439 {
3440 uint32_t ssrc = *src | 0xff000000;
3441 __m64 s = load8888 (&ssrc);
3442
3443 if (m == 0xff)
3444 {
3445 store8888 (dst, s);
3446 }
3447 else
3448 {
3449 __m64 sa = expand_alpha (s);
3450 __m64 vm = expand_alpha_rev (to_m64 (m));
3451 __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3452
3453 store8888 (dst, vdest);
3454 }
3455 }
3456
3457 mask++;
3458 dst++;
3459 src++;
3460 }
3461 }
3462
3463 _mm_empty ();
3464 }
3465
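/* OVER_REVERSE with a solid source: the destination stays on top and the
 * solid color only shows through where the destination is transparent,
 * i.e. dst = dst OVER src. The aligned loop handles two pixels per
 * iteration. */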
3466 static void
3467 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3468 pixman_composite_info_t *info)
3469 {
3470 PIXMAN_COMPOSITE_ARGS (info);
3471 uint32_t src;
3472 uint32_t *dst_line, *dst;
3473 int32_t w;
3474 int dst_stride;
3475 __m64 vsrc;
3476
3477 CHECKPOINT ();
3478
3479 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3480
3481 if (src == 0)
3482 return;
3483
3484 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3485
3486 vsrc = load8888 (&src);
3487
3488 while (height--)
3489 {
3490 dst = dst_line;
3491 dst_line += dst_stride;
3492 w = width;
3493
3494 CHECKPOINT ();
3495
3496 while (w && (uintptr_t)dst & 7)
3497 {
3498 __m64 vdest = load8888 (dst);
3499
3500 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3501
3502 w--;
3503 dst++;
3504 }
3505
3506 while (w >= 2)
3507 {
3508 __m64 vdest = *(__m64 *)dst;
3509 __m64 dest0 = expand8888 (vdest, 0);
3510 __m64 dest1 = expand8888 (vdest, 1);
3511
3512
3513 dest0 = over (dest0, expand_alpha (dest0), vsrc);
3514 dest1 = over (dest1, expand_alpha (dest1), vsrc);
3515
3516 *(__m64 *)dst = pack8888 (dest0, dest1);
3517
3518 dst += 2;
3519 w -= 2;
3520 }
3521
3522 CHECKPOINT ();
3523
3524 if (w)
3525 {
3526 __m64 vdest = load8888 (dst);
3527
3528 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3529 }
3530 }
3531
3532 _mm_empty ();
3533 }
3534
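/* Helpers for the bilinear scaling scanlines below. A 2x2 block of source
 * pixels is fetched, weighted vertically by wt/wb and horizontally by the
 * fractional part of vx, using either pmaddwd (when
 * BILINEAR_INTERPOLATION_BITS < 8) or a mullo/mulhi pair otherwise. */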
3535 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3536 #define BMSK (BSHIFT - 1)
3537
3538 #define BILINEAR_DECLARE_VARIABLES \
3539 const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
3540 const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
3541 const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \
3542 const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
3543 const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
3544 const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
3545 const __m64 mm_zero = _mm_setzero_si64 (); \
3546 __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3547
3548 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
3549 do { \
3550 /* fetch 2x2 pixel block into 2 mmx registers */ \
3551 __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
3552 __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
3553 /* vertical interpolation */ \
3554 __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
3555 __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
3556 __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
3557 __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
3558 __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
3559 __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
3560 vx += unit_x; \
3561 if (BILINEAR_INTERPOLATION_BITS < 8) \
3562 { \
3563 /* calculate horizontal weights */ \
3564 __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
3565 _mm_srli_pi16 (mm_x, \
3566 16 - BILINEAR_INTERPOLATION_BITS))); \
3567 /* horizontal interpolation */ \
3568 __m64 p = _mm_unpacklo_pi16 (lo, hi); \
3569 __m64 q = _mm_unpackhi_pi16 (lo, hi); \
3570 lo = _mm_madd_pi16 (p, mm_wh); \
3571 hi = _mm_madd_pi16 (q, mm_wh); \
3572 } \
3573 else \
3574 { \
3575 /* calculate horizontal weights */ \
3576 __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \
3577 16 - BILINEAR_INTERPOLATION_BITS)); \
3578 __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \
3579 16 - BILINEAR_INTERPOLATION_BITS); \
3580 /* horizontal interpolation */ \
3581 __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \
3582 __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \
3583 __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \
3584 __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \
3585 lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \
3586 _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \
3587 hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \
3588 _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \
3589 } \
3590 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3591 /* shift and pack the result */ \
3592 hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
3593 lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
3594 lo = _mm_packs_pi32 (lo, hi); \
3595 lo = _mm_packs_pu16 (lo, lo); \
3596 pix = lo; \
3597 } while (0)
3598
3599 #define BILINEAR_SKIP_ONE_PIXEL() \
3600 do { \
3601 vx += unit_x; \
3602 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3603 } while(0)
3604
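/* SRC scanline for bilinear scaling of 8888 images; the
 * FAST_BILINEAR_MAINLOOP_COMMON instantiations that follow generate the
 * cover/pad/none/normal repeat variants referenced in the fast path
 * table. */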
3605 static force_inline void
3606 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst,
3607 const uint32_t * mask,
3608 const uint32_t * src_top,
3609 const uint32_t * src_bottom,
3610 int32_t w,
3611 int wt,
3612 int wb,
3613 pixman_fixed_t vx,
3614 pixman_fixed_t unit_x,
3615 pixman_fixed_t max_vx,
3616 pixman_bool_t zero_src)
3617 {
3618 BILINEAR_DECLARE_VARIABLES;
3619 __m64 pix;
3620
3621 while (w--)
3622 {
3623 BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3624 store (dst, pix);
3625 dst++;
3626 }
3627
3628 _mm_empty ();
3629 }
3630
3631 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3632 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3633 uint32_t, uint32_t, uint32_t,
3634 COVER, FLAG_NONE)
3635 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3636 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3637 uint32_t, uint32_t, uint32_t,
3638 PAD, FLAG_NONE)
3639 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3640 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3641 uint32_t, uint32_t, uint32_t,
3642 NONE, FLAG_NONE)
3643 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3644 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3645 uint32_t, uint32_t, uint32_t,
3646 NORMAL, FLAG_NONE)
3647
3648 static force_inline void
3649 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
3650 const uint32_t * mask,
3651 const uint32_t * src_top,
3652 const uint32_t * src_bottom,
3653 int32_t w,
3654 int wt,
3655 int wb,
3656 pixman_fixed_t vx,
3657 pixman_fixed_t unit_x,
3658 pixman_fixed_t max_vx,
3659 pixman_bool_t zero_src)
3660 {
3661 BILINEAR_DECLARE_VARIABLES;
3662 __m64 pix1, pix2;
3663
3664 while (w)
3665 {
3666 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3667
3668 if (!is_zero (pix1))
3669 {
3670 pix2 = load (dst);
3671 store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3672 }
3673
3674 w--;
3675 dst++;
3676 }
3677
3678 _mm_empty ();
3679 }
3680
3681 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3682 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3683 uint32_t, uint32_t, uint32_t,
3684 COVER, FLAG_NONE)
3685 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3686 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3687 uint32_t, uint32_t, uint32_t,
3688 PAD, FLAG_NONE)
3689 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3690 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3691 uint32_t, uint32_t, uint32_t,
3692 NONE, FLAG_NONE)
3693 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3694 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3695 uint32_t, uint32_t, uint32_t,
3696 NORMAL, FLAG_NONE)
3697
3698 static force_inline void
3699 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst,
3700 const uint8_t * mask,
3701 const uint32_t * src_top,
3702 const uint32_t * src_bottom,
3703 int32_t w,
3704 int wt,
3705 int wb,
3706 pixman_fixed_t vx,
3707 pixman_fixed_t unit_x,
3708 pixman_fixed_t max_vx,
3709 pixman_bool_t zero_src)
3710 {
3711 BILINEAR_DECLARE_VARIABLES;
3712 __m64 pix1, pix2;
3713 uint32_t m;
3714
3715 while (w)
3716 {
3717 m = (uint32_t) *mask++;
3718
3719 if (m)
3720 {
3721 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3722
3723 if (m == 0xff && is_opaque (pix1))
3724 {
3725 store (dst, pix1);
3726 }
3727 else
3728 {
3729 __m64 ms, md, ma, msa;
3730
3731 pix2 = load (dst);
3732 ma = expand_alpha_rev (to_m64 (m));
3733 ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3734 md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3735
3736 msa = expand_alpha (ms);
3737
3738 store8888 (dst, (in_over (ms, msa, ma, md)));
3739 }
3740 }
3741 else
3742 {
3743 BILINEAR_SKIP_ONE_PIXEL ();
3744 }
3745
3746 w--;
3747 dst++;
3748 }
3749
3750 _mm_empty ();
3751 }
3752
3753 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3754 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3755 uint32_t, uint8_t, uint32_t,
3756 COVER, FLAG_HAVE_NON_SOLID_MASK)
3757 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3758 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3759 uint32_t, uint8_t, uint32_t,
3760 PAD, FLAG_HAVE_NON_SOLID_MASK)
3761 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3762 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3763 uint32_t, uint8_t, uint32_t,
3764 NONE, FLAG_HAVE_NON_SOLID_MASK)
3765 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3766 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3767 uint32_t, uint8_t, uint32_t,
3768 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3769
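/* Narrow scanline fetchers used by the iterator interface: each expands a
 * scanline of the named format to 32-bit a8r8g8b8 in iter->buffer and
 * advances iter->bits by one stride. */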
3770 static uint32_t *
3771 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3772 {
3773 int w = iter->width;
3774 uint32_t *dst = iter->buffer;
3775 uint32_t *src = (uint32_t *)iter->bits;
3776
3777 iter->bits += iter->stride;
3778
3779 while (w && ((uintptr_t)dst) & 7)
3780 {
3781 *dst++ = (*src++) | 0xff000000;
3782 w--;
3783 }
3784
3785 while (w >= 8)
3786 {
3787 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3788 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3789 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3790 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3791
3792 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3793 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3794 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3795 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3796
3797 dst += 8;
3798 src += 8;
3799 w -= 8;
3800 }
3801
3802 while (w)
3803 {
3804 *dst++ = (*src++) | 0xff000000;
3805 w--;
3806 }
3807
3808 _mm_empty ();
3809 return iter->buffer;
3810 }
3811
3812 static uint32_t *
3813 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3814 {
3815 int w = iter->width;
3816 uint32_t *dst = iter->buffer;
3817 uint16_t *src = (uint16_t *)iter->bits;
3818
3819 iter->bits += iter->stride;
3820
3821 while (w && ((uintptr_t)dst) & 0x0f)
3822 {
3823 uint16_t s = *src++;
3824
3825 *dst++ = convert_0565_to_8888 (s);
3826 w--;
3827 }
3828
3829 while (w >= 4)
3830 {
3831 __m64 vsrc = ldq_u ((__m64 *)src);
3832 __m64 mm0, mm1;
3833
3834 expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3835
3836 *(__m64 *)(dst + 0) = mm0;
3837 *(__m64 *)(dst + 2) = mm1;
3838
3839 dst += 4;
3840 src += 4;
3841 w -= 4;
3842 }
3843
3844 while (w)
3845 {
3846 uint16_t s = *src++;
3847
3848 *dst++ = convert_0565_to_8888 (s);
3849 w--;
3850 }
3851
3852 _mm_empty ();
3853 return iter->buffer;
3854 }
3855
3856 static uint32_t *
3857 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3858 {
3859 int w = iter->width;
3860 uint32_t *dst = iter->buffer;
3861 uint8_t *src = iter->bits;
3862
3863 iter->bits += iter->stride;
3864
3865 while (w && (((uintptr_t)dst) & 15))
3866 {
3867 *dst++ = *(src++) << 24;
3868 w--;
3869 }
3870
3871 while (w >= 8)
3872 {
3873 __m64 mm0 = ldq_u ((__m64 *)src);
3874
3875 __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
3876 __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
3877 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3878 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3879 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3880 __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3881
3882 *(__m64 *)(dst + 0) = mm3;
3883 *(__m64 *)(dst + 2) = mm4;
3884 *(__m64 *)(dst + 4) = mm5;
3885 *(__m64 *)(dst + 6) = mm6;
3886
3887 dst += 8;
3888 src += 8;
3889 w -= 8;
3890 }
3891
3892 while (w)
3893 {
3894 *dst++ = *(src++) << 24;
3895 w--;
3896 }
3897
3898 _mm_empty ();
3899 return iter->buffer;
3900 }
3901
3902 typedef struct
3903 {
3904 pixman_format_code_t format;
3905 pixman_iter_get_scanline_t get_scanline;
3906 } fetcher_info_t;
3907
3908 static const fetcher_info_t fetchers[] =
3909 {
3910 { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 },
3911 { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 },
3912 { PIXMAN_a8, mmx_fetch_a8 },
3913 { PIXMAN_null }
3914 };
3915
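/* Install one of the fetchers above as the source iterator when a narrow
 * (32 bpp) iterator was requested over an untransformed bits image in a
 * supported format; otherwise return FALSE so a more general iterator is
 * used instead. */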
3916 static pixman_bool_t
3917 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
3918 {
3919 pixman_image_t *image = iter->image;
3920
3921 #define FLAGS \
3922 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
3923 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3924
3925 if ((iter->iter_flags & ITER_NARROW) &&
3926 (iter->image_flags & FLAGS) == FLAGS)
3927 {
3928 const fetcher_info_t *f;
3929
3930 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
3931 {
3932 if (image->common.extended_format_code == f->format)
3933 {
3934 uint8_t *b = (uint8_t *)image->bits.bits;
3935 int s = image->bits.rowstride * 4;
3936
3937 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
3938 iter->stride = s;
3939
3940 iter->get_scanline = f->get_scanline;
3941 return TRUE;
3942 }
3943 }
3944 }
3945
3946 return FALSE;
3947 }
3948
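/* Fast path table: operator, source, mask and destination formats that map
 * directly onto the composite functions above. Anything not listed here is
 * delegated to the fallback implementation. */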
3949 static const pixman_fast_path_t mmx_fast_paths[] =
3950 {
3951 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
3952 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
3953 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
3954 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
3955 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
3956 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
3957 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3958 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3959 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
3960 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3961 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3962 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
3963 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3964 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
3965 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
3966 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3967 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
3968 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
3969 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
3970 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
3971 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
3972 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
3973 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
3974 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
3975 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
3976 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
3977 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
3978 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
3979 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
3980 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
3981 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
3982 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
3983 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
3984 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
3985 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
3986 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
3987
3988 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
3989 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
3990 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
3991 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
3992 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
3993 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
3994
3995 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
3996 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
3997
3998 PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
3999 PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
4000 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
4001 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
4002 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
4003 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
4004
4005 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4006 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4007 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4008 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4009 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
4010 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
4011 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
4012 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
4013 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
4014 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
4015 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4016 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4017 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4018 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4019 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
4020 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
4021
4022 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
4023 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
4024
4025 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4026 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4027 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4028 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4029 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4030 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4031
4032 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4033 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4034 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4035 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4036
4037 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
4038 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
4039 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
4040 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),
4041
4042 { PIXMAN_OP_NONE },
4043 };
4044
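/* Create the MMX implementation. Called from the architecture-specific
 * setup code (e.g. _pixman_x86_get_implementations ()) when MMX, Loongson
 * MMI or iwMMXt support is detected at runtime; 'fallback' handles
 * everything this implementation does not accelerate. */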
4045 pixman_implementation_t *
4046 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4047 {
4048 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4049
4050 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4051 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4052 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4053 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4054 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4055 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4056 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4057 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4058 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4059 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4060 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4061
4062 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4063 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4064 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4065 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4066 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4067 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4068 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4069 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4070 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4071 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4072 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4073
4074 imp->blt = mmx_blt;
4075 imp->fill = mmx_fill;
4076
4077 imp->src_iter_init = mmx_src_iter_init;
4078
4079 return imp;
4080 }
4081
4082 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
4083