1 /*
2  * Copyright © 2004, 2005 Red Hat, Inc.
3  * Copyright © 2004 Nicholas Miell
4  * Copyright © 2005 Trolltech AS
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of Red Hat not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  Red Hat makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23  * SOFTWARE.
24  *
25  * Author:  Søren Sandmann (sandmann@redhat.com)
26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28  *
29  * Based on work by Owen Taylor
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35 
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
37 
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
46 
47 #ifdef VERBOSE
48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
49 #else
50 #define CHECKPOINT()
51 #endif
52 
53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56 _mm_empty (void)
57 {
58 
59 }
60 #endif
61 
62 #ifdef USE_X86_MMX
63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64 #  include <xmmintrin.h>
65 # else
66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67  * instructions to be generated that we don't want. Just duplicate the
68  * functions we want to use.  */
69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_movemask_pi8 (__m64 __A)
71 {
72     int ret;
73 
74     asm ("pmovmskb %1, %0\n\t"
75 	: "=r" (ret)
76 	: "y" (__A)
77     );
78 
79     return ret;
80 }
81 
82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
84 {
85     asm ("pmulhuw %1, %0\n\t"
86 	: "+y" (__A)
87 	: "y" (__B)
88     );
89     return __A;
90 }
91 
92 #  ifdef __OPTIMIZE__
93 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
94 _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
95 {
96     __m64 ret;
97 
98     asm ("pshufw %2, %1, %0\n\t"
99 	: "=y" (ret)
100 	: "y" (__A), "K" (__N)
101     );
102 
103     return ret;
104 }
105 #  else
106 #   define _mm_shuffle_pi16(A, N)					\
107     ({									\
108 	__m64 ret;							\
109 									\
110 	asm ("pshufw %2, %1, %0\n\t"					\
111 	     : "=y" (ret)						\
112 	     : "y" (A), "K" ((const int8_t)N)				\
113 	);								\
114 									\
115 	ret;								\
116     })
117 #  endif
118 # endif
119 #endif
120 
121 #ifndef _MSC_VER
122 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
123  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
124 #endif
125 
126 /* Notes about writing mmx code
127  *
128  * give memory operands as the second operand. If you give it as the
129  * first, gcc will first load it into a register, then use that
130  * register
131  *
132  *   ie. use
133  *
134  *         _mm_mullo_pi16 (x, mmx_constant);
135  *
136  *   not
137  *
138  *         _mm_mullo_pi16 (mmx_constant, x);
139  *
140  * Also try to minimize dependencies. i.e. when you need a value, try
141  * to calculate it from a value that was calculated as early as
142  * possible.
143  */
144 
145 /* --------------- MMX primitives ------------------------------------- */
146 
147 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
148  * the name of the member used to access the data.
149  * If __m64 requires using mm_cvt* intrinsics functions to convert between
150  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
151  * If __m64 and uint64_t values can just be cast to each other directly,
152  * then define USE_M64_CASTS.
153  * If __m64 is a double datatype, then define USE_M64_DOUBLE.
154  */
155 #ifdef _MSC_VER
156 # define M64_MEMBER m64_u64
157 #elif defined(__ICC)
158 # define USE_CVT_INTRINSICS
159 #elif defined(USE_LOONGSON_MMI)
160 # define USE_M64_DOUBLE
161 #elif defined(__GNUC__)
162 # define USE_M64_CASTS
163 #elif defined(__SUNPRO_C)
164 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
165 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
166  * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
167  * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
168  */
169 #  define USE_CVT_INTRINSICS
170 # else
171 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
172  * disabled, __m64 is defined as a struct containing "unsigned long long l_".
173  */
174 #  define M64_MEMBER l_
175 # endif
176 #endif
177 
178 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
179 typedef uint64_t mmxdatafield;
180 #else
181 typedef __m64 mmxdatafield;
182 #endif
183 
184 typedef struct
185 {
186     mmxdatafield mmx_4x00ff;
187     mmxdatafield mmx_4x0080;
188     mmxdatafield mmx_565_rgb;
189     mmxdatafield mmx_565_unpack_multiplier;
190     mmxdatafield mmx_565_pack_multiplier;
191     mmxdatafield mmx_565_r;
192     mmxdatafield mmx_565_g;
193     mmxdatafield mmx_565_b;
194     mmxdatafield mmx_packed_565_rb;
195     mmxdatafield mmx_packed_565_g;
196     mmxdatafield mmx_expand_565_g;
197     mmxdatafield mmx_expand_565_b;
198     mmxdatafield mmx_expand_565_r;
199 #ifndef USE_LOONGSON_MMI
200     mmxdatafield mmx_mask_0;
201     mmxdatafield mmx_mask_1;
202     mmxdatafield mmx_mask_2;
203     mmxdatafield mmx_mask_3;
204 #endif
205     mmxdatafield mmx_full_alpha;
206     mmxdatafield mmx_4x0101;
207     mmxdatafield mmx_ff000000;
208 } mmx_data_t;
209 
210 #if defined(_MSC_VER)
211 # define MMXDATA_INIT(field, val) { val ## UI64 }
212 #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
213 # define MMXDATA_INIT(field, val) field =   { val ## ULL }
214 #else                           /* mmxdatafield is an integral type */
215 # define MMXDATA_INIT(field, val) field =   val ## ULL
216 #endif
217 
218 static const mmx_data_t c =
219 {
220     MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
221     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
222     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
223     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
224     MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
225     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
226     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
227     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
228     MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
229     MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
230     MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
231     MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
232     MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
233 #ifndef USE_LOONGSON_MMI
234     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
235     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
236     MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
237     MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
238 #endif
239     MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
240     MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
241     MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
242 };
243 
244 #ifdef USE_CVT_INTRINSICS
245 #    define MC(x) to_m64 (c.mmx_ ## x)
246 #elif defined(USE_M64_CASTS)
247 #    define MC(x) ((__m64)c.mmx_ ## x)
248 #elif defined(USE_M64_DOUBLE)
249 #    define MC(x) (*(__m64 *)&c.mmx_ ## x)
250 #else
251 #    define MC(x) c.mmx_ ## x
252 #endif
253 
254 static force_inline __m64
255 to_m64 (uint64_t x)
256 {
257 #ifdef USE_CVT_INTRINSICS
258     return _mm_cvtsi64_m64 (x);
259 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
260     __m64 res;
261 
262     res.M64_MEMBER = x;
263     return res;
264 #elif defined USE_M64_DOUBLE
265     return *(__m64 *)&x;
266 #else /* USE_M64_CASTS */
267     return (__m64)x;
268 #endif
269 }
270 
271 static force_inline uint64_t
272 to_uint64 (__m64 x)
273 {
274 #ifdef USE_CVT_INTRINSICS
275     return _mm_cvtm64_si64 (x);
276 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
277     uint64_t res = x.M64_MEMBER;
278     return res;
279 #elif defined USE_M64_DOUBLE
280     return *(uint64_t *)&x;
281 #else /* USE_M64_CASTS */
282     return (uint64_t)x;
283 #endif
284 }
285 
286 static force_inline __m64
287 shift (__m64 v,
288        int   s)
289 {
290     if (s > 0)
291 	return _mm_slli_si64 (v, s);
292     else if (s < 0)
293 	return _mm_srli_si64 (v, -s);
294     else
295 	return v;
296 }
297 
298 static force_inline __m64
299 negate (__m64 mask)
300 {
301     return _mm_xor_si64 (mask, MC (4x00ff));
302 }
303 
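/* Per-channel multiply of two expanded (16 bits per channel) pixels:
 * each lane computes ((a * b + 128) * 257) >> 16, which is an exact,
 * rounded divide of a * b by 255.
 */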
304 static force_inline __m64
305 pix_multiply (__m64 a, __m64 b)
306 {
307     __m64 res;
308 
309     res = _mm_mullo_pi16 (a, b);
310     res = _mm_adds_pu16 (res, MC (4x0080));
311     res = _mm_mulhi_pu16 (res, MC (4x0101));
312 
313     return res;
314 }
315 
316 static force_inline __m64
317 pix_add (__m64 a, __m64 b)
318 {
319     return _mm_adds_pu8 (a, b);
320 }
321 
322 static force_inline __m64
323 expand_alpha (__m64 pixel)
324 {
325     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
326 }
327 
328 static force_inline __m64
329 expand_alpha_rev (__m64 pixel)
330 {
331     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
332 }
333 
334 static force_inline __m64
335 invert_colors (__m64 pixel)
336 {
337     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
338 }
339 
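/* Porter-Duff OVER for premultiplied pixels:
 * dest' = src + dest * (255 - srca) / 255, where srca is the source
 * alpha already expanded by the caller so it can be reused.
 */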
340 static force_inline __m64
341 over (__m64 src,
342       __m64 srca,
343       __m64 dest)
344 {
345     return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
346 }
347 
348 static force_inline __m64
349 over_rev_non_pre (__m64 src, __m64 dest)
350 {
351     __m64 srca = expand_alpha (src);
352     __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
353 
354     return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
355 }
356 
357 static force_inline __m64
358 in (__m64 src, __m64 mask)
359 {
360     return pix_multiply (src, mask);
361 }
362 
363 #ifndef _MSC_VER
364 static force_inline __m64
365 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
366 {
367     return over (in (src, mask), pix_multiply (srca, mask), dest);
368 }
369 
370 #else
371 
372 #define in_over(src, srca, mask, dest)					\
373     over (in (src, mask), pix_multiply (srca, mask), dest)
374 
375 #endif
376 
377 /* Elemental unaligned loads */
378 
379 static force_inline __m64 ldq_u(__m64 *p)
380 {
381 #ifdef USE_X86_MMX
382     /* x86's alignment restrictions are very relaxed. */
383     return *(__m64 *)p;
384 #elif defined USE_ARM_IWMMXT
385     int align = (uintptr_t)p & 7;
386     __m64 *aligned_p;
387     if (align == 0)
388 	return *p;
389     aligned_p = (__m64 *)((uintptr_t)p & ~7);
390     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
391 #else
392     struct __una_u64 { __m64 x __attribute__((packed)); };
393     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
394     return (__m64) ptr->x;
395 #endif
396 }
397 
398 static force_inline uint32_t ldl_u(const uint32_t *p)
399 {
400 #ifdef USE_X86_MMX
401     /* x86's alignment restrictions are very relaxed. */
402     return *p;
403 #else
404     struct __una_u32 { uint32_t x __attribute__((packed)); };
405     const struct __una_u32 *ptr = (const struct __una_u32 *) p;
406     return ptr->x;
407 #endif
408 }
409 
410 static force_inline __m64
411 load (const uint32_t *v)
412 {
413 #ifdef USE_LOONGSON_MMI
414     __m64 ret;
415     asm ("lwc1 %0, %1\n\t"
416 	: "=f" (ret)
417 	: "m" (*v)
418     );
419     return ret;
420 #else
421     return _mm_cvtsi32_si64 (*v);
422 #endif
423 }
424 
425 static force_inline __m64
426 load8888 (const uint32_t *v)
427 {
428 #ifdef USE_LOONGSON_MMI
429     return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
430 #else
431     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
432 #endif
433 }
434 
435 static force_inline __m64
436 load8888u (const uint32_t *v)
437 {
438     uint32_t l = ldl_u (v);
439     return load8888 (&l);
440 }
441 
442 static force_inline __m64
443 pack8888 (__m64 lo, __m64 hi)
444 {
445     return _mm_packs_pu16 (lo, hi);
446 }
447 
448 static force_inline void
449 store (uint32_t *dest, __m64 v)
450 {
451 #ifdef USE_LOONGSON_MMI
452     asm ("swc1 %1, %0\n\t"
453 	: "=m" (*dest)
454 	: "f" (v)
455 	: "memory"
456     );
457 #else
458     *dest = _mm_cvtsi64_si32 (v);
459 #endif
460 }
461 
462 static force_inline void
463 store8888 (uint32_t *dest, __m64 v)
464 {
465     v = pack8888 (v, _mm_setzero_si64 ());
466     store (dest, v);
467 }
468 
469 static force_inline pixman_bool_t
470 is_equal (__m64 a, __m64 b)
471 {
472 #ifdef USE_LOONGSON_MMI
473     /* __m64 is double, we can compare directly. */
474     return a == b;
475 #else
476     return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
477 #endif
478 }
479 
480 static force_inline pixman_bool_t
481 is_opaque (__m64 v)
482 {
483 #ifdef USE_LOONGSON_MMI
484     return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
485 #else
486     __m64 ffs = _mm_cmpeq_pi8 (v, v);
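    /* v is an expanded pixel (00AA00RR00GG00BB), so byte 6 holds the alpha;
     * bit 6 of the compare mask is set only when alpha == 0xff. */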
487     return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
488 #endif
489 }
490 
491 static force_inline pixman_bool_t
492 is_zero (__m64 v)
493 {
494     return is_equal (v, _mm_setzero_si64 ());
495 }
496 
497 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
498  *
499  *    00RR00GG00BB
500  *
501  * --- Expanding 565 in the low word ---
502  *
503  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
504  * m = m & (01f0003f001f);
505  * m = m * (008404100840);
506  * m = m >> 8;
507  *
508  * Note the trick here - the top word is shifted by another nibble to
509  * avoid it bumping into the middle word
510  */
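/* For example, the all-ones 565 pixel 0xffff expands to 0x00ff00ff00ff:
 * the multiply-and-shift replicates each field's top bits into its low
 * bits, normalizing every channel to a full 8-bit value.
 */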
511 static force_inline __m64
512 expand565 (__m64 pixel, int pos)
513 {
514     __m64 p = pixel;
515     __m64 t1, t2;
516 
517     /* move pixel to low 16 bit and zero the rest */
518 #ifdef USE_LOONGSON_MMI
519     p = loongson_extract_pi16 (p, pos);
520 #else
521     p = shift (shift (p, (3 - pos) * 16), -48);
522 #endif
523 
524     t1 = shift (p, 36 - 11);
525     t2 = shift (p, 16 - 5);
526 
527     p = _mm_or_si64 (t1, p);
528     p = _mm_or_si64 (t2, p);
529     p = _mm_and_si64 (p, MC (565_rgb));
530 
531     pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
532     return _mm_srli_pi16 (pixel, 8);
533 }
534 
535 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
536  *
537  *    AARRGGBBRRGGBB
538  */
539 static force_inline void
540 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
541 {
542     __m64 t0, t1, alpha = _mm_setzero_si64 ();
543     __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
544     __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
545     __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
546     if (full_alpha)
547 	alpha = _mm_cmpeq_pi32 (alpha, alpha);
548 
549     /* Replicate high bits into empty low bits. */
550     r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
551     g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
552     b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
553 
554     r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
555     g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
556     b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
557 
558     t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
559     t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
560 
561     *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
562     *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
563 }
564 
565 static force_inline __m64
566 expand8888 (__m64 in, int pos)
567 {
568     if (pos == 0)
569 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
570     else
571 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
572 }
573 
574 static force_inline __m64
575 expandx888 (__m64 in, int pos)
576 {
577     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
578 }
579 
580 static force_inline void
581 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
582 {
583     __m64 v0, v1;
584     expand_4xpacked565 (vin, &v0, &v1, full_alpha);
585     *vout0 = expand8888 (v0, 0);
586     *vout1 = expand8888 (v0, 1);
587     *vout2 = expand8888 (v1, 0);
588     *vout3 = expand8888 (v1, 1);
589 }
590 
591 static force_inline __m64
592 pack_565 (__m64 pixel, __m64 target, int pos)
593 {
594     __m64 p = pixel;
595     __m64 t = target;
596     __m64 r, g, b;
597 
598     r = _mm_and_si64 (p, MC (565_r));
599     g = _mm_and_si64 (p, MC (565_g));
600     b = _mm_and_si64 (p, MC (565_b));
601 
602 #ifdef USE_LOONGSON_MMI
603     r = shift (r, -(32 - 8));
604     g = shift (g, -(16 - 3));
605     b = shift (b, -(0  + 3));
606 
607     p = _mm_or_si64 (r, g);
608     p = _mm_or_si64 (p, b);
609     return loongson_insert_pi16 (t, p, pos);
610 #else
611     r = shift (r, -(32 - 8) + pos * 16);
612     g = shift (g, -(16 - 3) + pos * 16);
613     b = shift (b, -(0  + 3) + pos * 16);
614 
615     if (pos == 0)
616 	t = _mm_and_si64 (t, MC (mask_0));
617     else if (pos == 1)
618 	t = _mm_and_si64 (t, MC (mask_1));
619     else if (pos == 2)
620 	t = _mm_and_si64 (t, MC (mask_2));
621     else if (pos == 3)
622 	t = _mm_and_si64 (t, MC (mask_3));
623 
624     p = _mm_or_si64 (r, t);
625     p = _mm_or_si64 (g, p);
626 
627     return _mm_or_si64 (b, p);
628 #endif
629 }
630 
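/* Pack two registers of two a8r8g8b8 pixels each into four r5g6b5 pixels.
 * The multiply-add against 565_pack_multiplier slides the masked red and
 * blue fields of each pixel toward their 565 positions in one instruction;
 * green is masked and OR'd in, and a final shift lines the result up.
 */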
631 static force_inline __m64
632 pack_4xpacked565 (__m64 a, __m64 b)
633 {
634     __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
635     __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
636 
637     __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
638     __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
639 
640     __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
641     __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
642 
643     t0 = _mm_or_si64 (t0, g0);
644     t1 = _mm_or_si64 (t1, g1);
645 
646     t0 = shift(t0, -5);
647 #ifdef USE_ARM_IWMMXT
648     t1 = shift(t1, -5);
649     return _mm_packs_pu32 (t0, t1);
650 #else
651     t1 = shift(t1, -5 + 16);
652     return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
653 #endif
654 }
655 
656 #ifndef _MSC_VER
657 
658 static force_inline __m64
659 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
660 {
661     return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
662 }
663 
664 static force_inline __m64
665 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
666 {
667     x = pix_multiply (x, a);
668     y = pix_multiply (y, b);
669 
670     return pix_add (x, y);
671 }
672 
673 #else
674 
675 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
676 
677 #define pack_4x565(v0, v1, v2, v3) \
678     pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
679 
680 #define pix_add_mul(x, a, y, b)	 \
681     ( x = pix_multiply (x, a),	 \
682       y = pix_multiply (y, b),	 \
683       pix_add (x, y) )
684 
685 #endif
686 
687 /* --------------- MMX code patch for fbcompose.c --------------------- */
688 
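/* Load one source pixel and, when a mask is present, multiply it by the
 * mask's expanded alpha; shared by the unified (_u) combiners below. */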
689 static force_inline __m64
690 combine (const uint32_t *src, const uint32_t *mask)
691 {
692     __m64 vsrc = load8888 (src);
693 
694     if (mask)
695     {
696 	__m64 m = load8888 (mask);
697 
698 	m = expand_alpha (m);
699 	vsrc = pix_multiply (vsrc, m);
700     }
701 
702     return vsrc;
703 }
704 
705 static force_inline __m64
706 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
707 {
708     vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
709 
710     if (is_opaque (vsrc))
711     {
712 	return vsrc;
713     }
714     else if (!is_zero (vsrc))
715     {
716 	return over (vsrc, expand_alpha (vsrc),
717 		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
718     }
719 
720     return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
721 }
722 
723 static void
724 mmx_combine_over_u (pixman_implementation_t *imp,
725                     pixman_op_t              op,
726                     uint32_t *               dest,
727                     const uint32_t *         src,
728                     const uint32_t *         mask,
729                     int                      width)
730 {
731     const uint32_t *end = dest + width;
732 
733     while (dest < end)
734     {
735 	__m64 vsrc = combine (src, mask);
736 
737 	if (is_opaque (vsrc))
738 	{
739 	    store8888 (dest, vsrc);
740 	}
741 	else if (!is_zero (vsrc))
742 	{
743 	    __m64 sa = expand_alpha (vsrc);
744 	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
745 	}
746 
747 	++dest;
748 	++src;
749 	if (mask)
750 	    ++mask;
751     }
752     _mm_empty ();
753 }
754 
755 static void
756 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
757                             pixman_op_t              op,
758                             uint32_t *               dest,
759                             const uint32_t *         src,
760                             const uint32_t *         mask,
761                             int                      width)
762 {
763     const uint32_t *end = dest + width;
764 
765     while (dest < end)
766     {
767 	__m64 d, da;
768 	__m64 s = combine (src, mask);
769 
770 	d = load8888 (dest);
771 	da = expand_alpha (d);
772 	store8888 (dest, over (d, da, s));
773 
774 	++dest;
775 	++src;
776 	if (mask)
777 	    mask++;
778     }
779     _mm_empty ();
780 }
781 
782 static void
783 mmx_combine_in_u (pixman_implementation_t *imp,
784                   pixman_op_t              op,
785                   uint32_t *               dest,
786                   const uint32_t *         src,
787                   const uint32_t *         mask,
788                   int                      width)
789 {
790     const uint32_t *end = dest + width;
791 
792     while (dest < end)
793     {
794 	__m64 a;
795 	__m64 x = combine (src, mask);
796 
797 	a = load8888 (dest);
798 	a = expand_alpha (a);
799 	x = pix_multiply (x, a);
800 
801 	store8888 (dest, x);
802 
803 	++dest;
804 	++src;
805 	if (mask)
806 	    mask++;
807     }
808     _mm_empty ();
809 }
810 
811 static void
812 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
813                           pixman_op_t              op,
814                           uint32_t *               dest,
815                           const uint32_t *         src,
816                           const uint32_t *         mask,
817                           int                      width)
818 {
819     const uint32_t *end = dest + width;
820 
821     while (dest < end)
822     {
823 	__m64 a = combine (src, mask);
824 	__m64 x;
825 
826 	x = load8888 (dest);
827 	a = expand_alpha (a);
828 	x = pix_multiply (x, a);
829 	store8888 (dest, x);
830 
831 	++dest;
832 	++src;
833 	if (mask)
834 	    mask++;
835     }
836     _mm_empty ();
837 }
838 
839 static void
840 mmx_combine_out_u (pixman_implementation_t *imp,
841                    pixman_op_t              op,
842                    uint32_t *               dest,
843                    const uint32_t *         src,
844                    const uint32_t *         mask,
845                    int                      width)
846 {
847     const uint32_t *end = dest + width;
848 
849     while (dest < end)
850     {
851 	__m64 a;
852 	__m64 x = combine (src, mask);
853 
854 	a = load8888 (dest);
855 	a = expand_alpha (a);
856 	a = negate (a);
857 	x = pix_multiply (x, a);
858 	store8888 (dest, x);
859 
860 	++dest;
861 	++src;
862 	if (mask)
863 	    mask++;
864     }
865     _mm_empty ();
866 }
867 
868 static void
869 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
870                            pixman_op_t              op,
871                            uint32_t *               dest,
872                            const uint32_t *         src,
873                            const uint32_t *         mask,
874                            int                      width)
875 {
876     const uint32_t *end = dest + width;
877 
878     while (dest < end)
879     {
880 	__m64 a = combine (src, mask);
881 	__m64 x;
882 
883 	x = load8888 (dest);
884 	a = expand_alpha (a);
885 	a = negate (a);
886 	x = pix_multiply (x, a);
887 
888 	store8888 (dest, x);
889 
890 	++dest;
891 	++src;
892 	if (mask)
893 	    mask++;
894     }
895     _mm_empty ();
896 }
897 
898 static void
899 mmx_combine_atop_u (pixman_implementation_t *imp,
900                     pixman_op_t              op,
901                     uint32_t *               dest,
902                     const uint32_t *         src,
903                     const uint32_t *         mask,
904                     int                      width)
905 {
906     const uint32_t *end = dest + width;
907 
908     while (dest < end)
909     {
910 	__m64 da, d, sia;
911 	__m64 s = combine (src, mask);
912 
913 	d = load8888 (dest);
914 	sia = expand_alpha (s);
915 	sia = negate (sia);
916 	da = expand_alpha (d);
917 	s = pix_add_mul (s, da, d, sia);
918 	store8888 (dest, s);
919 
920 	++dest;
921 	++src;
922 	if (mask)
923 	    mask++;
924     }
925     _mm_empty ();
926 }
927 
928 static void
929 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
930                             pixman_op_t              op,
931                             uint32_t *               dest,
932                             const uint32_t *         src,
933                             const uint32_t *         mask,
934                             int                      width)
935 {
936     const uint32_t *end;
937 
938     end = dest + width;
939 
940     while (dest < end)
941     {
942 	__m64 dia, d, sa;
943 	__m64 s = combine (src, mask);
944 
945 	d = load8888 (dest);
946 	sa = expand_alpha (s);
947 	dia = expand_alpha (d);
948 	dia = negate (dia);
949 	s = pix_add_mul (s, dia, d, sa);
950 	store8888 (dest, s);
951 
952 	++dest;
953 	++src;
954 	if (mask)
955 	    mask++;
956     }
957     _mm_empty ();
958 }
959 
960 static void
961 mmx_combine_xor_u (pixman_implementation_t *imp,
962                    pixman_op_t              op,
963                    uint32_t *               dest,
964                    const uint32_t *         src,
965                    const uint32_t *         mask,
966                    int                      width)
967 {
968     const uint32_t *end = dest + width;
969 
970     while (dest < end)
971     {
972 	__m64 dia, d, sia;
973 	__m64 s = combine (src, mask);
974 
975 	d = load8888 (dest);
976 	sia = expand_alpha (s);
977 	dia = expand_alpha (d);
978 	sia = negate (sia);
979 	dia = negate (dia);
980 	s = pix_add_mul (s, dia, d, sia);
981 	store8888 (dest, s);
982 
983 	++dest;
984 	++src;
985 	if (mask)
986 	    mask++;
987     }
988     _mm_empty ();
989 }
990 
991 static void
992 mmx_combine_add_u (pixman_implementation_t *imp,
993                    pixman_op_t              op,
994                    uint32_t *               dest,
995                    const uint32_t *         src,
996                    const uint32_t *         mask,
997                    int                      width)
998 {
999     const uint32_t *end = dest + width;
1000 
1001     while (dest < end)
1002     {
1003 	__m64 d;
1004 	__m64 s = combine (src, mask);
1005 
1006 	d = load8888 (dest);
1007 	s = pix_add (s, d);
1008 	store8888 (dest, s);
1009 
1010 	++dest;
1011 	++src;
1012 	if (mask)
1013 	    mask++;
1014     }
1015     _mm_empty ();
1016 }
1017 
1018 static void
1019 mmx_combine_saturate_u (pixman_implementation_t *imp,
1020                         pixman_op_t              op,
1021                         uint32_t *               dest,
1022                         const uint32_t *         src,
1023                         const uint32_t *         mask,
1024                         int                      width)
1025 {
1026     const uint32_t *end = dest + width;
1027 
1028     while (dest < end)
1029     {
1030 	uint32_t s, sa, da;
1031 	uint32_t d = *dest;
1032 	__m64 ms = combine (src, mask);
1033 	__m64 md = load8888 (dest);
1034 
1035 	store8888(&s, ms);
1036 	da = ~d >> 24;
1037 	sa = s >> 24;
1038 
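	/* da is the headroom left in the destination alpha; when the source
	 * alpha exceeds it, scale the source by da/sa so the saturating add
	 * below does not overflow the destination. */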
1039 	if (sa > da)
1040 	{
1041 	    uint32_t quot = DIV_UN8 (da, sa) << 24;
1042 	    __m64 msa = load8888 (&quot);
1043 	    msa = expand_alpha (msa);
1044 	    ms = pix_multiply (ms, msa);
1045 	}
1046 
1047 	md = pix_add (md, ms);
1048 	store8888 (dest, md);
1049 
1050 	++src;
1051 	++dest;
1052 	if (mask)
1053 	    mask++;
1054     }
1055     _mm_empty ();
1056 }
1057 
1058 static void
1059 mmx_combine_src_ca (pixman_implementation_t *imp,
1060                     pixman_op_t              op,
1061                     uint32_t *               dest,
1062                     const uint32_t *         src,
1063                     const uint32_t *         mask,
1064                     int                      width)
1065 {
1066     const uint32_t *end = src + width;
1067 
1068     while (src < end)
1069     {
1070 	__m64 a = load8888 (mask);
1071 	__m64 s = load8888 (src);
1072 
1073 	s = pix_multiply (s, a);
1074 	store8888 (dest, s);
1075 
1076 	++src;
1077 	++mask;
1078 	++dest;
1079     }
1080     _mm_empty ();
1081 }
1082 
1083 static void
1084 mmx_combine_over_ca (pixman_implementation_t *imp,
1085                      pixman_op_t              op,
1086                      uint32_t *               dest,
1087                      const uint32_t *         src,
1088                      const uint32_t *         mask,
1089                      int                      width)
1090 {
1091     const uint32_t *end = src + width;
1092 
1093     while (src < end)
1094     {
1095 	__m64 a = load8888 (mask);
1096 	__m64 s = load8888 (src);
1097 	__m64 d = load8888 (dest);
1098 	__m64 sa = expand_alpha (s);
1099 
1100 	store8888 (dest, in_over (s, sa, a, d));
1101 
1102 	++src;
1103 	++dest;
1104 	++mask;
1105     }
1106     _mm_empty ();
1107 }
1108 
1109 static void
1110 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1111                              pixman_op_t              op,
1112                              uint32_t *               dest,
1113                              const uint32_t *         src,
1114                              const uint32_t *         mask,
1115                              int                      width)
1116 {
1117     const uint32_t *end = src + width;
1118 
1119     while (src < end)
1120     {
1121 	__m64 a = load8888 (mask);
1122 	__m64 s = load8888 (src);
1123 	__m64 d = load8888 (dest);
1124 	__m64 da = expand_alpha (d);
1125 
1126 	store8888 (dest, over (d, da, in (s, a)));
1127 
1128 	++src;
1129 	++dest;
1130 	++mask;
1131     }
1132     _mm_empty ();
1133 }
1134 
1135 static void
1136 mmx_combine_in_ca (pixman_implementation_t *imp,
1137                    pixman_op_t              op,
1138                    uint32_t *               dest,
1139                    const uint32_t *         src,
1140                    const uint32_t *         mask,
1141                    int                      width)
1142 {
1143     const uint32_t *end = src + width;
1144 
1145     while (src < end)
1146     {
1147 	__m64 a = load8888 (mask);
1148 	__m64 s = load8888 (src);
1149 	__m64 d = load8888 (dest);
1150 	__m64 da = expand_alpha (d);
1151 
1152 	s = pix_multiply (s, a);
1153 	s = pix_multiply (s, da);
1154 	store8888 (dest, s);
1155 
1156 	++src;
1157 	++dest;
1158 	++mask;
1159     }
1160     _mm_empty ();
1161 }
1162 
1163 static void
1164 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1165                            pixman_op_t              op,
1166                            uint32_t *               dest,
1167                            const uint32_t *         src,
1168                            const uint32_t *         mask,
1169                            int                      width)
1170 {
1171     const uint32_t *end = src + width;
1172 
1173     while (src < end)
1174     {
1175 	__m64 a = load8888 (mask);
1176 	__m64 s = load8888 (src);
1177 	__m64 d = load8888 (dest);
1178 	__m64 sa = expand_alpha (s);
1179 
1180 	a = pix_multiply (a, sa);
1181 	d = pix_multiply (d, a);
1182 	store8888 (dest, d);
1183 
1184 	++src;
1185 	++dest;
1186 	++mask;
1187     }
1188     _mm_empty ();
1189 }
1190 
1191 static void
1192 mmx_combine_out_ca (pixman_implementation_t *imp,
1193                     pixman_op_t              op,
1194                     uint32_t *               dest,
1195                     const uint32_t *         src,
1196                     const uint32_t *         mask,
1197                     int                      width)
1198 {
1199     const uint32_t *end = src + width;
1200 
1201     while (src < end)
1202     {
1203 	__m64 a = load8888 (mask);
1204 	__m64 s = load8888 (src);
1205 	__m64 d = load8888 (dest);
1206 	__m64 da = expand_alpha (d);
1207 
1208 	da = negate (da);
1209 	s = pix_multiply (s, a);
1210 	s = pix_multiply (s, da);
1211 	store8888 (dest, s);
1212 
1213 	++src;
1214 	++dest;
1215 	++mask;
1216     }
1217     _mm_empty ();
1218 }
1219 
1220 static void
1221 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1222                             pixman_op_t              op,
1223                             uint32_t *               dest,
1224                             const uint32_t *         src,
1225                             const uint32_t *         mask,
1226                             int                      width)
1227 {
1228     const uint32_t *end = src + width;
1229 
1230     while (src < end)
1231     {
1232 	__m64 a = load8888 (mask);
1233 	__m64 s = load8888 (src);
1234 	__m64 d = load8888 (dest);
1235 	__m64 sa = expand_alpha (s);
1236 
1237 	a = pix_multiply (a, sa);
1238 	a = negate (a);
1239 	d = pix_multiply (d, a);
1240 	store8888 (dest, d);
1241 
1242 	++src;
1243 	++dest;
1244 	++mask;
1245     }
1246     _mm_empty ();
1247 }
1248 
1249 static void
1250 mmx_combine_atop_ca (pixman_implementation_t *imp,
1251                      pixman_op_t              op,
1252                      uint32_t *               dest,
1253                      const uint32_t *         src,
1254                      const uint32_t *         mask,
1255                      int                      width)
1256 {
1257     const uint32_t *end = src + width;
1258 
1259     while (src < end)
1260     {
1261 	__m64 a = load8888 (mask);
1262 	__m64 s = load8888 (src);
1263 	__m64 d = load8888 (dest);
1264 	__m64 da = expand_alpha (d);
1265 	__m64 sa = expand_alpha (s);
1266 
1267 	s = pix_multiply (s, a);
1268 	a = pix_multiply (a, sa);
1269 	a = negate (a);
1270 	d = pix_add_mul (d, a, s, da);
1271 	store8888 (dest, d);
1272 
1273 	++src;
1274 	++dest;
1275 	++mask;
1276     }
1277     _mm_empty ();
1278 }
1279 
1280 static void
1281 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1282                              pixman_op_t              op,
1283                              uint32_t *               dest,
1284                              const uint32_t *         src,
1285                              const uint32_t *         mask,
1286                              int                      width)
1287 {
1288     const uint32_t *end = src + width;
1289 
1290     while (src < end)
1291     {
1292 	__m64 a = load8888 (mask);
1293 	__m64 s = load8888 (src);
1294 	__m64 d = load8888 (dest);
1295 	__m64 da = expand_alpha (d);
1296 	__m64 sa = expand_alpha (s);
1297 
1298 	s = pix_multiply (s, a);
1299 	a = pix_multiply (a, sa);
1300 	da = negate (da);
1301 	d = pix_add_mul (d, a, s, da);
1302 	store8888 (dest, d);
1303 
1304 	++src;
1305 	++dest;
1306 	++mask;
1307     }
1308     _mm_empty ();
1309 }
1310 
1311 static void
1312 mmx_combine_xor_ca (pixman_implementation_t *imp,
1313                     pixman_op_t              op,
1314                     uint32_t *               dest,
1315                     const uint32_t *         src,
1316                     const uint32_t *         mask,
1317                     int                      width)
1318 {
1319     const uint32_t *end = src + width;
1320 
1321     while (src < end)
1322     {
1323 	__m64 a = load8888 (mask);
1324 	__m64 s = load8888 (src);
1325 	__m64 d = load8888 (dest);
1326 	__m64 da = expand_alpha (d);
1327 	__m64 sa = expand_alpha (s);
1328 
1329 	s = pix_multiply (s, a);
1330 	a = pix_multiply (a, sa);
1331 	da = negate (da);
1332 	a = negate (a);
1333 	d = pix_add_mul (d, a, s, da);
1334 	store8888 (dest, d);
1335 
1336 	++src;
1337 	++dest;
1338 	++mask;
1339     }
1340     _mm_empty ();
1341 }
1342 
1343 static void
1344 mmx_combine_add_ca (pixman_implementation_t *imp,
1345                     pixman_op_t              op,
1346                     uint32_t *               dest,
1347                     const uint32_t *         src,
1348                     const uint32_t *         mask,
1349                     int                      width)
1350 {
1351     const uint32_t *end = src + width;
1352 
1353     while (src < end)
1354     {
1355 	__m64 a = load8888 (mask);
1356 	__m64 s = load8888 (src);
1357 	__m64 d = load8888 (dest);
1358 
1359 	s = pix_multiply (s, a);
1360 	d = pix_add (s, d);
1361 	store8888 (dest, d);
1362 
1363 	++src;
1364 	++dest;
1365 	++mask;
1366     }
1367     _mm_empty ();
1368 }
1369 
1370 /* ------------- MMX code paths called from fbpict.c -------------------- */
1371 
1372 static void
1373 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1374                            pixman_composite_info_t *info)
1375 {
1376     PIXMAN_COMPOSITE_ARGS (info);
1377     uint32_t src;
1378     uint32_t    *dst_line, *dst;
1379     int32_t w;
1380     int dst_stride;
1381     __m64 vsrc, vsrca;
1382 
1383     CHECKPOINT ();
1384 
1385     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1386 
1387     if (src == 0)
1388 	return;
1389 
1390     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1391 
1392     vsrc = load8888 (&src);
1393     vsrca = expand_alpha (vsrc);
1394 
1395     while (height--)
1396     {
1397 	dst = dst_line;
1398 	dst_line += dst_stride;
1399 	w = width;
1400 
1401 	CHECKPOINT ();
1402 
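	/* Composite leading pixels one at a time until dst is 8-byte aligned,
	 * then handle two pixels per 64-bit load/store. */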
1403 	while (w && (uintptr_t)dst & 7)
1404 	{
1405 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1406 
1407 	    w--;
1408 	    dst++;
1409 	}
1410 
1411 	while (w >= 2)
1412 	{
1413 	    __m64 vdest;
1414 	    __m64 dest0, dest1;
1415 
1416 	    vdest = *(__m64 *)dst;
1417 
1418 	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1419 	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1420 
1421 	    *(__m64 *)dst = pack8888 (dest0, dest1);
1422 
1423 	    dst += 2;
1424 	    w -= 2;
1425 	}
1426 
1427 	CHECKPOINT ();
1428 
1429 	if (w)
1430 	{
1431 	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1432 	}
1433     }
1434 
1435     _mm_empty ();
1436 }
1437 
1438 static void
1439 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1440                            pixman_composite_info_t *info)
1441 {
1442     PIXMAN_COMPOSITE_ARGS (info);
1443     uint32_t src;
1444     uint16_t    *dst_line, *dst;
1445     int32_t w;
1446     int dst_stride;
1447     __m64 vsrc, vsrca;
1448 
1449     CHECKPOINT ();
1450 
1451     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1452 
1453     if (src == 0)
1454 	return;
1455 
1456     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1457 
1458     vsrc = load8888 (&src);
1459     vsrca = expand_alpha (vsrc);
1460 
1461     while (height--)
1462     {
1463 	dst = dst_line;
1464 	dst_line += dst_stride;
1465 	w = width;
1466 
1467 	CHECKPOINT ();
1468 
1469 	while (w && (uintptr_t)dst & 7)
1470 	{
1471 	    uint64_t d = *dst;
1472 	    __m64 vdest = expand565 (to_m64 (d), 0);
1473 
1474 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1475 	    *dst = to_uint64 (vdest);
1476 
1477 	    w--;
1478 	    dst++;
1479 	}
1480 
1481 	while (w >= 4)
1482 	{
1483 	    __m64 vdest = *(__m64 *)dst;
1484 	    __m64 v0, v1, v2, v3;
1485 
1486 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1487 
1488 	    v0 = over (vsrc, vsrca, v0);
1489 	    v1 = over (vsrc, vsrca, v1);
1490 	    v2 = over (vsrc, vsrca, v2);
1491 	    v3 = over (vsrc, vsrca, v3);
1492 
1493 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1494 
1495 	    dst += 4;
1496 	    w -= 4;
1497 	}
1498 
1499 	CHECKPOINT ();
1500 
1501 	while (w)
1502 	{
1503 	    uint64_t d = *dst;
1504 	    __m64 vdest = expand565 (to_m64 (d), 0);
1505 
1506 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1507 	    *dst = to_uint64 (vdest);
1508 
1509 	    w--;
1510 	    dst++;
1511 	}
1512     }
1513 
1514     _mm_empty ();
1515 }
1516 
1517 static void
1518 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1519                                    pixman_composite_info_t *info)
1520 {
1521     PIXMAN_COMPOSITE_ARGS (info);
1522     uint32_t src;
1523     uint32_t    *dst_line;
1524     uint32_t    *mask_line;
1525     int dst_stride, mask_stride;
1526     __m64 vsrc, vsrca;
1527 
1528     CHECKPOINT ();
1529 
1530     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1531 
1532     if (src == 0)
1533 	return;
1534 
1535     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1536     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1537 
1538     vsrc = load8888 (&src);
1539     vsrca = expand_alpha (vsrc);
1540 
1541     while (height--)
1542     {
1543 	int twidth = width;
1544 	uint32_t *p = (uint32_t *)mask_line;
1545 	uint32_t *q = (uint32_t *)dst_line;
1546 
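	/* Component-alpha mask: each 32-bit mask pixel modulates the solid
	 * source per channel; spans where the mask is zero are skipped. */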
1547 	while (twidth && (uintptr_t)q & 7)
1548 	{
1549 	    uint32_t m = *(uint32_t *)p;
1550 
1551 	    if (m)
1552 	    {
1553 		__m64 vdest = load8888 (q);
1554 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1555 		store8888 (q, vdest);
1556 	    }
1557 
1558 	    twidth--;
1559 	    p++;
1560 	    q++;
1561 	}
1562 
1563 	while (twidth >= 2)
1564 	{
1565 	    uint32_t m0, m1;
1566 	    m0 = *p;
1567 	    m1 = *(p + 1);
1568 
1569 	    if (m0 | m1)
1570 	    {
1571 		__m64 dest0, dest1;
1572 		__m64 vdest = *(__m64 *)q;
1573 
1574 		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1575 		                 expand8888 (vdest, 0));
1576 		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1577 		                 expand8888 (vdest, 1));
1578 
1579 		*(__m64 *)q = pack8888 (dest0, dest1);
1580 	    }
1581 
1582 	    p += 2;
1583 	    q += 2;
1584 	    twidth -= 2;
1585 	}
1586 
1587 	if (twidth)
1588 	{
1589 	    uint32_t m = *(uint32_t *)p;
1590 
1591 	    if (m)
1592 	    {
1593 		__m64 vdest = load8888 (q);
1594 		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1595 		store8888 (q, vdest);
1596 	    }
1597 
1598 	    twidth--;
1599 	    p++;
1600 	    q++;
1601 	}
1602 
1603 	dst_line += dst_stride;
1604 	mask_line += mask_stride;
1605     }
1606 
1607     _mm_empty ();
1608 }
1609 
1610 static void
1611 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1612                                 pixman_composite_info_t *info)
1613 {
1614     PIXMAN_COMPOSITE_ARGS (info);
1615     uint32_t    *dst_line, *dst;
1616     uint32_t    *src_line, *src;
1617     uint32_t mask;
1618     __m64 vmask;
1619     int dst_stride, src_stride;
1620     int32_t w;
1621 
1622     CHECKPOINT ();
1623 
1624     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1625     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1626 
1627     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1628     vmask = expand_alpha (load8888 (&mask));
1629 
1630     while (height--)
1631     {
1632 	dst = dst_line;
1633 	dst_line += dst_stride;
1634 	src = src_line;
1635 	src_line += src_stride;
1636 	w = width;
1637 
1638 	while (w && (uintptr_t)dst & 7)
1639 	{
1640 	    __m64 s = load8888 (src);
1641 	    __m64 d = load8888 (dst);
1642 
1643 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1644 
1645 	    w--;
1646 	    dst++;
1647 	    src++;
1648 	}
1649 
1650 	while (w >= 2)
1651 	{
1652 	    __m64 vs = ldq_u ((__m64 *)src);
1653 	    __m64 vd = *(__m64 *)dst;
1654 	    __m64 vsrc0 = expand8888 (vs, 0);
1655 	    __m64 vsrc1 = expand8888 (vs, 1);
1656 
1657 	    *(__m64 *)dst = pack8888 (
1658 	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1659 	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1660 
1661 	    w -= 2;
1662 	    dst += 2;
1663 	    src += 2;
1664 	}
1665 
1666 	if (w)
1667 	{
1668 	    __m64 s = load8888 (src);
1669 	    __m64 d = load8888 (dst);
1670 
1671 	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1672 	}
1673     }
1674 
1675     _mm_empty ();
1676 }
1677 
1678 static void
1679 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1680                                 pixman_composite_info_t *info)
1681 {
1682     PIXMAN_COMPOSITE_ARGS (info);
1683     uint32_t *dst_line, *dst;
1684     uint32_t *src_line, *src;
1685     uint32_t mask;
1686     __m64 vmask;
1687     int dst_stride, src_stride;
1688     int32_t w;
1689     __m64 srca;
1690 
1691     CHECKPOINT ();
1692 
1693     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1694     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1695     mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1696 
1697     vmask = expand_alpha (load8888 (&mask));
1698     srca = MC (4x00ff);
1699 
1700     while (height--)
1701     {
1702 	dst = dst_line;
1703 	dst_line += dst_stride;
1704 	src = src_line;
1705 	src_line += src_stride;
1706 	w = width;
1707 
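	/* x8r8g8b8 sources carry no alpha, so each pixel is OR'd with
	 * 0xff000000 (or expanded via expandx888) to treat it as opaque. */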
1708 	while (w && (uintptr_t)dst & 7)
1709 	{
1710 	    uint32_t ssrc = *src | 0xff000000;
1711 	    __m64 s = load8888 (&ssrc);
1712 	    __m64 d = load8888 (dst);
1713 
1714 	    store8888 (dst, in_over (s, srca, vmask, d));
1715 
1716 	    w--;
1717 	    dst++;
1718 	    src++;
1719 	}
1720 
1721 	while (w >= 16)
1722 	{
1723 	    __m64 vd0 = *(__m64 *)(dst + 0);
1724 	    __m64 vd1 = *(__m64 *)(dst + 2);
1725 	    __m64 vd2 = *(__m64 *)(dst + 4);
1726 	    __m64 vd3 = *(__m64 *)(dst + 6);
1727 	    __m64 vd4 = *(__m64 *)(dst + 8);
1728 	    __m64 vd5 = *(__m64 *)(dst + 10);
1729 	    __m64 vd6 = *(__m64 *)(dst + 12);
1730 	    __m64 vd7 = *(__m64 *)(dst + 14);
1731 
1732 	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1733 	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1734 	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1735 	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1736 	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1737 	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1738 	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1739 	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1740 
1741 	    vd0 = pack8888 (
1742 	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1743 	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1744 
1745 	    vd1 = pack8888 (
1746 	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1747 	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1748 
1749 	    vd2 = pack8888 (
1750 	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1751 	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1752 
1753 	    vd3 = pack8888 (
1754 	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1755 	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1756 
1757 	    vd4 = pack8888 (
1758 	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1759 	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1760 
1761 	    vd5 = pack8888 (
1762 	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1763 	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1764 
1765 	    vd6 = pack8888 (
1766 	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1767 	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1768 
1769 	    vd7 = pack8888 (
1770 	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1771 	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1772 
1773 	    *(__m64 *)(dst + 0) = vd0;
1774 	    *(__m64 *)(dst + 2) = vd1;
1775 	    *(__m64 *)(dst + 4) = vd2;
1776 	    *(__m64 *)(dst + 6) = vd3;
1777 	    *(__m64 *)(dst + 8) = vd4;
1778 	    *(__m64 *)(dst + 10) = vd5;
1779 	    *(__m64 *)(dst + 12) = vd6;
1780 	    *(__m64 *)(dst + 14) = vd7;
1781 
1782 	    w -= 16;
1783 	    dst += 16;
1784 	    src += 16;
1785 	}
1786 
1787 	while (w)
1788 	{
1789 	    uint32_t ssrc = *src | 0xff000000;
1790 	    __m64 s = load8888 (&ssrc);
1791 	    __m64 d = load8888 (dst);
1792 
1793 	    store8888 (dst, in_over (s, srca, vmask, d));
1794 
1795 	    w--;
1796 	    dst++;
1797 	    src++;
1798 	}
1799     }
1800 
1801     _mm_empty ();
1802 }
1803 
1804 static void
1805 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1806                               pixman_composite_info_t *info)
1807 {
1808     PIXMAN_COMPOSITE_ARGS (info);
1809     uint32_t *dst_line, *dst;
1810     uint32_t *src_line, *src;
1811     uint32_t s;
1812     int dst_stride, src_stride;
1813     uint8_t a;
1814     int32_t w;
1815 
1816     CHECKPOINT ();
1817 
1818     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1819     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1820 
1821     while (height--)
1822     {
1823 	dst = dst_line;
1824 	dst_line += dst_stride;
1825 	src = src_line;
1826 	src_line += src_stride;
1827 	w = width;
1828 
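	/* Per-pixel loop: fully opaque source pixels are copied directly,
	 * fully transparent ones are skipped, everything else gets OVER. */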
1829 	while (w--)
1830 	{
1831 	    s = *src++;
1832 	    a = s >> 24;
1833 
1834 	    if (a == 0xff)
1835 	    {
1836 		*dst = s;
1837 	    }
1838 	    else if (s)
1839 	    {
1840 		__m64 ms, sa;
1841 		ms = load8888 (&s);
1842 		sa = expand_alpha (ms);
1843 		store8888 (dst, over (ms, sa, load8888 (dst)));
1844 	    }
1845 
1846 	    dst++;
1847 	}
1848     }
1849     _mm_empty ();
1850 }
1851 
1852 static void
1853 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1854                               pixman_composite_info_t *info)
1855 {
1856     PIXMAN_COMPOSITE_ARGS (info);
1857     uint16_t    *dst_line, *dst;
1858     uint32_t    *src_line, *src;
1859     int dst_stride, src_stride;
1860     int32_t w;
1861 
1862     CHECKPOINT ();
1863 
1864     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1865     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1866 
1867 #if 0
1868     /* FIXME */
1869     assert (src_image->drawable == mask_image->drawable);
1870 #endif
1871 
1872     while (height--)
1873     {
1874 	dst = dst_line;
1875 	dst_line += dst_stride;
1876 	src = src_line;
1877 	src_line += src_stride;
1878 	w = width;
1879 
1880 	CHECKPOINT ();
1881 
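	/* Head loop: composite single pixels until dst is 8-byte aligned,
	 * expanding each r5g6b5 destination pixel to 8888, applying over(),
	 * and packing the result back down to 565. */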
1882 	while (w && (uintptr_t)dst & 7)
1883 	{
1884 	    __m64 vsrc = load8888 (src);
1885 	    uint64_t d = *dst;
1886 	    __m64 vdest = expand565 (to_m64 (d), 0);
1887 
1888 	    vdest = pack_565 (
1889 		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1890 
1891 	    *dst = to_uint64 (vdest);
1892 
1893 	    w--;
1894 	    dst++;
1895 	    src++;
1896 	}
1897 
1898 	CHECKPOINT ();
1899 
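	/* Main loop: four pixels per iteration.  expand_4x565() unpacks a
	 * whole quadword of 565 destination pixels, each a8r8g8b8 source
	 * pixel is blended with over(), and pack_4x565() writes the result
	 * back with a single aligned 8-byte store. */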
1900 	while (w >= 4)
1901 	{
1902 	    __m64 vdest = *(__m64 *)dst;
1903 	    __m64 v0, v1, v2, v3;
1904 	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1905 
1906 	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1907 
1908 	    vsrc0 = load8888 ((src + 0));
1909 	    vsrc1 = load8888 ((src + 1));
1910 	    vsrc2 = load8888 ((src + 2));
1911 	    vsrc3 = load8888 ((src + 3));
1912 
1913 	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1914 	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1915 	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1916 	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1917 
1918 	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1919 
1920 	    w -= 4;
1921 	    dst += 4;
1922 	    src += 4;
1923 	}
1924 
1925 	CHECKPOINT ();
1926 
1927 	while (w)
1928 	{
1929 	    __m64 vsrc = load8888 (src);
1930 	    uint64_t d = *dst;
1931 	    __m64 vdest = expand565 (to_m64 (d), 0);
1932 
1933 	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1934 
1935 	    *dst = to_uint64 (vdest);
1936 
1937 	    w--;
1938 	    dst++;
1939 	    src++;
1940 	}
1941     }
1942 
1943     _mm_empty ();
1944 }
1945 
1946 static void
1947 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1948                              pixman_composite_info_t *info)
1949 {
1950     PIXMAN_COMPOSITE_ARGS (info);
1951     uint32_t src, srca;
1952     uint32_t *dst_line, *dst;
1953     uint8_t *mask_line, *mask;
1954     int dst_stride, mask_stride;
1955     int32_t w;
1956     __m64 vsrc, vsrca;
1957     uint64_t srcsrc;
1958 
1959     CHECKPOINT ();
1960 
1961     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1962 
1963     srca = src >> 24;
1964     if (src == 0)
1965 	return;
1966 
1967     srcsrc = (uint64_t)src << 32 | src;
1968 
1969     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1970     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1971 
1972     vsrc = load8888 (&src);
1973     vsrca = expand_alpha (vsrc);
1974 
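    /* Solid source OVER 8888 destination with an a8 mask.  The expanded
     * source and its alpha are hoisted out of the loops; srcsrc holds two
     * copies of the solid pixel so that pairs where the source is opaque
     * and both mask bytes are 0xff can be written as one 64-bit store. */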
1975     while (height--)
1976     {
1977 	dst = dst_line;
1978 	dst_line += dst_stride;
1979 	mask = mask_line;
1980 	mask_line += mask_stride;
1981 	w = width;
1982 
1983 	CHECKPOINT ();
1984 
1985 	while (w && (uintptr_t)dst & 7)
1986 	{
1987 	    uint64_t m = *mask;
1988 
1989 	    if (m)
1990 	    {
1991 		__m64 vdest = in_over (vsrc, vsrca,
1992 				       expand_alpha_rev (to_m64 (m)),
1993 				       load8888 (dst));
1994 
1995 		store8888 (dst, vdest);
1996 	    }
1997 
1998 	    w--;
1999 	    mask++;
2000 	    dst++;
2001 	}
2002 
2003 	CHECKPOINT ();
2004 
2005 	while (w >= 2)
2006 	{
2007 	    uint64_t m0, m1;
2008 
2009 	    m0 = *mask;
2010 	    m1 = *(mask + 1);
2011 
2012 	    if (srca == 0xff && (m0 & m1) == 0xff)
2013 	    {
2014 		*(uint64_t *)dst = srcsrc;
2015 	    }
2016 	    else if (m0 | m1)
2017 	    {
2018 		__m64 vdest;
2019 		__m64 dest0, dest1;
2020 
2021 		vdest = *(__m64 *)dst;
2022 
2023 		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2024 				 expand8888 (vdest, 0));
2025 		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2026 				 expand8888 (vdest, 1));
2027 
2028 		*(__m64 *)dst = pack8888 (dest0, dest1);
2029 	    }
2030 
2031 	    mask += 2;
2032 	    dst += 2;
2033 	    w -= 2;
2034 	}
2035 
2036 	CHECKPOINT ();
2037 
2038 	if (w)
2039 	{
2040 	    uint64_t m = *mask;
2041 
2042 	    if (m)
2043 	    {
2044 		__m64 vdest = load8888 (dst);
2045 
2046 		vdest = in_over (
2047 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2048 		store8888 (dst, vdest);
2049 	    }
2050 	}
2051     }
2052 
2053     _mm_empty ();
2054 }
2055 
2056 static pixman_bool_t
2057 mmx_fill (pixman_implementation_t *imp,
2058           uint32_t *               bits,
2059           int                      stride,
2060           int                      bpp,
2061           int                      x,
2062           int                      y,
2063           int                      width,
2064           int                      height,
2065           uint32_t		   filler)
2066 {
2067     uint64_t fill;
2068     __m64 vfill;
2069     uint32_t byte_width;
2070     uint8_t     *byte_line;
2071 
2072 #if defined __GNUC__ && defined USE_X86_MMX
2073     __m64 v1, v2, v3, v4, v5, v6, v7;
2074 #endif
2075 
2076     if (bpp != 16 && bpp != 32 && bpp != 8)
2077 	return FALSE;
2078 
2079     if (bpp == 8)
2080     {
2081 	stride = stride * (int) sizeof (uint32_t) / 1;
2082 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2083 	byte_width = width;
2084 	stride *= 1;
2085         filler = (filler & 0xff) * 0x01010101;
2086     }
2087     else if (bpp == 16)
2088     {
2089 	stride = stride * (int) sizeof (uint32_t) / 2;
2090 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2091 	byte_width = 2 * width;
2092 	stride *= 2;
2093         filler = (filler & 0xffff) * 0x00010001;
2094     }
2095     else
2096     {
2097 	stride = stride * (int) sizeof (uint32_t) / 4;
2098 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2099 	byte_width = 4 * width;
2100 	stride *= 4;
2101     }
2102 
2103     fill = ((uint64_t)filler << 32) | filler;
2104     vfill = to_m64 (fill);
2105 
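    /* With GCC on x86 MMX, replicate the fill pattern into seven more MMX
     * registers so the 64-byte inner loop below can issue eight movq stores
     * without reloading the pattern from memory. */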
2106 #if defined __GNUC__ && defined USE_X86_MMX
2107     __asm__ (
2108         "movq		%7,	%0\n"
2109         "movq		%7,	%1\n"
2110         "movq		%7,	%2\n"
2111         "movq		%7,	%3\n"
2112         "movq		%7,	%4\n"
2113         "movq		%7,	%5\n"
2114         "movq		%7,	%6\n"
2115 	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
2116 	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2117 	: "y" (vfill));
2118 #endif
2119 
2120     while (height--)
2121     {
2122 	int w;
2123 	uint8_t *d = byte_line;
2124 
2125 	byte_line += stride;
2126 	w = byte_width;
2127 
2128 	if (w >= 1 && ((uintptr_t)d & 1))
2129 	{
2130 	    *(uint8_t *)d = (filler & 0xff);
2131 	    w--;
2132 	    d++;
2133 	}
2134 
2135 	if (w >= 2 && ((uintptr_t)d & 3))
2136 	{
2137 	    *(uint16_t *)d = filler;
2138 	    w -= 2;
2139 	    d += 2;
2140 	}
2141 
2142 	while (w >= 4 && ((uintptr_t)d & 7))
2143 	{
2144 	    *(uint32_t *)d = filler;
2145 
2146 	    w -= 4;
2147 	    d += 4;
2148 	}
2149 
2150 	while (w >= 64)
2151 	{
2152 #if defined __GNUC__ && defined USE_X86_MMX
2153 	    __asm__ (
2154 	        "movq	%1,	  (%0)\n"
2155 	        "movq	%2,	 8(%0)\n"
2156 	        "movq	%3,	16(%0)\n"
2157 	        "movq	%4,	24(%0)\n"
2158 	        "movq	%5,	32(%0)\n"
2159 	        "movq	%6,	40(%0)\n"
2160 	        "movq	%7,	48(%0)\n"
2161 	        "movq	%8,	56(%0)\n"
2162 		:
2163 		: "r" (d),
2164 		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2165 		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2166 		: "memory");
2167 #else
2168 	    *(__m64*) (d +  0) = vfill;
2169 	    *(__m64*) (d +  8) = vfill;
2170 	    *(__m64*) (d + 16) = vfill;
2171 	    *(__m64*) (d + 24) = vfill;
2172 	    *(__m64*) (d + 32) = vfill;
2173 	    *(__m64*) (d + 40) = vfill;
2174 	    *(__m64*) (d + 48) = vfill;
2175 	    *(__m64*) (d + 56) = vfill;
2176 #endif
2177 	    w -= 64;
2178 	    d += 64;
2179 	}
2180 
2181 	while (w >= 4)
2182 	{
2183 	    *(uint32_t *)d = filler;
2184 
2185 	    w -= 4;
2186 	    d += 4;
2187 	}
2188 	if (w >= 2)
2189 	{
2190 	    *(uint16_t *)d = filler;
2191 	    w -= 2;
2192 	    d += 2;
2193 	}
2194 	if (w >= 1)
2195 	{
2196 	    *(uint8_t *)d = (filler & 0xff);
2197 	    w--;
2198 	    d++;
2199 	}
2200 
2201     }
2202 
2203     _mm_empty ();
2204     return TRUE;
2205 }
2206 
2207 static void
2208 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2209                              pixman_composite_info_t *info)
2210 {
2211     PIXMAN_COMPOSITE_ARGS (info);
2212     uint16_t    *dst_line, *dst;
2213     uint32_t    *src_line, *src, s;
2214     int dst_stride, src_stride;
2215     int32_t w;
2216 
2217     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2218     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2219 
2220     while (height--)
2221     {
2222 	dst = dst_line;
2223 	dst_line += dst_stride;
2224 	src = src_line;
2225 	src_line += src_stride;
2226 	w = width;
2227 
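	/* Convert x8r8g8b8 to r5g6b5: scalar conversions until dst is 8-byte
	 * aligned, then four pixels (two source quadwords) at a time via
	 * pack_4xpacked565(), with a scalar tail for the remainder. */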
2228 	while (w && (uintptr_t)dst & 7)
2229 	{
2230 	    s = *src++;
2231 	    *dst = convert_8888_to_0565 (s);
2232 	    dst++;
2233 	    w--;
2234 	}
2235 
2236 	while (w >= 4)
2237 	{
2238 	    __m64 vdest;
2239 	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2240 	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2241 
2242 	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
2243 
2244 	    *(__m64 *)dst = vdest;
2245 
2246 	    w -= 4;
2247 	    src += 4;
2248 	    dst += 4;
2249 	}
2250 
2251 	while (w)
2252 	{
2253 	    s = *src++;
2254 	    *dst = convert_8888_to_0565 (s);
2255 	    dst++;
2256 	    w--;
2257 	}
2258     }
2259 
2260     _mm_empty ();
2261 }
2262 
2263 static void
2264 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2265                             pixman_composite_info_t *info)
2266 {
2267     PIXMAN_COMPOSITE_ARGS (info);
2268     uint32_t src, srca;
2269     uint32_t    *dst_line, *dst;
2270     uint8_t     *mask_line, *mask;
2271     int dst_stride, mask_stride;
2272     int32_t w;
2273     __m64 vsrc;
2274     uint64_t srcsrc;
2275 
2276     CHECKPOINT ();
2277 
2278     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2279 
2280     srca = src >> 24;
2281     if (src == 0)
2282     {
2283 	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2284 		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
2285 		  dest_x, dest_y, width, height, 0);
2286 	return;
2287     }
2288 
2289     srcsrc = (uint64_t)src << 32 | src;
2290 
2291     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2292     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2293 
2294     vsrc = load8888 (&src);
2295 
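    /* SRC operator: the destination is replaced outright, so a zero mask
     * clears the pixel instead of leaving it alone.  An opaque source with
     * two 0xff mask bytes is again written as a single 64-bit store of
     * srcsrc; otherwise each pixel is the solid color scaled by the mask
     * via in(). */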
2296     while (height--)
2297     {
2298 	dst = dst_line;
2299 	dst_line += dst_stride;
2300 	mask = mask_line;
2301 	mask_line += mask_stride;
2302 	w = width;
2303 
2304 	CHECKPOINT ();
2305 
2306 	while (w && (uintptr_t)dst & 7)
2307 	{
2308 	    uint64_t m = *mask;
2309 
2310 	    if (m)
2311 	    {
2312 		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2313 
2314 		store8888 (dst, vdest);
2315 	    }
2316 	    else
2317 	    {
2318 		*dst = 0;
2319 	    }
2320 
2321 	    w--;
2322 	    mask++;
2323 	    dst++;
2324 	}
2325 
2326 	CHECKPOINT ();
2327 
2328 	while (w >= 2)
2329 	{
2330 	    uint64_t m0, m1;
2331 	    m0 = *mask;
2332 	    m1 = *(mask + 1);
2333 
2334 	    if (srca == 0xff && (m0 & m1) == 0xff)
2335 	    {
2336 		*(uint64_t *)dst = srcsrc;
2337 	    }
2338 	    else if (m0 | m1)
2339 	    {
2340 		__m64 dest0, dest1;
2341 
2342 		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2343 		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2344 
2345 		*(__m64 *)dst = pack8888 (dest0, dest1);
2346 	    }
2347 	    else
2348 	    {
2349 		*(uint64_t *)dst = 0;
2350 	    }
2351 
2352 	    mask += 2;
2353 	    dst += 2;
2354 	    w -= 2;
2355 	}
2356 
2357 	CHECKPOINT ();
2358 
2359 	if (w)
2360 	{
2361 	    uint64_t m = *mask;
2362 
2363 	    if (m)
2364 	    {
2365 		__m64 vdest = load8888 (dst);
2366 
2367 		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2368 		store8888 (dst, vdest);
2369 	    }
2370 	    else
2371 	    {
2372 		*dst = 0;
2373 	    }
2374 	}
2375     }
2376 
2377     _mm_empty ();
2378 }
2379 
2380 static void
2381 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2382                              pixman_composite_info_t *info)
2383 {
2384     PIXMAN_COMPOSITE_ARGS (info);
2385     uint32_t src, srca;
2386     uint16_t *dst_line, *dst;
2387     uint8_t *mask_line, *mask;
2388     int dst_stride, mask_stride;
2389     int32_t w;
2390     __m64 vsrc, vsrca, tmp;
2391     __m64 srcsrcsrcsrc;
2392 
2393     CHECKPOINT ();
2394 
2395     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2396 
2397     srca = src >> 24;
2398     if (src == 0)
2399 	return;
2400 
2401     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2402     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2403 
2404     vsrc = load8888 (&src);
2405     vsrca = expand_alpha (vsrc);
2406 
2407     tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2408     srcsrcsrcsrc = expand_alpha_rev (tmp);
2409 
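    /* Solid source OVER r5g6b5 with an a8 mask.  srcsrcsrcsrc caches the
     * solid color already packed to 565 and replicated into all four 16-bit
     * lanes, so runs where the source is opaque and all four mask bytes are
     * 0xff collapse to one aligned 8-byte store. */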
2410     while (height--)
2411     {
2412 	dst = dst_line;
2413 	dst_line += dst_stride;
2414 	mask = mask_line;
2415 	mask_line += mask_stride;
2416 	w = width;
2417 
2418 	CHECKPOINT ();
2419 
2420 	while (w && (uintptr_t)dst & 7)
2421 	{
2422 	    uint64_t m = *mask;
2423 
2424 	    if (m)
2425 	    {
2426 		uint64_t d = *dst;
2427 		__m64 vd = to_m64 (d);
2428 		__m64 vdest = in_over (
2429 		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2430 
2431 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2432 		*dst = to_uint64 (vd);
2433 	    }
2434 
2435 	    w--;
2436 	    mask++;
2437 	    dst++;
2438 	}
2439 
2440 	CHECKPOINT ();
2441 
2442 	while (w >= 4)
2443 	{
2444 	    uint64_t m0, m1, m2, m3;
2445 	    m0 = *mask;
2446 	    m1 = *(mask + 1);
2447 	    m2 = *(mask + 2);
2448 	    m3 = *(mask + 3);
2449 
2450 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2451 	    {
2452 		*(__m64 *)dst = srcsrcsrcsrc;
2453 	    }
2454 	    else if (m0 | m1 | m2 | m3)
2455 	    {
2456 		__m64 vdest = *(__m64 *)dst;
2457 		__m64 v0, v1, v2, v3;
2458 		__m64 vm0, vm1, vm2, vm3;
2459 
2460 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2461 
2462 		vm0 = to_m64 (m0);
2463 		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2464 
2465 		vm1 = to_m64 (m1);
2466 		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2467 
2468 		vm2 = to_m64 (m2);
2469 		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2470 
2471 		vm3 = to_m64 (m3);
2472 		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2473 
2474 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2475 	    }
2476 
2477 	    w -= 4;
2478 	    mask += 4;
2479 	    dst += 4;
2480 	}
2481 
2482 	CHECKPOINT ();
2483 
2484 	while (w)
2485 	{
2486 	    uint64_t m = *mask;
2487 
2488 	    if (m)
2489 	    {
2490 		uint64_t d = *dst;
2491 		__m64 vd = to_m64 (d);
2492 		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2493 				       expand565 (vd, 0));
2494 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2495 		*dst = to_uint64 (vd);
2496 	    }
2497 
2498 	    w--;
2499 	    mask++;
2500 	    dst++;
2501 	}
2502     }
2503 
2504     _mm_empty ();
2505 }
2506 
2507 static void
2508 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2509                                 pixman_composite_info_t *info)
2510 {
2511     PIXMAN_COMPOSITE_ARGS (info);
2512     uint16_t    *dst_line, *dst;
2513     uint32_t    *src_line, *src;
2514     int dst_stride, src_stride;
2515     int32_t w;
2516 
2517     CHECKPOINT ();
2518 
2519     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2520     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2521 
2522 #if 0
2523     /* FIXME */
2524     assert (src_image->drawable == mask_image->drawable);
2525 #endif
2526 
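    /* Pixbuf sources are non-premultiplied and have red/blue swapped
     * relative to the destination, so blending goes through
     * over_rev_non_pre(); groups of four fully opaque source pixels only
     * need invert_colors() before being packed straight to 565. */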
2527     while (height--)
2528     {
2529 	dst = dst_line;
2530 	dst_line += dst_stride;
2531 	src = src_line;
2532 	src_line += src_stride;
2533 	w = width;
2534 
2535 	CHECKPOINT ();
2536 
2537 	while (w && (uintptr_t)dst & 7)
2538 	{
2539 	    __m64 vsrc = load8888 (src);
2540 	    uint64_t d = *dst;
2541 	    __m64 vdest = expand565 (to_m64 (d), 0);
2542 
2543 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2544 
2545 	    *dst = to_uint64 (vdest);
2546 
2547 	    w--;
2548 	    dst++;
2549 	    src++;
2550 	}
2551 
2552 	CHECKPOINT ();
2553 
2554 	while (w >= 4)
2555 	{
2556 	    uint32_t s0, s1, s2, s3;
2557 	    unsigned char a0, a1, a2, a3;
2558 
2559 	    s0 = *src;
2560 	    s1 = *(src + 1);
2561 	    s2 = *(src + 2);
2562 	    s3 = *(src + 3);
2563 
2564 	    a0 = (s0 >> 24);
2565 	    a1 = (s1 >> 24);
2566 	    a2 = (s2 >> 24);
2567 	    a3 = (s3 >> 24);
2568 
2569 	    if ((a0 & a1 & a2 & a3) == 0xFF)
2570 	    {
2571 		__m64 v0 = invert_colors (load8888 (&s0));
2572 		__m64 v1 = invert_colors (load8888 (&s1));
2573 		__m64 v2 = invert_colors (load8888 (&s2));
2574 		__m64 v3 = invert_colors (load8888 (&s3));
2575 
2576 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2577 	    }
2578 	    else if (s0 | s1 | s2 | s3)
2579 	    {
2580 		__m64 vdest = *(__m64 *)dst;
2581 		__m64 v0, v1, v2, v3;
2582 
2583 		__m64 vsrc0 = load8888 (&s0);
2584 		__m64 vsrc1 = load8888 (&s1);
2585 		__m64 vsrc2 = load8888 (&s2);
2586 		__m64 vsrc3 = load8888 (&s3);
2587 
2588 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2589 
2590 		v0 = over_rev_non_pre (vsrc0, v0);
2591 		v1 = over_rev_non_pre (vsrc1, v1);
2592 		v2 = over_rev_non_pre (vsrc2, v2);
2593 		v3 = over_rev_non_pre (vsrc3, v3);
2594 
2595 		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2596 	    }
2597 
2598 	    w -= 4;
2599 	    dst += 4;
2600 	    src += 4;
2601 	}
2602 
2603 	CHECKPOINT ();
2604 
2605 	while (w)
2606 	{
2607 	    __m64 vsrc = load8888 (src);
2608 	    uint64_t d = *dst;
2609 	    __m64 vdest = expand565 (to_m64 (d), 0);
2610 
2611 	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2612 
2613 	    *dst = to_uint64 (vdest);
2614 
2615 	    w--;
2616 	    dst++;
2617 	    src++;
2618 	}
2619     }
2620 
2621     _mm_empty ();
2622 }
2623 
2624 static void
2625 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2626                                 pixman_composite_info_t *info)
2627 {
2628     PIXMAN_COMPOSITE_ARGS (info);
2629     uint32_t    *dst_line, *dst;
2630     uint32_t    *src_line, *src;
2631     int dst_stride, src_stride;
2632     int32_t w;
2633 
2634     CHECKPOINT ();
2635 
2636     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2637     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2638 
2639 #if 0
2640     /* FIXME */
2641     assert (src_image->drawable == mask_image->drawable);
2642 #endif
2643 
2644     while (height--)
2645     {
2646 	dst = dst_line;
2647 	dst_line += dst_stride;
2648 	src = src_line;
2649 	src_line += src_stride;
2650 	w = width;
2651 
2652 	while (w && (uintptr_t)dst & 7)
2653 	{
2654 	    __m64 s = load8888 (src);
2655 	    __m64 d = load8888 (dst);
2656 
2657 	    store8888 (dst, over_rev_non_pre (s, d));
2658 
2659 	    w--;
2660 	    dst++;
2661 	    src++;
2662 	}
2663 
2664 	while (w >= 2)
2665 	{
2666 	    uint32_t s0, s1;
2667 	    unsigned char a0, a1;
2668 	    __m64 d0, d1;
2669 
2670 	    s0 = *src;
2671 	    s1 = *(src + 1);
2672 
2673 	    a0 = (s0 >> 24);
2674 	    a1 = (s1 >> 24);
2675 
2676 	    if ((a0 & a1) == 0xFF)
2677 	    {
2678 		d0 = invert_colors (load8888 (&s0));
2679 		d1 = invert_colors (load8888 (&s1));
2680 
2681 		*(__m64 *)dst = pack8888 (d0, d1);
2682 	    }
2683 	    else if (s0 | s1)
2684 	    {
2685 		__m64 vdest = *(__m64 *)dst;
2686 
2687 		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2688 		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2689 
2690 		*(__m64 *)dst = pack8888 (d0, d1);
2691 	    }
2692 
2693 	    w -= 2;
2694 	    dst += 2;
2695 	    src += 2;
2696 	}
2697 
2698 	if (w)
2699 	{
2700 	    __m64 s = load8888 (src);
2701 	    __m64 d = load8888 (dst);
2702 
2703 	    store8888 (dst, over_rev_non_pre (s, d));
2704 	}
2705     }
2706 
2707     _mm_empty ();
2708 }
2709 
2710 static void
2711 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2712                                    pixman_composite_info_t *info)
2713 {
2714     PIXMAN_COMPOSITE_ARGS (info);
2715     uint32_t src;
2716     uint16_t    *dst_line;
2717     uint32_t    *mask_line;
2718     int dst_stride, mask_stride;
2719     __m64 vsrc, vsrca;
2720 
2721     CHECKPOINT ();
2722 
2723     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2724 
2725     if (src == 0)
2726 	return;
2727 
2728     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2729     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2730 
2731     vsrc = load8888 (&src);
2732     vsrca = expand_alpha (vsrc);
2733 
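    /* Component-alpha variant: the mask is a full a8r8g8b8 value per pixel,
     * so in_over() scales each channel of the solid source by the matching
     * mask channel before blending into the 565 destination. */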
2734     while (height--)
2735     {
2736 	int twidth = width;
2737 	uint32_t *p = (uint32_t *)mask_line;
2738 	uint16_t *q = (uint16_t *)dst_line;
2739 
2740 	while (twidth && ((uintptr_t)q & 7))
2741 	{
2742 	    uint32_t m = *(uint32_t *)p;
2743 
2744 	    if (m)
2745 	    {
2746 		uint64_t d = *q;
2747 		__m64 vdest = expand565 (to_m64 (d), 0);
2748 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2749 		*q = to_uint64 (vdest);
2750 	    }
2751 
2752 	    twidth--;
2753 	    p++;
2754 	    q++;
2755 	}
2756 
2757 	while (twidth >= 4)
2758 	{
2759 	    uint32_t m0, m1, m2, m3;
2760 
2761 	    m0 = *p;
2762 	    m1 = *(p + 1);
2763 	    m2 = *(p + 2);
2764 	    m3 = *(p + 3);
2765 
2766 	    if ((m0 | m1 | m2 | m3))
2767 	    {
2768 		__m64 vdest = *(__m64 *)q;
2769 		__m64 v0, v1, v2, v3;
2770 
2771 		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2772 
2773 		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2774 		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2775 		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2776 		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2777 
2778 		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2779 	    }
2780 	    twidth -= 4;
2781 	    p += 4;
2782 	    q += 4;
2783 	}
2784 
2785 	while (twidth)
2786 	{
2787 	    uint32_t m;
2788 
2789 	    m = *(uint32_t *)p;
2790 	    if (m)
2791 	    {
2792 		uint64_t d = *q;
2793 		__m64 vdest = expand565 (to_m64 (d), 0);
2794 		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2795 		*q = to_uint64 (vdest);
2796 	    }
2797 
2798 	    twidth--;
2799 	    p++;
2800 	    q++;
2801 	}
2802 
2803 	mask_line += mask_stride;
2804 	dst_line += dst_stride;
2805     }
2806 
2807     _mm_empty ();
2808 }
2809 
2810 static void
2811 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2812                         pixman_composite_info_t *info)
2813 {
2814     PIXMAN_COMPOSITE_ARGS (info);
2815     uint8_t *dst_line, *dst;
2816     uint8_t *mask_line, *mask;
2817     int dst_stride, mask_stride;
2818     int32_t w;
2819     uint32_t src;
2820     uint8_t sa;
2821     __m64 vsrc, vsrca;
2822 
2823     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2824     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2825 
2826     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2827 
2828     sa = src >> 24;
2829 
2830     vsrc = load8888 (&src);
2831     vsrca = expand_alpha (vsrc);
2832 
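    /* IN on an a8 destination: dest = src.alpha * mask * dest.  The scalar
     * head and tail apply MUL_UN8 twice per byte; the vector loop handles
     * four mask/destination bytes at once with two in() calls. */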
2833     while (height--)
2834     {
2835 	dst = dst_line;
2836 	dst_line += dst_stride;
2837 	mask = mask_line;
2838 	mask_line += mask_stride;
2839 	w = width;
2840 
2841 	while (w && (uintptr_t)dst & 7)
2842 	{
2843 	    uint16_t tmp;
2844 	    uint8_t a;
2845 	    uint32_t m, d;
2846 
2847 	    a = *mask++;
2848 	    d = *dst;
2849 
2850 	    m = MUL_UN8 (sa, a, tmp);
2851 	    d = MUL_UN8 (m, d, tmp);
2852 
2853 	    *dst++ = d;
2854 	    w--;
2855 	}
2856 
2857 	while (w >= 4)
2858 	{
2859 	    __m64 vmask;
2860 	    __m64 vdest;
2861 
2862 	    vmask = load8888u ((uint32_t *)mask);
2863 	    vdest = load8888 ((uint32_t *)dst);
2864 
2865 	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2866 
2867 	    dst += 4;
2868 	    mask += 4;
2869 	    w -= 4;
2870 	}
2871 
2872 	while (w--)
2873 	{
2874 	    uint16_t tmp;
2875 	    uint8_t a;
2876 	    uint32_t m, d;
2877 
2878 	    a = *mask++;
2879 	    d = *dst;
2880 
2881 	    m = MUL_UN8 (sa, a, tmp);
2882 	    d = MUL_UN8 (m, d, tmp);
2883 
2884 	    *dst++ = d;
2885 	}
2886     }
2887 
2888     _mm_empty ();
2889 }
2890 
2891 static void
2892 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2893                       pixman_composite_info_t *info)
2894 {
2895     PIXMAN_COMPOSITE_ARGS (info);
2896     uint8_t     *dst_line, *dst;
2897     uint8_t     *src_line, *src;
2898     int src_stride, dst_stride;
2899     int32_t w;
2900 
2901     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2902     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2903 
2904     while (height--)
2905     {
2906 	dst = dst_line;
2907 	dst_line += dst_stride;
2908 	src = src_line;
2909 	src_line += src_stride;
2910 	w = width;
2911 
2912 	while (w && (uintptr_t)dst & 3)
2913 	{
2914 	    uint8_t s, d;
2915 	    uint16_t tmp;
2916 
2917 	    s = *src;
2918 	    d = *dst;
2919 
2920 	    *dst = MUL_UN8 (s, d, tmp);
2921 
2922 	    src++;
2923 	    dst++;
2924 	    w--;
2925 	}
2926 
2927 	while (w >= 4)
2928 	{
2929 	    uint32_t *s = (uint32_t *)src;
2930 	    uint32_t *d = (uint32_t *)dst;
2931 
2932 	    store8888 (d, in (load8888u (s), load8888 (d)));
2933 
2934 	    w -= 4;
2935 	    dst += 4;
2936 	    src += 4;
2937 	}
2938 
2939 	while (w--)
2940 	{
2941 	    uint8_t s, d;
2942 	    uint16_t tmp;
2943 
2944 	    s = *src;
2945 	    d = *dst;
2946 
2947 	    *dst = MUL_UN8 (s, d, tmp);
2948 
2949 	    src++;
2950 	    dst++;
2951 	}
2952     }
2953 
2954     _mm_empty ();
2955 }
2956 
2957 static void
2958 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2959 			 pixman_composite_info_t *info)
2960 {
2961     PIXMAN_COMPOSITE_ARGS (info);
2962     uint8_t     *dst_line, *dst;
2963     uint8_t     *mask_line, *mask;
2964     int dst_stride, mask_stride;
2965     int32_t w;
2966     uint32_t src;
2967     uint8_t sa;
2968     __m64 vsrc, vsrca;
2969 
2970     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2971     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2972 
2973     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2974 
2975     sa = src >> 24;
2976 
2977     if (src == 0)
2978 	return;
2979 
2980     vsrc = load8888 (&src);
2981     vsrca = expand_alpha (vsrc);
2982 
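    /* ADD with a solid source and an a8 mask onto an a8 destination: each
     * mask byte is scaled by the source alpha and then added to the
     * destination with unsigned saturation (_mm_adds_pu8 in the vector
     * loop, ADD_UN8 in the scalar head and tail). */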
2983     while (height--)
2984     {
2985 	dst = dst_line;
2986 	dst_line += dst_stride;
2987 	mask = mask_line;
2988 	mask_line += mask_stride;
2989 	w = width;
2990 
2991 	while (w && (uintptr_t)dst & 3)
2992 	{
2993 	    uint16_t tmp;
2994 	    uint16_t a;
2995 	    uint32_t m, d;
2996 	    uint32_t r;
2997 
2998 	    a = *mask++;
2999 	    d = *dst;
3000 
3001 	    m = MUL_UN8 (sa, a, tmp);
3002 	    r = ADD_UN8 (m, d, tmp);
3003 
3004 	    *dst++ = r;
3005 	    w--;
3006 	}
3007 
3008 	while (w >= 4)
3009 	{
3010 	    __m64 vmask;
3011 	    __m64 vdest;
3012 
3013 	    vmask = load8888u ((uint32_t *)mask);
3014 	    vdest = load8888 ((uint32_t *)dst);
3015 
3016 	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3017 
3018 	    dst += 4;
3019 	    mask += 4;
3020 	    w -= 4;
3021 	}
3022 
3023 	while (w--)
3024 	{
3025 	    uint16_t tmp;
3026 	    uint16_t a;
3027 	    uint32_t m, d;
3028 	    uint32_t r;
3029 
3030 	    a = *mask++;
3031 	    d = *dst;
3032 
3033 	    m = MUL_UN8 (sa, a, tmp);
3034 	    r = ADD_UN8 (m, d, tmp);
3035 
3036 	    *dst++ = r;
3037 	}
3038     }
3039 
3040     _mm_empty ();
3041 }
3042 
3043 static void
3044 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3045 		       pixman_composite_info_t *info)
3046 {
3047     PIXMAN_COMPOSITE_ARGS (info);
3048     uint8_t *dst_line, *dst;
3049     uint8_t *src_line, *src;
3050     int dst_stride, src_stride;
3051     int32_t w;
3052     uint8_t s, d;
3053     uint16_t t;
3054 
3055     CHECKPOINT ();
3056 
3057     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3058     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3059 
3060     while (height--)
3061     {
3062 	dst = dst_line;
3063 	dst_line += dst_stride;
3064 	src = src_line;
3065 	src_line += src_stride;
3066 	w = width;
3067 
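	/* The scalar head and tail saturate by hand: t is the 16-bit sum and
	 * (0 - (t >> 8)) is all ones exactly when the sum exceeds 0xff, so
	 * or-ing it in clamps the byte to 255.  The aligned middle loop adds
	 * eight bytes per iteration with _mm_adds_pu8. */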
3068 	while (w && (uintptr_t)dst & 7)
3069 	{
3070 	    s = *src;
3071 	    d = *dst;
3072 	    t = d + s;
3073 	    s = t | (0 - (t >> 8));
3074 	    *dst = s;
3075 
3076 	    dst++;
3077 	    src++;
3078 	    w--;
3079 	}
3080 
3081 	while (w >= 8)
3082 	{
3083 	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3084 	    dst += 8;
3085 	    src += 8;
3086 	    w -= 8;
3087 	}
3088 
3089 	while (w)
3090 	{
3091 	    s = *src;
3092 	    d = *dst;
3093 	    t = d + s;
3094 	    s = t | (0 - (t >> 8));
3095 	    *dst = s;
3096 
3097 	    dst++;
3098 	    src++;
3099 	    w--;
3100 	}
3101     }
3102 
3103     _mm_empty ();
3104 }
3105 
3106 static void
3107 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3108                              pixman_composite_info_t *info)
3109 {
3110     PIXMAN_COMPOSITE_ARGS (info);
3111     uint16_t    *dst_line, *dst;
3112     uint32_t	d;
3113     uint16_t    *src_line, *src;
3114     uint32_t	s;
3115     int dst_stride, src_stride;
3116     int32_t w;
3117 
3118     CHECKPOINT ();
3119 
3120     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3121     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3122 
3123     while (height--)
3124     {
3125 	dst = dst_line;
3126 	dst_line += dst_stride;
3127 	src = src_line;
3128 	src_line += src_stride;
3129 	w = width;
3130 
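	/* Saturating ADD on r5g6b5: both operands are widened to 8888 (four
	 * pixels at a time via expand_4xpacked565 in the vector loop), added
	 * with per-channel saturation, and packed back down to 565. */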
3131 	while (w && (uintptr_t)dst & 7)
3132 	{
3133 	    s = *src++;
3134 	    if (s)
3135 	    {
3136 		d = *dst;
3137 		s = convert_0565_to_8888 (s);
3138 		if (d)
3139 		{
3140 		    d = convert_0565_to_8888 (d);
3141 		    UN8x4_ADD_UN8x4 (s, d);
3142 		}
3143 		*dst = convert_8888_to_0565 (s);
3144 	    }
3145 	    dst++;
3146 	    w--;
3147 	}
3148 
3149 	while (w >= 4)
3150 	{
3151 	    __m64 vdest = *(__m64 *)dst;
3152 	    __m64 vsrc = ldq_u ((__m64 *)src);
3153 	    __m64 vd0, vd1;
3154 	    __m64 vs0, vs1;
3155 
3156 	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3157 	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3158 
3159 	    vd0 = _mm_adds_pu8 (vd0, vs0);
3160 	    vd1 = _mm_adds_pu8 (vd1, vs1);
3161 
3162 	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3163 
3164 	    dst += 4;
3165 	    src += 4;
3166 	    w -= 4;
3167 	}
3168 
3169 	while (w--)
3170 	{
3171 	    s = *src++;
3172 	    if (s)
3173 	    {
3174 		d = *dst;
3175 		s = convert_0565_to_8888 (s);
3176 		if (d)
3177 		{
3178 		    d = convert_0565_to_8888 (d);
3179 		    UN8x4_ADD_UN8x4 (s, d);
3180 		}
3181 		*dst = convert_8888_to_0565 (s);
3182 	    }
3183 	    dst++;
3184 	}
3185     }
3186 
3187     _mm_empty ();
3188 }
3189 
3190 static void
3191 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3192                              pixman_composite_info_t *info)
3193 {
3194     PIXMAN_COMPOSITE_ARGS (info);
3195     uint32_t    *dst_line, *dst;
3196     uint32_t    *src_line, *src;
3197     int dst_stride, src_stride;
3198     int32_t w;
3199 
3200     CHECKPOINT ();
3201 
3202     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3203     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3204 
3205     while (height--)
3206     {
3207 	dst = dst_line;
3208 	dst_line += dst_stride;
3209 	src = src_line;
3210 	src_line += src_stride;
3211 	w = width;
3212 
3213 	while (w && (uintptr_t)dst & 7)
3214 	{
3215 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3216 	                              load ((const uint32_t *)dst)));
3217 	    dst++;
3218 	    src++;
3219 	    w--;
3220 	}
3221 
3222 	while (w >= 2)
3223 	{
3224 	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3225 	    dst += 2;
3226 	    src += 2;
3227 	    w -= 2;
3228 	}
3229 
3230 	if (w)
3231 	{
3232 	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3233 	                              load ((const uint32_t *)dst)));
3234 
3235 	}
3236     }
3237 
3238     _mm_empty ();
3239 }
3240 
3241 static pixman_bool_t
3242 mmx_blt (pixman_implementation_t *imp,
3243          uint32_t *               src_bits,
3244          uint32_t *               dst_bits,
3245          int                      src_stride,
3246          int                      dst_stride,
3247          int                      src_bpp,
3248          int                      dst_bpp,
3249          int                      src_x,
3250          int                      src_y,
3251          int                      dest_x,
3252          int                      dest_y,
3253          int                      width,
3254          int                      height)
3255 {
3256     uint8_t *   src_bytes;
3257     uint8_t *   dst_bytes;
3258     int byte_width;
3259 
3260     if (src_bpp != dst_bpp)
3261 	return FALSE;
3262 
3263     if (src_bpp == 16)
3264     {
3265 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3266 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3267 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3268 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3269 	byte_width = 2 * width;
3270 	src_stride *= 2;
3271 	dst_stride *= 2;
3272     }
3273     else if (src_bpp == 32)
3274     {
3275 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3276 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3277 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3278 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3279 	byte_width = 4 * width;
3280 	src_stride *= 4;
3281 	dst_stride *= 4;
3282     }
3283     else
3284     {
3285 	return FALSE;
3286     }
3287 
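    /* Straight copy, one scanline at a time: align the destination to an
     * 8-byte boundary with byte/word/dword stores, move 64-byte blocks
     * through the eight MMX registers (or via ldq_u loads when the inline
     * assembly is unavailable), then mop up the remainder. */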
3288     while (height--)
3289     {
3290 	int w;
3291 	uint8_t *s = src_bytes;
3292 	uint8_t *d = dst_bytes;
3293 	src_bytes += src_stride;
3294 	dst_bytes += dst_stride;
3295 	w = byte_width;
3296 
3297 	if (w >= 1 && ((uintptr_t)d & 1))
3298 	{
3299 	    *(uint8_t *)d = *(uint8_t *)s;
3300 	    w -= 1;
3301 	    s += 1;
3302 	    d += 1;
3303 	}
3304 
3305 	if (w >= 2 && ((uintptr_t)d & 3))
3306 	{
3307 	    *(uint16_t *)d = *(uint16_t *)s;
3308 	    w -= 2;
3309 	    s += 2;
3310 	    d += 2;
3311 	}
3312 
3313 	while (w >= 4 && ((uintptr_t)d & 7))
3314 	{
3315 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3316 
3317 	    w -= 4;
3318 	    s += 4;
3319 	    d += 4;
3320 	}
3321 
3322 	while (w >= 64)
3323 	{
3324 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3325 	    __asm__ (
3326 	        "movq	  (%1),	  %%mm0\n"
3327 	        "movq	 8(%1),	  %%mm1\n"
3328 	        "movq	16(%1),	  %%mm2\n"
3329 	        "movq	24(%1),	  %%mm3\n"
3330 	        "movq	32(%1),	  %%mm4\n"
3331 	        "movq	40(%1),	  %%mm5\n"
3332 	        "movq	48(%1),	  %%mm6\n"
3333 	        "movq	56(%1),	  %%mm7\n"
3334 
3335 	        "movq	%%mm0,	  (%0)\n"
3336 	        "movq	%%mm1,	 8(%0)\n"
3337 	        "movq	%%mm2,	16(%0)\n"
3338 	        "movq	%%mm3,	24(%0)\n"
3339 	        "movq	%%mm4,	32(%0)\n"
3340 	        "movq	%%mm5,	40(%0)\n"
3341 	        "movq	%%mm6,	48(%0)\n"
3342 	        "movq	%%mm7,	56(%0)\n"
3343 		:
3344 		: "r" (d), "r" (s)
3345 		: "memory",
3346 		  "%mm0", "%mm1", "%mm2", "%mm3",
3347 		  "%mm4", "%mm5", "%mm6", "%mm7");
3348 #else
3349 	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
3350 	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
3351 	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
3352 	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
3353 	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
3354 	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
3355 	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
3356 	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
3357 	    *(__m64 *)(d + 0)  = v0;
3358 	    *(__m64 *)(d + 8)  = v1;
3359 	    *(__m64 *)(d + 16) = v2;
3360 	    *(__m64 *)(d + 24) = v3;
3361 	    *(__m64 *)(d + 32) = v4;
3362 	    *(__m64 *)(d + 40) = v5;
3363 	    *(__m64 *)(d + 48) = v6;
3364 	    *(__m64 *)(d + 56) = v7;
3365 #endif
3366 
3367 	    w -= 64;
3368 	    s += 64;
3369 	    d += 64;
3370 	}
3371 	while (w >= 4)
3372 	{
3373 	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
3374 
3375 	    w -= 4;
3376 	    s += 4;
3377 	    d += 4;
3378 	}
3379 	if (w >= 2)
3380 	{
3381 	    *(uint16_t *)d = *(uint16_t *)s;
3382 	    w -= 2;
3383 	    s += 2;
3384 	    d += 2;
3385 	}
3386     }
3387 
3388     _mm_empty ();
3389 
3390     return TRUE;
3391 }
3392 
3393 static void
3394 mmx_composite_copy_area (pixman_implementation_t *imp,
3395                          pixman_composite_info_t *info)
3396 {
3397     PIXMAN_COMPOSITE_ARGS (info);
3398 
3399     mmx_blt (imp, src_image->bits.bits,
3400 	     dest_image->bits.bits,
3401 	     src_image->bits.rowstride,
3402 	     dest_image->bits.rowstride,
3403 	     PIXMAN_FORMAT_BPP (src_image->bits.format),
3404 	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
3405 	     src_x, src_y, dest_x, dest_y, width, height);
3406 }
3407 
3408 static void
3409 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3410                                 pixman_composite_info_t *info)
3411 {
3412     PIXMAN_COMPOSITE_ARGS (info);
3413     uint32_t  *src, *src_line;
3414     uint32_t  *dst, *dst_line;
3415     uint8_t  *mask, *mask_line;
3416     int src_stride, mask_stride, dst_stride;
3417     int32_t w;
3418 
3419     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3420     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3421     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3422 
3423     while (height--)
3424     {
3425 	src = src_line;
3426 	src_line += src_stride;
3427 	dst = dst_line;
3428 	dst_line += dst_stride;
3429 	mask = mask_line;
3430 	mask_line += mask_stride;
3431 
3432 	w = width;
3433 
3434 	while (w--)
3435 	{
3436 	    uint64_t m = *mask;
3437 
3438 	    if (m)
3439 	    {
3440 		uint32_t ssrc = *src | 0xff000000;
3441 		__m64 s = load8888 (&ssrc);
3442 
3443 		if (m == 0xff)
3444 		{
3445 		    store8888 (dst, s);
3446 		}
3447 		else
3448 		{
3449 		    __m64 sa = expand_alpha (s);
3450 		    __m64 vm = expand_alpha_rev (to_m64 (m));
3451 		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3452 
3453 		    store8888 (dst, vdest);
3454 		}
3455 	    }
3456 
3457 	    mask++;
3458 	    dst++;
3459 	    src++;
3460 	}
3461     }
3462 
3463     _mm_empty ();
3464 }
3465 
3466 static void
3467 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3468                                    pixman_composite_info_t *info)
3469 {
3470     PIXMAN_COMPOSITE_ARGS (info);
3471     uint32_t src;
3472     uint32_t    *dst_line, *dst;
3473     int32_t w;
3474     int dst_stride;
3475     __m64 vsrc;
3476 
3477     CHECKPOINT ();
3478 
3479     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3480 
3481     if (src == 0)
3482 	return;
3483 
3484     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3485 
3486     vsrc = load8888 (&src);
3487 
3488     while (height--)
3489     {
3490 	dst = dst_line;
3491 	dst_line += dst_stride;
3492 	w = width;
3493 
3494 	CHECKPOINT ();
3495 
3496 	while (w && (uintptr_t)dst & 7)
3497 	{
3498 	    __m64 vdest = load8888 (dst);
3499 
3500 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3501 
3502 	    w--;
3503 	    dst++;
3504 	}
3505 
3506 	while (w >= 2)
3507 	{
3508 	    __m64 vdest = *(__m64 *)dst;
3509 	    __m64 dest0 = expand8888 (vdest, 0);
3510 	    __m64 dest1 = expand8888 (vdest, 1);
3511 
3512 
3513 	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
3514 	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
3515 
3516 	    *(__m64 *)dst = pack8888 (dest0, dest1);
3517 
3518 	    dst += 2;
3519 	    w -= 2;
3520 	}
3521 
3522 	CHECKPOINT ();
3523 
3524 	if (w)
3525 	{
3526 	    __m64 vdest = load8888 (dst);
3527 
3528 	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3529 	}
3530     }
3531 
3532     _mm_empty ();
3533 }
3534 
3535 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3536 #define BMSK (BSHIFT - 1)
3537 
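/* BSHIFT is the bilinear weight range (1 << BILINEAR_INTERPOLATION_BITS) and
 * BMSK masks a value to that range.  The macros below keep the per-scanline
 * interpolation state (vertical weights, horizontal step, current x) in MMX
 * registers and produce one interpolated a8r8g8b8 pixel per invocation. */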
3538 #define BILINEAR_DECLARE_VARIABLES						\
3539     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
3540     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
3541     const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
3542     const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
3543     const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
3544     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
3545     const __m64 mm_zero = _mm_setzero_si64 ();					\
3546     __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3547 
3548 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
3549 do {										\
3550     /* fetch 2x2 pixel block into 2 mmx registers */				\
3551     __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
3552     __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
3553     /* vertical interpolation */						\
3554     __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
3555     __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
3556     __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
3557     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
3558     __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
3559     __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
3560     vx += unit_x;								\
3561     if (BILINEAR_INTERPOLATION_BITS < 8)					\
3562     {										\
3563 	/* calculate horizontal weights */					\
3564 	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
3565 			  _mm_srli_pi16 (mm_x,					\
3566 					 16 - BILINEAR_INTERPOLATION_BITS)));	\
3567 	/* horizontal interpolation */						\
3568 	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
3569 	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
3570 	lo = _mm_madd_pi16 (p, mm_wh);						\
3571 	hi = _mm_madd_pi16 (q, mm_wh);						\
3572     }										\
3573     else									\
3574     {										\
3575 	/* calculate horizontal weights */					\
3576 	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
3577 					16 - BILINEAR_INTERPOLATION_BITS));	\
3578 	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
3579 					16 - BILINEAR_INTERPOLATION_BITS);	\
3580 	/* horizontal interpolation */						\
3581 	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
3582 	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
3583 	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
3584 	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
3585 	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
3586 			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
3587 	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
3588 			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
3589     }										\
3590     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3591     /* shift and pack the result */						\
3592     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
3593     lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
3594     lo = _mm_packs_pi32 (lo, hi);						\
3595     lo = _mm_packs_pu16 (lo, lo);						\
3596     pix = lo;									\
3597 } while (0)
3598 
3599 #define BILINEAR_SKIP_ONE_PIXEL()						\
3600 do {										\
3601     vx += unit_x;								\
3602     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
3603 } while(0)
3604 
3605 static force_inline void
3606 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
3607 					    const uint32_t * mask,
3608 					    const uint32_t * src_top,
3609 					    const uint32_t * src_bottom,
3610 					    int32_t          w,
3611 					    int              wt,
3612 					    int              wb,
3613 					    pixman_fixed_t   vx,
3614 					    pixman_fixed_t   unit_x,
3615 					    pixman_fixed_t   max_vx,
3616 					    pixman_bool_t    zero_src)
3617 {
3618     BILINEAR_DECLARE_VARIABLES;
3619     __m64 pix;
3620 
3621     while (w--)
3622     {
3623 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3624 	store (dst, pix);
3625 	dst++;
3626     }
3627 
3628     _mm_empty ();
3629 }
3630 
3631 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3632 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3633 			       uint32_t, uint32_t, uint32_t,
3634 			       COVER, FLAG_NONE)
3635 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3636 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3637 			       uint32_t, uint32_t, uint32_t,
3638 			       PAD, FLAG_NONE)
3639 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3640 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3641 			       uint32_t, uint32_t, uint32_t,
3642 			       NONE, FLAG_NONE)
3643 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3644 			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
3645 			       uint32_t, uint32_t, uint32_t,
3646 			       NORMAL, FLAG_NONE)
3647 
3648 static force_inline void
3649 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
3650 					     const uint32_t * mask,
3651 					     const uint32_t * src_top,
3652 					     const uint32_t * src_bottom,
3653 					     int32_t          w,
3654 					     int              wt,
3655 					     int              wb,
3656 					     pixman_fixed_t   vx,
3657 					     pixman_fixed_t   unit_x,
3658 					     pixman_fixed_t   max_vx,
3659 					     pixman_bool_t    zero_src)
3660 {
3661     BILINEAR_DECLARE_VARIABLES;
3662     __m64 pix1, pix2;
3663 
3664     while (w)
3665     {
3666 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3667 
3668 	if (!is_zero (pix1))
3669 	{
3670 	    pix2 = load (dst);
3671 	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3672 	}
3673 
3674 	w--;
3675 	dst++;
3676     }
3677 
3678     _mm_empty ();
3679 }
3680 
3681 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3682 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3683 			       uint32_t, uint32_t, uint32_t,
3684 			       COVER, FLAG_NONE)
3685 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3686 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3687 			       uint32_t, uint32_t, uint32_t,
3688 			       PAD, FLAG_NONE)
3689 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3690 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3691 			       uint32_t, uint32_t, uint32_t,
3692 			       NONE, FLAG_NONE)
3693 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3694 			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
3695 			       uint32_t, uint32_t, uint32_t,
3696 			       NORMAL, FLAG_NONE)
3697 
3698 static force_inline void
3699 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
3700 					       const uint8_t  * mask,
3701 					       const uint32_t * src_top,
3702 					       const uint32_t * src_bottom,
3703 					       int32_t          w,
3704 					       int              wt,
3705 					       int              wb,
3706 					       pixman_fixed_t   vx,
3707 					       pixman_fixed_t   unit_x,
3708 					       pixman_fixed_t   max_vx,
3709 					       pixman_bool_t    zero_src)
3710 {
3711     BILINEAR_DECLARE_VARIABLES;
3712     __m64 pix1, pix2;
3713     uint32_t m;
3714 
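    /* Masked bilinear OVER: a zero mask byte skips the interpolation
     * entirely via BILINEAR_SKIP_ONE_PIXEL; a 0xff mask over an opaque
     * interpolated pixel is stored directly, and the general case goes
     * through in_over() with the expanded mask. */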
3715     while (w)
3716     {
3717 	m = (uint32_t) *mask++;
3718 
3719 	if (m)
3720 	{
3721 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3722 
3723 	    if (m == 0xff && is_opaque (pix1))
3724 	    {
3725 		store (dst, pix1);
3726 	    }
3727 	    else
3728 	    {
3729 		__m64 ms, md, ma, msa;
3730 
3731 		pix2 = load (dst);
3732 		ma = expand_alpha_rev (to_m64 (m));
3733 		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3734 		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3735 
3736 		msa = expand_alpha (ms);
3737 
3738 		store8888 (dst, (in_over (ms, msa, ma, md)));
3739 	    }
3740 	}
3741 	else
3742 	{
3743 	    BILINEAR_SKIP_ONE_PIXEL ();
3744 	}
3745 
3746 	w--;
3747 	dst++;
3748     }
3749 
3750     _mm_empty ();
3751 }
3752 
3753 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3754 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3755 			       uint32_t, uint8_t, uint32_t,
3756 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
3757 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3758 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3759 			       uint32_t, uint8_t, uint32_t,
3760 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
3761 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3762 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3763 			       uint32_t, uint8_t, uint32_t,
3764 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
3765 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3766 			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3767 			       uint32_t, uint8_t, uint32_t,
3768 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3769 
3770 static uint32_t *
3771 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3772 {
3773     int w = iter->width;
3774     uint32_t *dst = iter->buffer;
3775     uint32_t *src = (uint32_t *)iter->bits;
3776 
3777     iter->bits += iter->stride;
3778 
3779     while (w && ((uintptr_t)dst) & 7)
3780     {
3781 	*dst++ = (*src++) | 0xff000000;
3782 	w--;
3783     }
3784 
3785     while (w >= 8)
3786     {
3787 	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3788 	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3789 	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3790 	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3791 
3792 	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3793 	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3794 	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3795 	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3796 
3797 	dst += 8;
3798 	src += 8;
3799 	w -= 8;
3800     }
3801 
3802     while (w)
3803     {
3804 	*dst++ = (*src++) | 0xff000000;
3805 	w--;
3806     }
3807 
3808     _mm_empty ();
3809     return iter->buffer;
3810 }
3811 
3812 static uint32_t *
3813 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3814 {
3815     int w = iter->width;
3816     uint32_t *dst = iter->buffer;
3817     uint16_t *src = (uint16_t *)iter->bits;
3818 
3819     iter->bits += iter->stride;
3820 
3821     while (w && ((uintptr_t)dst) & 0x0f)
3822     {
3823 	uint16_t s = *src++;
3824 
3825 	*dst++ = convert_0565_to_8888 (s);
3826 	w--;
3827     }
3828 
3829     while (w >= 4)
3830     {
3831 	__m64 vsrc = ldq_u ((__m64 *)src);
3832 	__m64 mm0, mm1;
3833 
3834 	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3835 
3836 	*(__m64 *)(dst + 0) = mm0;
3837 	*(__m64 *)(dst + 2) = mm1;
3838 
3839 	dst += 4;
3840 	src += 4;
3841 	w -= 4;
3842     }
3843 
3844     while (w)
3845     {
3846 	uint16_t s = *src++;
3847 
3848 	*dst++ = convert_0565_to_8888 (s);
3849 	w--;
3850     }
3851 
3852     _mm_empty ();
3853     return iter->buffer;
3854 }
3855 
3856 static uint32_t *
3857 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3858 {
3859     int w = iter->width;
3860     uint32_t *dst = iter->buffer;
3861     uint8_t *src = iter->bits;
3862 
3863     iter->bits += iter->stride;
3864 
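    /* Scanline fetcher for a8: every alpha byte becomes an a8r8g8b8 pixel
     * with zero color channels.  The unpack sequence moves each source byte
     * into the top byte of a 32-bit lane, eight pixels per pass. */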
3865     while (w && (((uintptr_t)dst) & 15))
3866     {
3867         *dst++ = *(src++) << 24;
3868         w--;
3869     }
3870 
3871     while (w >= 8)
3872     {
3873 	__m64 mm0 = ldq_u ((__m64 *)src);
3874 
3875 	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
3876 	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
3877 	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3878 	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3879 	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3880 	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3881 
3882 	*(__m64 *)(dst + 0) = mm3;
3883 	*(__m64 *)(dst + 2) = mm4;
3884 	*(__m64 *)(dst + 4) = mm5;
3885 	*(__m64 *)(dst + 6) = mm6;
3886 
3887 	dst += 8;
3888 	src += 8;
3889 	w -= 8;
3890     }
3891 
3892     while (w)
3893     {
3894 	*dst++ = *(src++) << 24;
3895 	w--;
3896     }
3897 
3898     _mm_empty ();
3899     return iter->buffer;
3900 }
3901 
3902 typedef struct
3903 {
3904     pixman_format_code_t	format;
3905     pixman_iter_get_scanline_t	get_scanline;
3906 } fetcher_info_t;
3907 
3908 static const fetcher_info_t fetchers[] =
3909 {
3910     { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
3911     { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
3912     { PIXMAN_a8,		mmx_fetch_a8 },
3913     { PIXMAN_null }
3914 };
3915 
static pixman_bool_t
mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

#define FLAGS								\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW)			&&
	(iter->image_flags & FLAGS) == FLAGS)
    {
	const fetcher_info_t *f;

	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
	{
	    if (image->common.extended_format_code == f->format)
	    {
		uint8_t *b = (uint8_t *)image->bits.bits;
		int s = image->bits.rowstride * 4;

		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
		iter->stride = s;

		iter->get_scanline = f->get_scanline;
		return TRUE;
	    }
	}
    }

    return FALSE;
}

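/* Compositing fast paths provided by this implementation.  Each entry
 * names an operator, a source format, a mask format (or solid/null) and a
 * destination format, together with the routine that handles that
 * combination; the table is scanned when pixman looks up a fast path for
 * a composite request and ends with a PIXMAN_OP_NONE sentinel.
 */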
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),

    { PIXMAN_OP_NONE },
};

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->src_iter_init = mmx_src_iter_init;

    return imp;
}

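/* A minimal sketch of how this implementation is typically hooked into
 * the delegate chain (the actual caller lives in the per-architecture
 * setup code, e.g. pixman-x86.c; have_mmx () below stands in for whatever
 * CPU-feature check that caller uses and is an illustration only):
 *
 *     if (have_mmx ())
 *         imp = _pixman_implementation_create_mmx (imp);
 *
 * Operations that no MMX fast path or combiner handles fall through to
 * the fallback implementation passed in above.
 */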
#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */