1 /*
2 * Copyright © 2007 Luca Barbato
3 *
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of Luca Barbato not be used in advertising or
9 * publicity pertaining to distribution of the software without specific,
10 * written prior permission. Luca Barbato makes no representations about the
11 * suitability of this software for any purpose. It is provided "as is"
12 * without express or implied warranty.
13 *
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21 * SOFTWARE.
22 *
23 * Author: Luca Barbato (lu_zero@gentoo.org)
24 *
25 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include <config.h>
30 #endif
31 #include "pixman-private.h"
32 #include "pixman-combine32.h"
33 #include "pixman-inlines.h"
34 #include <altivec.h>
35
36 #define AVV(x...) {x}
37
38 static vector unsigned int mask_ff000000;
39 static vector unsigned int mask_red;
40 static vector unsigned int mask_green;
41 static vector unsigned int mask_blue;
42 static vector unsigned int mask_565_fix_rb;
43 static vector unsigned int mask_565_fix_g;
44
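/* Replicate each pixel's alpha byte across all four bytes of that pixel,
 * picking byte 0 or byte 3 of every 32-bit element depending on endianness.
 */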
45 static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
47 {
48 #ifdef WORDS_BIGENDIAN
49 return vec_perm (pix, pix,
50 (vector unsigned char)AVV (
51 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
52 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
53 #else
54 return vec_perm (pix, pix,
55 (vector unsigned char)AVV (
56 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
57 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
58 #endif
59 }
60
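/* Spread the four bytes of the first 32-bit element so that byte i fills
 * all four bytes of element i; used below to expand four packed a8 mask
 * values into per-channel form.
 */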
61 static force_inline vector unsigned int
splat_pixel (vector unsigned int pix)
63 {
64 return vec_perm (pix, pix,
65 (vector unsigned char)AVV (
66 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
67 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
68 }
69
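/* Per-channel multiply of two vectors of a8r8g8b8 pixels, treating each
 * byte as a fraction in [0, 255].  Each product t = p*a + 0x80 is folded
 * as (t + (t >> 8)) >> 8, the usual rounded approximation of t / 255.
 */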
70 static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
72 {
73 vector unsigned short hi, lo, mod;
74
75 /* unpack to short */
76 hi = (vector unsigned short)
77 #ifdef WORDS_BIGENDIAN
78 vec_mergeh ((vector unsigned char)AVV (0),
79 (vector unsigned char)p);
80 #else
81 vec_mergeh ((vector unsigned char) p,
82 (vector unsigned char) AVV (0));
83 #endif
84
85 mod = (vector unsigned short)
86 #ifdef WORDS_BIGENDIAN
87 vec_mergeh ((vector unsigned char)AVV (0),
88 (vector unsigned char)a);
89 #else
90 vec_mergeh ((vector unsigned char) a,
91 (vector unsigned char) AVV (0));
92 #endif
93
94 hi = vec_mladd (hi, mod, (vector unsigned short)
95 AVV (0x0080, 0x0080, 0x0080, 0x0080,
96 0x0080, 0x0080, 0x0080, 0x0080));
97
98 hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
99
100 hi = vec_sr (hi, vec_splat_u16 (8));
101
102 /* unpack to short */
103 lo = (vector unsigned short)
104 #ifdef WORDS_BIGENDIAN
105 vec_mergel ((vector unsigned char)AVV (0),
106 (vector unsigned char)p);
107 #else
108 vec_mergel ((vector unsigned char) p,
109 (vector unsigned char) AVV (0));
110 #endif
111
112 mod = (vector unsigned short)
113 #ifdef WORDS_BIGENDIAN
114 vec_mergel ((vector unsigned char)AVV (0),
115 (vector unsigned char)a);
116 #else
117 vec_mergel ((vector unsigned char) a,
118 (vector unsigned char) AVV (0));
119 #endif
120
121 lo = vec_mladd (lo, mod, (vector unsigned short)
122 AVV (0x0080, 0x0080, 0x0080, 0x0080,
123 0x0080, 0x0080, 0x0080, 0x0080));
124
125 lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
126
127 lo = vec_sr (lo, vec_splat_u16 (8));
128
129 return (vector unsigned int)vec_packsu (hi, lo);
130 }
131
132 static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
134 {
135 return (vector unsigned int)vec_adds ((vector unsigned char)a,
136 (vector unsigned char)b);
137 }
138
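/* Compute x*a + y*b per channel: two rounded multiplies as in pix_multiply,
 * added together with unsigned saturation.
 */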
139 static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
141 vector unsigned int a,
142 vector unsigned int y,
143 vector unsigned int b)
144 {
145 vector unsigned int t1, t2;
146
147 t1 = pix_multiply (x, a);
148 t2 = pix_multiply (y, b);
149
150 return pix_add (t1, t2);
151 }
152
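/* Bitwise complement; on 8-bit channel data this is 255 - x for every byte. */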
153 static force_inline vector unsigned int
negate (vector unsigned int src)
155 {
156 return vec_nor (src, src);
157 }
158
159 /* dest*~srca + src */
160 static force_inline vector unsigned int
over (vector unsigned int src,
162 vector unsigned int srca,
163 vector unsigned int dest)
164 {
165 vector unsigned char tmp = (vector unsigned char)
166 pix_multiply (dest, negate (srca));
167
168 tmp = vec_adds ((vector unsigned char)src, tmp);
169 return (vector unsigned int)tmp;
170 }
171
/* in_over = (src IN mask) OVER dest; the IN operator is pix_multiply */
173 #define in_over(src, srca, mask, dest) \
174 over (pix_multiply (src, mask), \
175 pix_multiply (srca, mask), dest)
176
177 #ifdef WORDS_BIGENDIAN
178
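/* Big-endian path: unaligned loads are emulated by hand.  vec_lvsl derives a
 * permute mask from the low bits of the address, and LOAD_VECTOR then reads
 * the two aligned 16-byte blocks covering the data and merges them with
 * vec_perm.
 */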
179 #define COMPUTE_SHIFT_MASK(source) \
180 source ## _mask = vec_lvsl (0, source);
181
182 #define COMPUTE_SHIFT_MASKS(dest, source) \
183 source ## _mask = vec_lvsl (0, source);
184
185 #define COMPUTE_SHIFT_MASKC(dest, source, mask) \
186 mask ## _mask = vec_lvsl (0, mask); \
187 source ## _mask = vec_lvsl (0, source);
188
189 #define LOAD_VECTOR(source) \
190 do \
191 { \
192 vector unsigned char tmp1, tmp2; \
193 tmp1 = (typeof(tmp1))vec_ld (0, source); \
194 tmp2 = (typeof(tmp2))vec_ld (15, source); \
195 v ## source = (typeof(v ## source)) \
196 vec_perm (tmp1, tmp2, source ## _mask); \
197 } while (0)
198
199 #define LOAD_VECTORS(dest, source) \
200 do \
201 { \
202 LOAD_VECTOR(source); \
203 v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
204 } while (0)
205
206 #define LOAD_VECTORSC(dest, source, mask) \
207 do \
208 { \
209 LOAD_VECTORS(dest, source); \
210 LOAD_VECTOR(mask); \
211 } while (0)
212
213 #define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
214 #define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
215
216 #else
217
/* The COMPUTE_SHIFT_{MASK, MASKS, MASKC} macros below are no-ops.
 * They are defined that way because little-endian AltiVec can do unaligned
 * reads natively and has no need to construct the permutation pattern
 * variables.
 */
223 #define COMPUTE_SHIFT_MASK(source)
224
225 #define COMPUTE_SHIFT_MASKS(dest, source)
226
227 #define COMPUTE_SHIFT_MASKC(dest, source, mask)
228
229 # define LOAD_VECTOR(source) \
230 v ## source = (typeof(v ## source))vec_xl(0, source);
231
232 # define LOAD_VECTORS(dest, source) \
233 LOAD_VECTOR(source); \
234 LOAD_VECTOR(dest); \
235
236 # define LOAD_VECTORSC(dest, source, mask) \
237 LOAD_VECTORS(dest, source); \
238 LOAD_VECTOR(mask); \
239
240 #define DECLARE_SRC_MASK_VAR
241 #define DECLARE_MASK_MASK_VAR
242
243 #endif /* WORDS_BIGENDIAN */
244
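/* As LOAD_VECTORSC, but additionally multiply the loaded source by the
 * alpha of the loaded mask (used by the *_u combiners when a mask is
 * present).
 */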
245 #define LOAD_VECTORSM(dest, source, mask) \
246 LOAD_VECTORSC (dest, source, mask); \
247 v ## source = pix_multiply (v ## source, \
248 splat_alpha (v ## mask));
249
250 #define STORE_VECTOR(dest) \
251 vec_st ((vector unsigned int) v ## dest, 0, dest);
252
/* load 4 pixels from a 16-byte-aligned address */
254 static force_inline vector unsigned int
load_128_aligned (const uint32_t* src)
256 {
257 return *((vector unsigned int *) src);
258 }
259
/* load 4 pixels from an unaligned address */
261 static force_inline vector unsigned int
load_128_unaligned (const uint32_t* src)
263 {
264 vector unsigned int vsrc;
265 DECLARE_SRC_MASK_VAR;
266
267 COMPUTE_SHIFT_MASK (src);
268 LOAD_VECTOR (src);
269
270 return vsrc;
271 }
272
/* store 4 pixels to a 16-byte-aligned address */
274 static force_inline void
save_128_aligned (uint32_t* data,
276 vector unsigned int vdata)
277 {
278 STORE_VECTOR(data)
279 }
280
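/* Broadcast the 32-bit value at *src into all four lanes of a vector. */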
281 static force_inline vector unsigned int
create_mask_1x32_128 (const uint32_t *src)
283 {
284 vector unsigned int vsrc;
285 DECLARE_SRC_MASK_VAR;
286
287 COMPUTE_SHIFT_MASK (src);
288 LOAD_VECTOR (src);
289 return vec_splat(vsrc, 0);
290 }
291
292 static force_inline vector unsigned int
create_mask_32_128 (uint32_t mask)
294 {
295 return create_mask_1x32_128(&mask);
296 }
297
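/* SSE2-style unpack helpers: interleave two vectors at byte or halfword
 * granularity, with the vec_merge operand order swapped between endiannesses
 * so both give the same lane layout.
 */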
298 static force_inline vector unsigned int
unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
300 {
301 vector unsigned char lo;
302
303 /* unpack to short */
304 lo = (vector unsigned char)
305 #ifdef WORDS_BIGENDIAN
306 vec_mergel ((vector unsigned char) data2,
307 (vector unsigned char) data1);
308 #else
309 vec_mergel ((vector unsigned char) data1,
310 (vector unsigned char) data2);
311 #endif
312
313 return (vector unsigned int) lo;
314 }
315
316 static force_inline vector unsigned int
unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
318 {
319 vector unsigned char hi;
320
321 /* unpack to short */
322 hi = (vector unsigned char)
323 #ifdef WORDS_BIGENDIAN
324 vec_mergeh ((vector unsigned char) data2,
325 (vector unsigned char) data1);
326 #else
327 vec_mergeh ((vector unsigned char) data1,
328 (vector unsigned char) data2);
329 #endif
330
331 return (vector unsigned int) hi;
332 }
333
334 static force_inline vector unsigned int
unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
336 {
337 vector unsigned short lo;
338
339 /* unpack to char */
340 lo = (vector unsigned short)
341 #ifdef WORDS_BIGENDIAN
342 vec_mergel ((vector unsigned short) data2,
343 (vector unsigned short) data1);
344 #else
345 vec_mergel ((vector unsigned short) data1,
346 (vector unsigned short) data2);
347 #endif
348
349 return (vector unsigned int) lo;
350 }
351
352 static force_inline vector unsigned int
unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
354 {
355 vector unsigned short hi;
356
357 /* unpack to char */
358 hi = (vector unsigned short)
359 #ifdef WORDS_BIGENDIAN
360 vec_mergeh ((vector unsigned short) data2,
361 (vector unsigned short) data1);
362 #else
363 vec_mergeh ((vector unsigned short) data1,
364 (vector unsigned short) data2);
365 #endif
366
367 return (vector unsigned int) hi;
368 }
369
370 static force_inline void
unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
372 vector unsigned int* data_lo, vector unsigned int* data_hi)
373 {
374 *data_lo = unpacklo_128_16x8(data1, data2);
375 *data_hi = unpackhi_128_16x8(data1, data2);
376 }
377
378 static force_inline void
unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
380 vector unsigned int* data_lo, vector unsigned int* data_hi)
381 {
382 *data_lo = unpacklo_128_8x16(data1, data2);
383 *data_hi = unpackhi_128_8x16(data1, data2);
384 }
385
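/* Expand four r5g6b5 values (one per 32-bit lane) to 8-bit-per-channel form:
 * each field is shifted into its 8-bit slot and its top bits are replicated
 * into the freshly opened low bits.  Relies on the mask_* constants set up
 * elsewhere in this file.
 */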
386 static force_inline vector unsigned int
unpack_565_to_8888 (vector unsigned int lo)
388 {
389 vector unsigned int r, g, b, rb, t;
390
391 r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
392 g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
393 b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
394
395 rb = vec_or (r, b);
396 t = vec_and (rb, mask_565_fix_rb);
397 t = vec_sr (t, create_mask_32_128(5));
398 rb = vec_or (rb, t);
399
400 t = vec_and (g, mask_565_fix_g);
401 t = vec_sr (t, create_mask_32_128(6));
402 g = vec_or (g, t);
403
404 return vec_or (rb, g);
405 }
406
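/* Whole-vector tests on four a8r8g8b8 pixels: all pixels opaque, all bytes
 * zero, or all alpha bytes zero.  Used to skip work for fully opaque or
 * fully transparent blocks.
 */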
407 static force_inline int
is_opaque (vector unsigned int x)
409 {
410 uint32_t cmp_result;
411 vector bool int ffs = vec_cmpeq(x, x);
412
413 cmp_result = vec_all_eq(x, ffs);
414
415 return (cmp_result & 0x8888) == 0x8888;
416 }
417
418 static force_inline int
is_zero (vector unsigned int x)
420 {
421 uint32_t cmp_result;
422
423 cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
424
425 return cmp_result == 0xffff;
426 }
427
428 static force_inline int
is_transparent (vector unsigned int x)
430 {
431 uint32_t cmp_result;
432
433 cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
434 return (cmp_result & 0x8888) == 0x8888;
435 }
436
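/* Scalar OVER for one pixel: return src unchanged when it is opaque,
 * otherwise dst * (255 - src alpha) / 255 + src.
 */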
437 static force_inline uint32_t
core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
439 {
440 uint32_t a;
441
442 a = ALPHA_8(src);
443
444 if (a == 0xff)
445 {
446 return src;
447 }
448 else if (src)
449 {
450 UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
451 }
452
453 return dst;
454 }
455
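/* combine1/combine4 fold an optional mask into the source before the main
 * combine step (scalar and 4-pixel variants); combine4 also returns zero
 * early for a fully transparent mask.
 */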
456 static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
458 {
459 uint32_t s = *ps;
460
461 if (pm)
462 UN8x4_MUL_UN8(s, ALPHA_8(*pm));
463
464 return s;
465 }
466
467 static force_inline vector unsigned int
combine4 (const uint32_t* ps, const uint32_t* pm)
469 {
470 vector unsigned int src, msk;
471
472 if (pm)
473 {
474 msk = load_128_unaligned(pm);
475
476 if (is_transparent(msk))
477 return (vector unsigned int) AVV(0);
478 }
479
480 src = load_128_unaligned(ps);
481
482 if (pm)
483 src = pix_multiply(src, msk);
484
485 return src;
486 }
487
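/* All of the vmx_combine_* routines below follow the same pattern: a scalar
 * loop until dest reaches 16-byte alignment, a vector loop handling four
 * pixels per iteration, and a scalar tail for the remaining width % 4
 * pixels.
 */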
488 static void
vmx_combine_over_u_no_mask (uint32_t * dest,
490 const uint32_t *src,
491 int width)
492 {
493 int i;
494 vector unsigned int vdest, vsrc;
495 DECLARE_SRC_MASK_VAR;
496
497 while (width && ((uintptr_t)dest & 15))
498 {
499 uint32_t s = *src++;
500 uint32_t d = *dest;
501 uint32_t ia = ALPHA_8 (~s);
502
503 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
504
505 *dest++ = d;
506 width--;
507 }
508
509 COMPUTE_SHIFT_MASKS (dest, src);
510
511 /* printf ("%s\n",__PRETTY_FUNCTION__); */
512 for (i = width / 4; i > 0; i--)
513 {
514
515 LOAD_VECTORS (dest, src);
516
517 vdest = over (vsrc, splat_alpha (vsrc), vdest);
518
519 STORE_VECTOR (dest);
520
521 src += 4;
522 dest += 4;
523 }
524
525 for (i = width % 4; --i >= 0;)
526 {
527 uint32_t s = src[i];
528 uint32_t d = dest[i];
529 uint32_t ia = ALPHA_8 (~s);
530
531 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
532
533 dest[i] = d;
534 }
535 }
536
537 static void
vmx_combine_over_u_mask (uint32_t * dest,
539 const uint32_t *src,
540 const uint32_t *mask,
541 int width)
542 {
543 int i;
544 vector unsigned int vdest, vsrc, vmask;
545 DECLARE_SRC_MASK_VAR;
546 DECLARE_MASK_MASK_VAR;
547
548 while (width && ((uintptr_t)dest & 15))
549 {
550 uint32_t m = ALPHA_8 (*mask++);
551 uint32_t s = *src++;
552 uint32_t d = *dest;
553 uint32_t ia;
554
555 UN8x4_MUL_UN8 (s, m);
556
557 ia = ALPHA_8 (~s);
558
559 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
560 *dest++ = d;
561 width--;
562 }
563
564 COMPUTE_SHIFT_MASKC (dest, src, mask);
565
566 /* printf ("%s\n",__PRETTY_FUNCTION__); */
567 for (i = width / 4; i > 0; i--)
568 {
569 LOAD_VECTORSM (dest, src, mask);
570
571 vdest = over (vsrc, splat_alpha (vsrc), vdest);
572
573 STORE_VECTOR (dest);
574
575 src += 4;
576 dest += 4;
577 mask += 4;
578 }
579
580 for (i = width % 4; --i >= 0;)
581 {
582 uint32_t m = ALPHA_8 (mask[i]);
583 uint32_t s = src[i];
584 uint32_t d = dest[i];
585 uint32_t ia;
586
587 UN8x4_MUL_UN8 (s, m);
588
589 ia = ALPHA_8 (~s);
590
591 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
592 dest[i] = d;
593 }
594 }
595
596 static void
vmx_combine_over_u (pixman_implementation_t *imp,
598 pixman_op_t op,
599 uint32_t * dest,
600 const uint32_t * src,
601 const uint32_t * mask,
602 int width)
603 {
604 if (mask)
605 vmx_combine_over_u_mask (dest, src, mask, width);
606 else
607 vmx_combine_over_u_no_mask (dest, src, width);
608 }
609
610 static void
vmx_combine_over_reverse_u_no_mask (uint32_t * dest,
612 const uint32_t *src,
613 int width)
614 {
615 int i;
616 vector unsigned int vdest, vsrc;
617 DECLARE_SRC_MASK_VAR;
618
619 while (width && ((uintptr_t)dest & 15))
620 {
621 uint32_t s = *src++;
622 uint32_t d = *dest;
623 uint32_t ia = ALPHA_8 (~d);
624
625 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
626 *dest++ = s;
627 width--;
628 }
629
630 COMPUTE_SHIFT_MASKS (dest, src);
631
632 /* printf ("%s\n",__PRETTY_FUNCTION__); */
633 for (i = width / 4; i > 0; i--)
634 {
635
636 LOAD_VECTORS (dest, src);
637
638 vdest = over (vdest, splat_alpha (vdest), vsrc);
639
640 STORE_VECTOR (dest);
641
642 src += 4;
643 dest += 4;
644 }
645
646 for (i = width % 4; --i >= 0;)
647 {
648 uint32_t s = src[i];
649 uint32_t d = dest[i];
650 uint32_t ia = ALPHA_8 (~dest[i]);
651
652 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
653 dest[i] = s;
654 }
655 }
656
657 static void
vmx_combine_over_reverse_u_mask (uint32_t * dest,
659 const uint32_t *src,
660 const uint32_t *mask,
661 int width)
662 {
663 int i;
664 vector unsigned int vdest, vsrc, vmask;
665 DECLARE_SRC_MASK_VAR;
666 DECLARE_MASK_MASK_VAR;
667
668 while (width && ((uintptr_t)dest & 15))
669 {
670 uint32_t m = ALPHA_8 (*mask++);
671 uint32_t s = *src++;
672 uint32_t d = *dest;
673 uint32_t ia = ALPHA_8 (~d);
674
675 UN8x4_MUL_UN8 (s, m);
676
677 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
678 *dest++ = s;
679 width--;
680 }
681
682 COMPUTE_SHIFT_MASKC (dest, src, mask);
683
684 /* printf ("%s\n",__PRETTY_FUNCTION__); */
685 for (i = width / 4; i > 0; i--)
686 {
687
688 LOAD_VECTORSM (dest, src, mask);
689
690 vdest = over (vdest, splat_alpha (vdest), vsrc);
691
692 STORE_VECTOR (dest);
693
694 src += 4;
695 dest += 4;
696 mask += 4;
697 }
698
699 for (i = width % 4; --i >= 0;)
700 {
701 uint32_t m = ALPHA_8 (mask[i]);
702 uint32_t s = src[i];
703 uint32_t d = dest[i];
704 uint32_t ia = ALPHA_8 (~dest[i]);
705
706 UN8x4_MUL_UN8 (s, m);
707
708 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
709 dest[i] = s;
710 }
711 }
712
713 static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
715 pixman_op_t op,
716 uint32_t * dest,
717 const uint32_t * src,
718 const uint32_t * mask,
719 int width)
720 {
721 if (mask)
722 vmx_combine_over_reverse_u_mask (dest, src, mask, width);
723 else
724 vmx_combine_over_reverse_u_no_mask (dest, src, width);
725 }
726
727 static void
vmx_combine_in_u_no_mask (uint32_t * dest,
729 const uint32_t *src,
730 int width)
731 {
732 int i;
733 vector unsigned int vdest, vsrc;
734 DECLARE_SRC_MASK_VAR;
735
736 while (width && ((uintptr_t)dest & 15))
737 {
738 uint32_t s = *src++;
739 uint32_t a = ALPHA_8 (*dest);
740
741 UN8x4_MUL_UN8 (s, a);
742 *dest++ = s;
743 width--;
744 }
745
746 COMPUTE_SHIFT_MASKS (dest, src);
747
748 /* printf ("%s\n",__PRETTY_FUNCTION__); */
749 for (i = width / 4; i > 0; i--)
750 {
751 LOAD_VECTORS (dest, src);
752
753 vdest = pix_multiply (vsrc, splat_alpha (vdest));
754
755 STORE_VECTOR (dest);
756
757 src += 4;
758 dest += 4;
759 }
760
761 for (i = width % 4; --i >= 0;)
762 {
763 uint32_t s = src[i];
764 uint32_t a = ALPHA_8 (dest[i]);
765
766 UN8x4_MUL_UN8 (s, a);
767 dest[i] = s;
768 }
769 }
770
771 static void
vmx_combine_in_u_mask (uint32_t * dest,
773 const uint32_t *src,
774 const uint32_t *mask,
775 int width)
776 {
777 int i;
778 vector unsigned int vdest, vsrc, vmask;
779 DECLARE_SRC_MASK_VAR;
780 DECLARE_MASK_MASK_VAR;
781
782 while (width && ((uintptr_t)dest & 15))
783 {
784 uint32_t m = ALPHA_8 (*mask++);
785 uint32_t s = *src++;
786 uint32_t a = ALPHA_8 (*dest);
787
788 UN8x4_MUL_UN8 (s, m);
789 UN8x4_MUL_UN8 (s, a);
790
791 *dest++ = s;
792 width--;
793 }
794
795 COMPUTE_SHIFT_MASKC (dest, src, mask);
796
797 /* printf ("%s\n",__PRETTY_FUNCTION__); */
798 for (i = width / 4; i > 0; i--)
799 {
800 LOAD_VECTORSM (dest, src, mask);
801
802 vdest = pix_multiply (vsrc, splat_alpha (vdest));
803
804 STORE_VECTOR (dest);
805
806 src += 4;
807 dest += 4;
808 mask += 4;
809 }
810
811 for (i = width % 4; --i >= 0;)
812 {
813 uint32_t m = ALPHA_8 (mask[i]);
814 uint32_t s = src[i];
815 uint32_t a = ALPHA_8 (dest[i]);
816
817 UN8x4_MUL_UN8 (s, m);
818 UN8x4_MUL_UN8 (s, a);
819
820 dest[i] = s;
821 }
822 }
823
824 static void
vmx_combine_in_u (pixman_implementation_t *imp,
826 pixman_op_t op,
827 uint32_t * dest,
828 const uint32_t * src,
829 const uint32_t * mask,
830 int width)
831 {
832 if (mask)
833 vmx_combine_in_u_mask (dest, src, mask, width);
834 else
835 vmx_combine_in_u_no_mask (dest, src, width);
836 }
837
838 static void
vmx_combine_in_reverse_u_no_mask (uint32_t * dest,
840 const uint32_t *src,
841 int width)
842 {
843 int i;
844 vector unsigned int vdest, vsrc;
845 DECLARE_SRC_MASK_VAR;
846
847 while (width && ((uintptr_t)dest & 15))
848 {
849 uint32_t d = *dest;
850 uint32_t a = ALPHA_8 (*src++);
851
852 UN8x4_MUL_UN8 (d, a);
853
854 *dest++ = d;
855 width--;
856 }
857
858 COMPUTE_SHIFT_MASKS (dest, src);
859
860 /* printf ("%s\n",__PRETTY_FUNCTION__); */
861 for (i = width / 4; i > 0; i--)
862 {
863 LOAD_VECTORS (dest, src);
864
865 vdest = pix_multiply (vdest, splat_alpha (vsrc));
866
867 STORE_VECTOR (dest);
868
869 src += 4;
870 dest += 4;
871 }
872
873 for (i = width % 4; --i >= 0;)
874 {
875 uint32_t d = dest[i];
876 uint32_t a = ALPHA_8 (src[i]);
877
878 UN8x4_MUL_UN8 (d, a);
879
880 dest[i] = d;
881 }
882 }
883
884 static void
vmx_combine_in_reverse_u_mask (uint32_t * dest,
886 const uint32_t *src,
887 const uint32_t *mask,
888 int width)
889 {
890 int i;
891 vector unsigned int vdest, vsrc, vmask;
892 DECLARE_SRC_MASK_VAR;
893 DECLARE_MASK_MASK_VAR;
894
895 while (width && ((uintptr_t)dest & 15))
896 {
897 uint32_t m = ALPHA_8 (*mask++);
898 uint32_t d = *dest;
899 uint32_t a = *src++;
900
901 UN8x4_MUL_UN8 (a, m);
902 a = ALPHA_8 (a);
903 UN8x4_MUL_UN8 (d, a);
904
905 *dest++ = d;
906 width--;
907 }
908
909 COMPUTE_SHIFT_MASKC (dest, src, mask);
910
911 /* printf ("%s\n",__PRETTY_FUNCTION__); */
912 for (i = width / 4; i > 0; i--)
913 {
914 LOAD_VECTORSM (dest, src, mask);
915
916 vdest = pix_multiply (vdest, splat_alpha (vsrc));
917
918 STORE_VECTOR (dest);
919
920 src += 4;
921 dest += 4;
922 mask += 4;
923 }
924
925 for (i = width % 4; --i >= 0;)
926 {
927 uint32_t m = ALPHA_8 (mask[i]);
928 uint32_t d = dest[i];
929 uint32_t a = src[i];
930
931 UN8x4_MUL_UN8 (a, m);
932 a = ALPHA_8 (a);
933 UN8x4_MUL_UN8 (d, a);
934
935 dest[i] = d;
936 }
937 }
938
939 static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
941 pixman_op_t op,
942 uint32_t * dest,
943 const uint32_t * src,
944 const uint32_t * mask,
945 int width)
946 {
947 if (mask)
948 vmx_combine_in_reverse_u_mask (dest, src, mask, width);
949 else
950 vmx_combine_in_reverse_u_no_mask (dest, src, width);
951 }
952
953 static void
vmx_combine_out_u_no_mask (uint32_t * dest,
955 const uint32_t *src,
956 int width)
957 {
958 int i;
959 vector unsigned int vdest, vsrc;
960 DECLARE_SRC_MASK_VAR;
961
962 while (width && ((uintptr_t)dest & 15))
963 {
964 uint32_t s = *src++;
965 uint32_t a = ALPHA_8 (~(*dest));
966
967 UN8x4_MUL_UN8 (s, a);
968
969 *dest++ = s;
970 width--;
971 }
972
973 COMPUTE_SHIFT_MASKS (dest, src);
974
975 /* printf ("%s\n",__PRETTY_FUNCTION__); */
976 for (i = width / 4; i > 0; i--)
977 {
978 LOAD_VECTORS (dest, src);
979
980 vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
981
982 STORE_VECTOR (dest);
983
984 src += 4;
985 dest += 4;
986 }
987
988 for (i = width % 4; --i >= 0;)
989 {
990 uint32_t s = src[i];
991 uint32_t a = ALPHA_8 (~dest[i]);
992
993 UN8x4_MUL_UN8 (s, a);
994
995 dest[i] = s;
996 }
997 }
998
999 static void
vmx_combine_out_u_mask (uint32_t * dest,
1001 const uint32_t *src,
1002 const uint32_t *mask,
1003 int width)
1004 {
1005 int i;
1006 vector unsigned int vdest, vsrc, vmask;
1007 DECLARE_SRC_MASK_VAR;
1008 DECLARE_MASK_MASK_VAR;
1009
1010 while (width && ((uintptr_t)dest & 15))
1011 {
1012 uint32_t m = ALPHA_8 (*mask++);
1013 uint32_t s = *src++;
1014 uint32_t a = ALPHA_8 (~(*dest));
1015
1016 UN8x4_MUL_UN8 (s, m);
1017 UN8x4_MUL_UN8 (s, a);
1018
1019 *dest++ = s;
1020 width--;
1021 }
1022
1023 COMPUTE_SHIFT_MASKC (dest, src, mask);
1024
1025 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1026 for (i = width / 4; i > 0; i--)
1027 {
1028 LOAD_VECTORSM (dest, src, mask);
1029
1030 vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
1031
1032 STORE_VECTOR (dest);
1033
1034 src += 4;
1035 dest += 4;
1036 mask += 4;
1037 }
1038
1039 for (i = width % 4; --i >= 0;)
1040 {
1041 uint32_t m = ALPHA_8 (mask[i]);
1042 uint32_t s = src[i];
1043 uint32_t a = ALPHA_8 (~dest[i]);
1044
1045 UN8x4_MUL_UN8 (s, m);
1046 UN8x4_MUL_UN8 (s, a);
1047
1048 dest[i] = s;
1049 }
1050 }
1051
1052 static void
vmx_combine_out_u (pixman_implementation_t *imp,
1054 pixman_op_t op,
1055 uint32_t * dest,
1056 const uint32_t * src,
1057 const uint32_t * mask,
1058 int width)
1059 {
1060 if (mask)
1061 vmx_combine_out_u_mask (dest, src, mask, width);
1062 else
1063 vmx_combine_out_u_no_mask (dest, src, width);
1064 }
1065
1066 static void
vmx_combine_out_reverse_u_no_mask (uint32_t * dest,
1068 const uint32_t *src,
1069 int width)
1070 {
1071 int i;
1072 vector unsigned int vdest, vsrc;
1073 DECLARE_SRC_MASK_VAR;
1074
1075 while (width && ((uintptr_t)dest & 15))
1076 {
1077 uint32_t d = *dest;
1078 uint32_t a = ALPHA_8 (~(*src++));
1079
1080 UN8x4_MUL_UN8 (d, a);
1081
1082 *dest++ = d;
1083 width--;
1084 }
1085
1086 COMPUTE_SHIFT_MASKS (dest, src);
1087
1088 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1089 for (i = width / 4; i > 0; i--)
1090 {
1091
1092 LOAD_VECTORS (dest, src);
1093
1094 vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
1095
1096 STORE_VECTOR (dest);
1097
1098 src += 4;
1099 dest += 4;
1100 }
1101
1102 for (i = width % 4; --i >= 0;)
1103 {
1104 uint32_t d = dest[i];
1105 uint32_t a = ALPHA_8 (~src[i]);
1106
1107 UN8x4_MUL_UN8 (d, a);
1108
1109 dest[i] = d;
1110 }
1111 }
1112
1113 static void
vmx_combine_out_reverse_u_mask (uint32_t * dest,
1115 const uint32_t *src,
1116 const uint32_t *mask,
1117 int width)
1118 {
1119 int i;
1120 vector unsigned int vdest, vsrc, vmask;
1121 DECLARE_SRC_MASK_VAR;
1122 DECLARE_MASK_MASK_VAR;
1123
1124 while (width && ((uintptr_t)dest & 15))
1125 {
1126 uint32_t m = ALPHA_8 (*mask++);
1127 uint32_t d = *dest;
1128 uint32_t a = *src++;
1129
1130 UN8x4_MUL_UN8 (a, m);
1131 a = ALPHA_8 (~a);
1132 UN8x4_MUL_UN8 (d, a);
1133
1134 *dest++ = d;
1135 width--;
1136 }
1137
1138 COMPUTE_SHIFT_MASKC (dest, src, mask);
1139
1140 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1141 for (i = width / 4; i > 0; i--)
1142 {
1143 LOAD_VECTORSM (dest, src, mask);
1144
1145 vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
1146
1147 STORE_VECTOR (dest);
1148
1149 src += 4;
1150 dest += 4;
1151 mask += 4;
1152 }
1153
1154 for (i = width % 4; --i >= 0;)
1155 {
1156 uint32_t m = ALPHA_8 (mask[i]);
1157 uint32_t d = dest[i];
1158 uint32_t a = src[i];
1159
1160 UN8x4_MUL_UN8 (a, m);
1161 a = ALPHA_8 (~a);
1162 UN8x4_MUL_UN8 (d, a);
1163
1164 dest[i] = d;
1165 }
1166 }
1167
1168 static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
1170 pixman_op_t op,
1171 uint32_t * dest,
1172 const uint32_t * src,
1173 const uint32_t * mask,
1174 int width)
1175 {
1176 if (mask)
1177 vmx_combine_out_reverse_u_mask (dest, src, mask, width);
1178 else
1179 vmx_combine_out_reverse_u_no_mask (dest, src, width);
1180 }
1181
1182 static void
vmx_combine_atop_u_no_mask (uint32_t * dest,
1184 const uint32_t *src,
1185 int width)
1186 {
1187 int i;
1188 vector unsigned int vdest, vsrc;
1189 DECLARE_SRC_MASK_VAR;
1190
1191 while (width && ((uintptr_t)dest & 15))
1192 {
1193 uint32_t s = *src++;
1194 uint32_t d = *dest;
1195 uint32_t dest_a = ALPHA_8 (d);
1196 uint32_t src_ia = ALPHA_8 (~s);
1197
1198 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1199
1200 *dest++ = s;
1201 width--;
1202 }
1203
1204 COMPUTE_SHIFT_MASKS (dest, src);
1205
1206 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1207 for (i = width / 4; i > 0; i--)
1208 {
1209 LOAD_VECTORS (dest, src);
1210
1211 vdest = pix_add_mul (vsrc, splat_alpha (vdest),
1212 vdest, splat_alpha (negate (vsrc)));
1213
1214 STORE_VECTOR (dest);
1215
1216 src += 4;
1217 dest += 4;
1218 }
1219
1220 for (i = width % 4; --i >= 0;)
1221 {
1222 uint32_t s = src[i];
1223 uint32_t d = dest[i];
1224 uint32_t dest_a = ALPHA_8 (d);
1225 uint32_t src_ia = ALPHA_8 (~s);
1226
1227 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1228
1229 dest[i] = s;
1230 }
1231 }
1232
1233 static void
vmx_combine_atop_u_mask (uint32_t * dest,
1235 const uint32_t *src,
1236 const uint32_t *mask,
1237 int width)
1238 {
1239 int i;
1240 vector unsigned int vdest, vsrc, vmask;
1241 DECLARE_SRC_MASK_VAR;
1242 DECLARE_MASK_MASK_VAR;
1243
1244 while (width && ((uintptr_t)dest & 15))
1245 {
1246 uint32_t m = ALPHA_8 (*mask++);
1247 uint32_t s = *src++;
1248 uint32_t d = *dest;
1249 uint32_t dest_a = ALPHA_8 (d);
1250 uint32_t src_ia;
1251
1252 UN8x4_MUL_UN8 (s, m);
1253
1254 src_ia = ALPHA_8 (~s);
1255
1256 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1257
1258 *dest++ = s;
1259 width--;
1260 }
1261
1262 COMPUTE_SHIFT_MASKC (dest, src, mask);
1263
1264 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1265 for (i = width / 4; i > 0; i--)
1266 {
1267 LOAD_VECTORSM (dest, src, mask);
1268
1269 vdest = pix_add_mul (vsrc, splat_alpha (vdest),
1270 vdest, splat_alpha (negate (vsrc)));
1271
1272 STORE_VECTOR (dest);
1273
1274 src += 4;
1275 dest += 4;
1276 mask += 4;
1277 }
1278
1279 for (i = width % 4; --i >= 0;)
1280 {
1281 uint32_t m = ALPHA_8 (mask[i]);
1282 uint32_t s = src[i];
1283 uint32_t d = dest[i];
1284 uint32_t dest_a = ALPHA_8 (d);
1285 uint32_t src_ia;
1286
1287 UN8x4_MUL_UN8 (s, m);
1288
1289 src_ia = ALPHA_8 (~s);
1290
1291 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1292
1293 dest[i] = s;
1294 }
1295 }
1296
1297 static void
vmx_combine_atop_u (pixman_implementation_t *imp,
1299 pixman_op_t op,
1300 uint32_t * dest,
1301 const uint32_t * src,
1302 const uint32_t * mask,
1303 int width)
1304 {
1305 if (mask)
1306 vmx_combine_atop_u_mask (dest, src, mask, width);
1307 else
1308 vmx_combine_atop_u_no_mask (dest, src, width);
1309 }
1310
1311 static void
vmx_combine_atop_reverse_u_no_mask (uint32_t * dest,
1313 const uint32_t *src,
1314 int width)
1315 {
1316 int i;
1317 vector unsigned int vdest, vsrc;
1318 DECLARE_SRC_MASK_VAR;
1319
1320 while (width && ((uintptr_t)dest & 15))
1321 {
1322 uint32_t s = *src++;
1323 uint32_t d = *dest;
1324 uint32_t src_a = ALPHA_8 (s);
1325 uint32_t dest_ia = ALPHA_8 (~d);
1326
1327 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1328
1329 *dest++ = s;
1330 width--;
1331 }
1332
1333 COMPUTE_SHIFT_MASKS (dest, src);
1334
1335 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1336 for (i = width / 4; i > 0; i--)
1337 {
1338 LOAD_VECTORS (dest, src);
1339
1340 vdest = pix_add_mul (vdest, splat_alpha (vsrc),
1341 vsrc, splat_alpha (negate (vdest)));
1342
1343 STORE_VECTOR (dest);
1344
1345 src += 4;
1346 dest += 4;
1347 }
1348
1349 for (i = width % 4; --i >= 0;)
1350 {
1351 uint32_t s = src[i];
1352 uint32_t d = dest[i];
1353 uint32_t src_a = ALPHA_8 (s);
1354 uint32_t dest_ia = ALPHA_8 (~d);
1355
1356 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1357
1358 dest[i] = s;
1359 }
1360 }
1361
1362 static void
vmx_combine_atop_reverse_u_mask (uint32_t * dest,
1364 const uint32_t *src,
1365 const uint32_t *mask,
1366 int width)
1367 {
1368 int i;
1369 vector unsigned int vdest, vsrc, vmask;
1370 DECLARE_SRC_MASK_VAR;
1371 DECLARE_MASK_MASK_VAR;
1372
1373 while (width && ((uintptr_t)dest & 15))
1374 {
1375 uint32_t m = ALPHA_8 (*mask++);
1376 uint32_t s = *src++;
1377 uint32_t d = *dest;
1378 uint32_t src_a;
1379 uint32_t dest_ia = ALPHA_8 (~d);
1380
1381 UN8x4_MUL_UN8 (s, m);
1382
1383 src_a = ALPHA_8 (s);
1384
1385 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1386
1387 *dest++ = s;
1388 width--;
1389 }
1390
1391 COMPUTE_SHIFT_MASKC (dest, src, mask);
1392
1393 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1394 for (i = width / 4; i > 0; i--)
1395 {
1396 LOAD_VECTORSM (dest, src, mask);
1397
1398 vdest = pix_add_mul (vdest, splat_alpha (vsrc),
1399 vsrc, splat_alpha (negate (vdest)));
1400
1401 STORE_VECTOR (dest);
1402
1403 src += 4;
1404 dest += 4;
1405 mask += 4;
1406 }
1407
1408 for (i = width % 4; --i >= 0;)
1409 {
1410 uint32_t m = ALPHA_8 (mask[i]);
1411 uint32_t s = src[i];
1412 uint32_t d = dest[i];
1413 uint32_t src_a;
1414 uint32_t dest_ia = ALPHA_8 (~d);
1415
1416 UN8x4_MUL_UN8 (s, m);
1417
1418 src_a = ALPHA_8 (s);
1419
1420 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1421
1422 dest[i] = s;
1423 }
1424 }
1425
1426 static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
1428 pixman_op_t op,
1429 uint32_t * dest,
1430 const uint32_t * src,
1431 const uint32_t * mask,
1432 int width)
1433 {
1434 if (mask)
1435 vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
1436 else
1437 vmx_combine_atop_reverse_u_no_mask (dest, src, width);
1438 }
1439
1440 static void
vmx_combine_xor_u_no_mask (uint32_t * dest,
1442 const uint32_t *src,
1443 int width)
1444 {
1445 int i;
1446 vector unsigned int vdest, vsrc;
1447 DECLARE_SRC_MASK_VAR;
1448
1449 while (width && ((uintptr_t)dest & 15))
1450 {
1451 uint32_t s = *src++;
1452 uint32_t d = *dest;
1453 uint32_t src_ia = ALPHA_8 (~s);
1454 uint32_t dest_ia = ALPHA_8 (~d);
1455
1456 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1457
1458 *dest++ = s;
1459 width--;
1460 }
1461
1462 COMPUTE_SHIFT_MASKS (dest, src);
1463
1464 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1465 for (i = width / 4; i > 0; i--)
1466 {
1467 LOAD_VECTORS (dest, src);
1468
1469 vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
1470 vdest, splat_alpha (negate (vsrc)));
1471
1472 STORE_VECTOR (dest);
1473
1474 src += 4;
1475 dest += 4;
1476 }
1477
1478 for (i = width % 4; --i >= 0;)
1479 {
1480 uint32_t s = src[i];
1481 uint32_t d = dest[i];
1482 uint32_t src_ia = ALPHA_8 (~s);
1483 uint32_t dest_ia = ALPHA_8 (~d);
1484
1485 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1486
1487 dest[i] = s;
1488 }
1489 }
1490
1491 static void
vmx_combine_xor_u_mask (uint32_t * dest,
1493 const uint32_t *src,
1494 const uint32_t *mask,
1495 int width)
1496 {
1497 int i;
1498 vector unsigned int vdest, vsrc, vmask;
1499 DECLARE_SRC_MASK_VAR;
1500 DECLARE_MASK_MASK_VAR;
1501
1502 while (width && ((uintptr_t)dest & 15))
1503 {
1504 uint32_t m = ALPHA_8 (*mask++);
1505 uint32_t s = *src++;
1506 uint32_t d = *dest;
1507 uint32_t src_ia;
1508 uint32_t dest_ia = ALPHA_8 (~d);
1509
1510 UN8x4_MUL_UN8 (s, m);
1511
1512 src_ia = ALPHA_8 (~s);
1513
1514 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1515
1516 *dest++ = s;
1517 width--;
1518 }
1519
1520 COMPUTE_SHIFT_MASKC (dest, src, mask);
1521
1522 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1523 for (i = width / 4; i > 0; i--)
1524 {
1525 LOAD_VECTORSM (dest, src, mask);
1526
1527 vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
1528 vdest, splat_alpha (negate (vsrc)));
1529
1530 STORE_VECTOR (dest);
1531
1532 src += 4;
1533 dest += 4;
1534 mask += 4;
1535 }
1536
1537 for (i = width % 4; --i >= 0;)
1538 {
1539 uint32_t m = ALPHA_8 (mask[i]);
1540 uint32_t s = src[i];
1541 uint32_t d = dest[i];
1542 uint32_t src_ia;
1543 uint32_t dest_ia = ALPHA_8 (~d);
1544
1545 UN8x4_MUL_UN8 (s, m);
1546
1547 src_ia = ALPHA_8 (~s);
1548
1549 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1550
1551 dest[i] = s;
1552 }
1553 }
1554
1555 static void
vmx_combine_xor_u (pixman_implementation_t *imp,
1557 pixman_op_t op,
1558 uint32_t * dest,
1559 const uint32_t * src,
1560 const uint32_t * mask,
1561 int width)
1562 {
1563 if (mask)
1564 vmx_combine_xor_u_mask (dest, src, mask, width);
1565 else
1566 vmx_combine_xor_u_no_mask (dest, src, width);
1567 }
1568
1569 static void
vmx_combine_add_u_no_mask (uint32_t * dest,
1571 const uint32_t *src,
1572 int width)
1573 {
1574 int i;
1575 vector unsigned int vdest, vsrc;
1576 DECLARE_SRC_MASK_VAR;
1577
1578 while (width && ((uintptr_t)dest & 15))
1579 {
1580 uint32_t s = *src++;
1581 uint32_t d = *dest;
1582
1583 UN8x4_ADD_UN8x4 (d, s);
1584
1585 *dest++ = d;
1586 width--;
1587 }
1588
1589 COMPUTE_SHIFT_MASKS (dest, src);
1590 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1591 for (i = width / 4; i > 0; i--)
1592 {
1593 LOAD_VECTORS (dest, src);
1594
1595 vdest = pix_add (vsrc, vdest);
1596
1597 STORE_VECTOR (dest);
1598
1599 src += 4;
1600 dest += 4;
1601 }
1602
1603 for (i = width % 4; --i >= 0;)
1604 {
1605 uint32_t s = src[i];
1606 uint32_t d = dest[i];
1607
1608 UN8x4_ADD_UN8x4 (d, s);
1609
1610 dest[i] = d;
1611 }
1612 }
1613
1614 static void
vmx_combine_add_u_mask (uint32_t * dest,
1616 const uint32_t *src,
1617 const uint32_t *mask,
1618 int width)
1619 {
1620 int i;
1621 vector unsigned int vdest, vsrc, vmask;
1622 DECLARE_SRC_MASK_VAR;
1623 DECLARE_MASK_MASK_VAR;
1624
1625 while (width && ((uintptr_t)dest & 15))
1626 {
1627 uint32_t m = ALPHA_8 (*mask++);
1628 uint32_t s = *src++;
1629 uint32_t d = *dest;
1630
1631 UN8x4_MUL_UN8 (s, m);
1632 UN8x4_ADD_UN8x4 (d, s);
1633
1634 *dest++ = d;
1635 width--;
1636 }
1637
1638 COMPUTE_SHIFT_MASKC (dest, src, mask);
1639
1640 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1641 for (i = width / 4; i > 0; i--)
1642 {
1643 LOAD_VECTORSM (dest, src, mask);
1644
1645 vdest = pix_add (vsrc, vdest);
1646
1647 STORE_VECTOR (dest);
1648
1649 src += 4;
1650 dest += 4;
1651 mask += 4;
1652 }
1653
1654 for (i = width % 4; --i >= 0;)
1655 {
1656 uint32_t m = ALPHA_8 (mask[i]);
1657 uint32_t s = src[i];
1658 uint32_t d = dest[i];
1659
1660 UN8x4_MUL_UN8 (s, m);
1661 UN8x4_ADD_UN8x4 (d, s);
1662
1663 dest[i] = d;
1664 }
1665 }
1666
1667 static void
vmx_combine_add_u (pixman_implementation_t *imp,
1669 pixman_op_t op,
1670 uint32_t * dest,
1671 const uint32_t * src,
1672 const uint32_t * mask,
1673 int width)
1674 {
1675 if (mask)
1676 vmx_combine_add_u_mask (dest, src, mask, width);
1677 else
1678 vmx_combine_add_u_no_mask (dest, src, width);
1679 }
1680
1681 static void
vmx_combine_src_ca (pixman_implementation_t *imp,
1683 pixman_op_t op,
1684 uint32_t * dest,
1685 const uint32_t * src,
1686 const uint32_t * mask,
1687 int width)
1688 {
1689 int i;
1690 vector unsigned int vdest, vsrc, vmask;
1691 DECLARE_SRC_MASK_VAR;
1692 DECLARE_MASK_MASK_VAR;
1693
1694 while (width && ((uintptr_t)dest & 15))
1695 {
1696 uint32_t a = *mask++;
1697 uint32_t s = *src++;
1698
1699 UN8x4_MUL_UN8x4 (s, a);
1700
1701 *dest++ = s;
1702 width--;
1703 }
1704
1705 COMPUTE_SHIFT_MASKC (dest, src, mask);
1706
1707 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1708 for (i = width / 4; i > 0; i--)
1709 {
1710 LOAD_VECTORSC (dest, src, mask);
1711
1712 vdest = pix_multiply (vsrc, vmask);
1713
1714 STORE_VECTOR (dest);
1715
1716 mask += 4;
1717 src += 4;
1718 dest += 4;
1719 }
1720
1721 for (i = width % 4; --i >= 0;)
1722 {
1723 uint32_t a = mask[i];
1724 uint32_t s = src[i];
1725
1726 UN8x4_MUL_UN8x4 (s, a);
1727
1728 dest[i] = s;
1729 }
1730 }
1731
1732 static void
vmx_combine_over_ca (pixman_implementation_t *imp,
1734 pixman_op_t op,
1735 uint32_t * dest,
1736 const uint32_t * src,
1737 const uint32_t * mask,
1738 int width)
1739 {
1740 int i;
1741 vector unsigned int vdest, vsrc, vmask;
1742 DECLARE_SRC_MASK_VAR;
1743 DECLARE_MASK_MASK_VAR;
1744
1745 while (width && ((uintptr_t)dest & 15))
1746 {
1747 uint32_t a = *mask++;
1748 uint32_t s = *src++;
1749 uint32_t d = *dest;
1750 uint32_t sa = ALPHA_8 (s);
1751
1752 UN8x4_MUL_UN8x4 (s, a);
1753 UN8x4_MUL_UN8 (a, sa);
1754 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
1755
1756 *dest++ = d;
1757 width--;
1758 }
1759
1760 COMPUTE_SHIFT_MASKC (dest, src, mask);
1761
1762 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1763 for (i = width / 4; i > 0; i--)
1764 {
1765 LOAD_VECTORSC (dest, src, mask);
1766
1767 vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
1768
1769 STORE_VECTOR (dest);
1770
1771 mask += 4;
1772 src += 4;
1773 dest += 4;
1774 }
1775
1776 for (i = width % 4; --i >= 0;)
1777 {
1778 uint32_t a = mask[i];
1779 uint32_t s = src[i];
1780 uint32_t d = dest[i];
1781 uint32_t sa = ALPHA_8 (s);
1782
1783 UN8x4_MUL_UN8x4 (s, a);
1784 UN8x4_MUL_UN8 (a, sa);
1785 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
1786
1787 dest[i] = d;
1788 }
1789 }
1790
1791 static void
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1793 pixman_op_t op,
1794 uint32_t * dest,
1795 const uint32_t * src,
1796 const uint32_t * mask,
1797 int width)
1798 {
1799 int i;
1800 vector unsigned int vdest, vsrc, vmask;
1801 DECLARE_SRC_MASK_VAR;
1802 DECLARE_MASK_MASK_VAR;
1803
1804 while (width && ((uintptr_t)dest & 15))
1805 {
1806 uint32_t a = *mask++;
1807 uint32_t s = *src++;
1808 uint32_t d = *dest;
1809 uint32_t ida = ALPHA_8 (~d);
1810
1811 UN8x4_MUL_UN8x4 (s, a);
1812 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
1813
1814 *dest++ = s;
1815 width--;
1816 }
1817
1818 COMPUTE_SHIFT_MASKC (dest, src, mask);
1819
1820 /* printf("%s\n",__PRETTY_FUNCTION__); */
1821 for (i = width / 4; i > 0; i--)
1822 {
1823 LOAD_VECTORSC (dest, src, mask);
1824
1825 vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
1826
1827 STORE_VECTOR (dest);
1828
1829 mask += 4;
1830 src += 4;
1831 dest += 4;
1832 }
1833
1834 for (i = width % 4; --i >= 0;)
1835 {
1836 uint32_t a = mask[i];
1837 uint32_t s = src[i];
1838 uint32_t d = dest[i];
1839 uint32_t ida = ALPHA_8 (~d);
1840
1841 UN8x4_MUL_UN8x4 (s, a);
1842 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
1843
1844 dest[i] = s;
1845 }
1846 }
1847
1848 static void
vmx_combine_in_ca (pixman_implementation_t *imp,
1850 pixman_op_t op,
1851 uint32_t * dest,
1852 const uint32_t * src,
1853 const uint32_t * mask,
1854 int width)
1855 {
1856 int i;
1857 vector unsigned int vdest, vsrc, vmask;
1858 DECLARE_SRC_MASK_VAR;
1859 DECLARE_MASK_MASK_VAR;
1860
1861 while (width && ((uintptr_t)dest & 15))
1862 {
1863 uint32_t a = *mask++;
1864 uint32_t s = *src++;
1865 uint32_t da = ALPHA_8 (*dest);
1866
1867 UN8x4_MUL_UN8x4 (s, a);
1868 UN8x4_MUL_UN8 (s, da);
1869
1870 *dest++ = s;
1871 width--;
1872 }
1873
1874 COMPUTE_SHIFT_MASKC (dest, src, mask);
1875
1876 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1877 for (i = width / 4; i > 0; i--)
1878 {
1879 LOAD_VECTORSC (dest, src, mask);
1880
1881 vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
1882
1883 STORE_VECTOR (dest);
1884
1885 src += 4;
1886 dest += 4;
1887 mask += 4;
1888 }
1889
1890 for (i = width % 4; --i >= 0;)
1891 {
1892 uint32_t a = mask[i];
1893 uint32_t s = src[i];
1894 uint32_t da = ALPHA_8 (dest[i]);
1895
1896 UN8x4_MUL_UN8x4 (s, a);
1897 UN8x4_MUL_UN8 (s, da);
1898
1899 dest[i] = s;
1900 }
1901 }
1902
1903 static void
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1905 pixman_op_t op,
1906 uint32_t * dest,
1907 const uint32_t * src,
1908 const uint32_t * mask,
1909 int width)
1910 {
1911 int i;
1912 vector unsigned int vdest, vsrc, vmask;
1913 DECLARE_SRC_MASK_VAR;
1914 DECLARE_MASK_MASK_VAR;
1915
1916 while (width && ((uintptr_t)dest & 15))
1917 {
1918 uint32_t a = *mask++;
1919 uint32_t d = *dest;
1920 uint32_t sa = ALPHA_8 (*src++);
1921
1922 UN8x4_MUL_UN8 (a, sa);
1923 UN8x4_MUL_UN8x4 (d, a);
1924
1925 *dest++ = d;
1926 width--;
1927 }
1928
1929 COMPUTE_SHIFT_MASKC (dest, src, mask);
1930
1931 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1932 for (i = width / 4; i > 0; i--)
1933 {
1934
1935 LOAD_VECTORSC (dest, src, mask);
1936
1937 vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
1938
1939 STORE_VECTOR (dest);
1940
1941 src += 4;
1942 dest += 4;
1943 mask += 4;
1944 }
1945
1946 for (i = width % 4; --i >= 0;)
1947 {
1948 uint32_t a = mask[i];
1949 uint32_t d = dest[i];
1950 uint32_t sa = ALPHA_8 (src[i]);
1951
1952 UN8x4_MUL_UN8 (a, sa);
1953 UN8x4_MUL_UN8x4 (d, a);
1954
1955 dest[i] = d;
1956 }
1957 }
1958
1959 static void
vmx_combine_out_ca (pixman_implementation_t *imp,
1961 pixman_op_t op,
1962 uint32_t * dest,
1963 const uint32_t * src,
1964 const uint32_t * mask,
1965 int width)
1966 {
1967 int i;
1968 vector unsigned int vdest, vsrc, vmask;
1969 DECLARE_SRC_MASK_VAR;
1970 DECLARE_MASK_MASK_VAR;
1971
1972 while (width && ((uintptr_t)dest & 15))
1973 {
1974 uint32_t a = *mask++;
1975 uint32_t s = *src++;
1976 uint32_t d = *dest;
1977 uint32_t da = ALPHA_8 (~d);
1978
1979 UN8x4_MUL_UN8x4 (s, a);
1980 UN8x4_MUL_UN8 (s, da);
1981
1982 *dest++ = s;
1983 width--;
1984 }
1985
1986 COMPUTE_SHIFT_MASKC (dest, src, mask);
1987
1988 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1989 for (i = width / 4; i > 0; i--)
1990 {
1991 LOAD_VECTORSC (dest, src, mask);
1992
1993 vdest = pix_multiply (
1994 pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
1995
1996 STORE_VECTOR (dest);
1997
1998 src += 4;
1999 dest += 4;
2000 mask += 4;
2001 }
2002
2003 for (i = width % 4; --i >= 0;)
2004 {
2005 uint32_t a = mask[i];
2006 uint32_t s = src[i];
2007 uint32_t d = dest[i];
2008 uint32_t da = ALPHA_8 (~d);
2009
2010 UN8x4_MUL_UN8x4 (s, a);
2011 UN8x4_MUL_UN8 (s, da);
2012
2013 dest[i] = s;
2014 }
2015 }
2016
2017 static void
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
2019 pixman_op_t op,
2020 uint32_t * dest,
2021 const uint32_t * src,
2022 const uint32_t * mask,
2023 int width)
2024 {
2025 int i;
2026 vector unsigned int vdest, vsrc, vmask;
2027 DECLARE_SRC_MASK_VAR;
2028 DECLARE_MASK_MASK_VAR;
2029
2030 while (width && ((uintptr_t)dest & 15))
2031 {
2032 uint32_t a = *mask++;
2033 uint32_t s = *src++;
2034 uint32_t d = *dest;
2035 uint32_t sa = ALPHA_8 (s);
2036
2037 UN8x4_MUL_UN8 (a, sa);
2038 UN8x4_MUL_UN8x4 (d, ~a);
2039
2040 *dest++ = d;
2041 width--;
2042 }
2043
2044 COMPUTE_SHIFT_MASKC (dest, src, mask);
2045
2046 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2047 for (i = width / 4; i > 0; i--)
2048 {
2049 LOAD_VECTORSC (dest, src, mask);
2050
2051 vdest = pix_multiply (
2052 vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
2053
2054 STORE_VECTOR (dest);
2055
2056 src += 4;
2057 dest += 4;
2058 mask += 4;
2059 }
2060
2061 for (i = width % 4; --i >= 0;)
2062 {
2063 uint32_t a = mask[i];
2064 uint32_t s = src[i];
2065 uint32_t d = dest[i];
2066 uint32_t sa = ALPHA_8 (s);
2067
2068 UN8x4_MUL_UN8 (a, sa);
2069 UN8x4_MUL_UN8x4 (d, ~a);
2070
2071 dest[i] = d;
2072 }
2073 }
2074
2075 static void
vmx_combine_atop_ca (pixman_implementation_t *imp,
2077 pixman_op_t op,
2078 uint32_t * dest,
2079 const uint32_t * src,
2080 const uint32_t * mask,
2081 int width)
2082 {
2083 int i;
2084 vector unsigned int vdest, vsrc, vmask, vsrca;
2085 DECLARE_SRC_MASK_VAR;
2086 DECLARE_MASK_MASK_VAR;
2087
2088 while (width && ((uintptr_t)dest & 15))
2089 {
2090 uint32_t a = *mask++;
2091 uint32_t s = *src++;
2092 uint32_t d = *dest;
2093 uint32_t sa = ALPHA_8 (s);
2094 uint32_t da = ALPHA_8 (d);
2095
2096 UN8x4_MUL_UN8x4 (s, a);
2097 UN8x4_MUL_UN8 (a, sa);
2098 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2099
2100 *dest++ = d;
2101 width--;
2102 }
2103
2104 COMPUTE_SHIFT_MASKC (dest, src, mask);
2105
2106 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2107 for (i = width / 4; i > 0; i--)
2108 {
2109 LOAD_VECTORSC (dest, src, mask);
2110
2111 vsrca = splat_alpha (vsrc);
2112
2113 vsrc = pix_multiply (vsrc, vmask);
2114 vmask = pix_multiply (vmask, vsrca);
2115
2116 vdest = pix_add_mul (vsrc, splat_alpha (vdest),
2117 negate (vmask), vdest);
2118
2119 STORE_VECTOR (dest);
2120
2121 src += 4;
2122 dest += 4;
2123 mask += 4;
2124 }
2125
2126 for (i = width % 4; --i >= 0;)
2127 {
2128 uint32_t a = mask[i];
2129 uint32_t s = src[i];
2130 uint32_t d = dest[i];
2131 uint32_t sa = ALPHA_8 (s);
2132 uint32_t da = ALPHA_8 (d);
2133
2134 UN8x4_MUL_UN8x4 (s, a);
2135 UN8x4_MUL_UN8 (a, sa);
2136 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2137
2138 dest[i] = d;
2139 }
2140 }
2141
2142 static void
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
2144 pixman_op_t op,
2145 uint32_t * dest,
2146 const uint32_t * src,
2147 const uint32_t * mask,
2148 int width)
2149 {
2150 int i;
2151 vector unsigned int vdest, vsrc, vmask;
2152 DECLARE_SRC_MASK_VAR;
2153 DECLARE_MASK_MASK_VAR;
2154
2155 while (width && ((uintptr_t)dest & 15))
2156 {
2157 uint32_t a = *mask++;
2158 uint32_t s = *src++;
2159 uint32_t d = *dest;
2160 uint32_t sa = ALPHA_8 (s);
2161 uint32_t da = ALPHA_8 (~d);
2162
2163 UN8x4_MUL_UN8x4 (s, a);
2164 UN8x4_MUL_UN8 (a, sa);
2165 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
2166
2167 *dest++ = d;
2168 width--;
2169 }
2170
2171 COMPUTE_SHIFT_MASKC (dest, src, mask);
2172
2173 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2174 for (i = width / 4; i > 0; i--)
2175 {
2176 LOAD_VECTORSC (dest, src, mask);
2177
2178 vdest = pix_add_mul (vdest,
2179 pix_multiply (vmask, splat_alpha (vsrc)),
2180 pix_multiply (vsrc, vmask),
2181 negate (splat_alpha (vdest)));
2182
2183 STORE_VECTOR (dest);
2184
2185 src += 4;
2186 dest += 4;
2187 mask += 4;
2188 }
2189
2190 for (i = width % 4; --i >= 0;)
2191 {
2192 uint32_t a = mask[i];
2193 uint32_t s = src[i];
2194 uint32_t d = dest[i];
2195 uint32_t sa = ALPHA_8 (s);
2196 uint32_t da = ALPHA_8 (~d);
2197
2198 UN8x4_MUL_UN8x4 (s, a);
2199 UN8x4_MUL_UN8 (a, sa);
2200 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
2201
2202 dest[i] = d;
2203 }
2204 }
2205
2206 static void
vmx_combine_xor_ca (pixman_implementation_t *imp,
2208 pixman_op_t op,
2209 uint32_t * dest,
2210 const uint32_t * src,
2211 const uint32_t * mask,
2212 int width)
2213 {
2214 int i;
2215 vector unsigned int vdest, vsrc, vmask;
2216 DECLARE_SRC_MASK_VAR;
2217 DECLARE_MASK_MASK_VAR;
2218
2219 while (width && ((uintptr_t)dest & 15))
2220 {
2221 uint32_t a = *mask++;
2222 uint32_t s = *src++;
2223 uint32_t d = *dest;
2224 uint32_t sa = ALPHA_8 (s);
2225 uint32_t da = ALPHA_8 (~d);
2226
2227 UN8x4_MUL_UN8x4 (s, a);
2228 UN8x4_MUL_UN8 (a, sa);
2229 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2230
2231 *dest++ = d;
2232 width--;
2233 }
2234
2235 COMPUTE_SHIFT_MASKC (dest, src, mask);
2236
2237 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2238 for (i = width / 4; i > 0; i--)
2239 {
2240 LOAD_VECTORSC (dest, src, mask);
2241
2242 vdest = pix_add_mul (vdest,
2243 negate (pix_multiply (vmask, splat_alpha (vsrc))),
2244 pix_multiply (vsrc, vmask),
2245 negate (splat_alpha (vdest)));
2246
2247 STORE_VECTOR (dest);
2248
2249 src += 4;
2250 dest += 4;
2251 mask += 4;
2252 }
2253
2254 for (i = width % 4; --i >= 0;)
2255 {
2256 uint32_t a = mask[i];
2257 uint32_t s = src[i];
2258 uint32_t d = dest[i];
2259 uint32_t sa = ALPHA_8 (s);
2260 uint32_t da = ALPHA_8 (~d);
2261
2262 UN8x4_MUL_UN8x4 (s, a);
2263 UN8x4_MUL_UN8 (a, sa);
2264 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2265
2266 dest[i] = d;
2267 }
2268 }
2269
2270 static void
vmx_combine_add_ca (pixman_implementation_t *imp,
2272 pixman_op_t op,
2273 uint32_t * dest,
2274 const uint32_t * src,
2275 const uint32_t * mask,
2276 int width)
2277 {
2278 int i;
2279 vector unsigned int vdest, vsrc, vmask;
2280 DECLARE_SRC_MASK_VAR;
2281 DECLARE_MASK_MASK_VAR;
2282
2283 while (width && ((uintptr_t)dest & 15))
2284 {
2285 uint32_t a = *mask++;
2286 uint32_t s = *src++;
2287 uint32_t d = *dest;
2288
2289 UN8x4_MUL_UN8x4 (s, a);
2290 UN8x4_ADD_UN8x4 (s, d);
2291
2292 *dest++ = s;
2293 width--;
2294 }
2295
2296 COMPUTE_SHIFT_MASKC (dest, src, mask);
2297
2298 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2299 for (i = width / 4; i > 0; i--)
2300 {
2301 LOAD_VECTORSC (dest, src, mask);
2302
2303 vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
2304
2305 STORE_VECTOR (dest);
2306
2307 src += 4;
2308 dest += 4;
2309 mask += 4;
2310 }
2311
2312 for (i = width % 4; --i >= 0;)
2313 {
2314 uint32_t a = mask[i];
2315 uint32_t s = src[i];
2316 uint32_t d = dest[i];
2317
2318 UN8x4_MUL_UN8x4 (s, a);
2319 UN8x4_ADD_UN8x4 (s, d);
2320
2321 dest[i] = s;
2322 }
2323 }
2324
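/*
 * Fast path: OVER of a solid color through an a8 mask onto a 32-bit
 * destination.
 */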
static void
vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m, d, s, ia;

    vector unsigned int vsrc, valpha, vmask, vdst;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = ALPHA_8 (src);
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = (vector unsigned int) {src, src, src, src};
    valpha = splat_alpha (vsrc);

    while (height--)
    {
        const uint8_t *pm = mask_line;
        dst = dst_line;
        dst_line += dst_stride;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            s = src;
            m = *pm++;

            if (m)
            {
                d = *dst;
                UN8x4_MUL_UN8 (s, m);
                ia = ALPHA_8 (~s);
                UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
                *dst = d;
            }

            w--;
            dst++;
        }

        while (w >= 4)
        {
            m = *((uint32_t*)pm);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned (dst, vsrc);
            }
            else if (m)
            {
                vmask = splat_pixel ((vector unsigned int) {m, m, m, m});

                /* dst is 16-byte aligned */
                vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));

                save_128_aligned (dst, vdst);
            }

            w -= 4;
            dst += 4;
            pm += 4;
        }

        while (w)
        {
            s = src;
            m = *pm++;

            if (m)
            {
                d = *dst;
                UN8x4_MUL_UN8 (s, m);
                ia = ALPHA_8 (~s);
                UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
                *dst = d;
            }

            w--;
            dst++;
        }
    }
}

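/*
 * Solid fill for 8, 16 and 32 bpp destinations.  Each scanline is filled
 * with scalar stores until the pointer is 16-byte aligned, then with
 * vector stores, with a scalar tail for the leftover bytes.
 */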
static pixman_bool_t
vmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 filler)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    vector unsigned int vfiller;

    if (bpp == 8)
    {
        uint8_t b;
        uint16_t w;

        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;

        b = filler & 0xff;
        w = (b << 8) | b;
        filler = (w << 16) | w;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;

        filler = (filler & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }
    else
    {
        return FALSE;
    }

    vfiller = create_mask_1x32_128 (&filler);

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;
        byte_line += stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = filler;
            w -= 1;
            d += 1;
        }

        while (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 15))
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        while (w >= 128)
        {
            vec_st (vfiller, 0, (uint32_t *) d);
            vec_st (vfiller, 0, (uint32_t *) d + 4);
            vec_st (vfiller, 0, (uint32_t *) d + 8);
            vec_st (vfiller, 0, (uint32_t *) d + 12);
            vec_st (vfiller, 0, (uint32_t *) d + 16);
            vec_st (vfiller, 0, (uint32_t *) d + 20);
            vec_st (vfiller, 0, (uint32_t *) d + 24);
            vec_st (vfiller, 0, (uint32_t *) d + 28);

            d += 128;
            w -= 128;
        }

        if (w >= 64)
        {
            vec_st (vfiller, 0, (uint32_t *) d);
            vec_st (vfiller, 0, (uint32_t *) d + 4);
            vec_st (vfiller, 0, (uint32_t *) d + 8);
            vec_st (vfiller, 0, (uint32_t *) d + 12);

            d += 64;
            w -= 64;
        }

        if (w >= 32)
        {
            vec_st (vfiller, 0, (uint32_t *) d);
            vec_st (vfiller, 0, (uint32_t *) d + 4);

            d += 32;
            w -= 32;
        }

        if (w >= 16)
        {
            vec_st (vfiller, 0, (uint32_t *) d);

            d += 16;
            w -= 16;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        if (w >= 1)
        {
            *(uint8_t *)d = filler;
            w -= 1;
            d += 1;
        }
    }

    return TRUE;
}

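/* SRC: copy x888 pixels to an 8888 destination, forcing alpha to 0xff. */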
static void
vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int32_t w;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }

        while (w >= 16)
        {
            vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;

            vmx_src1 = load_128_unaligned (src);
            vmx_src2 = load_128_unaligned (src + 4);
            vmx_src3 = load_128_unaligned (src + 8);
            vmx_src4 = load_128_unaligned (src + 12);

            save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
            save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
            save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
            save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));

            dst += 16;
            src += 16;
            w -= 16;
        }

        while (w)
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }
    }
}

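/* OVER of a solid color onto a 32-bit destination. */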
static void
vmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t src, ia;
    int i, w, dst_stride;
    vector unsigned int vdst, vsrc, via;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = (vector unsigned int){src, src, src, src};
    via = negate (splat_alpha (vsrc));
    ia = ALPHA_8 (~src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            uint32_t d = *dst;
            UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
            *dst++ = d;
            w--;
        }

        for (i = w / 4; i > 0; i--)
        {
            vdst = pix_multiply (load_128_aligned (dst), via);
            save_128_aligned (dst, pix_add (vsrc, vdst));
            dst += 4;
        }

        for (i = w % 4; --i >= 0;)
        {
            uint32_t d = dst[i];
            UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
            dst[i] = d;
        }
    }
}

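/*
 * OVER of a 32-bit source onto a 32-bit destination, one scanline at a
 * time via vmx_combine_over_u.
 */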
static void
vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    int dst_stride, src_stride;
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    dst = dst_line;
    src = src_line;

    while (height--)
    {
        vmx_combine_over_u (imp, op, dst, src, NULL, width);

        dst += dst_stride;
        src += src_stride;
    }
}

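/*
 * OVER of a solid color through a component-alpha (32-bit) mask onto a
 * 32-bit destination.
 */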
static void
vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, ia;
    uint32_t *dst_line, d;
    uint32_t *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    vector unsigned int vsrc, valpha, vmask, vdest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = (vector unsigned int) {src, src, src, src};
    valpha = splat_alpha (vsrc);
    ia = ALPHA_8 (src);

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;
        uint32_t s;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)pd & 15)
        {
            s = src;
            m = *pm++;

            if (m)
            {
                d = *pd;
                UN8x4_MUL_UN8x4 (s, m);
                UN8x4_MUL_UN8 (m, ia);
                m = ~m;
                UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
                *pd = d;
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            /* pm is NOT necessarily 16-byte aligned */
            vmask = load_128_unaligned (pm);

            pack_cmp = vec_all_eq (vmask, (vector unsigned int) AVV (0));

            /* if all bits in mask are zero, pack_cmp is not 0 */
            if (pack_cmp == 0)
            {
                /* pd is 16-byte aligned */
                vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));

                save_128_aligned (pd, vdest);
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            s = src;
            m = *pm++;

            if (m)
            {
                d = *pd;
                UN8x4_MUL_UN8x4 (s, m);
                UN8x4_MUL_UN8 (m, ia);
                m = ~m;
                UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
                *pd = d;
            }

            pd++;
            w--;
        }
    }
}

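/*
 * Saturating ADD of an a8 source onto an a8 destination.  The unaligned
 * head and tail are handled per byte; the aligned middle goes through
 * vmx_combine_add_u four bytes at a time.
 */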
static void
vmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Small head */
        while (w && (uintptr_t)dst & 3)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }

        vmx_combine_add_u (imp, op,
                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

        /* Small tail */
        dst += w & 0xfffc;
        src += w & 0xfffc;

        w &= 3;

        while (w)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }
    }
}

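/* Saturating ADD of a 32-bit source onto a 32-bit destination. */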
static void
vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        vmx_combine_add_u (imp, op, dst, src, NULL, width);
    }
}

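/*
 * One scanline of nearest-neighbour scaled OVER: source pixels are
 * sampled at fixed-point positions stepped by unit_x, and composited
 * four destination pixels per vector iteration.
 */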
static force_inline void
scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
                                            const uint32_t* ps,
                                            int32_t         w,
                                            pixman_fixed_t  vx,
                                            pixman_fixed_t  unit_x,
                                            pixman_fixed_t  src_width_fixed,
                                            pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    vector unsigned int vsrc, vdst;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        *pd++ = core_combine_over_u_pixel_vmx (s, d);
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        vector unsigned int tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp2 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp3 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp4 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        tmp[0] = tmp1;
        tmp[1] = tmp2;
        tmp[2] = tmp3;
        tmp[3] = tmp4;

        vsrc = combine4 ((const uint32_t *) &tmp, pm);

        if (is_opaque (vsrc))
        {
            save_128_aligned (pd, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            vdst = over (vsrc, splat_alpha (vsrc), load_128_aligned (pd));

            save_128_aligned (pd, vdst);
        }

        w -= 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        *pd++ = core_combine_over_u_pixel_vmx (s, d);
        if (pm)
            pm++;

        w--;
    }
}

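/* Instantiate the nearest-neighbour OVER main loops for each repeat mode. */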
FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, NORMAL)

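/* Composite operations that have VMX fast paths. */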
static const pixman_fast_path_t vmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),

    { PIXMAN_OP_NONE },
};

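/* Iterator fetcher: expand x8r8g8b8 scanlines by forcing alpha to 0xff. */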
static uint32_t *
vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    vector unsigned int ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (dst, vec_or (load_128_unaligned (src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

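/* Iterator fetcher: expand a8 scanlines to a8r8g8b8 (alpha in the top byte). */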
static uint32_t *
vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        vmx0 = load_128_unaligned ((uint32_t *) src);

        unpack_128_2x128 ((vector unsigned int) AVV (0), vmx0, &vmx1, &vmx2);
        unpack_128_2x128_16 ((vector unsigned int) AVV (0), vmx1, &vmx3, &vmx4);
        unpack_128_2x128_16 ((vector unsigned int) AVV (0), vmx2, &vmx5, &vmx6);

        save_128_aligned (dst, vmx6);
        save_128_aligned ((dst + 4), vmx5);
        save_128_aligned ((dst + 8), vmx4);
        save_128_aligned ((dst + 12), vmx3);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}

#define IMAGE_FLAGS                                                     \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

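/* Source iterators with VMX-accelerated fetchers. */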
static const pixman_iter_info_t vmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};

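/*
 * Create the VMX implementation on top of the fallback: initialize the
 * vector constants and hook up the combiners, fill and iterators.
 */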
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    /* VMX constants */
    mask_ff000000 = create_mask_32_128 (0xff000000);
    mask_red = create_mask_32_128 (0x00f80000);
    mask_green = create_mask_32_128 (0x0000fc00);
    mask_blue = create_mask_32_128 (0x000000f8);
    mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
    mask_565_fix_g = create_mask_32_128 (0x0000c000);

    /* Set up function pointers */

    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;

    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    imp->fill = vmx_fill;

    imp->iter_info = vmx_iters;

    return imp;
}