• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkUtils.h"
13 
14 #include <emmintrin.h>
15 
16 /* SSE2 version of S32_Blend_BlitRow32()
17  * portable version is in core/SkBlitRow_D32.cpp
18  */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20                               const SkPMColor* SK_RESTRICT src,
21                               int count, U8CPU alpha) {
22     SkASSERT(alpha <= 255);
23     if (count <= 0) {
24         return;
25     }
26 
27     uint32_t src_scale = SkAlpha255To256(alpha);
28     uint32_t dst_scale = 256 - src_scale;
29 
30     if (count >= 4) {
31         SkASSERT(((size_t)dst & 0x03) == 0);
32         while (((size_t)dst & 0x0F) != 0) {
33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34             src++;
35             dst++;
36             count--;
37         }
38 
39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
40         __m128i *d = reinterpret_cast<__m128i*>(dst);
41         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43 
44         // Move scale factors to upper byte of word
45         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47         while (count >= 4) {
48             // Load 4 pixels each of src and dest.
49             __m128i src_pixel = _mm_loadu_si128(s);
50             __m128i dst_pixel = _mm_load_si128(d);
51 
52             // Interleave Atom port 0/1 operations based on the execution port
53             // constraints that multiply can only be executed on port 0 (while
54             // boolean operations can be executed on either port 0 or port 1)
55             // because GCC currently doesn't do a good job scheduling
56             // instructions based on these constraints.
57 
58             // Get red and blue pixels into lower byte of each word.
59             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61 
62             // Multiply by scale.
63             // (4 x (0, rs.h, 0, bs.h))
64             // where rs.h stands for the higher byte of r * scale, and
65             // bs.h the higher byte of b * scale.
66             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67 
68             // Get alpha and green pixels into higher byte of each word.
69             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71 
72             // Multiply by scale.
73             // (4 x (as.h, as.l, gs.h, gs.l))
74             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75 
76             // Clear the lower byte of the a*scale and g*scale results
77             // (4 x (as.h, 0, gs.h, 0))
78             src_ag = _mm_and_si128(src_ag, ag_mask);
79 
80             // Operations the destination pixels are the same as on the
81             // source pixels. See the comments above.
82             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86             dst_ag = _mm_and_si128(dst_ag, ag_mask);
87 
88             // Combine back into RGBA.
89             // (4 x (as.h, rs.h, gs.h, bs.h))
90             src_pixel = _mm_or_si128(src_rb, src_ag);
91             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92 
93             // Add result
94             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95             _mm_store_si128(d, result);
96             s++;
97             d++;
98             count -= 4;
99         }
100         src = reinterpret_cast<const SkPMColor*>(s);
101         dst = reinterpret_cast<SkPMColor*>(d);
102     }
103 
104     while (count > 0) {
105         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106         src++;
107         dst++;
108         count--;
109     }
110 }
111 
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113                                 const SkPMColor* SK_RESTRICT src,
114                                 int count, U8CPU alpha) {
115     SkASSERT(alpha == 255);
116     if (count <= 0) {
117         return;
118     }
119 
120     if (count >= 4) {
121         SkASSERT(((size_t)dst & 0x03) == 0);
122         while (((size_t)dst & 0x0F) != 0) {
123             *dst = SkPMSrcOver(*src, *dst);
124             src++;
125             dst++;
126             count--;
127         }
128 
129         const __m128i *s = reinterpret_cast<const __m128i*>(src);
130         __m128i *d = reinterpret_cast<__m128i*>(dst);
131 #ifdef SK_USE_ACCURATE_BLENDING
132         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
134         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
135         while (count >= 4) {
136             // Load 4 pixels
137             __m128i src_pixel = _mm_loadu_si128(s);
138             __m128i dst_pixel = _mm_load_si128(d);
139 
140             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
142             // Shift alphas down to lower 8 bits of each quad.
143             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
144 
145             // Copy alpha to upper 3rd byte of each quad
146             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
147 
148             // Subtract alphas from 255, to get 0..255
149             alpha = _mm_sub_epi16(c_255, alpha);
150 
151             // Multiply by red and blue by src alpha.
152             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153             // Multiply by alpha and green by src alpha.
154             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
155 
156             // dst_rb_low = (dst_rb >> 8)
157             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
159 
160             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162             dst_rb = _mm_add_epi16(dst_rb, c_128);
163             dst_rb = _mm_srli_epi16(dst_rb, 8);
164 
165             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167             dst_ag = _mm_add_epi16(dst_ag, c_128);
168             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
169 
170             // Combine back into RGBA.
171             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
172 
173             // Add result
174             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175             _mm_store_si128(d, result);
176             s++;
177             d++;
178             count -= 4;
179         }
180     #else
181         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
183         while (count >= 4) {
184             // Load 4 pixels
185             __m128i src_pixel = _mm_loadu_si128(s);
186             __m128i dst_pixel = _mm_load_si128(d);
187 
188             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
190 
191             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
192             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193 
194             // (a0, a0, a1, a1, a2, g2, a3, g3)
195             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196 
197             // (a0, a0, a1, a1, a2, a2, a3, a3)
198             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
199 
200             // Subtract alphas from 256, to get 1..256
201             alpha = _mm_sub_epi16(c_256, alpha);
202 
203             // Multiply by red and blue by src alpha.
204             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205             // Multiply by alpha and green by src alpha.
206             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
207 
208             // Divide by 256.
209             dst_rb = _mm_srli_epi16(dst_rb, 8);
210 
211             // Mask out high bits (already in the right place)
212             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
213 
214             // Combine back into RGBA.
215             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
216 
217             // Add result
218             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219             _mm_store_si128(d, result);
220             s++;
221             d++;
222             count -= 4;
223         }
224 #endif
225         src = reinterpret_cast<const SkPMColor*>(s);
226         dst = reinterpret_cast<SkPMColor*>(d);
227     }
228 
229     while (count > 0) {
230         *dst = SkPMSrcOver(*src, *dst);
231         src++;
232         dst++;
233         count--;
234     }
235 }
236 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238                                const SkPMColor* SK_RESTRICT src,
239                                int count, U8CPU alpha) {
240     SkASSERT(alpha <= 255);
241     if (count <= 0) {
242         return;
243     }
244 
245     if (count >= 4) {
246         while (((size_t)dst & 0x0F) != 0) {
247             *dst = SkBlendARGB32(*src, *dst, alpha);
248             src++;
249             dst++;
250             count--;
251         }
252 
253         uint32_t src_scale = SkAlpha255To256(alpha);
254 
255         const __m128i *s = reinterpret_cast<const __m128i*>(src);
256         __m128i *d = reinterpret_cast<__m128i*>(dst);
257         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
260         while (count >= 4) {
261             // Load 4 pixels each of src and dest.
262             __m128i src_pixel = _mm_loadu_si128(s);
263             __m128i dst_pixel = _mm_load_si128(d);
264 
265             // Get red and blue pixels into lower byte of each word.
266             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268 
269             // Get alpha and green into lower byte of each word.
270             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272 
273             // Put per-pixel alpha in low byte of each word.
274             // After the following two statements, the dst_alpha looks like
275             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278 
279             // dst_alpha = dst_alpha * src_scale
280             // Because src_scales are in the higher byte of each word and
281             // we use mulhi here, the resulting alpha values are already
282             // in the right place and don't need to be divided by 256.
283             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285 
286             // Subtract alphas from 256, to get 1..256
287             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288 
289             // Multiply red and blue by dst pixel alpha.
290             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291             // Multiply alpha and green by dst pixel alpha.
292             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293 
294             // Multiply red and blue by global alpha.
295             // (4 x (0, rs.h, 0, bs.h))
296             // where rs.h stands for the higher byte of r * src_scale,
297             // and bs.h the higher byte of b * src_scale.
298             // Again, because we use mulhi, the resuling red and blue
299             // values are already in the right place and don't need to
300             // be divided by 256.
301             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302             // Multiply alpha and green by global alpha.
303             // (4 x (0, as.h, 0, gs.h))
304             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305 
306             // Divide by 256.
307             dst_rb = _mm_srli_epi16(dst_rb, 8);
308 
309             // Mask out low bits (goodies already in the right place; no need to divide)
310             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311             // Shift alpha and green to higher byte of each word.
312             // (4 x (as.h, 0, gs.h, 0))
313             src_ag = _mm_slli_epi16(src_ag, 8);
314 
315             // Combine back into RGBA.
316             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317             src_pixel = _mm_or_si128(src_rb, src_ag);
318 
319             // Add two pixels into result.
320             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321             _mm_store_si128(d, result);
322             s++;
323             d++;
324             count -= 4;
325         }
326         src = reinterpret_cast<const SkPMColor*>(s);
327         dst = reinterpret_cast<SkPMColor*>(d);
328     }
329 
330     while (count > 0) {
331         *dst = SkBlendARGB32(*src, *dst, alpha);
332         src++;
333         dst++;
334         count--;
335     }
336 }
337 
338 /* SSE2 version of Color32()
339  * portable version is in core/SkBlitRow_D32.cpp
340  */
Color32_SSE2(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342                   SkPMColor color) {
343 
344     if (count <= 0) {
345         return;
346     }
347 
348     if (0 == color) {
349         if (src != dst) {
350             memcpy(dst, src, count * sizeof(SkPMColor));
351         }
352         return;
353     }
354 
355     unsigned colorA = SkGetPackedA32(color);
356     if (255 == colorA) {
357         sk_memset32(dst, color, count);
358     } else {
359         unsigned scale = 256 - SkAlpha255To256(colorA);
360 
361         if (count >= 4) {
362             SkASSERT(((size_t)dst & 0x03) == 0);
363             while (((size_t)dst & 0x0F) != 0) {
364                 *dst = color + SkAlphaMulQ(*src, scale);
365                 src++;
366                 dst++;
367                 count--;
368             }
369 
370             const __m128i *s = reinterpret_cast<const __m128i*>(src);
371             __m128i *d = reinterpret_cast<__m128i*>(dst);
372             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
373             __m128i src_scale_wide = _mm_set1_epi16(scale);
374             __m128i color_wide = _mm_set1_epi32(color);
375             while (count >= 4) {
376                 // Load 4 pixels each of src and dest.
377                 __m128i src_pixel = _mm_loadu_si128(s);
378 
379                 // Get red and blue pixels into lower byte of each word.
380                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
381 
382                 // Get alpha and green into lower byte of each word.
383                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
384 
385                 // Multiply by scale.
386                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
387                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
388 
389                 // Divide by 256.
390                 src_rb = _mm_srli_epi16(src_rb, 8);
391                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
392 
393                 // Combine back into RGBA.
394                 src_pixel = _mm_or_si128(src_rb, src_ag);
395 
396                 // Add color to result.
397                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
398 
399                 // Store result.
400                 _mm_store_si128(d, result);
401                 s++;
402                 d++;
403                 count -= 4;
404             }
405             src = reinterpret_cast<const SkPMColor*>(s);
406             dst = reinterpret_cast<SkPMColor*>(d);
407          }
408 
409         while (count > 0) {
410             *dst = color + SkAlphaMulQ(*src, scale);
411             src += 1;
412             dst += 1;
413             count--;
414         }
415     }
416 }
417 
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)418 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
419                                size_t maskRB, SkColor origColor,
420                                int width, int height) {
421     SkPMColor color = SkPreMultiplyColor(origColor);
422     size_t dstOffset = dstRB - (width << 2);
423     size_t maskOffset = maskRB - width;
424     SkPMColor* dst = (SkPMColor *)device;
425     const uint8_t* mask = (const uint8_t*)maskPtr;
426     do {
427         int count = width;
428         if (count >= 4) {
429             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
430                 *dst = SkBlendARGB32(color, *dst, *mask);
431                 mask++;
432                 dst++;
433                 count--;
434             }
435             __m128i *d = reinterpret_cast<__m128i*>(dst);
436             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
437             __m128i c_256 = _mm_set1_epi16(256);
438             __m128i c_1 = _mm_set1_epi16(1);
439             __m128i src_pixel = _mm_set1_epi32(color);
440             while (count >= 4) {
441                 // Load 4 pixels each of src and dest.
442                 __m128i dst_pixel = _mm_load_si128(d);
443 
444                 //set the aphla value
445                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
446                                 0, *(mask+3),0, \
447                                 *(mask+2),0, *(mask+2),\
448                                 0,*(mask+1), 0,*(mask+1),\
449                                 0, *mask,0,*mask);
450 
451                 //call SkAlpha255To256()
452                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
453 
454                 // Get red and blue pixels into lower byte of each word.
455                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
456                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
457 
458                 // Get alpha and green into lower byte of each word.
459                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
460                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
461 
462                 // Put per-pixel alpha in low byte of each word.
463                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
464                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
465 
466                 // dst_alpha = dst_alpha * src_scale
467                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
468 
469                 // Divide by 256.
470                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
471 
472                 // Subtract alphas from 256, to get 1..256
473                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
474                 // Multiply red and blue by dst pixel alpha.
475                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
476                 // Multiply alpha and green by dst pixel alpha.
477                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
478 
479                 // Multiply red and blue by global alpha.
480                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
481                 // Multiply alpha and green by global alpha.
482                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
483                 // Divide by 256.
484                 dst_rb = _mm_srli_epi16(dst_rb, 8);
485                 src_rb = _mm_srli_epi16(src_rb, 8);
486 
487                 // Mask out low bits (goodies already in the right place; no need to divide)
488                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
489                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
490 
491                 // Combine back into RGBA.
492                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
493                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
494 
495                 // Add two pixels into result.
496                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
497                 _mm_store_si128(d, result);
498                 // load the next 4 pixel
499                 mask = mask + 4;
500                 d++;
501                 count -= 4;
502             }
503             dst = reinterpret_cast<SkPMColor *>(d);
504         }
505         while(count > 0) {
506             *dst= SkBlendARGB32(color, *dst, *mask);
507             dst += 1;
508             mask++;
509             count --;
510         }
511         dst = (SkPMColor *)((char*)dst + dstOffset);
512         mask += maskOffset;
513     } while (--height != 0);
514 }
515 
// Shift amounts that line up the top 5 bits of each RGB16 mask component with
// the matching component of an SkPMColor. A positive amount is a left shift,
// a negative amount a right shift, and zero means the bits already line up.
// The mask's RGB16 component order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each helper applies the shift above to four 32-bit lanes at once. The
// result is not masked; callers AND it with the 5-bit component mask.

// Red
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546 
SkBlendLCD16_SSE2(__m128i & srci,__m128i & dst,__m128i & mask,__m128i & scale)547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
548                                  __m128i &mask, __m128i &scale) {
549     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
550     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
551                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
552 
553     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
554                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
555 
556     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
557                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
558 
559     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
560     mask = _mm_or_si128(_mm_or_si128(r, g), b);
561 
562     // Interleave R,G,B into the lower byte of word.
563     __m128i maskLo, maskHi;
564     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
565     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
566 
567     // Upscale to 0..32
568     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
569     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
570 
571     maskLo = _mm_mullo_epi16(maskLo, scale);
572     maskHi = _mm_mullo_epi16(maskHi, scale);
573 
574     maskLo = _mm_srli_epi16(maskLo, 8);
575     maskHi = _mm_srli_epi16(maskHi, 8);
576 
577     // Interleave R,G,B into the lower byte of the word.
578     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
579     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
580 
581     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
582     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
583 
584     maskLo = _mm_srai_epi16(maskLo, 5);
585     maskHi = _mm_srai_epi16(maskHi, 5);
586 
587     // Add two pixels into result.
588     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
589     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
590 
591     // Pack into 4 32bit dst pixels
592     return _mm_packus_epi16(resultLo, resultHi);
593 }
594 
SkBlendLCD16Opaque_SSE2(__m128i & srci,__m128i & dst,__m128i & mask)595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
596                                        __m128i &mask) {
597     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
598     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
599                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
600 
601     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
602                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
603 
604     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
605                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
606 
607     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
608     mask = _mm_or_si128(_mm_or_si128(r, g), b);
609 
610     // Interleave R,G,B into the lower byte of word.
611     __m128i maskLo, maskHi;
612     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
613     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
614 
615     // Upscale to 0..32
616     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
617     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
618 
619     // Interleave R,G,B into the lower byte of the word.
620     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
621     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
622 
623     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
624     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
625 
626     maskLo = _mm_srai_epi16(maskLo, 5);
627     maskHi = _mm_srai_epi16(maskHi, 5);
628 
629     // Add two pixels into result.
630     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
631     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
632 
633     // Pack into 4 32bit dst pixels and force opaque.
634     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
635                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
636 }
637 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
639                          SkColor color, int width, SkPMColor) {
640     if (width <= 0) {
641         return;
642     }
643 
644     int srcA = SkColorGetA(color);
645     int srcR = SkColorGetR(color);
646     int srcG = SkColorGetG(color);
647     int srcB = SkColorGetB(color);
648 
649     srcA = SkAlpha255To256(srcA);
650 
651     if (width >= 4) {
652         SkASSERT(((size_t)dst & 0x03) == 0);
653         while (((size_t)dst & 0x0F) != 0) {
654             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
655             src++;
656             dst++;
657             width--;
658         }
659 
660         __m128i *d = reinterpret_cast<__m128i*>(dst);
661         __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
662         srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
663         __m128i scale = _mm_set1_epi16(srcA);
664         while (width >= 4) {
665             __m128i dst_pixel = _mm_load_si128(d);
666             __m128i mask_pixel = _mm_loadl_epi64(
667                                      reinterpret_cast<const __m128i*>(src));
668 
669             // Check whether mask_pixels are equal to 0 and get the highest bit
670             // of each byte of result, if mask pixes are all zero, we will get
671             // pack_cmp to 0xFFFF
672             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
673                                              _mm_setzero_si128()));
674 
675             // if mask pixels are not all zero, we will blend the dst pixels
676             if (pack_cmp != 0xFFFF) {
677                 // Unpack 4 16bit mask pixels to
678                 // (p0, 0, p1, 0, p2, 0, p3, 0)
679                 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
680                                                 _mm_setzero_si128());
681 
682                 // Process 4 32bit dst pixels
683                 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
684                                                    mask_pixel, scale);
685                 _mm_store_si128(d, result);
686             }
687 
688             d++;
689             src += 4;
690             width -= 4;
691         }
692 
693         dst = reinterpret_cast<SkPMColor*>(d);
694     }
695 
696     while (width > 0) {
697         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
698         src++;
699         dst++;
700         width--;
701     }
702 }
703 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
705                                SkColor color, int width, SkPMColor opaqueDst) {
706     if (width <= 0) {
707         return;
708     }
709 
710     int srcR = SkColorGetR(color);
711     int srcG = SkColorGetG(color);
712     int srcB = SkColorGetB(color);
713 
714     if (width >= 4) {
715         SkASSERT(((size_t)dst & 0x03) == 0);
716         while (((size_t)dst & 0x0F) != 0) {
717             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
718             src++;
719             dst++;
720             width--;
721         }
722 
723         __m128i *d = reinterpret_cast<__m128i*>(dst);
724         __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
725         srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
726         while (width >= 4) {
727             __m128i dst_pixel = _mm_load_si128(d);
728             __m128i mask_pixel = _mm_loadl_epi64(
729                                      reinterpret_cast<const __m128i*>(src));
730 
731             // Check whether mask_pixels are equal to 0 and get the highest bit
732             // of each byte of result, if mask pixes are all zero, we will get
733             // pack_cmp to 0xFFFF
734             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
735                                              _mm_setzero_si128()));
736 
737             // if mask pixels are not all zero, we will blend the dst pixels
738             if (pack_cmp != 0xFFFF) {
739                 // Unpack 4 16bit mask pixels to
740                 // (p0, 0, p1, 0, p2, 0, p3, 0)
741                 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
742                                                 _mm_setzero_si128());
743 
744                 // Process 4 32bit dst pixels
745                 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
746                                                          mask_pixel);
747                 _mm_store_si128(d, result);
748             }
749 
750             d++;
751             src += 4;
752             width -= 4;
753         }
754 
755         dst = reinterpret_cast<SkPMColor*>(d);
756     }
757 
758     while (width > 0) {
759         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
760         src++;
761         dst++;
762         width--;
763     }
764 }
765