• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkColorPriv.h"
11 #include "SkUtils.h"
12 
13 #include <emmintrin.h>
14 
/* SSE2 version of S32_Blend_BlitRow32().
 * The portable version is in core/SkBlitRow_D32.cpp.
 */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)18 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
19                               const SkPMColor* SK_RESTRICT src,
20                               int count, U8CPU alpha) {
21     SkASSERT(alpha <= 255);
22     if (count <= 0) {
23         return;
24     }
25 
26     uint32_t src_scale = SkAlpha255To256(alpha);
27     uint32_t dst_scale = 256 - src_scale;
28 
29     if (count >= 4) {
30         SkASSERT(((size_t)dst & 0x03) == 0);
31         while (((size_t)dst & 0x0F) != 0) {
32             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
33             src++;
34             dst++;
35             count--;
36         }
37 
38         const __m128i *s = reinterpret_cast<const __m128i*>(src);
39         __m128i *d = reinterpret_cast<__m128i*>(dst);
40         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
41         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
42 
43         // Move scale factors to upper byte of word
44         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
45         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
46         while (count >= 4) {
47             // Load 4 pixels each of src and dest.
48             __m128i src_pixel = _mm_loadu_si128(s);
49             __m128i dst_pixel = _mm_load_si128(d);
50 
51             // Interleave Atom port 0/1 operations based on the execution port
52             // constraints that multiply can only be executed on port 0 (while
53             // boolean operations can be executed on either port 0 or port 1)
54             // because GCC currently doesn't do a good job scheduling
55             // instructions based on these constraints.
56 
57             // Get red and blue pixels into lower byte of each word.
58             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
59             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
60 
61             // Multiply by scale.
62             // (4 x (0, rs.h, 0, bs.h))
63             // where rs.h stands for the higher byte of r * scale, and
64             // bs.h the higher byte of b * scale.
65             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
66 
67             // Get alpha and green pixels into higher byte of each word.
68             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
69             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
70 
71             // Multiply by scale.
72             // (4 x (as.h, as.l, gs.h, gs.l))
73             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
74 
75             // Clear the lower byte of the a*scale and g*scale results
76             // (4 x (as.h, 0, gs.h, 0))
77             src_ag = _mm_and_si128(src_ag, ag_mask);
78 
79             // Operations the destination pixels are the same as on the
80             // source pixels. See the comments above.
81             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
82             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
83             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
84             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
85             dst_ag = _mm_and_si128(dst_ag, ag_mask);
86 
87             // Combine back into RGBA.
88             // (4 x (as.h, rs.h, gs.h, bs.h))
89             src_pixel = _mm_or_si128(src_rb, src_ag);
90             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
91 
92             // Add result
93             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
94             _mm_store_si128(d, result);
95             s++;
96             d++;
97             count -= 4;
98         }
99         src = reinterpret_cast<const SkPMColor*>(s);
100         dst = reinterpret_cast<SkPMColor*>(d);
101     }
102 
103     while (count > 0) {
104         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
105         src++;
106         dst++;
107         count--;
108     }
109 }
110 
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)111 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
112                                 const SkPMColor* SK_RESTRICT src,
113                                 int count, U8CPU alpha) {
114     SkASSERT(alpha == 255);
115     if (count <= 0) {
116         return;
117     }
118 
119     if (count >= 4) {
120         SkASSERT(((size_t)dst & 0x03) == 0);
121         while (((size_t)dst & 0x0F) != 0) {
122             *dst = SkPMSrcOver(*src, *dst);
123             src++;
124             dst++;
125             count--;
126         }
127 
128         const __m128i *s = reinterpret_cast<const __m128i*>(src);
129         __m128i *d = reinterpret_cast<__m128i*>(dst);
130 #ifdef SK_USE_ACCURATE_BLENDING
131         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
132         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
133         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
134         while (count >= 4) {
135             // Load 4 pixels
136             __m128i src_pixel = _mm_loadu_si128(s);
137             __m128i dst_pixel = _mm_load_si128(d);
138 
139             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
140             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
141             // Shift alphas down to lower 8 bits of each quad.
142             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
143 
144             // Copy alpha to upper 3rd byte of each quad
145             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
146 
147             // Subtract alphas from 255, to get 0..255
148             alpha = _mm_sub_epi16(c_255, alpha);
149 
150             // Multiply by red and blue by src alpha.
151             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
152             // Multiply by alpha and green by src alpha.
153             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
154 
155             // dst_rb_low = (dst_rb >> 8)
156             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
157             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
158 
159             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
160             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
161             dst_rb = _mm_add_epi16(dst_rb, c_128);
162             dst_rb = _mm_srli_epi16(dst_rb, 8);
163 
164             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
165             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
166             dst_ag = _mm_add_epi16(dst_ag, c_128);
167             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
168 
169             // Combine back into RGBA.
170             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
171 
172             // Add result
173             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
174             _mm_store_si128(d, result);
175             s++;
176             d++;
177             count -= 4;
178         }
179     #else
180         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
181         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
182         while (count >= 4) {
183             // Load 4 pixels
184             __m128i src_pixel = _mm_loadu_si128(s);
185             __m128i dst_pixel = _mm_load_si128(d);
186 
187             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
188             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
189 
190             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
191             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
192 
193             // (a0, a0, a1, a1, a2, g2, a3, g3)
194             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
195 
196             // (a0, a0, a1, a1, a2, a2, a3, a3)
197             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
198 
199             // Subtract alphas from 256, to get 1..256
200             alpha = _mm_sub_epi16(c_256, alpha);
201 
202             // Multiply by red and blue by src alpha.
203             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
204             // Multiply by alpha and green by src alpha.
205             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
206 
207             // Divide by 256.
208             dst_rb = _mm_srli_epi16(dst_rb, 8);
209 
210             // Mask out high bits (already in the right place)
211             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
212 
213             // Combine back into RGBA.
214             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
215 
216             // Add result
217             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
218             _mm_store_si128(d, result);
219             s++;
220             d++;
221             count -= 4;
222         }
223 #endif
224         src = reinterpret_cast<const SkPMColor*>(s);
225         dst = reinterpret_cast<SkPMColor*>(d);
226     }
227 
228     while (count > 0) {
229         *dst = SkPMSrcOver(*src, *dst);
230         src++;
231         dst++;
232         count--;
233     }
234 }
235 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)236 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
237                                const SkPMColor* SK_RESTRICT src,
238                                int count, U8CPU alpha) {
239     SkASSERT(alpha <= 255);
240     if (count <= 0) {
241         return;
242     }
243 
244     if (count >= 4) {
245         while (((size_t)dst & 0x0F) != 0) {
246             *dst = SkBlendARGB32(*src, *dst, alpha);
247             src++;
248             dst++;
249             count--;
250         }
251 
252         uint32_t src_scale = SkAlpha255To256(alpha);
253 
254         const __m128i *s = reinterpret_cast<const __m128i*>(src);
255         __m128i *d = reinterpret_cast<__m128i*>(dst);
256         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
257         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
258         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
259         while (count >= 4) {
260             // Load 4 pixels each of src and dest.
261             __m128i src_pixel = _mm_loadu_si128(s);
262             __m128i dst_pixel = _mm_load_si128(d);
263 
264             // Get red and blue pixels into lower byte of each word.
265             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
266             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
267 
268             // Get alpha and green into lower byte of each word.
269             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
270             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
271 
272             // Put per-pixel alpha in low byte of each word.
273             // After the following two statements, the dst_alpha looks like
274             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
275             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
276             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
277 
278             // dst_alpha = dst_alpha * src_scale
279             // Because src_scales are in the higher byte of each word and
280             // we use mulhi here, the resulting alpha values are already
281             // in the right place and don't need to be divided by 256.
282             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
283             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
284 
285             // Subtract alphas from 256, to get 1..256
286             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
287 
288             // Multiply red and blue by dst pixel alpha.
289             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
290             // Multiply alpha and green by dst pixel alpha.
291             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
292 
293             // Multiply red and blue by global alpha.
294             // (4 x (0, rs.h, 0, bs.h))
295             // where rs.h stands for the higher byte of r * src_scale,
296             // and bs.h the higher byte of b * src_scale.
297             // Again, because we use mulhi, the resuling red and blue
298             // values are already in the right place and don't need to
299             // be divided by 256.
300             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
301             // Multiply alpha and green by global alpha.
302             // (4 x (0, as.h, 0, gs.h))
303             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
304 
305             // Divide by 256.
306             dst_rb = _mm_srli_epi16(dst_rb, 8);
307 
308             // Mask out low bits (goodies already in the right place; no need to divide)
309             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
310             // Shift alpha and green to higher byte of each word.
311             // (4 x (as.h, 0, gs.h, 0))
312             src_ag = _mm_slli_epi16(src_ag, 8);
313 
314             // Combine back into RGBA.
315             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
316             src_pixel = _mm_or_si128(src_rb, src_ag);
317 
318             // Add two pixels into result.
319             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
320             _mm_store_si128(d, result);
321             s++;
322             d++;
323             count -= 4;
324         }
325         src = reinterpret_cast<const SkPMColor*>(s);
326         dst = reinterpret_cast<SkPMColor*>(d);
327     }
328 
329     while (count > 0) {
330         *dst = SkBlendARGB32(*src, *dst, alpha);
331         src++;
332         dst++;
333         count--;
334     }
335 }
336 
/* SSE2 version of Color32().
 * The portable version is in core/SkBlitRow_D32.cpp.
 */
Color32_SSE2(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)340 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
341                   SkPMColor color) {
342 
343     if (count <= 0) {
344         return;
345     }
346 
347     if (0 == color) {
348         if (src != dst) {
349             memcpy(dst, src, count * sizeof(SkPMColor));
350         }
351         return;
352     }
353 
354     unsigned colorA = SkGetPackedA32(color);
355     if (255 == colorA) {
356         sk_memset32(dst, color, count);
357     } else {
358         unsigned scale = 256 - SkAlpha255To256(colorA);
359 
360         if (count >= 4) {
361             SkASSERT(((size_t)dst & 0x03) == 0);
362             while (((size_t)dst & 0x0F) != 0) {
363                 *dst = color + SkAlphaMulQ(*src, scale);
364                 src++;
365                 dst++;
366                 count--;
367             }
368 
369             const __m128i *s = reinterpret_cast<const __m128i*>(src);
370             __m128i *d = reinterpret_cast<__m128i*>(dst);
371             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372             __m128i src_scale_wide = _mm_set1_epi16(scale);
373             __m128i color_wide = _mm_set1_epi32(color);
374             while (count >= 4) {
375                 // Load 4 pixels each of src and dest.
376                 __m128i src_pixel = _mm_loadu_si128(s);
377 
378                 // Get red and blue pixels into lower byte of each word.
379                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380 
381                 // Get alpha and green into lower byte of each word.
382                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383 
384                 // Multiply by scale.
385                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387 
388                 // Divide by 256.
389                 src_rb = _mm_srli_epi16(src_rb, 8);
390                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
391 
392                 // Combine back into RGBA.
393                 src_pixel = _mm_or_si128(src_rb, src_ag);
394 
395                 // Add color to result.
396                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
397 
398                 // Store result.
399                 _mm_store_si128(d, result);
400                 s++;
401                 d++;
402                 count -= 4;
403             }
404             src = reinterpret_cast<const SkPMColor*>(s);
405             dst = reinterpret_cast<SkPMColor*>(d);
406          }
407 
408         while (count > 0) {
409             *dst = color + SkAlphaMulQ(*src, scale);
410             src += 1;
411             dst += 1;
412             count--;
413         }
414     }
415 }
416 
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)417 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
418                                size_t maskRB, SkColor origColor,
419                                int width, int height) {
420     SkPMColor color = SkPreMultiplyColor(origColor);
421     size_t dstOffset = dstRB - (width << 2);
422     size_t maskOffset = maskRB - width;
423     SkPMColor* dst = (SkPMColor *)device;
424     const uint8_t* mask = (const uint8_t*)maskPtr;
425     do {
426         int count = width;
427         if (count >= 4) {
428             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429                 *dst = SkBlendARGB32(color, *dst, *mask);
430                 mask++;
431                 dst++;
432                 count--;
433             }
434             __m128i *d = reinterpret_cast<__m128i*>(dst);
435             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436             __m128i c_256 = _mm_set1_epi16(256);
437             __m128i c_1 = _mm_set1_epi16(1);
438             __m128i src_pixel = _mm_set1_epi32(color);
439             while (count >= 4) {
440                 // Load 4 pixels each of src and dest.
441                 __m128i dst_pixel = _mm_load_si128(d);
442 
443                 //set the aphla value
444                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
445                                 0, *(mask+3),0, \
446                                 *(mask+2),0, *(mask+2),\
447                                 0,*(mask+1), 0,*(mask+1),\
448                                 0, *mask,0,*mask);
449 
450                 //call SkAlpha255To256()
451                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
452 
453                 // Get red and blue pixels into lower byte of each word.
454                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
455                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
456 
457                 // Get alpha and green into lower byte of each word.
458                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
459                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
460 
461                 // Put per-pixel alpha in low byte of each word.
462                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
463                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
464 
465                 // dst_alpha = dst_alpha * src_scale
466                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
467 
468                 // Divide by 256.
469                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
470 
471                 // Subtract alphas from 256, to get 1..256
472                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
473                 // Multiply red and blue by dst pixel alpha.
474                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
475                 // Multiply alpha and green by dst pixel alpha.
476                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
477 
478                 // Multiply red and blue by global alpha.
479                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
480                 // Multiply alpha and green by global alpha.
481                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
482                 // Divide by 256.
483                 dst_rb = _mm_srli_epi16(dst_rb, 8);
484                 src_rb = _mm_srli_epi16(src_rb, 8);
485 
486                 // Mask out low bits (goodies already in the right place; no need to divide)
487                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
488                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
489 
490                 // Combine back into RGBA.
491                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
492                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
493 
494                 // Add two pixels into result.
495                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
496                 _mm_store_si128(d, result);
497                 // load the next 4 pixel
498                 mask = mask + 4;
499                 d++;
500                 count -= 4;
501             }
502             dst = reinterpret_cast<SkPMColor *>(d);
503         }
504         while(count > 0) {
505             *dst= SkBlendARGB32(color, *dst, *mask);
506             dst += 1;
507             mask++;
508             count --;
509         }
510         dst = (SkPMColor *)((char*)dst + dstOffset);
511         mask += maskOffset;
512     } while (--height != 0);
513 }
514 
SkBlendLCD16_SSE2(__m128i & srci,__m128i & dst,__m128i & mask,__m128i & scale)515 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
516                                  __m128i &mask, __m128i &scale) {
517     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
518     __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
519                               16-SK_R16_SHIFT-(SK_R16_BITS-5)),
520                               _mm_set1_epi32(0x001F0000));
521 
522     __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
523                               8-SK_G16_SHIFT-(SK_G16_BITS-5)),
524                               _mm_set1_epi32(0x00001F00));
525 
526     __m128i b = _mm_and_si128(_mm_slli_epi32(mask,
527                               SK_B16_BITS-5),
528                               _mm_set1_epi32(0x0000001F));
529 
530     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
531     mask = _mm_or_si128(_mm_or_si128(r, g), b);
532 
533     // Interleave R,G,B into the lower byte of word.
534     __m128i maskLo, maskHi;
535     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
536     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
537 
538     // Upscale to 0..32
539     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
540     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
541 
542     maskLo = _mm_mullo_epi16(maskLo, scale);
543     maskHi = _mm_mullo_epi16(maskHi, scale);
544 
545     maskLo = _mm_srli_epi16(maskLo, 8);
546     maskHi = _mm_srli_epi16(maskHi, 8);
547 
548     // Interleave R,G,B into the lower byte of the word.
549     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
550     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
551 
552     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
553     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
554 
555     maskLo = _mm_srai_epi16(maskLo, 5);
556     maskHi = _mm_srai_epi16(maskHi, 5);
557 
558     // Add two pixels into result.
559     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
560     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
561 
562     // Pack into 4 32bit dst pixels
563     return _mm_packus_epi16(resultLo, resultHi);
564 }
565 
SkBlendLCD16Opaque_SSE2(__m128i & srci,__m128i & dst,__m128i & mask)566 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
567                                        __m128i &mask) {
568     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
569     __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
570                               16-SK_R16_SHIFT-(SK_R16_BITS-5)),
571                               _mm_set1_epi32(0x001F0000));
572 
573     __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
574                               8-SK_G16_SHIFT-(SK_G16_BITS-5)),
575                               _mm_set1_epi32(0x00001F00));
576 
577     __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5),
578                               _mm_set1_epi32(0x0000001F));
579 
580     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
581     mask = _mm_or_si128(_mm_or_si128(r, g), b);
582 
583     // Interleave R,G,B into the lower byte of word.
584     __m128i maskLo, maskHi;
585     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
586     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
587 
588     // Upscale to 0..32
589     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
590     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
591 
592     // Interleave R,G,B into the lower byte of the word.
593     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
594     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
595 
596     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
597     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
598 
599     maskLo = _mm_srai_epi16(maskLo, 5);
600     maskHi = _mm_srai_epi16(maskHi, 5);
601 
602     // Add two pixels into result.
603     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
604     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
605 
606     // Pack into 4 32bit dst pixels
607     return _mm_packus_epi16(resultLo, resultHi);
608 }
609 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)610 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
611                          SkColor color, int width, SkPMColor) {
612     if (width <= 0) {
613         return;
614     }
615 
616     int srcA = SkColorGetA(color);
617     int srcR = SkColorGetR(color);
618     int srcG = SkColorGetG(color);
619     int srcB = SkColorGetB(color);
620 
621     srcA = SkAlpha255To256(srcA);
622 
623     if (width >= 4) {
624         SkASSERT(((size_t)dst & 0x03) == 0);
625         while (((size_t)dst & 0x0F) != 0) {
626             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
627             src++;
628             dst++;
629             width--;
630         }
631 
632         __m128i *d = reinterpret_cast<__m128i*>(dst);
633         __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
634         srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
635         __m128i scale = _mm_set1_epi16(srcA);
636         while (width >= 4) {
637             __m128i dst_pixel = _mm_load_si128(d);
638             __m128i mask_pixel = _mm_loadl_epi64(
639                                      reinterpret_cast<const __m128i*>(src));
640 
641             // Check whether mask_pixels are equal to 0 and get the highest bit
642             // of each byte of result, if mask pixes are all zero, we will get
643             // pack_cmp to 0xFFFF
644             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
645                                              _mm_setzero_si128()));
646 
647             // if mask pixels are not all zero, we will blend the dst pixels
648             if (pack_cmp != 0xFFFF) {
649                 // Unpack 4 16bit mask pixels to
650                 // (p0, 0, p1, 0, p2, 0, p3, 0)
651                 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
652                                                 _mm_setzero_si128());
653 
654                 // Process 4 32bit dst pixels
655                 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
656                                                    mask_pixel, scale);
657                 _mm_store_si128(d, result);
658             }
659 
660             d++;
661             src += 4;
662             width -= 4;
663         }
664 
665         dst = reinterpret_cast<SkPMColor*>(d);
666     }
667 
668     while (width > 0) {
669         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
670         src++;
671         dst++;
672         width--;
673     }
674 }
675 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)676 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
677                                SkColor color, int width, SkPMColor opaqueDst) {
678     if (width <= 0) {
679         return;
680     }
681 
682     int srcR = SkColorGetR(color);
683     int srcG = SkColorGetG(color);
684     int srcB = SkColorGetB(color);
685 
686     if (width >= 4) {
687         SkASSERT(((size_t)dst & 0x03) == 0);
688         while (((size_t)dst & 0x0F) != 0) {
689             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
690             src++;
691             dst++;
692             width--;
693         }
694 
695         __m128i *d = reinterpret_cast<__m128i*>(dst);
696         __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
697         srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
698         while (width >= 4) {
699             __m128i dst_pixel = _mm_load_si128(d);
700             __m128i mask_pixel = _mm_loadl_epi64(
701                                      reinterpret_cast<const __m128i*>(src));
702 
703             // Check whether mask_pixels are equal to 0 and get the highest bit
704             // of each byte of result, if mask pixes are all zero, we will get
705             // pack_cmp to 0xFFFF
706             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
707                                              _mm_setzero_si128()));
708 
709             // if mask pixels are not all zero, we will blend the dst pixels
710             if (pack_cmp != 0xFFFF) {
711                 // Unpack 4 16bit mask pixels to
712                 // (p0, 0, p1, 0, p2, 0, p3, 0)
713                 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
714                                                 _mm_setzero_si128());
715 
716                 // Process 4 32bit dst pixels
717                 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
718                                                          mask_pixel);
719                 _mm_store_si128(d, result);
720             }
721 
722             d++;
723             src += 4;
724             width -= 4;
725         }
726 
727         dst = reinterpret_cast<SkPMColor*>(d);
728     }
729 
730     while (width > 0) {
731         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
732         src++;
733         dst++;
734         width--;
735     }
736 }
737