• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkUtils.h"
13 
14 #include <emmintrin.h>
15 
16 /* SSE2 version of S32_Blend_BlitRow32()
17  * portable version is in core/SkBlitRow_D32.cpp
18  */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20                               const SkPMColor* SK_RESTRICT src,
21                               int count, U8CPU alpha) {
22     SkASSERT(alpha <= 255);
23     if (count <= 0) {
24         return;
25     }
26 
27     uint32_t src_scale = SkAlpha255To256(alpha);
28     uint32_t dst_scale = 256 - src_scale;
29 
30     if (count >= 4) {
31         SkASSERT(((size_t)dst & 0x03) == 0);
32         while (((size_t)dst & 0x0F) != 0) {
33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34             src++;
35             dst++;
36             count--;
37         }
38 
39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
40         __m128i *d = reinterpret_cast<__m128i*>(dst);
41         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43 
44         // Move scale factors to upper byte of word
45         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47         while (count >= 4) {
48             // Load 4 pixels each of src and dest.
49             __m128i src_pixel = _mm_loadu_si128(s);
50             __m128i dst_pixel = _mm_load_si128(d);
51 
52             // Interleave Atom port 0/1 operations based on the execution port
53             // constraints that multiply can only be executed on port 0 (while
54             // boolean operations can be executed on either port 0 or port 1)
55             // because GCC currently doesn't do a good job scheduling
56             // instructions based on these constraints.
57 
58             // Get red and blue pixels into lower byte of each word.
59             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61 
62             // Multiply by scale.
63             // (4 x (0, rs.h, 0, bs.h))
64             // where rs.h stands for the higher byte of r * scale, and
65             // bs.h the higher byte of b * scale.
66             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67 
68             // Get alpha and green pixels into higher byte of each word.
69             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71 
72             // Multiply by scale.
73             // (4 x (as.h, as.l, gs.h, gs.l))
74             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75 
76             // Clear the lower byte of the a*scale and g*scale results
77             // (4 x (as.h, 0, gs.h, 0))
78             src_ag = _mm_and_si128(src_ag, ag_mask);
79 
80             // Operations the destination pixels are the same as on the
81             // source pixels. See the comments above.
82             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86             dst_ag = _mm_and_si128(dst_ag, ag_mask);
87 
88             // Combine back into RGBA.
89             // (4 x (as.h, rs.h, gs.h, bs.h))
90             src_pixel = _mm_or_si128(src_rb, src_ag);
91             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92 
93             // Add result
94             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95             _mm_store_si128(d, result);
96             s++;
97             d++;
98             count -= 4;
99         }
100         src = reinterpret_cast<const SkPMColor*>(s);
101         dst = reinterpret_cast<SkPMColor*>(d);
102     }
103 
104     while (count > 0) {
105         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106         src++;
107         dst++;
108         count--;
109     }
110 }
111 
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113                                 const SkPMColor* SK_RESTRICT src,
114                                 int count, U8CPU alpha) {
115     SkASSERT(alpha == 255);
116     if (count <= 0) {
117         return;
118     }
119 
120     if (count >= 4) {
121         SkASSERT(((size_t)dst & 0x03) == 0);
122         while (((size_t)dst & 0x0F) != 0) {
123             *dst = SkPMSrcOver(*src, *dst);
124             src++;
125             dst++;
126             count--;
127         }
128 
129         const __m128i *s = reinterpret_cast<const __m128i*>(src);
130         __m128i *d = reinterpret_cast<__m128i*>(dst);
131 #ifdef SK_USE_ACCURATE_BLENDING
132         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
134         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
135         while (count >= 4) {
136             // Load 4 pixels
137             __m128i src_pixel = _mm_loadu_si128(s);
138             __m128i dst_pixel = _mm_load_si128(d);
139 
140             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
142             // Shift alphas down to lower 8 bits of each quad.
143             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
144 
145             // Copy alpha to upper 3rd byte of each quad
146             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
147 
148             // Subtract alphas from 255, to get 0..255
149             alpha = _mm_sub_epi16(c_255, alpha);
150 
151             // Multiply by red and blue by src alpha.
152             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153             // Multiply by alpha and green by src alpha.
154             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
155 
156             // dst_rb_low = (dst_rb >> 8)
157             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
159 
160             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162             dst_rb = _mm_add_epi16(dst_rb, c_128);
163             dst_rb = _mm_srli_epi16(dst_rb, 8);
164 
165             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167             dst_ag = _mm_add_epi16(dst_ag, c_128);
168             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
169 
170             // Combine back into RGBA.
171             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
172 
173             // Add result
174             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175             _mm_store_si128(d, result);
176             s++;
177             d++;
178             count -= 4;
179         }
180     #else
181         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
183         while (count >= 4) {
184             // Load 4 pixels
185             __m128i src_pixel = _mm_loadu_si128(s);
186             __m128i dst_pixel = _mm_load_si128(d);
187 
188             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
190 
191             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
192             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193 
194             // (a0, a0, a1, a1, a2, g2, a3, g3)
195             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196 
197             // (a0, a0, a1, a1, a2, a2, a3, a3)
198             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
199 
200             // Subtract alphas from 256, to get 1..256
201             alpha = _mm_sub_epi16(c_256, alpha);
202 
203             // Multiply by red and blue by src alpha.
204             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205             // Multiply by alpha and green by src alpha.
206             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
207 
208             // Divide by 256.
209             dst_rb = _mm_srli_epi16(dst_rb, 8);
210 
211             // Mask out high bits (already in the right place)
212             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
213 
214             // Combine back into RGBA.
215             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
216 
217             // Add result
218             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219             _mm_store_si128(d, result);
220             s++;
221             d++;
222             count -= 4;
223         }
224 #endif
225         src = reinterpret_cast<const SkPMColor*>(s);
226         dst = reinterpret_cast<SkPMColor*>(d);
227     }
228 
229     while (count > 0) {
230         *dst = SkPMSrcOver(*src, *dst);
231         src++;
232         dst++;
233         count--;
234     }
235 }
236 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238                                const SkPMColor* SK_RESTRICT src,
239                                int count, U8CPU alpha) {
240     SkASSERT(alpha <= 255);
241     if (count <= 0) {
242         return;
243     }
244 
245     if (count >= 4) {
246         while (((size_t)dst & 0x0F) != 0) {
247             *dst = SkBlendARGB32(*src, *dst, alpha);
248             src++;
249             dst++;
250             count--;
251         }
252 
253         uint32_t src_scale = SkAlpha255To256(alpha);
254 
255         const __m128i *s = reinterpret_cast<const __m128i*>(src);
256         __m128i *d = reinterpret_cast<__m128i*>(dst);
257         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
260         while (count >= 4) {
261             // Load 4 pixels each of src and dest.
262             __m128i src_pixel = _mm_loadu_si128(s);
263             __m128i dst_pixel = _mm_load_si128(d);
264 
265             // Get red and blue pixels into lower byte of each word.
266             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268 
269             // Get alpha and green into lower byte of each word.
270             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272 
273             // Put per-pixel alpha in low byte of each word.
274             // After the following two statements, the dst_alpha looks like
275             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278 
279             // dst_alpha = dst_alpha * src_scale
280             // Because src_scales are in the higher byte of each word and
281             // we use mulhi here, the resulting alpha values are already
282             // in the right place and don't need to be divided by 256.
283             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285 
286             // Subtract alphas from 256, to get 1..256
287             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288 
289             // Multiply red and blue by dst pixel alpha.
290             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291             // Multiply alpha and green by dst pixel alpha.
292             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293 
294             // Multiply red and blue by global alpha.
295             // (4 x (0, rs.h, 0, bs.h))
296             // where rs.h stands for the higher byte of r * src_scale,
297             // and bs.h the higher byte of b * src_scale.
298             // Again, because we use mulhi, the resuling red and blue
299             // values are already in the right place and don't need to
300             // be divided by 256.
301             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302             // Multiply alpha and green by global alpha.
303             // (4 x (0, as.h, 0, gs.h))
304             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305 
306             // Divide by 256.
307             dst_rb = _mm_srli_epi16(dst_rb, 8);
308 
309             // Mask out low bits (goodies already in the right place; no need to divide)
310             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311             // Shift alpha and green to higher byte of each word.
312             // (4 x (as.h, 0, gs.h, 0))
313             src_ag = _mm_slli_epi16(src_ag, 8);
314 
315             // Combine back into RGBA.
316             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317             src_pixel = _mm_or_si128(src_rb, src_ag);
318 
319             // Add two pixels into result.
320             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321             _mm_store_si128(d, result);
322             s++;
323             d++;
324             count -= 4;
325         }
326         src = reinterpret_cast<const SkPMColor*>(s);
327         dst = reinterpret_cast<SkPMColor*>(d);
328     }
329 
330     while (count > 0) {
331         *dst = SkBlendARGB32(*src, *dst, alpha);
332         src++;
333         dst++;
334         count--;
335     }
336 }
337 
338 /* SSE2 version of Color32()
339  * portable version is in core/SkBlitRow_D32.cpp
340  */
Color32_SSE2(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342                   SkPMColor color) {
343 
344     if (count <= 0) {
345         return;
346     }
347 
348     if (0 == color) {
349         if (src != dst) {
350             memcpy(dst, src, count * sizeof(SkPMColor));
351         }
352         return;
353     }
354 
355     unsigned colorA = SkGetPackedA32(color);
356     if (255 == colorA) {
357         sk_memset32(dst, color, count);
358     } else {
359         unsigned scale = 256 - SkAlpha255To256(colorA);
360 
361         if (count >= 4) {
362             SkASSERT(((size_t)dst & 0x03) == 0);
363             while (((size_t)dst & 0x0F) != 0) {
364                 *dst = color + SkAlphaMulQ(*src, scale);
365                 src++;
366                 dst++;
367                 count--;
368             }
369 
370             const __m128i *s = reinterpret_cast<const __m128i*>(src);
371             __m128i *d = reinterpret_cast<__m128i*>(dst);
372             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
373             __m128i src_scale_wide = _mm_set1_epi16(scale);
374             __m128i color_wide = _mm_set1_epi32(color);
375             while (count >= 4) {
376                 // Load 4 pixels each of src and dest.
377                 __m128i src_pixel = _mm_loadu_si128(s);
378 
379                 // Get red and blue pixels into lower byte of each word.
380                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
381 
382                 // Get alpha and green into lower byte of each word.
383                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
384 
385                 // Multiply by scale.
386                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
387                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
388 
389                 // Divide by 256.
390                 src_rb = _mm_srli_epi16(src_rb, 8);
391                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
392 
393                 // Combine back into RGBA.
394                 src_pixel = _mm_or_si128(src_rb, src_ag);
395 
396                 // Add color to result.
397                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
398 
399                 // Store result.
400                 _mm_store_si128(d, result);
401                 s++;
402                 d++;
403                 count -= 4;
404             }
405             src = reinterpret_cast<const SkPMColor*>(s);
406             dst = reinterpret_cast<SkPMColor*>(d);
407          }
408 
409         while (count > 0) {
410             *dst = color + SkAlphaMulQ(*src, scale);
411             src += 1;
412             dst += 1;
413             count--;
414         }
415     }
416 }
417 
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)418 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
419                                size_t maskRB, SkColor origColor,
420                                int width, int height) {
421     SkPMColor color = SkPreMultiplyColor(origColor);
422     size_t dstOffset = dstRB - (width << 2);
423     size_t maskOffset = maskRB - width;
424     SkPMColor* dst = (SkPMColor *)device;
425     const uint8_t* mask = (const uint8_t*)maskPtr;
426     do {
427         int count = width;
428         if (count >= 4) {
429             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
430                 *dst = SkBlendARGB32(color, *dst, *mask);
431                 mask++;
432                 dst++;
433                 count--;
434             }
435             __m128i *d = reinterpret_cast<__m128i*>(dst);
436             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
437             __m128i c_256 = _mm_set1_epi16(256);
438             __m128i c_1 = _mm_set1_epi16(1);
439             __m128i src_pixel = _mm_set1_epi32(color);
440             while (count >= 4) {
441                 // Load 4 pixels each of src and dest.
442                 __m128i dst_pixel = _mm_load_si128(d);
443 
444                 //set the aphla value
445                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
446                                 0, *(mask+3),0, \
447                                 *(mask+2),0, *(mask+2),\
448                                 0,*(mask+1), 0,*(mask+1),\
449                                 0, *mask,0,*mask);
450 
451                 //call SkAlpha255To256()
452                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
453 
454                 // Get red and blue pixels into lower byte of each word.
455                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
456                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
457 
458                 // Get alpha and green into lower byte of each word.
459                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
460                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
461 
462                 // Put per-pixel alpha in low byte of each word.
463                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
464                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
465 
466                 // dst_alpha = dst_alpha * src_scale
467                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
468 
469                 // Divide by 256.
470                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
471 
472                 // Subtract alphas from 256, to get 1..256
473                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
474                 // Multiply red and blue by dst pixel alpha.
475                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
476                 // Multiply alpha and green by dst pixel alpha.
477                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
478 
479                 // Multiply red and blue by global alpha.
480                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
481                 // Multiply alpha and green by global alpha.
482                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
483                 // Divide by 256.
484                 dst_rb = _mm_srli_epi16(dst_rb, 8);
485                 src_rb = _mm_srli_epi16(src_rb, 8);
486 
487                 // Mask out low bits (goodies already in the right place; no need to divide)
488                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
489                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
490 
491                 // Combine back into RGBA.
492                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
493                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
494 
495                 // Add two pixels into result.
496                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
497                 _mm_store_si128(d, result);
498                 // load the next 4 pixel
499                 mask = mask + 4;
500                 d++;
501                 count -= 4;
502             }
503             dst = reinterpret_cast<SkPMColor *>(d);
504         }
505         while(count > 0) {
506             *dst= SkBlendARGB32(color, *dst, *mask);
507             dst += 1;
508             mask++;
509             count --;
510         }
511         dst = (SkPMColor *)((char*)dst + dstOffset);
512         mask += maskOffset;
513     } while (--height != 0);
514 }
515 
// Shift amounts that move the top 5 bits of each RGB16 mask component onto
// the top 5 bits of the matching component of an SkPMColor. A positive value
// means shift left, a negative value means shift right by its magnitude.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each SkPacked?16x5ToUnmasked?32x5_SSE2(x) applies the shift above to all
// four 32-bit lanes of x. The result is not masked; callers AND it with the
// 5-bit component mask themselves.

// Red
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546 
SkBlendLCD16_SSE2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
548                                  __m128i &mask, __m128i &srcA) {
549     // In the following comments, the components of src, dst and mask are
550     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
551     // by an R, G, B, or A suffix. Components of one of the four pixels that
552     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
553     // example is the blue channel of the second destination pixel. Memory
554     // layout is shown for an ARGB byte order in a color value.
555 
556     // src and srcA store 8-bit values interleaved with zeros.
557     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
558     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
559     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
560     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
561     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
562     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
563     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
564 
565     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
566     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
567     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
568                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
569 
570     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
571     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
572                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
573 
574     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
575     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
576                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
577 
578     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
579     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
580     // 8-bit position
581     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
582     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
583     mask = _mm_or_si128(_mm_or_si128(r, g), b);
584 
585     // Interleave R,G,B into the lower byte of word.
586     // i.e. split the sixteen 8-bit values from mask into two sets of eight
587     // 16-bit values, padded by zero.
588     __m128i maskLo, maskHi;
589     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
590     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
591     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
592     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
593 
594     // Upscale from 0..31 to 0..32
595     // (allows to replace division by left-shift further down)
596     // Left-shift each component by 4 and add the result back to that component,
597     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
598     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
599     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
600 
601     // Multiply each component of maskLo and maskHi by srcA
602     maskLo = _mm_mullo_epi16(maskLo, srcA);
603     maskHi = _mm_mullo_epi16(maskHi, srcA);
604 
605     // Left shift mask components by 8 (divide by 256)
606     maskLo = _mm_srli_epi16(maskLo, 8);
607     maskHi = _mm_srli_epi16(maskHi, 8);
608 
609     // Interleave R,G,B into the lower byte of the word
610     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
611     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
612     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
613     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
614 
615     // mask = (src - dst) * mask
616     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
617     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
618 
619     // mask = (src - dst) * mask >> 5
620     maskLo = _mm_srai_epi16(maskLo, 5);
621     maskHi = _mm_srai_epi16(maskHi, 5);
622 
623     // Add two pixels into result.
624     // result = dst + ((src - dst) * mask >> 5)
625     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
626     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
627 
628     // Pack into 4 32bit dst pixels.
629     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
630     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
631     // clamping to 255 if necessary.
632     return _mm_packus_epi16(resultLo, resultHi);
633 }
634 
SkBlendLCD16Opaque_SSE2(__m128i & src,__m128i & dst,__m128i & mask)635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
636                                        __m128i &mask) {
637     // In the following comments, the components of src, dst and mask are
638     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
639     // by an R, G, B, or A suffix. Components of one of the four pixels that
640     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
641     // example is the blue channel of the second destination pixel. Memory
642     // layout is shown for an ARGB byte order in a color value.
643 
644     // src and srcA store 8-bit values interleaved with zeros.
645     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
646     // mask stores 16-bit values (shown as high and low bytes) interleaved with
647     // zeros
648     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
649     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
650 
651     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
652     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
653     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
654                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
655 
656     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
657     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
658                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
659 
660     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
661     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
662                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
663 
664     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
665     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
666     // 8-bit position
667     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
668     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
669     mask = _mm_or_si128(_mm_or_si128(r, g), b);
670 
671     // Interleave R,G,B into the lower byte of word.
672     // i.e. split the sixteen 8-bit values from mask into two sets of eight
673     // 16-bit values, padded by zero.
674     __m128i maskLo, maskHi;
675     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
676     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
677     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
678     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
679 
680     // Upscale from 0..31 to 0..32
681     // (allows to replace division by left-shift further down)
682     // Left-shift each component by 4 and add the result back to that component,
683     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
684     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
685     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
686 
687     // Interleave R,G,B into the lower byte of the word
688     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
689     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
690     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
691     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
692 
693     // mask = (src - dst) * mask
694     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
695     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
696 
697     // mask = (src - dst) * mask >> 5
698     maskLo = _mm_srai_epi16(maskLo, 5);
699     maskHi = _mm_srai_epi16(maskHi, 5);
700 
701     // Add two pixels into result.
702     // result = dst + ((src - dst) * mask >> 5)
703     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
704     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
705 
706     // Pack into 4 32bit dst pixels and force opaque.
707     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
708     // Merge into one SSE register with sixteen 8-bit values (four pixels),
709     // clamping to 255 if necessary. Set alpha components to 0xFF.
710     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
711                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
712 }
713 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
715                          SkColor src, int width, SkPMColor) {
716     if (width <= 0) {
717         return;
718     }
719 
720     int srcA = SkColorGetA(src);
721     int srcR = SkColorGetR(src);
722     int srcG = SkColorGetG(src);
723     int srcB = SkColorGetB(src);
724 
725     srcA = SkAlpha255To256(srcA);
726 
727     if (width >= 4) {
728         SkASSERT(((size_t)dst & 0x03) == 0);
729         while (((size_t)dst & 0x0F) != 0) {
730             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
731             mask++;
732             dst++;
733             width--;
734         }
735 
736         __m128i *d = reinterpret_cast<__m128i*>(dst);
737         // Set alpha to 0xFF and replicate source four times in SSE register.
738         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
739         // Interleave with zeros to get two sets of four 16-bit values.
740         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
741         // Set srcA_sse to contain eight copies of srcA, padded with zero.
742         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
743         __m128i srcA_sse = _mm_set1_epi16(srcA);
744         while (width >= 4) {
745             // Load four destination pixels into dst_sse.
746             __m128i dst_sse = _mm_load_si128(d);
747             // Load four 16-bit masks into lower half of mask_sse.
748             __m128i mask_sse = _mm_loadl_epi64(
749                                    reinterpret_cast<const __m128i*>(mask));
750 
751             // Check whether masks are equal to 0 and get the highest bit
752             // of each byte of result, if masks are all zero, we will get
753             // pack_cmp to 0xFFFF
754             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
755                                              _mm_setzero_si128()));
756 
757             // if mask pixels are not all zero, we will blend the dst pixels
758             if (pack_cmp != 0xFFFF) {
759                 // Unpack 4 16bit mask pixels to
760                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
761                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
762                 mask_sse = _mm_unpacklo_epi16(mask_sse,
763                                               _mm_setzero_si128());
764 
765                 // Process 4 32bit dst pixels
766                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
767                                                    mask_sse, srcA_sse);
768                 _mm_store_si128(d, result);
769             }
770 
771             d++;
772             mask += 4;
773             width -= 4;
774         }
775 
776         dst = reinterpret_cast<SkPMColor*>(d);
777     }
778 
779     while (width > 0) {
780         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
781         mask++;
782         dst++;
783         width--;
784     }
785 }
786 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)787 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
788                                SkColor src, int width, SkPMColor opaqueDst) {
789     if (width <= 0) {
790         return;
791     }
792 
793     int srcR = SkColorGetR(src);
794     int srcG = SkColorGetG(src);
795     int srcB = SkColorGetB(src);
796 
797     if (width >= 4) {
798         SkASSERT(((size_t)dst & 0x03) == 0);
799         while (((size_t)dst & 0x0F) != 0) {
800             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
801             mask++;
802             dst++;
803             width--;
804         }
805 
806         __m128i *d = reinterpret_cast<__m128i*>(dst);
807         // Set alpha to 0xFF and replicate source four times in SSE register.
808         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
809         // Set srcA_sse to contain eight copies of srcA, padded with zero.
810         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
811         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
812         while (width >= 4) {
813             // Load four destination pixels into dst_sse.
814             __m128i dst_sse = _mm_load_si128(d);
815             // Load four 16-bit masks into lower half of mask_sse.
816             __m128i mask_sse = _mm_loadl_epi64(
817                                    reinterpret_cast<const __m128i*>(mask));
818 
819             // Check whether masks are equal to 0 and get the highest bit
820             // of each byte of result, if masks are all zero, we will get
821             // pack_cmp to 0xFFFF
822             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
823                                              _mm_setzero_si128()));
824 
825             // if mask pixels are not all zero, we will blend the dst pixels
826             if (pack_cmp != 0xFFFF) {
827                 // Unpack 4 16bit mask pixels to
828                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
829                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
830                 mask_sse = _mm_unpacklo_epi16(mask_sse,
831                                               _mm_setzero_si128());
832 
833                 // Process 4 32bit dst pixels
834                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
835                                                          mask_sse);
836                 _mm_store_si128(d, result);
837             }
838 
839             d++;
840             mask += 4;
841             width -= 4;
842         }
843 
844         dst = reinterpret_cast<SkPMColor*>(d);
845     }
846 
847     while (width > 0) {
848         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
849         mask++;
850         dst++;
851         width--;
852     }
853 }
854