• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkColor_opts_SSE2.h"
13 #include "SkDither.h"
14 #include "SkMSAN.h"
15 #include "SkUtils.h"
16 
17 /* SSE2 version of S32_Blend_BlitRow32()
18  * portable version is in core/SkBlitRow_D32.cpp
19  */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                               const SkPMColor* SK_RESTRICT src,
22                               int count, U8CPU alpha) {
23     SkASSERT(alpha <= 255);
24     if (count <= 0) {
25         return;
26     }
27 
28     uint32_t src_scale = SkAlpha255To256(alpha);
29     uint32_t dst_scale = 256 - src_scale;
30 
31     if (count >= 4) {
32         SkASSERT(((size_t)dst & 0x03) == 0);
33         while (((size_t)dst & 0x0F) != 0) {
34             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
35             src++;
36             dst++;
37             count--;
38         }
39 
40         const __m128i *s = reinterpret_cast<const __m128i*>(src);
41         __m128i *d = reinterpret_cast<__m128i*>(dst);
42 
43         while (count >= 4) {
44             // Load 4 pixels each of src and dest.
45             __m128i src_pixel = _mm_loadu_si128(s);
46             __m128i dst_pixel = _mm_load_si128(d);
47 
48             src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
49             dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
50 
51             // Add result
52             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
53             _mm_store_si128(d, result);
54             s++;
55             d++;
56             count -= 4;
57         }
58         src = reinterpret_cast<const SkPMColor*>(s);
59         dst = reinterpret_cast<SkPMColor*>(d);
60     }
61 
62     while (count > 0) {
63         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
64         src++;
65         dst++;
66         count--;
67     }
68 }
69 
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)70 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
71                                 const SkPMColor* SK_RESTRICT src,
72                                 int count, U8CPU alpha) {
73     sk_msan_assert_initialized(src, src+count);
74 
75     SkASSERT(alpha == 255);
76     if (count <= 0) {
77         return;
78     }
79 
80 #ifdef SK_USE_ACCURATE_BLENDING
81     if (count >= 4) {
82         SkASSERT(((size_t)dst & 0x03) == 0);
83         while (((size_t)dst & 0x0F) != 0) {
84             *dst = SkPMSrcOver(*src, *dst);
85             src++;
86             dst++;
87             count--;
88         }
89 
90         const __m128i *s = reinterpret_cast<const __m128i*>(src);
91         __m128i *d = reinterpret_cast<__m128i*>(dst);
92         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
93         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
94         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
95         while (count >= 4) {
96             // Load 4 pixels
97             __m128i src_pixel = _mm_loadu_si128(s);
98             __m128i dst_pixel = _mm_load_si128(d);
99 
100             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
101             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
102             // Shift alphas down to lower 8 bits of each quad.
103             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
104 
105             // Copy alpha to upper 3rd byte of each quad
106             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
107 
108             // Subtract alphas from 255, to get 0..255
109             alpha = _mm_sub_epi16(c_255, alpha);
110 
111             // Multiply by red and blue by src alpha.
112             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
113             // Multiply by alpha and green by src alpha.
114             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
115 
116             // dst_rb_low = (dst_rb >> 8)
117             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
118             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
119 
120             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
121             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
122             dst_rb = _mm_add_epi16(dst_rb, c_128);
123             dst_rb = _mm_srli_epi16(dst_rb, 8);
124 
125             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
126             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
127             dst_ag = _mm_add_epi16(dst_ag, c_128);
128             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
129 
130             // Combine back into RGBA.
131             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
132 
133             // Add result
134             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
135             _mm_store_si128(d, result);
136             s++;
137             d++;
138             count -= 4;
139         }
140         src = reinterpret_cast<const SkPMColor*>(s);
141         dst = reinterpret_cast<SkPMColor*>(d);
142     }
143 
144     while (count > 0) {
145         *dst = SkPMSrcOver(*src, *dst);
146         src++;
147         dst++;
148         count--;
149     }
150 #else
151     int count16 = count / 16;
152     __m128i* dst4 = (__m128i*)dst;
153     const __m128i* src4 = (const __m128i*)src;
154 
155     for (int i = 0; i < count16 * 4; i += 4) {
156         // Load 16 source pixels.
157         __m128i s0 = _mm_loadu_si128(src4+i+0),
158                 s1 = _mm_loadu_si128(src4+i+1),
159                 s2 = _mm_loadu_si128(src4+i+2),
160                 s3 = _mm_loadu_si128(src4+i+3);
161 
162         const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
163         const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
164         __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
165         if (0xffff == _mm_movemask_epi8(cmp)) {
166             // All 16 source pixels are fully transparent. There's nothing to do!
167             continue;
168         }
169         const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
170         cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
171         if (0xffff == _mm_movemask_epi8(cmp)) {
172             // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
173             _mm_storeu_si128(dst4+i+0, s0);
174             _mm_storeu_si128(dst4+i+1, s1);
175             _mm_storeu_si128(dst4+i+2, s2);
176             _mm_storeu_si128(dst4+i+3, s3);
177             continue;
178         }
179         // The general slow case: do the blend for all 16 pixels.
180         _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
181         _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
182         _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
183         _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
184     }
185 
186     // Wrap up the last <= 15 pixels.
187     SkASSERT(count - (count16*16) <= 15);
188     for (int i = count16*16; i < count; i++) {
189         // This check is not really necessarily, but it prevents pointless autovectorization.
190         if (src[i] & 0xFF000000) {
191             dst[i] = SkPMSrcOver(src[i], dst[i]);
192         }
193     }
194 #endif
195 }
196 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)197 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
198                                const SkPMColor* SK_RESTRICT src,
199                                int count, U8CPU alpha) {
200     SkASSERT(alpha <= 255);
201     if (count <= 0) {
202         return;
203     }
204 
205     if (count >= 4) {
206         while (((size_t)dst & 0x0F) != 0) {
207             *dst = SkBlendARGB32(*src, *dst, alpha);
208             src++;
209             dst++;
210             count--;
211         }
212 
213         const __m128i *s = reinterpret_cast<const __m128i*>(src);
214         __m128i *d = reinterpret_cast<__m128i*>(dst);
215         while (count >= 4) {
216             // Load 4 pixels each of src and dest.
217             __m128i src_pixel = _mm_loadu_si128(s);
218             __m128i dst_pixel = _mm_load_si128(d);
219 
220             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
221             _mm_store_si128(d, result);
222             s++;
223             d++;
224             count -= 4;
225         }
226         src = reinterpret_cast<const SkPMColor*>(s);
227         dst = reinterpret_cast<SkPMColor*>(d);
228     }
229 
230     while (count > 0) {
231         *dst = SkBlendARGB32(*src, *dst, alpha);
232         src++;
233         dst++;
234         count--;
235     }
236 }
237 
Color32A_D565_SSE2(uint16_t dst[],SkPMColor src,int count,int x,int y)238 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
239     SkASSERT(count > 0);
240 
241     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
242                           (SkGetPackedR32(src) << 13) |
243                           (SkGetPackedB32(src) << 2);
244     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
245 
246     // Check if we have enough pixels to run SIMD
247     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
248         __m128i* dst_wide;
249         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
250         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
251         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
252         const __m128i scale_wide = _mm_set1_epi16(scale);
253         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
254         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
255 
256         // Align dst to an even 16 byte address (0-7 pixels)
257         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
258             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
259             dst += 1;
260             count--;
261         }
262 
263         dst_wide = reinterpret_cast<__m128i*>(dst);
264         do {
265             // Load eight RGB565 pixels
266             __m128i pixels = _mm_load_si128(dst_wide);
267 
268             // Mask out sub-pixels
269             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
270             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
271             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
272             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
273 
274             // Scale with alpha
275             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
276             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
277             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
278 
279             // Add src_X_wide and shift down again
280             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
281             pixel_R = _mm_srli_epi16(pixel_R, 5);
282             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
283             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
284             pixel_B = _mm_srli_epi16(pixel_B, 5);
285 
286             // Combine into RGB565 and store
287             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
288             pixel_G = _mm_and_si128(pixel_G, mask_green);
289             pixels = _mm_or_si128(pixel_R, pixel_G);
290             pixels = _mm_or_si128(pixels, pixel_B);
291             _mm_store_si128(dst_wide, pixels);
292             count -= 8;
293             dst_wide++;
294         } while (count >= 8);
295 
296         dst = reinterpret_cast<uint16_t*>(dst_wide);
297     }
298 
299     // Small loop to handle remaining pixels.
300     while (count > 0) {
301         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
302         dst += 1;
303         count--;
304     }
305 }
306 
// Shift distances that line up the top 5 bits of each RGB16 mask component
// with the corresponding component of an SkPMColor. A positive value means a
// left shift, a negative value a right shift. The mask's RGB16 channel order
// may differ from the SkPMColor channel order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each helper below applies the matching shift to all four 32-bit lanes of x.
// The result is "unmasked": bits of the other channels are still present and
// must be removed by the caller.

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
337 
SkBlendLCD16_SSE2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)338 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
339                                  __m128i &mask, __m128i &srcA) {
340     // In the following comments, the components of src, dst and mask are
341     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
342     // by an R, G, B, or A suffix. Components of one of the four pixels that
343     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
344     // example is the blue channel of the second destination pixel. Memory
345     // layout is shown for an ARGB byte order in a color value.
346 
347     // src and srcA store 8-bit values interleaved with zeros.
348     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
349     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
350     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
351     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
352     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
353     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
354     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
355 
356     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
357     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
358     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
359                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
360 
361     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
362     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
363                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
364 
365     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
366     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
367                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
368 
369     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
370     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
371     // 8-bit position
372     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
373     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
374     mask = _mm_or_si128(_mm_or_si128(r, g), b);
375 
376     // Interleave R,G,B into the lower byte of word.
377     // i.e. split the sixteen 8-bit values from mask into two sets of eight
378     // 16-bit values, padded by zero.
379     __m128i maskLo, maskHi;
380     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
381     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
382     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
383     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
384 
385     // Upscale from 0..31 to 0..32
386     // (allows to replace division by left-shift further down)
387     // Left-shift each component by 4 and add the result back to that component,
388     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
389     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
390     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
391 
392     // Multiply each component of maskLo and maskHi by srcA
393     maskLo = _mm_mullo_epi16(maskLo, srcA);
394     maskHi = _mm_mullo_epi16(maskHi, srcA);
395 
396     // Left shift mask components by 8 (divide by 256)
397     maskLo = _mm_srli_epi16(maskLo, 8);
398     maskHi = _mm_srli_epi16(maskHi, 8);
399 
400     // Interleave R,G,B into the lower byte of the word
401     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
402     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
403     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
404     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
405 
406     // mask = (src - dst) * mask
407     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
408     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
409 
410     // mask = (src - dst) * mask >> 5
411     maskLo = _mm_srai_epi16(maskLo, 5);
412     maskHi = _mm_srai_epi16(maskHi, 5);
413 
414     // Add two pixels into result.
415     // result = dst + ((src - dst) * mask >> 5)
416     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
417     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
418 
419     // Pack into 4 32bit dst pixels.
420     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
421     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
422     // clamping to 255 if necessary.
423     return _mm_packus_epi16(resultLo, resultHi);
424 }
425 
SkBlendLCD16Opaque_SSE2(__m128i & src,__m128i & dst,__m128i & mask)426 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
427                                        __m128i &mask) {
428     // In the following comments, the components of src, dst and mask are
429     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
430     // by an R, G, B, or A suffix. Components of one of the four pixels that
431     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
432     // example is the blue channel of the second destination pixel. Memory
433     // layout is shown for an ARGB byte order in a color value.
434 
435     // src and srcA store 8-bit values interleaved with zeros.
436     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
437     // mask stores 16-bit values (shown as high and low bytes) interleaved with
438     // zeros
439     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
440     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
441 
442     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
443     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
444     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
445                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
446 
447     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
448     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
449                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
450 
451     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
452     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
453                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
454 
455     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
456     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
457     // 8-bit position
458     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
459     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
460     mask = _mm_or_si128(_mm_or_si128(r, g), b);
461 
462     // Interleave R,G,B into the lower byte of word.
463     // i.e. split the sixteen 8-bit values from mask into two sets of eight
464     // 16-bit values, padded by zero.
465     __m128i maskLo, maskHi;
466     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
467     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
468     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
469     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
470 
471     // Upscale from 0..31 to 0..32
472     // (allows to replace division by left-shift further down)
473     // Left-shift each component by 4 and add the result back to that component,
474     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
475     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
476     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
477 
478     // Interleave R,G,B into the lower byte of the word
479     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
480     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
481     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
482     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
483 
484     // mask = (src - dst) * mask
485     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
486     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
487 
488     // mask = (src - dst) * mask >> 5
489     maskLo = _mm_srai_epi16(maskLo, 5);
490     maskHi = _mm_srai_epi16(maskHi, 5);
491 
492     // Add two pixels into result.
493     // result = dst + ((src - dst) * mask >> 5)
494     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
495     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
496 
497     // Pack into 4 32bit dst pixels and force opaque.
498     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
499     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
500     // clamping to 255 if necessary. Set alpha components to 0xFF.
501     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
502                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
503 }
504 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)505 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
506                          SkColor src, int width, SkPMColor) {
507     if (width <= 0) {
508         return;
509     }
510 
511     int srcA = SkColorGetA(src);
512     int srcR = SkColorGetR(src);
513     int srcG = SkColorGetG(src);
514     int srcB = SkColorGetB(src);
515 
516     srcA = SkAlpha255To256(srcA);
517 
518     if (width >= 4) {
519         SkASSERT(((size_t)dst & 0x03) == 0);
520         while (((size_t)dst & 0x0F) != 0) {
521             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
522             mask++;
523             dst++;
524             width--;
525         }
526 
527         __m128i *d = reinterpret_cast<__m128i*>(dst);
528         // Set alpha to 0xFF and replicate source four times in SSE register.
529         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
530         // Interleave with zeros to get two sets of four 16-bit values.
531         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
532         // Set srcA_sse to contain eight copies of srcA, padded with zero.
533         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
534         __m128i srcA_sse = _mm_set1_epi16(srcA);
535         while (width >= 4) {
536             // Load four destination pixels into dst_sse.
537             __m128i dst_sse = _mm_load_si128(d);
538             // Load four 16-bit masks into lower half of mask_sse.
539             __m128i mask_sse = _mm_loadl_epi64(
540                                    reinterpret_cast<const __m128i*>(mask));
541 
542             // Check whether masks are equal to 0 and get the highest bit
543             // of each byte of result, if masks are all zero, we will get
544             // pack_cmp to 0xFFFF
545             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
546                                              _mm_setzero_si128()));
547 
548             // if mask pixels are not all zero, we will blend the dst pixels
549             if (pack_cmp != 0xFFFF) {
550                 // Unpack 4 16bit mask pixels to
551                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
552                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
553                 mask_sse = _mm_unpacklo_epi16(mask_sse,
554                                               _mm_setzero_si128());
555 
556                 // Process 4 32bit dst pixels
557                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
558                                                    mask_sse, srcA_sse);
559                 _mm_store_si128(d, result);
560             }
561 
562             d++;
563             mask += 4;
564             width -= 4;
565         }
566 
567         dst = reinterpret_cast<SkPMColor*>(d);
568     }
569 
570     while (width > 0) {
571         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
572         mask++;
573         dst++;
574         width--;
575     }
576 }
577 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)578 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
579                                SkColor src, int width, SkPMColor opaqueDst) {
580     if (width <= 0) {
581         return;
582     }
583 
584     int srcR = SkColorGetR(src);
585     int srcG = SkColorGetG(src);
586     int srcB = SkColorGetB(src);
587 
588     if (width >= 4) {
589         SkASSERT(((size_t)dst & 0x03) == 0);
590         while (((size_t)dst & 0x0F) != 0) {
591             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
592             mask++;
593             dst++;
594             width--;
595         }
596 
597         __m128i *d = reinterpret_cast<__m128i*>(dst);
598         // Set alpha to 0xFF and replicate source four times in SSE register.
599         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
600         // Set srcA_sse to contain eight copies of srcA, padded with zero.
601         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
602         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
603         while (width >= 4) {
604             // Load four destination pixels into dst_sse.
605             __m128i dst_sse = _mm_load_si128(d);
606             // Load four 16-bit masks into lower half of mask_sse.
607             __m128i mask_sse = _mm_loadl_epi64(
608                                    reinterpret_cast<const __m128i*>(mask));
609 
610             // Check whether masks are equal to 0 and get the highest bit
611             // of each byte of result, if masks are all zero, we will get
612             // pack_cmp to 0xFFFF
613             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
614                                              _mm_setzero_si128()));
615 
616             // if mask pixels are not all zero, we will blend the dst pixels
617             if (pack_cmp != 0xFFFF) {
618                 // Unpack 4 16bit mask pixels to
619                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
620                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
621                 mask_sse = _mm_unpacklo_epi16(mask_sse,
622                                               _mm_setzero_si128());
623 
624                 // Process 4 32bit dst pixels
625                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
626                                                          mask_sse);
627                 _mm_store_si128(d, result);
628             }
629 
630             d++;
631             mask += 4;
632             width -= 4;
633         }
634 
635         dst = reinterpret_cast<SkPMColor*>(d);
636     }
637 
638     while (width > 0) {
639         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
640         mask++;
641         dst++;
642         width--;
643     }
644 }
645 
646 /* SSE2 version of S32_D565_Opaque()
647  * portable version is in core/SkBlitRow_D16.cpp
648  */
S32_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)649 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
650                           const SkPMColor* SK_RESTRICT src, int count,
651                           U8CPU alpha, int /*x*/, int /*y*/) {
652     SkASSERT(255 == alpha);
653 
654     if (count <= 0) {
655         return;
656     }
657 
658     if (count >= 8) {
659         while (((size_t)dst & 0x0F) != 0) {
660             SkPMColor c = *src++;
661             SkPMColorAssert(c);
662 
663             *dst++ = SkPixel32ToPixel16_ToU16(c);
664             count--;
665         }
666 
667         const __m128i* s = reinterpret_cast<const __m128i*>(src);
668         __m128i* d = reinterpret_cast<__m128i*>(dst);
669 
670         while (count >= 8) {
671             // Load 8 pixels of src.
672             __m128i src_pixel1 = _mm_loadu_si128(s++);
673             __m128i src_pixel2 = _mm_loadu_si128(s++);
674 
675             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
676             _mm_store_si128(d++, d_pixel);
677             count -= 8;
678         }
679         src = reinterpret_cast<const SkPMColor*>(s);
680         dst = reinterpret_cast<uint16_t*>(d);
681     }
682 
683     if (count > 0) {
684         do {
685             SkPMColor c = *src++;
686             SkPMColorAssert(c);
687             *dst++ = SkPixel32ToPixel16_ToU16(c);
688         } while (--count != 0);
689     }
690 }
691 
692 /* SSE2 version of S32A_D565_Opaque()
693  * portable version is in core/SkBlitRow_D16.cpp
694  */
S32A_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)695 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
696                            const SkPMColor* SK_RESTRICT src,
697                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
698     SkASSERT(255 == alpha);
699 
700     if (count <= 0) {
701         return;
702     }
703 
704     if (count >= 8) {
705         // Make dst 16 bytes alignment
706         while (((size_t)dst & 0x0F) != 0) {
707             SkPMColor c = *src++;
708             if (c) {
709               *dst = SkSrcOver32To16(c, *dst);
710             }
711             dst += 1;
712             count--;
713         }
714 
715         const __m128i* s = reinterpret_cast<const __m128i*>(src);
716         __m128i* d = reinterpret_cast<__m128i*>(dst);
717         __m128i var255 = _mm_set1_epi16(255);
718         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
719         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
720         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
721 
722         while (count >= 8) {
723             // Load 8 pixels of src.
724             __m128i src_pixel1 = _mm_loadu_si128(s++);
725             __m128i src_pixel2 = _mm_loadu_si128(s++);
726 
727             // Check whether src pixels are equal to 0 and get the highest bit
728             // of each byte of result, if src pixels are all zero, src_cmp1 and
729             // src_cmp2 will be 0xFFFF.
730             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
731                                              _mm_setzero_si128()));
732             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
733                                              _mm_setzero_si128()));
734             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
735                 d++;
736                 count -= 8;
737                 continue;
738             }
739 
740             // Load 8 pixels of dst.
741             __m128i dst_pixel = _mm_load_si128(d);
742 
743             // Extract A from src.
744             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
745             sa1 = _mm_srli_epi32(sa1, 24);
746             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
747             sa2 = _mm_srli_epi32(sa2, 24);
748             __m128i sa = _mm_packs_epi32(sa1, sa2);
749 
750             // Extract R from src.
751             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
752             sr1 = _mm_srli_epi32(sr1, 24);
753             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
754             sr2 = _mm_srli_epi32(sr2, 24);
755             __m128i sr = _mm_packs_epi32(sr1, sr2);
756 
757             // Extract G from src.
758             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
759             sg1 = _mm_srli_epi32(sg1, 24);
760             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
761             sg2 = _mm_srli_epi32(sg2, 24);
762             __m128i sg = _mm_packs_epi32(sg1, sg2);
763 
764             // Extract B from src.
765             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
766             sb1 = _mm_srli_epi32(sb1, 24);
767             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
768             sb2 = _mm_srli_epi32(sb2, 24);
769             __m128i sb = _mm_packs_epi32(sb1, sb2);
770 
771             // Extract R G B from dst.
772             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
773             dr = _mm_and_si128(dr, r16_mask);
774             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
775             dg = _mm_and_si128(dg, g16_mask);
776             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
777             db = _mm_and_si128(db, b16_mask);
778 
779             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
780 
781             // Calculate R G B of result.
782             // Original algorithm is in SkSrcOver32To16().
783             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
784             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
785             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
786             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
787             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
788             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
789 
790             // Pack R G B into 16-bit color.
791             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
792 
793             // Store 8 16-bit colors in dst.
794             _mm_store_si128(d++, d_pixel);
795             count -= 8;
796         }
797 
798         src = reinterpret_cast<const SkPMColor*>(s);
799         dst = reinterpret_cast<uint16_t*>(d);
800     }
801 
802     if (count > 0) {
803         do {
804             SkPMColor c = *src++;
805             SkPMColorAssert(c);
806             if (c) {
807                 *dst = SkSrcOver32To16(c, *dst);
808             }
809             dst += 1;
810         } while (--count != 0);
811     }
812 }
813 
S32_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)814 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
815                                  const SkPMColor* SK_RESTRICT src,
816                                  int count, U8CPU alpha, int x, int y) {
817     SkASSERT(255 == alpha);
818 
819     if (count <= 0) {
820         return;
821     }
822 
823     if (count >= 8) {
824         while (((size_t)dst & 0x0F) != 0) {
825             DITHER_565_SCAN(y);
826             SkPMColor c = *src++;
827             SkPMColorAssert(c);
828 
829             unsigned dither = DITHER_VALUE(x);
830             *dst++ = SkDitherRGB32To565(c, dither);
831             DITHER_INC_X(x);
832             count--;
833         }
834 
835         unsigned short dither_value[8];
836         __m128i dither;
837 #ifdef ENABLE_DITHER_MATRIX_4X4
838         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
839         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
840         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
841         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
842         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
843 #else
844         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
845         dither_value[0] = dither_value[4] = (dither_scan
846                                              >> (((x) & 3) << 2)) & 0xF;
847         dither_value[1] = dither_value[5] = (dither_scan
848                                              >> (((x + 1) & 3) << 2)) & 0xF;
849         dither_value[2] = dither_value[6] = (dither_scan
850                                              >> (((x + 2) & 3) << 2)) & 0xF;
851         dither_value[3] = dither_value[7] = (dither_scan
852                                              >> (((x + 3) & 3) << 2)) & 0xF;
853 #endif
854         dither = _mm_loadu_si128((__m128i*) dither_value);
855 
856         const __m128i* s = reinterpret_cast<const __m128i*>(src);
857         __m128i* d = reinterpret_cast<__m128i*>(dst);
858 
859         while (count >= 8) {
860             // Load 8 pixels of src.
861             __m128i src_pixel1 = _mm_loadu_si128(s++);
862             __m128i src_pixel2 = _mm_loadu_si128(s++);
863 
864             // Extract R from src.
865             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
866             sr1 = _mm_srli_epi32(sr1, 24);
867             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
868             sr2 = _mm_srli_epi32(sr2, 24);
869             __m128i sr = _mm_packs_epi32(sr1, sr2);
870 
871             // SkDITHER_R32To565(sr, dither)
872             __m128i sr_offset = _mm_srli_epi16(sr, 5);
873             sr = _mm_add_epi16(sr, dither);
874             sr = _mm_sub_epi16(sr, sr_offset);
875             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
876 
877             // Extract G from src.
878             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
879             sg1 = _mm_srli_epi32(sg1, 24);
880             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
881             sg2 = _mm_srli_epi32(sg2, 24);
882             __m128i sg = _mm_packs_epi32(sg1, sg2);
883 
884             // SkDITHER_R32To565(sg, dither)
885             __m128i sg_offset = _mm_srli_epi16(sg, 6);
886             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
887             sg = _mm_sub_epi16(sg, sg_offset);
888             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
889 
890             // Extract B from src.
891             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
892             sb1 = _mm_srli_epi32(sb1, 24);
893             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
894             sb2 = _mm_srli_epi32(sb2, 24);
895             __m128i sb = _mm_packs_epi32(sb1, sb2);
896 
897             // SkDITHER_R32To565(sb, dither)
898             __m128i sb_offset = _mm_srli_epi16(sb, 5);
899             sb = _mm_add_epi16(sb, dither);
900             sb = _mm_sub_epi16(sb, sb_offset);
901             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
902 
903             // Pack and store 16-bit dst pixel.
904             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
905             _mm_store_si128(d++, d_pixel);
906 
907             count -= 8;
908             x += 8;
909         }
910 
911         src = reinterpret_cast<const SkPMColor*>(s);
912         dst = reinterpret_cast<uint16_t*>(d);
913     }
914 
915     if (count > 0) {
916         DITHER_565_SCAN(y);
917         do {
918             SkPMColor c = *src++;
919             SkPMColorAssert(c);
920 
921             unsigned dither = DITHER_VALUE(x);
922             *dst++ = SkDitherRGB32To565(c, dither);
923             DITHER_INC_X(x);
924         } while (--count != 0);
925     }
926 }
927 
928 /* SSE2 version of S32A_D565_Opaque_Dither()
929  * portable version is in core/SkBlitRow_D16.cpp
930  */
S32A_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)931 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
932                                   const SkPMColor* SK_RESTRICT src,
933                                   int count, U8CPU alpha, int x, int y) {
934     SkASSERT(255 == alpha);
935 
936     if (count <= 0) {
937         return;
938     }
939 
940     if (count >= 8) {
941         while (((size_t)dst & 0x0F) != 0) {
942             DITHER_565_SCAN(y);
943             SkPMColor c = *src++;
944             SkPMColorAssert(c);
945             if (c) {
946                 unsigned a = SkGetPackedA32(c);
947 
948                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
949 
950                 unsigned sr = SkGetPackedR32(c);
951                 unsigned sg = SkGetPackedG32(c);
952                 unsigned sb = SkGetPackedB32(c);
953                 sr = SkDITHER_R32_FOR_565(sr, d);
954                 sg = SkDITHER_G32_FOR_565(sg, d);
955                 sb = SkDITHER_B32_FOR_565(sb, d);
956 
957                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
958                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
959                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
960                 // now src and dst expanded are in g:11 r:10 x:1 b:10
961                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
962             }
963             dst += 1;
964             DITHER_INC_X(x);
965             count--;
966         }
967 
968         unsigned short dither_value[8];
969         __m128i dither, dither_cur;
970 #ifdef ENABLE_DITHER_MATRIX_4X4
971         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
972         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
973         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
974         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
975         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
976 #else
977         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
978         dither_value[0] = dither_value[4] = (dither_scan
979                                              >> (((x) & 3) << 2)) & 0xF;
980         dither_value[1] = dither_value[5] = (dither_scan
981                                              >> (((x + 1) & 3) << 2)) & 0xF;
982         dither_value[2] = dither_value[6] = (dither_scan
983                                              >> (((x + 2) & 3) << 2)) & 0xF;
984         dither_value[3] = dither_value[7] = (dither_scan
985                                              >> (((x + 3) & 3) << 2)) & 0xF;
986 #endif
987         dither = _mm_loadu_si128((__m128i*) dither_value);
988 
989         const __m128i* s = reinterpret_cast<const __m128i*>(src);
990         __m128i* d = reinterpret_cast<__m128i*>(dst);
991         __m128i var256 = _mm_set1_epi16(256);
992         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
993         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
994         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
995 
996         while (count >= 8) {
997             // Load 8 pixels of src and dst.
998             __m128i src_pixel1 = _mm_loadu_si128(s++);
999             __m128i src_pixel2 = _mm_loadu_si128(s++);
1000             __m128i dst_pixel = _mm_load_si128(d);
1001 
1002             // Extract A from src.
1003             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1004             sa1 = _mm_srli_epi32(sa1, 24);
1005             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1006             sa2 = _mm_srli_epi32(sa2, 24);
1007             __m128i sa = _mm_packs_epi32(sa1, sa2);
1008 
1009             // Calculate current dither value.
1010             dither_cur = _mm_mullo_epi16(dither,
1011                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
1012             dither_cur = _mm_srli_epi16(dither_cur, 8);
1013 
1014             // Extract R from src.
1015             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1016             sr1 = _mm_srli_epi32(sr1, 24);
1017             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1018             sr2 = _mm_srli_epi32(sr2, 24);
1019             __m128i sr = _mm_packs_epi32(sr1, sr2);
1020 
1021             // SkDITHER_R32_FOR_565(sr, d)
1022             __m128i sr_offset = _mm_srli_epi16(sr, 5);
1023             sr = _mm_add_epi16(sr, dither_cur);
1024             sr = _mm_sub_epi16(sr, sr_offset);
1025 
1026             // Expand sr.
1027             sr = _mm_slli_epi16(sr, 2);
1028 
1029             // Extract G from src.
1030             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1031             sg1 = _mm_srli_epi32(sg1, 24);
1032             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1033             sg2 = _mm_srli_epi32(sg2, 24);
1034             __m128i sg = _mm_packs_epi32(sg1, sg2);
1035 
1036             // sg = SkDITHER_G32_FOR_565(sg, d).
1037             __m128i sg_offset = _mm_srli_epi16(sg, 6);
1038             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1039             sg = _mm_sub_epi16(sg, sg_offset);
1040 
1041             // Expand sg.
1042             sg = _mm_slli_epi16(sg, 3);
1043 
1044             // Extract B from src.
1045             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1046             sb1 = _mm_srli_epi32(sb1, 24);
1047             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1048             sb2 = _mm_srli_epi32(sb2, 24);
1049             __m128i sb = _mm_packs_epi32(sb1, sb2);
1050 
1051             // sb = SkDITHER_B32_FOR_565(sb, d).
1052             __m128i sb_offset = _mm_srli_epi16(sb, 5);
1053             sb = _mm_add_epi16(sb, dither_cur);
1054             sb = _mm_sub_epi16(sb, sb_offset);
1055 
1056             // Expand sb.
1057             sb = _mm_slli_epi16(sb, 2);
1058 
1059             // Extract R G B from dst.
1060             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1061             dr = _mm_and_si128(dr, r16_mask);
1062             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1063             dg = _mm_and_si128(dg, g16_mask);
1064             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1065             db = _mm_and_si128(db, b16_mask);
1066 
1067             // SkAlpha255To256(255 - a) >> 3
1068             __m128i isa = _mm_sub_epi16(var256, sa);
1069             isa = _mm_srli_epi16(isa, 3);
1070 
1071             dr = _mm_mullo_epi16(dr, isa);
1072             dr = _mm_add_epi16(dr, sr);
1073             dr = _mm_srli_epi16(dr, 5);
1074 
1075             dg = _mm_mullo_epi16(dg, isa);
1076             dg = _mm_add_epi16(dg, sg);
1077             dg = _mm_srli_epi16(dg, 5);
1078 
1079             db = _mm_mullo_epi16(db, isa);
1080             db = _mm_add_epi16(db, sb);
1081             db = _mm_srli_epi16(db, 5);
1082 
1083             // Package and store dst pixel.
1084             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1085             _mm_store_si128(d++, d_pixel);
1086 
1087             count -= 8;
1088             x += 8;
1089         }
1090 
1091         src = reinterpret_cast<const SkPMColor*>(s);
1092         dst = reinterpret_cast<uint16_t*>(d);
1093     }
1094 
1095     if (count > 0) {
1096         DITHER_565_SCAN(y);
1097         do {
1098             SkPMColor c = *src++;
1099             SkPMColorAssert(c);
1100             if (c) {
1101                 unsigned a = SkGetPackedA32(c);
1102 
1103                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1104 
1105                 unsigned sr = SkGetPackedR32(c);
1106                 unsigned sg = SkGetPackedG32(c);
1107                 unsigned sb = SkGetPackedB32(c);
1108                 sr = SkDITHER_R32_FOR_565(sr, d);
1109                 sg = SkDITHER_G32_FOR_565(sg, d);
1110                 sb = SkDITHER_B32_FOR_565(sb, d);
1111 
1112                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1113                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1114                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1115                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1116                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1117             }
1118             dst += 1;
1119             DITHER_INC_X(x);
1120         } while (--count != 0);
1121     }
1122 }
1123