• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2006 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkShader.h"
9 #include "include/private/SkColorData.h"
10 #include "include/private/SkVx.h"
11 #include "src/core/SkCoreBlitters.h"
12 #include "src/core/SkUtils.h"
13 #include "src/core/SkXfermodePriv.h"
14 
upscale_31_to_32(int value)15 static inline int upscale_31_to_32(int value) {
16     SkASSERT((unsigned)value <= 31);
17     return value + (value >> 4);
18 }
19 
blend_32(int src,int dst,int scale)20 static inline int blend_32(int src, int dst, int scale) {
21     SkASSERT((unsigned)src <= 0xFF);
22     SkASSERT((unsigned)dst <= 0xFF);
23     SkASSERT((unsigned)scale <= 32);
24     return dst + ((src - dst) * scale >> 5);
25 }
26 
blend_lcd16(int srcA,int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask)27 static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
28                                      SkPMColor dst, uint16_t mask) {
29     if (mask == 0) {
30         return dst;
31     }
32 
33     /*  We want all of these in 5bits, hence the shifts in case one of them
34      *  (green) is 6bits.
35      */
36     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
37     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
38     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
39 
40     // Now upscale them to 0..32, so we can use blend32
41     maskR = upscale_31_to_32(maskR);
42     maskG = upscale_31_to_32(maskG);
43     maskB = upscale_31_to_32(maskB);
44 
45     // srcA has been upscaled to 256 before passed into this function
46     maskR = maskR * srcA >> 8;
47     maskG = maskG * srcA >> 8;
48     maskB = maskB * srcA >> 8;
49 
50     int dstR = SkGetPackedR32(dst);
51     int dstG = SkGetPackedG32(dst);
52     int dstB = SkGetPackedB32(dst);
53 
54     // LCD blitting is only supported if the dst is known/required
55     // to be opaque
56     return SkPackARGB32(0xFF,
57                         blend_32(srcR, dstR, maskR),
58                         blend_32(srcG, dstG, maskG),
59                         blend_32(srcB, dstB, maskB));
60 }
61 
blend_lcd16_opaque(int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask,SkPMColor opaqueDst)62 static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
63                                            SkPMColor dst, uint16_t mask,
64                                            SkPMColor opaqueDst) {
65     if (mask == 0) {
66         return dst;
67     }
68 
69     if (0xFFFF == mask) {
70         return opaqueDst;
71     }
72 
73     /*  We want all of these in 5bits, hence the shifts in case one of them
74      *  (green) is 6bits.
75      */
76     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
77     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
78     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
79 
80     // Now upscale them to 0..32, so we can use blend32
81     maskR = upscale_31_to_32(maskR);
82     maskG = upscale_31_to_32(maskG);
83     maskB = upscale_31_to_32(maskB);
84 
85     int dstR = SkGetPackedR32(dst);
86     int dstG = SkGetPackedG32(dst);
87     int dstB = SkGetPackedB32(dst);
88 
89     // LCD blitting is only supported if the dst is known/required
90     // to be opaque
91     return SkPackARGB32(0xFF,
92                         blend_32(srcR, dstR, maskR),
93                         blend_32(srcG, dstG, maskG),
94                         blend_32(srcB, dstB, maskB));
95 }
96 
97 
98 // TODO: rewrite at least the SSE code here.  It's miserable.
99 
100 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
101     #include <emmintrin.h>
102 
// Per-channel shift distances that move the top 5 bits of a mask component
// (RGB16 layout) onto the position of the same component in an SkPMColor.
// A positive distance is a left shift, a negative distance a right shift.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: shift each 32-bit lane by the distance above. The result is NOT
// masked; callers AND it with the channel's 5-bit mask afterwards.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same scheme as red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same scheme as red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
133 
blend_lcd16_sse2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)134     static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
135         // In the following comments, the components of src, dst and mask are
136         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
137         // by an R, G, B, or A suffix. Components of one of the four pixels that
138         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
139         // example is the blue channel of the second destination pixel. Memory
140         // layout is shown for an ARGB byte order in a color value.
141 
142         // src and srcA store 8-bit values interleaved with zeros.
143         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
144         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
145         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
146         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
147         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
148         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
149         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
150 
151         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
152         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
153         __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
154                                   _mm_set1_epi32(0x1F << SK_R32_SHIFT));
155 
156         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
157         __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
158                                   _mm_set1_epi32(0x1F << SK_G32_SHIFT));
159 
160         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
161         __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
162                                   _mm_set1_epi32(0x1F << SK_B32_SHIFT));
163 
164         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
165         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
166         // 8-bit position
167         // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
168         //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
169         mask = _mm_or_si128(_mm_or_si128(r, g), b);
170 
171         // Interleave R,G,B into the lower byte of word.
172         // i.e. split the sixteen 8-bit values from mask into two sets of eight
173         // 16-bit values, padded by zero.
174         __m128i maskLo, maskHi;
175         // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
176         maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
177         // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
178         maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
179 
180         // Upscale from 0..31 to 0..32
181         // (allows to replace division by left-shift further down)
182         // Left-shift each component by 4 and add the result back to that component,
183         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
184         maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
185         maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
186 
187         // Multiply each component of maskLo and maskHi by srcA
188         maskLo = _mm_mullo_epi16(maskLo, srcA);
189         maskHi = _mm_mullo_epi16(maskHi, srcA);
190 
191         // Left shift mask components by 8 (divide by 256)
192         maskLo = _mm_srli_epi16(maskLo, 8);
193         maskHi = _mm_srli_epi16(maskHi, 8);
194 
195         // Interleave R,G,B into the lower byte of the word
196         // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
197         __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
198         // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
199         __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
200 
201         // mask = (src - dst) * mask
202         maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
203         maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
204 
205         // mask = (src - dst) * mask >> 5
206         maskLo = _mm_srai_epi16(maskLo, 5);
207         maskHi = _mm_srai_epi16(maskHi, 5);
208 
209         // Add two pixels into result.
210         // result = dst + ((src - dst) * mask >> 5)
211         __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
212         __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
213 
214         // Pack into 4 32bit dst pixels.
215         // resultLo and resultHi contain eight 16-bit components (two pixels) each.
216         // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
217         // clamping to 255 if necessary.
218         return _mm_packus_epi16(resultLo, resultHi);
219     }
220 
blend_lcd16_opaque_sse2(__m128i & src,__m128i & dst,__m128i & mask)221     static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
222         // In the following comments, the components of src, dst and mask are
223         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
224         // by an R, G, B, or A suffix. Components of one of the four pixels that
225         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
226         // example is the blue channel of the second destination pixel. Memory
227         // layout is shown for an ARGB byte order in a color value.
228 
229         // src and srcA store 8-bit values interleaved with zeros.
230         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
231         // mask stores 16-bit values (shown as high and low bytes) interleaved with
232         // zeros
233         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
234         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
235 
236         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
237         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
238         __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
239                                   _mm_set1_epi32(0x1F << SK_R32_SHIFT));
240 
241         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
242         __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
243                                   _mm_set1_epi32(0x1F << SK_G32_SHIFT));
244 
245         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
246         __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
247                                   _mm_set1_epi32(0x1F << SK_B32_SHIFT));
248 
249         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
250         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
251         // 8-bit position
252         // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
253         //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
254         mask = _mm_or_si128(_mm_or_si128(r, g), b);
255 
256         // Interleave R,G,B into the lower byte of word.
257         // i.e. split the sixteen 8-bit values from mask into two sets of eight
258         // 16-bit values, padded by zero.
259         __m128i maskLo, maskHi;
260         // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
261         maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
262         // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
263         maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
264 
265         // Upscale from 0..31 to 0..32
266         // (allows to replace division by left-shift further down)
267         // Left-shift each component by 4 and add the result back to that component,
268         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
269         maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
270         maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
271 
272         // Interleave R,G,B into the lower byte of the word
273         // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
274         __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
275         // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
276         __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
277 
278         // mask = (src - dst) * mask
279         maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
280         maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
281 
282         // mask = (src - dst) * mask >> 5
283         maskLo = _mm_srai_epi16(maskLo, 5);
284         maskHi = _mm_srai_epi16(maskHi, 5);
285 
286         // Add two pixels into result.
287         // result = dst + ((src - dst) * mask >> 5)
288         __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
289         __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
290 
291         // Pack into 4 32bit dst pixels and force opaque.
292         // resultLo and resultHi contain eight 16-bit components (two pixels) each.
293         // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
294         // clamping to 255 if necessary. Set alpha components to 0xFF.
295         return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
296                             _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
297     }
298 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)299     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
300         if (width <= 0) {
301             return;
302         }
303 
304         int srcA = SkColorGetA(src);
305         int srcR = SkColorGetR(src);
306         int srcG = SkColorGetG(src);
307         int srcB = SkColorGetB(src);
308 
309         srcA = SkAlpha255To256(srcA);
310 
311         if (width >= 4) {
312             SkASSERT(((size_t)dst & 0x03) == 0);
313             while (((size_t)dst & 0x0F) != 0) {
314                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
315                 mask++;
316                 dst++;
317                 width--;
318             }
319 
320             __m128i *d = reinterpret_cast<__m128i*>(dst);
321             // Set alpha to 0xFF and replicate source four times in SSE register.
322             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
323             // Interleave with zeros to get two sets of four 16-bit values.
324             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
325             // Set srcA_sse to contain eight copies of srcA, padded with zero.
326             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
327             __m128i srcA_sse = _mm_set1_epi16(srcA);
328             while (width >= 4) {
329                 // Load four destination pixels into dst_sse.
330                 __m128i dst_sse = _mm_load_si128(d);
331                 // Load four 16-bit masks into lower half of mask_sse.
332                 __m128i mask_sse = _mm_loadl_epi64(
333                                        reinterpret_cast<const __m128i*>(mask));
334 
335                 // Check whether masks are equal to 0 and get the highest bit
336                 // of each byte of result, if masks are all zero, we will get
337                 // pack_cmp to 0xFFFF
338                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
339                                                  _mm_setzero_si128()));
340 
341                 // if mask pixels are not all zero, we will blend the dst pixels
342                 if (pack_cmp != 0xFFFF) {
343                     // Unpack 4 16bit mask pixels to
344                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
345                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
346                     mask_sse = _mm_unpacklo_epi16(mask_sse,
347                                                   _mm_setzero_si128());
348 
349                     // Process 4 32bit dst pixels
350                     __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
351                     _mm_store_si128(d, result);
352                 }
353 
354                 d++;
355                 mask += 4;
356                 width -= 4;
357             }
358 
359             dst = reinterpret_cast<SkPMColor*>(d);
360         }
361 
362         while (width > 0) {
363             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
364             mask++;
365             dst++;
366             width--;
367         }
368     }
369 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)370     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
371                                    SkColor src, int width, SkPMColor opaqueDst) {
372         if (width <= 0) {
373             return;
374         }
375 
376         int srcR = SkColorGetR(src);
377         int srcG = SkColorGetG(src);
378         int srcB = SkColorGetB(src);
379 
380         if (width >= 4) {
381             SkASSERT(((size_t)dst & 0x03) == 0);
382             while (((size_t)dst & 0x0F) != 0) {
383                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
384                 mask++;
385                 dst++;
386                 width--;
387             }
388 
389             __m128i *d = reinterpret_cast<__m128i*>(dst);
390             // Set alpha to 0xFF and replicate source four times in SSE register.
391             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
392             // Set srcA_sse to contain eight copies of srcA, padded with zero.
393             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
394             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
395             while (width >= 4) {
396                 // Load four destination pixels into dst_sse.
397                 __m128i dst_sse = _mm_load_si128(d);
398                 // Load four 16-bit masks into lower half of mask_sse.
399                 __m128i mask_sse = _mm_loadl_epi64(
400                                        reinterpret_cast<const __m128i*>(mask));
401 
402                 // Check whether masks are equal to 0 and get the highest bit
403                 // of each byte of result, if masks are all zero, we will get
404                 // pack_cmp to 0xFFFF
405                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
406                                                  _mm_setzero_si128()));
407 
408                 // if mask pixels are not all zero, we will blend the dst pixels
409                 if (pack_cmp != 0xFFFF) {
410                     // Unpack 4 16bit mask pixels to
411                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
412                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
413                     mask_sse = _mm_unpacklo_epi16(mask_sse,
414                                                   _mm_setzero_si128());
415 
416                     // Process 4 32bit dst pixels
417                     __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
418                     _mm_store_si128(d, result);
419                 }
420 
421                 d++;
422                 mask += 4;
423                 width -= 4;
424             }
425 
426             dst = reinterpret_cast<SkPMColor*>(d);
427         }
428 
429         while (width > 0) {
430             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
431             mask++;
432             dst++;
433             width--;
434         }
435     }
436 
437 #elif defined(SK_ARM_HAS_NEON)
438     #include <arm_neon.h>
439 
// Index of each colour component within a uint8x8x4_t filled by vld4_u8 from
// 32-bit pixels: the component's bit shift converted to a byte offset.
#define NEON_A (SK_A32_SHIFT >> 3)
#define NEON_R (SK_R32_SHIFT >> 3)
#define NEON_G (SK_G32_SHIFT >> 3)
#define NEON_B (SK_B32_SHIFT >> 3)
444 
// NEON counterpart of blend_32 for eight components at once:
//   result = dst + (((src - dst) * scale) >> 5)
// src and dst hold 8-bit components; scale holds the per-lane weight as
// 16-bit values. The arithmetic is carried out in signed 16-bit lanes and the
// result is narrowed back to 8 bits by truncation.
static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
    const int16x8_t srcWide = vreinterpretq_s16_u16(vmovl_u8(src));
    const int16x8_t dstWide = vreinterpretq_s16_u16(vmovl_u8(dst));

    // (src - dst) may be negative, hence the signed multiply and the
    // arithmetic right shift.
    const int16x8_t weighted =
            vmulq_s16(vsubq_s16(srcWide, dstWide), vreinterpretq_s16_u16(scale));
    const int16x8_t blended = vaddq_s16(dstWide, vshrq_n_s16(weighted, 5));

    return vmovn_u16(vreinterpretq_u16_s16(blended));
}
457 
// NEON row blitter for an LCD16 mask and an opaque colour. Blends 'width'
// pixels of 'dst' in place; 'src' holds one 16-bit mask per pixel. Pixels
// whose mask is 0xFFFF are replaced by 'opaqueDst'.
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
                           SkColor color, int width,
                           SkPMColor opaqueDst) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);

    // Loop-invariant vectors.
    const uint8x8_t vcolR = vdup_n_u8(colR);
    const uint8x8_t vcolG = vdup_n_u8(colG);
    const uint8x8_t vcolB = vdup_n_u8(colB);
    const uint8x8_t vopaqueA = vdup_n_u8(SkGetPackedA32(opaqueDst));
    const uint8x8_t vopaqueR = vdup_n_u8(SkGetPackedR32(opaqueDst));
    const uint8x8_t vopaqueG = vdup_n_u8(SkGetPackedG32(opaqueDst));
    const uint8x8_t vopaqueB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    const uint8x8_t vfullAlpha = vdup_n_u8(0xFF);
    const uint16x8_t vnoCoverage = vdupq_n_u16(0);
    const uint16x8_t vfullCoverage = vdupq_n_u16(0xFFFF);
    const uint16x8_t vblueBits = vdupq_n_u16(SK_B16_MASK);

    // Eight pixels per iteration.
    for (; width >= 8; dst += 8, src += 8, width -= 8) {
        // De-interleaving load: one 8-byte vector per colour component.
        uint8x8x4_t px = vld4_u8((uint8_t*)dst);
        const uint16x8_t cov = vld1q_u16(src);

        // Per-pixel byte selectors: 0xFF where the mask is 0 / 0xFFFF.
        const uint8x8_t isClear = vmovn_u16(vceqq_u16(cov, vnoCoverage));
        const uint8x8_t isSolid = vmovn_u16(vceqq_u16(cov, vfullCoverage));

        // Pull the three coverage fields out of the packed mask, 5 bits each.
        // Green: shift the red bits out the top, then shift down far enough
        // to also drop the field's lowest bit.
        uint16x8_t covR = vshrq_n_u16(cov, SK_R16_SHIFT);
        uint16x8_t covG = vshrq_n_u16(vshlq_n_u16(cov, SK_R16_BITS),
                                      SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t covB = vandq_u16(cov, vblueBits);

        // Stretch 0..31 to 0..32.
        covR = vaddq_u16(covR, vshrq_n_u16(covR, 4));
        covG = vaddq_u16(covG, vshrq_n_u16(covG, 4));
        covB = vaddq_u16(covB, vshrq_n_u16(covB, 4));

        // Alpha: keep dst's alpha where the mask is 0, otherwise 0xFF; then
        // take opaqueDst's alpha where the mask is 0xFFFF.
        const uint8x8_t alpha = vbsl_u8(isClear, px.val[NEON_A], vfullAlpha);
        px.val[NEON_A] = vbsl_u8(isSolid, vopaqueA, alpha);

        // Colour: blend, then override with opaqueDst where the mask is 0xFFFF.
        px.val[NEON_R] = vbsl_u8(isSolid, vopaqueR,
                                 blend_32_neon(vcolR, px.val[NEON_R], covR));
        px.val[NEON_G] = vbsl_u8(isSolid, vopaqueG,
                                 blend_32_neon(vcolG, px.val[NEON_G], covG));
        px.val[NEON_B] = vbsl_u8(isSolid, vopaqueB,
                                 blend_32_neon(vcolB, px.val[NEON_B], covB));

        vst4_u8((uint8_t*)dst, px);
    }

    // Fewer than eight pixels left: finish with the scalar routine.
    for (; width > 0; ++dst, ++src, --width) {
        *dst = blend_lcd16_opaque(colR, colG, colB, *dst, *src, opaqueDst);
    }
}
520 
// NEON row blitter for an LCD16 mask and a colour that may be translucent.
// Blends 'width' pixels of 'dst' in place; 'src' holds one 16-bit mask per
// pixel. The trailing SkPMColor parameter is unused; it is present so the
// signature matches blit_row_lcd16_opaque.
void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                    SkColor color, int width, SkPMColor) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);
    // Alpha on a 0..256 scale, as blend_lcd16 expects.
    const int colA = SkAlpha255To256(SkColorGetA(color));

    // Loop-invariant vectors.
    const uint16x8_t vcolA = vdupq_n_u16(colA);
    const uint8x8_t vcolR = vdup_n_u8(colR);
    const uint8x8_t vcolG = vdup_n_u8(colG);
    const uint8x8_t vcolB = vdup_n_u8(colB);
    const uint16x8_t vblueBits = vdupq_n_u16(SK_B16_MASK);

    // Eight pixels per iteration.
    for (; width >= 8; dst += 8, src += 8, width -= 8) {
        // De-interleaving load: one 8-byte vector per colour component.
        uint8x8x4_t px = vld4_u8((uint8_t*)dst);
        const uint16x8_t cov = vld1q_u16(src);

        // Pull the three coverage fields out of the packed mask, 5 bits each.
        // Green: shift the red bits out the top, then shift down far enough
        // to also drop the field's lowest bit.
        uint16x8_t covR = vshrq_n_u16(cov, SK_R16_SHIFT);
        uint16x8_t covG = vshrq_n_u16(vshlq_n_u16(cov, SK_R16_BITS),
                                      SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t covB = vandq_u16(cov, vblueBits);

        // Stretch 0..31 to 0..32.
        covR = vaddq_u16(covR, vshrq_n_u16(covR, 4));
        covG = vaddq_u16(covG, vshrq_n_u16(covG, 4));
        covB = vaddq_u16(covB, vshrq_n_u16(covB, 4));

        // Modulate coverage by the colour's alpha: (coverage * alpha) >> 8.
        covR = vshrq_n_u16(vmulq_u16(covR, vcolA), 8);
        covG = vshrq_n_u16(vmulq_u16(covG, vcolA), 8);
        covB = vshrq_n_u16(vmulq_u16(covB, vcolA), 8);

        // Alpha is written as 0xFF for every pixel in the vector loop.
        px.val[NEON_A] = vdup_n_u8(0xFF);
        px.val[NEON_R] = blend_32_neon(vcolR, px.val[NEON_R], covR);
        px.val[NEON_G] = blend_32_neon(vcolG, px.val[NEON_G], covG);
        px.val[NEON_B] = blend_32_neon(vcolB, px.val[NEON_B], covB);

        vst4_u8((uint8_t*)dst, px);
    }

    // Fewer than eight pixels left: finish with the scalar routine.
    for (; width > 0; ++dst, ++src, --width) {
        *dst = blend_lcd16(colA, colR, colG, colB, *dst, *src);
    }
}
574 
575 #else
576 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)577     static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
578                                       SkColor src, int width, SkPMColor) {
579         int srcA = SkColorGetA(src);
580         int srcR = SkColorGetR(src);
581         int srcG = SkColorGetG(src);
582         int srcB = SkColorGetB(src);
583 
584         srcA = SkAlpha255To256(srcA);
585 
586         for (int i = 0; i < width; i++) {
587             dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
588         }
589     }
590 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)591     static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
592                                              SkColor src, int width,
593                                              SkPMColor opaqueDst) {
594         int srcR = SkColorGetR(src);
595         int srcG = SkColorGetG(src);
596         int srcB = SkColorGetB(src);
597 
598         for (int i = 0; i < width; i++) {
599             dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
600         }
601     }
602 
603 #endif
604 
blit_color(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkColor color)605 static bool blit_color(const SkPixmap& device,
606                        const SkMask& mask,
607                        const SkIRect& clip,
608                        SkColor color) {
609     int x = clip.fLeft,
610         y = clip.fTop;
611 
612     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
613         SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
614                                  (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
615                                  color, clip.width(), clip.height());
616         return true;
617     }
618 
619     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
620         auto dstRow  = device.writable_addr32(x,y);
621         auto maskRow = (const uint16_t*)mask.getAddr(x,y);
622 
623         auto blit_row = blit_row_lcd16;
624         SkPMColor opaqueDst = 0;  // ignored unless opaque
625 
626         if (0xff == SkColorGetA(color)) {
627             blit_row  = blit_row_lcd16_opaque;
628             opaqueDst = SkPreMultiplyColor(color);
629         }
630 
631         for (int height = clip.height(); height --> 0; ) {
632             blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
633 
634             dstRow  = (SkPMColor*)     ((      char*) dstRow + device.rowBytes());
635             maskRow = (const uint16_t*)((const char*)maskRow +  mask.fRowBytes);
636         }
637         return true;
638     }
639 
640     return false;
641 }
642 
643 ///////////////////////////////////////////////////////////////////////////////
644 
SkARGB32_Blit32(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkPMColor srcColor)645 static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
646                             const SkIRect& clip, SkPMColor srcColor) {
647     U8CPU alpha = SkGetPackedA32(srcColor);
648     unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
649     if (alpha != 255) {
650         flags |= SkBlitRow::kGlobalAlpha_Flag32;
651     }
652     SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);
653 
654     int x = clip.fLeft;
655     int y = clip.fTop;
656     int width = clip.width();
657     int height = clip.height();
658 
659     SkPMColor* dstRow = device.writable_addr32(x, y);
660     const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
661 
662     do {
663         proc(dstRow, srcRow, width, alpha);
664         dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
665         srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
666     } while (--height != 0);
667 }
668 
669 //////////////////////////////////////////////////////////////////////////////////////
670 
SkARGB32_Blitter(const SkPixmap & device,const SkPaint & paint)671 SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
672         : INHERITED(device) {
673     SkColor color = paint.getColor();
674     fColor = color;
675 
676     fSrcA = SkColorGetA(color);
677     unsigned scale = SkAlpha255To256(fSrcA);
678     fSrcR = SkAlphaMul(SkColorGetR(color), scale);
679     fSrcG = SkAlphaMul(SkColorGetG(color), scale);
680     fSrcB = SkAlphaMul(SkColorGetB(color), scale);
681 
682     fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
683 }
684 
justAnOpaqueColor(uint32_t * value)685 const SkPixmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
686     if (255 == fSrcA) {
687         *value = fPMColor;
688         return &fDevice;
689     }
690     return nullptr;
691 }
692 
693 #if defined _WIN32  // disable warning : local variable used without having been initialized
694 #pragma warning ( push )
695 #pragma warning ( disable : 4701 )
696 #endif
697 
blitH(int x,int y,int width)698 void SkARGB32_Blitter::blitH(int x, int y, int width) {
699     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
700 
701     uint32_t* device = fDevice.writable_addr32(x, y);
702     SkBlitRow::Color32(device, device, width, fPMColor);
703 }
704 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])705 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
706                                  const int16_t runs[]) {
707     if (fSrcA == 0) {
708         return;
709     }
710 
711     uint32_t    color = fPMColor;
712     uint32_t*   device = fDevice.writable_addr32(x, y);
713     unsigned    opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
714 
715     for (;;) {
716         int count = runs[0];
717         SkASSERT(count >= 0);
718         if (count <= 0) {
719             return;
720         }
721         unsigned aa = antialias[0];
722         if (aa) {
723             if ((opaqueMask & aa) == 255) {
724                 sk_memset32(device, color, count);
725             } else {
726                 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
727                 SkBlitRow::Color32(device, device, count, sc);
728             }
729         }
730         runs += count;
731         antialias += count;
732         device += count;
733     }
734 }
735 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)736 void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
737     uint32_t* device = fDevice.writable_addr32(x, y);
738     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
739 
740     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
741     device[1] = SkBlendARGB32(fPMColor, device[1], a1);
742 }
743 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)744 void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
745     uint32_t* device = fDevice.writable_addr32(x, y);
746     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
747 
748     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
749     device = (uint32_t*)((char*)device + fDevice.rowBytes());
750     device[0] = SkBlendARGB32(fPMColor, device[0], a1);
751 }
752 
753 //////////////////////////////////////////////////////////////////////////////////////
754 
// Writes `color` into dst[i] for every set bit of the 8-bit `mask`; the most
// significant bit (0x80) maps to dst[0] and the least significant to dst[7].
// Pixels whose bit is clear are left untouched.
//
// Every parameter is parenthesized so that expression arguments such as
// `a | b` group as intended. The arguments are still evaluated several times,
// so do not pass expressions with side effects.
#define solid_8_pixels(mask, dst, color)                \
    do {                                                \
        if ((mask) & 0x80) { (dst)[0] = (color); }      \
        if ((mask) & 0x40) { (dst)[1] = (color); }      \
        if ((mask) & 0x20) { (dst)[2] = (color); }      \
        if ((mask) & 0x10) { (dst)[3] = (color); }      \
        if ((mask) & 0x08) { (dst)[4] = (color); }      \
        if ((mask) & 0x04) { (dst)[5] = (color); }      \
        if ((mask) & 0x02) { (dst)[6] = (color); }      \
        if ((mask) & 0x01) { (dst)[7] = (color); }      \
    } while (0)
766 
767 #define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
768 #define SK_BLITBWMASK_ARGS                  , SkPMColor color
769 #define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
770 #define SK_BLITBWMASK_GETADDR               writable_addr32
771 #define SK_BLITBWMASK_DEVTYPE               uint32_t
772 #include "src/core/SkBlitBWMaskTemplate.h"
773 
// For every set bit of the 8-bit `mask`, replaces dst[i] with
// sc + SkAlphaMulQ(dst[i], dst_scale); the most significant bit (0x80) maps to
// dst[0] and the least significant to dst[7]. Pixels whose bit is clear are
// left untouched.
//
// Every parameter is parenthesized so that expression arguments group as
// intended. The arguments are still evaluated several times, so do not pass
// expressions with side effects.
#define blend_8_pixels(mask, dst, sc, dst_scale)                                        \
    do {                                                                                \
        if ((mask) & 0x80) { (dst)[0] = (sc) + SkAlphaMulQ((dst)[0], (dst_scale)); }    \
        if ((mask) & 0x40) { (dst)[1] = (sc) + SkAlphaMulQ((dst)[1], (dst_scale)); }    \
        if ((mask) & 0x20) { (dst)[2] = (sc) + SkAlphaMulQ((dst)[2], (dst_scale)); }    \
        if ((mask) & 0x10) { (dst)[3] = (sc) + SkAlphaMulQ((dst)[3], (dst_scale)); }    \
        if ((mask) & 0x08) { (dst)[4] = (sc) + SkAlphaMulQ((dst)[4], (dst_scale)); }    \
        if ((mask) & 0x04) { (dst)[5] = (sc) + SkAlphaMulQ((dst)[5], (dst_scale)); }    \
        if ((mask) & 0x02) { (dst)[6] = (sc) + SkAlphaMulQ((dst)[6], (dst_scale)); }    \
        if ((mask) & 0x01) { (dst)[7] = (sc) + SkAlphaMulQ((dst)[7], (dst_scale)); }    \
    } while (0)
785 
786 #define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
787 #define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
788 #define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
789 #define SK_BLITBWMASK_GETADDR               writable_addr32
790 #define SK_BLITBWMASK_DEVTYPE               uint32_t
791 #include "src/core/SkBlitBWMaskTemplate.h"
792 
blitMask(const SkMask & mask,const SkIRect & clip)793 void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
794     SkASSERT(mask.fBounds.contains(clip));
795     SkASSERT(fSrcA != 0xFF);
796 
797     if (fSrcA == 0) {
798         return;
799     }
800 
801     if (blit_color(fDevice, mask, clip, fColor)) {
802         return;
803     }
804 
805     switch (mask.fFormat) {
806         case SkMask::kBW_Format:
807             SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
808             break;
809         case SkMask::kARGB32_Format:
810             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
811             break;
812         default:
813             SK_ABORT("Mask format not handled.");
814     }
815 }
816 
blitMask(const SkMask & mask,const SkIRect & clip)817 void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
818                                        const SkIRect& clip) {
819     SkASSERT(mask.fBounds.contains(clip));
820 
821     if (blit_color(fDevice, mask, clip, fColor)) {
822         return;
823     }
824 
825     switch (mask.fFormat) {
826         case SkMask::kBW_Format:
827             SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
828             break;
829         case SkMask::kARGB32_Format:
830             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
831             break;
832         default:
833             SK_ABORT("Mask format not handled.");
834     }
835 }
836 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)837 void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
838     uint32_t* device = fDevice.writable_addr32(x, y);
839     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
840 
841     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
842     device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
843 }
844 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)845 void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
846     uint32_t* device = fDevice.writable_addr32(x, y);
847     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
848 
849     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
850     device = (uint32_t*)((char*)device + fDevice.rowBytes());
851     device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
852 }
853 
854 ///////////////////////////////////////////////////////////////////////////////
855 
blitV(int x,int y,int height,SkAlpha alpha)856 void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
857     if (alpha == 0 || fSrcA == 0) {
858         return;
859     }
860 
861     uint32_t* device = fDevice.writable_addr32(x, y);
862     uint32_t  color = fPMColor;
863 
864     if (alpha != 255) {
865         color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
866     }
867 
868     unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
869     size_t rowBytes = fDevice.rowBytes();
870     while (--height >= 0) {
871         device[0] = color + SkAlphaMulQ(device[0], dst_scale);
872         device = (uint32_t*)((char*)device + rowBytes);
873     }
874 }
875 
blitRect(int x,int y,int width,int height)876 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
877     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
878 
879     if (fSrcA == 0) {
880         return;
881     }
882 
883     uint32_t*   device = fDevice.writable_addr32(x, y);
884     uint32_t    color = fPMColor;
885     size_t      rowBytes = fDevice.rowBytes();
886 
887     if (SkGetPackedA32(fPMColor) == 0xFF) {
888         SkOpts::rect_memset32(device, color, width, rowBytes, height);
889     } else {
890         while (height --> 0) {
891             SkBlitRow::Color32(device, device, width, color);
892             device = (uint32_t*)((char*)device + rowBytes);
893         }
894     }
895 }
896 
897 #if defined _WIN32
898 #pragma warning ( pop )
899 #endif
900 
901 ///////////////////////////////////////////////////////////////////////
902 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])903 void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
904                                        const int16_t runs[]) {
905     uint32_t*   device = fDevice.writable_addr32(x, y);
906     SkPMColor   black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);
907 
908     for (;;) {
909         int count = runs[0];
910         SkASSERT(count >= 0);
911         if (count <= 0) {
912             return;
913         }
914         unsigned aa = antialias[0];
915         if (aa) {
916             if (aa == 255) {
917                 sk_memset32(device, black, count);
918             } else {
919                 SkPMColor src = aa << SK_A32_SHIFT;
920                 unsigned dst_scale = 256 - aa;
921                 int n = count;
922                 do {
923                     --n;
924                     device[n] = src + SkAlphaMulQ(device[n], dst_scale);
925                 } while (n > 0);
926             }
927         }
928         runs += count;
929         antialias += count;
930         device += count;
931     }
932 }
933 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)934 void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
935     uint32_t* device = fDevice.writable_addr32(x, y);
936     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
937 
938     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
939     device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
940 }
941 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)942 void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
943     uint32_t* device = fDevice.writable_addr32(x, y);
944     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
945 
946     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
947     device = (uint32_t*)((char*)device + fDevice.rowBytes());
948     device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
949 }
950 
951 ///////////////////////////////////////////////////////////////////////////////
952 
953 // Special version of SkBlitRow::Factory32 that knows we're in kSrc_Mode,
954 // instead of kSrcOver_Mode
blend_srcmode(SkPMColor * SK_RESTRICT device,const SkPMColor * SK_RESTRICT span,int count,U8CPU aa)955 static void blend_srcmode(SkPMColor* SK_RESTRICT device,
956                           const SkPMColor* SK_RESTRICT span,
957                           int count, U8CPU aa) {
958     int aa256 = SkAlpha255To256(aa);
959     for (int i = 0; i < count; ++i) {
960         device[i] = SkFourByteInterp256(span[i], device[i], aa256);
961     }
962 }
963 
SkARGB32_Shader_Blitter(const SkPixmap & device,const SkPaint & paint,SkShaderBase::Context * shaderContext)964 SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
965         const SkPaint& paint, SkShaderBase::Context* shaderContext)
966     : INHERITED(device, paint, shaderContext)
967 {
968     fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));
969 
970     fXfermode = SkXfermode::Peek(paint.getBlendMode());
971 
972     int flags = 0;
973     if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
974         flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
975     }
976     // we call this on the output from the shader
977     fProc32 = SkBlitRow::Factory32(flags);
978     // we call this on the output from the shader + alpha from the aa buffer
979     fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);
980 
981     fShadeDirectlyIntoDevice = false;
982     if (fXfermode == nullptr) {
983         if (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag) {
984             fShadeDirectlyIntoDevice = true;
985         }
986     } else {
987         if (SkBlendMode::kSrc == paint.getBlendMode()) {
988             fShadeDirectlyIntoDevice = true;
989             fProc32Blend = blend_srcmode;
990         }
991     }
992 
993     fConstInY = SkToBool(shaderContext->getFlags() & SkShaderBase::kConstInY32_Flag);
994 }
995 
~SkARGB32_Shader_Blitter()996 SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
997     sk_free(fBuffer);
998 }
999 
blitH(int x,int y,int width)1000 void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
1001     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1002 
1003     uint32_t* device = fDevice.writable_addr32(x, y);
1004 
1005     if (fShadeDirectlyIntoDevice) {
1006         fShaderContext->shadeSpan(x, y, device, width);
1007     } else {
1008         SkPMColor*  span = fBuffer;
1009         fShaderContext->shadeSpan(x, y, span, width);
1010         if (fXfermode) {
1011             fXfermode->xfer32(device, span, width, nullptr);
1012         } else {
1013             fProc32(device, span, width, 255);
1014         }
1015     }
1016 }
1017 
blitRect(int x,int y,int width,int height)1018 void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
1019     SkASSERT(x >= 0 && y >= 0 &&
1020              x + width <= fDevice.width() && y + height <= fDevice.height());
1021 
1022     uint32_t*  device = fDevice.writable_addr32(x, y);
1023     size_t     deviceRB = fDevice.rowBytes();
1024     auto*      shaderContext = fShaderContext;
1025     SkPMColor* span = fBuffer;
1026 
1027     if (fConstInY) {
1028         if (fShadeDirectlyIntoDevice) {
1029             // shade the first row directly into the device
1030             shaderContext->shadeSpan(x, y, device, width);
1031             span = device;
1032             while (--height > 0) {
1033                 device = (uint32_t*)((char*)device + deviceRB);
1034                 memcpy(device, span, width << 2);
1035             }
1036         } else {
1037             shaderContext->shadeSpan(x, y, span, width);
1038             SkXfermode* xfer = fXfermode;
1039             if (xfer) {
1040                 do {
1041                     xfer->xfer32(device, span, width, nullptr);
1042                     y += 1;
1043                     device = (uint32_t*)((char*)device + deviceRB);
1044                 } while (--height > 0);
1045             } else {
1046                 SkBlitRow::Proc32 proc = fProc32;
1047                 do {
1048                     proc(device, span, width, 255);
1049                     y += 1;
1050                     device = (uint32_t*)((char*)device + deviceRB);
1051                 } while (--height > 0);
1052             }
1053         }
1054         return;
1055     }
1056 
1057     if (fShadeDirectlyIntoDevice) {
1058         do {
1059             shaderContext->shadeSpan(x, y, device, width);
1060             y += 1;
1061             device = (uint32_t*)((char*)device + deviceRB);
1062         } while (--height > 0);
1063     } else {
1064         SkXfermode* xfer = fXfermode;
1065         if (xfer) {
1066             do {
1067                 shaderContext->shadeSpan(x, y, span, width);
1068                 xfer->xfer32(device, span, width, nullptr);
1069                 y += 1;
1070                 device = (uint32_t*)((char*)device + deviceRB);
1071             } while (--height > 0);
1072         } else {
1073             SkBlitRow::Proc32 proc = fProc32;
1074             do {
1075                 shaderContext->shadeSpan(x, y, span, width);
1076                 proc(device, span, width, 255);
1077                 y += 1;
1078                 device = (uint32_t*)((char*)device + deviceRB);
1079             } while (--height > 0);
1080         }
1081     }
1082 }
1083 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1084 void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1085                                         const int16_t runs[]) {
1086     SkPMColor* span = fBuffer;
1087     uint32_t*  device = fDevice.writable_addr32(x, y);
1088     auto*      shaderContext = fShaderContext;
1089 
1090     if (fXfermode && !fShadeDirectlyIntoDevice) {
1091         for (;;) {
1092             SkXfermode* xfer = fXfermode;
1093 
1094             int count = *runs;
1095             if (count <= 0)
1096                 break;
1097             int aa = *antialias;
1098             if (aa) {
1099                 shaderContext->shadeSpan(x, y, span, count);
1100                 if (aa == 255) {
1101                     xfer->xfer32(device, span, count, nullptr);
1102                 } else {
1103                     // count is almost always 1
1104                     for (int i = count - 1; i >= 0; --i) {
1105                         xfer->xfer32(&device[i], &span[i], 1, antialias);
1106                     }
1107                 }
1108             }
1109             device += count;
1110             runs += count;
1111             antialias += count;
1112             x += count;
1113         }
1114     } else if (fShadeDirectlyIntoDevice ||
1115                (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1116         for (;;) {
1117             int count = *runs;
1118             if (count <= 0) {
1119                 break;
1120             }
1121             int aa = *antialias;
1122             if (aa) {
1123                 if (aa == 255) {
1124                     // cool, have the shader draw right into the device
1125                     shaderContext->shadeSpan(x, y, device, count);
1126                 } else {
1127                     shaderContext->shadeSpan(x, y, span, count);
1128                     fProc32Blend(device, span, count, aa);
1129                 }
1130             }
1131             device += count;
1132             runs += count;
1133             antialias += count;
1134             x += count;
1135         }
1136     } else {
1137         for (;;) {
1138             int count = *runs;
1139             if (count <= 0) {
1140                 break;
1141             }
1142             int aa = *antialias;
1143             if (aa) {
1144                 shaderContext->shadeSpan(x, y, span, count);
1145                 if (aa == 255) {
1146                     fProc32(device, span, count, 255);
1147                 } else {
1148                     fProc32Blend(device, span, count, aa);
1149                 }
1150             }
1151             device += count;
1152             runs += count;
1153             antialias += count;
1154             x += count;
1155         }
1156     }
1157 }
1158 
1159 using U32  = skvx::Vec< 4, uint32_t>;
1160 using U8x4 = skvx::Vec<16, uint8_t>;
1161 using U8   = skvx::Vec< 4, uint8_t>;
1162 
drive(SkPMColor * dst,const SkPMColor * src,const uint8_t * cov,int n,U8x4 (* kernel)(U8x4,U8x4,U8x4))1163 static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
1164                   U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
1165 
1166     auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
1167         U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
1168         return skvx::bit_pun<U32>(kernel(skvx::bit_pun<U8x4>(dst),
1169                                          skvx::bit_pun<U8x4>(src),
1170                                          cov_splat));
1171     };
1172     while (n >= 4) {
1173         apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
1174         dst += 4;
1175         src += 4;
1176         cov += 4;
1177         n   -= 4;
1178     }
1179     while (n --> 0) {
1180         *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
1181         dst++;
1182         src++;
1183         cov++;
1184     }
1185 }
1186 
blend_row_A8(SkPMColor * dst,const void * mask,const SkPMColor * src,int n)1187 static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1188     auto cov = (const uint8_t*)mask;
1189     drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1190         U8x4 s_aa  = skvx::approx_scale(s, c),
1191              alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
1192         return s_aa + skvx::approx_scale(d, 255 - alpha);
1193     });
1194 }
1195 
blend_row_A8_opaque(SkPMColor * dst,const void * mask,const SkPMColor * src,int n)1196 static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1197     auto cov = (const uint8_t*)mask;
1198     drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1199         return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>(  c  )
1200                            + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
1201     });
1202 }
1203 
blend_row_lcd16(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1204 static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1205     auto src_alpha_blend = [](int s, int d, int sa, int m) {
1206         return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
1207     };
1208 
1209     auto upscale_31_to_255 = [](int v) {
1210         return (v << 3) | (v >> 2);
1211     };
1212 
1213     auto mask = (const uint16_t*)vmask;
1214     for (int i = 0; i < n; ++i) {
1215         uint16_t m = mask[i];
1216         if (0 == m) {
1217             continue;
1218         }
1219 
1220         SkPMColor s = src[i];
1221         SkPMColor d = dst[i];
1222 
1223         int srcA = SkGetPackedA32(s);
1224         int srcR = SkGetPackedR32(s);
1225         int srcG = SkGetPackedG32(s);
1226         int srcB = SkGetPackedB32(s);
1227 
1228         srcA += srcA >> 7;
1229 
1230         // We're ignoring the least significant bit of the green coverage channel here.
1231         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1232         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1233         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1234 
1235         // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
1236         maskR = upscale_31_to_255(maskR);
1237         maskG = upscale_31_to_255(maskG);
1238         maskB = upscale_31_to_255(maskB);
1239 
1240         // This LCD blit routine only works if the destination is opaque.
1241         dst[i] = SkPackARGB32(0xFF,
1242                               src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
1243                               src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
1244                               src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
1245     }
1246 }
1247 
blend_row_LCD16_opaque(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1248 static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1249     auto mask = (const uint16_t*)vmask;
1250 
1251     for (int i = 0; i < n; ++i) {
1252         uint16_t m = mask[i];
1253         if (0 == m) {
1254             continue;
1255         }
1256 
1257         SkPMColor s = src[i];
1258         SkPMColor d = dst[i];
1259 
1260         int srcR = SkGetPackedR32(s);
1261         int srcG = SkGetPackedG32(s);
1262         int srcB = SkGetPackedB32(s);
1263 
1264         // We're ignoring the least significant bit of the green coverage channel here.
1265         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1266         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1267         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1268 
1269         // Now upscale them to 0..32, so we can use blend_32.
1270         maskR = upscale_31_to_32(maskR);
1271         maskG = upscale_31_to_32(maskG);
1272         maskB = upscale_31_to_32(maskB);
1273 
1274         // This LCD blit routine only works if the destination is opaque.
1275         dst[i] = SkPackARGB32(0xFF,
1276                               blend_32(srcR, SkGetPackedR32(d), maskR),
1277                               blend_32(srcG, SkGetPackedG32(d), maskG),
1278                               blend_32(srcB, SkGetPackedB32(d), maskB));
1279     }
1280 }
1281 
blitMask(const SkMask & mask,const SkIRect & clip)1282 void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1283     // we only handle kA8 with an xfermode
1284     if (fXfermode && (SkMask::kA8_Format != mask.fFormat)) {
1285         this->INHERITED::blitMask(mask, clip);
1286         return;
1287     }
1288 
1289     SkASSERT(mask.fBounds.contains(clip));
1290 
1291     void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1292 
1293     if (!fXfermode) {
1294         bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1295 
1296         if (mask.fFormat == SkMask::kA8_Format && opaque) {
1297             blend_row = blend_row_A8_opaque;
1298         } else if (mask.fFormat == SkMask::kA8_Format) {
1299             blend_row = blend_row_A8;
1300         } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1301             blend_row = blend_row_LCD16_opaque;
1302         } else if (mask.fFormat == SkMask::kLCD16_Format) {
1303             blend_row = blend_row_lcd16;
1304         } else {
1305             this->INHERITED::blitMask(mask, clip);
1306             return;
1307         }
1308     }
1309 
1310     const int x = clip.fLeft;
1311     const int width = clip.width();
1312     int y = clip.fTop;
1313     int height = clip.height();
1314 
1315     char* dstRow = (char*)fDevice.writable_addr32(x, y);
1316     const size_t dstRB = fDevice.rowBytes();
1317     const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1318     const size_t maskRB = mask.fRowBytes;
1319 
1320     SkPMColor* span = fBuffer;
1321 
1322     if (fXfermode) {
1323         SkASSERT(SkMask::kA8_Format == mask.fFormat);
1324         SkXfermode* xfer = fXfermode;
1325         do {
1326             fShaderContext->shadeSpan(x, y, span, width);
1327             xfer->xfer32(reinterpret_cast<SkPMColor*>(dstRow), span, width, maskRow);
1328             dstRow += dstRB;
1329             maskRow += maskRB;
1330             y += 1;
1331         } while (--height > 0);
1332     } else {
1333         SkASSERT(blend_row);
1334         do {
1335             fShaderContext->shadeSpan(x, y, span, width);
1336             blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1337             dstRow += dstRB;
1338             maskRow += maskRB;
1339             y += 1;
1340         } while (--height > 0);
1341     }
1342 }
1343 
blitV(int x,int y,int height,SkAlpha alpha)1344 void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1345     SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
1346 
1347     uint32_t* device = fDevice.writable_addr32(x, y);
1348     size_t    deviceRB = fDevice.rowBytes();
1349 
1350     if (fConstInY) {
1351         SkPMColor c;
1352         fShaderContext->shadeSpan(x, y, &c, 1);
1353 
1354         if (fShadeDirectlyIntoDevice) {
1355             if (255 == alpha) {
1356                 do {
1357                     *device = c;
1358                     device = (uint32_t*)((char*)device + deviceRB);
1359                 } while (--height > 0);
1360             } else {
1361                 do {
1362                     *device = SkFourByteInterp(c, *device, alpha);
1363                     device = (uint32_t*)((char*)device + deviceRB);
1364                 } while (--height > 0);
1365             }
1366         } else {
1367             SkXfermode* xfer = fXfermode;
1368             if (xfer) {
1369                 do {
1370                     xfer->xfer32(device, &c, 1, &alpha);
1371                     device = (uint32_t*)((char*)device + deviceRB);
1372                 } while (--height > 0);
1373             } else {
1374                 SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
1375                 do {
1376                     proc(device, &c, 1, alpha);
1377                     device = (uint32_t*)((char*)device + deviceRB);
1378                 } while (--height > 0);
1379             }
1380         }
1381         return;
1382     }
1383 
1384     if (fShadeDirectlyIntoDevice) {
1385         if (255 == alpha) {
1386             do {
1387                 fShaderContext->shadeSpan(x, y, device, 1);
1388                 y += 1;
1389                 device = (uint32_t*)((char*)device + deviceRB);
1390             } while (--height > 0);
1391         } else {
1392             do {
1393                 SkPMColor c;
1394                 fShaderContext->shadeSpan(x, y, &c, 1);
1395                 *device = SkFourByteInterp(c, *device, alpha);
1396                 y += 1;
1397                 device = (uint32_t*)((char*)device + deviceRB);
1398             } while (--height > 0);
1399         }
1400     } else {
1401         SkPMColor* span = fBuffer;
1402         SkXfermode* xfer = fXfermode;
1403         if (xfer) {
1404             do {
1405                 fShaderContext->shadeSpan(x, y, span, 1);
1406                 xfer->xfer32(device, span, 1, &alpha);
1407                 y += 1;
1408                 device = (uint32_t*)((char*)device + deviceRB);
1409             } while (--height > 0);
1410         } else {
1411             SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
1412             do {
1413                 fShaderContext->shadeSpan(x, y, span, 1);
1414                 proc(device, span, 1, alpha);
1415                 y += 1;
1416                 device = (uint32_t*)((char*)device + deviceRB);
1417             } while (--height > 0);
1418         }
1419     }
1420 }
1421