• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2006 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColor.h"
9 #include "include/core/SkColorPriv.h"
10 #include "include/core/SkColorType.h"
11 #include "include/core/SkPaint.h"
12 #include "include/core/SkPixmap.h"
13 #include "include/core/SkRect.h"
14 #include "include/core/SkTypes.h"
15 #include "include/private/SkColorData.h"
16 #include "include/private/base/SkCPUTypes.h"
17 #include "include/private/base/SkDebug.h"
18 #include "include/private/base/SkMalloc.h"
19 #include "include/private/base/SkTo.h"
20 #include "src/base/SkUtils.h"
21 #include "src/base/SkVx.h"
22 #include "src/core/SkBlitMask.h"
23 #include "src/core/SkBlitRow.h"
24 #include "src/core/SkCoreBlitters.h"
25 #include "src/core/SkMask.h"
26 #include "src/core/SkMemset.h"
27 #include "src/shaders/SkShaderBase.h"
28 
29 #include <algorithm>
30 #include <cstddef>
31 #include <cstdint>
32 
// Stretches a 5-bit value (0..31) onto 0..32 by adding its top bit back in:
// 0..15 are returned unchanged, 16..31 come back as 17..32.
static inline int upscale_31_to_32(int value) {
    SkASSERT((unsigned)value <= 31);
    const int topBit = value >> 4;
    return value + topBit;
}
37 
// Moves dst toward src by scale/32, where scale is in 0..32 and src/dst are
// 8-bit component values. scale == 0 returns dst, scale == 32 returns src.
static inline int blend_32(int src, int dst, int scale) {
    SkASSERT((unsigned)src <= 0xFF);
    SkASSERT((unsigned)dst <= 0xFF);
    SkASSERT((unsigned)scale <= 32);
    const int weightedDelta = (src - dst) * scale;
    return dst + (weightedDelta >> 5);
}
44 
blend_lcd16(int srcA,int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask)45 static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
46                                      SkPMColor dst, uint16_t mask) {
47     if (mask == 0) {
48         return dst;
49     }
50 
51     /*  We want all of these in 5bits, hence the shifts in case one of them
52      *  (green) is 6bits.
53      */
54     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
55     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
56     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
57 
58     // Now upscale them to 0..32, so we can use blend32
59     maskR = upscale_31_to_32(maskR);
60     maskG = upscale_31_to_32(maskG);
61     maskB = upscale_31_to_32(maskB);
62 
63     // srcA has been upscaled to 256 before passed into this function
64     maskR = maskR * srcA >> 8;
65     maskG = maskG * srcA >> 8;
66     maskB = maskB * srcA >> 8;
67 
68     int dstA = SkGetPackedA32(dst);
69     int dstR = SkGetPackedR32(dst);
70     int dstG = SkGetPackedG32(dst);
71     int dstB = SkGetPackedB32(dst);
72 
73     // Subtract 1 from srcA to bring it back to [0-255] to compare against dstA, alpha needs to
74     // use either the min or the max of the LCD coverages. See https:/skbug.com/40037823
75     int maskA = (srcA-1) < dstA ? std::min(maskR, std::min(maskG, maskB))
76                                 : std::max(maskR, std::max(maskG, maskB));
77 
78     return SkPackARGB32(blend_32(0xFF, dstA, maskA),
79                         blend_32(srcR, dstR, maskR),
80                         blend_32(srcG, dstG, maskG),
81                         blend_32(srcB, dstB, maskB));
82 }
83 
blend_lcd16_opaque(int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask,SkPMColor opaqueDst)84 static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
85                                            SkPMColor dst, uint16_t mask,
86                                            SkPMColor opaqueDst) {
87     if (mask == 0) {
88         return dst;
89     }
90 
91     if (0xFFFF == mask) {
92         return opaqueDst;
93     }
94 
95     /*  We want all of these in 5bits, hence the shifts in case one of them
96      *  (green) is 6bits.
97      */
98     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
99     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
100     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
101 
102     // Now upscale them to 0..32, so we can use blend32
103     maskR = upscale_31_to_32(maskR);
104     maskG = upscale_31_to_32(maskG);
105     maskB = upscale_31_to_32(maskB);
106 
107     int dstA = SkGetPackedA32(dst);
108     int dstR = SkGetPackedR32(dst);
109     int dstG = SkGetPackedG32(dst);
110     int dstB = SkGetPackedB32(dst);
111 
112     // Opaque src alpha always uses the max of the LCD coverages.
113     int maskA = std::max(maskR, std::max(maskG, maskB));
114 
115     // LCD blitting is only supported if the dst is known/required
116     // to be opaque
117     return SkPackARGB32(blend_32(0xFF, dstA, maskA),
118                         blend_32(srcR, dstR, maskR),
119                         blend_32(srcG, dstG, maskG),
120                         blend_32(srcB, dstB, maskB));
121 }
122 
123 
124 // TODO: rewrite at least the SSE code here.  It's miserable.
125 
126 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
127     #include <emmintrin.h>
128 
// Net left shift that lines the top 5 bits of each RGB16 mask component up with
// the matching component of an SkPMColor. The RGB16 component order may differ
// from the SkPMColor order, so any of these may come out negative, in which
// case the conversion macro shifts right instead.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: moves the mask bits into place; the caller still has to AND the result.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
159 
blend_lcd16_sse2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)160     static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
161         // In the following comments, the components of src, dst and mask are
162         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
163         // by an R, G, B, or A suffix. Components of one of the four pixels that
164         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
165         // example is the blue channel of the second destination pixel. Memory
166         // layout is shown for an ARGB byte order in a color value.
167 
168         // src and srcA store 8-bit values interleaved with zeros.
169         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
170         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
171         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
172         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
173         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
174         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
175         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
176 
177         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
178         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
179         __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
180                                   _mm_set1_epi32(0x1F << SK_R32_SHIFT));
181 
182         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
183         __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
184                                   _mm_set1_epi32(0x1F << SK_G32_SHIFT));
185 
186         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
187         __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
188                                   _mm_set1_epi32(0x1F << SK_B32_SHIFT));
189 
190         // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
191         __m128i aMin = _mm_min_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
192                        _mm_min_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
193                                     _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
194         __m128i aMax = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
195                        _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
196                                     _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
197         // srcA has been biased to [0-256], so compare srcA against (dstA+1)
198         __m128i a = _mm_cmplt_epi32(srcA,
199                                     _mm_and_si128(
200                                             _mm_add_epi32(dst, _mm_set1_epi32(1 << SK_A32_SHIFT)),
201                                             _mm_set1_epi32(SK_A32_MASK)));
202         // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
203         a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));
204 
205         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
206         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
207         // 8-bit position
208         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
209         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
210         mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
211 
212         // Interleave R,G,B into the lower byte of word.
213         // i.e. split the sixteen 8-bit values from mask into two sets of eight
214         // 16-bit values, padded by zero.
215         __m128i maskLo, maskHi;
216         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
217         maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
218         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
219         maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
220 
221         // Upscale from 0..31 to 0..32
222         // (allows to replace division by left-shift further down)
223         // Left-shift each component by 4 and add the result back to that component,
224         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
225         maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
226         maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
227 
228         // Multiply each component of maskLo and maskHi by srcA
229         maskLo = _mm_mullo_epi16(maskLo, srcA);
230         maskHi = _mm_mullo_epi16(maskHi, srcA);
231 
232         // Left shift mask components by 8 (divide by 256)
233         maskLo = _mm_srli_epi16(maskLo, 8);
234         maskHi = _mm_srli_epi16(maskHi, 8);
235 
236         // Interleave R,G,B into the lower byte of the word
237         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
238         __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
239         // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
240         __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
241 
242         // mask = (src - dst) * mask
243         maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
244         maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
245 
246         // mask = (src - dst) * mask >> 5
247         maskLo = _mm_srai_epi16(maskLo, 5);
248         maskHi = _mm_srai_epi16(maskHi, 5);
249 
250         // Add two pixels into result.
251         // result = dst + ((src - dst) * mask >> 5)
252         __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
253         __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
254 
255         // Pack into 4 32bit dst pixels.
256         // resultLo and resultHi contain eight 16-bit components (two pixels) each.
257         // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
258         // clamping to 255 if necessary.
259         return _mm_packus_epi16(resultLo, resultHi);
260     }
261 
blend_lcd16_opaque_sse2(__m128i & src,__m128i & dst,__m128i & mask)262     static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
263         // In the following comments, the components of src, dst and mask are
264         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
265         // by an R, G, B, or A suffix. Components of one of the four pixels that
266         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
267         // example is the blue channel of the second destination pixel. Memory
268         // layout is shown for an ARGB byte order in a color value.
269 
270         // src and srcA store 8-bit values interleaved with zeros.
271         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
272         // mask stores 16-bit values (shown as high and low bytes) interleaved with
273         // zeros
274         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
275         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
276 
277         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
278         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
279         __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
280                                   _mm_set1_epi32(0x1F << SK_R32_SHIFT));
281 
282         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
283         __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
284                                   _mm_set1_epi32(0x1F << SK_G32_SHIFT));
285 
286         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
287         __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
288                                   _mm_set1_epi32(0x1F << SK_B32_SHIFT));
289 
290         // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
291         __m128i a = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
292                     _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
293                                  _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
294 
295         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
296         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
297         // 8-bit position
298         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
299         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
300         mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
301 
302         // Interleave R,G,B into the lower byte of word.
303         // i.e. split the sixteen 8-bit values from mask into two sets of eight
304         // 16-bit values, padded by zero.
305         __m128i maskLo, maskHi;
306         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
307         maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
308         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
309         maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
310 
311         // Upscale from 0..31 to 0..32
312         // (allows to replace division by left-shift further down)
313         // Left-shift each component by 4 and add the result back to that component,
314         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
315         maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
316         maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
317 
318         // Interleave R,G,B into the lower byte of the word
319         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
320         __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
321         // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
322         __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
323 
324         // mask = (src - dst) * mask
325         maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
326         maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
327 
328         // mask = (src - dst) * mask >> 5
329         maskLo = _mm_srai_epi16(maskLo, 5);
330         maskHi = _mm_srai_epi16(maskHi, 5);
331 
332         // Add two pixels into result.
333         // result = dst + ((src - dst) * mask >> 5)
334         __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
335         __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
336 
337         // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
338         // clamping to 255 if necessary.
339         return _mm_packus_epi16(resultLo, resultHi);
340     }
341 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)342     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
343         if (width <= 0) {
344             return;
345         }
346 
347         int srcA = SkColorGetA(src);
348         int srcR = SkColorGetR(src);
349         int srcG = SkColorGetG(src);
350         int srcB = SkColorGetB(src);
351 
352         srcA = SkAlpha255To256(srcA);
353 
354         if (width >= 4) {
355             SkASSERT(((size_t)dst & 0x03) == 0);
356             while (((size_t)dst & 0x0F) != 0) {
357                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
358                 mask++;
359                 dst++;
360                 width--;
361             }
362 
363             __m128i *d = reinterpret_cast<__m128i*>(dst);
364             // Set alpha to 0xFF and replicate source four times in SSE register.
365             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
366             // Interleave with zeros to get two sets of four 16-bit values.
367             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
368             // Set srcA_sse to contain eight copies of srcA, padded with zero.
369             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
370             __m128i srcA_sse = _mm_set1_epi16(srcA);
371             while (width >= 4) {
372                 // Load four destination pixels into dst_sse.
373                 __m128i dst_sse = _mm_load_si128(d);
374                 // Load four 16-bit masks into lower half of mask_sse.
375                 __m128i mask_sse = _mm_loadu_si64(mask);
376 
377                 // Check whether masks are equal to 0 and get the highest bit
378                 // of each byte of result, if masks are all zero, we will get
379                 // pack_cmp to 0xFFFF
380                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
381                                                  _mm_setzero_si128()));
382 
383                 // if mask pixels are not all zero, we will blend the dst pixels
384                 if (pack_cmp != 0xFFFF) {
385                     // Unpack 4 16bit mask pixels to
386                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
387                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
388                     mask_sse = _mm_unpacklo_epi16(mask_sse,
389                                                   _mm_setzero_si128());
390 
391                     // Process 4 32bit dst pixels
392                     __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
393                     _mm_store_si128(d, result);
394                 }
395 
396                 d++;
397                 mask += 4;
398                 width -= 4;
399             }
400 
401             dst = reinterpret_cast<SkPMColor*>(d);
402         }
403 
404         while (width > 0) {
405             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
406             mask++;
407             dst++;
408             width--;
409         }
410     }
411 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)412     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
413                                    SkColor src, int width, SkPMColor opaqueDst) {
414         if (width <= 0) {
415             return;
416         }
417 
418         int srcR = SkColorGetR(src);
419         int srcG = SkColorGetG(src);
420         int srcB = SkColorGetB(src);
421 
422         if (width >= 4) {
423             SkASSERT(((size_t)dst & 0x03) == 0);
424             while (((size_t)dst & 0x0F) != 0) {
425                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
426                 mask++;
427                 dst++;
428                 width--;
429             }
430 
431             __m128i *d = reinterpret_cast<__m128i*>(dst);
432             // Set alpha to 0xFF and replicate source four times in SSE register.
433             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
434             // Set srcA_sse to contain eight copies of srcA, padded with zero.
435             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
436             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
437             while (width >= 4) {
438                 // Load four destination pixels into dst_sse.
439                 __m128i dst_sse = _mm_load_si128(d);
440                 // Load four 16-bit masks into lower half of mask_sse.
441                 __m128i mask_sse = _mm_loadu_si64(mask);
442 
443                 // Check whether masks are equal to 0 and get the highest bit
444                 // of each byte of result, if masks are all zero, we will get
445                 // pack_cmp to 0xFFFF
446                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
447                                                  _mm_setzero_si128()));
448 
449                 // if mask pixels are not all zero, we will blend the dst pixels
450                 if (pack_cmp != 0xFFFF) {
451                     // Unpack 4 16bit mask pixels to
452                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
453                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
454                     mask_sse = _mm_unpacklo_epi16(mask_sse,
455                                                   _mm_setzero_si128());
456 
457                     // Process 4 32bit dst pixels
458                     __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
459                     _mm_store_si128(d, result);
460                 }
461 
462                 d++;
463                 mask += 4;
464                 width -= 4;
465             }
466 
467             dst = reinterpret_cast<SkPMColor*>(d);
468         }
469 
470         while (width > 0) {
471             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
472             mask++;
473             dst++;
474             width--;
475         }
476     }
477 
478 #elif defined(SK_ARM_HAS_NEON)
479     #include <arm_neon.h>
480 
// Byte index of each component within a packed 32-bit pixel (bit shift / 8).
// Used below to pick the matching plane out of the uint8x8x4_t that vld4_u8
// fills from the destination pixels.
#define NEON_A (SK_A32_SHIFT / 8)
#define NEON_R (SK_R32_SHIFT / 8)
#define NEON_G (SK_G32_SHIFT / 8)
#define NEON_B (SK_B32_SHIFT / 8)
485 
// Eight-lane counterpart of blend_32(): per lane, dst + ((src - dst) * scale >> 5).
// src and dst are widened to signed 16 bits so the difference may go negative;
// the result is narrowed back to 8 bits by truncation.
static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
    const int16x8_t src16 = vreinterpretq_s16_u16(vmovl_u8(src));
    const int16x8_t dst16 = vreinterpretq_s16_u16(vmovl_u8(dst));

    const int16x8_t weighted = vmulq_s16(vsubq_s16(src16, dst16),
                                         vreinterpretq_s16_u16(scale));
    const int16x8_t blended = vaddq_s16(dst16, vshrq_n_s16(weighted, 5));

    return vmovn_u16(vreinterpretq_u16_s16(blended));
}
498 
// NEON row blitter for LCD text with an opaque color: blends `color` into `width`
// pixels of `dst` using one 16-bit coverage value from `src` per pixel. Pixels are
// handled eight at a time; the remainder goes through blend_lcd16_opaque(), which
// is also the only consumer of `opaqueDst`.
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
                           SkColor color, int width,
                           SkPMColor opaqueDst) {
    const int colR = SkColorGetR(color);
    const int colG = SkColorGetG(color);
    const int colB = SkColorGetB(color);

    const uint8x8_t vcolA = vdup_n_u8(0xFF);
    const uint8x8_t vcolR = vdup_n_u8(colR);
    const uint8x8_t vcolG = vdup_n_u8(colG);
    const uint8x8_t vcolB = vdup_n_u8(colB);

    for (; width >= 8; dst += 8, src += 8, width -= 8) {
        // De-interleave eight pixels into one plane per byte of the pixel.
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);
        const uint16x8_t vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        uint16x8_t vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        uint16x8_t vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                                        SK_B16_BITS + SK_R16_BITS + 1);
        uint16x8_t vmaskB = vandq_u16(vmask, vdupq_n_u16(SK_B16_MASK));

        // Upscale to 0..32
        vmaskR = vaddq_u16(vmaskR, vshrq_n_u16(vmaskR, 4));
        vmaskG = vaddq_u16(vmaskG, vshrq_n_u16(vmaskG, 4));
        vmaskB = vaddq_u16(vmaskB, vshrq_n_u16(vmaskB, 4));

        // Opaque srcAlpha always uses the max of the 3 LCD coverage values
        const uint16x8_t vmaskA = vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB));

        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
        vdst.val[NEON_A] = blend_32_neon(vcolA, vdst.val[NEON_A], vmaskA);

        vst4_u8((uint8_t*)dst, vdst);
    }

    // Leftovers
    for (; width > 0; ++dst, ++src, --width) {
        *dst = blend_lcd16_opaque(colR, colG, colB, *dst, *src, opaqueDst);
    }
}
549 
// NEON row blitter for LCD text with a non-opaque color: blends `color` into `width`
// pixels of `dst` using one 16-bit coverage value from `src` per pixel. Pixels are
// handled eight at a time; the remainder goes through the scalar blend_lcd16(). The
// trailing SkPMColor argument is unused.
void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                    SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // srcA in [0-255] to compare vs dstA
    uint16x8_t vcolACmp = vdupq_n_u16(colA);
    colA = SkAlpha255To256(colA);

    uint16x8_t vcolA = vdupq_n_u16(colA); // srcA in [0-256] to combine with coverage
    uint8x8_t vcolR = vdup_n_u8(colR);
    uint8x8_t vcolG = vdup_n_u8(colG);
    uint8x8_t vcolB = vdup_n_u8(colB);

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

        // De-interleave eight pixels into one plane per byte of the pixel.
        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        // Modulate the coverage by the 0..256 source alpha
        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        // Select either the min or the max of the RGB mask values, depending on if the src
        // alpha is less than the dst alpha. The comparison is strict so these pixels agree
        // with the ones the scalar blend_lcd16() produces in the leftover loop below, which
        // tests (srcA - 1) < dstA on the 256-biased alpha.
        vmaskA = vbslq_u16(vcltq_u16(vcolACmp, vmovl_u8(vdst.val[NEON_A])), // srcA < dstA
                           vminq_u16(vmaskR, vminq_u16(vmaskG, vmaskB)),    // ? min(r,g,b)
                           vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB)));   // : max(r,g,b)

        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
        // vmaskA already includes vcolA so blend against 0xFF
        vdst.val[NEON_A] = blend_32_neon(vdup_n_u8(0xFF), vdst.val[NEON_A], vmaskA);
        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers (fewer than eight pixels); colA is the 256-biased alpha here.
    for (int i = 0; i < width; i++) {
        dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
611 
612 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
613 
// Net left shift that lines the top 5 bits of each RGB16 mask component up with
// the matching component of an SkPMColor. The RGB16 component order may differ
// from the SkPMColor order, so any of these may come out negative, in which
// case the conversion macro shifts right instead.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: moves the mask bits into place; the caller still has to AND the result.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvslli_w(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvslli_w(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvslli_w(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (x)
#endif
644 
blend_lcd16_lasx(__m256i & src,__m256i & dst,__m256i & mask,__m256i & srcA)645     static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
646         // In the following comments, the components of src, dst and mask are
647         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
648         // by an R, G, B, or A suffix. Components of one of the four pixels that
649         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
650         // example is the blue channel of the second destination pixel. Memory
651         // layout is shown for an ARGB byte order in a color value.
652 
653         // src and srcA store 8-bit values interleaved with zeros.
654         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
655         //         0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
656         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
657         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
658         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
659         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
660         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
661         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
662         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
663         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
664         //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
665         //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
666 
667         __m256i xv_zero = __lasx_xvldi(0);
668 
669         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
670         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
671         //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
672         __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
673                                    __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
674 
675         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
676         //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7R, 0)
677         __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
678                                    __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
679 
680         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
681         //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
682         __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
683                                    __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));
684 
685         // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
686         __m256i aMin = __lasx_xvmin_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
687                        __lasx_xvmin_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
688                                       __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
689         __m256i aMax = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
690                        __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
691                                       __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
692         // srcA has been biased to [0-256], so compare srcA against (dstA+1)
693         __m256i a = __lasx_xvmskltz_w(srcA -
694                                     __lasx_xvand_v(
695                                            __lasx_xvadd_w(dst,
696                                                           __lasx_xvreplgr2vr_w(1 << SK_A32_SHIFT)),
697                                            __lasx_xvreplgr2vr_w(SK_A32_MASK)));
698         // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
699         a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));
700 
701         // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3)
702         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
703         // 8-bit position
704         // mask = (m0A, m0R, m0G, m0B, m1R, m1R, m1G, m1B,
705         //         m2A, m2R, m2G, m2B, m3R, m3R, m3G, m3B,
706         //         m4A, m4R, m4G, m4B, m5R, m5R, m5G, m5B,
707         //         m6A, m6R, m6G, m6B, m7R, m7R, m7G, m7B)
708         mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));
709 
710         // Interleave R,G,B into the lower byte of word.
711         // i.e. split the sixteen 8-bit values from mask into two sets of sixteen
712         // 16-bit values, padded by zero.
713         __m256i maskLo, maskHi;
714         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
715         //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
716         maskLo = __lasx_xvilvl_b(xv_zero, mask);
717         // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
718         //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
719         maskHi = __lasx_xvilvh_b(xv_zero, mask);
720 
721         // Upscale from 0..31 to 0..32
722         // (allows to replace division by left-shift further down)
723         // Left-shift each component by 4 and add the result back to that component,
724         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
725         maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
726         maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));
727 
728         // Multiply each component of maskLo and maskHi by srcA
729         maskLo = __lasx_xvmul_h(maskLo, srcA);
730         maskHi = __lasx_xvmul_h(maskHi, srcA);
731 
732         // Left shift mask components by 8 (divide by 256)
733         maskLo = __lasx_xvsrli_h(maskLo, 8);
734         maskHi = __lasx_xvsrli_h(maskHi, 8);
735 
736         // Interleave R,G,B into the lower byte of the word
737         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
738         //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
739         __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
740         // dstLo = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0)
741         //          d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
742         __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);
743 
744         // mask = (src - dst) * mask
745         maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
746         maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));
747 
748         // mask = (src - dst) * mask >> 5
749         maskLo = __lasx_xvsrai_h(maskLo, 5);
750         maskHi = __lasx_xvsrai_h(maskHi, 5);
751 
752         // Add two pixels into result.
753         // result = dst + ((src - dst) * mask >> 5)
754         __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
755         __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);
756 
757         // Pack into 8 32bit dst pixels.
758         // resultLo and resultHi contain sixteen 16-bit components (four pixels) each.
759         // Merge into one LASX regsiter with 32 8-bit values (eight pixels),
760         // clamping to 255 if necessary.
761         __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
762         __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
763         return __lasx_xvpickev_b(tmph, tmpl);
764     }
765 
blend_lcd16_opaque_lasx(__m256i & src,__m256i & dst,__m256i & mask)766     static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
767         // In the following comments, the components of src, dst and mask are
768         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
769         // by an R, G, B, or A suffix. Components of one of the four pixels that
770         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
771         // example is the blue channel of the second destination pixel. Memory
772         // layout is shown for an ARGB byte order in a color value.
773 
774         // src and srcA store 8-bit values interleaved with zeros.
775         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
776         //         0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
777         // mask stores 16-bit values (shown as high and low bytes) interleaved with
778         // zeros
779         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
780         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
781         //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
782         //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
783 
784         __m256i xv_zero = __lasx_xvldi(0);
785 
786         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
787         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
788         //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
789         __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
790                                    __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
791 
792         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0,
793         //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
794         __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
795                                    __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
796 
797         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B,
798         //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
799         __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
800                                    __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));
801 
802         // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
803         __m256i a = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
804                     __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
805                                    __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
806 
807         // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3,
808         // p4, p5, p6, p7)
809         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
810         // 8-bit position
811         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
812         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
813         //         m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
814         //         m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
815         mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));
816 
817         // Interleave R,G,B into the lower byte of word.
818         // i.e. split the 32 8-bit values from mask into two sets of sixteen
819         // 16-bit values, padded by zero.
820         __m256i maskLo, maskHi;
821         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
822         //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
823         maskLo = __lasx_xvilvl_b(xv_zero, mask);
824         // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
825         //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
826         maskHi = __lasx_xvilvh_b(xv_zero, mask);
827 
828         // Upscale from 0..31 to 0..32
829         // (allows to replace division by left-shift further down)
830         // Left-shift each component by 4 and add the result back to that component,
831         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
832         maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
833         maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));
834 
835         // Interleave R,G,B into the lower byte of the word
836         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0,
837         //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
838         __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
839         // dstLo = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0,
840         // dstLo = (d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
841         __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);
842 
843         // mask = (src - dst) * mask
844         maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
845         maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));
846 
847         // mask = (src - dst) * mask >> 5
848         maskLo = __lasx_xvsrai_h(maskLo, 5);
849         maskHi = __lasx_xvsrai_h(maskHi, 5);
850 
851         // Add two pixels into result.
852         // result = dst + ((src - dst) * mask >> 5)
853         __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
854         __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);
855 
856         // Merge into one SSE regsiter with 32 8-bit values (eight pixels),
857         // clamping to 255 if necessary.
858         __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
859         __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
860 
861         return __lasx_xvpickev_b(tmph, tmpl);
862     }
863 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)864     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
865         if (width <= 0) {
866             return;
867         }
868 
869         int srcA = SkColorGetA(src);
870         int srcR = SkColorGetR(src);
871         int srcG = SkColorGetG(src);
872         int srcB = SkColorGetB(src);
873         __m256i xv_zero = __lasx_xvldi(0);
874 
875         srcA = SkAlpha255To256(srcA);
876         if (width >= 8) {
877             SkASSERT(((size_t)dst & 0x03) == 0);
878             while (((size_t)dst & 0x0F) != 0) {
879                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
880                 mask++;
881                 dst++;
882                 width--;
883             }
884 
885             __m256i *d = reinterpret_cast<__m256i*>(dst);
886             // Set alpha to 0xFF and replicate source eight times in LASX register.
887             unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
888             __m256i src_lasx = __lasx_xvreplgr2vr_w(skpackargb32);
889             // Interleave with zeros to get two sets of eight 16-bit values.
890             src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
891             // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
892             // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
893             //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
894             __m256i srcA_lasx = __lasx_xvreplgr2vr_h(srcA);
895 
896             while (width >= 8) {
897                 // Load eight destination pixels into dst_lasx.
898                 __m256i dst_lasx = __lasx_xvld(d, 0);
899                 // Load eight 16-bit masks into lower half of mask_lasx.
900                 __m256i mask_lasx = __lasx_xvld(mask, 0);
901                 mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};
902 
903                 int pack_cmp = __lasx_xbz_v(mask_lasx);
904                 // if mask pixels are not all zero, we will blend the dst pixels
905                 if (pack_cmp != 1) {
906                     // Unpack 8 16bit mask pixels to
907                     // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
908                     //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
909                     //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
910                     //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
911                     mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
912 
913                     // Process 8 32bit dst pixels
914                     __m256i result = blend_lcd16_lasx(src_lasx, dst_lasx, mask_lasx, srcA_lasx);
915                     __lasx_xvst(result, d, 0);
916                 }
917                 d++;
918                 mask += 8;
919                 width -= 8;
920             }
921             dst = reinterpret_cast<SkPMColor*>(d);
922         }
923 
924         while (width > 0) {
925             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
926             mask++;
927             dst++;
928             width--;
929         }
930     }
931 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)932     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
933                                SkColor src, int width, SkPMColor opaqueDst) {
934         if (width <= 0) {
935             return;
936         }
937 
938         int srcR = SkColorGetR(src);
939         int srcG = SkColorGetG(src);
940         int srcB = SkColorGetB(src);
941         __m256i xv_zero = __lasx_xvldi(0);
942 
943         if (width >= 8) {
944             SkASSERT(((size_t)dst & 0x03) == 0);
945             while (((size_t)dst & 0x0F) != 0) {
946                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
947                 mask++;
948                 dst++;
949                 width--;
950             }
951 
952             __m256i *d = reinterpret_cast<__m256i*>(dst);
953             // Set alpha to 0xFF and replicate source four times in LASX register.
954             unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
955             __m256i src_lasx = __lasx_xvreplgr2vr_w(sk_pack_argb32);
956             // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
957             // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
958             //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
959             src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
960 
961             while (width >= 8) {
962                 // Load eight destination pixels into dst_lasx.
963                 __m256i dst_lasx = __lasx_xvld(d, 0);
964                 // Load eight 16-bit masks into lower half of mask_lasx.
965                 __m256i mask_lasx = __lasx_xvld(mask, 0);
966                 mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};
967 
968                 int32_t pack_cmp = __lasx_xbz_v(mask_lasx);
969                 // if mask pixels are not all zero, we will blend the dst pixels
970                 if (pack_cmp != 1) {
971                     // Unpack 8 16bit mask pixels to
972                     // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
973                     //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
974                     //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
975                     //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
976                     mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
977                     // Process 8 32bit dst pixels
978                     __m256i result = blend_lcd16_opaque_lasx(src_lasx, dst_lasx, mask_lasx);
979                     __lasx_xvst(result, d, 0);
980                 }
981                 d++;
982                 mask += 8;
983                 width -= 8;
984             }
985 
986             dst = reinterpret_cast<SkPMColor*>(d);
987         }
988 
989         while (width > 0) {
990             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
991             mask++;
992             dst++;
993             width--;
994         }
995     }
996 
997 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
998 
// Signed bit distances that line up the top five bits of each mask channel in
// the packed RGB16 value with the same channel of an SkPMColor. The channel
// order of the RGB16 mask may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each converter moves one channel into place within every 32-bit element:
// a positive distance is a left shift, a negative distance is a logical right
// shift by its magnitude, and a zero distance passes the vector through.
// The result is not masked; callers AND it with the channel mask afterwards.

// Red channel.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_LSX(v) (__lsx_vslli_w(v, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_LSX(v) (__lsx_vsrli_w(v, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_LSX(v) (v)
#endif

// Green channel.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_LSX(v) (__lsx_vslli_w(v, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_LSX(v) (__lsx_vsrli_w(v, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_LSX(v) (v)
#endif

// Blue channel.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_LSX(v) (__lsx_vslli_w(v, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_LSX(v) (__lsx_vsrli_w(v, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_LSX(v) (v)
#endif
1029 
blend_lcd16_lsx(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)1030     static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
1031         // In the following comments, the components of src, dst and mask are
1032         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
1033         // by an R, G, B, or A suffix. Components of one of the four pixels that
1034         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
1035         // example is the blue channel of the second destination pixel. Memory
1036         // layout is shown for an ARGB byte order in a color value.
1037 
1038         // src and srcA store 8-bit values interleaved with zeros.
1039         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1040         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
1041         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
1042         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
1043         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
1044         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1045         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1046 
1047         __m128i v_zero = __lsx_vldi(0);
1048 
1049         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
1050         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
1051         __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
1052                                  __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
1053 
1054         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
1055         __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
1056                                  __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
1057 
1058         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
1059         __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
1060                                  __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));
1061 
1062         // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
1063         __m128i aMin = __lsx_vmin_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1064                        __lsx_vmin_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1065                                     __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1066         __m128i aMax = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1067                        __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1068                                     __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1069         // srcA has been biased to [0-256], so compare srcA against (dstA+1)
1070         __m128i a = __lsx_vmskltz_w(srcA -
1071                                     __lsx_vand_v(
1072                                           __lsx_vadd_w(dst,
1073                                                        __lsx_vreplgr2vr_w(1 << SK_A32_SHIFT)),
1074                                           __lsx_vreplgr2vr_w(SK_A32_MASK)));
1075         // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
1076         a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));
1077 
1078         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
1079         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
1080         // 8-bit position
1081         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
1082         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
1083         mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));
1084 
1085         // Interleave R,G,B into the lower byte of word.
1086         // i.e. split the sixteen 8-bit values from mask into two sets of eight
1087         // 16-bit values, padded by zero.
1088         __m128i maskLo, maskHi;
1089         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
1090         maskLo = __lsx_vilvl_b(v_zero, mask);
1091         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
1092         maskHi = __lsx_vilvh_b(v_zero, mask);
1093 
1094         // Upscale from 0..31 to 0..32
1095         // (allows to replace division by left-shift further down)
1096         // Left-shift each component by 4 and add the result back to that component,
1097         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
1098         maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
1099         maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));
1100 
1101         // Multiply each component of maskLo and maskHi by srcA
1102         maskLo = __lsx_vmul_h(maskLo, srcA);
1103         maskHi = __lsx_vmul_h(maskHi, srcA);
1104 
1105         // Left shift mask components by 8 (divide by 256)
1106         maskLo = __lsx_vsrli_h(maskLo, 8);
1107         maskHi = __lsx_vsrli_h(maskHi, 8);
1108 
1109         // Interleave R,G,B into the lower byte of the word
1110         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
1111         __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
1112         // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
1113         __m128i dstHi = __lsx_vilvh_b(v_zero, dst);
1114 
1115         // mask = (src - dst) * mask
1116         maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
1117         maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));
1118 
1119         // mask = (src - dst) * mask >> 5
1120         maskLo = __lsx_vsrai_h(maskLo, 5);
1121         maskHi = __lsx_vsrai_h(maskHi, 5);
1122 
1123         // Add two pixels into result.
1124         // result = dst + ((src - dst) * mask >> 5)
1125         __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
1126         __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);
1127 
1128         // Pack into 4 32bit dst pixels.
1129         // resultLo and resultHi contain eight 16-bit components (two pixels) each.
1130         // Merge into one LSX regsiter with sixteen 8-bit values (four pixels),
1131         // clamping to 255 if necessary.
1132         __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
1133         __m128i tmph = __lsx_vsat_hu(resultHi, 7);
1134         return __lsx_vpickev_b(tmph, tmpl);
1135     }
1136 
blend_lcd16_opaque_lsx(__m128i & src,__m128i & dst,__m128i & mask)1137     static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
1138         // In the following comments, the components of src, dst and mask are
1139         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
1140         // by an R, G, B, or A suffix. Components of one of the four pixels that
1141         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
1142         // example is the blue channel of the second destination pixel. Memory
1143         // layout is shown for an ARGB byte order in a color value.
1144 
1145         // src and srcA store 8-bit values interleaved with zeros.
1146         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1147         // mask stores 16-bit values (shown as high and low bytes) interleaved with
1148         // zeros
1149         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1150         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1151 
1152         __m128i v_zero = __lsx_vldi(0);
1153 
1154         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
1155         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
1156         __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
1157                                  __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
1158 
1159         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
1160         __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
1161                                  __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
1162 
1163         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
1164         __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
1165                                  __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));
1166 
1167         // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
1168         __m128i a = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1169                     __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1170                                  __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1171 
1172         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
1173         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
1174         // 8-bit position
1175         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
1176         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
1177         mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));
1178 
1179         // Interleave R,G,B into the lower byte of word.
1180         // i.e. split the sixteen 8-bit values from mask into two sets of eight
1181         // 16-bit values, padded by zero.
1182         __m128i maskLo, maskHi;
1183         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
1184         maskLo = __lsx_vilvl_b(v_zero, mask);
1185         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
1186         maskHi = __lsx_vilvh_b(v_zero, mask);
1187 
1188         // Upscale from 0..31 to 0..32
1189         // (allows to replace division by left-shift further down)
1190         // Left-shift each component by 4 and add the result back to that component,
1191         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
1192         maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
1193         maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));
1194 
1195         // Interleave R,G,B into the lower byte of the word
1196         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
1197         __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
1198         // dstLo = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
1199         __m128i dstHi = __lsx_vilvh_b(v_zero, dst);
1200 
1201         // mask = (src - dst) * mask
1202         maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
1203         maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));
1204 
1205         // mask = (src - dst) * mask >> 5
1206         maskLo = __lsx_vsrai_h(maskLo, 5);
1207         maskHi = __lsx_vsrai_h(maskHi, 5);
1208 
1209         // Add two pixels into result.
1210         // result = dst + ((src - dst) * mask >> 5)
1211         __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
1212         __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);
1213 
1214         // Merge into one LSX regsiter with sixteen 8-bit values (four pixels),
1215         // clamping to 255 if necessary.
1216         __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
1217         __m128i tmph = __lsx_vsat_hu(resultHi, 7);
1218         return __lsx_vpickev_b(tmph, tmpl);
1219     }
1220 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)1221     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
1222         if (width <= 0) {
1223             return;
1224         }
1225 
1226         int srcA = SkColorGetA(src);
1227         int srcR = SkColorGetR(src);
1228         int srcG = SkColorGetG(src);
1229         int srcB = SkColorGetB(src);
1230         __m128i v_zero = __lsx_vldi(0);
1231 
1232         srcA = SkAlpha255To256(srcA);
1233         if (width >= 4) {
1234             SkASSERT(((size_t)dst & 0x03) == 0);
1235             while (((size_t)dst & 0x0F) != 0) {
1236                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
1237                 mask++;
1238                 dst++;
1239                 width--;
1240             }
1241 
1242             __m128i *d = reinterpret_cast<__m128i*>(dst);
1243             // Set alpha to 0xFF and replicate source eight times in LSX register.
1244             unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
1245             __m128i src_lsx = __lsx_vreplgr2vr_w(skpackargb32);
1246             // Interleave with zeros to get two sets of eight 16-bit values.
1247             src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
1248             // Set srcA_lsx to contain eight copies of srcA, padded with zero.
1249             // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1250             __m128i srcA_lsx = __lsx_vreplgr2vr_h(srcA);
1251 
1252             while (width >= 4) {
1253                 // Load eight destination pixels into dst_lsx.
1254                 __m128i dst_lsx = __lsx_vld(d, 0);
1255                 // Load four 16-bit masks into lower half of mask_lsx.
1256                 __m128i mask_lsx = __lsx_vldrepl_d((void *)mask, 0);
1257                 mask_lsx =  __lsx_vilvl_d(v_zero, mask_lsx);
1258 
1259                 int pack_cmp = __lsx_bz_v(mask_lsx);
1260                 // if mask pixels are not all zero, we will blend the dst pixels
1261                 if (pack_cmp != 1) {
1262                     // Unpack 4 16bit mask pixels to
1263                     // mask_lsx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1264                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1265                     mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
1266 
1267                     // Process 8 32bit dst pixels
1268                     __m128i result = blend_lcd16_lsx(src_lsx, dst_lsx, mask_lsx, srcA_lsx);
1269                     __lsx_vst(result, d, 0);
1270                 }
1271 
1272                 d++;
1273                 mask += 4;
1274                 width -= 4;
1275             }
1276 
1277             dst = reinterpret_cast<SkPMColor*>(d);
1278         }
1279 
1280         while (width > 0) {
1281             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
1282             mask++;
1283             dst++;
1284             width--;
1285         }
1286     }
1287 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)1288     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1289                                SkColor src, int width, SkPMColor opaqueDst) {
1290         if (width <= 0) {
1291             return;
1292         }
1293 
1294         int srcR = SkColorGetR(src);
1295         int srcG = SkColorGetG(src);
1296         int srcB = SkColorGetB(src);
1297         __m128i v_zero = __lsx_vldi(0);
1298 
1299         if (width >= 4) {
1300             SkASSERT(((size_t)dst & 0x03) == 0);
1301             while (((size_t)dst & 0x0F) != 0) {
1302                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
1303                 mask++;
1304                 dst++;
1305                 width--;
1306             }
1307 
1308             __m128i *d = reinterpret_cast<__m128i*>(dst);
1309             // Set alpha to 0xFF and replicate source four times in LSX register.
1310             unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
1311             __m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32);
1312             // Set srcA_lsx to contain eight copies of srcA, padded with zero.
1313             // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1314             src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
1315 
1316             while (width >= 4) {
1317                 // Load four destination pixels into dst_lsx.
1318                 __m128i dst_lsx = __lsx_vld(d, 0);
1319                 // Load four 16-bit masks into lower half of mask_lsx.
1320                 __m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
1321                 mask_lsx =  __lsx_vilvl_d(v_zero, mask_lsx);
1322 
1323                 int pack_cmp = __lsx_bz_v(mask_lsx);
1324                 // if mask pixels are not all zero, we will blend the dst pixels
1325                 if (pack_cmp != 1) {
1326                     // Unpack 4 16bit mask pixels to
1327                     mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
1328 
1329                     // Process 8 32bit dst pixels
1330                     __m128i result = blend_lcd16_opaque_lsx(src_lsx, dst_lsx, mask_lsx);
1331                     __lsx_vst(result, d, 0);
1332                 }
1333                 d++;
1334                 mask += 4;
1335                 width -= 4;
1336             }
1337 
1338             dst = reinterpret_cast<SkPMColor*>(d);
1339         }
1340 
1341         while (width > 0) {
1342             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
1343             mask++;
1344             dst++;
1345             width--;
1346         }
1347     }
1348 
1349 #else
1350 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)1351     static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
1352                                       SkColor src, int width, SkPMColor) {
1353         int srcA = SkColorGetA(src);
1354         int srcR = SkColorGetR(src);
1355         int srcG = SkColorGetG(src);
1356         int srcB = SkColorGetB(src);
1357 
1358         srcA = SkAlpha255To256(srcA);
1359 
1360         for (int i = 0; i < width; i++) {
1361             dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
1362         }
1363     }
1364 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)1365     static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1366                                              SkColor src, int width,
1367                                              SkPMColor opaqueDst) {
1368         int srcR = SkColorGetR(src);
1369         int srcG = SkColorGetG(src);
1370         int srcB = SkColorGetB(src);
1371 
1372         for (int i = 0; i < width; i++) {
1373             dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
1374         }
1375     }
1376 
1377 #endif
1378 
blit_color(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkColor color)1379 static bool blit_color(const SkPixmap& device,
1380                        const SkMask& mask,
1381                        const SkIRect& clip,
1382                        SkColor color) {
1383     int x = clip.fLeft,
1384         y = clip.fTop;
1385 
1386     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
1387         SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
1388                                  (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
1389                                  color, clip.width(), clip.height());
1390         return true;
1391     }
1392 
1393     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
1394         auto dstRow  = device.writable_addr32(x,y);
1395         auto maskRow = (const uint16_t*)mask.getAddr(x,y);
1396 
1397         auto blit_row = blit_row_lcd16;
1398         SkPMColor opaqueDst = 0;  // ignored unless opaque
1399 
1400         if (0xff == SkColorGetA(color)) {
1401             blit_row  = blit_row_lcd16_opaque;
1402             opaqueDst = SkPreMultiplyColor(color);
1403         }
1404 
1405         for (int height = clip.height(); height --> 0; ) {
1406             blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
1407 
1408             dstRow  = (SkPMColor*)     ((      char*) dstRow + device.rowBytes());
1409             maskRow = (const uint16_t*)((const char*)maskRow +  mask.fRowBytes);
1410         }
1411         return true;
1412     }
1413 
1414     return false;
1415 }
1416 
1417 ///////////////////////////////////////////////////////////////////////////////
1418 
SkARGB32_Blit32(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkPMColor srcColor)1419 static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
1420                             const SkIRect& clip, SkPMColor srcColor) {
1421     U8CPU alpha = SkGetPackedA32(srcColor);
1422     unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
1423     if (alpha != 255) {
1424         flags |= SkBlitRow::kGlobalAlpha_Flag32;
1425     }
1426     SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);
1427 
1428     int x = clip.fLeft;
1429     int y = clip.fTop;
1430     int width = clip.width();
1431     int height = clip.height();
1432 
1433     SkPMColor* dstRow = device.writable_addr32(x, y);
1434     const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
1435 
1436     do {
1437         proc(dstRow, srcRow, width, alpha);
1438         dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
1439         srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
1440     } while (--height != 0);
1441 }
1442 
1443 //////////////////////////////////////////////////////////////////////////////////////
1444 
SkARGB32_Blitter(const SkPixmap & device,const SkPaint & paint)1445 SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
1446         : INHERITED(device) {
1447     SkColor color = paint.getColor();
1448     fColor = color;
1449 
1450     fSrcA = SkColorGetA(color);
1451     unsigned scale = SkAlpha255To256(fSrcA);
1452     fSrcR = SkAlphaMul(SkColorGetR(color), scale);
1453     fSrcG = SkAlphaMul(SkColorGetG(color), scale);
1454     fSrcB = SkAlphaMul(SkColorGetB(color), scale);
1455 
1456     fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
1457 }
1458 
1459 #if defined _WIN32  // disable warning : local variable used without having been initialized
1460 #pragma warning ( push )
1461 #pragma warning ( disable : 4701 )
1462 #endif
1463 
blitH(int x,int y,int width)1464 void SkARGB32_Blitter::blitH(int x, int y, int width) {
1465     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1466 
1467     uint32_t* device = fDevice.writable_addr32(x, y);
1468     SkBlitRow::Color32(device, width, fPMColor);
1469 }
1470 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1471 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1472                                  const int16_t runs[]) {
1473     if (fSrcA == 0) {
1474         return;
1475     }
1476 
1477     uint32_t    color = fPMColor;
1478     uint32_t*   device = fDevice.writable_addr32(x, y);
1479     unsigned    opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
1480 
1481     for (;;) {
1482         int count = runs[0];
1483         SkASSERT(count >= 0);
1484         if (count <= 0) {
1485             return;
1486         }
1487         unsigned aa = antialias[0];
1488         if (aa) {
1489             if ((opaqueMask & aa) == 255) {
1490                 SkOpts::memset32(device, color, count);
1491             } else {
1492                 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
1493                 SkBlitRow::Color32(device, count, sc);
1494             }
1495         }
1496         runs += count;
1497         antialias += count;
1498         device += count;
1499     }
1500 }
1501 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1502 void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1503     uint32_t* device = fDevice.writable_addr32(x, y);
1504     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1505 
1506     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1507     device[1] = SkBlendARGB32(fPMColor, device[1], a1);
1508 }
1509 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1510 void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1511     uint32_t* device = fDevice.writable_addr32(x, y);
1512     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1513 
1514     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1515     device = (uint32_t*)((char*)device + fDevice.rowBytes());
1516     device[0] = SkBlendARGB32(fPMColor, device[0], a1);
1517 }
1518 
1519 //////////////////////////////////////////////////////////////////////////////////////
1520 
// Stores `color` into each of dst[0..7] whose bit is set in `mask`. Bit 0x80 selects
// dst[0] and bit 0x01 selects dst[7]; pixels whose bit is clear are left untouched.
//
// Every argument is parenthesized so that expression arguments (for example `a | b` as
// the mask) expand with the intended precedence. Each argument may be evaluated up to
// eight times, so arguments should be free of side effects.
#define solid_8_pixels(mask, dst, color)                \
    do {                                                \
        if ((mask) & 0x80) { (dst)[0] = (color); }      \
        if ((mask) & 0x40) { (dst)[1] = (color); }      \
        if ((mask) & 0x20) { (dst)[2] = (color); }      \
        if ((mask) & 0x10) { (dst)[3] = (color); }      \
        if ((mask) & 0x08) { (dst)[4] = (color); }      \
        if ((mask) & 0x04) { (dst)[5] = (color); }      \
        if ((mask) & 0x02) { (dst)[6] = (color); }      \
        if ((mask) & 0x01) { (dst)[7] = (color); }      \
    } while (0)
1532 
1533 #define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
1534 #define SK_BLITBWMASK_ARGS                  , SkPMColor color
1535 #define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
1536 #define SK_BLITBWMASK_GETADDR               writable_addr32
1537 #define SK_BLITBWMASK_DEVTYPE               uint32_t
1538 #include "src/core/SkBlitBWMaskTemplate.h"
1539 
// For each of dst[0..7] whose bit is set in `mask`, replaces the pixel with
// `sc + SkAlphaMulQ(pixel, dst_scale)`. Bit 0x80 selects dst[0] and bit 0x01 selects
// dst[7]; pixels whose bit is clear are left untouched.
//
// Every argument is parenthesized so that expression arguments (for example `a | b` as
// the mask) expand with the intended precedence. Each argument may be evaluated up to
// eight times, so arguments should be free of side effects.
#define blend_8_pixels(mask, dst, sc, dst_scale)                                        \
    do {                                                                                \
        if ((mask) & 0x80) { (dst)[0] = (sc) + SkAlphaMulQ((dst)[0], (dst_scale)); }    \
        if ((mask) & 0x40) { (dst)[1] = (sc) + SkAlphaMulQ((dst)[1], (dst_scale)); }    \
        if ((mask) & 0x20) { (dst)[2] = (sc) + SkAlphaMulQ((dst)[2], (dst_scale)); }    \
        if ((mask) & 0x10) { (dst)[3] = (sc) + SkAlphaMulQ((dst)[3], (dst_scale)); }    \
        if ((mask) & 0x08) { (dst)[4] = (sc) + SkAlphaMulQ((dst)[4], (dst_scale)); }    \
        if ((mask) & 0x04) { (dst)[5] = (sc) + SkAlphaMulQ((dst)[5], (dst_scale)); }    \
        if ((mask) & 0x02) { (dst)[6] = (sc) + SkAlphaMulQ((dst)[6], (dst_scale)); }    \
        if ((mask) & 0x01) { (dst)[7] = (sc) + SkAlphaMulQ((dst)[7], (dst_scale)); }    \
    } while (0)
1551 
1552 #define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
1553 #define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
1554 #define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
1555 #define SK_BLITBWMASK_GETADDR               writable_addr32
1556 #define SK_BLITBWMASK_DEVTYPE               uint32_t
1557 #include "src/core/SkBlitBWMaskTemplate.h"
1558 
blitMask(const SkMask & mask,const SkIRect & clip)1559 void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1560     SkASSERT(mask.fBounds.contains(clip));
1561     SkASSERT(fSrcA != 0xFF);
1562 
1563     if (fSrcA == 0) {
1564         return;
1565     }
1566 
1567     if (blit_color(fDevice, mask, clip, fColor)) {
1568         return;
1569     }
1570 
1571     switch (mask.fFormat) {
1572         case SkMask::kBW_Format:
1573             SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
1574             break;
1575         case SkMask::kARGB32_Format:
1576             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
1577             break;
1578         default:
1579             SK_ABORT("Mask format not handled.");
1580     }
1581 }
1582 
blitMask(const SkMask & mask,const SkIRect & clip)1583 void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
1584                                        const SkIRect& clip) {
1585     SkASSERT(mask.fBounds.contains(clip));
1586 
1587     if (blit_color(fDevice, mask, clip, fColor)) {
1588         return;
1589     }
1590 
1591     switch (mask.fFormat) {
1592         case SkMask::kBW_Format:
1593             SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
1594             break;
1595         case SkMask::kARGB32_Format:
1596             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
1597             break;
1598         default:
1599             SK_ABORT("Mask format not handled.");
1600     }
1601 }
1602 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1603 void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1604     uint32_t* device = fDevice.writable_addr32(x, y);
1605     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1606 
1607     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
1608     device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
1609 }
1610 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1611 void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1612     uint32_t* device = fDevice.writable_addr32(x, y);
1613     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1614 
1615     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
1616     device = (uint32_t*)((char*)device + fDevice.rowBytes());
1617     device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
1618 }
1619 
1620 ///////////////////////////////////////////////////////////////////////////////
1621 
blitV(int x,int y,int height,SkAlpha alpha)1622 void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1623     if (alpha == 0 || fSrcA == 0) {
1624         return;
1625     }
1626 
1627     uint32_t* device = fDevice.writable_addr32(x, y);
1628     uint32_t  color = fPMColor;
1629 
1630     if (alpha != 255) {
1631         color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
1632     }
1633 
1634     unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
1635     size_t rowBytes = fDevice.rowBytes();
1636     while (--height >= 0) {
1637         device[0] = color + SkAlphaMulQ(device[0], dst_scale);
1638         device = (uint32_t*)((char*)device + rowBytes);
1639     }
1640 }
1641 
blitRect(int x,int y,int width,int height)1642 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
1643     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
1644 
1645     if (fSrcA == 0) {
1646         return;
1647     }
1648 
1649     uint32_t*   device = fDevice.writable_addr32(x, y);
1650     uint32_t    color = fPMColor;
1651     size_t      rowBytes = fDevice.rowBytes();
1652 
1653     if (SkGetPackedA32(fPMColor) == 0xFF) {
1654         SkOpts::rect_memset32(device, color, width, rowBytes, height);
1655     } else {
1656         while (height --> 0) {
1657             SkBlitRow::Color32(device, width, color);
1658             device = (uint32_t*)((char*)device + rowBytes);
1659         }
1660     }
1661 }
1662 
1663 #if defined _WIN32
1664 #pragma warning ( pop )
1665 #endif
1666 
1667 ///////////////////////////////////////////////////////////////////////
1668 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1669 void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1670                                        const int16_t runs[]) {
1671     uint32_t*   device = fDevice.writable_addr32(x, y);
1672     SkPMColor   black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);
1673 
1674     for (;;) {
1675         int count = runs[0];
1676         SkASSERT(count >= 0);
1677         if (count <= 0) {
1678             return;
1679         }
1680         unsigned aa = antialias[0];
1681         if (aa) {
1682             if (aa == 255) {
1683                 SkOpts::memset32(device, black, count);
1684             } else {
1685                 SkPMColor src = aa << SK_A32_SHIFT;
1686                 unsigned dst_scale = 256 - aa;
1687                 int n = count;
1688                 do {
1689                     --n;
1690                     device[n] = src + SkAlphaMulQ(device[n], dst_scale);
1691                 } while (n > 0);
1692             }
1693         }
1694         runs += count;
1695         antialias += count;
1696         device += count;
1697     }
1698 }
1699 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1700 void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1701     uint32_t* device = fDevice.writable_addr32(x, y);
1702     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1703 
1704     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1705     device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
1706 }
1707 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1708 void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1709     uint32_t* device = fDevice.writable_addr32(x, y);
1710     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1711 
1712     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1713     device = (uint32_t*)((char*)device + fDevice.rowBytes());
1714     device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
1715 }
1716 
1717 ///////////////////////////////////////////////////////////////////////////////
1718 
SkARGB32_Shader_Blitter(const SkPixmap & device,const SkPaint & paint,SkShaderBase::Context * shaderContext)1719 SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
1720         const SkPaint& paint, SkShaderBase::Context* shaderContext)
1721     : INHERITED(device, paint, shaderContext)
1722 {
1723     fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));
1724 
1725     SkASSERT(paint.isSrcOver());
1726 
1727     int flags = 0;
1728     if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1729         flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
1730     }
1731     // we call this on the output from the shader
1732     fProc32 = SkBlitRow::Factory32(flags);
1733     // we call this on the output from the shader + alpha from the aa buffer
1734     fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);
1735 
1736     fShadeDirectlyIntoDevice =
1737             SkToBool(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1738 }
1739 
~SkARGB32_Shader_Blitter()1740 SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
1741     sk_free(fBuffer);
1742 }
1743 
blitH(int x,int y,int width)1744 void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
1745     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1746 
1747     uint32_t* device = fDevice.writable_addr32(x, y);
1748 
1749     if (fShadeDirectlyIntoDevice) {
1750         fShaderContext->shadeSpan(x, y, device, width);
1751     } else {
1752         SkPMColor*  span = fBuffer;
1753         fShaderContext->shadeSpan(x, y, span, width);
1754         fProc32(device, span, width, 255);
1755     }
1756 }
1757 
blitRect(int x,int y,int width,int height)1758 void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
1759     SkASSERT(x >= 0 && y >= 0 &&
1760              x + width <= fDevice.width() && y + height <= fDevice.height());
1761 
1762     uint32_t*  device = fDevice.writable_addr32(x, y);
1763     size_t     deviceRB = fDevice.rowBytes();
1764     auto*      shaderContext = fShaderContext;
1765     SkPMColor* span = fBuffer;
1766 
1767     if (fShadeDirectlyIntoDevice) {
1768         do {
1769             shaderContext->shadeSpan(x, y, device, width);
1770             y += 1;
1771             device = (uint32_t*)((char*)device + deviceRB);
1772         } while (--height > 0);
1773     } else {
1774         SkBlitRow::Proc32 proc = fProc32;
1775         do {
1776             shaderContext->shadeSpan(x, y, span, width);
1777             proc(device, span, width, 255);
1778             y += 1;
1779             device = (uint32_t*)((char*)device + deviceRB);
1780         } while (--height > 0);
1781     }
1782 }
1783 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1784 void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1785                                         const int16_t runs[]) {
1786     SkPMColor* span = fBuffer;
1787     uint32_t*  device = fDevice.writable_addr32(x, y);
1788     auto*      shaderContext = fShaderContext;
1789 
1790     if (fShadeDirectlyIntoDevice || (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1791         for (;;) {
1792             int count = *runs;
1793             if (count <= 0) {
1794                 break;
1795             }
1796             int aa = *antialias;
1797             if (aa) {
1798                 if (aa == 255) {
1799                     // cool, have the shader draw right into the device
1800                     shaderContext->shadeSpan(x, y, device, count);
1801                 } else {
1802                     shaderContext->shadeSpan(x, y, span, count);
1803                     fProc32Blend(device, span, count, aa);
1804                 }
1805             }
1806             device += count;
1807             runs += count;
1808             antialias += count;
1809             x += count;
1810         }
1811     } else {
1812         for (;;) {
1813             int count = *runs;
1814             if (count <= 0) {
1815                 break;
1816             }
1817             int aa = *antialias;
1818             if (aa) {
1819                 shaderContext->shadeSpan(x, y, span, count);
1820                 if (aa == 255) {
1821                     fProc32(device, span, count, 255);
1822                 } else {
1823                     fProc32Blend(device, span, count, aa);
1824                 }
1825             }
1826             device += count;
1827             runs += count;
1828             antialias += count;
1829             x += count;
1830         }
1831     }
1832 }
1833 
1834 using U32  = skvx::Vec< 4, uint32_t>;
1835 using U8x4 = skvx::Vec<16, uint8_t>;
1836 using U8   = skvx::Vec< 4, uint8_t>;
1837 
drive(SkPMColor * dst,const SkPMColor * src,const uint8_t * cov,int n,U8x4 (* kernel)(U8x4,U8x4,U8x4))1838 static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
1839                   U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
1840 
1841     auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
1842         U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
1843         return sk_bit_cast<U32>(kernel(sk_bit_cast<U8x4>(dst),
1844                                        sk_bit_cast<U8x4>(src),
1845                                        cov_splat));
1846     };
1847     while (n >= 4) {
1848         apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
1849         dst += 4;
1850         src += 4;
1851         cov += 4;
1852         n   -= 4;
1853     }
1854     while (n --> 0) {
1855         *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
1856         dst++;
1857         src++;
1858         cov++;
1859     }
1860 }
1861 
blend_row_A8(SkPMColor * dst,const void * mask,const SkPMColor * src,int n)1862 static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1863     auto cov = (const uint8_t*)mask;
1864     drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1865         U8x4 s_aa  = skvx::approx_scale(s, c),
1866              alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
1867         return s_aa + skvx::approx_scale(d, 255 - alpha);
1868     });
1869 }
1870 
blend_row_A8_opaque(SkPMColor * dst,const void * mask,const SkPMColor * src,int n)1871 static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1872     auto cov = (const uint8_t*)mask;
1873     drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1874         return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>(  c  )
1875                            + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
1876     });
1877 }
1878 
blend_row_lcd16(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1879 static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1880     auto src_alpha_blend = [](int s, int d, int sa, int m) {
1881         return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
1882     };
1883 
1884     auto upscale_31_to_255 = [](int v) {
1885         return (v << 3) | (v >> 2);
1886     };
1887 
1888     auto mask = (const uint16_t*)vmask;
1889     for (int i = 0; i < n; ++i) {
1890         uint16_t m = mask[i];
1891         if (0 == m) {
1892             continue;
1893         }
1894 
1895         SkPMColor s = src[i];
1896         SkPMColor d = dst[i];
1897 
1898         int srcA = SkGetPackedA32(s);
1899         int srcR = SkGetPackedR32(s);
1900         int srcG = SkGetPackedG32(s);
1901         int srcB = SkGetPackedB32(s);
1902 
1903         srcA += srcA >> 7;
1904 
1905         // We're ignoring the least significant bit of the green coverage channel here.
1906         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1907         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1908         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1909 
1910         // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
1911         maskR = upscale_31_to_255(maskR);
1912         maskG = upscale_31_to_255(maskG);
1913         maskB = upscale_31_to_255(maskB);
1914 
1915         // This LCD blit routine only works if the destination is opaque.
1916         dst[i] = SkPackARGB32(0xFF,
1917                               src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
1918                               src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
1919                               src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
1920     }
1921 }
1922 
blend_row_LCD16_opaque(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1923 static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1924     auto mask = (const uint16_t*)vmask;
1925 
1926     for (int i = 0; i < n; ++i) {
1927         uint16_t m = mask[i];
1928         if (0 == m) {
1929             continue;
1930         }
1931 
1932         SkPMColor s = src[i];
1933         SkPMColor d = dst[i];
1934 
1935         int srcR = SkGetPackedR32(s);
1936         int srcG = SkGetPackedG32(s);
1937         int srcB = SkGetPackedB32(s);
1938 
1939         // We're ignoring the least significant bit of the green coverage channel here.
1940         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1941         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1942         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1943 
1944         // Now upscale them to 0..32, so we can use blend_32.
1945         maskR = upscale_31_to_32(maskR);
1946         maskG = upscale_31_to_32(maskG);
1947         maskB = upscale_31_to_32(maskB);
1948 
1949         // This LCD blit routine only works if the destination is opaque.
1950         dst[i] = SkPackARGB32(0xFF,
1951                               blend_32(srcR, SkGetPackedR32(d), maskR),
1952                               blend_32(srcG, SkGetPackedG32(d), maskG),
1953                               blend_32(srcB, SkGetPackedB32(d), maskB));
1954     }
1955 }
1956 
blitMask(const SkMask & mask,const SkIRect & clip)1957 void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1958     SkASSERT(mask.fBounds.contains(clip));
1959 
1960     void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1961 
1962     bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1963 
1964     if (mask.fFormat == SkMask::kA8_Format && opaque) {
1965         blend_row = blend_row_A8_opaque;
1966     } else if (mask.fFormat == SkMask::kA8_Format) {
1967         blend_row = blend_row_A8;
1968     } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1969         blend_row = blend_row_LCD16_opaque;
1970     } else if (mask.fFormat == SkMask::kLCD16_Format) {
1971         blend_row = blend_row_lcd16;
1972     } else {
1973         this->INHERITED::blitMask(mask, clip);
1974         return;
1975     }
1976 
1977     const int x = clip.fLeft;
1978     const int width = clip.width();
1979     int y = clip.fTop;
1980     int height = clip.height();
1981 
1982     char* dstRow = (char*)fDevice.writable_addr32(x, y);
1983     const size_t dstRB = fDevice.rowBytes();
1984     const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1985     const size_t maskRB = mask.fRowBytes;
1986 
1987     SkPMColor* span = fBuffer;
1988     SkASSERT(blend_row);
1989     do {
1990         fShaderContext->shadeSpan(x, y, span, width);
1991         blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1992         dstRow += dstRB;
1993         maskRow += maskRB;
1994         y += 1;
1995     } while (--height > 0);
1996 }
1997 
blitV(int x,int y,int height,SkAlpha alpha)1998 void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1999     SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
2000 
2001     uint32_t* device = fDevice.writable_addr32(x, y);
2002     size_t    deviceRB = fDevice.rowBytes();
2003 
2004     if (fShadeDirectlyIntoDevice) {
2005         if (255 == alpha) {
2006             do {
2007                 fShaderContext->shadeSpan(x, y, device, 1);
2008                 y += 1;
2009                 device = (uint32_t*)((char*)device + deviceRB);
2010             } while (--height > 0);
2011         } else {
2012             do {
2013                 SkPMColor c;
2014                 fShaderContext->shadeSpan(x, y, &c, 1);
2015                 *device = SkFourByteInterp(c, *device, alpha);
2016                 y += 1;
2017                 device = (uint32_t*)((char*)device + deviceRB);
2018             } while (--height > 0);
2019         }
2020     } else {
2021         SkPMColor* span = fBuffer;
2022         SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
2023         do {
2024             fShaderContext->shadeSpan(x, y, span, 1);
2025             proc(device, span, 1, alpha);
2026             y += 1;
2027             device = (uint32_t*)((char*)device + deviceRB);
2028         } while (--height > 0);
2029     }
2030 }
2031