• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkColor_opts_SSE2.h"
13 #include "SkDither.h"
14 #include "SkMSAN.h"
15 #include "SkUtils.h"
16 
17 /* SSE2 version of S32_Blend_BlitRow32()
18  * portable version is in core/SkBlitRow_D32.cpp
19  */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                               const SkPMColor* SK_RESTRICT src,
22                               int count, U8CPU alpha) {
23     SkASSERT(alpha <= 255);
24     if (count <= 0) {
25         return;
26     }
27 
28     uint32_t src_scale = SkAlpha255To256(alpha);
29 
30     if (count >= 4) {
31         SkASSERT(((size_t)dst & 0x03) == 0);
32         while (((size_t)dst & 0x0F) != 0) {
33             *dst = SkPMLerp(*src, *dst, src_scale);
34             src++;
35             dst++;
36             count--;
37         }
38 
39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
40         __m128i *d = reinterpret_cast<__m128i*>(dst);
41 
42         while (count >= 4) {
43             // Load 4 pixels each of src and dest.
44             __m128i src_pixel = _mm_loadu_si128(s);
45             __m128i dst_pixel = _mm_load_si128(d);
46 
47             __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
48             _mm_store_si128(d, result);
49             s++;
50             d++;
51             count -= 4;
52         }
53         src = reinterpret_cast<const SkPMColor*>(s);
54         dst = reinterpret_cast<SkPMColor*>(d);
55     }
56 
57     while (count > 0) {
58         *dst = SkPMLerp(*src, *dst, src_scale);
59         src++;
60         dst++;
61         count--;
62     }
63 }
64 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)65 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66                                const SkPMColor* SK_RESTRICT src,
67                                int count, U8CPU alpha) {
68     SkASSERT(alpha <= 255);
69     if (count <= 0) {
70         return;
71     }
72 
73     if (count >= 4) {
74         while (((size_t)dst & 0x0F) != 0) {
75             *dst = SkBlendARGB32(*src, *dst, alpha);
76             src++;
77             dst++;
78             count--;
79         }
80 
81         const __m128i *s = reinterpret_cast<const __m128i*>(src);
82         __m128i *d = reinterpret_cast<__m128i*>(dst);
83         while (count >= 4) {
84             // Load 4 pixels each of src and dest.
85             __m128i src_pixel = _mm_loadu_si128(s);
86             __m128i dst_pixel = _mm_load_si128(d);
87 
88             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
89             _mm_store_si128(d, result);
90             s++;
91             d++;
92             count -= 4;
93         }
94         src = reinterpret_cast<const SkPMColor*>(s);
95         dst = reinterpret_cast<SkPMColor*>(d);
96     }
97 
98     while (count > 0) {
99         *dst = SkBlendARGB32(*src, *dst, alpha);
100         src++;
101         dst++;
102         count--;
103     }
104 }
105 
// Shift amounts that move the top 5 bits of each channel of a 16-bit mask
// onto the position of the matching channel in an SkPMColor. A positive
// value is a left shift; a negative value means the channel has to move
// right by that magnitude instead.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Per-channel helpers applying the shift above to every 32-bit lane of an
// SSE2 register. The result is "unmasked": bits belonging to other channels
// are still present and must be masked off by the caller.

// Red.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
136 
SkBlendLCD16_SSE2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)137 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
138                                  __m128i &mask, __m128i &srcA) {
139     // In the following comments, the components of src, dst and mask are
140     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
141     // by an R, G, B, or A suffix. Components of one of the four pixels that
142     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
143     // example is the blue channel of the second destination pixel. Memory
144     // layout is shown for an ARGB byte order in a color value.
145 
146     // src and srcA store 8-bit values interleaved with zeros.
147     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
148     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
149     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
150     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
151     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
152     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
153     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
154 
155     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
156     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
157     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
158                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
159 
160     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
161     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
162                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
163 
164     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
165     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
166                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
167 
168     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
169     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
170     // 8-bit position
171     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
172     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
173     mask = _mm_or_si128(_mm_or_si128(r, g), b);
174 
175     // Interleave R,G,B into the lower byte of word.
176     // i.e. split the sixteen 8-bit values from mask into two sets of eight
177     // 16-bit values, padded by zero.
178     __m128i maskLo, maskHi;
179     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
180     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
181     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
182     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
183 
184     // Upscale from 0..31 to 0..32
185     // (allows to replace division by left-shift further down)
186     // Left-shift each component by 4 and add the result back to that component,
187     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
188     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
189     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
190 
191     // Multiply each component of maskLo and maskHi by srcA
192     maskLo = _mm_mullo_epi16(maskLo, srcA);
193     maskHi = _mm_mullo_epi16(maskHi, srcA);
194 
195     // Left shift mask components by 8 (divide by 256)
196     maskLo = _mm_srli_epi16(maskLo, 8);
197     maskHi = _mm_srli_epi16(maskHi, 8);
198 
199     // Interleave R,G,B into the lower byte of the word
200     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
201     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
202     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
203     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
204 
205     // mask = (src - dst) * mask
206     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
207     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
208 
209     // mask = (src - dst) * mask >> 5
210     maskLo = _mm_srai_epi16(maskLo, 5);
211     maskHi = _mm_srai_epi16(maskHi, 5);
212 
213     // Add two pixels into result.
214     // result = dst + ((src - dst) * mask >> 5)
215     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
216     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
217 
218     // Pack into 4 32bit dst pixels.
219     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
220     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
221     // clamping to 255 if necessary.
222     return _mm_packus_epi16(resultLo, resultHi);
223 }
224 
SkBlendLCD16Opaque_SSE2(__m128i & src,__m128i & dst,__m128i & mask)225 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
226                                        __m128i &mask) {
227     // In the following comments, the components of src, dst and mask are
228     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
229     // by an R, G, B, or A suffix. Components of one of the four pixels that
230     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
231     // example is the blue channel of the second destination pixel. Memory
232     // layout is shown for an ARGB byte order in a color value.
233 
234     // src and srcA store 8-bit values interleaved with zeros.
235     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
236     // mask stores 16-bit values (shown as high and low bytes) interleaved with
237     // zeros
238     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
239     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
240 
241     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
242     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
243     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
244                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
245 
246     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
247     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
248                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
249 
250     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
251     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
252                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
253 
254     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
255     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
256     // 8-bit position
257     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
258     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
259     mask = _mm_or_si128(_mm_or_si128(r, g), b);
260 
261     // Interleave R,G,B into the lower byte of word.
262     // i.e. split the sixteen 8-bit values from mask into two sets of eight
263     // 16-bit values, padded by zero.
264     __m128i maskLo, maskHi;
265     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
266     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
267     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
268     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
269 
270     // Upscale from 0..31 to 0..32
271     // (allows to replace division by left-shift further down)
272     // Left-shift each component by 4 and add the result back to that component,
273     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
274     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
275     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
276 
277     // Interleave R,G,B into the lower byte of the word
278     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
279     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
280     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
281     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
282 
283     // mask = (src - dst) * mask
284     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
285     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
286 
287     // mask = (src - dst) * mask >> 5
288     maskLo = _mm_srai_epi16(maskLo, 5);
289     maskHi = _mm_srai_epi16(maskHi, 5);
290 
291     // Add two pixels into result.
292     // result = dst + ((src - dst) * mask >> 5)
293     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
294     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
295 
296     // Pack into 4 32bit dst pixels and force opaque.
297     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
298     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
299     // clamping to 255 if necessary. Set alpha components to 0xFF.
300     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
301                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
302 }
303 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)304 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
305                          SkColor src, int width, SkPMColor) {
306     if (width <= 0) {
307         return;
308     }
309 
310     int srcA = SkColorGetA(src);
311     int srcR = SkColorGetR(src);
312     int srcG = SkColorGetG(src);
313     int srcB = SkColorGetB(src);
314 
315     srcA = SkAlpha255To256(srcA);
316 
317     if (width >= 4) {
318         SkASSERT(((size_t)dst & 0x03) == 0);
319         while (((size_t)dst & 0x0F) != 0) {
320             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
321             mask++;
322             dst++;
323             width--;
324         }
325 
326         __m128i *d = reinterpret_cast<__m128i*>(dst);
327         // Set alpha to 0xFF and replicate source four times in SSE register.
328         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
329         // Interleave with zeros to get two sets of four 16-bit values.
330         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
331         // Set srcA_sse to contain eight copies of srcA, padded with zero.
332         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
333         __m128i srcA_sse = _mm_set1_epi16(srcA);
334         while (width >= 4) {
335             // Load four destination pixels into dst_sse.
336             __m128i dst_sse = _mm_load_si128(d);
337             // Load four 16-bit masks into lower half of mask_sse.
338             __m128i mask_sse = _mm_loadl_epi64(
339                                    reinterpret_cast<const __m128i*>(mask));
340 
341             // Check whether masks are equal to 0 and get the highest bit
342             // of each byte of result, if masks are all zero, we will get
343             // pack_cmp to 0xFFFF
344             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
345                                              _mm_setzero_si128()));
346 
347             // if mask pixels are not all zero, we will blend the dst pixels
348             if (pack_cmp != 0xFFFF) {
349                 // Unpack 4 16bit mask pixels to
350                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
351                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
352                 mask_sse = _mm_unpacklo_epi16(mask_sse,
353                                               _mm_setzero_si128());
354 
355                 // Process 4 32bit dst pixels
356                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
357                                                    mask_sse, srcA_sse);
358                 _mm_store_si128(d, result);
359             }
360 
361             d++;
362             mask += 4;
363             width -= 4;
364         }
365 
366         dst = reinterpret_cast<SkPMColor*>(d);
367     }
368 
369     while (width > 0) {
370         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
371         mask++;
372         dst++;
373         width--;
374     }
375 }
376 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)377 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
378                                SkColor src, int width, SkPMColor opaqueDst) {
379     if (width <= 0) {
380         return;
381     }
382 
383     int srcR = SkColorGetR(src);
384     int srcG = SkColorGetG(src);
385     int srcB = SkColorGetB(src);
386 
387     if (width >= 4) {
388         SkASSERT(((size_t)dst & 0x03) == 0);
389         while (((size_t)dst & 0x0F) != 0) {
390             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
391             mask++;
392             dst++;
393             width--;
394         }
395 
396         __m128i *d = reinterpret_cast<__m128i*>(dst);
397         // Set alpha to 0xFF and replicate source four times in SSE register.
398         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
399         // Set srcA_sse to contain eight copies of srcA, padded with zero.
400         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
401         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
402         while (width >= 4) {
403             // Load four destination pixels into dst_sse.
404             __m128i dst_sse = _mm_load_si128(d);
405             // Load four 16-bit masks into lower half of mask_sse.
406             __m128i mask_sse = _mm_loadl_epi64(
407                                    reinterpret_cast<const __m128i*>(mask));
408 
409             // Check whether masks are equal to 0 and get the highest bit
410             // of each byte of result, if masks are all zero, we will get
411             // pack_cmp to 0xFFFF
412             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
413                                              _mm_setzero_si128()));
414 
415             // if mask pixels are not all zero, we will blend the dst pixels
416             if (pack_cmp != 0xFFFF) {
417                 // Unpack 4 16bit mask pixels to
418                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
419                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
420                 mask_sse = _mm_unpacklo_epi16(mask_sse,
421                                               _mm_setzero_si128());
422 
423                 // Process 4 32bit dst pixels
424                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
425                                                          mask_sse);
426                 _mm_store_si128(d, result);
427             }
428 
429             d++;
430             mask += 4;
431             width -= 4;
432         }
433 
434         dst = reinterpret_cast<SkPMColor*>(d);
435     }
436 
437     while (width > 0) {
438         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
439         mask++;
440         dst++;
441         width--;
442     }
443 }
444