• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7 
8 #include "include/private/SkColorData.h"
9 #include "src/core/SkBlitRow.h"
10 #include "src/core/SkOpts.h"
11 
// Everyone agrees memcpy() is the best way to do this.
blit_row_s32_opaque(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)13 static void blit_row_s32_opaque(SkPMColor* dst,
14                                 const SkPMColor* src,
15                                 int count,
16                                 U8CPU alpha) {
17     SkASSERT(255 == alpha);
18     memcpy(dst, src, count * sizeof(SkPMColor));
19 }
20 
// We have SSE2, NEON, and portable implementations of
// blit_row_s32_blend() and blit_row_s32a_blend().

// TODO(mtklein): can we do better in NEON than 2 pixels at a time?
25 
26 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
27     #include <emmintrin.h>
28 
// Interpolates every 8-bit channel of a 128-bit register from dst toward src:
//
//     result = dst + (((src - dst) * src_scale) >> 8)
//
//   src, dst  - 16 packed 8-bit channels each
//   src_scale - interpolation weight, broadcast to every 16-bit lane
//               (0 leaves dst untouched, 256 yields src)
//
// The arithmetic is done in 16-bit lanes: the even bytes of each lane pair are
// isolated with a mask, the odd bytes are shifted down into their own lanes.
static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                    const __m128i& dst,
                                    const unsigned src_scale) {
    const __m128i kEvenBytes = _mm_set1_epi32(0x00FF00FF);
    const __m128i scale      = _mm_set1_epi16(src_scale);

    // Splay the 16x8-bit inputs into two sets of 8x16-bit lanes.
    const __m128i srcEven = _mm_and_si128(kEvenBytes, src);
    const __m128i dstEven = _mm_and_si128(kEvenBytes, dst);
    const __m128i srcOdd  = _mm_srli_epi16(src, 8);
    const __m128i dstOdd  = _mm_srli_epi16(dst, 8);

    // (src - dst) * scale, keeping the low 16 bits of each product.
    const __m128i scaledEven = _mm_mullo_epi16(_mm_sub_epi16(srcEven, dstEven), scale);
    const __m128i scaledOdd  = _mm_mullo_epi16(_mm_sub_epi16(srcOdd,  dstOdd),  scale);

    // The ">> 8" result is the high byte of each product.  Even lanes shift it
    // down into place; odd lanes already hold it in the right byte, so the low
    // byte is simply masked away.
    const __m128i delta = _mm_or_si128(_mm_srli_epi16(scaledEven, 8),
                                       _mm_andnot_si128(kEvenBytes, scaledOdd));

    // Byte-wise wrapping add applies the (possibly negative) delta to dst.
    return _mm_add_epi8(dst, delta);
}
56 
57 
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)58     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
59         SkASSERT(alpha <= 255);
60 
61         auto src4 = (const __m128i*)src;
62         auto dst4 = (      __m128i*)dst;
63 
64         while (count >= 4) {
65             _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
66                                                  _mm_loadu_si128(dst4),
67                                                  SkAlpha255To256(alpha)));
68             src4++;
69             dst4++;
70             count -= 4;
71         }
72 
73         src = (const SkPMColor*)src4;
74         dst = (      SkPMColor*)dst4;
75 
76         while (count --> 0) {
77             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
78             src++;
79             dst++;
80         }
81     }
82 
SkBlendARGB32_SSE2(const __m128i & src,const __m128i & dst,const unsigned aa)83     static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
84                                              const __m128i& dst,
85                                              const unsigned aa) {
86         unsigned alpha = SkAlpha255To256(aa);
87         __m128i src_scale = _mm_set1_epi16(alpha);
88         // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
89         __m128i dst_scale = _mm_srli_epi32(src, 24);
90         // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
91         dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
92         dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
93         dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
94         dst_scale = _mm_srli_epi32(dst_scale, 8);
95         // Duplicate scales into 2x16-bit pattern per pixel.
96         dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
97         dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
98 
99         const __m128i mask = _mm_set1_epi32(0x00FF00FF);
100 
101         // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
102         __m128i src_rb = _mm_and_si128(mask, src);
103         __m128i src_ag = _mm_srli_epi16(src, 8);
104         __m128i dst_rb = _mm_and_si128(mask, dst);
105         __m128i dst_ag = _mm_srli_epi16(dst, 8);
106 
107         // Scale them.
108         src_rb = _mm_mullo_epi16(src_rb, src_scale);
109         src_ag = _mm_mullo_epi16(src_ag, src_scale);
110         dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
111         dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
112 
113         // Add the scaled source and destination.
114         dst_rb = _mm_add_epi16(src_rb, dst_rb);
115         dst_ag = _mm_add_epi16(src_ag, dst_ag);
116 
117         // Unsplay the halves back together.
118         dst_rb = _mm_srli_epi16(dst_rb, 8);
119         dst_ag = _mm_andnot_si128(mask, dst_ag);
120         return _mm_or_si128(dst_rb, dst_ag);
121     }
122 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)123     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
124         SkASSERT(alpha <= 255);
125 
126         auto src4 = (const __m128i*)src;
127         auto dst4 = (      __m128i*)dst;
128 
129         while (count >= 4) {
130             _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
131                                                       _mm_loadu_si128(dst4),
132                                                       alpha));
133             src4++;
134             dst4++;
135             count -= 4;
136         }
137 
138         src = (const SkPMColor*)src4;
139         dst = (      SkPMColor*)dst4;
140 
141         while (count --> 0) {
142             *dst = SkBlendARGB32(*src, *dst, alpha);
143             src++;
144             dst++;
145         }
146     }
147 
148 #elif defined(SK_ARM_HAS_NEON)
149     #include <arm_neon.h>
150 
// NEON row blend with a single global weight.  Every 8-bit channel becomes
//
//     (src * src_scale + dst * dst_scale) >> 8
//
// where src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
//
//   dst   - destination row, read and written in place
//   src   - source row
//   count - number of pixels in the row
//   alpha - global alpha, asserted to be <= 255
//
// Two pixels are processed per iteration; a trailing odd pixel is handled
// through lane 0 of a zeroed vector.
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    const uint16_t src_scale = SkAlpha255To256(alpha);
    const uint16_t dst_scale = 256 - src_scale;

    // Broadcast both weights once.  The src weight needs 16-bit lanes; the dst
    // weight is broadcast as 8-bit for the widening multiply.
    const uint16x8_t srcWeights = vdupq_n_u16(src_scale);
    const uint8x8_t  dstWeights = vdup_n_u8(dst_scale);

    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        const uint8x8_t srcPx = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dstPx = vreinterpret_u8_u32(vld1_u32(dst));

        const uint16x8_t srcWide = vmulq_u16(vmovl_u8(srcPx), srcWeights);
        const uint16x8_t dstWide = vmull_u8(dstPx, dstWeights);

        // Sum in 16 bits, then narrow back to 8 bits with the >> 8.
        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dstWide, srcWide), 8);
        vst1_u32(dst, vreinterpret_u32_u8(blended));
    }

    if (count == 1) {
        // Only lane 0 is loaded and stored; the other lane stays zero.
        const uint32x2_t zero = vreinterpret_u32_u8(vdup_n_u8(0));
        const uint8x8_t srcPx = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t dstPx = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));

        const uint16x8_t srcWide = vmulq_u16(vmovl_u8(srcPx), srcWeights);
        const uint16x8_t dstWide = vmull_u8(dstPx, dstWeights);

        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dstWide, srcWide), 8);
        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);
    }
}
195 
// NEON row blend with a global weight and a per-pixel destination weight.
// Every 8-bit channel becomes
//
//     (src * alpha256 + dst * dst_scale) >> 8
//
// where alpha256 = SkAlpha255To256(alpha) and dst_scale is
// SkAlphaMulInv256(byte 3 of the src pixel, alpha256).
//
//   dst   - destination row, read and written in place
//   src   - source row
//   count - number of pixels in the row
//   alpha - global alpha; asserted to be strictly less than 255, because the
//           vector approximation of SkAlphaMulInv256 in the main loop is only
//           valid if alpha256 <= 255
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha < 255);

    const unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel one pixel off an odd-length row so the main loop only sees pairs.
    if (count & 1) {
        const uint32x2_t zero = vreinterpret_u32_u8(vdup_n_u8(0));
        const uint8x8_t srcPx = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t dstPx = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));

        // Byte 3 of the pixel feeds the scalar SkAlphaMulInv256().
        const unsigned srcTopByte = vget_lane_u8(srcPx, 3);
        const unsigned dstScale   = SkAlphaMulInv256(srcTopByte, alpha256);

        const uint16x8_t srcWide = vmulq_n_u16(vmovl_u8(srcPx), alpha256);
        const uint16x8_t dstWide = vmulq_n_u16(vmovl_u8(dstPx), dstScale);

        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dstWide, srcWide), 8);
        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);

        dst++;
        src++;
        count--;
    }

    // Table-lookup indices that replicate byte 3 of the first pixel and byte 7
    // (byte 3 of the second pixel) across that pixel's four channels.
    static const uint8_t kTopByteLanes[] = {3,3,3,3,7,7,7,7};
    const uint8x8_t  topByteLanes = vld1_u8(kTopByteLanes);
    const uint16x8_t srcWeights   = vdupq_n_u16(alpha256);

    while (count) {
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        const uint8x8_t srcPx = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dstPx = vreinterpret_u8_u32(vld1_u32(dst));

        // Calculate SkAlphaMulInv256(per-pixel top byte, alpha256).
        // A 16-bit lane would overflow if we used 0xFFFF here,
        // so use an approximation with 0xFF00 that is off by 1,
        // and add back 1 after to get the correct value.
        // This is valid if alpha256 <= 255.
        uint16x8_t dstWeights = vmovl_u8(vtbl1_u8(srcPx, topByteLanes));
        dstWeights = vmlsq_u16(vdupq_n_u16(0xFF00), dstWeights, srcWeights);
        dstWeights = vsraq_n_u16(dstWeights, dstWeights, 8);
        dstWeights = vsraq_n_u16(vdupq_n_u16(1), dstWeights, 8);

        const uint16x8_t srcWide = vmulq_u16(vmovl_u8(srcPx), srcWeights);
        const uint16x8_t dstWide = vmulq_u16(vmovl_u8(dstPx), dstWeights);

        // Sum in 16 bits, then narrow back to 8 bits with the >> 8.
        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dstWide, srcWide), 8);
        vst1_u32(dst, vreinterpret_u32_u8(blended));

        src += 2;
        dst += 2;
        count -= 2;
    }
}
271 
272 #else
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)273     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
274         SkASSERT(alpha <= 255);
275         while (count --> 0) {
276             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
277             src++;
278             dst++;
279         }
280     }
281 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)282     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
283         SkASSERT(alpha <= 255);
284         while (count --> 0) {
285             *dst = SkBlendARGB32(*src, *dst, alpha);
286             src++;
287             dst++;
288         }
289     }
290 #endif
291 
Factory32(unsigned flags)292 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
293     static const SkBlitRow::Proc32 kProcs[] = {
294         blit_row_s32_opaque,
295         blit_row_s32_blend,
296         nullptr,  // blit_row_s32a_opaque is in SkOpts
297         blit_row_s32a_blend
298     };
299 
300     SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
301     flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe
302 
303     return flags == 2 ? SkOpts::blit_row_s32a_opaque
304                       : kProcs[flags];
305 }
306 
Color32(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)307 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
308     switch (SkGetPackedA32(color)) {
309         case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
310         case 255: sk_memset32(dst, color, count);               return;
311     }
312     return SkOpts::blit_row_color32(dst, src, count, color);
313 }
314