• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2011 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "Sk4px.h"
9 #include "SkBlitRow.h"
10 #include "SkColorData.h"
11 #include "SkOpts.h"
12 #include "SkUtils.h"
13 
14 // Everyone agrees memcpy() is the best way to do this.
blit_row_s32_opaque(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)15 static void blit_row_s32_opaque(SkPMColor* dst,
16                                 const SkPMColor* src,
17                                 int count,
18                                 U8CPU alpha) {
19     SkASSERT(255 == alpha);
20     memcpy(dst, src, count * sizeof(SkPMColor));
21 }
22 
23 // We have SSE2, NEON, and portable implementations of
24 // blit_row_s32_blend() and blit_row_s32a_blend().
25 
26 // TODO(mtklein): can we do better in NEON than 2 pixels at a time?
27 
28 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
29     #include <emmintrin.h>
30 
SkPMLerp_SSE2(const __m128i & src,const __m128i & dst,const unsigned src_scale)31     static inline __m128i SkPMLerp_SSE2(const __m128i& src,
32                                         const __m128i& dst,
33                                         const unsigned src_scale) {
34         // Computes dst + (((src - dst)*src_scale)>>8)
35         const __m128i mask = _mm_set1_epi32(0x00FF00FF);
36 
37         // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
38         __m128i src_rb = _mm_and_si128(mask, src);
39         __m128i src_ag = _mm_srli_epi16(src, 8);
40         __m128i dst_rb = _mm_and_si128(mask, dst);
41         __m128i dst_ag = _mm_srli_epi16(dst, 8);
42 
43         // Compute scaled differences.
44         __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
45         __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
46         __m128i s = _mm_set1_epi16(src_scale);
47         diff_rb = _mm_mullo_epi16(diff_rb, s);
48         diff_ag = _mm_mullo_epi16(diff_ag, s);
49 
50         // Pack the differences back together.
51         diff_rb = _mm_srli_epi16(diff_rb, 8);
52         diff_ag = _mm_andnot_si128(mask, diff_ag);
53         __m128i diff = _mm_or_si128(diff_rb, diff_ag);
54 
55         // Add difference to destination.
56         return _mm_add_epi8(dst, diff);
57     }
58 
59 
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)60     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
61         SkASSERT(alpha <= 255);
62 
63         auto src4 = (const __m128i*)src;
64         auto dst4 = (      __m128i*)dst;
65 
66         while (count >= 4) {
67             _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
68                                                  _mm_loadu_si128(dst4),
69                                                  SkAlpha255To256(alpha)));
70             src4++;
71             dst4++;
72             count -= 4;
73         }
74 
75         src = (const SkPMColor*)src4;
76         dst = (      SkPMColor*)dst4;
77 
78         while (count --> 0) {
79             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
80             src++;
81             dst++;
82         }
83     }
84 
SkBlendARGB32_SSE2(const __m128i & src,const __m128i & dst,const unsigned aa)85     static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
86                                              const __m128i& dst,
87                                              const unsigned aa) {
88         unsigned alpha = SkAlpha255To256(aa);
89         __m128i src_scale = _mm_set1_epi16(alpha);
90         // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
91         __m128i dst_scale = _mm_srli_epi32(src, 24);
92         // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
93         dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
94         dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
95         dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
96         dst_scale = _mm_srli_epi32(dst_scale, 8);
97         // Duplicate scales into 2x16-bit pattern per pixel.
98         dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
99         dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
100 
101         const __m128i mask = _mm_set1_epi32(0x00FF00FF);
102 
103         // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
104         __m128i src_rb = _mm_and_si128(mask, src);
105         __m128i src_ag = _mm_srli_epi16(src, 8);
106         __m128i dst_rb = _mm_and_si128(mask, dst);
107         __m128i dst_ag = _mm_srli_epi16(dst, 8);
108 
109         // Scale them.
110         src_rb = _mm_mullo_epi16(src_rb, src_scale);
111         src_ag = _mm_mullo_epi16(src_ag, src_scale);
112         dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
113         dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
114 
115         // Add the scaled source and destination.
116         dst_rb = _mm_add_epi16(src_rb, dst_rb);
117         dst_ag = _mm_add_epi16(src_ag, dst_ag);
118 
119         // Unsplay the halves back together.
120         dst_rb = _mm_srli_epi16(dst_rb, 8);
121         dst_ag = _mm_andnot_si128(mask, dst_ag);
122         return _mm_or_si128(dst_rb, dst_ag);
123     }
124 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)125     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
126         SkASSERT(alpha <= 255);
127 
128         auto src4 = (const __m128i*)src;
129         auto dst4 = (      __m128i*)dst;
130 
131         while (count >= 4) {
132             _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
133                                                       _mm_loadu_si128(dst4),
134                                                       alpha));
135             src4++;
136             dst4++;
137             count -= 4;
138         }
139 
140         src = (const SkPMColor*)src4;
141         dst = (      SkPMColor*)dst4;
142 
143         while (count --> 0) {
144             *dst = SkBlendARGB32(*src, *dst, alpha);
145             src++;
146             dst++;
147         }
148     }
149 
150 #elif defined(SK_ARM_HAS_NEON)
151     #include <arm_neon.h>
152 
// NEON row proc: for every 8-bit channel of every pixel,
//
//     dst = (src * src_scale + dst * (256 - src_scale)) >> 8
//
// where src_scale = SkAlpha255To256(alpha). Pixels are processed two at a time;
// an odd trailing pixel is handled through single-lane load/store.
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    const uint16_t srcScale = SkAlpha255To256(alpha);
    const uint16_t dstScale = 256 - srcScale;

    // Blends eight 8-bit channels (two pixels) held in 64-bit vectors.
    auto blend = [srcScale, dstScale](uint8x8_t s, uint8x8_t d) -> uint8x8_t {
        const uint16x8_t scaledSrc = vmulq_u16(vmovl_u8(s), vdupq_n_u16(srcScale));
        const uint16x8_t scaledDst = vmull_u8(d, vdup_n_u8(dstScale));
        return vshrn_n_u16(vaddq_u16(scaledDst, scaledSrc), 8);
    };

    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        const uint8x8_t s = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t d = vreinterpret_u8_u32(vld1_u32(dst));
        vst1_u32(dst, vreinterpret_u32_u8(blend(s, d)));
    }

    if (count == 1) {
        // Load the last pixel into lane 0 of a zeroed vector, blend, and write
        // back only lane 0 so nothing past the end of the row is touched.
        const uint32x2_t zero = vdup_n_u32(0);
        const uint8x8_t s = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t d = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));
        vst1_lane_u32(dst, vreinterpret_u32_u8(blend(s, d)), 0);
    }
}
197 
// NEON row proc: for every 8-bit channel of every pixel,
//
//     dst = (src * alpha256 + dst * dst_scale) >> 8
//
// where alpha256 = SkAlpha255To256(alpha) and dst_scale is
// SkAlphaMulInv256(byte 3 of the src pixel, alpha256).
//
// alpha must be strictly less than 255: the vector path below relies on
// alpha256 <= 255 (see the comment at the dst_scale computation).
// A count <= 0 is a no-op.
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha < 255);

    // Treat an empty or negative count as "nothing to do", like the SSE2 and
    // portable implementations in this file. Without this guard a negative
    // count would never reach zero in the pair loop below.
    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel off one pixel when count is odd so the main loop can work in pairs.
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Single-lane loads: only one pixel is read from each row.
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Byte 3 of the source pixel drives the destination scale.
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    // Table-lookup indices that replicate byte 3 across the first pixel's four
    // channels and byte 7 across the second pixel's four channels.
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    // count is even here; process two pixels per iteration.
    while (count > 0) {

        uint8x8_t vsrc, vdst, vres, vsrc_alphas;
        uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        vsrc_scale = vdupq_n_u16(alpha256);

        vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
        vdst_scale = vmovl_u8(vsrc_alphas);
        // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
        // A 16-bit lane would overflow if we used 0xFFFF here,
        // so use an approximation with 0xFF00 that is off by 1,
        // and add back 1 after to get the correct value.
        // This is valid if alpha256 <= 255.
        vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
        vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
        vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide *= vsrc_scale;

        vdst_wide = vmovl_u8(vdst);
        vdst_wide *= vdst_scale;

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }
}
273 
274 #else
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)275     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
276         SkASSERT(alpha <= 255);
277         while (count --> 0) {
278             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
279             src++;
280             dst++;
281         }
282     }
283 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)284     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
285         SkASSERT(alpha <= 255);
286         while (count --> 0) {
287             *dst = SkBlendARGB32(*src, *dst, alpha);
288             src++;
289             dst++;
290         }
291     }
292 #endif
293 
Factory32(unsigned flags)294 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
295     static const SkBlitRow::Proc32 kProcs[] = {
296         blit_row_s32_opaque,
297         blit_row_s32_blend,
298         nullptr,  // blit_row_s32a_opaque is in SkOpts
299         blit_row_s32a_blend
300     };
301 
302     SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
303     flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe
304 
305     return flags == 2 ? SkOpts::blit_row_s32a_opaque
306                       : kProcs[flags];
307 }
308 
Color32(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)309 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
310     switch (SkGetPackedA32(color)) {
311         case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
312         case 255: sk_memset32(dst, color, count);               return;
313     }
314 
315     unsigned invA = 255 - SkGetPackedA32(color);
316     invA += invA >> 7;
317     SkASSERT(invA < 256);  // We've should have already handled alpha == 0 externally.
318 
319     Sk16h colorHighAndRound = (Sk4px::DupPMColor(color).widen() << 8) + Sk16h(128);
320     Sk16b invA_16x(invA);
321 
322     Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
323         return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
324     });
325 }
326