1 /*
2 * Copyright 2011 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "include/private/SkColorData.h"
9 #include "src/core/SkBlitRow.h"
10 #include "src/core/SkOpts.h"
11
12 // Everyone agrees memcpy() is the best way to do this.
blit_row_s32_opaque(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)13 static void blit_row_s32_opaque(SkPMColor* dst,
14 const SkPMColor* src,
15 int count,
16 U8CPU alpha) {
17 SkASSERT(255 == alpha);
18 memcpy(dst, src, count * sizeof(SkPMColor));
19 }
20
21 // We have SSE2, NEON, and portable implementations of
22 // blit_row_s32_blend() and blit_row_s32a_blend().
23
24 // TODO(mtklein): can we do better in NEON than 2 pixels at a time?
25
26 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
27 #include <emmintrin.h>
28
// Lerps four packed 32-bit pixels at once. For every 8-bit channel the result is
//     dst + (((src - dst) * src_scale) >> 8)
// with the final add wrapping modulo 256 (_mm_add_epi8).
//
//   src, dst  - four pixels each, 8 bits per channel
//   src_scale - weight applied to (src - dst); broadcast as a 16-bit multiplier
static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                    const __m128i& dst,
                                    const unsigned src_scale) {
    // Selects bytes 0 and 2 of every pixel; its complement selects bytes 1 and 3.
    const __m128i evenBytes = _mm_set1_epi32(0x00FF00FF);
    const __m128i scale     = _mm_set1_epi16(src_scale);

    // Widen each channel into its own 16-bit lane (even bytes by masking, odd bytes by
    // shifting down) and take src - dst there, so the difference can go negative.
    __m128i deltaEven = _mm_sub_epi16(_mm_and_si128(evenBytes, src),
                                      _mm_and_si128(evenBytes, dst));
    __m128i deltaOdd  = _mm_sub_epi16(_mm_srli_epi16(src, 8),
                                      _mm_srli_epi16(dst, 8));

    // Scale the differences, keeping the low 16 bits of each product.
    deltaEven = _mm_mullo_epi16(deltaEven, scale);
    deltaOdd  = _mm_mullo_epi16(deltaOdd,  scale);

    // Take the high byte of each product (the ">> 8") and interleave the two halves back
    // into byte order: even channels shift down, odd channels are already in position.
    const __m128i delta = _mm_or_si128(_mm_srli_epi16(deltaEven, 8),
                                       _mm_andnot_si128(evenBytes, deltaOdd));

    return _mm_add_epi8(dst, delta);
}
56
57
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)58 static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
59 SkASSERT(alpha <= 255);
60
61 auto src4 = (const __m128i*)src;
62 auto dst4 = ( __m128i*)dst;
63
64 while (count >= 4) {
65 _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
66 _mm_loadu_si128(dst4),
67 SkAlpha255To256(alpha)));
68 src4++;
69 dst4++;
70 count -= 4;
71 }
72
73 src = (const SkPMColor*)src4;
74 dst = ( SkPMColor*)dst4;
75
76 while (count --> 0) {
77 *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
78 src++;
79 dst++;
80 }
81 }
82
SkBlendARGB32_SSE2(const __m128i & src,const __m128i & dst,const unsigned aa)83 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
84 const __m128i& dst,
85 const unsigned aa) {
86 unsigned alpha = SkAlpha255To256(aa);
87 __m128i src_scale = _mm_set1_epi16(alpha);
88 // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
89 __m128i dst_scale = _mm_srli_epi32(src, 24);
90 // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
91 dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
92 dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
93 dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
94 dst_scale = _mm_srli_epi32(dst_scale, 8);
95 // Duplicate scales into 2x16-bit pattern per pixel.
96 dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
97 dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
98
99 const __m128i mask = _mm_set1_epi32(0x00FF00FF);
100
101 // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
102 __m128i src_rb = _mm_and_si128(mask, src);
103 __m128i src_ag = _mm_srli_epi16(src, 8);
104 __m128i dst_rb = _mm_and_si128(mask, dst);
105 __m128i dst_ag = _mm_srli_epi16(dst, 8);
106
107 // Scale them.
108 src_rb = _mm_mullo_epi16(src_rb, src_scale);
109 src_ag = _mm_mullo_epi16(src_ag, src_scale);
110 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
111 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
112
113 // Add the scaled source and destination.
114 dst_rb = _mm_add_epi16(src_rb, dst_rb);
115 dst_ag = _mm_add_epi16(src_ag, dst_ag);
116
117 // Unsplay the halves back together.
118 dst_rb = _mm_srli_epi16(dst_rb, 8);
119 dst_ag = _mm_andnot_si128(mask, dst_ag);
120 return _mm_or_si128(dst_rb, dst_ag);
121 }
122
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)123 static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
124 SkASSERT(alpha <= 255);
125
126 auto src4 = (const __m128i*)src;
127 auto dst4 = ( __m128i*)dst;
128
129 while (count >= 4) {
130 _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
131 _mm_loadu_si128(dst4),
132 alpha));
133 src4++;
134 dst4++;
135 count -= 4;
136 }
137
138 src = (const SkPMColor*)src4;
139 dst = ( SkPMColor*)dst4;
140
141 while (count --> 0) {
142 *dst = SkBlendARGB32(*src, *dst, alpha);
143 src++;
144 dst++;
145 }
146 }
147
148 #elif defined(SK_ARM_HAS_NEON)
149 #include <arm_neon.h>
150
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)151 static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
152 SkASSERT(alpha <= 255);
153
154 uint16_t src_scale = SkAlpha255To256(alpha);
155 uint16_t dst_scale = 256 - src_scale;
156
157 while (count >= 2) {
158 uint8x8_t vsrc, vdst, vres;
159 uint16x8_t vsrc_wide, vdst_wide;
160
161 vsrc = vreinterpret_u8_u32(vld1_u32(src));
162 vdst = vreinterpret_u8_u32(vld1_u32(dst));
163
164 vsrc_wide = vmovl_u8(vsrc);
165 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
166
167 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
168
169 vdst_wide += vsrc_wide;
170 vres = vshrn_n_u16(vdst_wide, 8);
171
172 vst1_u32(dst, vreinterpret_u32_u8(vres));
173
174 src += 2;
175 dst += 2;
176 count -= 2;
177 }
178
179 if (count == 1) {
180 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
181 uint16x8_t vsrc_wide, vdst_wide;
182
183 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
184 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
185
186 vsrc_wide = vmovl_u8(vsrc);
187 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
188 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
189 vdst_wide += vsrc_wide;
190 vres = vshrn_n_u16(vdst_wide, 8);
191
192 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
193 }
194 }
195
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)196 static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
197 SkASSERT(alpha < 255);
198
199 unsigned alpha256 = SkAlpha255To256(alpha);
200
201 if (count & 1) {
202 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
203 uint16x8_t vdst_wide, vsrc_wide;
204 unsigned dst_scale;
205
206 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
207 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
208
209 dst_scale = vget_lane_u8(vsrc, 3);
210 dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
211
212 vsrc_wide = vmovl_u8(vsrc);
213 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
214
215 vdst_wide = vmovl_u8(vdst);
216 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
217
218 vdst_wide += vsrc_wide;
219 vres = vshrn_n_u16(vdst_wide, 8);
220
221 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
222 dst++;
223 src++;
224 count--;
225 }
226
227 uint8x8_t alpha_mask;
228 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
229 alpha_mask = vld1_u8(alpha_mask_setup);
230
231 while (count) {
232
233 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
234 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
235
236 __builtin_prefetch(src+32);
237 __builtin_prefetch(dst+32);
238
239 vsrc = vreinterpret_u8_u32(vld1_u32(src));
240 vdst = vreinterpret_u8_u32(vld1_u32(dst));
241
242 vsrc_scale = vdupq_n_u16(alpha256);
243
244 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
245 vdst_scale = vmovl_u8(vsrc_alphas);
246 // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
247 // A 16-bit lane would overflow if we used 0xFFFF here,
248 // so use an approximation with 0xFF00 that is off by 1,
249 // and add back 1 after to get the correct value.
250 // This is valid if alpha256 <= 255.
251 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
252 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
253 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
254
255 vsrc_wide = vmovl_u8(vsrc);
256 vsrc_wide *= vsrc_scale;
257
258 vdst_wide = vmovl_u8(vdst);
259 vdst_wide *= vdst_scale;
260
261 vdst_wide += vsrc_wide;
262 vres = vshrn_n_u16(vdst_wide, 8);
263
264 vst1_u32(dst, vreinterpret_u32_u8(vres));
265
266 src += 2;
267 dst += 2;
268 count -= 2;
269 }
270 }
271
272 #else
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)273 static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
274 SkASSERT(alpha <= 255);
275 while (count --> 0) {
276 *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
277 src++;
278 dst++;
279 }
280 }
281
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)282 static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
283 SkASSERT(alpha <= 255);
284 while (count --> 0) {
285 *dst = SkBlendARGB32(*src, *dst, alpha);
286 src++;
287 dst++;
288 }
289 }
290 #endif
291
Factory32(unsigned flags)292 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
293 static const SkBlitRow::Proc32 kProcs[] = {
294 blit_row_s32_opaque,
295 blit_row_s32_blend,
296 nullptr, // blit_row_s32a_opaque is in SkOpts
297 blit_row_s32a_blend
298 };
299
300 SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
301 flags &= SK_ARRAY_COUNT(kProcs) - 1; // just to be safe
302
303 return flags == 2 ? SkOpts::blit_row_s32a_opaque
304 : kProcs[flags];
305 }
306
Color32(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)307 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
308 switch (SkGetPackedA32(color)) {
309 case 0: memmove(dst, src, count * sizeof(SkPMColor)); return;
310 case 255: sk_memset32(dst, color, count); return;
311 }
312 return SkOpts::blit_row_color32(dst, src, count, color);
313 }
314