/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "Sk4px.h"
#include "SkBlitRow.h"
#include "SkColorData.h"
#include "SkOpts.h"
#include "SkUtils.h"

// Everyone agrees memcpy() is the best way to do this.
static void blit_row_s32_opaque(SkPMColor* dst,
                                const SkPMColor* src,
                                int count,
                                U8CPU alpha) {
    SkASSERT(255 == alpha);
    memcpy(dst, src, count * sizeof(SkPMColor));
}

// We have SSE2, NEON, and portable implementations of
// blit_row_s32_blend() and blit_row_s32a_blend().

// TODO(mtklein): can we do better in NEON than 2 pixels at a time?

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <emmintrin.h>

static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                    const __m128i& dst,
                                    const unsigned src_scale) {
    // Computes dst + (((src - dst) * src_scale) >> 8).
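    // E.g. for one channel with src = 200, dst = 100, src_scale = 128:
    // 100 + (((200 - 100) * 128) >> 8) = 100 + 50 = 150, the halfway point.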
    const __m128i mask = _mm_set1_epi32(0x00FF00FF);

    // Unpack the 16x8-bit source and destination into 2 8x16-bit splayed halves each.
    __m128i src_rb = _mm_and_si128(mask, src);
    __m128i src_ag = _mm_srli_epi16(src, 8);
    __m128i dst_rb = _mm_and_si128(mask, dst);
    __m128i dst_ag = _mm_srli_epi16(dst, 8);
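    // In memory terms: pixel 0xAARRGGBB splits into 0x00RR00BB (rb) and
    // 0x00AA00GG (ag), so every 8-bit channel sits in its own 16-bit lane
    // and can be scaled without spilling into its neighbor.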

    // Compute scaled differences.
    __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
    __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
    __m128i s = _mm_set1_epi16(src_scale);
    diff_rb = _mm_mullo_epi16(diff_rb, s);
    diff_ag = _mm_mullo_epi16(diff_ag, s);

    // Pack the differences back together.
    diff_rb = _mm_srli_epi16(diff_rb, 8);
    diff_ag = _mm_andnot_si128(mask, diff_ag);
    __m128i diff = _mm_or_si128(diff_rb, diff_ag);

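    // When src < dst a difference is negative, but the logical shift and the
    // byte-wise add below still wrap to the right answer modulo 256, since
    // the true per-channel result always fits in 8 bits.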
    // Add difference to destination.
    return _mm_add_epi8(dst, diff);
}

static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    auto src4 = (const __m128i*)src;
    auto dst4 = (      __m128i*)dst;

    while (count >= 4) {
        _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
                                             _mm_loadu_si128(dst4),
                                             SkAlpha255To256(alpha)));
        src4++;
        dst4++;
        count -= 4;
    }

    src = (const SkPMColor*)src4;
    dst = (      SkPMColor*)dst4;

    while (count --> 0) {
        *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
        src++;
        dst++;
    }
}

static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
                                         const __m128i& dst,
                                         const unsigned aa) {
    unsigned alpha = SkAlpha255To256(aa);
    __m128i src_scale = _mm_set1_epi16(alpha);
    // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
    __m128i dst_scale = _mm_srli_epi32(src, 24);
    // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
    dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
    dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
    dst_scale = _mm_srli_epi32(dst_scale, 8);
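    // E.g. an opaque source pixel at aa = 255: src_scale = 256, so
    // 0xFFFF - 255*256 = 255 and (255 + (255 >> 8)) >> 8 = 0, leaving
    // no contribution from the destination.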
    // Duplicate scales into 2x16-bit pattern per pixel.
    dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
    dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));

    const __m128i mask = _mm_set1_epi32(0x00FF00FF);

    // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
    __m128i src_rb = _mm_and_si128(mask, src);
    __m128i src_ag = _mm_srli_epi16(src, 8);
    __m128i dst_rb = _mm_and_si128(mask, dst);
    __m128i dst_ag = _mm_srli_epi16(dst, 8);

    // Scale them.
    src_rb = _mm_mullo_epi16(src_rb, src_scale);
    src_ag = _mm_mullo_epi16(src_ag, src_scale);
    dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
    dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);

    // Add the scaled source and destination.
    dst_rb = _mm_add_epi16(src_rb, dst_rb);
    dst_ag = _mm_add_epi16(src_ag, dst_ag);

    // Unsplay the halves back together.
    dst_rb = _mm_srli_epi16(dst_rb, 8);
    dst_ag = _mm_andnot_si128(mask, dst_ag);
    return _mm_or_si128(dst_rb, dst_ag);
}

static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    auto src4 = (const __m128i*)src;
    auto dst4 = (      __m128i*)dst;

    while (count >= 4) {
        _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
                                                  _mm_loadu_si128(dst4),
                                                  alpha));
        src4++;
        dst4++;
        count -= 4;
    }

    src = (const SkPMColor*)src4;
    dst = (      SkPMColor*)dst4;

    while (count --> 0) {
        *dst = SkBlendARGB32(*src, *dst, alpha);
        src++;
        dst++;
    }
}

#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>

static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;
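    // E.g. alpha = 127 gives src_scale = 128 and dst_scale = 128, an even
    // 50/50 blend of source and destination.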

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
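        // vshrn_n_u16 does the final >>8 and narrows each channel back to
        // 8 bits in a single instruction.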

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha < 255);

    unsigned alpha256 = SkAlpha255To256(alpha);

    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

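    // The index pattern {3,3,3,3,7,7,7,7} makes vtbl1_u8 broadcast each
    // pixel's alpha byte (byte 3 of pixel 0, byte 7 of pixel 1) across all
    // four of its channels.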
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    while (count) {
        uint8x8_t vsrc, vdst, vres, vsrc_alphas;
        uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        vsrc_scale = vdupq_n_u16(alpha256);

        vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
        vdst_scale = vmovl_u8(vsrc_alphas);
        // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
        // A 16-bit lane would overflow if we used 0xFFFF here,
        // so use an approximation with 0xFF00 that is off by 1,
        // and add back 1 after to get the correct value.
        // This is valid if alpha256 <= 255.
        vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
        vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
        vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
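        // E.g. a source pixel with alpha 0 and alpha256 = 255:
        // 0xFF00 - 0 = 0xFF00, then 0xFF00 + (0xFF00 >> 8) = 0xFFFF, and
        // 1 + (0xFFFF >> 8) = 256, matching SkAlphaMulInv256(0, 255).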

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide *= vsrc_scale;

        vdst_wide = vmovl_u8(vdst);
        vdst_wide *= vdst_scale;

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }
}

#else
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    while (count --> 0) {
        *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
        src++;
        dst++;
    }
}

static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    while (count --> 0) {
        *dst = SkBlendARGB32(*src, *dst, alpha);
        src++;
        dst++;
    }
}
#endif

SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
    static const SkBlitRow::Proc32 kProcs[] = {
        blit_row_s32_opaque,
        blit_row_s32_blend,
        nullptr,  // blit_row_s32a_opaque is in SkOpts
        blit_row_s32a_blend
    };

    SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
    flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe
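    // kProcs has a power-of-two entry count, so the mask above keeps any
    // out-of-range flags within a valid index.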

    return flags == 2 ? SkOpts::blit_row_s32a_opaque
                      : kProcs[flags];
}

void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
    switch (SkGetPackedA32(color)) {
        case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
        case 255: sk_memset32(dst, color, count);               return;
    }

    unsigned invA = 255 - SkGetPackedA32(color);
    invA += invA >> 7;
    SkASSERT(invA < 256);  // We've already handled alpha == 0 above.

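    // Each channel becomes color + ((src * invA + 128) >> 8): the color is
    // pre-shifted into the high byte so the add-and-narrow step below can
    // fold in the rounding bias and the final >>8 at once.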
    Sk16h colorHighAndRound = (Sk4px::DupPMColor(color).widen() << 8) + Sk16h(128);
    Sk16b invA_16x(invA);

    Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
        return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
    });
}