1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkUtils.h"
13
14 #include <emmintrin.h>
15
16 /* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src,
21 int count, U8CPU alpha) {
22 SkASSERT(alpha <= 255);
23 if (count <= 0) {
24 return;
25 }
26
27 uint32_t src_scale = SkAlpha255To256(alpha);
28 uint32_t dst_scale = 256 - src_scale;
29
30 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++;
35 dst++;
36 count--;
37 }
38
39 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43
44 // Move scale factors to upper byte of word
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47 while (count >= 4) {
48 // Load 4 pixels each of src and dest.
49 __m128i src_pixel = _mm_loadu_si128(s);
50 __m128i dst_pixel = _mm_load_si128(d);
51
52 // Interleave Atom port 0/1 operations based on the execution port
53 // constraints that multiply can only be executed on port 0 (while
54 // boolean operations can be executed on either port 0 or port 1)
55 // because GCC currently doesn't do a good job scheduling
56 // instructions based on these constraints.
57
58 // Get red and blue pixels into lower byte of each word.
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61
62 // Multiply by scale.
63 // (4 x (0, rs.h, 0, bs.h))
64 // where rs.h stands for the higher byte of r * scale, and
65 // bs.h the higher byte of b * scale.
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68 // Get alpha and green pixels into higher byte of each word.
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71
72 // Multiply by scale.
73 // (4 x (as.h, as.l, gs.h, gs.l))
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75
76 // Clear the lower byte of the a*scale and g*scale results
77 // (4 x (as.h, 0, gs.h, 0))
78 src_ag = _mm_and_si128(src_ag, ag_mask);
79
80 // Operations the destination pixels are the same as on the
81 // source pixels. See the comments above.
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86 dst_ag = _mm_and_si128(dst_ag, ag_mask);
87
88 // Combine back into RGBA.
89 // (4 x (as.h, rs.h, gs.h, bs.h))
90 src_pixel = _mm_or_si128(src_rb, src_ag);
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92
93 // Add result
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95 _mm_store_si128(d, result);
96 s++;
97 d++;
98 count -= 4;
99 }
100 src = reinterpret_cast<const SkPMColor*>(s);
101 dst = reinterpret_cast<SkPMColor*>(d);
102 }
103
104 while (count > 0) {
105 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106 src++;
107 dst++;
108 count--;
109 }
110 }
111
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113 const SkPMColor* SK_RESTRICT src,
114 int count, U8CPU alpha) {
115 SkASSERT(alpha == 255);
116 if (count <= 0) {
117 return;
118 }
119
120 if (count >= 4) {
121 SkASSERT(((size_t)dst & 0x03) == 0);
122 while (((size_t)dst & 0x0F) != 0) {
123 *dst = SkPMSrcOver(*src, *dst);
124 src++;
125 dst++;
126 count--;
127 }
128
129 const __m128i *s = reinterpret_cast<const __m128i*>(src);
130 __m128i *d = reinterpret_cast<__m128i*>(dst);
131 #ifdef SK_USE_ACCURATE_BLENDING
132 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
134 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
135 while (count >= 4) {
136 // Load 4 pixels
137 __m128i src_pixel = _mm_loadu_si128(s);
138 __m128i dst_pixel = _mm_load_si128(d);
139
140 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
142 // Shift alphas down to lower 8 bits of each quad.
143 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
144
145 // Copy alpha to upper 3rd byte of each quad
146 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
147
148 // Subtract alphas from 255, to get 0..255
149 alpha = _mm_sub_epi16(c_255, alpha);
150
151 // Multiply by red and blue by src alpha.
152 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153 // Multiply by alpha and green by src alpha.
154 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
155
156 // dst_rb_low = (dst_rb >> 8)
157 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
159
160 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162 dst_rb = _mm_add_epi16(dst_rb, c_128);
163 dst_rb = _mm_srli_epi16(dst_rb, 8);
164
165 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167 dst_ag = _mm_add_epi16(dst_ag, c_128);
168 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
169
170 // Combine back into RGBA.
171 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
172
173 // Add result
174 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175 _mm_store_si128(d, result);
176 s++;
177 d++;
178 count -= 4;
179 }
180 #else
181 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
183 while (count >= 4) {
184 // Load 4 pixels
185 __m128i src_pixel = _mm_loadu_si128(s);
186 __m128i dst_pixel = _mm_load_si128(d);
187
188 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
190
191 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
192 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193
194 // (a0, a0, a1, a1, a2, g2, a3, g3)
195 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196
197 // (a0, a0, a1, a1, a2, a2, a3, a3)
198 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
199
200 // Subtract alphas from 256, to get 1..256
201 alpha = _mm_sub_epi16(c_256, alpha);
202
203 // Multiply by red and blue by src alpha.
204 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205 // Multiply by alpha and green by src alpha.
206 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
207
208 // Divide by 256.
209 dst_rb = _mm_srli_epi16(dst_rb, 8);
210
211 // Mask out high bits (already in the right place)
212 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
213
214 // Combine back into RGBA.
215 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
216
217 // Add result
218 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219 _mm_store_si128(d, result);
220 s++;
221 d++;
222 count -= 4;
223 }
224 #endif
225 src = reinterpret_cast<const SkPMColor*>(s);
226 dst = reinterpret_cast<SkPMColor*>(d);
227 }
228
229 while (count > 0) {
230 *dst = SkPMSrcOver(*src, *dst);
231 src++;
232 dst++;
233 count--;
234 }
235 }
236
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238 const SkPMColor* SK_RESTRICT src,
239 int count, U8CPU alpha) {
240 SkASSERT(alpha <= 255);
241 if (count <= 0) {
242 return;
243 }
244
245 if (count >= 4) {
246 while (((size_t)dst & 0x0F) != 0) {
247 *dst = SkBlendARGB32(*src, *dst, alpha);
248 src++;
249 dst++;
250 count--;
251 }
252
253 uint32_t src_scale = SkAlpha255To256(alpha);
254
255 const __m128i *s = reinterpret_cast<const __m128i*>(src);
256 __m128i *d = reinterpret_cast<__m128i*>(dst);
257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
260 while (count >= 4) {
261 // Load 4 pixels each of src and dest.
262 __m128i src_pixel = _mm_loadu_si128(s);
263 __m128i dst_pixel = _mm_load_si128(d);
264
265 // Get red and blue pixels into lower byte of each word.
266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268
269 // Get alpha and green into lower byte of each word.
270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272
273 // Put per-pixel alpha in low byte of each word.
274 // After the following two statements, the dst_alpha looks like
275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278
279 // dst_alpha = dst_alpha * src_scale
280 // Because src_scales are in the higher byte of each word and
281 // we use mulhi here, the resulting alpha values are already
282 // in the right place and don't need to be divided by 256.
283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285
286 // Subtract alphas from 256, to get 1..256
287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288
289 // Multiply red and blue by dst pixel alpha.
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291 // Multiply alpha and green by dst pixel alpha.
292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293
294 // Multiply red and blue by global alpha.
295 // (4 x (0, rs.h, 0, bs.h))
296 // where rs.h stands for the higher byte of r * src_scale,
297 // and bs.h the higher byte of b * src_scale.
298 // Again, because we use mulhi, the resuling red and blue
299 // values are already in the right place and don't need to
300 // be divided by 256.
301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302 // Multiply alpha and green by global alpha.
303 // (4 x (0, as.h, 0, gs.h))
304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305
306 // Divide by 256.
307 dst_rb = _mm_srli_epi16(dst_rb, 8);
308
309 // Mask out low bits (goodies already in the right place; no need to divide)
310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311 // Shift alpha and green to higher byte of each word.
312 // (4 x (as.h, 0, gs.h, 0))
313 src_ag = _mm_slli_epi16(src_ag, 8);
314
315 // Combine back into RGBA.
316 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317 src_pixel = _mm_or_si128(src_rb, src_ag);
318
319 // Add two pixels into result.
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321 _mm_store_si128(d, result);
322 s++;
323 d++;
324 count -= 4;
325 }
326 src = reinterpret_cast<const SkPMColor*>(s);
327 dst = reinterpret_cast<SkPMColor*>(d);
328 }
329
330 while (count > 0) {
331 *dst = SkBlendARGB32(*src, *dst, alpha);
332 src++;
333 dst++;
334 count--;
335 }
336 }
337
338 /* SSE2 version of Color32()
339 * portable version is in core/SkBlitRow_D32.cpp
340 */
Color32_SSE2(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342 SkPMColor color) {
343
344 if (count <= 0) {
345 return;
346 }
347
348 if (0 == color) {
349 if (src != dst) {
350 memcpy(dst, src, count * sizeof(SkPMColor));
351 }
352 return;
353 }
354
355 unsigned colorA = SkGetPackedA32(color);
356 if (255 == colorA) {
357 sk_memset32(dst, color, count);
358 } else {
359 unsigned scale = 256 - SkAlpha255To256(colorA);
360
361 if (count >= 4) {
362 SkASSERT(((size_t)dst & 0x03) == 0);
363 while (((size_t)dst & 0x0F) != 0) {
364 *dst = color + SkAlphaMulQ(*src, scale);
365 src++;
366 dst++;
367 count--;
368 }
369
370 const __m128i *s = reinterpret_cast<const __m128i*>(src);
371 __m128i *d = reinterpret_cast<__m128i*>(dst);
372 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
373 __m128i src_scale_wide = _mm_set1_epi16(scale);
374 __m128i color_wide = _mm_set1_epi32(color);
375 while (count >= 4) {
376 // Load 4 pixels each of src and dest.
377 __m128i src_pixel = _mm_loadu_si128(s);
378
379 // Get red and blue pixels into lower byte of each word.
380 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
381
382 // Get alpha and green into lower byte of each word.
383 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
384
385 // Multiply by scale.
386 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
387 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
388
389 // Divide by 256.
390 src_rb = _mm_srli_epi16(src_rb, 8);
391 src_ag = _mm_andnot_si128(rb_mask, src_ag);
392
393 // Combine back into RGBA.
394 src_pixel = _mm_or_si128(src_rb, src_ag);
395
396 // Add color to result.
397 __m128i result = _mm_add_epi8(color_wide, src_pixel);
398
399 // Store result.
400 _mm_store_si128(d, result);
401 s++;
402 d++;
403 count -= 4;
404 }
405 src = reinterpret_cast<const SkPMColor*>(s);
406 dst = reinterpret_cast<SkPMColor*>(d);
407 }
408
409 while (count > 0) {
410 *dst = color + SkAlphaMulQ(*src, scale);
411 src += 1;
412 dst += 1;
413 count--;
414 }
415 }
416 }
417
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)418 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
419 size_t maskRB, SkColor origColor,
420 int width, int height) {
421 SkPMColor color = SkPreMultiplyColor(origColor);
422 size_t dstOffset = dstRB - (width << 2);
423 size_t maskOffset = maskRB - width;
424 SkPMColor* dst = (SkPMColor *)device;
425 const uint8_t* mask = (const uint8_t*)maskPtr;
426 do {
427 int count = width;
428 if (count >= 4) {
429 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
430 *dst = SkBlendARGB32(color, *dst, *mask);
431 mask++;
432 dst++;
433 count--;
434 }
435 __m128i *d = reinterpret_cast<__m128i*>(dst);
436 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
437 __m128i c_256 = _mm_set1_epi16(256);
438 __m128i c_1 = _mm_set1_epi16(1);
439 __m128i src_pixel = _mm_set1_epi32(color);
440 while (count >= 4) {
441 // Load 4 pixels each of src and dest.
442 __m128i dst_pixel = _mm_load_si128(d);
443
444 //set the aphla value
445 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
446 0, *(mask+3),0, \
447 *(mask+2),0, *(mask+2),\
448 0,*(mask+1), 0,*(mask+1),\
449 0, *mask,0,*mask);
450
451 //call SkAlpha255To256()
452 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
453
454 // Get red and blue pixels into lower byte of each word.
455 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
456 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
457
458 // Get alpha and green into lower byte of each word.
459 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
460 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
461
462 // Put per-pixel alpha in low byte of each word.
463 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
464 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
465
466 // dst_alpha = dst_alpha * src_scale
467 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
468
469 // Divide by 256.
470 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
471
472 // Subtract alphas from 256, to get 1..256
473 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
474 // Multiply red and blue by dst pixel alpha.
475 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
476 // Multiply alpha and green by dst pixel alpha.
477 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
478
479 // Multiply red and blue by global alpha.
480 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
481 // Multiply alpha and green by global alpha.
482 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
483 // Divide by 256.
484 dst_rb = _mm_srli_epi16(dst_rb, 8);
485 src_rb = _mm_srli_epi16(src_rb, 8);
486
487 // Mask out low bits (goodies already in the right place; no need to divide)
488 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
489 src_ag = _mm_andnot_si128(rb_mask, src_ag);
490
491 // Combine back into RGBA.
492 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
493 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
494
495 // Add two pixels into result.
496 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
497 _mm_store_si128(d, result);
498 // load the next 4 pixel
499 mask = mask + 4;
500 d++;
501 count -= 4;
502 }
503 dst = reinterpret_cast<SkPMColor *>(d);
504 }
505 while(count > 0) {
506 *dst= SkBlendARGB32(color, *dst, *mask);
507 dst += 1;
508 mask++;
509 count --;
510 }
511 dst = (SkPMColor *)((char*)dst + dstOffset);
512 mask += maskOffset;
513 } while (--height != 0);
514 }
515
// Shift amounts that move the top 5 bits of each component of a 16-bit mask
// pixel onto the position of the matching component in an SkPMColor.
// A positive amount is a left shift and a negative amount is a right shift.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked{R,G,B}16x5ToUnmasked{R,G,B}32x5_SSE2(x) applies the shift above to
// every 32-bit lane of x. The result is "unmasked": bits belonging to the
// other components are shifted along with it and must be cleared by the
// caller. The direction is chosen by the preprocessor because the SSE2 shift
// intrinsics are given a non-negative compile-time count here.

// Red.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546
SkBlendLCD16_SSE2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
548 __m128i &mask, __m128i &srcA) {
549 // In the following comments, the components of src, dst and mask are
550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
551 // by an R, G, B, or A suffix. Components of one of the four pixels that
552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
553 // example is the blue channel of the second destination pixel. Memory
554 // layout is shown for an ARGB byte order in a color value.
555
556 // src and srcA store 8-bit values interleaved with zeros.
557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
560 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
564
565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
568 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
569
570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
572 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
573
574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
576 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
577
578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
580 // 8-bit position
581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
583 mask = _mm_or_si128(_mm_or_si128(r, g), b);
584
585 // Interleave R,G,B into the lower byte of word.
586 // i.e. split the sixteen 8-bit values from mask into two sets of eight
587 // 16-bit values, padded by zero.
588 __m128i maskLo, maskHi;
589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
593
594 // Upscale from 0..31 to 0..32
595 // (allows to replace division by left-shift further down)
596 // Left-shift each component by 4 and add the result back to that component,
597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
600
601 // Multiply each component of maskLo and maskHi by srcA
602 maskLo = _mm_mullo_epi16(maskLo, srcA);
603 maskHi = _mm_mullo_epi16(maskHi, srcA);
604
605 // Left shift mask components by 8 (divide by 256)
606 maskLo = _mm_srli_epi16(maskLo, 8);
607 maskHi = _mm_srli_epi16(maskHi, 8);
608
609 // Interleave R,G,B into the lower byte of the word
610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
612 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
614
615 // mask = (src - dst) * mask
616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
618
619 // mask = (src - dst) * mask >> 5
620 maskLo = _mm_srai_epi16(maskLo, 5);
621 maskHi = _mm_srai_epi16(maskHi, 5);
622
623 // Add two pixels into result.
624 // result = dst + ((src - dst) * mask >> 5)
625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
627
628 // Pack into 4 32bit dst pixels.
629 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
630 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
631 // clamping to 255 if necessary.
632 return _mm_packus_epi16(resultLo, resultHi);
633 }
634
SkBlendLCD16Opaque_SSE2(__m128i & src,__m128i & dst,__m128i & mask)635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
636 __m128i &mask) {
637 // In the following comments, the components of src, dst and mask are
638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
639 // by an R, G, B, or A suffix. Components of one of the four pixels that
640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
641 // example is the blue channel of the second destination pixel. Memory
642 // layout is shown for an ARGB byte order in a color value.
643
644 // src and srcA store 8-bit values interleaved with zeros.
645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
646 // mask stores 16-bit values (shown as high and low bytes) interleaved with
647 // zeros
648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
650
651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
654 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
655
656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
658 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
659
660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
662 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
663
664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
666 // 8-bit position
667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
669 mask = _mm_or_si128(_mm_or_si128(r, g), b);
670
671 // Interleave R,G,B into the lower byte of word.
672 // i.e. split the sixteen 8-bit values from mask into two sets of eight
673 // 16-bit values, padded by zero.
674 __m128i maskLo, maskHi;
675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
679
680 // Upscale from 0..31 to 0..32
681 // (allows to replace division by left-shift further down)
682 // Left-shift each component by 4 and add the result back to that component,
683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
686
687 // Interleave R,G,B into the lower byte of the word
688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
690 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
692
693 // mask = (src - dst) * mask
694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
696
697 // mask = (src - dst) * mask >> 5
698 maskLo = _mm_srai_epi16(maskLo, 5);
699 maskHi = _mm_srai_epi16(maskHi, 5);
700
701 // Add two pixels into result.
702 // result = dst + ((src - dst) * mask >> 5)
703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
705
706 // Pack into 4 32bit dst pixels and force opaque.
707 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
708 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
709 // clamping to 255 if necessary. Set alpha components to 0xFF.
710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
712 }
713
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
715 SkColor src, int width, SkPMColor) {
716 if (width <= 0) {
717 return;
718 }
719
720 int srcA = SkColorGetA(src);
721 int srcR = SkColorGetR(src);
722 int srcG = SkColorGetG(src);
723 int srcB = SkColorGetB(src);
724
725 srcA = SkAlpha255To256(srcA);
726
727 if (width >= 4) {
728 SkASSERT(((size_t)dst & 0x03) == 0);
729 while (((size_t)dst & 0x0F) != 0) {
730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
731 mask++;
732 dst++;
733 width--;
734 }
735
736 __m128i *d = reinterpret_cast<__m128i*>(dst);
737 // Set alpha to 0xFF and replicate source four times in SSE register.
738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
739 // Interleave with zeros to get two sets of four 16-bit values.
740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
741 // Set srcA_sse to contain eight copies of srcA, padded with zero.
742 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
743 __m128i srcA_sse = _mm_set1_epi16(srcA);
744 while (width >= 4) {
745 // Load four destination pixels into dst_sse.
746 __m128i dst_sse = _mm_load_si128(d);
747 // Load four 16-bit masks into lower half of mask_sse.
748 __m128i mask_sse = _mm_loadl_epi64(
749 reinterpret_cast<const __m128i*>(mask));
750
751 // Check whether masks are equal to 0 and get the highest bit
752 // of each byte of result, if masks are all zero, we will get
753 // pack_cmp to 0xFFFF
754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
755 _mm_setzero_si128()));
756
757 // if mask pixels are not all zero, we will blend the dst pixels
758 if (pack_cmp != 0xFFFF) {
759 // Unpack 4 16bit mask pixels to
760 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
761 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
762 mask_sse = _mm_unpacklo_epi16(mask_sse,
763 _mm_setzero_si128());
764
765 // Process 4 32bit dst pixels
766 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
767 mask_sse, srcA_sse);
768 _mm_store_si128(d, result);
769 }
770
771 d++;
772 mask += 4;
773 width -= 4;
774 }
775
776 dst = reinterpret_cast<SkPMColor*>(d);
777 }
778
779 while (width > 0) {
780 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
781 mask++;
782 dst++;
783 width--;
784 }
785 }
786
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)787 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
788 SkColor src, int width, SkPMColor opaqueDst) {
789 if (width <= 0) {
790 return;
791 }
792
793 int srcR = SkColorGetR(src);
794 int srcG = SkColorGetG(src);
795 int srcB = SkColorGetB(src);
796
797 if (width >= 4) {
798 SkASSERT(((size_t)dst & 0x03) == 0);
799 while (((size_t)dst & 0x0F) != 0) {
800 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
801 mask++;
802 dst++;
803 width--;
804 }
805
806 __m128i *d = reinterpret_cast<__m128i*>(dst);
807 // Set alpha to 0xFF and replicate source four times in SSE register.
808 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
809 // Set srcA_sse to contain eight copies of srcA, padded with zero.
810 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
811 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
812 while (width >= 4) {
813 // Load four destination pixels into dst_sse.
814 __m128i dst_sse = _mm_load_si128(d);
815 // Load four 16-bit masks into lower half of mask_sse.
816 __m128i mask_sse = _mm_loadl_epi64(
817 reinterpret_cast<const __m128i*>(mask));
818
819 // Check whether masks are equal to 0 and get the highest bit
820 // of each byte of result, if masks are all zero, we will get
821 // pack_cmp to 0xFFFF
822 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
823 _mm_setzero_si128()));
824
825 // if mask pixels are not all zero, we will blend the dst pixels
826 if (pack_cmp != 0xFFFF) {
827 // Unpack 4 16bit mask pixels to
828 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
829 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
830 mask_sse = _mm_unpacklo_epi16(mask_sse,
831 _mm_setzero_si128());
832
833 // Process 4 32bit dst pixels
834 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
835 mask_sse);
836 _mm_store_si128(d, result);
837 }
838
839 d++;
840 mask += 4;
841 width -= 4;
842 }
843
844 dst = reinterpret_cast<SkPMColor*>(d);
845 }
846
847 while (width > 0) {
848 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
849 mask++;
850 dst++;
851 width--;
852 }
853 }
854