1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkUtils.h"
13
14 #include <emmintrin.h>
15
16 /* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src,
21 int count, U8CPU alpha) {
22 SkASSERT(alpha <= 255);
23 if (count <= 0) {
24 return;
25 }
26
27 uint32_t src_scale = SkAlpha255To256(alpha);
28 uint32_t dst_scale = 256 - src_scale;
29
30 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++;
35 dst++;
36 count--;
37 }
38
39 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43
44 // Move scale factors to upper byte of word
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47 while (count >= 4) {
48 // Load 4 pixels each of src and dest.
49 __m128i src_pixel = _mm_loadu_si128(s);
50 __m128i dst_pixel = _mm_load_si128(d);
51
52 // Interleave Atom port 0/1 operations based on the execution port
53 // constraints that multiply can only be executed on port 0 (while
54 // boolean operations can be executed on either port 0 or port 1)
55 // because GCC currently doesn't do a good job scheduling
56 // instructions based on these constraints.
57
58 // Get red and blue pixels into lower byte of each word.
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61
62 // Multiply by scale.
63 // (4 x (0, rs.h, 0, bs.h))
64 // where rs.h stands for the higher byte of r * scale, and
65 // bs.h the higher byte of b * scale.
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68 // Get alpha and green pixels into higher byte of each word.
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71
72 // Multiply by scale.
73 // (4 x (as.h, as.l, gs.h, gs.l))
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75
76 // Clear the lower byte of the a*scale and g*scale results
77 // (4 x (as.h, 0, gs.h, 0))
78 src_ag = _mm_and_si128(src_ag, ag_mask);
79
80 // Operations the destination pixels are the same as on the
81 // source pixels. See the comments above.
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86 dst_ag = _mm_and_si128(dst_ag, ag_mask);
87
88 // Combine back into RGBA.
89 // (4 x (as.h, rs.h, gs.h, bs.h))
90 src_pixel = _mm_or_si128(src_rb, src_ag);
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92
93 // Add result
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95 _mm_store_si128(d, result);
96 s++;
97 d++;
98 count -= 4;
99 }
100 src = reinterpret_cast<const SkPMColor*>(s);
101 dst = reinterpret_cast<SkPMColor*>(d);
102 }
103
104 while (count > 0) {
105 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106 src++;
107 dst++;
108 count--;
109 }
110 }
111
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113 const SkPMColor* SK_RESTRICT src,
114 int count, U8CPU alpha) {
115 SkASSERT(alpha == 255);
116 if (count <= 0) {
117 return;
118 }
119
120 if (count >= 4) {
121 SkASSERT(((size_t)dst & 0x03) == 0);
122 while (((size_t)dst & 0x0F) != 0) {
123 *dst = SkPMSrcOver(*src, *dst);
124 src++;
125 dst++;
126 count--;
127 }
128
129 const __m128i *s = reinterpret_cast<const __m128i*>(src);
130 __m128i *d = reinterpret_cast<__m128i*>(dst);
131 #ifdef SK_USE_ACCURATE_BLENDING
132 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
134 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
135 while (count >= 4) {
136 // Load 4 pixels
137 __m128i src_pixel = _mm_loadu_si128(s);
138 __m128i dst_pixel = _mm_load_si128(d);
139
140 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
142 // Shift alphas down to lower 8 bits of each quad.
143 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
144
145 // Copy alpha to upper 3rd byte of each quad
146 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
147
148 // Subtract alphas from 255, to get 0..255
149 alpha = _mm_sub_epi16(c_255, alpha);
150
151 // Multiply by red and blue by src alpha.
152 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153 // Multiply by alpha and green by src alpha.
154 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
155
156 // dst_rb_low = (dst_rb >> 8)
157 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
159
160 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162 dst_rb = _mm_add_epi16(dst_rb, c_128);
163 dst_rb = _mm_srli_epi16(dst_rb, 8);
164
165 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167 dst_ag = _mm_add_epi16(dst_ag, c_128);
168 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
169
170 // Combine back into RGBA.
171 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
172
173 // Add result
174 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175 _mm_store_si128(d, result);
176 s++;
177 d++;
178 count -= 4;
179 }
180 #else
181 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
183 while (count >= 4) {
184 // Load 4 pixels
185 __m128i src_pixel = _mm_loadu_si128(s);
186 __m128i dst_pixel = _mm_load_si128(d);
187
188 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
190
191 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
192 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193
194 // (a0, a0, a1, a1, a2, g2, a3, g3)
195 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196
197 // (a0, a0, a1, a1, a2, a2, a3, a3)
198 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
199
200 // Subtract alphas from 256, to get 1..256
201 alpha = _mm_sub_epi16(c_256, alpha);
202
203 // Multiply by red and blue by src alpha.
204 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205 // Multiply by alpha and green by src alpha.
206 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
207
208 // Divide by 256.
209 dst_rb = _mm_srli_epi16(dst_rb, 8);
210
211 // Mask out high bits (already in the right place)
212 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
213
214 // Combine back into RGBA.
215 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
216
217 // Add result
218 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219 _mm_store_si128(d, result);
220 s++;
221 d++;
222 count -= 4;
223 }
224 #endif
225 src = reinterpret_cast<const SkPMColor*>(s);
226 dst = reinterpret_cast<SkPMColor*>(d);
227 }
228
229 while (count > 0) {
230 *dst = SkPMSrcOver(*src, *dst);
231 src++;
232 dst++;
233 count--;
234 }
235 }
236
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238 const SkPMColor* SK_RESTRICT src,
239 int count, U8CPU alpha) {
240 SkASSERT(alpha <= 255);
241 if (count <= 0) {
242 return;
243 }
244
245 if (count >= 4) {
246 while (((size_t)dst & 0x0F) != 0) {
247 *dst = SkBlendARGB32(*src, *dst, alpha);
248 src++;
249 dst++;
250 count--;
251 }
252
253 uint32_t src_scale = SkAlpha255To256(alpha);
254
255 const __m128i *s = reinterpret_cast<const __m128i*>(src);
256 __m128i *d = reinterpret_cast<__m128i*>(dst);
257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
260 while (count >= 4) {
261 // Load 4 pixels each of src and dest.
262 __m128i src_pixel = _mm_loadu_si128(s);
263 __m128i dst_pixel = _mm_load_si128(d);
264
265 // Get red and blue pixels into lower byte of each word.
266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268
269 // Get alpha and green into lower byte of each word.
270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272
273 // Put per-pixel alpha in low byte of each word.
274 // After the following two statements, the dst_alpha looks like
275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278
279 // dst_alpha = dst_alpha * src_scale
280 // Because src_scales are in the higher byte of each word and
281 // we use mulhi here, the resulting alpha values are already
282 // in the right place and don't need to be divided by 256.
283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285
286 // Subtract alphas from 256, to get 1..256
287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288
289 // Multiply red and blue by dst pixel alpha.
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291 // Multiply alpha and green by dst pixel alpha.
292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293
294 // Multiply red and blue by global alpha.
295 // (4 x (0, rs.h, 0, bs.h))
296 // where rs.h stands for the higher byte of r * src_scale,
297 // and bs.h the higher byte of b * src_scale.
298 // Again, because we use mulhi, the resuling red and blue
299 // values are already in the right place and don't need to
300 // be divided by 256.
301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302 // Multiply alpha and green by global alpha.
303 // (4 x (0, as.h, 0, gs.h))
304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305
306 // Divide by 256.
307 dst_rb = _mm_srli_epi16(dst_rb, 8);
308
309 // Mask out low bits (goodies already in the right place; no need to divide)
310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311 // Shift alpha and green to higher byte of each word.
312 // (4 x (as.h, 0, gs.h, 0))
313 src_ag = _mm_slli_epi16(src_ag, 8);
314
315 // Combine back into RGBA.
316 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317 src_pixel = _mm_or_si128(src_rb, src_ag);
318
319 // Add two pixels into result.
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321 _mm_store_si128(d, result);
322 s++;
323 d++;
324 count -= 4;
325 }
326 src = reinterpret_cast<const SkPMColor*>(s);
327 dst = reinterpret_cast<SkPMColor*>(d);
328 }
329
330 while (count > 0) {
331 *dst = SkBlendARGB32(*src, *dst, alpha);
332 src++;
333 dst++;
334 count--;
335 }
336 }
337
338 /* SSE2 version of Color32()
339 * portable version is in core/SkBlitRow_D32.cpp
340 */
Color32_SSE2(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342 SkPMColor color) {
343
344 if (count <= 0) {
345 return;
346 }
347
348 if (0 == color) {
349 if (src != dst) {
350 memcpy(dst, src, count * sizeof(SkPMColor));
351 }
352 return;
353 }
354
355 unsigned colorA = SkGetPackedA32(color);
356 if (255 == colorA) {
357 sk_memset32(dst, color, count);
358 } else {
359 unsigned scale = 256 - SkAlpha255To256(colorA);
360
361 if (count >= 4) {
362 SkASSERT(((size_t)dst & 0x03) == 0);
363 while (((size_t)dst & 0x0F) != 0) {
364 *dst = color + SkAlphaMulQ(*src, scale);
365 src++;
366 dst++;
367 count--;
368 }
369
370 const __m128i *s = reinterpret_cast<const __m128i*>(src);
371 __m128i *d = reinterpret_cast<__m128i*>(dst);
372 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
373 __m128i src_scale_wide = _mm_set1_epi16(scale);
374 __m128i color_wide = _mm_set1_epi32(color);
375 while (count >= 4) {
376 // Load 4 pixels each of src and dest.
377 __m128i src_pixel = _mm_loadu_si128(s);
378
379 // Get red and blue pixels into lower byte of each word.
380 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
381
382 // Get alpha and green into lower byte of each word.
383 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
384
385 // Multiply by scale.
386 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
387 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
388
389 // Divide by 256.
390 src_rb = _mm_srli_epi16(src_rb, 8);
391 src_ag = _mm_andnot_si128(rb_mask, src_ag);
392
393 // Combine back into RGBA.
394 src_pixel = _mm_or_si128(src_rb, src_ag);
395
396 // Add color to result.
397 __m128i result = _mm_add_epi8(color_wide, src_pixel);
398
399 // Store result.
400 _mm_store_si128(d, result);
401 s++;
402 d++;
403 count -= 4;
404 }
405 src = reinterpret_cast<const SkPMColor*>(s);
406 dst = reinterpret_cast<SkPMColor*>(d);
407 }
408
409 while (count > 0) {
410 *dst = color + SkAlphaMulQ(*src, scale);
411 src += 1;
412 dst += 1;
413 count--;
414 }
415 }
416 }
417
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)418 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
419 size_t maskRB, SkColor origColor,
420 int width, int height) {
421 SkPMColor color = SkPreMultiplyColor(origColor);
422 size_t dstOffset = dstRB - (width << 2);
423 size_t maskOffset = maskRB - width;
424 SkPMColor* dst = (SkPMColor *)device;
425 const uint8_t* mask = (const uint8_t*)maskPtr;
426 do {
427 int count = width;
428 if (count >= 4) {
429 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
430 *dst = SkBlendARGB32(color, *dst, *mask);
431 mask++;
432 dst++;
433 count--;
434 }
435 __m128i *d = reinterpret_cast<__m128i*>(dst);
436 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
437 __m128i c_256 = _mm_set1_epi16(256);
438 __m128i c_1 = _mm_set1_epi16(1);
439 __m128i src_pixel = _mm_set1_epi32(color);
440 while (count >= 4) {
441 // Load 4 pixels each of src and dest.
442 __m128i dst_pixel = _mm_load_si128(d);
443
444 //set the aphla value
445 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
446 0, *(mask+3),0, \
447 *(mask+2),0, *(mask+2),\
448 0,*(mask+1), 0,*(mask+1),\
449 0, *mask,0,*mask);
450
451 //call SkAlpha255To256()
452 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
453
454 // Get red and blue pixels into lower byte of each word.
455 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
456 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
457
458 // Get alpha and green into lower byte of each word.
459 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
460 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
461
462 // Put per-pixel alpha in low byte of each word.
463 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
464 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
465
466 // dst_alpha = dst_alpha * src_scale
467 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
468
469 // Divide by 256.
470 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
471
472 // Subtract alphas from 256, to get 1..256
473 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
474 // Multiply red and blue by dst pixel alpha.
475 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
476 // Multiply alpha and green by dst pixel alpha.
477 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
478
479 // Multiply red and blue by global alpha.
480 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
481 // Multiply alpha and green by global alpha.
482 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
483 // Divide by 256.
484 dst_rb = _mm_srli_epi16(dst_rb, 8);
485 src_rb = _mm_srli_epi16(src_rb, 8);
486
487 // Mask out low bits (goodies already in the right place; no need to divide)
488 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
489 src_ag = _mm_andnot_si128(rb_mask, src_ag);
490
491 // Combine back into RGBA.
492 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
493 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
494
495 // Add two pixels into result.
496 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
497 _mm_store_si128(d, result);
498 // load the next 4 pixel
499 mask = mask + 4;
500 d++;
501 count -= 4;
502 }
503 dst = reinterpret_cast<SkPMColor *>(d);
504 }
505 while(count > 0) {
506 *dst= SkBlendARGB32(color, *dst, *mask);
507 dst += 1;
508 mask++;
509 count --;
510 }
511 dst = (SkPMColor *)((char*)dst + dstOffset);
512 mask += maskOffset;
513 } while (--height != 0);
514 }
515
// Left-shift amounts that line the top 5 bits of each RGB16 mask component up
// with the corresponding component of an SkPMColor. A negative amount means
// the component has to move right instead.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each helper below applies its shift to all four 32-bit lanes of x: left for
// a positive amount, right for a negative one, and not at all for zero.

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546
SkBlendLCD16_SSE2(__m128i & srci,__m128i & dst,__m128i & mask,__m128i & scale)547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
548 __m128i &mask, __m128i &scale) {
549 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
550 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
551 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
552
553 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
554 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
555
556 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
557 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
558
559 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
560 mask = _mm_or_si128(_mm_or_si128(r, g), b);
561
562 // Interleave R,G,B into the lower byte of word.
563 __m128i maskLo, maskHi;
564 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
565 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
566
567 // Upscale to 0..32
568 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
569 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
570
571 maskLo = _mm_mullo_epi16(maskLo, scale);
572 maskHi = _mm_mullo_epi16(maskHi, scale);
573
574 maskLo = _mm_srli_epi16(maskLo, 8);
575 maskHi = _mm_srli_epi16(maskHi, 8);
576
577 // Interleave R,G,B into the lower byte of the word.
578 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
579 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
580
581 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
582 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
583
584 maskLo = _mm_srai_epi16(maskLo, 5);
585 maskHi = _mm_srai_epi16(maskHi, 5);
586
587 // Add two pixels into result.
588 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
589 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
590
591 // Pack into 4 32bit dst pixels
592 return _mm_packus_epi16(resultLo, resultHi);
593 }
594
SkBlendLCD16Opaque_SSE2(__m128i & srci,__m128i & dst,__m128i & mask)595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
596 __m128i &mask) {
597 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
598 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
599 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
600
601 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
602 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
603
604 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
605 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
606
607 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
608 mask = _mm_or_si128(_mm_or_si128(r, g), b);
609
610 // Interleave R,G,B into the lower byte of word.
611 __m128i maskLo, maskHi;
612 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
613 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
614
615 // Upscale to 0..32
616 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
617 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
618
619 // Interleave R,G,B into the lower byte of the word.
620 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
621 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
622
623 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
624 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
625
626 maskLo = _mm_srai_epi16(maskLo, 5);
627 maskHi = _mm_srai_epi16(maskHi, 5);
628
629 // Add two pixels into result.
630 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
631 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
632
633 // Pack into 4 32bit dst pixels and force opaque.
634 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
635 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
636 }
637
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
639 SkColor color, int width, SkPMColor) {
640 if (width <= 0) {
641 return;
642 }
643
644 int srcA = SkColorGetA(color);
645 int srcR = SkColorGetR(color);
646 int srcG = SkColorGetG(color);
647 int srcB = SkColorGetB(color);
648
649 srcA = SkAlpha255To256(srcA);
650
651 if (width >= 4) {
652 SkASSERT(((size_t)dst & 0x03) == 0);
653 while (((size_t)dst & 0x0F) != 0) {
654 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
655 src++;
656 dst++;
657 width--;
658 }
659
660 __m128i *d = reinterpret_cast<__m128i*>(dst);
661 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
662 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
663 __m128i scale = _mm_set1_epi16(srcA);
664 while (width >= 4) {
665 __m128i dst_pixel = _mm_load_si128(d);
666 __m128i mask_pixel = _mm_loadl_epi64(
667 reinterpret_cast<const __m128i*>(src));
668
669 // Check whether mask_pixels are equal to 0 and get the highest bit
670 // of each byte of result, if mask pixes are all zero, we will get
671 // pack_cmp to 0xFFFF
672 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
673 _mm_setzero_si128()));
674
675 // if mask pixels are not all zero, we will blend the dst pixels
676 if (pack_cmp != 0xFFFF) {
677 // Unpack 4 16bit mask pixels to
678 // (p0, 0, p1, 0, p2, 0, p3, 0)
679 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
680 _mm_setzero_si128());
681
682 // Process 4 32bit dst pixels
683 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
684 mask_pixel, scale);
685 _mm_store_si128(d, result);
686 }
687
688 d++;
689 src += 4;
690 width -= 4;
691 }
692
693 dst = reinterpret_cast<SkPMColor*>(d);
694 }
695
696 while (width > 0) {
697 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
698 src++;
699 dst++;
700 width--;
701 }
702 }
703
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
705 SkColor color, int width, SkPMColor opaqueDst) {
706 if (width <= 0) {
707 return;
708 }
709
710 int srcR = SkColorGetR(color);
711 int srcG = SkColorGetG(color);
712 int srcB = SkColorGetB(color);
713
714 if (width >= 4) {
715 SkASSERT(((size_t)dst & 0x03) == 0);
716 while (((size_t)dst & 0x0F) != 0) {
717 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
718 src++;
719 dst++;
720 width--;
721 }
722
723 __m128i *d = reinterpret_cast<__m128i*>(dst);
724 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
725 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
726 while (width >= 4) {
727 __m128i dst_pixel = _mm_load_si128(d);
728 __m128i mask_pixel = _mm_loadl_epi64(
729 reinterpret_cast<const __m128i*>(src));
730
731 // Check whether mask_pixels are equal to 0 and get the highest bit
732 // of each byte of result, if mask pixes are all zero, we will get
733 // pack_cmp to 0xFFFF
734 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
735 _mm_setzero_si128()));
736
737 // if mask pixels are not all zero, we will blend the dst pixels
738 if (pack_cmp != 0xFFFF) {
739 // Unpack 4 16bit mask pixels to
740 // (p0, 0, p1, 0, p2, 0, p3, 0)
741 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
742 _mm_setzero_si128());
743
744 // Process 4 32bit dst pixels
745 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
746 mask_pixel);
747 _mm_store_si128(d, result);
748 }
749
750 d++;
751 src += 4;
752 width -= 4;
753 }
754
755 dst = reinterpret_cast<SkPMColor*>(d);
756 }
757
758 while (width > 0) {
759 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
760 src++;
761 dst++;
762 width--;
763 }
764 }
765