1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkColorPriv.h"
11 #include "SkUtils.h"
12
13 #include <emmintrin.h>
14
15 /* SSE2 version of S32_Blend_BlitRow32()
16 * portable version is in core/SkBlitRow_D32.cpp
17 */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)18 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
19 const SkPMColor* SK_RESTRICT src,
20 int count, U8CPU alpha) {
21 SkASSERT(alpha <= 255);
22 if (count <= 0) {
23 return;
24 }
25
26 uint32_t src_scale = SkAlpha255To256(alpha);
27 uint32_t dst_scale = 256 - src_scale;
28
29 if (count >= 4) {
30 SkASSERT(((size_t)dst & 0x03) == 0);
31 while (((size_t)dst & 0x0F) != 0) {
32 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
33 src++;
34 dst++;
35 count--;
36 }
37
38 const __m128i *s = reinterpret_cast<const __m128i*>(src);
39 __m128i *d = reinterpret_cast<__m128i*>(dst);
40 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
41 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
42
43 // Move scale factors to upper byte of word
44 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
45 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
46 while (count >= 4) {
47 // Load 4 pixels each of src and dest.
48 __m128i src_pixel = _mm_loadu_si128(s);
49 __m128i dst_pixel = _mm_load_si128(d);
50
51 // Interleave Atom port 0/1 operations based on the execution port
52 // constraints that multiply can only be executed on port 0 (while
53 // boolean operations can be executed on either port 0 or port 1)
54 // because GCC currently doesn't do a good job scheduling
55 // instructions based on these constraints.
56
57 // Get red and blue pixels into lower byte of each word.
58 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
59 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
60
61 // Multiply by scale.
62 // (4 x (0, rs.h, 0, bs.h))
63 // where rs.h stands for the higher byte of r * scale, and
64 // bs.h the higher byte of b * scale.
65 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
66
67 // Get alpha and green pixels into higher byte of each word.
68 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
69 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
70
71 // Multiply by scale.
72 // (4 x (as.h, as.l, gs.h, gs.l))
73 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
74
75 // Clear the lower byte of the a*scale and g*scale results
76 // (4 x (as.h, 0, gs.h, 0))
77 src_ag = _mm_and_si128(src_ag, ag_mask);
78
79 // Operations the destination pixels are the same as on the
80 // source pixels. See the comments above.
81 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
82 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
83 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
84 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
85 dst_ag = _mm_and_si128(dst_ag, ag_mask);
86
87 // Combine back into RGBA.
88 // (4 x (as.h, rs.h, gs.h, bs.h))
89 src_pixel = _mm_or_si128(src_rb, src_ag);
90 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
91
92 // Add result
93 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
94 _mm_store_si128(d, result);
95 s++;
96 d++;
97 count -= 4;
98 }
99 src = reinterpret_cast<const SkPMColor*>(s);
100 dst = reinterpret_cast<SkPMColor*>(d);
101 }
102
103 while (count > 0) {
104 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
105 src++;
106 dst++;
107 count--;
108 }
109 }
110
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)111 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
112 const SkPMColor* SK_RESTRICT src,
113 int count, U8CPU alpha) {
114 SkASSERT(alpha == 255);
115 if (count <= 0) {
116 return;
117 }
118
119 if (count >= 4) {
120 SkASSERT(((size_t)dst & 0x03) == 0);
121 while (((size_t)dst & 0x0F) != 0) {
122 *dst = SkPMSrcOver(*src, *dst);
123 src++;
124 dst++;
125 count--;
126 }
127
128 const __m128i *s = reinterpret_cast<const __m128i*>(src);
129 __m128i *d = reinterpret_cast<__m128i*>(dst);
130 #ifdef SK_USE_ACCURATE_BLENDING
131 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
132 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
133 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
134 while (count >= 4) {
135 // Load 4 pixels
136 __m128i src_pixel = _mm_loadu_si128(s);
137 __m128i dst_pixel = _mm_load_si128(d);
138
139 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
140 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
141 // Shift alphas down to lower 8 bits of each quad.
142 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
143
144 // Copy alpha to upper 3rd byte of each quad
145 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
146
147 // Subtract alphas from 255, to get 0..255
148 alpha = _mm_sub_epi16(c_255, alpha);
149
150 // Multiply by red and blue by src alpha.
151 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
152 // Multiply by alpha and green by src alpha.
153 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
154
155 // dst_rb_low = (dst_rb >> 8)
156 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
157 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
158
159 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
160 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
161 dst_rb = _mm_add_epi16(dst_rb, c_128);
162 dst_rb = _mm_srli_epi16(dst_rb, 8);
163
164 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
165 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
166 dst_ag = _mm_add_epi16(dst_ag, c_128);
167 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
168
169 // Combine back into RGBA.
170 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
171
172 // Add result
173 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
174 _mm_store_si128(d, result);
175 s++;
176 d++;
177 count -= 4;
178 }
179 #else
180 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
181 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
182 while (count >= 4) {
183 // Load 4 pixels
184 __m128i src_pixel = _mm_loadu_si128(s);
185 __m128i dst_pixel = _mm_load_si128(d);
186
187 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
188 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
189
190 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
191 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
192
193 // (a0, a0, a1, a1, a2, g2, a3, g3)
194 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
195
196 // (a0, a0, a1, a1, a2, a2, a3, a3)
197 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
198
199 // Subtract alphas from 256, to get 1..256
200 alpha = _mm_sub_epi16(c_256, alpha);
201
202 // Multiply by red and blue by src alpha.
203 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
204 // Multiply by alpha and green by src alpha.
205 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
206
207 // Divide by 256.
208 dst_rb = _mm_srli_epi16(dst_rb, 8);
209
210 // Mask out high bits (already in the right place)
211 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
212
213 // Combine back into RGBA.
214 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
215
216 // Add result
217 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
218 _mm_store_si128(d, result);
219 s++;
220 d++;
221 count -= 4;
222 }
223 #endif
224 src = reinterpret_cast<const SkPMColor*>(s);
225 dst = reinterpret_cast<SkPMColor*>(d);
226 }
227
228 while (count > 0) {
229 *dst = SkPMSrcOver(*src, *dst);
230 src++;
231 dst++;
232 count--;
233 }
234 }
235
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)236 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
237 const SkPMColor* SK_RESTRICT src,
238 int count, U8CPU alpha) {
239 SkASSERT(alpha <= 255);
240 if (count <= 0) {
241 return;
242 }
243
244 if (count >= 4) {
245 while (((size_t)dst & 0x0F) != 0) {
246 *dst = SkBlendARGB32(*src, *dst, alpha);
247 src++;
248 dst++;
249 count--;
250 }
251
252 uint32_t src_scale = SkAlpha255To256(alpha);
253
254 const __m128i *s = reinterpret_cast<const __m128i*>(src);
255 __m128i *d = reinterpret_cast<__m128i*>(dst);
256 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
257 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
258 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
259 while (count >= 4) {
260 // Load 4 pixels each of src and dest.
261 __m128i src_pixel = _mm_loadu_si128(s);
262 __m128i dst_pixel = _mm_load_si128(d);
263
264 // Get red and blue pixels into lower byte of each word.
265 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
266 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
267
268 // Get alpha and green into lower byte of each word.
269 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
270 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
271
272 // Put per-pixel alpha in low byte of each word.
273 // After the following two statements, the dst_alpha looks like
274 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
275 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
276 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
277
278 // dst_alpha = dst_alpha * src_scale
279 // Because src_scales are in the higher byte of each word and
280 // we use mulhi here, the resulting alpha values are already
281 // in the right place and don't need to be divided by 256.
282 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
283 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
284
285 // Subtract alphas from 256, to get 1..256
286 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
287
288 // Multiply red and blue by dst pixel alpha.
289 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
290 // Multiply alpha and green by dst pixel alpha.
291 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
292
293 // Multiply red and blue by global alpha.
294 // (4 x (0, rs.h, 0, bs.h))
295 // where rs.h stands for the higher byte of r * src_scale,
296 // and bs.h the higher byte of b * src_scale.
297 // Again, because we use mulhi, the resuling red and blue
298 // values are already in the right place and don't need to
299 // be divided by 256.
300 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
301 // Multiply alpha and green by global alpha.
302 // (4 x (0, as.h, 0, gs.h))
303 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
304
305 // Divide by 256.
306 dst_rb = _mm_srli_epi16(dst_rb, 8);
307
308 // Mask out low bits (goodies already in the right place; no need to divide)
309 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
310 // Shift alpha and green to higher byte of each word.
311 // (4 x (as.h, 0, gs.h, 0))
312 src_ag = _mm_slli_epi16(src_ag, 8);
313
314 // Combine back into RGBA.
315 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
316 src_pixel = _mm_or_si128(src_rb, src_ag);
317
318 // Add two pixels into result.
319 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
320 _mm_store_si128(d, result);
321 s++;
322 d++;
323 count -= 4;
324 }
325 src = reinterpret_cast<const SkPMColor*>(s);
326 dst = reinterpret_cast<SkPMColor*>(d);
327 }
328
329 while (count > 0) {
330 *dst = SkBlendARGB32(*src, *dst, alpha);
331 src++;
332 dst++;
333 count--;
334 }
335 }
336
337 /* SSE2 version of Color32()
338 * portable version is in core/SkBlitRow_D32.cpp
339 */
Color32_SSE2(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)340 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
341 SkPMColor color) {
342
343 if (count <= 0) {
344 return;
345 }
346
347 if (0 == color) {
348 if (src != dst) {
349 memcpy(dst, src, count * sizeof(SkPMColor));
350 }
351 return;
352 }
353
354 unsigned colorA = SkGetPackedA32(color);
355 if (255 == colorA) {
356 sk_memset32(dst, color, count);
357 } else {
358 unsigned scale = 256 - SkAlpha255To256(colorA);
359
360 if (count >= 4) {
361 SkASSERT(((size_t)dst & 0x03) == 0);
362 while (((size_t)dst & 0x0F) != 0) {
363 *dst = color + SkAlphaMulQ(*src, scale);
364 src++;
365 dst++;
366 count--;
367 }
368
369 const __m128i *s = reinterpret_cast<const __m128i*>(src);
370 __m128i *d = reinterpret_cast<__m128i*>(dst);
371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372 __m128i src_scale_wide = _mm_set1_epi16(scale);
373 __m128i color_wide = _mm_set1_epi32(color);
374 while (count >= 4) {
375 // Load 4 pixels each of src and dest.
376 __m128i src_pixel = _mm_loadu_si128(s);
377
378 // Get red and blue pixels into lower byte of each word.
379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380
381 // Get alpha and green into lower byte of each word.
382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383
384 // Multiply by scale.
385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387
388 // Divide by 256.
389 src_rb = _mm_srli_epi16(src_rb, 8);
390 src_ag = _mm_andnot_si128(rb_mask, src_ag);
391
392 // Combine back into RGBA.
393 src_pixel = _mm_or_si128(src_rb, src_ag);
394
395 // Add color to result.
396 __m128i result = _mm_add_epi8(color_wide, src_pixel);
397
398 // Store result.
399 _mm_store_si128(d, result);
400 s++;
401 d++;
402 count -= 4;
403 }
404 src = reinterpret_cast<const SkPMColor*>(s);
405 dst = reinterpret_cast<SkPMColor*>(d);
406 }
407
408 while (count > 0) {
409 *dst = color + SkAlphaMulQ(*src, scale);
410 src += 1;
411 dst += 1;
412 count--;
413 }
414 }
415 }
416
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)417 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
418 size_t maskRB, SkColor origColor,
419 int width, int height) {
420 SkPMColor color = SkPreMultiplyColor(origColor);
421 size_t dstOffset = dstRB - (width << 2);
422 size_t maskOffset = maskRB - width;
423 SkPMColor* dst = (SkPMColor *)device;
424 const uint8_t* mask = (const uint8_t*)maskPtr;
425 do {
426 int count = width;
427 if (count >= 4) {
428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429 *dst = SkBlendARGB32(color, *dst, *mask);
430 mask++;
431 dst++;
432 count--;
433 }
434 __m128i *d = reinterpret_cast<__m128i*>(dst);
435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436 __m128i c_256 = _mm_set1_epi16(256);
437 __m128i c_1 = _mm_set1_epi16(1);
438 __m128i src_pixel = _mm_set1_epi32(color);
439 while (count >= 4) {
440 // Load 4 pixels each of src and dest.
441 __m128i dst_pixel = _mm_load_si128(d);
442
443 //set the aphla value
444 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
445 0, *(mask+3),0, \
446 *(mask+2),0, *(mask+2),\
447 0,*(mask+1), 0,*(mask+1),\
448 0, *mask,0,*mask);
449
450 //call SkAlpha255To256()
451 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
452
453 // Get red and blue pixels into lower byte of each word.
454 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
455 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
456
457 // Get alpha and green into lower byte of each word.
458 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
459 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
460
461 // Put per-pixel alpha in low byte of each word.
462 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
463 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
464
465 // dst_alpha = dst_alpha * src_scale
466 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
467
468 // Divide by 256.
469 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
470
471 // Subtract alphas from 256, to get 1..256
472 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
473 // Multiply red and blue by dst pixel alpha.
474 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
475 // Multiply alpha and green by dst pixel alpha.
476 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
477
478 // Multiply red and blue by global alpha.
479 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
480 // Multiply alpha and green by global alpha.
481 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
482 // Divide by 256.
483 dst_rb = _mm_srli_epi16(dst_rb, 8);
484 src_rb = _mm_srli_epi16(src_rb, 8);
485
486 // Mask out low bits (goodies already in the right place; no need to divide)
487 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
488 src_ag = _mm_andnot_si128(rb_mask, src_ag);
489
490 // Combine back into RGBA.
491 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
492 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
493
494 // Add two pixels into result.
495 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
496 _mm_store_si128(d, result);
497 // load the next 4 pixel
498 mask = mask + 4;
499 d++;
500 count -= 4;
501 }
502 dst = reinterpret_cast<SkPMColor *>(d);
503 }
504 while(count > 0) {
505 *dst= SkBlendARGB32(color, *dst, *mask);
506 dst += 1;
507 mask++;
508 count --;
509 }
510 dst = (SkPMColor *)((char*)dst + dstOffset);
511 mask += maskOffset;
512 } while (--height != 0);
513 }
514
SkBlendLCD16_SSE2(__m128i & srci,__m128i & dst,__m128i & mask,__m128i & scale)515 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
516 __m128i &mask, __m128i &scale) {
517 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
518 __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
519 16-SK_R16_SHIFT-(SK_R16_BITS-5)),
520 _mm_set1_epi32(0x001F0000));
521
522 __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
523 8-SK_G16_SHIFT-(SK_G16_BITS-5)),
524 _mm_set1_epi32(0x00001F00));
525
526 __m128i b = _mm_and_si128(_mm_slli_epi32(mask,
527 SK_B16_BITS-5),
528 _mm_set1_epi32(0x0000001F));
529
530 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
531 mask = _mm_or_si128(_mm_or_si128(r, g), b);
532
533 // Interleave R,G,B into the lower byte of word.
534 __m128i maskLo, maskHi;
535 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
536 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
537
538 // Upscale to 0..32
539 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
540 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
541
542 maskLo = _mm_mullo_epi16(maskLo, scale);
543 maskHi = _mm_mullo_epi16(maskHi, scale);
544
545 maskLo = _mm_srli_epi16(maskLo, 8);
546 maskHi = _mm_srli_epi16(maskHi, 8);
547
548 // Interleave R,G,B into the lower byte of the word.
549 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
550 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
551
552 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
553 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
554
555 maskLo = _mm_srai_epi16(maskLo, 5);
556 maskHi = _mm_srai_epi16(maskHi, 5);
557
558 // Add two pixels into result.
559 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
560 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
561
562 // Pack into 4 32bit dst pixels
563 return _mm_packus_epi16(resultLo, resultHi);
564 }
565
SkBlendLCD16Opaque_SSE2(__m128i & srci,__m128i & dst,__m128i & mask)566 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
567 __m128i &mask) {
568 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
569 __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
570 16-SK_R16_SHIFT-(SK_R16_BITS-5)),
571 _mm_set1_epi32(0x001F0000));
572
573 __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
574 8-SK_G16_SHIFT-(SK_G16_BITS-5)),
575 _mm_set1_epi32(0x00001F00));
576
577 __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5),
578 _mm_set1_epi32(0x0000001F));
579
580 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
581 mask = _mm_or_si128(_mm_or_si128(r, g), b);
582
583 // Interleave R,G,B into the lower byte of word.
584 __m128i maskLo, maskHi;
585 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
586 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
587
588 // Upscale to 0..32
589 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
590 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
591
592 // Interleave R,G,B into the lower byte of the word.
593 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
594 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
595
596 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
597 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
598
599 maskLo = _mm_srai_epi16(maskLo, 5);
600 maskHi = _mm_srai_epi16(maskHi, 5);
601
602 // Add two pixels into result.
603 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
604 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
605
606 // Pack into 4 32bit dst pixels
607 return _mm_packus_epi16(resultLo, resultHi);
608 }
609
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)610 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
611 SkColor color, int width, SkPMColor) {
612 if (width <= 0) {
613 return;
614 }
615
616 int srcA = SkColorGetA(color);
617 int srcR = SkColorGetR(color);
618 int srcG = SkColorGetG(color);
619 int srcB = SkColorGetB(color);
620
621 srcA = SkAlpha255To256(srcA);
622
623 if (width >= 4) {
624 SkASSERT(((size_t)dst & 0x03) == 0);
625 while (((size_t)dst & 0x0F) != 0) {
626 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
627 src++;
628 dst++;
629 width--;
630 }
631
632 __m128i *d = reinterpret_cast<__m128i*>(dst);
633 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
634 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
635 __m128i scale = _mm_set1_epi16(srcA);
636 while (width >= 4) {
637 __m128i dst_pixel = _mm_load_si128(d);
638 __m128i mask_pixel = _mm_loadl_epi64(
639 reinterpret_cast<const __m128i*>(src));
640
641 // Check whether mask_pixels are equal to 0 and get the highest bit
642 // of each byte of result, if mask pixes are all zero, we will get
643 // pack_cmp to 0xFFFF
644 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
645 _mm_setzero_si128()));
646
647 // if mask pixels are not all zero, we will blend the dst pixels
648 if (pack_cmp != 0xFFFF) {
649 // Unpack 4 16bit mask pixels to
650 // (p0, 0, p1, 0, p2, 0, p3, 0)
651 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
652 _mm_setzero_si128());
653
654 // Process 4 32bit dst pixels
655 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
656 mask_pixel, scale);
657 _mm_store_si128(d, result);
658 }
659
660 d++;
661 src += 4;
662 width -= 4;
663 }
664
665 dst = reinterpret_cast<SkPMColor*>(d);
666 }
667
668 while (width > 0) {
669 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
670 src++;
671 dst++;
672 width--;
673 }
674 }
675
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)676 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
677 SkColor color, int width, SkPMColor opaqueDst) {
678 if (width <= 0) {
679 return;
680 }
681
682 int srcR = SkColorGetR(color);
683 int srcG = SkColorGetG(color);
684 int srcB = SkColorGetB(color);
685
686 if (width >= 4) {
687 SkASSERT(((size_t)dst & 0x03) == 0);
688 while (((size_t)dst & 0x0F) != 0) {
689 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
690 src++;
691 dst++;
692 width--;
693 }
694
695 __m128i *d = reinterpret_cast<__m128i*>(dst);
696 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
697 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
698 while (width >= 4) {
699 __m128i dst_pixel = _mm_load_si128(d);
700 __m128i mask_pixel = _mm_loadl_epi64(
701 reinterpret_cast<const __m128i*>(src));
702
703 // Check whether mask_pixels are equal to 0 and get the highest bit
704 // of each byte of result, if mask pixes are all zero, we will get
705 // pack_cmp to 0xFFFF
706 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
707 _mm_setzero_si128()));
708
709 // if mask pixels are not all zero, we will blend the dst pixels
710 if (pack_cmp != 0xFFFF) {
711 // Unpack 4 16bit mask pixels to
712 // (p0, 0, p1, 0, p2, 0, p3, 0)
713 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
714 _mm_setzero_si128());
715
716 // Process 4 32bit dst pixels
717 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
718 mask_pixel);
719 _mm_store_si128(d, result);
720 }
721
722 d++;
723 src += 4;
724 width -= 4;
725 }
726
727 dst = reinterpret_cast<SkPMColor*>(d);
728 }
729
730 while (width > 0) {
731 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
732 src++;
733 dst++;
734 width--;
735 }
736 }
737