/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED
10
11 #include "include/private/SkColorData.h"
12
13 #include <utility>
14
15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
16 #include <immintrin.h>
17 #elif defined(SK_ARM_HAS_NEON)
18 #include <arm_neon.h>
19 #endif
20
21 namespace SK_OPTS_NS {
22
// Premultiplies `count` 32-bit pixels from `src` into `dst`, keeping the channel order.
//
// Each pixel is read as alpha in bits 24-31, blue in bits 16-23, green in bits 8-15 and
// red in bits 0-7.  Every color channel becomes (channel * alpha + 127) / 255 and alpha is
// copied through unchanged.  A `count` of zero or less writes nothing.
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        // Rounded divide by 255.  The operands promote to int, so the product cannot wrap.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
38
// Premultiplies `count` 32-bit pixels from `src` into `dst` and swaps the red and blue
// channels.
//
// Each pixel is read as alpha in bits 24-31, blue in bits 16-23, green in bits 8-15 and
// red in bits 0-7.  Every color channel becomes (channel * alpha + 127) / 255; the output
// keeps alpha and green in place and exchanges the positions of red and blue.
// A `count` of zero or less writes nothing.
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        // Rounded divide by 255.  The operands promote to int, so the product cannot wrap.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        // Red goes to bits 16-23 and blue to bits 0-7: the swap.
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
54
// Copies `count` 32-bit pixels from `src` to `dst`, exchanging the byte in bits 16-23 with
// the byte in bits 0-7 (red <-> blue).  Alpha (bits 24-31) and green (bits 8-15) stay in
// place and no premultiplication is performed.  A `count` of zero or less writes nothing.
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
67
// Expands `count` packed 3-byte pixels from `src` into 32-bit pixels in `dst`, inserting an
// opaque (0xFF) alpha byte.
//
// For every pixel, src[0] lands in bits 0-7, src[1] in bits 8-15, src[2] in bits 16-23 and
// 0xFF in bits 24-31.  `src` is advanced by 3 bytes per pixel, so it must hold 3 * count
// bytes.  A `count` of zero or less writes nothing.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
80
// Expands `count` packed 3-byte pixels from `src` into 32-bit pixels in `dst`, swapping the
// first and third bytes and inserting an opaque (0xFF) alpha byte.
//
// For every pixel, src[0] lands in bits 16-23, src[1] in bits 8-15, src[2] in bits 0-7 and
// 0xFF in bits 24-31.  `src` is advanced by 3 bytes per pixel, so it must hold 3 * count
// bytes.  A `count` of zero or less writes nothing.
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
93
// Expands `count` single-byte gray values from `src` into 32-bit pixels in `dst`.
// The gray byte is replicated into the three low bytes and the top byte is set to 0xFF
// (opaque).  A `count` of zero or less writes nothing.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
102
// Expands `count` (gray, alpha) byte pairs from `src` into 32-bit pixels in `dst` without
// premultiplying.
//
// The gray byte is replicated into the three low bytes and the alpha byte goes to bits
// 24-31.  `src` is advanced by 2 bytes per pixel, so it must hold 2 * count bytes.
// A `count` of zero or less writes nothing.
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
114
// Expands `count` (gray, alpha) byte pairs from `src` into premultiplied 32-bit pixels in
// `dst`.
//
// Gray is first scaled to (gray * alpha + 127) / 255, then replicated into the three low
// bytes; the alpha byte goes to bits 24-31 unchanged.  `src` is advanced by 2 bytes per
// pixel, so it must hold 2 * count bytes.  A `count` of zero or less writes nothing.
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        // Rounded divide by 255.  The operands promote to int, so the product cannot wrap.
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
127
// Converts `count` 32-bit inverted-CMYK pixels from `src` into opaque 32-bit pixels in `dst`.
//
// Each source pixel is read as k in bits 24-31, y in bits 16-23, m in bits 8-15 and c in
// bits 0-7.  The output holds (c*k+127)/255 in bits 0-7, (m*k+127)/255 in bits 8-15,
// (y*k+127)/255 in bits 16-23 and 0xFF in bits 24-31.
// A `count` of zero or less writes nothing.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}
144
// Converts `count` 32-bit inverted-CMYK pixels from `src` into opaque 32-bit pixels in `dst`,
// with the red and blue results swapped relative to inverted_CMYK_to_RGB1_portable.
//
// Each source pixel is read as k in bits 24-31, y in bits 16-23, m in bits 8-15 and c in
// bits 0-7.  The output holds (y*k+127)/255 in bits 0-7, (m*k+127)/255 in bits 8-15,
// (c*k+127)/255 in bits 16-23 and 0xFF in bits 24-31.
// A `count` of zero or less writes nothing.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // Rounded scale by k; the operands promote to int, so the products cannot wrap.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}
160
161 #if defined(SK_ARM_HAS_NEON)
162
163 // Rounded divide by 255, (x + 127) / 255
div255_round(uint16x8_t x)164 static uint8x8_t div255_round(uint16x8_t x) {
165 // result = (x + 127) / 255
166 // result = (x + 127) / 256 + error1
167 //
168 // error1 = (x + 127) / (255 * 256)
169 // error1 = (x + 127) / (256 * 256) + error2
170 //
171 // error2 = (x + 127) / (255 * 256 * 256)
172 //
173 // The maximum value of error2 is too small to matter. Thus:
174 // result = (x + 127) / 256 + (x + 127) / (256 * 256)
175 // result = ((x + 127) / 256 + x + 127) / 256
176 // result = ((x + 127) >> 8 + x + 127) >> 8
177 //
178 // Use >>> to represent "rounded right shift" which, conveniently,
179 // NEON supports in one instruction.
180 // result = ((x >>> 8) + x) >>> 8
181 //
182 // Note that the second right shift is actually performed as an
183 // "add, round, and narrow back to 8-bits" instruction.
184 return vraddhn_u16(x, vrshrq_n_u16(x, 8));
185 }
186
187 // Scale a byte by another, (x * y + 127) / 255
scale(uint8x8_t x,uint8x8_t y)188 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
189 return div255_round(vmull_u8(x, y));
190 }
191
192 template <bool kSwapRB>
premul_should_swapRB(uint32_t * dst,const uint32_t * src,int count)193 static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
194 while (count >= 8) {
195 // Load 8 pixels.
196 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
197
198 uint8x8_t a = rgba.val[3],
199 b = rgba.val[2],
200 g = rgba.val[1],
201 r = rgba.val[0];
202
203 // Premultiply.
204 b = scale(b, a);
205 g = scale(g, a);
206 r = scale(r, a);
207
208 // Store 8 premultiplied pixels.
209 if (kSwapRB) {
210 rgba.val[2] = r;
211 rgba.val[1] = g;
212 rgba.val[0] = b;
213 } else {
214 rgba.val[2] = b;
215 rgba.val[1] = g;
216 rgba.val[0] = r;
217 }
218 vst4_u8((uint8_t*) dst, rgba);
219 src += 8;
220 dst += 8;
221 count -= 8;
222 }
223
224 // Call portable code to finish up the tail of [0,8) pixels.
225 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
226 proc(dst, src, count);
227 }
228
RGBA_to_rgbA(uint32_t * dst,const uint32_t * src,int count)229 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
230 premul_should_swapRB<false>(dst, src, count);
231 }
232
RGBA_to_bgrA(uint32_t * dst,const uint32_t * src,int count)233 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
234 premul_should_swapRB<true>(dst, src, count);
235 }
236
RGBA_to_BGRA(uint32_t * dst,const uint32_t * src,int count)237 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
238 using std::swap;
239 while (count >= 16) {
240 // Load 16 pixels.
241 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
242
243 // Swap r and b.
244 swap(rgba.val[0], rgba.val[2]);
245
246 // Store 16 pixels.
247 vst4q_u8((uint8_t*) dst, rgba);
248 src += 16;
249 dst += 16;
250 count -= 16;
251 }
252
253 if (count >= 8) {
254 // Load 8 pixels.
255 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
256
257 // Swap r and b.
258 swap(rgba.val[0], rgba.val[2]);
259
260 // Store 8 pixels.
261 vst4_u8((uint8_t*) dst, rgba);
262 src += 8;
263 dst += 8;
264 count -= 8;
265 }
266
267 RGBA_to_BGRA_portable(dst, src, count);
268 }
269
270 template <bool kSwapRB>
insert_alpha_should_swaprb(uint32_t dst[],const uint8_t * src,int count)271 static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
272 while (count >= 16) {
273 // Load 16 pixels.
274 uint8x16x3_t rgb = vld3q_u8(src);
275
276 // Insert an opaque alpha channel and swap if needed.
277 uint8x16x4_t rgba;
278 if (kSwapRB) {
279 rgba.val[0] = rgb.val[2];
280 rgba.val[2] = rgb.val[0];
281 } else {
282 rgba.val[0] = rgb.val[0];
283 rgba.val[2] = rgb.val[2];
284 }
285 rgba.val[1] = rgb.val[1];
286 rgba.val[3] = vdupq_n_u8(0xFF);
287
288 // Store 16 pixels.
289 vst4q_u8((uint8_t*) dst, rgba);
290 src += 16*3;
291 dst += 16;
292 count -= 16;
293 }
294
295 if (count >= 8) {
296 // Load 8 pixels.
297 uint8x8x3_t rgb = vld3_u8(src);
298
299 // Insert an opaque alpha channel and swap if needed.
300 uint8x8x4_t rgba;
301 if (kSwapRB) {
302 rgba.val[0] = rgb.val[2];
303 rgba.val[2] = rgb.val[0];
304 } else {
305 rgba.val[0] = rgb.val[0];
306 rgba.val[2] = rgb.val[2];
307 }
308 rgba.val[1] = rgb.val[1];
309 rgba.val[3] = vdup_n_u8(0xFF);
310
311 // Store 8 pixels.
312 vst4_u8((uint8_t*) dst, rgba);
313 src += 8*3;
314 dst += 8;
315 count -= 8;
316 }
317
318 // Call portable code to finish up the tail of [0,8) pixels.
319 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
320 proc(dst, src, count);
321 }
322
RGB_to_RGB1(uint32_t dst[],const uint8_t * src,int count)323 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
324 insert_alpha_should_swaprb<false>(dst, src, count);
325 }
326
RGB_to_BGR1(uint32_t dst[],const uint8_t * src,int count)327 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
328 insert_alpha_should_swaprb<true>(dst, src, count);
329 }
330
gray_to_RGB1(uint32_t dst[],const uint8_t * src,int count)331 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
332 while (count >= 16) {
333 // Load 16 pixels.
334 uint8x16_t gray = vld1q_u8(src);
335
336 // Set each of the color channels.
337 uint8x16x4_t rgba;
338 rgba.val[0] = gray;
339 rgba.val[1] = gray;
340 rgba.val[2] = gray;
341 rgba.val[3] = vdupq_n_u8(0xFF);
342
343 // Store 16 pixels.
344 vst4q_u8((uint8_t*) dst, rgba);
345 src += 16;
346 dst += 16;
347 count -= 16;
348 }
349
350 if (count >= 8) {
351 // Load 8 pixels.
352 uint8x8_t gray = vld1_u8(src);
353
354 // Set each of the color channels.
355 uint8x8x4_t rgba;
356 rgba.val[0] = gray;
357 rgba.val[1] = gray;
358 rgba.val[2] = gray;
359 rgba.val[3] = vdup_n_u8(0xFF);
360
361 // Store 8 pixels.
362 vst4_u8((uint8_t*) dst, rgba);
363 src += 8;
364 dst += 8;
365 count -= 8;
366 }
367
368 gray_to_RGB1_portable(dst, src, count);
369 }
370
371 template <bool kPremul>
expand_grayA(uint32_t dst[],const uint8_t * src,int count)372 static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
373 while (count >= 16) {
374 // Load 16 pixels.
375 uint8x16x2_t ga = vld2q_u8(src);
376
377 // Premultiply if requested.
378 if (kPremul) {
379 ga.val[0] = vcombine_u8(
380 scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])),
381 scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
382 }
383
384 // Set each of the color channels.
385 uint8x16x4_t rgba;
386 rgba.val[0] = ga.val[0];
387 rgba.val[1] = ga.val[0];
388 rgba.val[2] = ga.val[0];
389 rgba.val[3] = ga.val[1];
390
391 // Store 16 pixels.
392 vst4q_u8((uint8_t*) dst, rgba);
393 src += 16*2;
394 dst += 16;
395 count -= 16;
396 }
397
398 if (count >= 8) {
399 // Load 8 pixels.
400 uint8x8x2_t ga = vld2_u8(src);
401
402 // Premultiply if requested.
403 if (kPremul) {
404 ga.val[0] = scale(ga.val[0], ga.val[1]);
405 }
406
407 // Set each of the color channels.
408 uint8x8x4_t rgba;
409 rgba.val[0] = ga.val[0];
410 rgba.val[1] = ga.val[0];
411 rgba.val[2] = ga.val[0];
412 rgba.val[3] = ga.val[1];
413
414 // Store 8 pixels.
415 vst4_u8((uint8_t*) dst, rgba);
416 src += 8*2;
417 dst += 8;
418 count -= 8;
419 }
420
421 auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
422 proc(dst, src, count);
423 }
424
grayA_to_RGBA(uint32_t dst[],const uint8_t * src,int count)425 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
426 expand_grayA<false>(dst, src, count);
427 }
428
grayA_to_rgbA(uint32_t dst[],const uint8_t * src,int count)429 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
430 expand_grayA<true>(dst, src, count);
431 }
432
433 enum Format { kRGB1, kBGR1 };
434 template <Format format>
inverted_cmyk_to(uint32_t * dst,const uint32_t * src,int count)435 static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
436 while (count >= 8) {
437 // Load 8 cmyk pixels.
438 uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
439
440 uint8x8_t k = pixels.val[3],
441 y = pixels.val[2],
442 m = pixels.val[1],
443 c = pixels.val[0];
444
445 // Scale to r, g, b.
446 uint8x8_t b = scale(y, k);
447 uint8x8_t g = scale(m, k);
448 uint8x8_t r = scale(c, k);
449
450 // Store 8 rgba pixels.
451 if (kBGR1 == format) {
452 pixels.val[3] = vdup_n_u8(0xFF);
453 pixels.val[2] = r;
454 pixels.val[1] = g;
455 pixels.val[0] = b;
456 } else {
457 pixels.val[3] = vdup_n_u8(0xFF);
458 pixels.val[2] = b;
459 pixels.val[1] = g;
460 pixels.val[0] = r;
461 }
462 vst4_u8((uint8_t*) dst, pixels);
463 src += 8;
464 dst += 8;
465 count -= 8;
466 }
467
468 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
469 proc(dst, src, count);
470 }
471
inverted_CMYK_to_RGB1(uint32_t dst[],const uint32_t * src,int count)472 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
473 inverted_cmyk_to<kRGB1>(dst, src, count);
474 }
475
inverted_CMYK_to_BGR1(uint32_t dst[],const uint32_t * src,int count)476 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
477 inverted_cmyk_to<kBGR1>(dst, src, count);
478 }
479
480 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
481
482 // Scale a byte by another.
483 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
scale(__m128i x,__m128i y)484 static __m128i scale(__m128i x, __m128i y) {
485 const __m128i _128 = _mm_set1_epi16(128);
486 const __m128i _257 = _mm_set1_epi16(257);
487
488 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
489 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
490 }
491
492 template <bool kSwapRB>
premul_should_swapRB(uint32_t * dst,const uint32_t * src,int count)493 static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
494
495 auto premul8 = [](__m128i* lo, __m128i* hi) {
496 const __m128i zeros = _mm_setzero_si128();
497 __m128i planar;
498 if (kSwapRB) {
499 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
500 } else {
501 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
502 }
503
504 // Swizzle the pixels to 8-bit planar.
505 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa
506 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA
507 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG
508 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA
509
510 // Unpack to 16-bit planar.
511 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_
512 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_
513 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
514 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
515
516 // Premultiply!
517 r = scale(r, a);
518 g = scale(g, a);
519 b = scale(b, a);
520
521 // Repack into interlaced pixels.
522 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
523 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
524 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
525 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA
526 };
527
528 while (count >= 8) {
529 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
530 hi = _mm_loadu_si128((const __m128i*) (src + 4));
531
532 premul8(&lo, &hi);
533
534 _mm_storeu_si128((__m128i*) (dst + 0), lo);
535 _mm_storeu_si128((__m128i*) (dst + 4), hi);
536
537 src += 8;
538 dst += 8;
539 count -= 8;
540 }
541
542 if (count >= 4) {
543 __m128i lo = _mm_loadu_si128((const __m128i*) src),
544 hi = _mm_setzero_si128();
545
546 premul8(&lo, &hi);
547
548 _mm_storeu_si128((__m128i*) dst, lo);
549
550 src += 4;
551 dst += 4;
552 count -= 4;
553 }
554
555 // Call portable code to finish up the tail of [0,4) pixels.
556 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
557 proc(dst, src, count);
558 }
559
RGBA_to_rgbA(uint32_t * dst,const uint32_t * src,int count)560 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
561 premul_should_swapRB<false>(dst, src, count);
562 }
563
RGBA_to_bgrA(uint32_t * dst,const uint32_t * src,int count)564 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
565 premul_should_swapRB<true>(dst, src, count);
566 }
567
RGBA_to_BGRA(uint32_t * dst,const uint32_t * src,int count)568 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
569 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
570
571 while (count >= 4) {
572 __m128i rgba = _mm_loadu_si128((const __m128i*) src);
573 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
574 _mm_storeu_si128((__m128i*) dst, bgra);
575
576 src += 4;
577 dst += 4;
578 count -= 4;
579 }
580
581 RGBA_to_BGRA_portable(dst, src, count);
582 }
583
584 template <bool kSwapRB>
insert_alpha_should_swaprb(uint32_t dst[],const uint8_t * src,int count)585 static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
586 const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
587 __m128i expand;
588 const uint8_t X = 0xFF; // Used a placeholder. The value of X is irrelevant.
589 if (kSwapRB) {
590 expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
591 } else {
592 expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
593 }
594
595 while (count >= 6) {
596 // Load a vector. While this actually contains 5 pixels plus an
597 // extra component, we will discard all but the first four pixels on
598 // this iteration.
599 __m128i rgb = _mm_loadu_si128((const __m128i*) src);
600
601 // Expand the first four pixels to RGBX and then mask to RGB(FF).
602 __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
603
604 // Store 4 pixels.
605 _mm_storeu_si128((__m128i*) dst, rgba);
606
607 src += 4*3;
608 dst += 4;
609 count -= 4;
610 }
611
612 // Call portable code to finish up the tail of [0,4) pixels.
613 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
614 proc(dst, src, count);
615 }
616
RGB_to_RGB1(uint32_t dst[],const uint8_t * src,int count)617 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
618 insert_alpha_should_swaprb<false>(dst, src, count);
619 }
620
RGB_to_BGR1(uint32_t dst[],const uint8_t * src,int count)621 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
622 insert_alpha_should_swaprb<true>(dst, src, count);
623 }
624
gray_to_RGB1(uint32_t dst[],const uint8_t * src,int count)625 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
626 const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
627 while (count >= 16) {
628 __m128i grays = _mm_loadu_si128((const __m128i*) src);
629
630 __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
631 __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
632 __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
633 __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
634
635 __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
636 __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
637 __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
638 __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
639
640 _mm_storeu_si128((__m128i*) (dst + 0), ggga0);
641 _mm_storeu_si128((__m128i*) (dst + 4), ggga1);
642 _mm_storeu_si128((__m128i*) (dst + 8), ggga2);
643 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
644
645 src += 16;
646 dst += 16;
647 count -= 16;
648 }
649
650 gray_to_RGB1_portable(dst, src, count);
651 }
652
grayA_to_RGBA(uint32_t dst[],const uint8_t * src,int count)653 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
654 while (count >= 8) {
655 __m128i ga = _mm_loadu_si128((const __m128i*) src);
656
657 __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
658 _mm_slli_epi16(ga, 8));
659
660 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
661 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
662
663 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
664 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
665
666 src += 8*2;
667 dst += 8;
668 count -= 8;
669 }
670
671 grayA_to_RGBA_portable(dst, src, count);
672 }
673
grayA_to_rgbA(uint32_t dst[],const uint8_t * src,int count)674 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
675 while (count >= 8) {
676 __m128i grayA = _mm_loadu_si128((const __m128i*) src);
677
678 __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
679 __m128i a0 = _mm_srli_epi16(grayA, 8);
680
681 // Premultiply
682 g0 = scale(g0, a0);
683
684 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
685 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
686
687
688 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
689 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
690
691 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
692 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
693
694 src += 8*2;
695 dst += 8;
696 count -= 8;
697 }
698
699 grayA_to_rgbA_portable(dst, src, count);
700 }
701
702 enum Format { kRGB1, kBGR1 };
703 template <Format format>
inverted_cmyk_to(uint32_t * dst,const uint32_t * src,int count)704 static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
705 auto convert8 = [](__m128i* lo, __m128i* hi) {
706 const __m128i zeros = _mm_setzero_si128();
707 __m128i planar;
708 if (kBGR1 == format) {
709 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
710 } else {
711 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
712 }
713
714 // Swizzle the pixels to 8-bit planar.
715 *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk
716 *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK
717 __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM
718 yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK
719
720 // Unpack to 16-bit planar.
721 __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_
722 m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_
723 y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_
724 k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_
725
726 // Scale to r, g, b.
727 __m128i r = scale(c, k),
728 g = scale(m, k),
729 b = scale(y, k);
730
731 // Repack into interlaced pixels.
732 __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG
733 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
734 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
735 *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1
736 };
737
738 while (count >= 8) {
739 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
740 hi = _mm_loadu_si128((const __m128i*) (src + 4));
741
742 convert8(&lo, &hi);
743
744 _mm_storeu_si128((__m128i*) (dst + 0), lo);
745 _mm_storeu_si128((__m128i*) (dst + 4), hi);
746
747 src += 8;
748 dst += 8;
749 count -= 8;
750 }
751
752 if (count >= 4) {
753 __m128i lo = _mm_loadu_si128((const __m128i*) src),
754 hi = _mm_setzero_si128();
755
756 convert8(&lo, &hi);
757
758 _mm_storeu_si128((__m128i*) dst, lo);
759
760 src += 4;
761 dst += 4;
762 count -= 4;
763 }
764
765 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
766 proc(dst, src, count);
767 }
768
inverted_CMYK_to_RGB1(uint32_t dst[],const uint32_t * src,int count)769 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
770 inverted_cmyk_to<kRGB1>(dst, src, count);
771 }
772
inverted_CMYK_to_BGR1(uint32_t dst[],const uint32_t * src,int count)773 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
774 inverted_cmyk_to<kBGR1>(dst, src, count);
775 }
776
777 #else
778
RGBA_to_rgbA(uint32_t * dst,const uint32_t * src,int count)779 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
780 RGBA_to_rgbA_portable(dst, src, count);
781 }
782
RGBA_to_bgrA(uint32_t * dst,const uint32_t * src,int count)783 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
784 RGBA_to_bgrA_portable(dst, src, count);
785 }
786
RGBA_to_BGRA(uint32_t * dst,const uint32_t * src,int count)787 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
788 RGBA_to_BGRA_portable(dst, src, count);
789 }
790
RGB_to_RGB1(uint32_t dst[],const uint8_t * src,int count)791 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
792 RGB_to_RGB1_portable(dst, src, count);
793 }
794
RGB_to_BGR1(uint32_t dst[],const uint8_t * src,int count)795 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
796 RGB_to_BGR1_portable(dst, src, count);
797 }
798
gray_to_RGB1(uint32_t dst[],const uint8_t * src,int count)799 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
800 gray_to_RGB1_portable(dst, src, count);
801 }
802
grayA_to_RGBA(uint32_t dst[],const uint8_t * src,int count)803 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
804 grayA_to_RGBA_portable(dst, src, count);
805 }
806
grayA_to_rgbA(uint32_t dst[],const uint8_t * src,int count)807 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
808 grayA_to_rgbA_portable(dst, src, count);
809 }
810
inverted_CMYK_to_RGB1(uint32_t dst[],const uint32_t * src,int count)811 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
812 inverted_CMYK_to_RGB1_portable(dst, src, count);
813 }
814
inverted_CMYK_to_BGR1(uint32_t dst[],const uint32_t * src,int count)815 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
816 inverted_CMYK_to_BGR1_portable(dst, src, count);
817 }
818
819 #endif
820
821 }
822
823 #endif // SkSwizzler_opts_DEFINED
824