/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
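
// Example of the rounding above: premultiplying g = 0x80 by a = 0x80 gives
// (128*128 + 127) / 255 = 64 == round(128 * 128 / 255.0).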

static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}
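
// (Roughly, inferred rather than stated here: the stored bytes are inverted
// CMYK, i.e. 255 minus each ink value, so the usual r = (255-C)*(255-K)/255
// collapses to a rounded r = c*k/255 on the stored values; see SkSwizzler.cpp.)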

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
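
// A standalone sanity check of that identity (a sketch, not part of this
// header's build; x >>> 8 written out as (x + 128) >> 8):
//
//     for (uint32_t x = 0; x <= 255*255; x++) {
//         SkASSERT(((x + ((x + 128) >> 8)) + 128) >> 8 == (x + 127) / 255);
//     }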

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
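        // (vld4_u8 de-interleaves as it loads: val[0..3] receive the r,g,b,a
        // bytes of the 8 pixels as separate 8-byte planes.)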

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA(false, dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA(true, dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m512i scale(__m512i x, __m512i y) {
    const __m512i _128 = _mm512_set1_epi16(128);
    const __m512i _257 = _mm512_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm512_mulhi_epu16(_mm512_add_epi16(_mm512_mullo_epi16(x, y), _128), _257);
}
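
// A standalone sanity check of that identity (a sketch, not part of this
// header's build); _mm512_mulhi_epu16 supplies the final >>16:
//
//     for (uint32_t x = 0; x <= 255*255; x++) {
//         SkASSERT((x + 127) / 255 == ((x + 128) * 257) >> 16);
//     }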

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m512i* lo, __m512i* hi) {
        const __m512i zeros = _mm512_setzero_si512();
        skvx::Vec<64, uint8_t> mask;
        if (kSwapRB) {
            mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
        } else {
            mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
        }
        __m512i planar = skvx::bit_pun<__m512i>(mask);

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm512_shuffle_epi8(*lo, planar);
        *hi = _mm512_shuffle_epi8(*hi, planar);
        __m512i rg = _mm512_unpacklo_epi32(*lo, *hi),
                ba = _mm512_unpackhi_epi32(*lo, *hi);

        // Unpack to 16-bit planar.
        __m512i r = _mm512_unpacklo_epi8(rg, zeros),
                g = _mm512_unpackhi_epi8(rg, zeros),
                b = _mm512_unpacklo_epi8(ba, zeros),
                a = _mm512_unpackhi_epi8(ba, zeros);

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8));
        ba = _mm512_or_si512(b, _mm512_slli_epi16(a, 8));
        *lo = _mm512_unpacklo_epi16(rg, ba);
        *hi = _mm512_unpackhi_epi16(rg, ba);
    };

    while (count >= 32) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
                hi = _mm512_loadu_si512((const __m512i*) (src + 16));

        premul8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) (dst + 0), lo);
        _mm512_storeu_si512((__m512i*) (dst + 16), hi);

        src += 32;
        dst += 32;
        count -= 32;
    }

    if (count >= 16) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) src),
                hi = _mm512_setzero_si512();

        premul8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) dst, lo);

        src += 16;
        dst += 16;
        count -= 16;
    }

    // Call portable code to finish up the tail of [0,16) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const uint8_t mask[64] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
    const __m512i swapRB = _mm512_loadu_si512(mask);

    while (count >= 16) {
        __m512i rgba = _mm512_loadu_si512((const __m512i*) src);
        __m512i bgra = _mm512_shuffle_epi8(rgba, swapRB);
        _mm512_storeu_si512((__m512i*) dst, bgra);

        src += 16;
        dst += 16;
        count -= 16;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Use the SSSE3 impl, as the AVX2 / AVX-512 impls regress performance for
// RGB_to_RGB1 / RGB_to_BGR1.

// Use the AVX2 impl, as the AVX-512 impl regresses performance for gray_to_RGB1.

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 32) {
        __m512i ga = _mm512_loadu_si512((const __m512i*) src);

        __m512i gg = _mm512_or_si512(_mm512_and_si512(ga, _mm512_set1_epi16(0x00FF)),
                                     _mm512_slli_epi16(ga, 8));

        __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
        __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);

        // 1st shuffle for pixel reorder.
        // Note. 'p' stands for 'ggga'
        // Before 1st shuffle:
        //     ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11 | p16 p17 p18 p19 | p24 p25 p26 p27
        //     ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15 | p20 p21 p22 p23 | p28 p29 p30 p31
        //
        // After 1st shuffle:
        //     ggga_lo_shuffle_1 =
        //               p0  p1  p2  p3  | p8  p9  p10 p11 | p4  p5  p6  p7  | p12 p13 p14 p15
        //     ggga_hi_shuffle_1 =
        //               p16 p17 p18 p19 | p24 p25 p26 p27 | p20 p21 p22 p23 | p28 p29 p30 p31
        __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
                ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);

        // 2nd shuffle for pixel reorder.
        // After the 2nd shuffle:
        //     ggga_lo_shuffle_2 =
        //               p0  p1  p2  p3  | p4  p5  p6  p7  | p8  p9  p10 p11 | p12 p13 p14 p15
        //     ggga_hi_shuffle_2 =
        //               p16 p17 p18 p19 | p20 p21 p22 p23 | p24 p25 p26 p27 | p28 p29 p30 p31
        __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
                                                         ggga_lo_shuffle_1, 0xd8),
                ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
                                                         ggga_hi_shuffle_1, 0xd8);

        _mm512_storeu_si512((__m512i*) (dst +  0), ggga_lo_shuffle_2);
        _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);

        src += 32*2;
        dst += 32;
        count -= 32;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 32) {
        __m512i grayA = _mm512_loadu_si512((const __m512i*) src);

        __m512i g0 = _mm512_and_si512(grayA, _mm512_set1_epi16(0x00FF));
        __m512i a0 = _mm512_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m512i gg = _mm512_or_si512(g0, _mm512_slli_epi16(g0, 8));
        __m512i ga = _mm512_or_si512(g0, _mm512_slli_epi16(a0, 8));

        __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
        __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);

        // 1st shuffle for pixel reorder, same as grayA_to_RGBA.
        __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
                ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);

        // 2nd shuffle for pixel reorder, same as grayA_to_RGBA.
        __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
                                                         ggga_lo_shuffle_1, 0xd8),
                ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
                                                         ggga_hi_shuffle_1, 0xd8);

        _mm512_storeu_si512((__m512i*) (dst +  0), ggga_lo_shuffle_2);
        _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);

        src += 32*2;
        dst += 32;
        count -= 32;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m512i* lo, __m512i* hi) {
        const __m512i zeros = _mm512_setzero_si512();
        skvx::Vec<64, uint8_t> mask;
        if (kBGR1 == format) {
            mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
        } else {
            mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
        }
        __m512i planar = skvx::bit_pun<__m512i>(mask);

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm512_shuffle_epi8(*lo, planar);
        *hi = _mm512_shuffle_epi8(*hi, planar);
        __m512i cm = _mm512_unpacklo_epi32(*lo, *hi),
                yk = _mm512_unpackhi_epi32(*lo, *hi);

        // Unpack to 16-bit planar.
        __m512i c = _mm512_unpacklo_epi8(cm, zeros),
                m = _mm512_unpackhi_epi8(cm, zeros),
                y = _mm512_unpacklo_epi8(yk, zeros),
                k = _mm512_unpackhi_epi8(yk, zeros);

        // Scale to r, g, b.
        __m512i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m512i rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8)),
                ba = _mm512_or_si512(b, _mm512_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm512_unpacklo_epi16(rg, ba);
        *hi = _mm512_unpackhi_epi16(rg, ba);
    };

    while (count >= 32) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
                hi = _mm512_loadu_si512((const __m512i*) (src + 16));

        convert8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) (dst +  0), lo);
        _mm512_storeu_si512((__m512i*) (dst + 16), hi);

        src += 32;
        dst += 32;
        count -= 32;
    }

    if (count >= 16) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) src),
                hi = _mm512_setzero_si512();

        convert8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) dst, lo);

        src += 16;
        dst += 16;
        count -= 16;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m256i scale(__m256i x, __m256i y) {
    const __m256i _128 = _mm256_set1_epi16(128);
    const __m256i _257 = _mm256_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kSwapRB) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);             // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
        *hi = _mm256_shuffle_epi8(*hi, planar);             // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
        __m256i rg = _mm256_unpacklo_epi32(*lo, *hi),       // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
                ba = _mm256_unpackhi_epi32(*lo, *hi);       // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m256i r = _mm256_unpacklo_epi8(rg, zeros),        // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
                g = _mm256_unpackhi_epi8(rg, zeros),        // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
                b = _mm256_unpacklo_epi8(ba, zeros),        // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
                a = _mm256_unpackhi_epi8(ba, zeros);        // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8));   // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
        ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8));   // babababa BABABABA babababa BABABABA
        *lo = _mm256_unpacklo_epi16(rg, ba);                // rgbargba rgbargba rgbargba rgbargba
        *hi = _mm256_unpackhi_epi16(rg, ba);                // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));

        premul8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);

        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();

        premul8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) dst, lo);

        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                                            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 8) {
        __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
        __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
        _mm256_storeu_si256((__m256i*) dst, bgra);

        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        __m256i ga = _mm256_loadu_si256((const __m256i*) src);

        __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
                                     _mm256_slli_epi16(ga, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Shuffle for pixel reorder.
        // Note. 'p' stands for 'ggga'
        // Before shuffle:
        //     ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11
        //     ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
        //
        // After shuffle:
        //     ggga_lo_shuffle = p0 p1 p2  p3  | p4  p5  p6  p7
        //     ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        __m256i grayA = _mm256_loadu_si256((const __m256i*) src);

        __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
        __m256i a0 = _mm256_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
        __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Shuffle for pixel reorder, same as grayA_to_RGBA.
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kBGR1 == format) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);            // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
        *hi = _mm256_shuffle_epi8(*hi, planar);            // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
        __m256i cm = _mm256_unpacklo_epi32(*lo, *hi),      // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
                yk = _mm256_unpackhi_epi32(*lo, *hi);      // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m256i c = _mm256_unpacklo_epi8(cm, zeros),       // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
                m = _mm256_unpackhi_epi8(cm, zeros),       // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
                y = _mm256_unpacklo_epi8(yk, zeros),       // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
                k = _mm256_unpackhi_epi8(yk, zeros);       // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m256i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels:
        //     rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
        //     ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
        __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
                ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm256_unpacklo_epi16(rg, ba);               // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
        *hi = _mm256_unpackhi_epi16(rg, ba);               // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));

        convert8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);

        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();

        convert8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) dst, lo);

        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
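        // (Each 16-bit lane of ga is (a<<8)|g and of gg is (g<<8)|g, so the
        // 16-bit interleave above emits bytes g,g,g,a per pixel: little-endian
        // RGBA with r = g = b = gray.)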

        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#else

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16_t gray = vld1q_u8(src);

            // Set each of the color channels.
            uint8x16x4_t rgba;
            rgba.val[0] = gray;
            rgba.val[1] = gray;
            rgba.val[2] = gray;
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16;
            dst += 16;
            count -= 16;
        }
        if (count >= 8) {
            // Load 8 pixels.
            uint8x8_t gray = vld1_u8(src);

            // Set each of the color channels.
            uint8x8x4_t rgba;
            rgba.val[0] = gray;
            rgba.val[1] = gray;
            rgba.val[2] = gray;
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8;
            dst += 8;
            count -= 8;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
        while (count >= 32) {
            __m256i grays = _mm256_loadu_si256((const __m256i*) src);

            __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
            __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
            __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
            __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);

            __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
            __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
            __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
            __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);

            // Shuffle for pixel reorder.
            // Note. 'p' stands for 'ggga'
            // Before shuffle:
            //     ggga0 = p0  p1  p2  p3  | p16 p17 p18 p19
            //     ggga1 = p4  p5  p6  p7  | p20 p21 p22 p23
            //     ggga2 = p8  p9  p10 p11 | p24 p25 p26 p27
            //     ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
            //
            // After shuffle:
            //     ggga0_shuffle = p0  p1  p2  p3  | p4  p5  p6  p7
            //     ggga1_shuffle = p8  p9  p10 p11 | p12 p13 p14 p15
            //     ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
            //     ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
            __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
                    ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
                    ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
                    ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);

            _mm256_storeu_si256((__m256i*) (dst +  0), ggga0_shuffle);
            _mm256_storeu_si256((__m256i*) (dst +  8), ggga1_shuffle);
            _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
            _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);

            src += 32;
            dst += 32;
            count -= 32;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // TODO: just check >= SSE2?
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
        while (count >= 16) {
            __m128i grays = _mm_loadu_si128((const __m128i*) src);

            __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
            __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
            __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
            __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

            __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
            __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
            __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
            __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

            _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
            _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
            _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
            _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

            src += 16;
            dst += 16;
            count -= 16;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#else
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        gray_to_RGB1_portable(dst, src, count);
    }
#endif

// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16x3_t rgb = vld3q_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x16x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16*3;
            dst += 16;
            count -= 16;
        }

        if (count >= 8) {
            // Load 8 pixels.
            uint8x8x3_t rgb = vld3_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x8x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8*3;
            dst += 8;
            count -= 8;
        }

        // Call portable code to finish up the tail of [0,8) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        __m128i expand;
        const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
        if (kSwapRB) {
            expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
        } else {
            expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
        }

        while (count >= 6) {
            // Load a vector.  While this actually contains 5 pixels plus an
            // extra component, we will discard all but the first four pixels on
            // this iteration.
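            // (count >= 6 guarantees at least 6*3 = 18 source bytes remain, so
            // the 16-byte load below cannot read past the end of src.)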
            __m128i rgb = _mm_loadu_si128((const __m128i*) src);

            // Expand the first four pixels to RGBX and then mask to RGB(FF).
            __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

            // Store 4 pixels.
            _mm_storeu_si128((__m128i*) dst, rgba);

            src += 4*3;
            dst += 4;
            count -= 4;
        }

        // Call portable code to finish up the tail of [0,4) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#else
    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_RGB1_portable(dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_BGR1_portable(dst, src, count);
    }
#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED