1/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "src/base/SkUtils.h"
9#include "src/base/SkVx.h"
10#include "src/core/SkColorData.h"
11#include "src/core/SkSwizzlePriv.h"
12
13#include <algorithm>
14#include <cmath>
15#include <utility>
16
17#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
18    #include <immintrin.h>
19#elif defined(SK_ARM_HAS_NEON)
20    #include <arm_neon.h>
21#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
22    #include <lasxintrin.h>
23#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
24    #include <lsxintrin.h>
25#endif
26
27// This file is included in multiple translation units with different #defines set, enabling
28// different instruction use for different CPU architectures.
29//
30// A pair of files controls which #defines are defined: SkOpts_SetTarget.h sets the flags, and
31// SkOpts_RestoreTarget.h restores them. SkOpts_SetTarget is controlled by setting the
32// SK_OPTS_TARGET define before including it.
33//
34// SkOpts_SetTarget also sets the #define SK_OPTS_NS to the unique namespace for this code.
35
36#if defined(__clang__) || defined(__GNUC__)
37#define SI __attribute__((always_inline)) static inline
38#else
39#define SI static inline
40#endif
41
42namespace SK_OPTS_NS {
43
44#if defined(SK_USE_FAST_UNPREMUL_324099025)
45constexpr bool kFastUnpremul = true;
46#else
47constexpr bool kFastUnpremul = false;
48#endif
49
50SI float reciprocal_alpha_times_255_portable(float a) {
51    return a != 0 ? 255.0f / a : 0.0f;
52}
53
54SI float reciprocal_alpha_portable(float a) {
55    return a != 0 ? 1.0f / a : 0.0f;
56}
57
58#if defined(SK_ARM_HAS_NEON)
59// -- NEON -- Harden against timing attacks
60// For neon, the portable versions create branchless code.
61SI float reciprocal_alpha_times_255(float a) {
62    return reciprocal_alpha_times_255_portable(a);
63}
64
65SI float reciprocal_alpha(float a) {
66    return reciprocal_alpha_portable(a);
67}
68#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER))
69// -- SSE -- Harden against timing attacks -- MSVC is not supported.
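// How the hardening works here: the divide below is performed unconditionally (hence the
// float-divide-by-zero sanitizer suppression), and the quotient is then masked with the
// (vA != 0) comparison result, so a zero alpha produces zero through pure data flow rather
// than a data-dependent branch.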
70using F4 = __m128;
71
72SK_NO_SANITIZE("float-divide-by-zero")
73SI float reciprocal_alpha_times_255(float a) {
74    SkASSERT(0 <= a && a <= 255);
75    F4 vA{a, a, a, a};
76    auto q = F4{255.0f} / vA;
77    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
78}
79
80SK_NO_SANITIZE("float-divide-by-zero")
81SI float reciprocal_alpha(float a) {
82    SkASSERT(0 <= a && a <= 1);
83    F4 vA{a, a, a, a};
84    auto q = F4{1.0f} / vA;
85    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
86}
87#else
88// -- Portable -- *Not* hardened against timing attacks
89SI float reciprocal_alpha_times_255(float a) {
90    return reciprocal_alpha_times_255_portable(a);
91}
92
93SI float reciprocal_alpha(float a) {
94    return reciprocal_alpha_portable(a);
95}
96#endif
97
98static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
99    for (int i = 0; i < count; i++) {
100        uint8_t a = (src[i] >> 24) & 0xFF,
101                b = (src[i] >> 16) & 0xFF,
102                g = (src[i] >>  8) & 0xFF,
103                r = (src[i] >>  0) & 0xFF;
104        b = (b*a+127)/255;
105        g = (g*a+127)/255;
106        r = (r*a+127)/255;
107        dst[i] = (uint32_t)a << 24
108               | (uint32_t)b << 16
109               | (uint32_t)g <<  8
110               | (uint32_t)r <<  0;
111    }
112}
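// Illustrative sketch (a hypothetical helper, not called by the rest of Skia): in these
// swizzle names the letters give the memory byte order -- the pixel is packed as
// (a << 24) | (b << 16) | (g << 8) | r, i.e. R,G,B,A bytes on a little-endian machine --
// and lowercase letters mark premultiplied channels, so RGBA_to_rgbA premultiplies without
// reordering. A one-pixel example:
[[maybe_unused]] static void rgbA_swizzle_example() {
    const uint32_t src = 0x80FF0000;  // A=0x80, B=0xFF, G=0x00, R=0x00
    uint32_t dst = 0;
    RGBA_to_rgbA_portable(&dst, &src, 1);
    SkASSERT(dst == 0x80800000);      // B premultiplied to (255*128 + 127)/255 = 128.
}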
113
114// RP (Raster Pipeline) uses the following rounding routines in store_8888. There are three
115//   different styles of rounding:
116//   1) +0.5 and floor - used by scalar and ARMv7
117//   2) round to even for sure - ARMv8
118//   3) round to even maybe - Intel. The rounding on Intel depends on MXCSR, which
119//                            defaults to round to even.
120//
121// Note that vrndns_f32 is the single-float version of vcvtnq_u32_f32.
122
123SI uint32_t pixel_round_as_RP(float n) {
124#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64)
125    return vrndns_f32(n);
126#elif defined(SK_ARM_HAS_NEON) && !defined(SK_CPU_ARM64)
127    float32x4_t vN{n + 0.5f};
128    return vcvtq_u32_f32(vN)[0];
129#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 && (defined(__clang__) || !defined(_MSC_VER))
130    return _mm_cvtps_epi32(__m128{n})[0];
131#else
132    return (uint32_t)(n + 0.5f);
133#endif
134}
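// Illustrative difference between the styles: for n = 254.5, style (1) "+0.5 and floor"
// yields 255 while the round-to-even styles (2) and (3) yield 254; for n = 253.5 all three
// yield 254. Matching these off-by-one differences is why unpremul_simulating_RP below rounds
// with pixel_round_as_RP instead of always rounding up.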
135
136// Doing the math for an original color b resulting in a premul color x,
137//   x = ⌊(b * a + 127) / 255⌋,
138//   x ≤ (b * a + 127) / 255 < x + 1,
139//   255 * x ≤ b * a + 127 < 255 * (x + 1),
140//   255 * x - 127 ≤ b * a < 255 * (x + 1) - 127,
141//   255 * x - 127 ≤ b * a < 255 * x + 128,
142//   (255 * x - 127) / a ≤ b < (255 * x + 128) / a.
143// So, given a premul value x < a, the original color b can be in the above range.
144// We can pick the middle of that range as
145//   b = 255 * x / a
146//   b = x * (255 / a)
147SI uint32_t unpremul_quick(float reciprocalA, float c) {
148    return (uint32_t)std::min(255.0f, (c * reciprocalA + 0.5f));
149}
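// Worked example (illustrative): premultiplying b = 200 at a = 128 gives
// x = (200*128 + 127)/255 = 100, and unpremul_quick(255.0f/128, 100) computes
// 100 * 1.9921875 + 0.5 = 199.7 -> 199, within one of the original 200 and inside the
// range [198.2, 200.2) that the derivation above allows.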
150
151// Similar to unpremul_quick, but simulates Raster Pipeline by normalizing the pixel to the
152// interval [0, 1] and using round-to-even in most cases instead of round-up.
153SI uint32_t unpremul_simulating_RP(float reciprocalA, float c) {
154    const float normalizedC = c * (1.0f / 255.0f);
155    const float answer = std::min(255.0f, normalizedC * reciprocalA * 255.0f);
156    return pixel_round_as_RP(answer);
157}
158
159SI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) {
160    if constexpr (kFastUnpremul) {
161        const float reciprocalA = reciprocal_alpha_times_255(a);
162        auto unpremul = [reciprocalA](float c) -> uint32_t {
163            return unpremul_quick(reciprocalA, c);
164        };
165        return (uint32_t) a << 24
166               | unpremul(c16) << 16
167               | unpremul(c08) <<  8
168               | unpremul(c00) <<  0;
169    } else {
170        const float normalizedA = a * (1.0f / 255.0f);
171        const float reciprocalA = reciprocal_alpha(normalizedA);
172        auto unpremul = [reciprocalA](float c) -> uint32_t {
173            return unpremul_simulating_RP(reciprocalA, c);
174        };
175        return (uint32_t) a << 24
176               | unpremul(c16) << 16
177               | unpremul(c08) <<  8
178               | unpremul(c00) <<  0;
179    }
180}
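// Illustrative round trip (continuing the one-pixel example above): the premultiplied pixel
// has (c00, c08, c16, a) = (0, 0, 128, 128). In the non-fast path, reciprocal_alpha(128/255)
// is 255/128, so the 128 channel unpremultiplies to round((128/255) * (255/128) * 255) = 255
// and rgbA_to_CCCA(0, 0, 128, 128) returns 0x80FF0000, recovering the original pixel.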
181
182static void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) {
183    for (int i = 0; i < count; i++) {
184        const uint32_t p = src[i];
185
186        const float a = (p >> 24) & 0xFF,
187                    b = (p >> 16) & 0xFF,
188                    g = (p >>  8) & 0xFF,
189                    r = (p >>  0) & 0xFF;
190
191        dst[i] = rgbA_to_CCCA(r, g, b, a);
192    }
193}
194
195static void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
196    for (int i = 0; i < count; i++) {
197        const uint32_t p = src[i];
198
199        const uint32_t a = (p >> 24) & 0xFF,
200                       b = (p >> 16) & 0xFF,
201                       g = (p >>  8) & 0xFF,
202                       r = (p >>  0) & 0xFF;
203
204        dst[i] = rgbA_to_CCCA(b, g, r, a);
205    }
206}
207
208static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
209    for (int i = 0; i < count; i++) {
210        uint8_t a = (src[i] >> 24) & 0xFF,
211                b = (src[i] >> 16) & 0xFF,
212                g = (src[i] >>  8) & 0xFF,
213                r = (src[i] >>  0) & 0xFF;
214        b = (b*a+127)/255;
215        g = (g*a+127)/255;
216        r = (r*a+127)/255;
217        dst[i] = (uint32_t)a << 24
218               | (uint32_t)r << 16
219               | (uint32_t)g <<  8
220               | (uint32_t)b <<  0;
221    }
222}
223
224static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
225    for (int i = 0; i < count; i++) {
226        uint8_t a = (src[i] >> 24) & 0xFF,
227                b = (src[i] >> 16) & 0xFF,
228                g = (src[i] >>  8) & 0xFF,
229                r = (src[i] >>  0) & 0xFF;
230        dst[i] = (uint32_t)a << 24
231               | (uint32_t)r << 16
232               | (uint32_t)g <<  8
233               | (uint32_t)b <<  0;
234    }
235}
236
237static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
238    for (int i = 0; i < count; i++) {
239        uint8_t g = src[0],
240                a = src[1];
241        src += 2;
242        dst[i] = (uint32_t)a << 24
243               | (uint32_t)g << 16
244               | (uint32_t)g <<  8
245               | (uint32_t)g <<  0;
246    }
247}
248
249static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
250    for (int i = 0; i < count; i++) {
251        uint8_t g = src[0],
252                a = src[1];
253        src += 2;
254        g = (g*a+127)/255;
255        dst[i] = (uint32_t)a << 24
256               | (uint32_t)g << 16
257               | (uint32_t)g <<  8
258               | (uint32_t)g <<  0;
259    }
260}
261
262static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
263    for (int i = 0; i < count; i++) {
264        uint8_t k = (src[i] >> 24) & 0xFF,
265                y = (src[i] >> 16) & 0xFF,
266                m = (src[i] >>  8) & 0xFF,
267                c = (src[i] >>  0) & 0xFF;
268        // See comments in SkSwizzler.cpp for details on the conversion formula.
269        uint8_t b = (y*k+127)/255,
270                g = (m*k+127)/255,
271                r = (c*k+127)/255;
272        dst[i] = (uint32_t)0xFF << 24
273               | (uint32_t)   b << 16
274               | (uint32_t)   g <<  8
275               | (uint32_t)   r <<  0;
276    }
277}
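// In short (a summary of that formula; SkSwizzler.cpp remains the authoritative reference):
// the usual conversion is R = 255 * (1 - C/255) * (1 - K/255), and because the values handled
// here are stored inverted (each byte is already 255 minus its channel), this collapses to
// r = c*k/255 -- exactly the rounded products computed above.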
278
279static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
280    for (int i = 0; i < count; i++) {
281        uint8_t k = (src[i] >> 24) & 0xFF,
282                y = (src[i] >> 16) & 0xFF,
283                m = (src[i] >>  8) & 0xFF,
284                c = (src[i] >>  0) & 0xFF;
285        uint8_t b = (y*k+127)/255,
286                g = (m*k+127)/255,
287                r = (c*k+127)/255;
288        dst[i] = (uint32_t)0xFF << 24
289               | (uint32_t)   r << 16
290               | (uint32_t)   g <<  8
291               | (uint32_t)   b <<  0;
292    }
293}
294
295#if defined(SK_ARM_HAS_NEON)
296// -- NEON -----------------------------------------------------------------------------------------
297// Rounded divide by 255, (x + 127) / 255
298SI uint8x8_t div255_round(uint16x8_t x) {
299    // result = (x + 127) / 255
300    // result = (x + 127) / 256 + error1
301    //
302    // error1 = (x + 127) / (255 * 256)
303    // error1 = (x + 127) / (256 * 256) + error2
304    //
305    // error2 = (x + 127) / (255 * 256 * 256)
306    //
307    // The maximum value of error2 is too small to matter.  Thus:
308    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
309    // result = ((x + 127) / 256 + x + 127) / 256
310    // result = ((x + 127) >> 8 + x + 127) >> 8
311    //
312    // Use >>> to represent "rounded right shift" which, conveniently,
313    // NEON supports in one instruction.
314    // result = ((x >>> 8) + x) >>> 8
315    //
316    // Note that the second right shift is actually performed as an
317    // "add, round, and narrow back to 8-bits" instruction.
318    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
319}
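// Quick check with illustrative values: for x = 200*128 = 25600 the exact answer is
// (25600 + 127)/255 = 100; the shifted form gives x >>> 8 = 100 and then
// (25600 + 100) >>> 8 = 100 as well, agreeing with the premultiply example earlier in the file.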
320
321// Scale a byte by another, (x * y + 127) / 255
322SI uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
323    return div255_round(vmull_u8(x, y));
324}
325
326static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
327    while (count >= 8) {
328        // Load 8 pixels.
329        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
330
331        uint8x8_t a = rgba.val[3],
332                  b = rgba.val[2],
333                  g = rgba.val[1],
334                  r = rgba.val[0];
335
336        // Premultiply.
337        b = scale(b, a);
338        g = scale(g, a);
339        r = scale(r, a);
340
341        // Store 8 premultiplied pixels.
342        if (kSwapRB) {
343            rgba.val[2] = r;
344            rgba.val[1] = g;
345            rgba.val[0] = b;
346        } else {
347            rgba.val[2] = b;
348            rgba.val[1] = g;
349            rgba.val[0] = r;
350        }
351        vst4_u8((uint8_t*) dst, rgba);
352        src += 8;
353        dst += 8;
354        count -= 8;
355    }
356
357    // Call portable code to finish up the tail of [0,8) pixels.
358    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
359    proc(dst, src, count);
360}
361
362void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
363    premul_should_swapRB(false, dst, src, count);
364}
365
366void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
367    premul_should_swapRB(true, dst, src, count);
368}
369
370void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
371    using std::swap;
372    while (count >= 16) {
373        // Load 16 pixels.
374        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
375
376        // Swap r and b.
377        swap(rgba.val[0], rgba.val[2]);
378
379        // Store 16 pixels.
380        vst4q_u8((uint8_t*) dst, rgba);
381        src += 16;
382        dst += 16;
383        count -= 16;
384    }
385
386    if (count >= 8) {
387        // Load 8 pixels.
388        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
389
390        // Swap r and b.
391        swap(rgba.val[0], rgba.val[2]);
392
393        // Store 8 pixels.
394        vst4_u8((uint8_t*) dst, rgba);
395        src += 8;
396        dst += 8;
397        count -= 8;
398    }
399
400    RGBA_to_BGRA_portable(dst, src, count);
401}
402
403static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
404    while (count >= 16) {
405        // Load 16 pixels.
406        uint8x16x2_t ga = vld2q_u8(src);
407
408        // Premultiply if requested.
409        if (kPremul) {
410            ga.val[0] = vcombine_u8(
411                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
412                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
413        }
414
415        // Set each of the color channels.
416        uint8x16x4_t rgba;
417        rgba.val[0] = ga.val[0];
418        rgba.val[1] = ga.val[0];
419        rgba.val[2] = ga.val[0];
420        rgba.val[3] = ga.val[1];
421
422        // Store 16 pixels.
423        vst4q_u8((uint8_t*) dst, rgba);
424        src += 16*2;
425        dst += 16;
426        count -= 16;
427    }
428
429    if (count >= 8) {
430        // Load 8 pixels.
431        uint8x8x2_t ga = vld2_u8(src);
432
433        // Premultiply if requested.
434        if (kPremul) {
435            ga.val[0] = scale(ga.val[0], ga.val[1]);
436        }
437
438        // Set each of the color channels.
439        uint8x8x4_t rgba;
440        rgba.val[0] = ga.val[0];
441        rgba.val[1] = ga.val[0];
442        rgba.val[2] = ga.val[0];
443        rgba.val[3] = ga.val[1];
444
445        // Store 8 pixels.
446        vst4_u8((uint8_t*) dst, rgba);
447        src += 8*2;
448        dst += 8;
449        count -= 8;
450    }
451
452    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
453    proc(dst, src, count);
454}
455
456void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
457    expand_grayA(false, dst, src, count);
458}
459
460void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
461    expand_grayA(true, dst, src, count);
462}
463
464enum Format { kRGB1, kBGR1 };
465static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
466    while (count >= 8) {
467        // Load 8 cmyk pixels.
468        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
469
470        uint8x8_t k = pixels.val[3],
471                  y = pixels.val[2],
472                  m = pixels.val[1],
473                  c = pixels.val[0];
474
475        // Scale to r, g, b.
476        uint8x8_t b = scale(y, k);
477        uint8x8_t g = scale(m, k);
478        uint8x8_t r = scale(c, k);
479
480        // Store 8 rgba pixels.
481        if (kBGR1 == format) {
482            pixels.val[3] = vdup_n_u8(0xFF);
483            pixels.val[2] = r;
484            pixels.val[1] = g;
485            pixels.val[0] = b;
486        } else {
487            pixels.val[3] = vdup_n_u8(0xFF);
488            pixels.val[2] = b;
489            pixels.val[1] = g;
490            pixels.val[0] = r;
491        }
492        vst4_u8((uint8_t*) dst, pixels);
493        src += 8;
494        dst += 8;
495        count -= 8;
496    }
497
498    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
499    proc(dst, src, count);
500}
501
502void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
503    inverted_cmyk_to(kRGB1, dst, src, count);
504}
505
506void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
507    inverted_cmyk_to(kBGR1, dst, src, count);
508}
509
510template <bool swapRB>
511static void common_rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
512
513    // Only use the SIMD code if simulating RP; otherwise the quick code auto-vectorizes well
514    // enough on ARM that a hand-written SIMD implementation is not needed.
515    if constexpr (!kFastUnpremul) {
516        while (count >= 8) {
517            const uint8x8x4_t in = vld4_u8((const uint8_t*)src);
518
519            auto round = [](float32x4_t v) -> uint32x4_t {
520                #if defined(SK_CPU_ARM64)
521                    return vcvtnq_u32_f32(v);
522                #else
523                    return vcvtq_u32_f32(v + 0.5f);
524                #endif
525            };
526
527            static constexpr float kN = 1.0f / 255.0f;
528            auto toNormalized = [](uint16x4_t v) -> float32x4_t {
529                return vcvtq_f32_u32(vmovl_u16(v)) * kN;
530            };
531
532            auto unpremulHalf =
533                    [toNormalized, round](float32x4_t invA, uint16x4_t v) -> uint16x4_t {
534                const float32x4_t normalizedV = toNormalized(v);
535                const float32x4_t divided = invA * normalizedV;
536                const float32x4_t denormalized = divided * 255.0f;
537                const uint32x4_t rounded = round(denormalized);
538                return vqmovn_u32(rounded);
539            };
540
541            auto reciprocal = [](float32x4_t a) -> float32x4_t {
542                uint32x4_t mask = sk_bit_cast<uint32x4_t>(a != float32x4_t{0, 0, 0, 0});
543                auto recip = 1.0f / a;
544                return sk_bit_cast<float32x4_t>(mask & sk_bit_cast<uint32x4_t>(recip));
545            };
546
547            const uint8x8_t a = in.val[3];
548            const uint16x8_t intA = vmovl_u8(a);
549            const float32x4_t invALow = reciprocal(toNormalized(vget_low_u16(intA)));
550            const float32x4_t invAHigh = reciprocal(toNormalized(vget_high_u16(intA)));
551
552            auto unpremul = [unpremulHalf, invALow, invAHigh](uint8x8_t v) -> uint8x8_t {
553                const uint16x8_t to16 = vmovl_u8(v);
554
555                const uint16x4_t low = unpremulHalf(invALow, vget_low_u16(to16));
556                const uint16x4_t high = unpremulHalf(invAHigh, vget_high_u16(to16));
557
558                const uint16x8_t combined = vcombine_u16(low, high);
559                return vqmovn_u16(combined);
560            };
561
562            const uint8x8_t b = unpremul(in.val[2]);
563            const uint8x8_t g = unpremul(in.val[1]);
564            const uint8x8_t r = unpremul(in.val[0]);
565
566            if constexpr (swapRB) {
567                const uint8x8x4_t out{b, g, r, a};
568                vst4_u8((uint8_t*)dst, out);
569            } else {
570                const uint8x8x4_t out{r, g, b, a};
571                vst4_u8((uint8_t*)dst, out);
572            }
573
574            src += 8;
575            dst += 8;
576            count -= 8;
577        }
578    }
579
580    // Handle the tail. Count will be < 8.
581    if constexpr (swapRB) {
582        rgbA_to_BGRA_portable(dst, src, count);
583    } else {
584        rgbA_to_RGBA_portable(dst, src, count);
585    }
586}
587
588void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
589    common_rgbA_to_RGBA</*swapRB=*/false>(dst, src, count);
590}
591
592void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
593    common_rgbA_to_RGBA</*swapRB=*/true>(dst, src, count);
594}
595
596#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
597// -- AVX2 -----------------------------------------------------------------------------------------
598
599// Scale a byte by another.
600// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
601static __m256i scale(__m256i x, __m256i y) {
602    const __m256i _128 = _mm256_set1_epi16(128);
603    const __m256i _257 = _mm256_set1_epi16(257);
604
605    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
606    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
607}
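// Why the identity holds: 255 * 257 = 65535, so multiplying by 257 and shifting right by 16 is
// a near-exact divide by 255, and the +128 bias reproduces the (x+127)/255 rounding.
// Spot check at the top of the range: x = 255*255 = 65025 gives (65025 + 127)/255 = 255 and
// ((65025 + 128) * 257) >> 16 = 16744321 >> 16 = 255.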
608
609static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
610
611    auto premul8 = [=](__m256i* lo, __m256i* hi) {
612        const __m256i zeros = _mm256_setzero_si256();
613        __m256i planar;
614        if (kSwapRB) {
615            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
616                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
617        } else {
618            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
619                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
620        }
621
622        // Swizzle the pixels to 8-bit planar.
623        *lo = _mm256_shuffle_epi8(*lo, planar);             // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
624        *hi = _mm256_shuffle_epi8(*hi, planar);             // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
625        __m256i rg = _mm256_unpacklo_epi32(*lo, *hi),       // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
626                ba = _mm256_unpackhi_epi32(*lo, *hi);       // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
627
628        // Unpack to 16-bit planar.
629        __m256i r = _mm256_unpacklo_epi8(rg, zeros),        // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
630                g = _mm256_unpackhi_epi8(rg, zeros),        // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
631                b = _mm256_unpacklo_epi8(ba, zeros),        // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
632                a = _mm256_unpackhi_epi8(ba, zeros);        // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
633
634        // Premultiply!
635        r = scale(r, a);
636        g = scale(g, a);
637        b = scale(b, a);
638
639        // Repack into interlaced pixels.
640        rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8));   // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
641        ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8));   // babababa BABABABA babababa BABABABA
642        *lo = _mm256_unpacklo_epi16(rg, ba);                // rgbargba rgbargba rgbargba rgbargba
643        *hi = _mm256_unpackhi_epi16(rg, ba);                // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
644    };
645
646    while (count >= 16) {
647        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
648                hi = _mm256_loadu_si256((const __m256i*) (src + 8));
649
650        premul8(&lo, &hi);
651
652        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
653        _mm256_storeu_si256((__m256i*) (dst + 8), hi);
654
655        src += 16;
656        dst += 16;
657        count -= 16;
658    }
659
660    if (count >= 8) {
661        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
662                hi = _mm256_setzero_si256();
663
664        premul8(&lo, &hi);
665
666        _mm256_storeu_si256((__m256i*) dst, lo);
667
668        src += 8;
669        dst += 8;
670        count -= 8;
671    }
672
673    // Call portable code to finish up the tail of [0,8) pixels.
674    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
675    proc(dst, src, count);
676}
677
678void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
679    premul_should_swapRB(false, dst, src, count);
680}
681
682void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
683    premul_should_swapRB(true, dst, src, count);
684}
685
686void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
687    const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
688                                            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
689
690    while (count >= 8) {
691        __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
692        __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
693        _mm256_storeu_si256((__m256i*) dst, bgra);
694
695        src += 8;
696        dst += 8;
697        count -= 8;
698    }
699
700    RGBA_to_BGRA_portable(dst, src, count);
701}
702
703void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
704    while (count >= 16) {
705        __m256i ga = _mm256_loadu_si256((const __m256i*) src);
706
707        __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
708                                     _mm256_slli_epi16(ga, 8));
709
710        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
711        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
712
713        // Shuffle for pixel reorder
714        // Note. 'p' stands for 'ggga'
715        // Before shuffle:
716        // ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11
717        // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
718        //
719        // After shuffle:
720        // ggga_lo_shuffle = p0 p1 p2  p3  | p4  p5  p6  p7
721        // ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
722        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
723                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
724
725        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
726        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);
727
728        src += 16*2;
729        dst += 16;
730        count -= 16;
731    }
732
733    grayA_to_RGBA_portable(dst, src, count);
734}
735
736void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
737    while (count >= 16) {
738        __m256i grayA = _mm256_loadu_si256((const __m256i*) src);
739
740        __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
741        __m256i a0 = _mm256_srli_epi16(grayA, 8);
742
743        // Premultiply
744        g0 = scale(g0, a0);
745
746        __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
747        __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));
748
749        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
750        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);
751
752        // Shuffle for pixel reorder, similar as grayA_to_RGBA
753        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
754                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);
755
756        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
757        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);
758
759        src += 16*2;
760        dst += 16;
761        count -= 16;
762    }
763
764    grayA_to_rgbA_portable(dst, src, count);
765}
766
767enum Format { kRGB1, kBGR1 };
768static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
769    auto convert8 = [=](__m256i* lo, __m256i* hi) {
770        const __m256i zeros = _mm256_setzero_si256();
771        __m256i planar;
772        if (kBGR1 == format) {
773            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
774                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
775        } else {
776            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
777                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
778        }
779
780        // Swizzle the pixels to 8-bit planar.
781        *lo = _mm256_shuffle_epi8(*lo, planar);            // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
782        *hi = _mm256_shuffle_epi8(*hi, planar);            // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
783        __m256i cm = _mm256_unpacklo_epi32(*lo, *hi),      // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
784                yk = _mm256_unpackhi_epi32(*lo, *hi);      // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
785
786        // Unpack to 16-bit planar.
787        __m256i c = _mm256_unpacklo_epi8(cm, zeros),       // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
788                m = _mm256_unpackhi_epi8(cm, zeros),       // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
789                y = _mm256_unpacklo_epi8(yk, zeros),       // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
790                k = _mm256_unpackhi_epi8(yk, zeros);       // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
791
792        // Scale to r, g, b.
793        __m256i r = scale(c, k),
794                g = scale(m, k),
795                b = scale(y, k);
796
797        // Repack into interlaced pixels:
798        //     rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
799        //     ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
800        __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
801                ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
802        *lo = _mm256_unpacklo_epi16(rg, ba);               // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
803        *hi = _mm256_unpackhi_epi16(rg, ba);               // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
804    };
805
806    while (count >= 16) {
807        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
808                hi = _mm256_loadu_si256((const __m256i*) (src + 8));
809
810        convert8(&lo, &hi);
811
812        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
813        _mm256_storeu_si256((__m256i*) (dst + 8), hi);
814
815        src += 16;
816        dst += 16;
817        count -= 16;
818    }
819
820    if (count >= 8) {
821        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
822                hi = _mm256_setzero_si256();
823
824        convert8(&lo, &hi);
825
826        _mm256_storeu_si256((__m256i*) dst, lo);
827
828        src += 8;
829        dst += 8;
830        count -= 8;
831    }
832
833    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
834    proc(dst, src, count);
835}
836
837void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
838    inverted_cmyk_to(kRGB1, dst, src, count);
839}
840
841void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
842    inverted_cmyk_to(kBGR1, dst, src, count);
843}
844
845void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
846    rgbA_to_RGBA_portable(dst, src, count);
847}
848
849void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
850    rgbA_to_BGRA_portable(dst, src, count);
851}
852
853#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
854// -- SSSE3 ----------------------------------------------------------------------------------------
855
856// Scale a byte by another.
857// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
858static __m128i scale(__m128i x, __m128i y) {
859    const __m128i _128 = _mm_set1_epi16(128);
860    const __m128i _257 = _mm_set1_epi16(257);
861
862    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
863    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
864}
865
866static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
867
868    auto premul8 = [=](__m128i* lo, __m128i* hi) {
869        const __m128i zeros = _mm_setzero_si128();
870        __m128i planar;
871        if (kSwapRB) {
872            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
873        } else {
874            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
875        }
876
877        // Swizzle the pixels to 8-bit planar.
878        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
879        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
880        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
881                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
882
883        // Unpack to 16-bit planar.
884        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
885                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
886                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
887                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
888
889        // Premultiply!
890        r = scale(r, a);
891        g = scale(g, a);
892        b = scale(b, a);
893
894        // Repack into interlaced pixels.
895        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
896        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
897        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
898        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
899    };
900
901    while (count >= 8) {
902        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
903                hi = _mm_loadu_si128((const __m128i*) (src + 4));
904
905        premul8(&lo, &hi);
906
907        _mm_storeu_si128((__m128i*) (dst + 0), lo);
908        _mm_storeu_si128((__m128i*) (dst + 4), hi);
909
910        src += 8;
911        dst += 8;
912        count -= 8;
913    }
914
915    if (count >= 4) {
916        __m128i lo = _mm_loadu_si128((const __m128i*) src),
917                hi = _mm_setzero_si128();
918
919        premul8(&lo, &hi);
920
921        _mm_storeu_si128((__m128i*) dst, lo);
922
923        src += 4;
924        dst += 4;
925        count -= 4;
926    }
927
928    // Call portable code to finish up the tail of [0,4) pixels.
929    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
930    proc(dst, src, count);
931}
932
933void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
934    premul_should_swapRB(false, dst, src, count);
935}
936
937void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
938    premul_should_swapRB(true, dst, src, count);
939}
940
941void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
942    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
943
944    while (count >= 4) {
945        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
946        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
947        _mm_storeu_si128((__m128i*) dst, bgra);
948
949        src += 4;
950        dst += 4;
951        count -= 4;
952    }
953
954    RGBA_to_BGRA_portable(dst, src, count);
955}
956
957void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
958    while (count >= 8) {
959        __m128i ga = _mm_loadu_si128((const __m128i*) src);
960
961        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
962                                  _mm_slli_epi16(ga, 8));
963
964        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
965        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
966
967        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
968        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
969
970        src += 8*2;
971        dst += 8;
972        count -= 8;
973    }
974
975    grayA_to_RGBA_portable(dst, src, count);
976}
977
978void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
979    while (count >= 8) {
980        __m128i grayA = _mm_loadu_si128((const __m128i*) src);
981
982        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
983        __m128i a0 = _mm_srli_epi16(grayA, 8);
984
985        // Premultiply
986        g0 = scale(g0, a0);
987
988        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
989        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
990
991
992        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
993        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
994
995        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
996        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
997
998        src += 8*2;
999        dst += 8;
1000        count -= 8;
1001    }
1002
1003    grayA_to_rgbA_portable(dst, src, count);
1004}
1005
1006enum Format { kRGB1, kBGR1 };
1007static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1008    auto convert8 = [=](__m128i* lo, __m128i* hi) {
1009        const __m128i zeros = _mm_setzero_si128();
1010        __m128i planar;
1011        if (kBGR1 == format) {
1012            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
1013        } else {
1014            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
1015        }
1016
1017        // Swizzle the pixels to 8-bit planar.
1018        *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
1019        *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
1020        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
1021                yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
1022
1023        // Unpack to 16-bit planar.
1024        __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
1025                m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
1026                y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
1027                k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
1028
1029        // Scale to r, g, b.
1030        __m128i r = scale(c, k),
1031                g = scale(m, k),
1032                b = scale(y, k);
1033
1034        // Repack into interlaced pixels.
1035        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
1036                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
1037        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgb1rgb1 rgb1rgb1
1038        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
1039    };
1040
1041    while (count >= 8) {
1042        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
1043                hi = _mm_loadu_si128((const __m128i*) (src + 4));
1044
1045        convert8(&lo, &hi);
1046
1047        _mm_storeu_si128((__m128i*) (dst + 0), lo);
1048        _mm_storeu_si128((__m128i*) (dst + 4), hi);
1049
1050        src += 8;
1051        dst += 8;
1052        count -= 8;
1053    }
1054
1055    if (count >= 4) {
1056        __m128i lo = _mm_loadu_si128((const __m128i*) src),
1057                hi = _mm_setzero_si128();
1058
1059        convert8(&lo, &hi);
1060
1061        _mm_storeu_si128((__m128i*) dst, lo);
1062
1063        src += 4;
1064        dst += 4;
1065        count -= 4;
1066    }
1067
1068    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1069    proc(dst, src, count);
1070}
1071
1072void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1073    inverted_cmyk_to(kRGB1, dst, src, count);
1074}
1075
1076void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1077    inverted_cmyk_to(kBGR1, dst, src, count);
1078}
1079
1080void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1081    rgbA_to_RGBA_portable(dst, src, count);
1082}
1083
1084void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1085    rgbA_to_BGRA_portable(dst, src, count);
1086}
1087
1088#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
1089// -- LASX ----------------------------------------------------------------------------------------
1090
1091// Scale a byte by another.
1092// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
1093// (x+127)/255 == ((x+128)*257)>>16
1094SI __m256i scale(__m256i x, __m256i y) {
1095    const __m256i _128 = __lasx_xvreplgr2vr_h(128);
1096    const __m256i _257 = __lasx_xvreplgr2vr_h(257);
1097
1098    // (x+127)/255 == ((x+128)*257)>>16
1099    return __lasx_xvmuh_hu(__lasx_xvadd_h(__lasx_xvmul_h(x, y), _128), _257);
1100}
1101
1102static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
1103    auto premul8 = [=](__m256i* lo, __m256i* hi) {
1104        const __m256i zeros = __lasx_xvldi(0);
1105        __m256i planar = __lasx_xvldi(0);
1106        if (kSwapRB) {
1107            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0);
1108            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1);
1109            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2);
1110            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3);
1111        } else {
1112            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0);
1113            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1);
1114            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2);
1115            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3);
1116        }
1117
1118        // Swizzle the pixels to 8-bit planar.
1119        *lo = __lasx_xvshuf_b(zeros, *lo, planar);      // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
1120        *hi = __lasx_xvshuf_b(zeros, *hi, planar);      // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
1121        __m256i rg = __lasx_xvilvl_w(*hi, *lo),         // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
1122                ba = __lasx_xvilvh_w(*hi, *lo);         // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA
1123
1124        // Unpack to 16-bit planar.
1125        __m256i r = __lasx_xvilvl_b(zeros, rg),         // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
1126                g = __lasx_xvilvh_b(zeros, rg),         // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
1127                b = __lasx_xvilvl_b(zeros, ba),         // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
1128                a = __lasx_xvilvh_b(zeros, ba);         // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_
1129
1130        // Premultiply!
1131        r = scale(r, a);
1132        g = scale(g, a);
1133        b = scale(b, a);
1134
1135        // Repack into interlaced pixels.
1136        rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8));   // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
1137        ba = __lasx_xvor_v(b, __lasx_xvslli_h(a, 8));   // babababa BABABABA babababa BABABABA
1138        *lo = __lasx_xvilvl_h(ba, rg);                  // rgbargba rgbargba rgbargba rgbargba
1139        *hi = __lasx_xvilvh_h(ba, rg);                  // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
1140    };
1141
1142    while (count >= 16) {
1143        __m256i lo = __lasx_xvld(src, 0),
1144                hi = __lasx_xvld(src, 32);
1145
1146        premul8(&lo, &hi);
1147
1148        __lasx_xvst(lo, dst, 0);
1149        __lasx_xvst(hi, dst, 32);
1150
1151        src += 16;
1152        dst += 16;
1153        count -= 16;
1154    }
1155
1156    if (count >= 8) {
1157        __m256i lo = __lasx_xvld(src, 0),
1158                hi = __lasx_xvldi(0);
1159
1160        premul8(&lo, &hi);
1161
1162        __lasx_xvst(lo, dst, 0);
1163
1164        src += 8;
1165        dst += 8;
1166        count -= 8;
1167    }
1168
1169    // Call portable code to finish up the tail of [0,8) pixels.
1170    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
1171    proc(dst, src, count);
1172}
1173
1174/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1175    premul_should_swapRB(false, dst, src, count);
1176}
1177
1178/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1179    premul_should_swapRB(true, dst, src, count);
1180}
1181
1182/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1183    while (count >= 8) {
1184        __m256i rgba = __lasx_xvld(src, 0);
1185        __m256i bgra = __lasx_xvshuf4i_b(rgba, 0xC6);
1186        __lasx_xvst(bgra, dst, 0);
1187
1188        src += 8;
1189        dst += 8;
1190        count -= 8;
1191    }
1192
1193    RGBA_to_BGRA_portable(dst, src, count);
1194}
1195
1196/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1197    while (count >= 16) {
1198        __m256i ga = __lasx_xvld(src, 0);
1199
1200        __m256i gg = __lasx_xvor_v(__lasx_xvand_v(ga, __lasx_xvreplgr2vr_h(0x00FF)),
1201                                   __lasx_xvslli_h(ga, 8));
1202
1203        __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
1204        __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);
1205
1206        __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02), dst, 0);
1207        __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13), dst, 32);
1208
1209        src += 16*2;
1210        dst += 16;
1211        count -= 16;
1212    }
1213
1214    grayA_to_RGBA_portable(dst, src, count);
1215}
1216
1217/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1218    while (count >= 16) {
1219        __m256i grayA = __lasx_xvld(src, 0);
1220
1221        __m256i val = __lasx_xvreplgr2vr_h(0x00FF);
1222
1223        __m256i g0 = __lasx_xvand_v(grayA, val);
1224        __m256i a0 = __lasx_xvsrli_h(grayA, 8);
1225
1226        // Premultiply
1227        g0 = scale(g0, a0);
1228
1229        __m256i gg = __lasx_xvor_v(g0, __lasx_xvslli_h(g0, 8));
1230        __m256i ga = __lasx_xvor_v(g0, __lasx_xvslli_h(a0, 8));
1231
1232        __m256i ggga_lo = __lasx_xvilvl_h(ga, gg);
1233        __m256i ggga_hi = __lasx_xvilvh_h(ga, gg);
1234
1235        val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02);
1236        __lasx_xvst(val, dst, 0);
1237
1238        val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13);
1239        __lasx_xvst(val, dst, 32);
1240
1241        src += 16*2;
1242        dst += 16;
1243        count -= 16;
1244    }
1245
1246    grayA_to_rgbA_portable(dst, src, count);
1247}
1248
1249enum Format { kRGB1, kBGR1 };
1250static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
1251    auto convert8 = [=](__m256i *lo, __m256i* hi) {
1252        const __m256i zeros = __lasx_xvldi(0);
1253        __m256i planar = __lasx_xvldi(0);
1254        if (kBGR1 == format) {
1255            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0);
1256            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1);
1257            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2);
1258            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3);
1259        } else {
1260            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0);
1261            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1);
1262            planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2);
1263            planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3);
1264        }
1265
1266        // Swizzle the pixels to 8-bit planar.
1267        *lo = __lasx_xvshuf_b(zeros, *lo, planar);   // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
1268        *hi = __lasx_xvshuf_b(zeros, *hi, planar);   // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
1269        __m256i cm = __lasx_xvilvl_w(*hi, *lo),      // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
1270                yk = __lasx_xvilvh_w(*hi, *lo);      // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK
1271
1272        // Unpack to 16-bit planar.
1273        __m256i c = __lasx_xvilvl_b(zeros, cm),      // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
1274                m = __lasx_xvilvh_b(zeros, cm),      // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
1275                y = __lasx_xvilvl_b(zeros, yk),      // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
1276                k = __lasx_xvilvh_b(zeros, yk);      // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_
1277
1278        // Scale to r, g, b.
1279        __m256i r = scale(c, k),
1280                g = scale(m, k),
1281                b = scale(y, k);
1282
1283        // Repack into interlaced pixels:
1284        //     rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
1285        //     ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
1286        __m256i rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)),
1287                ba = __lasx_xvor_v(b, __lasx_xvreplgr2vr_h(0xff00));
1288        *lo = __lasx_xvilvl_h(ba, rg);               // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
1289        *hi = __lasx_xvilvh_h(ba, rg);               // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
1290    };
1291
1292    while (count >= 16) {
1293        __m256i lo = __lasx_xvld(src, 0),
1294                hi = __lasx_xvld(src, 32);
1295
1296        convert8(&lo, &hi);
1297
1298        __lasx_xvst(lo, dst, 0);
1299        __lasx_xvst(hi, dst, 32);
1300
1301        src += 16;
1302        dst += 16;
1303        count -= 16;
1304    }
1305
1306    while (count >= 8) {
1307        __m256i lo = __lasx_xvld(src, 0),
1308                hi = __lasx_xvldi(0);
1309
1310        convert8(&lo, &hi);
1311
1312        __lasx_xvst(lo, dst, 0);
1313
1314        src += 8;
1315        dst += 8;
1316        count -= 8;
1317    }
1318
1319    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
1320    proc(dst, src, count);
1321}
1322
1323/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
1324    inverted_cmyk_to(kRGB1, dst, src, count);
1325}
1326
1327/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
1328    inverted_cmyk_to(kBGR1, dst, src, count);
1329}
1330
1331/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
1332    rgbA_to_RGBA_portable(dst, src, count);
1333}
1334
1335/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1336    rgbA_to_BGRA_portable(dst, src, count);
1337}
1338
1339#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1340// -- LSX -----------------------------------------------------------------------------------------
1341
1342// Scale a byte by another.
1343// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
1344SI __m128i scale(__m128i x, __m128i y) {
1345    const __m128i _128 = __lsx_vreplgr2vr_h(128);
1346    const __m128i _257 = __lsx_vreplgr2vr_h(257);
1347
1348    // (x+127)/255 == ((x+128)*257)>>16
1349    return __lsx_vmuh_hu(__lsx_vadd_h(__lsx_vmul_h(x, y), _128), _257);
1350}
1351
1352static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
1353
1354    auto premul8 = [=](__m128i *lo, __m128i *hi){
1355        const __m128i zeros = __lsx_vldi(0);
1356        __m128i planar = __lsx_vldi(0);
1357        if (kSwapRB) {
1358            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
1359            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
1360        } else {
1361            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
1362            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
1363        }
1364
1365        // Swizzle the pixels to 8-bit planar.
1366        *lo = __lsx_vshuf_b(zeros, *lo, planar);             // rrrrgggg bbbbaaaa
1367        *hi = __lsx_vshuf_b(zeros, *hi, planar);             // RRRRGGGG BBBBAAAA
1368        __m128i rg = __lsx_vilvl_w(*hi, *lo),                // rrrrRRRR ggggGGGG
1369                ba = __lsx_vilvh_w(*hi, *lo);                // bbbbBBBB aaaaAAAA
1370
1371        // Unpack to 16-bit planar.
1372        __m128i r = __lsx_vilvl_b(zeros, rg),                 // r_r_r_r_ R_R_R_R_
1373                g = __lsx_vilvh_b(zeros, rg),                 // g_g_g_g_ G_G_G_G_
1374                b = __lsx_vilvl_b(zeros, ba),                 // b_b_b_b_ B_B_B_B_
1375                a = __lsx_vilvh_b(zeros, ba);                 // a_a_a_a_ A_A_A_A_
1376
1377        // Premultiply!
1378        r = scale(r, a);
1379        g = scale(g, a);
1380        b = scale(b, a);
1381
1382        // Repack into interlaced pixels.
1383        rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8));             // rgrgrgrg RGRGRGRG
1384        ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8));             // babababa BABABABA
1385        *lo = __lsx_vilvl_h(ba, rg);                          // rgbargba rgbargba
1386        *hi = __lsx_vilvh_h(ba, rg);                          // RGBARGBA RGBARGBA
1387    };
1388    while (count >= 8) {
1389        __m128i lo = __lsx_vld(src ,0),
1390                hi = __lsx_vld(src ,16);
1391
1392        premul8(&lo, &hi);
1393
1394        __lsx_vst(lo, dst, 0);
1395        __lsx_vst(hi, dst, 16);
1396
1397        src += 8;
1398        dst += 8;
1399        count -= 8;
1400    }
1401
1402    if (count >= 4) {
1403        __m128i lo = __lsx_vld(src, 0),
1404                hi = __lsx_vldi(0);
1405
1406        premul8(&lo, &hi);
1407
1408        __lsx_vst(lo, dst, 0);
1409
1410        src += 4;
1411        dst += 4;
1412        count -= 4;
1413    }
1414
1415    // Call portable code to finish up the tail of [0,4) pixels.
1416    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
1417    proc(dst, src, count);
1418}
1419
1420/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
1421    premul_should_swapRB(false, dst, src, count);
1422}
1423
1424/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
1425    premul_should_swapRB(true, dst, src, count);
1426}
1427
1428/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
1429    __m128i swapRB = __lsx_vldi(0);
1430    swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0704050603000102, 0);
1431    swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0f0c0d0e0b08090a, 1);
1432
1433    while (count >= 4) {
1434        __m128i rgba = __lsx_vld(src, 0);
1435        __m128i bgra = __lsx_vshuf4i_b(rgba, 0xC6);
1436        __lsx_vst(bgra, dst, 0);
1437
1438        src += 4;
1439        dst += 4;
1440        count -= 4;
1441    }
1442
1443    RGBA_to_BGRA_portable(dst, src, count);
1444}
1445
1446/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
1447    while (count >= 8) {
1448        __m128i ga = __lsx_vld(src, 0);
1449
1450        __m128i gg = __lsx_vor_v(__lsx_vand_v(ga, __lsx_vreplgr2vr_h(0x00FF)),
1451                                 __lsx_vslli_h(ga, 8));
1452
1453        __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
1454        __m128i ggga_hi = __lsx_vilvh_h(ga, gg);
1455
1456        __lsx_vst(ggga_lo, dst, 0);
1457        __lsx_vst(ggga_hi, dst, 16);
1458
1459        src += 8*2;
1460        dst += 8;
1461        count -= 8;
1462    }
1463
1464    grayA_to_RGBA_portable(dst, src, count);
1465}
1466
1467/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
1468    while (count >= 8) {
1469        __m128i grayA = __lsx_vld(src, 0);
1470
1471        __m128i g0 = __lsx_vand_v(grayA, __lsx_vreplgr2vr_h(0x00FF));
1472        __m128i a0 = __lsx_vsrli_h(grayA, 8);
1473
1474        // Premultiply
1475        g0 = scale(g0, a0);

        __m128i gg = __lsx_vor_v(g0, __lsx_vslli_h(g0, 8));
        __m128i ga = __lsx_vor_v(g0, __lsx_vslli_h(a0, 8));

        __m128i ggga_lo = __lsx_vilvl_h(ga, gg);
        __m128i ggga_hi = __lsx_vilvh_h(ga, gg);

        __lsx_vst(ggga_lo, dst, 0);
        __lsx_vst(ggga_hi, dst, 16);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m128i *lo, __m128i* hi) {
        const __m128i zeros = __lsx_vldi(0);
        __m128i planar = __lsx_vldi(0);
        if (kBGR1 == format) {
            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0);
            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1);
        } else {
            planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0);
            planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = __lsx_vshuf_b(zeros, *lo, planar);              // ccccmmmm yyyykkkk
        *hi = __lsx_vshuf_b(zeros, *hi, planar);              // CCCCMMMM YYYYKKKK
        __m128i cm = __lsx_vilvl_w(*hi, *lo),                 // ccccCCCC mmmmMMMM
                yk = __lsx_vilvh_w(*hi, *lo);                 // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = __lsx_vilvl_b(zeros, cm),                 // c_c_c_c_ C_C_C_C_
                m = __lsx_vilvh_b(zeros, cm),                 // m_m_m_m_ M_M_M_M_
                y = __lsx_vilvl_b(zeros, yk),                 // y_y_y_y_ Y_Y_Y_Y_
                k = __lsx_vilvh_b(zeros, yk);                 // k_k_k_k_ K_K_K_K_

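        // Inverted CMYK (as produced by some JPEGs) stores 255 - C, 255 - M, etc., so each
        // RGB channel is just a rounded byte multiply: r = c*k/255, g = m*k/255, b = y*k/255.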
        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        // rgrgrgrg RGRGRGRG
        // b1b1b1b1 B1B1B1B1
        __m128i rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)),
                ba = __lsx_vor_v(b, __lsx_vreplgr2vr_h(0xff00));
        *lo = __lsx_vilvl_h(ba, rg);                          // rgbargba rgbargba
        *hi = __lsx_vilvh_h(ba, rg);                          // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vld(src, 16);

        convert8(&lo, &hi);

        __lsx_vst(lo, dst, 0);
        __lsx_vst(hi, dst, 16);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vldi(0);

        convert8(&lo, &hi);

        __lsx_vst(lo, dst, 0);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_BGRA_portable(dst, src, count);
}

#else
// -- No Opts --------------------------------------------------------------------------------------

void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_RGBA_portable(dst, src, count);
}

void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_BGRA_portable(dst, src, count);
}

void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}
#endif

// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16_t gray = vld1q_u8(src);

            // Set each of the color channels.
            uint8x16x4_t rgba;
            rgba.val[0] = gray;
            rgba.val[1] = gray;
            rgba.val[2] = gray;
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16;
            dst += 16;
            count -= 16;
        }
        if (count >= 8) {
            // Load 8 pixels.
            uint8x8_t gray = vld1_u8(src);

            // Set each of the color channels.
            uint8x8x4_t rgba;
            rgba.val[0] = gray;
            rgba.val[1] = gray;
            rgba.val[2] = gray;
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8;
            dst += 8;
            count -= 8;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
        while (count >= 32) {
            __m256i grays = _mm256_loadu_si256((const __m256i*) src);

            __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
            __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
            __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
            __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);

            __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
            __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
            __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
            __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);

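            // The 256-bit unpacks above interleave within each 128-bit lane, so the pixels are
            // currently out of order across lanes; the cross-lane permutes below restore
            // sequential order before storing.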
            // Shuffle for pixel reorder.
            // Note: 'p' stands for 'ggga'.
            // Before shuffle:
            //     ggga0 = p0  p1  p2  p3  | p16 p17 p18 p19
            //     ggga1 = p4  p5  p6  p7  | p20 p21 p22 p23
            //     ggga2 = p8  p9  p10 p11 | p24 p25 p26 p27
            //     ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
            //
            // After shuffle:
            //     ggga0_shuffle = p0  p1  p2  p3  | p4  p5  p6  p7
            //     ggga1_shuffle = p8  p9  p10 p11 | p12 p13 p14 p15
            //     ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
            //     ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
            __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
                    ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
                    ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
                    ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);

            _mm256_storeu_si256((__m256i*) (dst +  0), ggga0_shuffle);
            _mm256_storeu_si256((__m256i*) (dst +  8), ggga1_shuffle);
            _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
            _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);

            src += 32;
            dst += 32;
            count -= 32;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // TODO: just check >= SSE2?
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
        while (count >= 16) {
            __m128i grays = _mm_loadu_si128((const __m128i*) src);

            __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
            __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
            __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
            __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

            __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
            __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
            __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
            __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

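            // Unlike the AVX2 path above, 128-bit unpacks already leave the 16 pixels in
            // order, so the four results can be stored directly.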
            _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
            _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
            _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
            _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

            src += 16;
            dst += 16;
            count -= 16;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF);
        while (count >= 32) {
            __m256i grays = __lasx_xvld(src, 0);

            __m256i gg_lo = __lasx_xvilvl_b(grays, grays);
            __m256i gg_hi = __lasx_xvilvh_b(grays, grays);
            __m256i ga_lo = __lasx_xvilvl_b(alphas, grays);
            __m256i ga_hi = __lasx_xvilvh_b(alphas, grays);

            __m256i ggga0 = __lasx_xvilvl_h(ga_lo, gg_lo);
            __m256i ggga1 = __lasx_xvilvh_h(ga_lo, gg_lo);
            __m256i ggga2 = __lasx_xvilvl_h(ga_hi, gg_hi);
            __m256i ggga3 = __lasx_xvilvh_h(ga_hi, gg_hi);

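            // As in the AVX2 path, the 256-bit interleaves work per 128-bit half, so the
            // xvpermi_q shuffles below reassemble the 32 pixels in sequential order.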
            __m256i ggga_0 = __lasx_xvpermi_q(ggga0, ggga1, 0x02);
            __m256i ggga_1 = __lasx_xvpermi_q(ggga2, ggga3, 0x02);
            __m256i ggga_2 = __lasx_xvpermi_q(ggga0, ggga1, 0x13);
            __m256i ggga_3 = __lasx_xvpermi_q(ggga2, ggga3, 0x13);

            __lasx_xvst(ggga_0, dst,  0);
            __lasx_xvst(ggga_1, dst, 32);
            __lasx_xvst(ggga_2, dst, 64);
            __lasx_xvst(ggga_3, dst, 96);

            src += 32;
            dst += 32;
            count -= 32;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphas = __lsx_vreplgr2vr_b(0xFF);
        while (count >= 16) {
            __m128i grays = __lsx_vld(src, 0);

            __m128i gg_lo = __lsx_vilvl_b(grays, grays);
            __m128i gg_hi = __lsx_vilvh_b(grays, grays);
            __m128i ga_lo = __lsx_vilvl_b(alphas, grays);
            __m128i ga_hi = __lsx_vilvh_b(alphas, grays);

            __m128i ggga0 = __lsx_vilvl_h(ga_lo, gg_lo);
            __m128i ggga1 = __lsx_vilvh_h(ga_lo, gg_lo);
            __m128i ggga2 = __lsx_vilvl_h(ga_hi, gg_hi);
            __m128i ggga3 = __lsx_vilvh_h(ga_hi, gg_hi);

            __lsx_vst(ggga0, dst,  0);
            __lsx_vst(ggga1, dst, 16);
            __lsx_vst(ggga2, dst, 32);
            __lsx_vst(ggga3, dst, 48);

            src += 16;
            dst += 16;
            count -= 16;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#else
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        gray_to_RGB1_portable(dst, src, count);
    }
#endif

// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16x3_t rgb = vld3q_u8(src);
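            // vld3q_u8 de-interleaves 48 bytes of packed RGB into separate R, G, and B planes.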

            // Insert an opaque alpha channel and swap if needed.
            uint8x16x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16*3;
            dst += 16;
            count -= 16;
        }

        if (count >= 8) {
            // Load 8 pixels.
            uint8x8x3_t rgb = vld3_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x8x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8*3;
            dst += 8;
            count -= 8;
        }

        // Call portable code to finish up the tail of [0,8) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        __m128i expand;
        const uint8_t X = 0xFF; // Used as a placeholder; the value of X is irrelevant.
        if (kSwapRB) {
            expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
        } else {
            expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
        }

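        // Note: the loop requires count >= 6 (not 4) so that the 16-byte load below, which
        // spans five pixels plus one extra byte, stays inside the source buffer even though
        // only four pixels are written per iteration.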
        while (count >= 6) {
            // Load a vector.  While this actually contains 5 pixels plus an
            // extra component, we will discard all but the first four pixels on
            // this iteration.
            __m128i rgb = _mm_loadu_si128((const __m128i*) src);

            // Expand the first four pixels to RGBX and then mask to RGB(FF).
            __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

            // Store 4 pixels.
            _mm_storeu_si128((__m128i*) dst, rgba);

            src += 4*3;
            dst += 4;
            count -= 4;
        }

        // Call portable code to finish up the tail of [0,4) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xFF000000);

        __m256i expand = __lasx_xvldi(0);
        if (kSwapRB) {
            expand = __lasx_xvinsgr2vr_d(expand, 0x0503040502000102, 0);
            expand = __lasx_xvinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
            expand = __lasx_xvinsgr2vr_d(expand, 0x110f10110e0c0d0e, 2);
            expand = __lasx_xvinsgr2vr_d(expand, 0x1715161714121314, 3);
        } else {
            expand = __lasx_xvinsgr2vr_d(expand, 0x0505040302020100, 0);
            expand = __lasx_xvinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
            expand = __lasx_xvinsgr2vr_d(expand, 0x1111100f0e0e0d0c, 2);
            expand = __lasx_xvinsgr2vr_d(expand, 0x1717161514141312, 3);
        }

        while (count >= 8) {
            // Load a vector.  While this actually contains 10 pixels plus two
            // extra bytes, we will use only the first eight pixels on this iteration.
            __m256i rgb = __lasx_xvld(src, 0);
            __m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44);
            __m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE);
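            // xvpermi_d with 0x44 / 0xEE duplicates the low / high 128 bits of the load across
            // both halves of a register; together they let the per-lane xvshuf_b below reach
            // all 24 source bytes needed for eight output pixels.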

            // Expand the first eight pixels to RGBX and then mask to RGB(FF).
            __m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask);

            // Store 8 pixels.
            __lasx_xvst(rgba, dst, 0);

            src += 4*6;
            dst += 8;
            count -= 8;
        }

        // Call portable code to finish up the tail of [0,8) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }
    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000);

        __m128i expand = __lsx_vldi(0);
        if (kSwapRB) {
            expand = __lsx_vinsgr2vr_d(expand, 0x0503040502000102, 0);
            expand = __lsx_vinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
        } else {
            expand = __lsx_vinsgr2vr_d(expand, 0x0505040302020100, 0);
            expand = __lsx_vinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
        }

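        // The fourth byte of each expanded pixel in the table above just repeats a neighboring
        // source byte; it is a don't-care that the alpha mask overwrites with 0xFF below.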
        while (count >= 6) {
            // Load a vector.  While this actually contains 5 pixels plus an
            // extra component, we will discard all but the first four pixels on
            // this iteration.
            __m128i rgb = __lsx_vld(src, 0);

            // Expand the first four pixels to RGBX and then mask to RGB(FF).
            __m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask);

            // Store 4 pixels.
            __lsx_vst(rgba, dst, 0);

            src += 4*3;
            dst += 4;
            count -= 4;
        }

        // Call portable code to finish up the tail of [0,4) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }
    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#else
    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_RGB1_portable(dst, src, count);
    }
    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_BGR1_portable(dst, src, count);
    }
#endif

}  // namespace SK_OPTS_NS

#undef SI