/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "include/private/SkColorData.h"
#include "include/private/SkVx.h"
#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
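
// Example of the rounding above: premultiplying g = 0x80 by a = 0x80 gives
// (128*128 + 127) / 255 = 64 == round(128 * 128 / 255.0).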

static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}
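
// (Roughly, inferred rather than stated here: the stored bytes are inverted
// CMYK, i.e. 255 minus each ink value, so the usual r = (255-C)*(255-K)/255
// collapses to a rounded r = c*k/255 on the stored values; see SkSwizzler.cpp.)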

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
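
// A standalone sanity check of that identity (a sketch, not part of this
// header's build; x >>> 8 written out as (x + 128) >> 8):
//
//     for (uint32_t x = 0; x <= 255*255; x++) {
//         SkASSERT(((x + ((x + 128) >> 8)) + 128) >> 8 == (x + 127) / 255);
//     }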

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
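        // (vld4_u8 de-interleaves as it loads: val[0..3] receive the r,g,b,a
        // bytes of the 8 pixels as separate 8-byte planes.)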

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA(false, dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA(true, dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m512i scale(__m512i x, __m512i y) {
    const __m512i _128 = _mm512_set1_epi16(128);
    const __m512i _257 = _mm512_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm512_mulhi_epu16(_mm512_add_epi16(_mm512_mullo_epi16(x, y), _128), _257);
}
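
// A standalone sanity check of that identity (a sketch, not part of this
// header's build); _mm512_mulhi_epu16 supplies the final >>16:
//
//     for (uint32_t x = 0; x <= 255*255; x++) {
//         SkASSERT((x + 127) / 255 == ((x + 128) * 257) >> 16);
//     }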

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m512i* lo, __m512i* hi) {
        const __m512i zeros = _mm512_setzero_si512();
        skvx::Vec<64, uint8_t> mask;
        if (kSwapRB) {
            mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
        } else {
            mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
        }
        __m512i planar = skvx::bit_pun<__m512i>(mask);

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm512_shuffle_epi8(*lo, planar);
        *hi = _mm512_shuffle_epi8(*hi, planar);
        __m512i rg = _mm512_unpacklo_epi32(*lo, *hi),
                ba = _mm512_unpackhi_epi32(*lo, *hi);

        // Unpack to 16-bit planar.
        __m512i r = _mm512_unpacklo_epi8(rg, zeros),
                g = _mm512_unpackhi_epi8(rg, zeros),
                b = _mm512_unpacklo_epi8(ba, zeros),
                a = _mm512_unpackhi_epi8(ba, zeros);

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8));
        ba = _mm512_or_si512(b, _mm512_slli_epi16(a, 8));
        *lo = _mm512_unpacklo_epi16(rg, ba);
        *hi = _mm512_unpackhi_epi16(rg, ba);
    };

    while (count >= 32) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
                hi = _mm512_loadu_si512((const __m512i*) (src + 16));

        premul8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) (dst + 0), lo);
        _mm512_storeu_si512((__m512i*) (dst + 16), hi);

        src += 32;
        dst += 32;
        count -= 32;
    }

    if (count >= 16) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) src),
                hi = _mm512_setzero_si512();

        premul8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) dst, lo);

        src += 16;
        dst += 16;
        count -= 16;
    }

    // Call portable code to finish up the tail of [0,16) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const uint8_t mask[64] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                               2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
    const __m512i swapRB = _mm512_loadu_si512(mask);

    while (count >= 16) {
        __m512i rgba = _mm512_loadu_si512((const __m512i*) src);
        __m512i bgra = _mm512_shuffle_epi8(rgba, swapRB);
        _mm512_storeu_si512((__m512i*) dst, bgra);

        src += 16;
        dst += 16;
        count -= 16;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Use the SSSE3 impl, as the AVX2 / AVX-512 impls regress performance for
// RGB_to_RGB1 / RGB_to_BGR1.

// Use the AVX2 impl, as the AVX-512 impl regresses performance for gray_to_RGB1.

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 32) {
        __m512i ga = _mm512_loadu_si512((const __m512i*) src);

        __m512i gg = _mm512_or_si512(_mm512_and_si512(ga, _mm512_set1_epi16(0x00FF)),
                                     _mm512_slli_epi16(ga, 8));

        __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
        __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);

        // 1st shuffle for pixel reorder.
        // Note. 'p' stands for 'ggga'
        // Before 1st shuffle:
        //     ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11 | p16 p17 p18 p19 | p24 p25 p26 p27
        //     ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15 | p20 p21 p22 p23 | p28 p29 p30 p31
        //
        // After 1st shuffle:
        //     ggga_lo_shuffle_1 =
        //               p0  p1  p2  p3  | p8  p9  p10 p11 | p4  p5  p6  p7  | p12 p13 p14 p15
        //     ggga_hi_shuffle_1 =
        //               p16 p17 p18 p19 | p24 p25 p26 p27 | p20 p21 p22 p23 | p28 p29 p30 p31
        __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
                ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);

        // 2nd shuffle for pixel reorder.
        // After the 2nd shuffle:
        //     ggga_lo_shuffle_2 =
        //               p0  p1  p2  p3  | p4  p5  p6  p7  | p8  p9  p10 p11 | p12 p13 p14 p15
        //     ggga_hi_shuffle_2 =
        //               p16 p17 p18 p19 | p20 p21 p22 p23 | p24 p25 p26 p27 | p28 p29 p30 p31
        __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
                                                         ggga_lo_shuffle_1, 0xd8),
                ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
                                                         ggga_hi_shuffle_1, 0xd8);

        _mm512_storeu_si512((__m512i*) (dst +  0), ggga_lo_shuffle_2);
        _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);

        src += 32*2;
        dst += 32;
        count -= 32;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 32) {
        __m512i grayA = _mm512_loadu_si512((const __m512i*) src);

        __m512i g0 = _mm512_and_si512(grayA, _mm512_set1_epi16(0x00FF));
        __m512i a0 = _mm512_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m512i gg = _mm512_or_si512(g0, _mm512_slli_epi16(g0, 8));
        __m512i ga = _mm512_or_si512(g0, _mm512_slli_epi16(a0, 8));

        __m512i ggga_lo = _mm512_unpacklo_epi16(gg, ga);
        __m512i ggga_hi = _mm512_unpackhi_epi16(gg, ga);

        // 1st shuffle for pixel reorder, same as grayA_to_RGBA.
        __m512i ggga_lo_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0x44),
                ggga_hi_shuffle_1 = _mm512_shuffle_i32x4(ggga_lo, ggga_hi, 0xee);

        // 2nd shuffle for pixel reorder, same as grayA_to_RGBA.
        __m512i ggga_lo_shuffle_2 = _mm512_shuffle_i32x4(ggga_lo_shuffle_1,
                                                         ggga_lo_shuffle_1, 0xd8),
                ggga_hi_shuffle_2 = _mm512_shuffle_i32x4(ggga_hi_shuffle_1,
                                                         ggga_hi_shuffle_1, 0xd8);

        _mm512_storeu_si512((__m512i*) (dst +  0), ggga_lo_shuffle_2);
        _mm512_storeu_si512((__m512i*) (dst + 16), ggga_hi_shuffle_2);

        src += 32*2;
        dst += 32;
        count -= 32;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m512i* lo, __m512i* hi) {
        const __m512i zeros = _mm512_setzero_si512();
        skvx::Vec<64, uint8_t> mask;
        if (kBGR1 == format) {
            mask = { 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                     2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15 };
        } else {
            mask = { 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                     0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 };
        }
        __m512i planar = skvx::bit_pun<__m512i>(mask);

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm512_shuffle_epi8(*lo, planar);
        *hi = _mm512_shuffle_epi8(*hi, planar);
        __m512i cm = _mm512_unpacklo_epi32(*lo, *hi),
                yk = _mm512_unpackhi_epi32(*lo, *hi);

        // Unpack to 16-bit planar.
        __m512i c = _mm512_unpacklo_epi8(cm, zeros),
                m = _mm512_unpackhi_epi8(cm, zeros),
                y = _mm512_unpacklo_epi8(yk, zeros),
                k = _mm512_unpackhi_epi8(yk, zeros);

        // Scale to r, g, b.
        __m512i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m512i rg = _mm512_or_si512(r, _mm512_slli_epi16(g, 8)),
                ba = _mm512_or_si512(b, _mm512_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm512_unpacklo_epi16(rg, ba);
        *hi = _mm512_unpackhi_epi16(rg, ba);
    };

    while (count >= 32) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) (src + 0)),
                hi = _mm512_loadu_si512((const __m512i*) (src + 16));

        convert8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) (dst +  0), lo);
        _mm512_storeu_si512((__m512i*) (dst + 16), hi);

        src += 32;
        dst += 32;
        count -= 32;
    }

    if (count >= 16) {
        __m512i lo = _mm512_loadu_si512((const __m512i*) src),
                hi = _mm512_setzero_si512();

        convert8(&lo, &hi);

        _mm512_storeu_si512((__m512i*) dst, lo);

        src += 16;
        dst += 16;
        count -= 16;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m256i scale(__m256i x, __m256i y) {
    const __m256i _128 = _mm256_set1_epi16(128);
    const __m256i _257 = _mm256_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257);
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kSwapRB) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);             // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa
        *hi = _mm256_shuffle_epi8(*hi, planar);             // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA
        __m256i rg = _mm256_unpacklo_epi32(*lo, *hi),       // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG
                ba = _mm256_unpackhi_epi32(*lo, *hi);       // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m256i r = _mm256_unpacklo_epi8(rg, zeros),        // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_
                g = _mm256_unpackhi_epi8(rg, zeros),        // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_
                b = _mm256_unpacklo_epi8(ba, zeros),        // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_
                a = _mm256_unpackhi_epi8(ba, zeros);        // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8));   // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
        ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8));   // babababa BABABABA babababa BABABABA
        *lo = _mm256_unpacklo_epi16(rg, ba);                // rgbargba rgbargba rgbargba rgbargba
        *hi = _mm256_unpackhi_epi16(rg, ba);                // RGBARGBA RGBARGBA RGBARGBA RGBARGBA
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));

        premul8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);

        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();

        premul8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) dst, lo);

        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15,
                                            2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 8) {
        __m256i rgba = _mm256_loadu_si256((const __m256i*) src);
        __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB);
        _mm256_storeu_si256((__m256i*) dst, bgra);

        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        __m256i ga = _mm256_loadu_si256((const __m256i*) src);

        __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)),
                                     _mm256_slli_epi16(ga, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Shuffle for pixel reorder.
        // Note. 'p' stands for 'ggga'
        // Before shuffle:
        //     ggga_lo = p0 p1 p2 p3 | p8  p9  p10 p11
        //     ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15
        //
        // After shuffle:
        //     ggga_lo_shuffle = p0 p1 p2  p3  | p4  p5  p6  p7
        //     ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        __m256i grayA = _mm256_loadu_si256((const __m256i*) src);

        __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF));
        __m256i a0 = _mm256_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8));
        __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8));

        __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga);
        __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga);

        // Shuffle for pixel reorder, same as grayA_to_RGBA.
        __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20),
                ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31);

        _mm256_storeu_si256((__m256i*) (dst +  0), ggga_lo_shuffle);
        _mm256_storeu_si256((__m256i*) (dst +  8), ggga_hi_shuffle);

        src += 16*2;
        dst += 16;
        count -= 16;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m256i* lo, __m256i* hi) {
        const __m256i zeros = _mm256_setzero_si256();
        __m256i planar;
        if (kBGR1 == format) {
            planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15,
                                      2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15,
                                      0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm256_shuffle_epi8(*lo, planar);            // ccccmmmm yyyykkkk ccccmmmm yyyykkkk
        *hi = _mm256_shuffle_epi8(*hi, planar);            // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK
        __m256i cm = _mm256_unpacklo_epi32(*lo, *hi),      // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM
                yk = _mm256_unpackhi_epi32(*lo, *hi);      // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m256i c = _mm256_unpacklo_epi8(cm, zeros),       // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_
                m = _mm256_unpackhi_epi8(cm, zeros),       // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_
                y = _mm256_unpacklo_epi8(yk, zeros),       // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_
                k = _mm256_unpackhi_epi8(yk, zeros);       // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m256i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels:
        //     rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG
        //     ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1
        __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)),
                ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00));
        *lo = _mm256_unpacklo_epi16(rg, ba);               // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1
        *hi = _mm256_unpackhi_epi16(rg, ba);               // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1
    };

    while (count >= 16) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)),
                hi = _mm256_loadu_si256((const __m256i*) (src + 8));

        convert8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) (dst + 0), lo);
        _mm256_storeu_si256((__m256i*) (dst + 8), hi);

        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        __m256i lo = _mm256_loadu_si256((const __m256i*) src),
                hi = _mm256_setzero_si256();

        convert8(&lo, &hi);

        _mm256_storeu_si256((__m256i*) dst, lo);

        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(false, dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB(true, dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
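        // (Each 16-bit lane of ga is (a<<8)|g and of gg is (g<<8)|g, so the
        // 16-bit interleave above emits bytes g,g,g,a per pixel: little-endian
        // RGBA with r = g = b = gray.)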

        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [=](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

#else

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16_t gray = vld1q_u8(src);

            // Set each of the color channels.
            uint8x16x4_t rgba;
            rgba.val[0] = gray;
            rgba.val[1] = gray;
            rgba.val[2] = gray;
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16;
            dst += 16;
            count -= 16;
        }
        if (count >= 8) {
            // Load 8 pixels.
            uint8x8_t gray = vld1_u8(src);

            // Set each of the color channels.
            uint8x8x4_t rgba;
            rgba.val[0] = gray;
            rgba.val[1] = gray;
            rgba.val[2] = gray;
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8;
            dst += 8;
            count -= 8;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF);
        while (count >= 32) {
            __m256i grays = _mm256_loadu_si256((const __m256i*) src);

            __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays);
            __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays);
            __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas);
            __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas);

            __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo);
            __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo);
            __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi);
            __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi);

            // Shuffle for pixel reorder.
            // Note. 'p' stands for 'ggga'
            // Before shuffle:
            //     ggga0 = p0  p1  p2  p3  | p16 p17 p18 p19
            //     ggga1 = p4  p5  p6  p7  | p20 p21 p22 p23
            //     ggga2 = p8  p9  p10 p11 | p24 p25 p26 p27
            //     ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31
            //
            // After shuffle:
            //     ggga0_shuffle = p0  p1  p2  p3  | p4  p5  p6  p7
            //     ggga1_shuffle = p8  p9  p10 p11 | p12 p13 p14 p15
            //     ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23
            //     ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31
            __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20),
                    ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20),
                    ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31),
                    ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31);

            _mm256_storeu_si256((__m256i*) (dst +  0), ggga0_shuffle);
            _mm256_storeu_si256((__m256i*) (dst +  8), ggga1_shuffle);
            _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle);
            _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle);

            src += 32;
            dst += 32;
            count -= 32;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3  // TODO: just check >= SSE2?
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
        while (count >= 16) {
            __m128i grays = _mm_loadu_si128((const __m128i*) src);

            __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
            __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
            __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
            __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

            __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
            __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
            __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
            __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

            _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
            _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
            _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
            _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

            src += 16;
            dst += 16;
            count -= 16;
        }
        gray_to_RGB1_portable(dst, src, count);
    }
#else
    /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        gray_to_RGB1_portable(dst, src, count);
    }
#endif

// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16x3_t rgb = vld3q_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x16x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16*3;
            dst += 16;
            count -= 16;
        }

        if (count >= 8) {
            // Load 8 pixels.
            uint8x8x3_t rgb = vld3_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x8x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8*3;
            dst += 8;
            count -= 8;
        }

        // Call portable code to finish up the tail of [0,8) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        __m128i expand;
        const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
        if (kSwapRB) {
            expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
        } else {
            expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
        }

        while (count >= 6) {
            // Load a vector.  While this actually contains 5 pixels plus an
            // extra component, we will discard all but the first four pixels on
            // this iteration.
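            // (count >= 6 guarantees at least 6*3 = 18 source bytes remain, so
            // the 16-byte load below cannot read past the end of src.)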
            __m128i rgb = _mm_loadu_si128((const __m128i*) src);

            // Expand the first four pixels to RGBX and then mask to RGB(FF).
            __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

            // Store 4 pixels.
            _mm_storeu_si128((__m128i*) dst, rgba);

            src += 4*3;
            dst += 4;
            count -= 4;
        }

        // Call portable code to finish up the tail of [0,4) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#else
    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_RGB1_portable(dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_BGR1_portable(dst, src, count);
    }
#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED