/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/base/SkUtils.h"
#include "src/base/SkVx.h"
#include "src/core/SkColorData.h"
#include "src/core/SkSwizzlePriv.h"

#include <algorithm>
#include <cmath>
#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    #include <lasxintrin.h>
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    #include <lsxintrin.h>
#endif

// This file is included in multiple translation units with different #defines set, enabling
// the use of different instructions on different CPU architectures.
//
// A pair of files controls which #defines are defined: SkOpts_SetTarget.h sets the flags, and
// SkOpts_RestoreTarget.h restores them. SkOpts_SetTarget is controlled by setting the
// SK_OPTS_TARGET define before including it.
//
// SkOpts_SetTarget also sets the #define SK_OPTS_NS to the unique namespace for this code.

#if defined(__clang__) || defined(__GNUC__)
#define SI __attribute__((always_inline)) static inline
#else
#define SI static inline
#endif

namespace SK_OPTS_NS {

#if defined(SK_USE_FAST_UNPREMUL_324099025)
constexpr bool kFastUnpremul = true;
#else
constexpr bool kFastUnpremul = false;
#endif

SI float reciprocal_alpha_times_255_portable(float a) {
    return a != 0 ? 255.0f / a : 0.0f;
}

SI float reciprocal_alpha_portable(float a) {
    return a != 0 ? 1.0f / a : 0.0f;
}

#if defined(SK_ARM_HAS_NEON)
// -- NEON -- Harden against timing attacks
// For NEON, the portable versions create branchless code.
SI float reciprocal_alpha_times_255(float a) {
    return reciprocal_alpha_times_255_portable(a);
}

SI float reciprocal_alpha(float a) {
    return reciprocal_alpha_portable(a);
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1 && (defined(__clang__) || !defined(_MSC_VER))
// -- SSE -- Harden against timing attacks -- MSVC is not supported.
using F4 = __m128;

SK_NO_SANITIZE("float-divide-by-zero")
SI float reciprocal_alpha_times_255(float a) {
    SkASSERT(0 <= a && a <= 255);
    F4 vA{a, a, a, a};
    auto q = F4{255.0f} / vA;
    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
}

SK_NO_SANITIZE("float-divide-by-zero")
SI float reciprocal_alpha(float a) {
    SkASSERT(0 <= a && a <= 1);
    F4 vA{a, a, a, a};
    auto q = F4{1.0f} / vA;
    return _mm_and_ps(sk_bit_cast<__m128>(vA != F4{0.0f}), q)[0];
}
#else
// -- Portable -- *Not* hardened against timing attacks
SI float reciprocal_alpha_times_255(float a) {
    return reciprocal_alpha_times_255_portable(a);
}

SI float reciprocal_alpha(float a) {
    return reciprocal_alpha_portable(a);
}
#endif

static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
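// A quick sanity check of the rounded premultiply in RGBA_to_rgbA_portable above: for b = 0x80
// and a = 0x80, (128*128 + 127) / 255 = 16511 / 255 = 64, i.e. half intensity at half coverage,
// which is the expected premultiplied value.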
// RP uses the following rounding routines in store_8888. There are three different
// styles of rounding:
//   1) +0.5 and floor - used by scalar and ARMv7
//   2) round to even for sure - ARMv8
//   3) round to even maybe - Intel. The rounding on Intel depends on MXCSR, which
//      defaults to round to even.
//
// Note that vrndns_f32 is the single-float version of vcvtnq_u32_f32.

SI uint32_t pixel_round_as_RP(float n) {
#if defined(SK_ARM_HAS_NEON) && defined(SK_CPU_ARM64)
    return vrndns_f32(n);
#elif defined(SK_ARM_HAS_NEON) && !defined(SK_CPU_ARM64)
    float32x4_t vN{n + 0.5f};
    return vcvtq_u32_f32(vN)[0];
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 && (defined(__clang__) || !defined(_MSC_VER))
    return _mm_cvtps_epi32(__m128{n})[0];
#else
    return (uint32_t)(n + 0.5f);
#endif
}

// Doing the math for an original color b resulting in a premul color x,
//    x = ⌊(b * a + 127) / 255⌋,
//    x ≤ (b * a + 127) / 255 < x + 1,
//    255 * x ≤ b * a + 127 < 255 * (x + 1),
//    255 * x - 127 ≤ b * a < 255 * (x + 1) - 127,
//    255 * x - 127 ≤ b * a < 255 * x + 128,
//    (255 * x - 127) / a ≤ b < (255 * x + 128) / a.
// So, given a premul value x < a, the original color b can be in the above range.
// We can pick the middle of that range as
//    b = 255 * x / a
//    b = x * (255 / a)
SI uint32_t unpremul_quick(float reciprocalA, float c) {
    return (uint32_t)std::min(255.0f, (c * reciprocalA + 0.5f));
}

// Similar to unpremul_quick, but simulates Raster Pipeline by normalizing the pixel to the
// interval [0, 1] and using round-to-even in most cases instead of round-up.
SI uint32_t unpremul_simulating_RP(float reciprocalA, float c) {
    const float normalizedC = c * (1.0f / 255.0f);
    const float answer = std::min(255.0f, normalizedC * reciprocalA * 255.0f);
    return pixel_round_as_RP(answer);
}

SI uint32_t rgbA_to_CCCA(float c00, float c08, float c16, float a) {
    if constexpr (kFastUnpremul) {
        const float reciprocalA = reciprocal_alpha_times_255(a);
        auto unpremul = [reciprocalA](float c) -> uint32_t {
            return unpremul_quick(reciprocalA, c);
        };
        return (uint32_t) a << 24
             | unpremul(c16) << 16
             | unpremul(c08) <<  8
             | unpremul(c00) <<  0;
    } else {
        const float normalizedA = a * (1.0f / 255.0f);
        const float reciprocalA = reciprocal_alpha(normalizedA);
        auto unpremul = [reciprocalA](float c) -> uint32_t {
            return unpremul_simulating_RP(reciprocalA, c);
        };
        return (uint32_t) a << 24
             | unpremul(c16) << 16
             | unpremul(c08) <<  8
             | unpremul(c00) <<  0;
    }
}

static void rgbA_to_RGBA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t p = src[i];

        const float a = (p >> 24) & 0xFF,
                    b = (p >> 16) & 0xFF,
                    g = (p >>  8) & 0xFF,
                    r = (p >>  0) & 0xFF;

        dst[i] = rgbA_to_CCCA(r, g, b, a);
    }
}

static void rgbA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t p = src[i];

        const uint32_t a = (p >> 24) & 0xFF,
                       b = (p >> 16) & 0xFF,
                       g = (p >>  8) & 0xFF,
                       r = (p >>  0) & 0xFF;

        dst[i] = rgbA_to_CCCA(b, g, r, a);
    }
}
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = (src[i] >> 24) & 0xFF,
                b = (src[i] >> 16) & 0xFF,
                g = (src[i] >>  8) & 0xFF,
                r = (src[i] >>  0) & 0xFF;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = (src[i] >> 24) & 0xFF,
                y = (src[i] >> 16) & 0xFF,
                m = (src[i] >>  8) & 0xFF,
                c = (src[i] >>  0) & 0xFF;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)
// -- NEON -----------------------------------------------------------------------------------
// Rounded divide by 255, (x + 127) / 255
SI uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter. Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scale a byte by another, (x * y + 127) / 255
SI uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
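        // vld4_u8 de-interleaves 32 contiguous bytes into four 8-lane registers, one per
        // channel, so rgba.val[0..3] below hold the r, g, b, and a planes respectively.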
329 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); 330 331 uint8x8_t a = rgba.val[3], 332 b = rgba.val[2], 333 g = rgba.val[1], 334 r = rgba.val[0]; 335 336 // Premultiply. 337 b = scale(b, a); 338 g = scale(g, a); 339 r = scale(r, a); 340 341 // Store 8 premultiplied pixels. 342 if (kSwapRB) { 343 rgba.val[2] = r; 344 rgba.val[1] = g; 345 rgba.val[0] = b; 346 } else { 347 rgba.val[2] = b; 348 rgba.val[1] = g; 349 rgba.val[0] = r; 350 } 351 vst4_u8((uint8_t*) dst, rgba); 352 src += 8; 353 dst += 8; 354 count -= 8; 355 } 356 357 // Call portable code to finish up the tail of [0,8) pixels. 358 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 359 proc(dst, src, count); 360} 361 362void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 363 premul_should_swapRB(false, dst, src, count); 364} 365 366void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 367 premul_should_swapRB(true, dst, src, count); 368} 369 370void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 371 using std::swap; 372 while (count >= 16) { 373 // Load 16 pixels. 374 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src); 375 376 // Swap r and b. 377 swap(rgba.val[0], rgba.val[2]); 378 379 // Store 16 pixels. 380 vst4q_u8((uint8_t*) dst, rgba); 381 src += 16; 382 dst += 16; 383 count -= 16; 384 } 385 386 if (count >= 8) { 387 // Load 8 pixels. 388 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); 389 390 // Swap r and b. 391 swap(rgba.val[0], rgba.val[2]); 392 393 // Store 8 pixels. 394 vst4_u8((uint8_t*) dst, rgba); 395 src += 8; 396 dst += 8; 397 count -= 8; 398 } 399 400 RGBA_to_BGRA_portable(dst, src, count); 401} 402 403static void expand_grayA(bool kPremul, uint32_t dst[], const uint8_t* src, int count) { 404 while (count >= 16) { 405 // Load 16 pixels. 406 uint8x16x2_t ga = vld2q_u8(src); 407 408 // Premultiply if requested. 409 if (kPremul) { 410 ga.val[0] = vcombine_u8( 411 scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])), 412 scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1]))); 413 } 414 415 // Set each of the color channels. 416 uint8x16x4_t rgba; 417 rgba.val[0] = ga.val[0]; 418 rgba.val[1] = ga.val[0]; 419 rgba.val[2] = ga.val[0]; 420 rgba.val[3] = ga.val[1]; 421 422 // Store 16 pixels. 423 vst4q_u8((uint8_t*) dst, rgba); 424 src += 16*2; 425 dst += 16; 426 count -= 16; 427 } 428 429 if (count >= 8) { 430 // Load 8 pixels. 431 uint8x8x2_t ga = vld2_u8(src); 432 433 // Premultiply if requested. 434 if (kPremul) { 435 ga.val[0] = scale(ga.val[0], ga.val[1]); 436 } 437 438 // Set each of the color channels. 439 uint8x8x4_t rgba; 440 rgba.val[0] = ga.val[0]; 441 rgba.val[1] = ga.val[0]; 442 rgba.val[2] = ga.val[0]; 443 rgba.val[3] = ga.val[1]; 444 445 // Store 8 pixels. 446 vst4_u8((uint8_t*) dst, rgba); 447 src += 8*2; 448 dst += 8; 449 count -= 8; 450 } 451 452 auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable; 453 proc(dst, src, count); 454} 455 456void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 457 expand_grayA(false, dst, src, count); 458} 459 460void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 461 expand_grayA(true, dst, src, count); 462} 463 464enum Format { kRGB1, kBGR1 }; 465static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 466 while (count >= 8) { 467 // Load 8 cmyk pixels. 
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

template <bool swapRB>
static void common_rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {

    // Only use the SIMD code if simulating RP; otherwise the quick code auto-vectorizes well
    // enough on ARM to not need a SIMD implementation.
    if constexpr (!kFastUnpremul) {
        while (count >= 8) {
            const uint8x8x4_t in = vld4_u8((const uint8_t*)src);

            auto round = [](float32x4_t v) -> uint32x4_t {
                #if defined(SK_CPU_ARM64)
                    return vcvtnq_u32_f32(v);
                #else
                    return vcvtq_u32_f32(v + 0.5f);
                #endif
            };

            static constexpr float kN = 1.0f / 255.0f;
            auto toNormalized = [](uint16x4_t v) -> float32x4_t {
                return vcvtq_f32_u32(vmovl_u16(v)) * kN;
            };

            auto unpremulHalf =
                    [toNormalized, round](float32x4_t invA, uint16x4_t v) -> uint16x4_t {
                const float32x4_t normalizedV = toNormalized(v);
                const float32x4_t divided = invA * normalizedV;
                const float32x4_t denormalized = divided * 255.0f;
                const uint32x4_t rounded = round(denormalized);
                return vqmovn_u32(rounded);
            };

            auto reciprocal = [](float32x4_t a) -> float32x4_t {
                uint32x4_t mask = sk_bit_cast<uint32x4_t>(a != float32x4_t{0, 0, 0, 0});
                auto recip = 1.0f / a;
                return sk_bit_cast<float32x4_t>(mask & sk_bit_cast<uint32x4_t>(recip));
            };

            const uint8x8_t a = in.val[3];
            const uint16x8_t intA = vmovl_u8(a);
            const float32x4_t invALow = reciprocal(toNormalized(vget_low_u16(intA)));
            const float32x4_t invAHigh = reciprocal(toNormalized(vget_high_u16(intA)));

            auto unpremul = [unpremulHalf, invALow, invAHigh](uint8x8_t v) -> uint8x8_t {
                const uint16x8_t to16 = vmovl_u8(v);

                const uint16x4_t low = unpremulHalf(invALow, vget_low_u16(to16));
                const uint16x4_t high = unpremulHalf(invAHigh, vget_high_u16(to16));

                const uint16x8_t combined = vcombine_u16(low, high);
                return vqmovn_u16(combined);
            };

            const uint8x8_t b = unpremul(in.val[2]);
            const uint8x8_t g = unpremul(in.val[1]);
            const uint8x8_t r = unpremul(in.val[0]);

            if constexpr (swapRB) {
                const uint8x8x4_t out{b, g, r, a};
                vst4_u8((uint8_t*)dst, out);
            } else {
                const uint8x8x4_t out{r, g, b, a};
                vst4_u8((uint8_t*)dst, out);
            }

            src += 8;
            dst += 8;
            count -= 8;
        }
    }

    // Handle the tail. Count will be < 8.
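    // Per lane, the vector loop above mirrors unpremul_simulating_RP: normalize c and a to
    // [0, 1], multiply c by 1/a (with 1/0 forced to 0), scale back up by 255, and round the
    // way Raster Pipeline would. The tail below falls back to the scalar version of the same
    // math.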
581 if constexpr (swapRB) { 582 rgbA_to_BGRA_portable(dst, src, count); 583 } else { 584 rgbA_to_RGBA_portable(dst, src, count); 585 } 586} 587 588void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 589 common_rgbA_to_RGBA</*swapRB=*/false>(dst, src, count); 590} 591 592void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 593 common_rgbA_to_RGBA</*swapRB=*/true>(dst, src, count); 594} 595 596#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 597// -- AVX2 ----------------------------------------------------------------------------------------- 598 599// Scale a byte by another. 600// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 601static __m256i scale(__m256i x, __m256i y) { 602 const __m256i _128 = _mm256_set1_epi16(128); 603 const __m256i _257 = _mm256_set1_epi16(257); 604 605 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 606 return _mm256_mulhi_epu16(_mm256_add_epi16(_mm256_mullo_epi16(x, y), _128), _257); 607} 608 609static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 610 611 auto premul8 = [=](__m256i* lo, __m256i* hi) { 612 const __m256i zeros = _mm256_setzero_si256(); 613 __m256i planar; 614 if (kSwapRB) { 615 planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15, 616 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 617 } else { 618 planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15, 619 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 620 } 621 622 // Swizzle the pixels to 8-bit planar. 623 *lo = _mm256_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa 624 *hi = _mm256_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA 625 __m256i rg = _mm256_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG 626 ba = _mm256_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA 627 628 // Unpack to 16-bit planar. 629 __m256i r = _mm256_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_ 630 g = _mm256_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_ 631 b = _mm256_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_ 632 a = _mm256_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_ 633 634 // Premultiply! 635 r = scale(r, a); 636 g = scale(g, a); 637 b = scale(b, a); 638 639 // Repack into interlaced pixels. 640 rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 641 ba = _mm256_or_si256(b, _mm256_slli_epi16(a, 8)); // babababa BABABABA babababa BABABABA 642 *lo = _mm256_unpacklo_epi16(rg, ba); // rgbargba rgbargba rgbargba rgbargba 643 *hi = _mm256_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA 644 }; 645 646 while (count >= 16) { 647 __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)), 648 hi = _mm256_loadu_si256((const __m256i*) (src + 8)); 649 650 premul8(&lo, &hi); 651 652 _mm256_storeu_si256((__m256i*) (dst + 0), lo); 653 _mm256_storeu_si256((__m256i*) (dst + 8), hi); 654 655 src += 16; 656 dst += 16; 657 count -= 16; 658 } 659 660 if (count >= 8) { 661 __m256i lo = _mm256_loadu_si256((const __m256i*) src), 662 hi = _mm256_setzero_si256(); 663 664 premul8(&lo, &hi); 665 666 _mm256_storeu_si256((__m256i*) dst, lo); 667 668 src += 8; 669 dst += 8; 670 count -= 8; 671 } 672 673 // Call portable code to finish up the tail of [0,8) pixels. 674 auto proc = kSwapRB ? 
RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 675 proc(dst, src, count); 676} 677 678void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 679 premul_should_swapRB(false, dst, src, count); 680} 681 682void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 683 premul_should_swapRB(true, dst, src, count); 684} 685 686void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 687 const __m256i swapRB = _mm256_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15, 688 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); 689 690 while (count >= 8) { 691 __m256i rgba = _mm256_loadu_si256((const __m256i*) src); 692 __m256i bgra = _mm256_shuffle_epi8(rgba, swapRB); 693 _mm256_storeu_si256((__m256i*) dst, bgra); 694 695 src += 8; 696 dst += 8; 697 count -= 8; 698 } 699 700 RGBA_to_BGRA_portable(dst, src, count); 701} 702 703void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 704 while (count >= 16) { 705 __m256i ga = _mm256_loadu_si256((const __m256i*) src); 706 707 __m256i gg = _mm256_or_si256(_mm256_and_si256(ga, _mm256_set1_epi16(0x00FF)), 708 _mm256_slli_epi16(ga, 8)); 709 710 __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga); 711 __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga); 712 713 // Shuffle for pixel reorder 714 // Note. 'p' stands for 'ggga' 715 // Before shuffle: 716 // ggga_lo = p0 p1 p2 p3 | p8 p9 p10 p11 717 // ggga_hi = p4 p5 p6 p7 | p12 p13 p14 p15 718 // 719 // After shuffle: 720 // ggga_lo_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7 721 // ggga_hi_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15 722 __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20), 723 ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31); 724 725 _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle); 726 _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle); 727 728 src += 16*2; 729 dst += 16; 730 count -= 16; 731 } 732 733 grayA_to_RGBA_portable(dst, src, count); 734} 735 736void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 737 while (count >= 16) { 738 __m256i grayA = _mm256_loadu_si256((const __m256i*) src); 739 740 __m256i g0 = _mm256_and_si256(grayA, _mm256_set1_epi16(0x00FF)); 741 __m256i a0 = _mm256_srli_epi16(grayA, 8); 742 743 // Premultiply 744 g0 = scale(g0, a0); 745 746 __m256i gg = _mm256_or_si256(g0, _mm256_slli_epi16(g0, 8)); 747 __m256i ga = _mm256_or_si256(g0, _mm256_slli_epi16(a0, 8)); 748 749 __m256i ggga_lo = _mm256_unpacklo_epi16(gg, ga); 750 __m256i ggga_hi = _mm256_unpackhi_epi16(gg, ga); 751 752 // Shuffle for pixel reorder, similar as grayA_to_RGBA 753 __m256i ggga_lo_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x20), 754 ggga_hi_shuffle = _mm256_permute2x128_si256(ggga_lo, ggga_hi, 0x31); 755 756 _mm256_storeu_si256((__m256i*) (dst + 0), ggga_lo_shuffle); 757 _mm256_storeu_si256((__m256i*) (dst + 8), ggga_hi_shuffle); 758 759 src += 16*2; 760 dst += 16; 761 count -= 16; 762 } 763 764 grayA_to_rgbA_portable(dst, src, count); 765} 766 767enum Format { kRGB1, kBGR1 }; 768static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 769 auto convert8 = [=](__m256i* lo, __m256i* hi) { 770 const __m256i zeros = _mm256_setzero_si256(); 771 __m256i planar; 772 if (kBGR1 == format) { 773 planar = _mm256_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15, 774 2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 775 } else { 776 planar = _mm256_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15, 777 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 778 } 779 780 // Swizzle the pixels 
to 8-bit planar. 781 *lo = _mm256_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk 782 *hi = _mm256_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK 783 __m256i cm = _mm256_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM 784 yk = _mm256_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK 785 786 // Unpack to 16-bit planar. 787 __m256i c = _mm256_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_ 788 m = _mm256_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_ 789 y = _mm256_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_ 790 k = _mm256_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_ 791 792 // Scale to r, g, b. 793 __m256i r = scale(c, k), 794 g = scale(m, k), 795 b = scale(y, k); 796 797 // Repack into interlaced pixels: 798 // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 799 // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1 800 __m256i rg = _mm256_or_si256(r, _mm256_slli_epi16(g, 8)), 801 ba = _mm256_or_si256(b, _mm256_set1_epi16((uint16_t) 0xFF00)); 802 *lo = _mm256_unpacklo_epi16(rg, ba); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1 803 *hi = _mm256_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1 804 }; 805 806 while (count >= 16) { 807 __m256i lo = _mm256_loadu_si256((const __m256i*) (src + 0)), 808 hi = _mm256_loadu_si256((const __m256i*) (src + 8)); 809 810 convert8(&lo, &hi); 811 812 _mm256_storeu_si256((__m256i*) (dst + 0), lo); 813 _mm256_storeu_si256((__m256i*) (dst + 8), hi); 814 815 src += 16; 816 dst += 16; 817 count -= 16; 818 } 819 820 if (count >= 8) { 821 __m256i lo = _mm256_loadu_si256((const __m256i*) src), 822 hi = _mm256_setzero_si256(); 823 824 convert8(&lo, &hi); 825 826 _mm256_storeu_si256((__m256i*) dst, lo); 827 828 src += 8; 829 dst += 8; 830 count -= 8; 831 } 832 833 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 834 proc(dst, src, count); 835} 836 837void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 838 inverted_cmyk_to(kRGB1, dst, src, count); 839} 840 841void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 842 inverted_cmyk_to(kBGR1, dst, src, count); 843} 844 845void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 846 rgbA_to_RGBA_portable(dst, src, count); 847} 848 849void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 850 rgbA_to_BGRA_portable(dst, src, count); 851} 852 853#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 854// -- SSSE3 ---------------------------------------------------------------------------------------- 855 856// Scale a byte by another. 857// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 858static __m128i scale(__m128i x, __m128i y) { 859 const __m128i _128 = _mm_set1_epi16(128); 860 const __m128i _257 = _mm_set1_epi16(257); 861 862 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. 863 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); 864} 865 866static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 867 868 auto premul8 = [=](__m128i* lo, __m128i* hi) { 869 const __m128i zeros = _mm_setzero_si128(); 870 __m128i planar; 871 if (kSwapRB) { 872 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 873 } else { 874 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 875 } 876 877 // Swizzle the pixels to 8-bit planar. 
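        // With the index vector above, _mm_shuffle_epi8 gathers byte 0 of each pixel into
        // bytes 0-3, byte 1 into bytes 4-7, and so on, so each 128-bit register becomes four
        // 4-byte channel groups (the swapRB variant only changes which channel lands where).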
878 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa 879 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA 880 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG 881 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA 882 883 // Unpack to 16-bit planar. 884 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ 885 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ 886 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ 887 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ 888 889 // Premultiply! 890 r = scale(r, a); 891 g = scale(g, a); 892 b = scale(b, a); 893 894 // Repack into interlaced pixels. 895 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG 896 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA 897 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba 898 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA 899 }; 900 901 while (count >= 8) { 902 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), 903 hi = _mm_loadu_si128((const __m128i*) (src + 4)); 904 905 premul8(&lo, &hi); 906 907 _mm_storeu_si128((__m128i*) (dst + 0), lo); 908 _mm_storeu_si128((__m128i*) (dst + 4), hi); 909 910 src += 8; 911 dst += 8; 912 count -= 8; 913 } 914 915 if (count >= 4) { 916 __m128i lo = _mm_loadu_si128((const __m128i*) src), 917 hi = _mm_setzero_si128(); 918 919 premul8(&lo, &hi); 920 921 _mm_storeu_si128((__m128i*) dst, lo); 922 923 src += 4; 924 dst += 4; 925 count -= 4; 926 } 927 928 // Call portable code to finish up the tail of [0,4) pixels. 929 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 930 proc(dst, src, count); 931} 932 933void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 934 premul_should_swapRB(false, dst, src, count); 935} 936 937void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 938 premul_should_swapRB(true, dst, src, count); 939} 940 941void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 942 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); 943 944 while (count >= 4) { 945 __m128i rgba = _mm_loadu_si128((const __m128i*) src); 946 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB); 947 _mm_storeu_si128((__m128i*) dst, bgra); 948 949 src += 4; 950 dst += 4; 951 count -= 4; 952 } 953 954 RGBA_to_BGRA_portable(dst, src, count); 955} 956 957void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 958 while (count >= 8) { 959 __m128i ga = _mm_loadu_si128((const __m128i*) src); 960 961 __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)), 962 _mm_slli_epi16(ga, 8)); 963 964 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); 965 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); 966 967 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); 968 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); 969 970 src += 8*2; 971 dst += 8; 972 count -= 8; 973 } 974 975 grayA_to_RGBA_portable(dst, src, count); 976} 977 978void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 979 while (count >= 8) { 980 __m128i grayA = _mm_loadu_si128((const __m128i*) src); 981 982 __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF)); 983 __m128i a0 = _mm_srli_epi16(grayA, 8); 984 985 // Premultiply 986 g0 = scale(g0, a0); 987 988 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8)); 989 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8)); 990 991 992 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); 993 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); 994 995 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); 
996 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); 997 998 src += 8*2; 999 dst += 8; 1000 count -= 8; 1001 } 1002 1003 grayA_to_rgbA_portable(dst, src, count); 1004} 1005 1006enum Format { kRGB1, kBGR1 }; 1007static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 1008 auto convert8 = [=](__m128i* lo, __m128i* hi) { 1009 const __m128i zeros = _mm_setzero_si128(); 1010 __m128i planar; 1011 if (kBGR1 == format) { 1012 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); 1013 } else { 1014 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); 1015 } 1016 1017 // Swizzle the pixels to 8-bit planar. 1018 *lo = _mm_shuffle_epi8(*lo, planar); // ccccmmmm yyyykkkk 1019 *hi = _mm_shuffle_epi8(*hi, planar); // CCCCMMMM YYYYKKKK 1020 __m128i cm = _mm_unpacklo_epi32(*lo, *hi), // ccccCCCC mmmmMMMM 1021 yk = _mm_unpackhi_epi32(*lo, *hi); // yyyyYYYY kkkkKKKK 1022 1023 // Unpack to 16-bit planar. 1024 __m128i c = _mm_unpacklo_epi8(cm, zeros), // c_c_c_c_ C_C_C_C_ 1025 m = _mm_unpackhi_epi8(cm, zeros), // m_m_m_m_ M_M_M_M_ 1026 y = _mm_unpacklo_epi8(yk, zeros), // y_y_y_y_ Y_Y_Y_Y_ 1027 k = _mm_unpackhi_epi8(yk, zeros); // k_k_k_k_ K_K_K_K_ 1028 1029 // Scale to r, g, b. 1030 __m128i r = scale(c, k), 1031 g = scale(m, k), 1032 b = scale(y, k); 1033 1034 // Repack into interlaced pixels. 1035 __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)), // rgrgrgrg RGRGRGRG 1036 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1 1037 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba 1038 *hi = _mm_unpackhi_epi16(rg, ba); // RGB1RGB1 RGB1RGB1 1039 }; 1040 1041 while (count >= 8) { 1042 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), 1043 hi = _mm_loadu_si128((const __m128i*) (src + 4)); 1044 1045 convert8(&lo, &hi); 1046 1047 _mm_storeu_si128((__m128i*) (dst + 0), lo); 1048 _mm_storeu_si128((__m128i*) (dst + 4), hi); 1049 1050 src += 8; 1051 dst += 8; 1052 count -= 8; 1053 } 1054 1055 if (count >= 4) { 1056 __m128i lo = _mm_loadu_si128((const __m128i*) src), 1057 hi = _mm_setzero_si128(); 1058 1059 convert8(&lo, &hi); 1060 1061 _mm_storeu_si128((__m128i*) dst, lo); 1062 1063 src += 4; 1064 dst += 4; 1065 count -= 4; 1066 } 1067 1068 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 1069 proc(dst, src, count); 1070} 1071 1072void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 1073 inverted_cmyk_to(kRGB1, dst, src, count); 1074} 1075 1076void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 1077 inverted_cmyk_to(kBGR1, dst, src, count); 1078} 1079 1080void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 1081 rgbA_to_RGBA_portable(dst, src, count); 1082} 1083 1084void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1085 rgbA_to_BGRA_portable(dst, src, count); 1086} 1087 1088#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX 1089// -- LASX ---------------------------------------------------------------------------------------- 1090 1091// Scale a byte by another. 1092// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 
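// The scale() trick below relies on (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
// A quick check at the endpoints: for x = 0, (0+128)*257 = 32896 and 32896 >> 16 = 0,
// matching (0+127)/255 = 0; for x = 255*255 = 65025, (65025+128)*257 = 16744321 and
// 16744321 >> 16 = 255, matching (65025+127)/255 = 255.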
1093// (x+127)/255 == ((x+128)*257)>>16 1094SI __m256i scale(__m256i x, __m256i y) { 1095 const __m256i _128 = __lasx_xvreplgr2vr_h(128); 1096 const __m256i _257 = __lasx_xvreplgr2vr_h(257); 1097 1098 // (x+127)/255 == ((x+128)*257)>>16 1099 return __lasx_xvmuh_hu(__lasx_xvadd_h(__lasx_xvmul_h(x, y), _128), _257); 1100} 1101 1102static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 1103 auto premul8 = [=](__m256i* lo, __m256i* hi) { 1104 const __m256i zeros = __lasx_xvldi(0); 1105 __m256i planar = __lasx_xvldi(0); 1106 if (kSwapRB) { 1107 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0); 1108 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1); 1109 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2); 1110 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3); 1111 } else { 1112 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0); 1113 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1); 1114 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2); 1115 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3); 1116 } 1117 1118 // Swizzle the pixels to 8-bit planar. 1119 *lo = __lasx_xvshuf_b(zeros, *lo, planar); // rrrrgggg bbbbaaaa rrrrgggg bbbbaaaa 1120 *hi = __lasx_xvshuf_b(zeros, *hi, planar); // RRRRGGGG BBBBAAAA RRRRGGGG BBBBAAAA 1121 __m256i rg = __lasx_xvilvl_w(*hi, *lo), // rrrrRRRR ggggGGGG rrrrRRRR ggggGGGG 1122 ba = __lasx_xvilvh_w(*hi, *lo); // bbbbBBBB aaaaAAAA bbbbBBBB aaaaAAAA 1123 1124 // Unpack to 16-bit planar. 1125 __m256i r = __lasx_xvilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_ r_r_r_r_ R_R_R_R_ 1126 g = __lasx_xvilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_ g_g_g_g_ G_G_G_G_ 1127 b = __lasx_xvilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_ b_b_b_b_ B_B_B_B_ 1128 a = __lasx_xvilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_ a_a_a_a_ A_A_A_A_ 1129 1130 // Premultiply! 1131 r = scale(r, a); 1132 g = scale(g, a); 1133 b = scale(b, a); 1134 1135 // Repack into interlaced pixels. 1136 rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)); // rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 1137 ba = __lasx_xvor_v(b, __lasx_xvslli_h(a, 8)); // babababa BABABABA babababa BABABABA 1138 *lo = __lasx_xvilvl_h(ba, rg); // rgbargba rgbargba rgbargba rgbargba 1139 *hi = __lasx_xvilvh_h(ba, rg); // RGBARGBA RGBARGBA RGBARGBA RGBARGBA 1140 }; 1141 1142 while (count >= 16) { 1143 __m256i lo = __lasx_xvld(src, 0), 1144 hi = __lasx_xvld(src, 32); 1145 1146 premul8(&lo, &hi); 1147 1148 __lasx_xvst(lo, dst, 0); 1149 __lasx_xvst(hi, dst, 32); 1150 1151 src += 16; 1152 dst += 16; 1153 count -= 16; 1154 } 1155 1156 if (count >= 8) { 1157 __m256i lo = __lasx_xvld(src, 0), 1158 hi = __lasx_xvldi(0); 1159 1160 premul8(&lo, &hi); 1161 1162 __lasx_xvst(lo, dst, 0); 1163 1164 src += 8; 1165 dst += 8; 1166 count -= 8; 1167 } 1168 1169 // Call portable code to finish up the tail of [0,4) pixels. 1170 auto proc = kSwapRB ? 
RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 1171 proc(dst, src, count); 1172} 1173 1174/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 1175 premul_should_swapRB(false, dst, src, count); 1176} 1177 1178/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 1179 premul_should_swapRB(true, dst, src, count); 1180} 1181 1182/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1183 while (count >= 8) { 1184 __m256i rgba = __lasx_xvld(src, 0); 1185 __m256i bgra = __lasx_xvshuf4i_b(rgba, 0xC6); 1186 __lasx_xvst(bgra, dst, 0); 1187 1188 src += 8; 1189 dst += 8; 1190 count -= 8; 1191 } 1192 1193 RGBA_to_BGRA_portable(dst, src, count); 1194} 1195 1196/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 1197 while (count >= 16) { 1198 __m256i ga = __lasx_xvld(src, 0); 1199 1200 __m256i gg = __lasx_xvor_v(__lasx_xvand_v(ga, __lasx_xvreplgr2vr_h(0x00FF)), 1201 __lasx_xvslli_h(ga, 8)); 1202 1203 __m256i ggga_lo = __lasx_xvilvl_h(ga, gg); 1204 __m256i ggga_hi = __lasx_xvilvh_h(ga, gg); 1205 1206 __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02), dst, 0); 1207 __lasx_xvst(__lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13), dst, 32); 1208 1209 src += 16*2; 1210 dst += 16; 1211 count -= 16; 1212 } 1213 1214 grayA_to_RGBA_portable(dst, src, count); 1215} 1216 1217/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 1218 while (count >= 16) { 1219 __m256i grayA = __lasx_xvld(src, 0); 1220 1221 __m256i val = __lasx_xvreplgr2vr_h(0x00FF); 1222 1223 __m256i g0 = __lasx_xvand_v(grayA, val); 1224 __m256i a0 = __lasx_xvsrli_h(grayA, 8); 1225 1226 // Premultiply 1227 g0 = scale(g0, a0); 1228 1229 __m256i gg = __lasx_xvor_v(g0, __lasx_xvslli_h(g0, 8)); 1230 __m256i ga = __lasx_xvor_v(g0, __lasx_xvslli_h(a0, 8)); 1231 1232 __m256i ggga_lo = __lasx_xvilvl_h(ga, gg); 1233 __m256i ggga_hi = __lasx_xvilvh_h(ga, gg); 1234 1235 val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x02); 1236 __lasx_xvst(val, dst, 0); 1237 1238 val = __lasx_xvpermi_q(ggga_lo, ggga_hi, 0x13); 1239 __lasx_xvst(val, dst, 32); 1240 1241 src += 16*2; 1242 dst += 16; 1243 count -= 16; 1244 } 1245 1246 grayA_to_rgbA_portable(dst, src, count); 1247} 1248 1249enum Format { kRGB1, kBGR1 }; 1250static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 1251 auto convert8 = [=](__m256i *lo, __m256i* hi) { 1252 const __m256i zeros = __lasx_xvldi(0); 1253 __m256i planar = __lasx_xvldi(0); 1254 if (kBGR1 == format) { 1255 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,0); 1256 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,1); 1257 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010e0a0602 ,2); 1258 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030c080400 ,3); 1259 } else { 1260 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,0); 1261 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,1); 1262 planar = __lasx_xvinsgr2vr_d(planar, 0x0d0905010c080400 ,2); 1263 planar = __lasx_xvinsgr2vr_d(planar, 0x0f0b07030e0a0602 ,3); 1264 } 1265 1266 // Swizzle the pixels to 8-bit planar. 
1267 *lo = __lasx_xvshuf_b(zeros, *lo, planar); // ccccmmmm yyyykkkk ccccmmmm yyyykkkk 1268 *hi = __lasx_xvshuf_b(zeros, *hi, planar); // CCCCMMMM YYYYKKKK CCCCMMMM YYYYKKKK 1269 __m256i cm = __lasx_xvilvl_w(*hi, *lo), // ccccCCCC mmmmMMMM ccccCCCC mmmmMMMM 1270 yk = __lasx_xvilvh_w(*hi, *lo); // yyyyYYYY kkkkKKKK yyyyYYYY kkkkKKKK 1271 1272 // Unpack to 16-bit planar. 1273 __m256i c = __lasx_xvilvl_b(zeros, cm), // c_c_c_c_ C_C_C_C_ c_c_c_c_ C_C_C_C_ 1274 m = __lasx_xvilvh_b(zeros, cm), // m_m_m_m_ M_M_M_M_ m_m_m_m_ M_M_M_M_ 1275 y = __lasx_xvilvl_b(zeros, yk), // y_y_y_y_ Y_Y_Y_Y_ y_y_y_y_ Y_Y_Y_Y_ 1276 k = __lasx_xvilvh_b(zeros, yk); // k_k_k_k_ K_K_K_K_ k_k_k_k_ K_K_K_K_ 1277 1278 // Scale to r, g, b. 1279 __m256i r = scale(c, k), 1280 g = scale(m, k), 1281 b = scale(y, k); 1282 1283 // Repack into interlaced pixels: 1284 // rg = rgrgrgrg RGRGRGRG rgrgrgrg RGRGRGRG 1285 // ba = b1b1b1b1 B1B1B1B1 b1b1b1b1 B1B1B1B1 1286 __m256i rg = __lasx_xvor_v(r, __lasx_xvslli_h(g, 8)), 1287 ba = __lasx_xvor_v(b, __lasx_xvreplgr2vr_h(0xff00)); 1288 *lo = __lasx_xvilvl_h(ba, rg); // rgb1rgb1 rgb1rgb1 rgb1rgb1 rgb1rgb1 1289 *hi = __lasx_xvilvh_h(ba, rg); // RGB1RGB1 RGB1RGB1 RGB1RGB1 RGB1RGB1 1290 }; 1291 1292 while (count >= 16) { 1293 __m256i lo = __lasx_xvld(src, 0), 1294 hi = __lasx_xvld(src, 32); 1295 1296 convert8(&lo, &hi); 1297 1298 __lasx_xvst(lo, dst, 0); 1299 __lasx_xvst(hi, dst, 32); 1300 1301 src += 16; 1302 dst += 16; 1303 count -= 16; 1304 } 1305 1306 while (count >= 8) { 1307 __m256i lo = __lasx_xvld(src, 0), 1308 hi = __lasx_xvldi(0); 1309 1310 convert8(&lo, &hi); 1311 1312 __lasx_xvst(lo, dst, 0); 1313 1314 src += 8; 1315 dst += 8; 1316 count -= 8; 1317 } 1318 1319 auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable; 1320 proc(dst, src, count); 1321} 1322 1323/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) { 1324 inverted_cmyk_to(kRGB1, dst, src, count); 1325} 1326 1327/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) { 1328 inverted_cmyk_to(kBGR1, dst, src, count); 1329} 1330 1331/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) { 1332 rgbA_to_RGBA_portable(dst, src, count); 1333} 1334 1335/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1336 rgbA_to_BGRA_portable(dst, src, count); 1337} 1338 1339#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX 1340// -- LSX ----------------------------------------------------------------------------------------- 1341 1342// Scale a byte by another. 1343// Inputs are stored in 16-bit lanes, but are not larger than 8-bits. 
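// __lsx_vmuh_hu below is the multiply-high of unsigned halfwords: it keeps the upper 16 bits
// of each 16-bit product, supplying the ">>16" in the (x+127)/255 == ((x+128)*257)>>16 trick,
// in the same role as _mm_mulhi_epu16 in the SSE paths above.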
1344SI __m128i scale(__m128i x, __m128i y) { 1345 const __m128i _128 = __lsx_vreplgr2vr_h(128); 1346 const __m128i _257 = __lsx_vreplgr2vr_h(257); 1347 1348 // (x+127)/255 == ((x+128)*257)>>16 1349 return __lsx_vmuh_hu(__lsx_vadd_h(__lsx_vmul_h(x, y), _128), _257); 1350} 1351 1352static void premul_should_swapRB(bool kSwapRB, uint32_t* dst, const uint32_t* src, int count) { 1353 1354 auto premul8 = [=](__m128i *lo, __m128i *hi){ 1355 const __m128i zeros = __lsx_vldi(0); 1356 __m128i planar = __lsx_vldi(0); 1357 if (kSwapRB) { 1358 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0); 1359 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1); 1360 } else { 1361 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0); 1362 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1); 1363 } 1364 1365 // Swizzle the pixels to 8-bit planar. 1366 *lo = __lsx_vshuf_b(zeros, *lo, planar); // rrrrgggg bbbbaaaa 1367 *hi = __lsx_vshuf_b(zeros, *hi, planar); // RRRRGGGG BBBBAAAA 1368 __m128i rg = __lsx_vilvl_w(*hi, *lo), // rrrrRRRR ggggGGGG 1369 ba = __lsx_vilvh_w(*hi, *lo); // bbbbBBBB aaaaAAAA 1370 1371 // Unpack to 16-bit planar. 1372 __m128i r = __lsx_vilvl_b(zeros, rg), // r_r_r_r_ R_R_R_R_ 1373 g = __lsx_vilvh_b(zeros, rg), // g_g_g_g_ G_G_G_G_ 1374 b = __lsx_vilvl_b(zeros, ba), // b_b_b_b_ B_B_B_B_ 1375 a = __lsx_vilvh_b(zeros, ba); // a_a_a_a_ A_A_A_A_ 1376 1377 // Premultiply! 1378 r = scale(r, a); 1379 g = scale(g, a); 1380 b = scale(b, a); 1381 1382 // Repack into interlaced pixels. 1383 rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)); // rgrgrgrg RGRGRGRG 1384 ba = __lsx_vor_v(b, __lsx_vslli_h(a, 8)); // babababa BABABABA 1385 *lo = __lsx_vilvl_h(ba, rg); // rgbargba rgbargba 1386 *hi = __lsx_vilvh_h(ba, rg); // RGBARGBA RGBARGBA 1387 }; 1388 while (count >= 8) { 1389 __m128i lo = __lsx_vld(src ,0), 1390 hi = __lsx_vld(src ,16); 1391 1392 premul8(&lo, &hi); 1393 1394 __lsx_vst(lo, dst, 0); 1395 __lsx_vst(hi, dst, 16); 1396 1397 src += 8; 1398 dst += 8; 1399 count -= 8; 1400 } 1401 1402 if (count >= 4) { 1403 __m128i lo = __lsx_vld(src, 0), 1404 hi = __lsx_vldi(0); 1405 1406 premul8(&lo, &hi); 1407 1408 __lsx_vst(lo, dst, 0); 1409 1410 src += 4; 1411 dst += 4; 1412 count -= 4; 1413 } 1414 1415 // Call portable code to finish up the tail of [0,4) pixels. 1416 auto proc = kSwapRB ? 
RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; 1417 proc(dst, src, count); 1418} 1419 1420/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) { 1421 premul_should_swapRB(false, dst, src, count); 1422} 1423 1424/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) { 1425 premul_should_swapRB(true, dst, src, count); 1426} 1427 1428/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) { 1429 __m128i swapRB = __lsx_vldi(0); 1430 swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0704050603000102, 0); 1431 swapRB = __lsx_vinsgr2vr_d(swapRB, 0x0f0c0d0e0b08090a, 1); 1432 1433 while (count >= 4) { 1434 __m128i rgba = __lsx_vld(src, 0); 1435 __m128i bgra = __lsx_vshuf4i_b(rgba, 0xC6); 1436 __lsx_vst(bgra, dst, 0); 1437 1438 src += 4; 1439 dst += 4; 1440 count -= 4; 1441 } 1442 1443 RGBA_to_BGRA_portable(dst, src, count); 1444} 1445 1446/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) { 1447 while (count >= 8) { 1448 __m128i ga = __lsx_vld(src, 0); 1449 1450 __m128i gg = __lsx_vor_v(__lsx_vand_v(ga, __lsx_vreplgr2vr_h(0x00FF)), 1451 __lsx_vslli_h(ga, 8)); 1452 1453 __m128i ggga_lo = __lsx_vilvl_h(ga, gg); 1454 __m128i ggga_hi = __lsx_vilvh_h(ga, gg); 1455 1456 __lsx_vst(ggga_lo, dst, 0); 1457 __lsx_vst(ggga_hi, dst, 16); 1458 1459 src += 8*2; 1460 dst += 8; 1461 count -= 8; 1462 } 1463 1464 grayA_to_RGBA_portable(dst, src, count); 1465} 1466 1467/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) { 1468 while (count >= 8) { 1469 __m128i grayA = __lsx_vld(src, 0); 1470 1471 __m128i g0 = __lsx_vand_v(grayA, __lsx_vreplgr2vr_h(0x00FF)); 1472 __m128i a0 = __lsx_vsrli_h(grayA, 8); 1473 1474 // Premultiply 1475 g0 = scale(g0, a0); 1476 1477 __m128i gg = __lsx_vor_v(g0, __lsx_vslli_h(g0, 8)); 1478 __m128i ga = __lsx_vor_v(g0, __lsx_vslli_h(a0, 8)); 1479 1480 __m128i ggga_lo = __lsx_vilvl_h(ga, gg); 1481 __m128i ggga_hi = __lsx_vilvh_h(ga, gg); 1482 1483 __lsx_vst(ggga_lo, dst, 0); 1484 __lsx_vst(ggga_hi, dst, 16); 1485 1486 src += 8*2; 1487 dst += 8; 1488 count -= 8; 1489 } 1490 1491 grayA_to_rgbA_portable(dst, src, count); 1492} 1493 1494enum Format { kRGB1, kBGR1 }; 1495static void inverted_cmyk_to(Format format, uint32_t* dst, const uint32_t* src, int count) { 1496 auto convert8 = [=](__m128i *lo, __m128i* hi) { 1497 const __m128i zeros = __lsx_vldi(0); 1498 __m128i planar = __lsx_vldi(0); 1499 if (kBGR1 == format) { 1500 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010e0a0602, 0); 1501 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030c080400, 1); 1502 } else { 1503 planar = __lsx_vinsgr2vr_d(planar, 0x0d0905010c080400, 0); 1504 planar = __lsx_vinsgr2vr_d(planar, 0x0f0b07030e0a0602, 1); 1505 } 1506 1507 // Swizzle the pixels to 8-bit planar. 1508 *lo = __lsx_vshuf_b(zeros, *lo, planar); // ccccmmmm yyyykkkk 1509 *hi = __lsx_vshuf_b(zeros, *hi, planar); // CCCCMMMM YYYYKKKK 1510 __m128i cm = __lsx_vilvl_w(*hi, *lo), // ccccCCCC mmmmMMMM 1511 yk = __lsx_vilvh_w(*hi, *lo); // yyyyYYYY kkkkKKKK 1512 1513 // Unpack to 16-bit planar. 1514 __m128i c = __lsx_vilvl_b(zeros, cm), // c_c_c_c_ C_C_C_C_ 1515 m = __lsx_vilvh_b(zeros, cm), // m_m_m_m_ M_M_M_M_ 1516 y = __lsx_vilvl_b(zeros, yk), // y_y_y_y_ Y_Y_Y_Y_ 1517 k = __lsx_vilvh_b(zeros, yk); // k_k_k_k_ K_K_K_K_ 1518 1519 // Scale to r, g, b. 1520 __m128i r = scale(c, k), 1521 g = scale(m, k), 1522 b = scale(y, k); 1523 1524 // Repack into interlaced pixels. 
        //   rg = rgrgrgrg RGRGRGRG
        //   ba = b1b1b1b1 B1B1B1B1
        __m128i rg = __lsx_vor_v(r, __lsx_vslli_h(g, 8)),
                ba = __lsx_vor_v(b, __lsx_vreplgr2vr_h(0xff00));
        *lo = __lsx_vilvl_h(ba, rg);                             // rgb1rgb1 rgb1rgb1
        *hi = __lsx_vilvh_h(ba, rg);                             // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vld(src, 16);

        convert8(&lo, &hi);

        __lsx_vst(lo, dst, 0);
        __lsx_vst(hi, dst, 16);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = __lsx_vld(src, 0),
                hi = __lsx_vldi(0);

        convert8(&lo, &hi);

        __lsx_vst(lo, dst, 0);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kRGB1, dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to(kBGR1, dst, src, count);
}

/*not static*/ inline void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_BGRA_portable(dst, src, count);
}

#else
// -- No Opts --------------------------------------------------------------------------------

void rgbA_to_RGBA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_RGBA_portable(dst, src, count);
}

void rgbA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    rgbA_to_BGRA_portable(dst, src, count);
}

void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}
#endif

// Basically as above, but we found no benefit from AVX-512 for gray_to_RGB1.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16_t gray = vld1q_u8(src);

            // Set each of the color channels.
1636 uint8x16x4_t rgba; 1637 rgba.val[0] = gray; 1638 rgba.val[1] = gray; 1639 rgba.val[2] = gray; 1640 rgba.val[3] = vdupq_n_u8(0xFF); 1641 1642 // Store 16 pixels. 1643 vst4q_u8((uint8_t*) dst, rgba); 1644 src += 16; 1645 dst += 16; 1646 count -= 16; 1647 } 1648 if (count >= 8) { 1649 // Load 8 pixels. 1650 uint8x8_t gray = vld1_u8(src); 1651 1652 // Set each of the color channels. 1653 uint8x8x4_t rgba; 1654 rgba.val[0] = gray; 1655 rgba.val[1] = gray; 1656 rgba.val[2] = gray; 1657 rgba.val[3] = vdup_n_u8(0xFF); 1658 1659 // Store 8 pixels. 1660 vst4_u8((uint8_t*) dst, rgba); 1661 src += 8; 1662 dst += 8; 1663 count -= 8; 1664 } 1665 gray_to_RGB1_portable(dst, src, count); 1666 } 1667#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 1668 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1669 const __m256i alphas = _mm256_set1_epi8((uint8_t) 0xFF); 1670 while (count >= 32) { 1671 __m256i grays = _mm256_loadu_si256((const __m256i*) src); 1672 1673 __m256i gg_lo = _mm256_unpacklo_epi8(grays, grays); 1674 __m256i gg_hi = _mm256_unpackhi_epi8(grays, grays); 1675 __m256i ga_lo = _mm256_unpacklo_epi8(grays, alphas); 1676 __m256i ga_hi = _mm256_unpackhi_epi8(grays, alphas); 1677 1678 __m256i ggga0 = _mm256_unpacklo_epi16(gg_lo, ga_lo); 1679 __m256i ggga1 = _mm256_unpackhi_epi16(gg_lo, ga_lo); 1680 __m256i ggga2 = _mm256_unpacklo_epi16(gg_hi, ga_hi); 1681 __m256i ggga3 = _mm256_unpackhi_epi16(gg_hi, ga_hi); 1682 1683 // Shuffle for pixel reorder. 1684 // Note. 'p' stands for 'ggga' 1685 // Before shuffle: 1686 // ggga0 = p0 p1 p2 p3 | p16 p17 p18 p19 1687 // ggga1 = p4 p5 p6 p7 | p20 p21 p22 p23 1688 // ggga2 = p8 p9 p10 p11 | p24 p25 p26 p27 1689 // ggga3 = p12 p13 p14 p15 | p28 p29 p30 p31 1690 // 1691 // After shuffle: 1692 // ggga0_shuffle = p0 p1 p2 p3 | p4 p5 p6 p7 1693 // ggga1_shuffle = p8 p9 p10 p11 | p12 p13 p14 p15 1694 // ggga2_shuffle = p16 p17 p18 p19 | p20 p21 p22 p23 1695 // ggga3_shuffle = p24 p25 p26 p27 | p28 p29 p30 p31 1696 __m256i ggga0_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x20), 1697 ggga1_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x20), 1698 ggga2_shuffle = _mm256_permute2x128_si256(ggga0, ggga1, 0x31), 1699 ggga3_shuffle = _mm256_permute2x128_si256(ggga2, ggga3, 0x31); 1700 1701 _mm256_storeu_si256((__m256i*) (dst + 0), ggga0_shuffle); 1702 _mm256_storeu_si256((__m256i*) (dst + 8), ggga1_shuffle); 1703 _mm256_storeu_si256((__m256i*) (dst + 16), ggga2_shuffle); 1704 _mm256_storeu_si256((__m256i*) (dst + 24), ggga3_shuffle); 1705 1706 src += 32; 1707 dst += 32; 1708 count -= 32; 1709 } 1710 gray_to_RGB1_portable(dst, src, count); 1711 } 1712#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // TODO: just check >= SSE2? 
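    // As in the AVX2 version above: unpacking grays with itself yields G,G byte pairs and
    // unpacking grays with 0xFF yields G,FF pairs; interleaving those 16-bit units produces
    // the G,G,G,FF layout of an opaque RGBA pixel, four pixels per 128-bit store.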
1713 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1714 const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF); 1715 while (count >= 16) { 1716 __m128i grays = _mm_loadu_si128((const __m128i*) src); 1717 1718 __m128i gg_lo = _mm_unpacklo_epi8(grays, grays); 1719 __m128i gg_hi = _mm_unpackhi_epi8(grays, grays); 1720 __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas); 1721 __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas); 1722 1723 __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo); 1724 __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo); 1725 __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi); 1726 __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi); 1727 1728 _mm_storeu_si128((__m128i*) (dst + 0), ggga0); 1729 _mm_storeu_si128((__m128i*) (dst + 4), ggga1); 1730 _mm_storeu_si128((__m128i*) (dst + 8), ggga2); 1731 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); 1732 1733 src += 16; 1734 dst += 16; 1735 count -= 16; 1736 } 1737 gray_to_RGB1_portable(dst, src, count); 1738 } 1739#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX 1740 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1741 const __m256i alphas = __lasx_xvreplgr2vr_b(0xFF); 1742 while (count >= 32) { 1743 __m256i grays = __lasx_xvld(src, 0); 1744 1745 __m256i gg_lo = __lasx_xvilvl_b(grays, grays); 1746 __m256i gg_hi = __lasx_xvilvh_b(grays, grays); 1747 __m256i ga_lo = __lasx_xvilvl_b(alphas, grays); 1748 __m256i ga_hi = __lasx_xvilvh_b(alphas, grays); 1749 1750 __m256i ggga0 = __lasx_xvilvl_h(ga_lo, gg_lo); 1751 __m256i ggga1 = __lasx_xvilvh_h(ga_lo, gg_lo); 1752 __m256i ggga2 = __lasx_xvilvl_h(ga_hi, gg_hi); 1753 __m256i ggga3 = __lasx_xvilvh_h(ga_hi, gg_hi); 1754 1755 __m256i ggga_0 = __lasx_xvpermi_q(ggga0, ggga1, 0x02); 1756 __m256i ggga_1 = __lasx_xvpermi_q(ggga2, ggga3, 0x02); 1757 __m256i ggga_2 = __lasx_xvpermi_q(ggga0, ggga1, 0x13); 1758 __m256i ggga_3 = __lasx_xvpermi_q(ggga2, ggga3, 0x13); 1759 1760 __lasx_xvst(ggga_0, dst, 0); 1761 __lasx_xvst(ggga_1, dst, 32); 1762 __lasx_xvst(ggga_2, dst, 64); 1763 __lasx_xvst(ggga_3, dst, 96); 1764 1765 src += 32; 1766 dst += 32; 1767 count -= 32; 1768 } 1769 gray_to_RGB1_portable(dst, src, count); 1770 } 1771#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX 1772 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1773 const __m128i alphas = __lsx_vreplgr2vr_b(0xFF); 1774 while (count >= 16) { 1775 __m128i grays = __lsx_vld(src, 0); 1776 1777 __m128i gg_lo = __lsx_vilvl_b(grays, grays); 1778 __m128i gg_hi = __lsx_vilvh_b(grays, grays); 1779 __m128i ga_lo = __lsx_vilvl_b(alphas, grays); 1780 __m128i ga_hi = __lsx_vilvh_b(alphas, grays); 1781 1782 __m128i ggga0 = __lsx_vilvl_h(ga_lo, gg_lo); 1783 __m128i ggga1 = __lsx_vilvh_h(ga_lo, gg_lo); 1784 __m128i ggga2 = __lsx_vilvl_h(ga_hi, gg_hi); 1785 __m128i ggga3 = __lsx_vilvh_h(ga_hi, gg_hi); 1786 1787 __lsx_vst(ggga0, dst, 0); 1788 __lsx_vst(ggga1, dst, 16); 1789 __lsx_vst(ggga2, dst, 32); 1790 __lsx_vst(ggga3, dst, 48); 1791 1792 src += 16; 1793 dst += 16; 1794 count -= 16; 1795 } 1796 gray_to_RGB1_portable(dst, src, count); 1797 } 1798#else 1799 void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) { 1800 gray_to_RGB1_portable(dst, src, count); 1801 } 1802#endif 1803 1804// Again as above, this time not even finding benefit from AVX2 for RGB_to_{RGB,BGR}1. 
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
#if defined(SK_ARM_HAS_NEON)
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        while (count >= 16) {
            // Load 16 pixels.
            uint8x16x3_t rgb = vld3q_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x16x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdupq_n_u8(0xFF);

            // Store 16 pixels.
            vst4q_u8((uint8_t*) dst, rgba);
            src += 16*3;
            dst += 16;
            count -= 16;
        }

        if (count >= 8) {
            // Load 8 pixels.
            uint8x8x3_t rgb = vld3_u8(src);

            // Insert an opaque alpha channel and swap if needed.
            uint8x8x4_t rgba;
            if (kSwapRB) {
                rgba.val[0] = rgb.val[2];
                rgba.val[2] = rgb.val[0];
            } else {
                rgba.val[0] = rgb.val[0];
                rgba.val[2] = rgb.val[2];
            }
            rgba.val[1] = rgb.val[1];
            rgba.val[3] = vdup_n_u8(0xFF);

            // Store 8 pixels.
            vst4_u8((uint8_t*) dst, rgba);
            src += 8*3;
            dst += 8;
            count -= 8;
        }

        // Call portable code to finish up the tail of [0,8) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        __m128i expand;
        const uint8_t X = 0xFF;  // Used as a placeholder. The value of X is irrelevant.
        if (kSwapRB) {
            expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
        } else {
            expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
        }

        while (count >= 6) {
            // Load a vector. While this actually contains 5 pixels plus an
            // extra component, we will discard all but the first four pixels on
            // this iteration.
            __m128i rgb = _mm_loadu_si128((const __m128i*) src);

            // Expand the first four pixels to RGBX and then mask to RGB(FF).
            __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

            // Store 4 pixels.
            _mm_storeu_si128((__m128i*) dst, rgba);

            src += 4*3;
            dst += 4;
            count -= 4;
        }

        // Call portable code to finish up the tail of [0,6) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }

    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xFF000000);

        __m256i expand = __lasx_xvldi(0);
        if (kSwapRB) {
            expand = __lasx_xvinsgr2vr_d(expand, 0x0503040502000102, 0);
            expand = __lasx_xvinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
            expand = __lasx_xvinsgr2vr_d(expand, 0x110f10110e0c0d0e, 2);
            expand = __lasx_xvinsgr2vr_d(expand, 0x1715161714121314, 3);
        } else {
            expand = __lasx_xvinsgr2vr_d(expand, 0x0505040302020100, 0);
            expand = __lasx_xvinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
            expand = __lasx_xvinsgr2vr_d(expand, 0x1111100f0e0e0d0c, 2);
            expand = __lasx_xvinsgr2vr_d(expand, 0x1717161514141312, 3);
        }

        while (count >= 8) {
            // Load a vector. While this actually contains 10 pixels plus two
            // extra components, we will discard all but the first eight pixels
            // on this iteration.
            __m256i rgb = __lasx_xvld(src, 0);
            __m256i rgb_l = __lasx_xvpermi_d(rgb, 0x44);
            __m256i rgb_h = __lasx_xvpermi_d(rgb, 0xEE);

            // Expand the first eight pixels to RGBX and then mask to RGB(FF).
            __m256i rgba = __lasx_xvor_v(__lasx_xvshuf_b(rgb_h, rgb_l, expand), alphaMask);

            // Store 8 pixels.
            __lasx_xvst(rgba, dst, 0);

            src += 4*6;
            dst += 8;
            count -= 8;
        }

        // Call portable code to finish up the tail of [0,8) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }
    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
    static void insert_alpha_should_swaprb(bool kSwapRB,
                                           uint32_t dst[], const uint8_t* src, int count) {
        const __m128i alphaMask = __lsx_vreplgr2vr_w(0xFF000000);

        __m128i expand = __lsx_vldi(0);
        if (kSwapRB) {
            expand = __lsx_vinsgr2vr_d(expand, 0x0503040502000102, 0);
            expand = __lsx_vinsgr2vr_d(expand, 0x0b090a0b08060708, 1);
        } else {
            expand = __lsx_vinsgr2vr_d(expand, 0x0505040302020100, 0);
            expand = __lsx_vinsgr2vr_d(expand, 0x0b0b0a0908080706, 1);
        }

        while (count >= 6) {
            // Load a vector. While this actually contains 5 pixels plus an
            // extra component, we will discard all but the first four pixels on
            // this iteration.
            __m128i rgb = __lsx_vld(src, 0);

            // Expand the first four pixels to RGBX and then mask to RGB(FF).
            __m128i rgba = __lsx_vor_v(__lsx_vshuf_b(rgb, rgb, expand), alphaMask);

            // Store 4 pixels.
            __lsx_vst(rgba, dst, 0);

            src += 4*3;
            dst += 4;
            count -= 4;
        }

        // Call portable code to finish up the tail of [0,6) pixels.
        auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
        proc(dst, src, count);
    }
    /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(false, dst, src, count);
    }
    /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        insert_alpha_should_swaprb(true, dst, src, count);
    }
#else
    void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_RGB1_portable(dst, src, count);
    }
    void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
        RGB_to_BGR1_portable(dst, src, count);
    }
#endif

}  // namespace SK_OPTS_NS

#undef SI