1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 // Intentionally NO #pragma once... included multiple times.
9
10 // This file is included from skcms.cc in a namespace with some pre-defines:
11 // - N: SIMD width of all vectors; 1, 4, 8 or 16 (preprocessor define)
12 // - V<T>: a template to create a vector of N T's.
13
14 using F = V<float>;
15 using I32 = V<int32_t>;
16 using U64 = V<uint64_t>;
17 using U32 = V<uint32_t>;
18 using U16 = V<uint16_t>;
19 using U8 = V<uint8_t>;
20
21 #if defined(__GNUC__) && !defined(__clang__)
22 // GCC is kind of weird, not allowing vector = scalar directly.
23 static constexpr F F0 = F() + 0.0f,
24 F1 = F() + 1.0f,
25 FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
26 #else
27 static constexpr F F0 = 0.0f,
28 F1 = 1.0f,
29 FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
30 #endif
31
32 // Instead of checking __AVX__ below, we'll check USING_AVX.
33 // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
34 // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
35
36 #if !defined(USING_AVX) && N == 8 && defined(__AVX__)
37 #define USING_AVX
38 #endif
39 #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
40 #define USING_AVX_F16C
41 #endif
42 #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
43 #define USING_AVX2
44 #endif
45 #if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
46 #define USING_AVX512F
47 #endif
48
49 // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
50 // This is more for organizational clarity... skcms.cc doesn't force these.
51 #if N > 1 && defined(__ARM_NEON)
52 #define USING_NEON
53
54 // We have to use two different mechanisms to enable the f16 conversion intrinsics:
55 #if defined(__clang__)
56 // Clang's arm_neon.h guards them with the FP hardware bit:
57 #if __ARM_FP & 2
58 #define USING_NEON_F16C
59 #endif
60 #elif defined(__GNUC__)
61 // GCC's arm_neon.h guards them with the FP16 format macros (IEEE and ALTERNATIVE).
62 // We don't actually want the alternative format - we're reading/writing IEEE f16 values.
63 #if defined(__ARM_FP16_FORMAT_IEEE)
64 #define USING_NEON_F16C
65 #endif
66 #endif
67 #endif
68
69 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
70 // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
71 #if defined(USING_NEON) && defined(__clang__)
72 #pragma clang diagnostic ignored "-Wvector-conversion"
73 #endif
74
75 // GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
76 // You'd see warnings like, "using AVX even though AVX is not enabled".
77 // We stifle these warnings; our helpers that return U64 are always inlined.
78 #if defined(__SSE__) && defined(__GNUC__)
79 #if !defined(__has_warning)
80 #pragma GCC diagnostic ignored "-Wpsabi"
81 #elif __has_warning("-Wpsabi")
82 #pragma GCC diagnostic ignored "-Wpsabi"
83 #endif
84 #endif
85
86 // We tag most helper functions as SI, to enforce good code generation
87 // but also work around what we think is a bug in GCC: when targeting 32-bit
88 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
89 // MMX mm0 register, which seems to mess with unrelated code that later uses
90 // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
91 #if defined(__clang__) || defined(__GNUC__)
92 #define SI static inline __attribute__((always_inline))
93 #else
94 #define SI static inline
95 #endif
96
97 template <typename T, typename P>
load(const P * ptr)98 SI T load(const P* ptr) {
99 T val;
100 memcpy(&val, ptr, sizeof(val));
101 return val;
102 }
103 template <typename T, typename P>
store(P * ptr,const T & val)104 SI void store(P* ptr, const T& val) {
105 memcpy(ptr, &val, sizeof(val));
106 }
107
108 // (T)v is a cast when N == 1 and a bit-pun when N>1,
109 // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
110 template <typename D, typename S>
cast(const S & v)111 SI D cast(const S& v) {
112 #if N == 1
113 return (D)v;
114 #elif defined(__clang__)
115 return __builtin_convertvector(v, D);
116 #else
117 D d;
118 for (int i = 0; i < N; i++) {
119 d[i] = v[i];
120 }
121 return d;
122 #endif
123 }
124
125 template <typename D, typename S>
bit_pun(const S & v)126 SI D bit_pun(const S& v) {
127 static_assert(sizeof(D) == sizeof(v), "");
128 return load<D>(&v);
129 }
130
131 // When we convert from float to fixed point, it's very common to want to round,
132 // and for some reason compilers generate better code when converting to int32_t.
133 // To serve both those ends, we use this function to_fixed() instead of direct cast().
to_fixed(F f)134 SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
135
136 // Sometimes we do something crazy on one branch of a conditonal,
137 // like divide by zero or convert a huge float to an integer,
138 // but then harmlessly select the other side. That trips up N==1
139 // sanitizer builds, so we make if_then_else() a macro to avoid
140 // evaluating the unused side.
141
142 #if N == 1
143 #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
144 #else
145 template <typename C, typename T>
if_then_else(C cond,T t,T e)146 SI T if_then_else(C cond, T t, T e) {
147 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
148 (~cond & bit_pun<C>(e)) );
149 }
150 #endif
151
152
F_from_Half(U16 half)153 SI F F_from_Half(U16 half) {
154 #if defined(USING_NEON_F16C)
155 return vcvt_f32_f16((float16x4_t)half);
156 #elif defined(USING_AVX512F)
157 return (F)_mm512_cvtph_ps((__m256i)half);
158 #elif defined(USING_AVX_F16C)
159 typedef int16_t __attribute__((vector_size(16))) I16;
160 return __builtin_ia32_vcvtph2ps256((I16)half);
161 #else
162 U32 wide = cast<U32>(half);
163 // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
164 U32 s = wide & 0x8000,
165 em = wide ^ s;
166
167 // Constructing the float is easy if the half is not denormalized.
168 F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
169
170 // Simply flush all denorm half floats to zero.
171 return if_then_else(em < 0x0400, F0, norm);
172 #endif
173 }
174
175 #if defined(__clang__)
176 // The -((127-15)<<10) underflows that side of the math when
177 // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
178 __attribute__((no_sanitize("unsigned-integer-overflow")))
179 #endif
Half_from_F(F f)180 SI U16 Half_from_F(F f) {
181 #if defined(USING_NEON_F16C)
182 return (U16)vcvt_f16_f32(f);
183 #elif defined(USING_AVX512F)
184 return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
185 #elif defined(USING_AVX_F16C)
186 return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
187 #else
188 // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
189 U32 sem = bit_pun<U32>(f),
190 s = sem & 0x80000000,
191 em = sem ^ s;
192
193 // For simplicity we flush denorm half floats (including all denorm floats) to zero.
194 return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
195 , (s>>16) + (em>>13) - ((127-15)<<10)));
196 #endif
197 }
198
199 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
200 #if defined(USING_NEON)
swap_endian_16(U16 v)201 SI U16 swap_endian_16(U16 v) {
202 return (U16)vrev16_u8((uint8x8_t) v);
203 }
204 #endif
205
swap_endian_16x4(const U64 & rgba)206 SI U64 swap_endian_16x4(const U64& rgba) {
207 return (rgba & 0x00ff00ff00ff00ff) << 8
208 | (rgba & 0xff00ff00ff00ff00) >> 8;
209 }
210
211 #if defined(USING_NEON)
min_(F x,F y)212 SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
max_(F x,F y)213 SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
214 #else
min_(F x,F y)215 SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
max_(F x,F y)216 SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
217 #endif
218
floor_(F x)219 SI F floor_(F x) {
220 #if N == 1
221 return floorf_(x);
222 #elif defined(__aarch64__)
223 return vrndmq_f32(x);
224 #elif defined(USING_AVX512F)
225 // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
226 // and integer santizer catches that this implicit cast changes the
227 // value from -1 to 65535. We'll cast manually to work around it.
228 // Read this as `return _mm512_floor_ps(x)`.
229 return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
230 #elif defined(USING_AVX)
231 return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
232 #elif defined(__SSE4_1__)
233 return _mm_floor_ps(x);
234 #else
235 // Round trip through integers with a truncating cast.
236 F roundtrip = cast<F>(cast<I32>(x));
237 // If x is negative, truncating gives the ceiling instead of the floor.
238 return roundtrip - if_then_else(roundtrip > x, F1, F0);
239
240 // This implementation fails for values of x that are outside
241 // the range an integer can represent. We expect most x to be small.
242 #endif
243 }
244
approx_log2(F x)245 SI F approx_log2(F x) {
246 // The first approximation of log2(x) is its exponent 'e', minus 127.
247 I32 bits = bit_pun<I32>(x);
248
249 F e = cast<F>(bits) * (1.0f / (1<<23));
250
251 // If we use the mantissa too we can refine the error signficantly.
252 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
253
254 return e - 124.225514990f
255 - 1.498030302f*m
256 - 1.725879990f/(0.3520887068f + m);
257 }
258
approx_log(F x)259 SI F approx_log(F x) {
260 const float ln2 = 0.69314718f;
261 return ln2 * approx_log2(x);
262 }
263
approx_exp2(F x)264 SI F approx_exp2(F x) {
265 F fract = x - floor_(x);
266
267 F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
268 - 1.490129070f*fract
269 + 27.728023300f/(4.84252568f - fract));
270 I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
271
272 return bit_pun<F>(bits);
273 }
274
approx_pow(F x,float y)275 SI F approx_pow(F x, float y) {
276 return if_then_else((x == F0) | (x == F1), x
277 , approx_exp2(approx_log2(x) * y));
278 }
279
approx_exp(F x)280 SI F approx_exp(F x) {
281 const float log2_e = 1.4426950408889634074f;
282 return approx_exp2(log2_e * x);
283 }
284
strip_sign(F x,U32 * sign)285 SI F strip_sign(F x, U32* sign) {
286 U32 bits = bit_pun<U32>(x);
287 *sign = bits & 0x80000000;
288 return bit_pun<F>(bits ^ *sign);
289 }
290
apply_sign(F x,U32 sign)291 SI F apply_sign(F x, U32 sign) {
292 return bit_pun<F>(sign | bit_pun<U32>(x));
293 }
294
295 // Return tf(x).
apply_tf(const skcms_TransferFunction * tf,F x)296 SI F apply_tf(const skcms_TransferFunction* tf, F x) {
297 // Peel off the sign bit and set x = |x|.
298 U32 sign;
299 x = strip_sign(x, &sign);
300
301 // The transfer function has a linear part up to d, exponential at d and after.
302 F v = if_then_else(x < tf->d, tf->c*x + tf->f
303 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
304
305 // Tack the sign bit back on.
306 return apply_sign(v, sign);
307 }
308
309 // Return the gamma function (|x|^G with the original sign re-applied to x).
apply_gamma(const skcms_TransferFunction * tf,F x)310 SI F apply_gamma(const skcms_TransferFunction* tf, F x) {
311 U32 sign;
312 x = strip_sign(x, &sign);
313 return apply_sign(approx_pow(x, tf->g), sign);
314 }
315
apply_pq(const skcms_TransferFunction * tf,F x)316 SI F apply_pq(const skcms_TransferFunction* tf, F x) {
317 U32 bits = bit_pun<U32>(x),
318 sign = bits & 0x80000000;
319 x = bit_pun<F>(bits ^ sign);
320
321 F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
322 / (tf->d + tf->e * approx_pow(x, tf->c)),
323 tf->f);
324
325 return bit_pun<F>(sign | bit_pun<U32>(v));
326 }
327
apply_hlg(const skcms_TransferFunction * tf,F x)328 SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
329 const float R = tf->a, G = tf->b,
330 a = tf->c, b = tf->d, c = tf->e,
331 K = tf->f + 1;
332 U32 bits = bit_pun<U32>(x),
333 sign = bits & 0x80000000;
334 x = bit_pun<F>(bits ^ sign);
335
336 F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
337 , approx_exp((x-c)*a) + b);
338
339 return K*bit_pun<F>(sign | bit_pun<U32>(v));
340 }
341
apply_hlginv(const skcms_TransferFunction * tf,F x)342 SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
343 const float R = tf->a, G = tf->b,
344 a = tf->c, b = tf->d, c = tf->e,
345 K = tf->f + 1;
346 U32 bits = bit_pun<U32>(x),
347 sign = bits & 0x80000000;
348 x = bit_pun<F>(bits ^ sign);
349 x /= K;
350
351 F v = if_then_else(x <= 1, R * approx_pow(x, G)
352 , a * approx_log(x - b) + c);
353
354 return bit_pun<F>(sign | bit_pun<U32>(v));
355 }
356
357
358 // Strided loads and stores of N values, starting from p.
359 template <typename T, typename P>
load_3(const P * p)360 SI T load_3(const P* p) {
361 #if N == 1
362 return (T)p[0];
363 #elif N == 4
364 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
365 #elif N == 8
366 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
367 #elif N == 16
368 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
369 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
370 #endif
371 }
372
373 template <typename T, typename P>
load_4(const P * p)374 SI T load_4(const P* p) {
375 #if N == 1
376 return (T)p[0];
377 #elif N == 4
378 return T{p[ 0],p[ 4],p[ 8],p[12]};
379 #elif N == 8
380 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
381 #elif N == 16
382 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
383 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
384 #endif
385 }
386
387 template <typename T, typename P>
store_3(P * p,const T & v)388 SI void store_3(P* p, const T& v) {
389 #if N == 1
390 p[0] = v;
391 #elif N == 4
392 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
393 #elif N == 8
394 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
395 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
396 #elif N == 16
397 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
398 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
399 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
400 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
401 #endif
402 }
403
404 template <typename T, typename P>
store_4(P * p,const T & v)405 SI void store_4(P* p, const T& v) {
406 #if N == 1
407 p[0] = v;
408 #elif N == 4
409 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
410 #elif N == 8
411 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
412 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
413 #elif N == 16
414 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
415 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
416 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
417 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
418 #endif
419 }
420
421
gather_8(const uint8_t * p,I32 ix)422 SI U8 gather_8(const uint8_t* p, I32 ix) {
423 #if N == 1
424 U8 v = p[ix];
425 #elif N == 4
426 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
427 #elif N == 8
428 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
429 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
430 #elif N == 16
431 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
432 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
433 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
434 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
435 #endif
436 return v;
437 }
438
gather_16(const uint8_t * p,I32 ix)439 SI U16 gather_16(const uint8_t* p, I32 ix) {
440 // Load the i'th 16-bit value from p.
441 auto load_16 = [p](int i) {
442 return load<uint16_t>(p + 2*i);
443 };
444 #if N == 1
445 U16 v = load_16(ix);
446 #elif N == 4
447 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
448 #elif N == 8
449 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
450 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
451 #elif N == 16
452 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
453 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
454 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
455 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
456 #endif
457 return v;
458 }
459
gather_32(const uint8_t * p,I32 ix)460 SI U32 gather_32(const uint8_t* p, I32 ix) {
461 // Load the i'th 32-bit value from p.
462 auto load_32 = [p](int i) {
463 return load<uint32_t>(p + 4*i);
464 };
465 #if N == 1
466 U32 v = load_32(ix);
467 #elif N == 4
468 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
469 #elif N == 8
470 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
471 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
472 #elif N == 16
473 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
474 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
475 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
476 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
477 #endif
478 // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
479 return v;
480 }
481
gather_24(const uint8_t * p,I32 ix)482 SI U32 gather_24(const uint8_t* p, I32 ix) {
483 // First, back up a byte. Any place we're gathering from has a safe junk byte to read
484 // in front of it, either a previous table value, or some tag metadata.
485 p -= 1;
486
487 // Load the i'th 24-bit value from p, and 1 extra byte.
488 auto load_24_32 = [p](int i) {
489 return load<uint32_t>(p + 3*i);
490 };
491
492 // Now load multiples of 4 bytes (a junk byte, then r,g,b).
493 #if N == 1
494 U32 v = load_24_32(ix);
495 #elif N == 4
496 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
497 #elif N == 8 && !defined(USING_AVX2)
498 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
499 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
500 #elif N == 8
501 (void)load_24_32;
502 // The gather instruction here doesn't need any particular alignment,
503 // but the intrinsic takes a const int*.
504 const int* p4 = bit_pun<const int*>(p);
505 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
506 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
507 #if defined(__clang__)
508 U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
509 #elif defined(__GNUC__)
510 U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
511 #endif
512 #elif N == 16
513 (void)load_24_32;
514 // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
515 // And AVX-512 swapped the order of arguments. :/
516 const int* p4 = bit_pun<const int*>(p);
517 U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
518 #endif
519
520 // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
521 return v >> 8;
522 }
523
524 #if !defined(__arm__)
gather_48(const uint8_t * p,I32 ix,U64 * v)525 SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
526 // As in gather_24(), with everything doubled.
527 p -= 2;
528
529 // Load the i'th 48-bit value from p, and 2 extra bytes.
530 auto load_48_64 = [p](int i) {
531 return load<uint64_t>(p + 6*i);
532 };
533
534 #if N == 1
535 *v = load_48_64(ix);
536 #elif N == 4
537 *v = U64{
538 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
539 };
540 #elif N == 8 && !defined(USING_AVX2)
541 *v = U64{
542 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
543 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
544 };
545 #elif N == 8
546 (void)load_48_64;
547 typedef int32_t __attribute__((vector_size(16))) Half_I32;
548 typedef long long __attribute__((vector_size(32))) Half_I64;
549
550 // The gather instruction here doesn't need any particular alignment,
551 // but the intrinsic takes a const long long*.
552 const long long int* p8 = bit_pun<const long long int*>(p);
553
554 Half_I64 zero = { 0, 0, 0, 0},
555 mask = {-1,-1,-1,-1};
556
557 ix *= 6;
558 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
559 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
560
561 #if defined(__clang__)
562 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
563 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
564 #elif defined(__GNUC__)
565 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
566 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
567 #endif
568 store((char*)v + 0, lo);
569 store((char*)v + 32, hi);
570 #elif N == 16
571 (void)load_48_64;
572 const long long int* p8 = bit_pun<const long long int*>(p);
573 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
574 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
575 store((char*)v + 0, lo);
576 store((char*)v + 64, hi);
577 #endif
578
579 *v >>= 16;
580 }
581 #endif
582
F_from_U8(U8 v)583 SI F F_from_U8(U8 v) {
584 return cast<F>(v) * (1/255.0f);
585 }
586
F_from_U16_BE(U16 v)587 SI F F_from_U16_BE(U16 v) {
588 // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
589 // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
590 U16 lo = (v >> 8),
591 hi = (v << 8) & 0xffff;
592 return cast<F>(lo|hi) * (1/65535.0f);
593 }
594
U16_from_F(F v)595 SI U16 U16_from_F(F v) {
596 // 65535 == inf in FP16, so promote to FP32 before converting.
597 return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
598 }
599
minus_1_ulp(F v)600 SI F minus_1_ulp(F v) {
601 return bit_pun<F>( bit_pun<U32>(v) - 1 );
602 }
603
table(const skcms_Curve * curve,F v)604 SI F table(const skcms_Curve* curve, F v) {
605 // Clamp the input to [0,1], then scale to a table index.
606 F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
607
608 // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
609 I32 lo = cast<I32>( ix ),
610 hi = cast<I32>(minus_1_ulp(ix+1.0f));
611 F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
612
613 // TODO: can we load l and h simultaneously? Each entry in 'h' is either
614 // the same as in 'l' or adjacent. We have a rough idea that's it'd always be safe
615 // to read adjacent entries and perhaps underflow the table by a byte or two
616 // (it'd be junk, but always safe to read). Not sure how to lerp yet.
617 F l,h;
618 if (curve->table_8) {
619 l = F_from_U8(gather_8(curve->table_8, lo));
620 h = F_from_U8(gather_8(curve->table_8, hi));
621 } else {
622 l = F_from_U16_BE(gather_16(curve->table_16, lo));
623 h = F_from_U16_BE(gather_16(curve->table_16, hi));
624 }
625 return l + (h-l)*t;
626 }
627
sample_clut_8(const uint8_t * grid_8,I32 ix,F * r,F * g,F * b)628 SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
629 U32 rgb = gather_24(grid_8, ix);
630
631 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
632 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
633 *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
634 }
635
sample_clut_8(const uint8_t * grid_8,I32 ix,F * r,F * g,F * b,F * a)636 SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
637 // TODO: don't forget to optimize gather_32().
638 U32 rgba = gather_32(grid_8, ix);
639
640 *r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
641 *g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
642 *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
643 *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
644 }
645
sample_clut_16(const uint8_t * grid_16,I32 ix,F * r,F * g,F * b)646 SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
647 #if defined(__arm__)
648 // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
649 *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
650 *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
651 *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
652 #else
653 // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
654 U64 rgb;
655 gather_48(grid_16, ix, &rgb);
656 rgb = swap_endian_16x4(rgb);
657
658 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
659 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
660 *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
661 #endif
662 }
663
sample_clut_16(const uint8_t * grid_16,I32 ix,F * r,F * g,F * b,F * a)664 SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
665 // TODO: gather_64()-based fast path?
666 *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
667 *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
668 *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
669 *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
670 }
671
clut(uint32_t input_channels,uint32_t output_channels,const uint8_t grid_points[4],const uint8_t * grid_8,const uint8_t * grid_16,F * r,F * g,F * b,F * a)672 static void clut(uint32_t input_channels, uint32_t output_channels,
673 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
674 F* r, F* g, F* b, F* a) {
675
676 const int dim = (int)input_channels;
677 assert (0 < dim && dim <= 4);
678 assert (output_channels == 3 ||
679 output_channels == 4);
680
681 // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
682 I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
683 F weight[8]; // Weight for each contribution, again first low, then high.
684
685 // O(dim) work first: calculate index,weight from r,g,b,a.
686 const F inputs[] = { *r,*g,*b,*a };
687 for (int i = dim-1, stride = 1; i >= 0; i--) {
688 // x is where we logically want to sample the grid in the i-th dimension.
689 F x = inputs[i] * (float)(grid_points[i] - 1);
690
691 // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
692 I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
693 hi = cast<I32>(minus_1_ulp(x+1.0f));
694 // Notice how we fold in the accumulated stride across previous dimensions here.
695 index[i+0] = lo * stride;
696 index[i+4] = hi * stride;
697 stride *= grid_points[i];
698
699 // We'll interpolate between those two integer grid points by t.
700 F t = x - cast<F>(lo); // i.e. fract(x)
701 weight[i+0] = 1-t;
702 weight[i+4] = t;
703 }
704
705 *r = *g = *b = F0;
706 if (output_channels == 4) {
707 *a = F0;
708 }
709
710 // We'll sample 2^dim == 1<<dim table entries per pixel,
711 // in all combinations of low and high in each dimension.
712 for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
713
714 // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
715 // where 0 selects the low index contribution and its weight 1-t,
716 // or 4 the high index contribution and its weight t.
717
718 // Since 0<dim≤4, we can always just start off with the 0-th channel,
719 // then handle the others conditionally.
720 I32 ix = index [0 + (combo&1)*4];
721 F w = weight[0 + (combo&1)*4];
722
723 switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
724 case 3: ix += index [3 + (combo&8)/2];
725 w *= weight[3 + (combo&8)/2];
726 SKCMS_FALLTHROUGH;
727 // fall through
728
729 case 2: ix += index [2 + (combo&4)*1];
730 w *= weight[2 + (combo&4)*1];
731 SKCMS_FALLTHROUGH;
732 // fall through
733
734 case 1: ix += index [1 + (combo&2)*2];
735 w *= weight[1 + (combo&2)*2];
736 }
737
738 F R,G,B,A=F0;
739 if (output_channels == 3) {
740 if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
741 else { sample_clut_16(grid_16,ix, &R,&G,&B); }
742 } else {
743 if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
744 else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
745 }
746 *r += w*R;
747 *g += w*G;
748 *b += w*B;
749 *a += w*A;
750 }
751 }
752
clut(const skcms_A2B * a2b,F * r,F * g,F * b,F a)753 static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
754 clut(a2b->input_channels, a2b->output_channels,
755 a2b->grid_points, a2b->grid_8, a2b->grid_16,
756 r,g,b,&a);
757 }
clut(const skcms_B2A * b2a,F * r,F * g,F * b,F * a)758 static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
759 clut(b2a->input_channels, b2a->output_channels,
760 b2a->grid_points, b2a->grid_8, b2a->grid_16,
761 r,g,b,a);
762 }
763
764 struct NoCtx {};
765
766 struct Ctx {
767 const void* fArg;
NoCtxCtx768 operator NoCtx() { return NoCtx{}; }
769 template <typename T> operator T*() { return (const T*)fArg; }
770 };
771
772 #define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
773 SKCMS_MAYBE_UNUSED char* dst, \
774 SKCMS_MAYBE_UNUSED F MAYBE_REF r, \
775 SKCMS_MAYBE_UNUSED F MAYBE_REF g, \
776 SKCMS_MAYBE_UNUSED F MAYBE_REF b, \
777 SKCMS_MAYBE_UNUSED F MAYBE_REF a, \
778 SKCMS_MAYBE_UNUSED int i
779
780 #if SKCMS_HAS_MUSTTAIL
781
782 // Stages take a stage list, and each stage is responsible for tail-calling the next one.
783 //
784 // Unfortunately, we can't declare a StageFn as a function pointer which takes a pointer to
785 // another StageFn; declaring this leads to a circular dependency. To avoid this, StageFn is
786 // wrapped in a single-element `struct StageList` which we are able to forward-declare.
787 struct StageList;
788 using StageFn = void (*)(StageList stages, const void** ctx, STAGE_PARAMS());
789 struct StageList {
790 const StageFn* fn;
791 };
792
793 #define DECLARE_STAGE(name, arg, CALL_NEXT) \
794 SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
795 \
796 SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
797 Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i); \
798 ++list.fn; ++ctx; \
799 CALL_NEXT; \
800 } \
801 \
802 SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
803
804 #define STAGE(name, arg) \
805 DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
806 r, g, b, a, i))
807
808 #define FINAL_STAGE(name, arg) \
809 DECLARE_STAGE(name, arg, /* Stop executing stages and return to the caller. */)
810
811 #else
812
813 #define DECLARE_STAGE(name, arg) \
814 SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
815 \
816 SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
817 Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
818 } \
819 \
820 SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
821
822 #define STAGE(name, arg) DECLARE_STAGE(name, arg)
823 #define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)
824
825 #endif
826
STAGE(load_a8,NoCtx)827 STAGE(load_a8, NoCtx) {
828 a = F_from_U8(load<U8>(src + 1*i));
829 }
830
STAGE(load_g8,NoCtx)831 STAGE(load_g8, NoCtx) {
832 r = g = b = F_from_U8(load<U8>(src + 1*i));
833 }
834
STAGE(load_4444,NoCtx)835 STAGE(load_4444, NoCtx) {
836 U16 abgr = load<U16>(src + 2*i);
837
838 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
839 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
840 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
841 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
842 }
843
STAGE(load_565,NoCtx)844 STAGE(load_565, NoCtx) {
845 U16 rgb = load<U16>(src + 2*i);
846
847 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
848 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
849 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
850 }
851
STAGE(load_888,NoCtx)852 STAGE(load_888, NoCtx) {
853 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
854 #if defined(USING_NEON)
855 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
856 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
857 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
858 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
859 v = vld3_lane_u8(rgb+0, v, 0);
860 v = vld3_lane_u8(rgb+3, v, 2);
861 v = vld3_lane_u8(rgb+6, v, 4);
862 v = vld3_lane_u8(rgb+9, v, 6);
863
864 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
865 // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
866 // ARMv7 and ARMv8 impls.)
867 r = cast<F>((U16)v.val[0]) * (1/255.0f);
868 g = cast<F>((U16)v.val[1]) * (1/255.0f);
869 b = cast<F>((U16)v.val[2]) * (1/255.0f);
870 #else
871 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
872 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
873 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
874 #endif
875 }
876
STAGE(load_8888,NoCtx)877 STAGE(load_8888, NoCtx) {
878 U32 rgba = load<U32>(src + 4*i);
879
880 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
881 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
882 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
883 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
884 }
885
STAGE(load_1010102,NoCtx)886 STAGE(load_1010102, NoCtx) {
887 U32 rgba = load<U32>(src + 4*i);
888
889 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
890 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
891 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
892 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
893 }
894
STAGE(load_101010x_XR,NoCtx)895 STAGE(load_101010x_XR, NoCtx) {
896 static constexpr float min = -0.752941f;
897 static constexpr float max = 1.25098f;
898 static constexpr float range = max - min;
899 U32 rgba = load<U32>(src + 4*i);
900 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
901 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
902 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
903 }
904
STAGE(load_10101010_XR,NoCtx)905 STAGE(load_10101010_XR, NoCtx) {
906 static constexpr float min = -0.752941f;
907 static constexpr float max = 1.25098f;
908 static constexpr float range = max - min;
909 U64 rgba = load<U64>(src + 8 * i);
910 r = cast<F>((rgba >> (0+6)) & 0x3ff) * (1/1023.0f) * range + min;
911 g = cast<F>((rgba >> (16+6)) & 0x3ff) * (1/1023.0f) * range + min;
912 b = cast<F>((rgba >> (32+6)) & 0x3ff) * (1/1023.0f) * range + min;
913 a = cast<F>((rgba >> (48+6)) & 0x3ff) * (1/1023.0f) * range + min;
914 }
915
STAGE(load_161616LE,NoCtx)916 STAGE(load_161616LE, NoCtx) {
917 uintptr_t ptr = (uintptr_t)(src + 6*i);
918 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
919 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
920 #if defined(USING_NEON)
921 uint16x4x3_t v = vld3_u16(rgb);
922 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
923 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
924 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
925 #else
926 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
927 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
928 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
929 #endif
930 }
931
STAGE(load_16161616LE,NoCtx)932 STAGE(load_16161616LE, NoCtx) {
933 uintptr_t ptr = (uintptr_t)(src + 8*i);
934 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
935 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
936 #if defined(USING_NEON)
937 uint16x4x4_t v = vld4_u16(rgba);
938 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
939 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
940 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
941 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
942 #else
943 U64 px = load<U64>(rgba);
944
945 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
946 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
947 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
948 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
949 #endif
950 }
951
STAGE(load_161616BE,NoCtx)952 STAGE(load_161616BE, NoCtx) {
953 uintptr_t ptr = (uintptr_t)(src + 6*i);
954 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
955 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
956 #if defined(USING_NEON)
957 uint16x4x3_t v = vld3_u16(rgb);
958 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
959 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
960 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
961 #else
962 U32 R = load_3<U32>(rgb+0),
963 G = load_3<U32>(rgb+1),
964 B = load_3<U32>(rgb+2);
965 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
966 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
967 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
968 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
969 #endif
970 }
971
STAGE(load_16161616BE,NoCtx)972 STAGE(load_16161616BE, NoCtx) {
973 uintptr_t ptr = (uintptr_t)(src + 8*i);
974 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
975 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
976 #if defined(USING_NEON)
977 uint16x4x4_t v = vld4_u16(rgba);
978 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
979 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
980 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
981 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
982 #else
983 U64 px = swap_endian_16x4(load<U64>(rgba));
984
985 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
986 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
987 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
988 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
989 #endif
990 }
991
STAGE(load_hhh,NoCtx)992 STAGE(load_hhh, NoCtx) {
993 uintptr_t ptr = (uintptr_t)(src + 6*i);
994 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
995 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
996 #if defined(USING_NEON)
997 uint16x4x3_t v = vld3_u16(rgb);
998 U16 R = (U16)v.val[0],
999 G = (U16)v.val[1],
1000 B = (U16)v.val[2];
1001 #else
1002 U16 R = load_3<U16>(rgb+0),
1003 G = load_3<U16>(rgb+1),
1004 B = load_3<U16>(rgb+2);
1005 #endif
1006 r = F_from_Half(R);
1007 g = F_from_Half(G);
1008 b = F_from_Half(B);
1009 }
1010
STAGE(load_hhhh,NoCtx)1011 STAGE(load_hhhh, NoCtx) {
1012 uintptr_t ptr = (uintptr_t)(src + 8*i);
1013 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1014 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1015 #if defined(USING_NEON)
1016 uint16x4x4_t v = vld4_u16(rgba);
1017 U16 R = (U16)v.val[0],
1018 G = (U16)v.val[1],
1019 B = (U16)v.val[2],
1020 A = (U16)v.val[3];
1021 #else
1022 U64 px = load<U64>(rgba);
1023 U16 R = cast<U16>((px >> 0) & 0xffff),
1024 G = cast<U16>((px >> 16) & 0xffff),
1025 B = cast<U16>((px >> 32) & 0xffff),
1026 A = cast<U16>((px >> 48) & 0xffff);
1027 #endif
1028 r = F_from_Half(R);
1029 g = F_from_Half(G);
1030 b = F_from_Half(B);
1031 a = F_from_Half(A);
1032 }
1033
STAGE(load_fff,NoCtx)1034 STAGE(load_fff, NoCtx) {
1035 uintptr_t ptr = (uintptr_t)(src + 12*i);
1036 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1037 const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1038 #if defined(USING_NEON)
1039 float32x4x3_t v = vld3q_f32(rgb);
1040 r = (F)v.val[0];
1041 g = (F)v.val[1];
1042 b = (F)v.val[2];
1043 #else
1044 r = load_3<F>(rgb+0);
1045 g = load_3<F>(rgb+1);
1046 b = load_3<F>(rgb+2);
1047 #endif
1048 }
1049
STAGE(load_ffff,NoCtx)1050 STAGE(load_ffff, NoCtx) {
1051 uintptr_t ptr = (uintptr_t)(src + 16*i);
1052 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1053 const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1054 #if defined(USING_NEON)
1055 float32x4x4_t v = vld4q_f32(rgba);
1056 r = (F)v.val[0];
1057 g = (F)v.val[1];
1058 b = (F)v.val[2];
1059 a = (F)v.val[3];
1060 #else
1061 r = load_4<F>(rgba+0);
1062 g = load_4<F>(rgba+1);
1063 b = load_4<F>(rgba+2);
1064 a = load_4<F>(rgba+3);
1065 #endif
1066 }
1067
STAGE(swap_rb,NoCtx)1068 STAGE(swap_rb, NoCtx) {
1069 F t = r;
1070 r = b;
1071 b = t;
1072 }
1073
STAGE(clamp,NoCtx)1074 STAGE(clamp, NoCtx) {
1075 r = max_(F0, min_(r, F1));
1076 g = max_(F0, min_(g, F1));
1077 b = max_(F0, min_(b, F1));
1078 a = max_(F0, min_(a, F1));
1079 }
1080
STAGE(invert,NoCtx)1081 STAGE(invert, NoCtx) {
1082 r = F1 - r;
1083 g = F1 - g;
1084 b = F1 - b;
1085 a = F1 - a;
1086 }
1087
STAGE(force_opaque,NoCtx)1088 STAGE(force_opaque, NoCtx) {
1089 a = F1;
1090 }
1091
STAGE(premul,NoCtx)1092 STAGE(premul, NoCtx) {
1093 r *= a;
1094 g *= a;
1095 b *= a;
1096 }
1097
STAGE(unpremul,NoCtx)1098 STAGE(unpremul, NoCtx) {
1099 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1100 r *= scale;
1101 g *= scale;
1102 b *= scale;
1103 }
1104
STAGE(matrix_3x3,const skcms_Matrix3x3 * matrix)1105 STAGE(matrix_3x3, const skcms_Matrix3x3* matrix) {
1106 const float* m = &matrix->vals[0][0];
1107
1108 F R = m[0]*r + m[1]*g + m[2]*b,
1109 G = m[3]*r + m[4]*g + m[5]*b,
1110 B = m[6]*r + m[7]*g + m[8]*b;
1111
1112 r = R;
1113 g = G;
1114 b = B;
1115 }
1116
STAGE(matrix_3x4,const skcms_Matrix3x4 * matrix)1117 STAGE(matrix_3x4, const skcms_Matrix3x4* matrix) {
1118 const float* m = &matrix->vals[0][0];
1119
1120 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1121 G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1122 B = m[8]*r + m[9]*g + m[10]*b + m[11];
1123
1124 r = R;
1125 g = G;
1126 b = B;
1127 }
1128
STAGE(lab_to_xyz,NoCtx)1129 STAGE(lab_to_xyz, NoCtx) {
1130 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1131 F L = r * 100.0f,
1132 A = g * 255.0f - 128.0f,
1133 B = b * 255.0f - 128.0f;
1134
1135 // Convert to CIE XYZ.
1136 F Y = (L + 16.0f) * (1/116.0f),
1137 X = Y + A*(1/500.0f),
1138 Z = Y - B*(1/200.0f);
1139
1140 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1141 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1142 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1143
1144 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1145 r = X * 0.9642f;
1146 g = Y ;
1147 b = Z * 0.8249f;
1148 }
1149
1150 // As above, in reverse.
STAGE(xyz_to_lab,NoCtx)1151 STAGE(xyz_to_lab, NoCtx) {
1152 F X = r * (1/0.9642f),
1153 Y = g,
1154 Z = b * (1/0.8249f);
1155
1156 X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1157 Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1158 Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1159
1160 F L = Y*116.0f - 16.0f,
1161 A = (X-Y)*500.0f,
1162 B = (Y-Z)*200.0f;
1163
1164 r = L * (1/100.f);
1165 g = (A + 128.0f) * (1/255.0f);
1166 b = (B + 128.0f) * (1/255.0f);
1167 }
1168
STAGE(gamma_r,const skcms_TransferFunction * tf)1169 STAGE(gamma_r, const skcms_TransferFunction* tf) { r = apply_gamma(tf, r); }
STAGE(gamma_g,const skcms_TransferFunction * tf)1170 STAGE(gamma_g, const skcms_TransferFunction* tf) { g = apply_gamma(tf, g); }
STAGE(gamma_b,const skcms_TransferFunction * tf)1171 STAGE(gamma_b, const skcms_TransferFunction* tf) { b = apply_gamma(tf, b); }
STAGE(gamma_a,const skcms_TransferFunction * tf)1172 STAGE(gamma_a, const skcms_TransferFunction* tf) { a = apply_gamma(tf, a); }
1173
STAGE(gamma_rgb,const skcms_TransferFunction * tf)1174 STAGE(gamma_rgb, const skcms_TransferFunction* tf) {
1175 r = apply_gamma(tf, r);
1176 g = apply_gamma(tf, g);
1177 b = apply_gamma(tf, b);
1178 }
1179
STAGE(tf_r,const skcms_TransferFunction * tf)1180 STAGE(tf_r, const skcms_TransferFunction* tf) { r = apply_tf(tf, r); }
STAGE(tf_g,const skcms_TransferFunction * tf)1181 STAGE(tf_g, const skcms_TransferFunction* tf) { g = apply_tf(tf, g); }
STAGE(tf_b,const skcms_TransferFunction * tf)1182 STAGE(tf_b, const skcms_TransferFunction* tf) { b = apply_tf(tf, b); }
STAGE(tf_a,const skcms_TransferFunction * tf)1183 STAGE(tf_a, const skcms_TransferFunction* tf) { a = apply_tf(tf, a); }
1184
STAGE(tf_rgb,const skcms_TransferFunction * tf)1185 STAGE(tf_rgb, const skcms_TransferFunction* tf) {
1186 r = apply_tf(tf, r);
1187 g = apply_tf(tf, g);
1188 b = apply_tf(tf, b);
1189 }
1190
STAGE(pq_r,const skcms_TransferFunction * tf)1191 STAGE(pq_r, const skcms_TransferFunction* tf) { r = apply_pq(tf, r); }
STAGE(pq_g,const skcms_TransferFunction * tf)1192 STAGE(pq_g, const skcms_TransferFunction* tf) { g = apply_pq(tf, g); }
STAGE(pq_b,const skcms_TransferFunction * tf)1193 STAGE(pq_b, const skcms_TransferFunction* tf) { b = apply_pq(tf, b); }
STAGE(pq_a,const skcms_TransferFunction * tf)1194 STAGE(pq_a, const skcms_TransferFunction* tf) { a = apply_pq(tf, a); }
1195
STAGE(pq_rgb,const skcms_TransferFunction * tf)1196 STAGE(pq_rgb, const skcms_TransferFunction* tf) {
1197 r = apply_pq(tf, r);
1198 g = apply_pq(tf, g);
1199 b = apply_pq(tf, b);
1200 }
1201
STAGE(hlg_r,const skcms_TransferFunction * tf)1202 STAGE(hlg_r, const skcms_TransferFunction* tf) { r = apply_hlg(tf, r); }
STAGE(hlg_g,const skcms_TransferFunction * tf)1203 STAGE(hlg_g, const skcms_TransferFunction* tf) { g = apply_hlg(tf, g); }
STAGE(hlg_b,const skcms_TransferFunction * tf)1204 STAGE(hlg_b, const skcms_TransferFunction* tf) { b = apply_hlg(tf, b); }
STAGE(hlg_a,const skcms_TransferFunction * tf)1205 STAGE(hlg_a, const skcms_TransferFunction* tf) { a = apply_hlg(tf, a); }
1206
STAGE(hlg_rgb,const skcms_TransferFunction * tf)1207 STAGE(hlg_rgb, const skcms_TransferFunction* tf) {
1208 r = apply_hlg(tf, r);
1209 g = apply_hlg(tf, g);
1210 b = apply_hlg(tf, b);
1211 }
1212
STAGE(hlginv_r,const skcms_TransferFunction * tf)1213 STAGE(hlginv_r, const skcms_TransferFunction* tf) { r = apply_hlginv(tf, r); }
STAGE(hlginv_g,const skcms_TransferFunction * tf)1214 STAGE(hlginv_g, const skcms_TransferFunction* tf) { g = apply_hlginv(tf, g); }
STAGE(hlginv_b,const skcms_TransferFunction * tf)1215 STAGE(hlginv_b, const skcms_TransferFunction* tf) { b = apply_hlginv(tf, b); }
STAGE(hlginv_a,const skcms_TransferFunction * tf)1216 STAGE(hlginv_a, const skcms_TransferFunction* tf) { a = apply_hlginv(tf, a); }
1217
STAGE(hlginv_rgb,const skcms_TransferFunction * tf)1218 STAGE(hlginv_rgb, const skcms_TransferFunction* tf) {
1219 r = apply_hlginv(tf, r);
1220 g = apply_hlginv(tf, g);
1221 b = apply_hlginv(tf, b);
1222 }
1223
STAGE(table_r,const skcms_Curve * curve)1224 STAGE(table_r, const skcms_Curve* curve) { r = table(curve, r); }
STAGE(table_g,const skcms_Curve * curve)1225 STAGE(table_g, const skcms_Curve* curve) { g = table(curve, g); }
STAGE(table_b,const skcms_Curve * curve)1226 STAGE(table_b, const skcms_Curve* curve) { b = table(curve, b); }
STAGE(table_a,const skcms_Curve * curve)1227 STAGE(table_a, const skcms_Curve* curve) { a = table(curve, a); }
1228
STAGE(clut_A2B,const skcms_A2B * a2b)1229 STAGE(clut_A2B, const skcms_A2B* a2b) {
1230 clut(a2b, &r,&g,&b,a);
1231
1232 if (a2b->input_channels == 4) {
1233 // CMYK is opaque.
1234 a = F1;
1235 }
1236 }
1237
STAGE(clut_B2A,const skcms_B2A * b2a)1238 STAGE(clut_B2A, const skcms_B2A* b2a) {
1239 clut(b2a, &r,&g,&b,&a);
1240 }
1241
1242 // From here on down, the store_ ops are all "final stages," terminating processing of this group.
1243
FINAL_STAGE(store_a8,NoCtx)1244 FINAL_STAGE(store_a8, NoCtx) {
1245 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1246 }
1247
FINAL_STAGE(store_g8,NoCtx)1248 FINAL_STAGE(store_g8, NoCtx) {
1249 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1250 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1251 }
1252
FINAL_STAGE(store_4444,NoCtx)1253 FINAL_STAGE(store_4444, NoCtx) {
1254 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1255 | cast<U16>(to_fixed(g * 15) << 8)
1256 | cast<U16>(to_fixed(b * 15) << 4)
1257 | cast<U16>(to_fixed(a * 15) << 0));
1258 }
1259
FINAL_STAGE(store_565,NoCtx)1260 FINAL_STAGE(store_565, NoCtx) {
1261 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1262 | cast<U16>(to_fixed(g * 63) << 5 )
1263 | cast<U16>(to_fixed(b * 31) << 11 ));
1264 }
1265
FINAL_STAGE(store_888,NoCtx)1266 FINAL_STAGE(store_888, NoCtx) {
1267 uint8_t* rgb = (uint8_t*)dst + 3*i;
1268 #if defined(USING_NEON)
1269 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1270 // get there via U16 to save some instructions converting to float. And just
1271 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1272 U16 R = cast<U16>(to_fixed(r * 255)),
1273 G = cast<U16>(to_fixed(g * 255)),
1274 B = cast<U16>(to_fixed(b * 255));
1275
1276 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1277 vst3_lane_u8(rgb+0, v, 0);
1278 vst3_lane_u8(rgb+3, v, 2);
1279 vst3_lane_u8(rgb+6, v, 4);
1280 vst3_lane_u8(rgb+9, v, 6);
1281 #else
1282 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1283 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1284 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1285 #endif
1286 }
1287
FINAL_STAGE(store_8888,NoCtx)1288 FINAL_STAGE(store_8888, NoCtx) {
1289 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1290 | cast<U32>(to_fixed(g * 255)) << 8
1291 | cast<U32>(to_fixed(b * 255)) << 16
1292 | cast<U32>(to_fixed(a * 255)) << 24);
1293 }
1294
FINAL_STAGE(store_101010x_XR,NoCtx)1295 FINAL_STAGE(store_101010x_XR, NoCtx) {
1296 static constexpr float min = -0.752941f;
1297 static constexpr float max = 1.25098f;
1298 static constexpr float range = max - min;
1299 store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) << 0
1300 | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
1301 | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
1302 }
1303
FINAL_STAGE(store_1010102,NoCtx)1304 FINAL_STAGE(store_1010102, NoCtx) {
1305 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1306 | cast<U32>(to_fixed(g * 1023)) << 10
1307 | cast<U32>(to_fixed(b * 1023)) << 20
1308 | cast<U32>(to_fixed(a * 3)) << 30);
1309 }
1310
FINAL_STAGE(store_161616LE,NoCtx)1311 FINAL_STAGE(store_161616LE, NoCtx) {
1312 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1313 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1314 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1315 #if defined(USING_NEON)
1316 uint16x4x3_t v = {{
1317 (uint16x4_t)U16_from_F(r),
1318 (uint16x4_t)U16_from_F(g),
1319 (uint16x4_t)U16_from_F(b),
1320 }};
1321 vst3_u16(rgb, v);
1322 #else
1323 store_3(rgb+0, U16_from_F(r));
1324 store_3(rgb+1, U16_from_F(g));
1325 store_3(rgb+2, U16_from_F(b));
1326 #endif
1327
1328 }
1329
FINAL_STAGE(store_16161616LE,NoCtx)1330 FINAL_STAGE(store_16161616LE, NoCtx) {
1331 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1332 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1333 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1334 #if defined(USING_NEON)
1335 uint16x4x4_t v = {{
1336 (uint16x4_t)U16_from_F(r),
1337 (uint16x4_t)U16_from_F(g),
1338 (uint16x4_t)U16_from_F(b),
1339 (uint16x4_t)U16_from_F(a),
1340 }};
1341 vst4_u16(rgba, v);
1342 #else
1343 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1344 | cast<U64>(to_fixed(g * 65535)) << 16
1345 | cast<U64>(to_fixed(b * 65535)) << 32
1346 | cast<U64>(to_fixed(a * 65535)) << 48;
1347 store(rgba, px);
1348 #endif
1349 }
1350
FINAL_STAGE(store_161616BE,NoCtx)1351 FINAL_STAGE(store_161616BE, NoCtx) {
1352 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1353 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1354 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1355 #if defined(USING_NEON)
1356 uint16x4x3_t v = {{
1357 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1358 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1359 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1360 }};
1361 vst3_u16(rgb, v);
1362 #else
1363 U32 R = to_fixed(r * 65535),
1364 G = to_fixed(g * 65535),
1365 B = to_fixed(b * 65535);
1366 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1367 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1368 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1369 #endif
1370
1371 }
1372
FINAL_STAGE(store_16161616BE,NoCtx)1373 FINAL_STAGE(store_16161616BE, NoCtx) {
1374 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1375 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1376 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1377 #if defined(USING_NEON)
1378 uint16x4x4_t v = {{
1379 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1380 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1381 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1382 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1383 }};
1384 vst4_u16(rgba, v);
1385 #else
1386 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1387 | cast<U64>(to_fixed(g * 65535)) << 16
1388 | cast<U64>(to_fixed(b * 65535)) << 32
1389 | cast<U64>(to_fixed(a * 65535)) << 48;
1390 store(rgba, swap_endian_16x4(px));
1391 #endif
1392 }
1393
FINAL_STAGE(store_hhh,NoCtx)1394 FINAL_STAGE(store_hhh, NoCtx) {
1395 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1396 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1397 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1398
1399 U16 R = Half_from_F(r),
1400 G = Half_from_F(g),
1401 B = Half_from_F(b);
1402 #if defined(USING_NEON)
1403 uint16x4x3_t v = {{
1404 (uint16x4_t)R,
1405 (uint16x4_t)G,
1406 (uint16x4_t)B,
1407 }};
1408 vst3_u16(rgb, v);
1409 #else
1410 store_3(rgb+0, R);
1411 store_3(rgb+1, G);
1412 store_3(rgb+2, B);
1413 #endif
1414 }
1415
FINAL_STAGE(store_hhhh,NoCtx)1416 FINAL_STAGE(store_hhhh, NoCtx) {
1417 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1418 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1419 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1420
1421 U16 R = Half_from_F(r),
1422 G = Half_from_F(g),
1423 B = Half_from_F(b),
1424 A = Half_from_F(a);
1425 #if defined(USING_NEON)
1426 uint16x4x4_t v = {{
1427 (uint16x4_t)R,
1428 (uint16x4_t)G,
1429 (uint16x4_t)B,
1430 (uint16x4_t)A,
1431 }};
1432 vst4_u16(rgba, v);
1433 #else
1434 store(rgba, cast<U64>(R) << 0
1435 | cast<U64>(G) << 16
1436 | cast<U64>(B) << 32
1437 | cast<U64>(A) << 48);
1438 #endif
1439 }
1440
FINAL_STAGE(store_fff,NoCtx)1441 FINAL_STAGE(store_fff, NoCtx) {
1442 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1443 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1444 float* rgb = (float*)ptr; // for this cast to float* to be safe.
1445 #if defined(USING_NEON)
1446 float32x4x3_t v = {{
1447 (float32x4_t)r,
1448 (float32x4_t)g,
1449 (float32x4_t)b,
1450 }};
1451 vst3q_f32(rgb, v);
1452 #else
1453 store_3(rgb+0, r);
1454 store_3(rgb+1, g);
1455 store_3(rgb+2, b);
1456 #endif
1457 }
1458
FINAL_STAGE(store_ffff,NoCtx)1459 FINAL_STAGE(store_ffff, NoCtx) {
1460 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1461 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1462 float* rgba = (float*)ptr; // for this cast to float* to be safe.
1463 #if defined(USING_NEON)
1464 float32x4x4_t v = {{
1465 (float32x4_t)r,
1466 (float32x4_t)g,
1467 (float32x4_t)b,
1468 (float32x4_t)a,
1469 }};
1470 vst4q_f32(rgba, v);
1471 #else
1472 store_4(rgba+0, r);
1473 store_4(rgba+1, g);
1474 store_4(rgba+2, b);
1475 store_4(rgba+3, a);
1476 #endif
1477 }
1478
1479 #if SKCMS_HAS_MUSTTAIL
1480
exec_stages(StageFn * stages,const void ** contexts,const char * src,char * dst,int i)1481 SI void exec_stages(StageFn* stages, const void** contexts, const char* src, char* dst, int i) {
1482 (*stages)({stages}, contexts, src, dst, F0, F0, F0, F1, i);
1483 }
1484
1485 #else
1486
exec_stages(const Op * ops,const void ** contexts,const char * src,char * dst,int i)1487 static void exec_stages(const Op* ops, const void** contexts,
1488 const char* src, char* dst, int i) {
1489 F r = F0, g = F0, b = F0, a = F1;
1490 while (true) {
1491 switch (*ops++) {
1492 #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
1493 SKCMS_WORK_OPS(M)
1494 #undef M
1495 #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
1496 SKCMS_STORE_OPS(M)
1497 #undef M
1498 }
1499 }
1500 }
1501
1502 #endif
1503
1504 // NOLINTNEXTLINE(misc-definitions-in-headers)
run_program(const Op * program,const void ** contexts,SKCMS_MAYBE_UNUSED ptrdiff_t programSize,const char * src,char * dst,int n,const size_t src_bpp,const size_t dst_bpp)1505 void run_program(const Op* program, const void** contexts, SKCMS_MAYBE_UNUSED ptrdiff_t programSize,
1506 const char* src, char* dst, int n,
1507 const size_t src_bpp, const size_t dst_bpp) {
1508 #if SKCMS_HAS_MUSTTAIL
1509 // Convert the program into an array of tailcall stages.
1510 StageFn stages[32];
1511 assert(programSize <= ARRAY_COUNT(stages));
1512
1513 static constexpr StageFn kStageFns[] = {
1514 #define M(name) &Exec_##name,
1515 SKCMS_WORK_OPS(M)
1516 SKCMS_STORE_OPS(M)
1517 #undef M
1518 };
1519
1520 for (ptrdiff_t index = 0; index < programSize; ++index) {
1521 stages[index] = kStageFns[(int)program[index]];
1522 }
1523 #else
1524 // Use the op array as-is.
1525 const Op* stages = program;
1526 #endif
1527
1528 int i = 0;
1529 while (n >= N) {
1530 exec_stages(stages, contexts, src, dst, i);
1531 i += N;
1532 n -= N;
1533 }
1534 if (n > 0) {
1535 char tmp[4*4*N] = {0};
1536
1537 memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1538 exec_stages(stages, contexts, tmp, tmp, 0);
1539 memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1540 }
1541 }
1542