/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// Intentionally NO #pragma once... included multiple times.

// This file is included from skcms.cc in a namespace with some pre-defines:
//    - N:    depth of all vectors, 1,4,8, or 16 (preprocessor define)
//    - V<T>: a template to create a vector of N T's.

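// For illustration only, a hypothetical sketch of how an includer might set
// those up (the namespace name and the GCC-style vector_size attribute here
// are assumptions, not necessarily skcms.cc's actual code):
//
//     namespace baseline {
//         #define N 4
//         template <typename T>
//         using V = T __attribute__((vector_size(N * sizeof(T))));
//         #include "Transform_inl.h"
//         #undef N
//     }
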
using F   = V<Color>;   // Called F for historic reasons... maybe rename C?
using I32 = V<int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t>;


#if defined(__GNUC__) && !defined(__clang__)
    // Once again, GCC is kind of weird, not allowing vector = scalar directly.
    static constexpr F F0 = F() + 0.0f,
                       F1 = F() + 1.0f;
#else
    static constexpr F F0 = 0.0f,
                       F1 = 1.0f;
#endif

// Instead of checking __AVX__ below, we'll check USING_AVX.
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.

#if !defined(USING_AVX)      && N == 8 && defined(__AVX__)
    #define  USING_AVX
#endif
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
    #define  USING_AVX_F16C
#endif
#if !defined(USING_AVX2)     && defined(USING_AVX) && defined(__AVX2__)
    #define  USING_AVX2
#endif
#if !defined(USING_AVX512F)  && N == 16 && defined(__AVX512F__)
    #define  USING_AVX512F
#endif

// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
#if N > 1 && defined(__ARM_NEON)
    #define USING_NEON
    #if __ARM_FP & 2
        #define USING_NEON_F16C
    #endif
    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
        #define USING_NEON_FP16
    #endif
#endif

// These -Wvector-conversion warnings seem to trigger in very bogus situations,
// like vst3q_f32() expecting a 16x char rather than a 4x float vector.  :/
#if defined(USING_NEON) && defined(__clang__)
    #pragma clang diagnostic ignored "-Wvector-conversion"
#endif

// GCC & Clang (but not clang-cl) warn that returning U64 on x86 is larger than a register.
// You'd see warnings like, "using AVX even though AVX is not enabled".
// We stifle these warnings; our helpers that return U64 are always inlined.
#if defined(__SSE__) && defined(__GNUC__)
    #if !defined(__has_warning)
        #pragma GCC diagnostic ignored "-Wpsabi"
    #elif __has_warning("-Wpsabi")
        #pragma GCC diagnostic ignored "-Wpsabi"
    #endif
#endif

#if defined(__clang__)
    #define FALLTHROUGH [[clang::fallthrough]]
#else
    #define FALLTHROUGH
#endif

// We tag most helper functions as SI, to enforce good code generation
// but also work around what we think is a bug in GCC: when targeting 32-bit
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
// MMX mm0 register, which seems to mess with unrelated code that later uses
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
//
// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
#if defined(__clang__) || defined(__GNUC__)
    #define SI static inline __attribute__((always_inline))
#else
    #define SI static inline
#endif

template <typename T, typename P>
SI T load(const P* ptr) {
    T val;
    small_memcpy(&val, ptr, sizeof(val));
    return val;
}
template <typename T, typename P>
SI void store(P* ptr, const T& val) {
    small_memcpy(ptr, &val, sizeof(val));
}

// (T)v is a cast when N == 1 and a bit-pun when N>1,
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
template <typename D, typename S>
SI D cast(const S& v) {
#if N == 1
    return (D)v;
#elif defined(__clang__)
    return __builtin_convertvector(v, D);
#else
    D d;
    for (int i = 0; i < N; i++) {
        d[i] = v[i];
    }
    return d;
#endif
}

template <typename D, typename S>
SI D bit_pun(const S& v) {
    static_assert(sizeof(D) == sizeof(v), "");
    return load<D>(&v);
}

// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct cast().
#if defined(USING_NEON_FP16)
    // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
    SI U16 to_fixed(F f) {  return cast<U16>(f + 0.5f); }
#else
    SI U32 to_fixed(F f) {  return (U32)cast<I32>(f + 0.5f); }
#endif


// Sometimes we do something crazy on one branch of a conditional,
// like divide by zero or convert a huge float to an integer,
// but then harmlessly select the other side.  That trips up N==1
// sanitizer builds, so we make if_then_else() a macro to avoid
// evaluating the unused side.

#if N == 1
    #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
#else
    template <typename C, typename T>
    SI T if_then_else(C cond, T t, T e) {
        return bit_pun<T>( ( cond & bit_pun<C>(t)) |
                           (~cond & bit_pun<C>(e)) );
    }
#endif
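
// For intuition: with N > 1, comparisons like (x > y) produce a lane mask of
// all-1s (true) or all-0s (false) bits, so (cond & t-bits) | (~cond & e-bits)
// reproduces t's bits exactly where cond is set and e's bits elsewhere.
// E.g. per 32-bit lane:
//     cond = 0xffffffff, t = 1.0f, e = 2.0f  ~~~>  1.0f
//     cond = 0x00000000, t = 1.0f, e = 2.0f  ~~~>  2.0f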


SI F F_from_Half(U16 half) {
#if defined(USING_NEON_FP16)
    return bit_pun<F>(half);
#elif defined(USING_NEON_F16C)
    return vcvt_f32_f16((float16x4_t)half);
#elif defined(USING_AVX512F)
    return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
    typedef int16_t __attribute__((vector_size(16))) I16;
    return __builtin_ia32_vcvtph2ps256((I16)half);
#else
    U32 wide = cast<U32>(half);
    // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
    U32 s  = wide & 0x8000,
        em = wide ^ s;

    // Constructing the float is easy if the half is not denormalized.
    F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );

    // Simply flush all denorm half floats to zero.
    return if_then_else(em < 0x0400, F0, norm);
#endif
}
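
// A worked example of that portable path: half 0x3C00 is +1.0 (s == 0, e == 15,
// m == 0), so em == 0x3C00, (em<<13) == 0x07800000, and adding ((127-15)<<23) ==
// 0x38000000 rebiases the exponent, giving float bits 0x3F800000 -- exactly 1.0f.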

#if defined(__clang__)
    // The -((127-15)<<10) underflows that side of the math when
    // we pass a denorm half float.  It's harmless... we'll take the 0 side anyway.
    __attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
SI U16 Half_from_F(F f) {
#if defined(USING_NEON_FP16)
    return bit_pun<U16>(f);
#elif defined(USING_NEON_F16C)
    return (U16)vcvt_f16_f32(f);
#elif defined(USING_AVX512F)
    return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
#elif defined(USING_AVX_F16C)
    return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
#else
    // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
    U32 sem = bit_pun<U32>(f),
        s   = sem & 0x80000000,
         em = sem ^ s;

    // For simplicity we flush denorm half floats (including all denorm floats) to zero.
    return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
                                                 , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}

// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON_FP16)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16q_u8((uint8x16_t) v);
    }
#elif defined(USING_NEON)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16_u8((uint8x8_t) v);
    }
#endif

SI U64 swap_endian_16x4(const U64& rgba) {
    return (rgba & 0x00ff00ff00ff00ff) << 8
         | (rgba & 0xff00ff00ff00ff00) >> 8;
}
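
// E.g. a U64 holding four 16-bit lanes 0xAABB'CCDD'EEFF'1122 becomes
// 0xBBAA'DDCC'FFEE'2211: within each lane the two bytes trade places,
// while the lanes themselves stay put.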

#if defined(USING_NEON_FP16)
    SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
#elif defined(USING_NEON)
    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#else
    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
#endif

SI F floor_(F x) {
#if N == 1
    return floorf_(x);
#elif defined(USING_NEON_FP16)
    return vrndmq_f16(x);
#elif defined(__aarch64__)
    return vrndmq_f32(x);
#elif defined(USING_AVX512F)
    // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
    // and the integer sanitizer catches that this implicit cast changes the
    // value from -1 to 65535.  We'll cast manually to work around it.
    // Read this as `return _mm512_floor_ps(x)`.
    return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
    return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
    return _mm_floor_ps(x);
#else
    // Round trip through integers with a truncating cast.
    F roundtrip = cast<F>(cast<I32>(x));
    // If x is negative, truncating gives the ceiling instead of the floor.
    return roundtrip - if_then_else(roundtrip > x, F1, F0);

    // This implementation fails for values of x that are outside
    // the range an integer can represent.  We expect most x to be small.
#endif
}
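
// E.g. in that truncating fallback, floor_(-1.5f): cast<I32> truncates toward
// zero giving -1, roundtrip == -1.0f > -1.5f, so we subtract 1 and return -2.0f.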

SI F approx_log2(F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    return x;
#else
    // The first approximation of log2(x) is its exponent 'e', minus 127.
    I32 bits = bit_pun<I32>(x);

    F e = cast<F>(bits) * (1.0f / (1<<23));

    // If we use the mantissa too we can refine the error significantly.
    F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );

    return e - 124.225514990f
             -   1.498030302f*m
             -   1.725879990f/(0.3520887068f + m);
#endif
}
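
// Sanity check of that refinement: for x == 1.0f, bits == 0x3F800000 gives
// e == 127.0 and m == 0.5, and the expression works out to roughly 1e-5,
// i.e. approx_log2(1.0f) ~~~> ~0, as it should.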

SI F approx_log(F x) {
    const float ln2 = 0.69314718f;
    return ln2 * approx_log2(x);
}

SI F approx_exp2(F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    return x;
#else
    F fract = x - floor_(x);

    F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
                                    -   1.490129070f*fract
                                    +  27.728023300f/(4.84252568f - fract));
    I32 bits = cast<I32>(max_(fbits, F0));

    return bit_pun<F>(bits);
#endif
}

SI F approx_pow(F x, float y) {
    return if_then_else((x == F0) | (x == F1), x
                                             , approx_exp2(approx_log2(x) * y));
}
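
// approx_pow() leans on the identity x^y == 2^(y * log2(x)); the (x == F0)|(x == F1)
// special case pins the exact fixed points 0^y == 0 and 1^y == 1, which would
// otherwise pick up approximation error from the log2/exp2 round trip.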

SI F approx_exp(F x) {
    const float log2_e = 1.4426950408889634074f;
    return approx_exp2(log2_e * x);
}

// Return tf(x).
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    // Peel off the sign bit and set x = |x|.
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    // The transfer function has a linear part up to d, exponential at d and after.
    F v = if_then_else(x < tf->d,            tf->c*x + tf->f
                                , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);

    // Tack the sign bit back on.
    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}
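
// For a concrete instance of this parameterization: sRGB's curve corresponds to
// g = 2.4, a = 1/1.055, b = 0.055/1.055, c = 1/12.92, d = 0.04045, e = f = 0,
// i.e. x < 0.04045 maps to x/12.92 and everything else to ((x+0.055)/1.055)^2.4.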

SI F apply_pq(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
                       / (tf->d + tf->e * approx_pow(x, tf->c)),
                     tf->f);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}

SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e,
                K = tf->f + 1;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
                               , approx_exp((x-c)*a) + b);

    return K*bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}

SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e,
                K = tf->f + 1;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);
    x /= K;

    F v = if_then_else(x <= 1, R * approx_pow(x, G)
                             , a * approx_log(x - b) + c);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}


// Strided loads and stores of N values, starting from p.
template <typename T, typename P>
SI T load_3(const P* p) {
#if N == 1
    return (T)p[0];
#elif N == 4
    return T{p[ 0],p[ 3],p[ 6],p[ 9]};
#elif N == 8
    return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
#elif N == 16
    return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
             p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
#endif
}

template <typename T, typename P>
SI T load_4(const P* p) {
#if N == 1
    return (T)p[0];
#elif N == 4
    return T{p[ 0],p[ 4],p[ 8],p[12]};
#elif N == 8
    return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
#elif N == 16
    return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
             p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
#endif
}

template <typename T, typename P>
SI void store_3(P* p, const T& v) {
#if N == 1
    p[0] = v;
#elif N == 4
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
#elif N == 8
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
    p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
#elif N == 16
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
    p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
    p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
    p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
#endif
}

template <typename T, typename P>
SI void store_4(P* p, const T& v) {
#if N == 1
    p[0] = v;
#elif N == 4
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
#elif N == 8
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
    p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
#elif N == 16
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
    p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
    p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
    p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
#endif
}


SI U8 gather_8(const uint8_t* p, I32 ix) {
#if N == 1
    U8 v = p[ix];
#elif N == 4
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
#elif N == 8
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
             p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
#elif N == 16
    U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
             p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
             p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
             p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
#endif
    return v;
}

SI U16 gather_16(const uint8_t* p, I32 ix) {
    // Load the i'th 16-bit value from p.
    auto load_16 = [p](int i) {
        return load<uint16_t>(p + 2*i);
    };
#if N == 1
    U16 v = load_16(ix);
#elif N == 4
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
#elif N == 8
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
              load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
#elif N == 16
    U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
              load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
              load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
              load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
#endif
    return v;
}

SI U32 gather_32(const uint8_t* p, I32 ix) {
    // Load the i'th 32-bit value from p.
    auto load_32 = [p](int i) {
        return load<uint32_t>(p + 4*i);
    };
#if N == 1
    U32 v = load_32(ix);
#elif N == 4
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
#elif N == 8
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
              load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
#elif N == 16
    U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
              load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
              load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
              load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
#endif
    // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
    return v;
}

SI U32 gather_24(const uint8_t* p, I32 ix) {
    // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
    // in front of it, either a previous table value, or some tag metadata.
    p -= 1;
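
    // Concretely: 24-bit entry i starts at byte 3*i of the original p, so after
    // backing up, the 4-byte little-endian load below reads {junk, r, g, b} with
    // the junk byte in the low 8 bits -- exactly what the final v >> 8 discards.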

    // Load the i'th 24-bit value from p, and 1 extra byte.
    auto load_24_32 = [p](int i) {
        return load<uint32_t>(p + 3*i);
    };

    // Now load multiples of 4 bytes (a junk byte, then r,g,b).
#if N == 1
    U32 v = load_24_32(ix);
#elif N == 4
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
#elif N == 8 && !defined(USING_AVX2)
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
              load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
#elif N == 8
    (void)load_24_32;
    // The gather instruction here doesn't need any particular alignment,
    // but the intrinsic takes a const int*.
    const int* p4 = bit_pun<const int*>(p);
    I32 zero = { 0, 0, 0, 0,  0, 0, 0, 0},
        mask = {-1,-1,-1,-1, -1,-1,-1,-1};
    #if defined(__clang__)
        U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
    #elif defined(__GNUC__)
        U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
    #endif
#elif N == 16
    (void)load_24_32;
    // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
    // And AVX-512 swapped the order of arguments.  :/
    const int* p4 = bit_pun<const int*>(p);
    U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
#endif

    // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
    return v >> 8;
}

#if !defined(__arm__)
    SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
        // As in gather_24(), with everything doubled.
        p -= 2;

        // Load the i'th 48-bit value from p, and 2 extra bytes.
        auto load_48_64 = [p](int i) {
            return load<uint64_t>(p + 6*i);
        };

    #if N == 1
        *v = load_48_64(ix);
    #elif N == 4
        *v = U64{
            load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
        };
    #elif N == 8 && !defined(USING_AVX2)
        *v = U64{
            load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
            load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
        };
    #elif N == 8
        (void)load_48_64;
        typedef int32_t   __attribute__((vector_size(16))) Half_I32;
        typedef long long __attribute__((vector_size(32))) Half_I64;

        // The gather instruction here doesn't need any particular alignment,
        // but the intrinsic takes a const long long*.
        const long long int* p8 = bit_pun<const long long int*>(p);

        Half_I64 zero = { 0, 0, 0, 0},
                 mask = {-1,-1,-1,-1};

        ix *= 6;
        Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
                 ix_hi = { ix[4], ix[5], ix[6], ix[7] };

        #if defined(__clang__)
            Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
                     hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
        #elif defined(__GNUC__)
            Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
                     hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
        #endif
        store((char*)v +  0, lo);
        store((char*)v + 32, hi);
    #elif N == 16
        (void)load_48_64;
        const long long int* p8 = bit_pun<const long long int*>(p);
        __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
                hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
        store((char*)v +  0, lo);
        store((char*)v + 64, hi);
    #endif

        *v >>= 16;
    }
#endif

SI F F_from_U8(U8 v) {
    return cast<F>(v) * (1/255.0f);
}

SI F F_from_U16_BE(U16 v) {
    // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
    // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
    U16 lo = (v >> 8),
        hi = (v << 8) & 0xffff;
    return cast<F>(lo|hi) * (1/65535.0f);
}

SI U16 U16_from_F(F v) {
    // 65535 == inf in FP16, so promote to FP32 before converting.
    return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
}

SI F minus_1_ulp(F v) {
#if defined(USING_NEON_FP16)
    return bit_pun<F>( bit_pun<U16>(v) - 1 );
#else
    return bit_pun<F>( bit_pun<U32>(v) - 1 );
#endif
}

SI F table(const skcms_Curve* curve, F v) {
    // Clamp the input to [0,1], then scale to a table index.
    F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);

    // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
    I32 lo = cast<I32>(            ix      ),
        hi = cast<I32>(minus_1_ulp(ix+1.0f));
    F t = ix - cast<F>(lo);  // i.e. the fractional part of ix.

    // TODO: can we load l and h simultaneously?  Each entry in 'h' is either
    // the same as in 'l' or adjacent.  We have a rough idea that it'd always be safe
    // to read adjacent entries and perhaps underflow the table by a byte or two
    // (it'd be junk, but always safe to read).  Not sure how to lerp yet.
    F l,h;
    if (curve->table_8) {
        l = F_from_U8(gather_8(curve->table_8, lo));
        h = F_from_U8(gather_8(curve->table_8, hi));
    } else {
        l = F_from_U16_BE(gather_16(curve->table_16, lo));
        h = F_from_U16_BE(gather_16(curve->table_16, hi));
    }
    return l + (h-l)*t;
}
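
// Why minus_1_ulp(ix+1.0f) instead of plain ix+1?  When v == 1.0f, ix lands
// exactly on table_entries-1, and nudging ix+1 down one ulp keeps hi in bounds
// after truncation.  E.g. with 256 entries and v == 0.5f: ix == 127.5, lo == 127,
// hi == 128, t == 0.5, so we return the midpoint of entries 127 and 128.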

SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
    U32 rgb = gather_24(grid_8, ix);

    *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
}

SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
    // TODO: don't forget to optimize gather_32().
    U32 rgba = gather_32(grid_8, ix);

    *r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
    *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
}

SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
#if defined(__arm__)
    // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
    *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
    *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
    *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
#else
    // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
    U64 rgb;
    gather_48(grid_16, ix, &rgb);
    rgb = swap_endian_16x4(rgb);

    *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
    *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
    *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
#endif
}

SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
    // TODO: gather_64()-based fast path?
    *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
    *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
    *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
    *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
}

static void clut(uint32_t input_channels, uint32_t output_channels,
                 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
                 F* r, F* g, F* b, F* a) {

    const int dim = (int)input_channels;
    assert (0 < dim && dim <= 4);
    assert (output_channels == 3 ||
            output_channels == 4);

    // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
    I32 index [8];  // Index contribution by dimension, first low from 0, then high from 4.
    F   weight[8];  // Weight for each contribution, again first low, then high.

    // O(dim) work first: calculate index,weight from r,g,b,a.
    const F inputs[] = { *r,*g,*b,*a };
    for (int i = dim-1, stride = 1; i >= 0; i--) {
        // x is where we logically want to sample the grid in the i-th dimension.
        F x = inputs[i] * (float)(grid_points[i] - 1);

        // But we can't index at floats.  lo and hi are the two integer grid points surrounding x.
        I32 lo = cast<I32>(            x      ),   // i.e. trunc(x) == floor(x) here.
            hi = cast<I32>(minus_1_ulp(x+1.0f));
        // Notice how we fold in the accumulated stride across previous dimensions here.
        index[i+0] = lo * stride;
        index[i+4] = hi * stride;
        stride *= grid_points[i];

        // We'll interpolate between those two integer grid points by t.
        F t = x - cast<F>(lo);  // i.e. fract(x)
        weight[i+0] = 1-t;
        weight[i+4] = t;
    }
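
    // A worked example, assuming dim == 2, grid_points == {3,3}, and inputs
    // (r,g) == (0.25, 0.5):  for g, x == 1.0 exactly, so lo == hi == 1 with
    // weights {1, 0}; for r, x == 0.5, so lo == 0, hi == 1 (stride 3) with
    // weights {0.5, 0.5}.  The combos below then blend grid entries 1 and 4,
    // each with weight 0.5 -- bilinear interpolation, degenerating to linear
    // because g landed exactly on a grid line.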

    *r = *g = *b = F0;
    if (output_channels == 4) {
        *a = F0;
    }

    // We'll sample 2^dim == 1<<dim table entries per pixel,
    // in all combinations of low and high in each dimension.
    for (int combo = 0; combo < (1<<dim); combo++) {  // This loop can be done in any order.

        // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
        // where 0 selects the low index contribution and its weight 1-t,
        // or 4 the high index contribution and its weight t.

        // Since 0<dim≤4, we can always just start off with the 0-th channel,
        // then handle the others conditionally.
        I32 ix = index [0 + (combo&1)*4];
        F    w = weight[0 + (combo&1)*4];

        switch ((dim-1)&3) {  // This lets the compiler know there are no other cases to handle.
            case 3: ix += index [3 + (combo&8)/2];
                    w  *= weight[3 + (combo&8)/2];
                    FALLTHROUGH;
                    // fall through

            case 2: ix += index [2 + (combo&4)*1];
                    w  *= weight[2 + (combo&4)*1];
                    FALLTHROUGH;
                    // fall through

            case 1: ix += index [1 + (combo&2)*2];
                    w  *= weight[1 + (combo&2)*2];
        }

        F R,G,B,A=F0;
        if (output_channels == 3) {
            if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
            else        { sample_clut_16(grid_16,ix, &R,&G,&B); }
        } else {
            if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
            else        { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
        }
        *r += w*R;
        *g += w*G;
        *b += w*B;
        *a += w*A;
    }
}

static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
    clut(a2b->input_channels, a2b->output_channels,
         a2b->grid_points, a2b->grid_8, a2b->grid_16,
         r,g,b,&a);
}
static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
    clut(b2a->input_channels, b2a->output_channels,
         b2a->grid_points, b2a->grid_8, b2a->grid_16,
         r,g,b,a);
}

static void exec_ops(const Op* ops, const void** args,
                     const char* src, char* dst, int i) {
    F r = F0, g = F0, b = F0, a = F1;
    while (true) {
        switch (*ops++) {
            case Op_load_a8:{
                a = F_from_U8(load<U8>(src + 1*i));
            } break;

            case Op_load_g8:{
                r = g = b = F_from_U8(load<U8>(src + 1*i));
            } break;

            case Op_load_4444:{
                U16 abgr = load<U16>(src + 2*i);

                r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
                g = cast<F>((abgr >>  8) & 0xf) * (1/15.0f);
                b = cast<F>((abgr >>  4) & 0xf) * (1/15.0f);
                a = cast<F>((abgr >>  0) & 0xf) * (1/15.0f);
            } break;

            case Op_load_565:{
                U16 rgb = load<U16>(src + 2*i);

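                // Note the trick here: each mask keeps its field in place, and the
                // shift is folded into the scale factor, e.g. 1.0f / (63<<5) both
                // shifts green down and normalizes it to [0,1] in one multiply.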
                r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
                g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
                b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
            } break;

            case Op_load_888:{
                const uint8_t* rgb = (const uint8_t*)(src + 3*i);
            #if defined(USING_NEON_FP16)
                // See the explanation under USING_NEON below.  This is that doubled up.
                uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
                v = vld3q_lane_u8(rgb+ 0, v,  0);
                v = vld3q_lane_u8(rgb+ 3, v,  2);
                v = vld3q_lane_u8(rgb+ 6, v,  4);
                v = vld3q_lane_u8(rgb+ 9, v,  6);

                v = vld3q_lane_u8(rgb+12, v,  8);
                v = vld3q_lane_u8(rgb+15, v, 10);
                v = vld3q_lane_u8(rgb+18, v, 12);
                v = vld3q_lane_u8(rgb+21, v, 14);

                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #elif defined(USING_NEON)
                // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
                // a time.  Since we're doing that, we might as well load them into 16-bit lanes.
                // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
                uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
                v = vld3_lane_u8(rgb+0, v, 0);
                v = vld3_lane_u8(rgb+3, v, 2);
                v = vld3_lane_u8(rgb+6, v, 4);
                v = vld3_lane_u8(rgb+9, v, 6);

                // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
                // convert to F.  (Again, U32 would be even better here if we drop ARMv7 or split
                // ARMv7 and ARMv8 impls.)
                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
                g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
                b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
            #endif
            } break;

            case Op_load_8888:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_8888_palette8:{
                const uint8_t* palette = (const uint8_t*) *args++;
                I32 ix = cast<I32>(load<U8>(src + 1*i));
                U32 rgba = gather_32(palette, ix);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_1010102:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
                g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
                b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
                a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
            } break;

            case Op_load_161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
                g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
                b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #else
                U64 px = load<U64>(rgba);

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #else
                U32 R = load_3<U32>(rgb+0),
                    G = load_3<U32>(rgb+1),
                    B = load_3<U32>(rgb+2);
                // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
                r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
                g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
                b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #else
                U64 px = swap_endian_16x4(load<U64>(rgba));

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_hhh:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #else
                U16 R = load_3<U16>(rgb+0),
                    G = load_3<U16>(rgb+1),
                    B = load_3<U16>(rgb+2);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
            } break;

            case Op_load_hhhh:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #else
                U64 px = load<U64>(rgba);
                U16 R = cast<U16>((px >>  0) & 0xffff),
                    G = cast<U16>((px >> 16) & 0xffff),
                    B = cast<U16>((px >> 32) & 0xffff),
                    A = cast<U16>((px >> 48) & 0xffff);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
                a = F_from_Half(A);
            } break;

            case Op_load_fff:{
                uintptr_t ptr = (uintptr_t)(src + 12*i);
                assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
                const float* rgb = (const float*)ptr;       // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x3_t lo = vld3q_f32(rgb +  0),
                              hi = vld3q_f32(rgb + 12);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
            #elif defined(USING_NEON)
                float32x4x3_t v = vld3q_f32(rgb);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
            #else
                r = load_3<F>(rgb+0);
                g = load_3<F>(rgb+1);
                b = load_3<F>(rgb+2);
            #endif
            } break;

            case Op_load_ffff:{
                uintptr_t ptr = (uintptr_t)(src + 16*i);
                assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
                const float* rgba = (const float*)ptr;      // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = vld4q_f32(rgba +  0),
                              hi = vld4q_f32(rgba + 16);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
                a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
            #elif defined(USING_NEON)
                float32x4x4_t v = vld4q_f32(rgba);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
                a = (F)v.val[3];
            #else
                r = load_4<F>(rgba+0);
                g = load_4<F>(rgba+1);
                b = load_4<F>(rgba+2);
                a = load_4<F>(rgba+3);
            #endif
            } break;

            case Op_swap_rb:{
                F t = r;
                r = b;
                b = t;
            } break;

            case Op_clamp:{
                r = max_(F0, min_(r, F1));
                g = max_(F0, min_(g, F1));
                b = max_(F0, min_(b, F1));
                a = max_(F0, min_(a, F1));
            } break;

            case Op_invert:{
                r = F1 - r;
                g = F1 - g;
                b = F1 - b;
                a = F1 - a;
            } break;

            case Op_force_opaque:{
                a = F1;
            } break;

            case Op_premul:{
                r *= a;
                g *= a;
                b *= a;
            } break;

            case Op_unpremul:{
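                // F1 / a is +inf when a == 0, so the comparison against INFINITY_
                // fails and we pick scale == 0, mapping fully transparent pixels
                // to 0 instead of inf or NaN.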
                F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
                r *= scale;
                g *= scale;
                b *= scale;
            } break;

            case Op_matrix_3x3:{
                const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[2]*b,
                  G = m[3]*r + m[4]*g + m[5]*b,
                  B = m[6]*r + m[7]*g + m[8]*b;

                r = R;
                g = G;
                b = B;
            } break;

            case Op_matrix_3x4:{
                const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
                  G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
                  B = m[8]*r + m[9]*g + m[10]*b + m[11];

                r = R;
                g = G;
                b = B;
            } break;

            case Op_lab_to_xyz:{
                // The L*a*b values are in r,g,b, but normalized to [0,1].  Reconstruct them:
                F L = r * 100.0f,
                  A = g * 255.0f - 128.0f,
                  B = b * 255.0f - 128.0f;

                // Convert to CIE XYZ.
                F Y = (L + 16.0f) * (1/116.0f),
                  X = Y + A*(1/500.0f),
                  Z = Y - B*(1/200.0f);

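                // 0.008856 ≈ (6/29)^3 and 7.787 ≈ 1/(3*(6/29)^2), the standard
                // CIELAB constants separating the cube-law and linear segments.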
1176                 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1177                 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1178                 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1179 
1180                 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1181                 r = X * 0.9642f;
1182                 g = Y          ;
1183                 b = Z * 0.8249f;
1184             } break;
1185 
1186             // As above, in reverse.
1187             case Op_xyz_to_lab:{
1188                 F X = r * (1/0.9642f),
1189                   Y = g,
1190                   Z = b * (1/0.8249f);
1191 
1192                 X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1193                 Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1194                 Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1195 
1196                 F L = Y*116.0f - 16.0f,
1197                   A = (X-Y)*500.0f,
1198                   B = (Y-Z)*200.0f;
1199 
1200                 r = L * (1/100.f);
1201                 g = (A + 128.0f) * (1/255.0f);
1202                 b = (B + 128.0f) * (1/255.0f);
1203             } break;
1204 
1205             case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1206             case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1207             case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1208             case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1209 
1210             case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1211             case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1212             case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1213             case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1214 
1215             case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1216             case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1217             case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1218             case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1219 
1220             case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1221             case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1222             case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1223             case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1224 
1225             case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1226             case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1227             case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1228             case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1229 
1230             case Op_clut_A2B: {
1231                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1232                 clut(a2b, &r,&g,&b,a);
1233 
1234                 if (a2b->input_channels == 4) {
1235                     // CMYK is opaque.
1236                     a = F1;
1237                 }
1238             } break;
1239 
1240             case Op_clut_B2A: {
1241                 const skcms_B2A* b2a = (const skcms_B2A*) *args++;
1242                 clut(b2a, &r,&g,&b,&a);
1243             } break;
1244 
1245     // Notice, from here on down the store_ ops all return, ending the loop.
1246 
1247             case Op_store_a8: {
1248                 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1249             } return;
1250 
1251             case Op_store_g8: {
1252                 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1253                 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1254             } return;
1255 
1256             case Op_store_4444: {
1257                 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1258                                     | cast<U16>(to_fixed(g * 15) <<  8)
1259                                     | cast<U16>(to_fixed(b * 15) <<  4)
1260                                     | cast<U16>(to_fixed(a * 15) <<  0));
1261             } return;
1262 
1263             case Op_store_565: {
1264                 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 )
1265                                     | cast<U16>(to_fixed(g * 63) <<  5 )
1266                                     | cast<U16>(to_fixed(b * 31) << 11 ));
1267             } return;
1268 
1269             case Op_store_888: {
1270                 uint8_t* rgb = (uint8_t*)dst + 3*i;
1271             #if defined(USING_NEON_FP16)
1272                 // See the explanation under USING_NEON below.  This is that doubled up.
1273                 U16 R = to_fixed(r * 255),
1274                     G = to_fixed(g * 255),
1275                     B = to_fixed(b * 255);
1276 
1277                 uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1278                 vst3q_lane_u8(rgb+ 0, v,  0);
1279                 vst3q_lane_u8(rgb+ 3, v,  2);
1280                 vst3q_lane_u8(rgb+ 6, v,  4);
1281                 vst3q_lane_u8(rgb+ 9, v,  6);
1282 
1283                 vst3q_lane_u8(rgb+12, v,  8);
1284                 vst3q_lane_u8(rgb+15, v, 10);
1285                 vst3q_lane_u8(rgb+18, v, 12);
1286                 vst3q_lane_u8(rgb+21, v, 14);
1287             #elif defined(USING_NEON)
1288                 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1289                 // get there via U16 to save some instructions converting to float.  And just
1290                 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1291                 U16 R = cast<U16>(to_fixed(r * 255)),
1292                     G = cast<U16>(to_fixed(g * 255)),
1293                     B = cast<U16>(to_fixed(b * 255));

                uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
                vst3_lane_u8(rgb+0, v, 0);
                vst3_lane_u8(rgb+3, v, 2);
                vst3_lane_u8(rgb+6, v, 4);
                vst3_lane_u8(rgb+9, v, 6);
            #else
                store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
                store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
                store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
            #endif
            } return;

            case Op_store_8888: {
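                // Stored little-endian, this writes R,G,B,A bytes in increasing address order.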
                store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
                               | cast<U32>(to_fixed(g * 255)) <<  8
                               | cast<U32>(to_fixed(b * 255)) << 16
                               | cast<U32>(to_fixed(a * 255)) << 24);
            } return;

            case Op_store_1010102: {
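                // Alpha keeps only two bits here, quantizing to 0, 1/3, 2/3, or 1.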
                store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
                               | cast<U32>(to_fixed(g * 1023)) << 10
                               | cast<U32>(to_fixed(b * 1023)) << 20
                               | cast<U32>(to_fixed(a *    3)) << 30);
            } return;

            case Op_store_161616LE: {
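                // U16_from_F() (defined earlier in this file) rounds each float
                // channel to a u16 in [0,65535].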
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, U16_from_F(r));
                store_3(rgb+1, U16_from_F(g));
                store_3(rgb+2, U16_from_F(b));
            #endif

            } return;

            case Op_store_16161616LE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                    (uint16x8_t)U16_from_F(a),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                    (uint16x4_t)U16_from_F(a),
                }};
                vst4_u16(rgba, v);
            #else
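                // Pack all four rounded channels into one u64 per pixel; stored
                // little-endian, that leaves r,g,b,a in increasing memory order.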
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, px);
            #endif
            } return;

            case Op_store_161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                }};
                vst3_u16(rgb, v);
            #else
                U32 R = to_fixed(r * 65535),
                    G = to_fixed(g * 65535),
                    B = to_fixed(b * 65535);
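                // This mask-and-shift swaps each u16's bytes for big-endian output,
                // e.g. 0x1234 ~~~> (0x34 << 8) | 0x12 = 0x3412.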
                store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
                store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
                store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
            #endif

            } return;

            case Op_store_16161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                    (uint16x8_t)swap_endian_16(U16_from_F(a)),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, swap_endian_16x4(px));
            #endif
            } return;

            case Op_store_hhh: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.

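                // Half_from_F() (defined earlier in this file) converts each float
                // lane to its half-float bit pattern, carried in a U16.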
                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b);
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, R);
                store_3(rgb+1, G);
                store_3(rgb+2, B);
            #endif
            } return;

            case Op_store_hhhh: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;         // for this cast to uint16_t* to be safe.

                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b),
                    A = Half_from_F(a);
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                    (uint16x8_t)A,
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                    (uint16x4_t)A,
                }};
                vst4_u16(rgba, v);
            #else
                store(rgba, cast<U64>(R) <<  0
                          | cast<U64>(G) << 16
                          | cast<U64>(B) << 32
                          | cast<U64>(A) << 48);
            #endif

            } return;

            case Op_store_fff: {
                uintptr_t ptr = (uintptr_t)(dst + 12*i);
                assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
                float* rgb = (float*)ptr;                // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
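                // Each vector here is 8 half-float lanes, so widen the low and high
                // 4-lane halves to f32 and store two runs of interleaved r,g,b
                // triples, 12 floats each.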
                float32x4x3_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                }};
                vst3q_f32(rgb +  0, lo);
                vst3q_f32(rgb + 12, hi);
            #elif defined(USING_NEON)
                float32x4x3_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                }};
                vst3q_f32(rgb, v);
            #else
                store_3(rgb+0, r);
                store_3(rgb+1, g);
                store_3(rgb+2, b);
            #endif
            } return;

            case Op_store_ffff: {
                uintptr_t ptr = (uintptr_t)(dst + 16*i);
                assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
                float* rgba = (float*)ptr;               // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                    vcvt_f32_f16(vget_low_f16(a)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                    vcvt_f32_f16(vget_high_f16(a)),
                }};
                vst4q_f32(rgba +  0, lo);
                vst4q_f32(rgba + 16, hi);
            #elif defined(USING_NEON)
                float32x4x4_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                    (float32x4_t)a,
                }};
                vst4q_f32(rgba, v);
            #else
                store_4(rgba+0, r);
                store_4(rgba+1, g);
                store_4(rgba+2, b);
                store_4(rgba+3, a);
            #endif
            } return;
        }
    }
}


static void run_program(const Op* program, const void** arguments,
                        const char* src, char* dst, int n,
                        const size_t src_bpp, const size_t dst_bpp) {
    int i = 0;
    while (n >= N) {
        exec_ops(program, arguments, src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {
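        // Bounce the remaining n < N pixels through tmp, which is sized to hold N
        // pixels of our widest format: 4 channels of 4 bytes each.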
        char tmp[4*4*N] = {0};

        memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        exec_ops(program, arguments, tmp, tmp, 0);
        memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
    }
}

// Clean up any #defines we may have set so that we can be #included again.
#if defined(USING_AVX)
    #undef  USING_AVX
#endif
#if defined(USING_AVX_F16C)
    #undef  USING_AVX_F16C
#endif
#if defined(USING_AVX2)
    #undef  USING_AVX2
#endif
#if defined(USING_AVX512F)
    #undef  USING_AVX512F
#endif

#if defined(USING_NEON)
    #undef  USING_NEON
#endif
#if defined(USING_NEON_F16C)
    #undef  USING_NEON_F16C
#endif
#if defined(USING_NEON_FP16)
    #undef  USING_NEON_FP16
#endif

#undef FALLTHROUGH