1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 // Intentionally NO #pragma once... included multiple times.
9
10 // This file is included from skcms.cc in a namespace with some pre-defines:
11 // - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
12 // - V<T>: a template to create a vector of N T's.
13
14 using F = V<Color>; // Called F for historic reasons... maybe rename C?
15 using I32 = V<int32_t>;
16 using U64 = V<uint64_t>;
17 using U32 = V<uint32_t>;
18 using U16 = V<uint16_t>;
19 using U8 = V<uint8_t>;
20
21
22 #if defined(__GNUC__) && !defined(__clang__)
23 // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24 static constexpr F F0 = F() + 0.0f,
25 F1 = F() + 1.0f,
26 FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
27 #else
28 static constexpr F F0 = 0.0f,
29 F1 = 1.0f,
30 FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
31 #endif
32
33 // Instead of checking __AVX__ below, we'll check USING_AVX.
34 // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
35 // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
36
37 #if !defined(USING_AVX) && N == 8 && defined(__AVX__)
38 #define USING_AVX
39 #endif
40 #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
41 #define USING AVX_F16C
42 #endif
43 #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
44 #define USING_AVX2
45 #endif
46 #if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
47 #define USING_AVX512F
48 #endif
49
50 // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
51 // This is more for organizational clarity... skcms.cc doesn't force these.
52 #if N > 1 && defined(__ARM_NEON)
53 #define USING_NEON
54 #if __ARM_FP & 2
55 #define USING_NEON_F16C
56 #endif
57 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
58 #define USING_NEON_FP16
59 #endif
60 #endif
61
62 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
63 // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
64 #if defined(USING_NEON) && defined(__clang__)
65 #pragma clang diagnostic ignored "-Wvector-conversion"
66 #endif
67
68 // GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
69 // You'd see warnings like, "using AVX even though AVX is not enabled".
70 // We stifle these warnings; our helpers that return U64 are always inlined.
71 #if defined(__SSE__) && defined(__GNUC__)
72 #if !defined(__has_warning)
73 #pragma GCC diagnostic ignored "-Wpsabi"
74 #elif __has_warning("-Wpsabi")
75 #pragma GCC diagnostic ignored "-Wpsabi"
76 #endif
77 #endif
78
79 #if defined(__clang__)
80 #define FALLTHROUGH [[clang::fallthrough]]
81 #else
82 #define FALLTHROUGH
83 #endif
84
85 // We tag most helper functions as SI, to enforce good code generation
86 // but also work around what we think is a bug in GCC: when targeting 32-bit
87 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
88 // MMX mm0 register, which seems to mess with unrelated code that later uses
89 // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
90 //
91 // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
92 #if defined(__clang__) || defined(__GNUC__)
93 #define SI static inline __attribute__((always_inline))
94 #else
95 #define SI static inline
96 #endif
97
98 template <typename T, typename P>
load(const P * ptr)99 SI T load(const P* ptr) {
100 T val;
101 small_memcpy(&val, ptr, sizeof(val));
102 return val;
103 }
104 template <typename T, typename P>
store(P * ptr,const T & val)105 SI void store(P* ptr, const T& val) {
106 small_memcpy(ptr, &val, sizeof(val));
107 }
108
109 // (T)v is a cast when N == 1 and a bit-pun when N>1,
110 // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
111 template <typename D, typename S>
cast(const S & v)112 SI D cast(const S& v) {
113 #if N == 1
114 return (D)v;
115 #elif defined(__clang__)
116 return __builtin_convertvector(v, D);
117 #else
118 D d;
119 for (int i = 0; i < N; i++) {
120 d[i] = v[i];
121 }
122 return d;
123 #endif
124 }
125
126 template <typename D, typename S>
bit_pun(const S & v)127 SI D bit_pun(const S& v) {
128 static_assert(sizeof(D) == sizeof(v), "");
129 return load<D>(&v);
130 }
131
132 // When we convert from float to fixed point, it's very common to want to round,
133 // and for some reason compilers generate better code when converting to int32_t.
134 // To serve both those ends, we use this function to_fixed() instead of direct cast().
135 #if defined(USING_NEON_FP16)
136 // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
to_fixed(F f)137 SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
138 #else
to_fixed(F f)139 SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
140 #endif
141
142
143 // Sometimes we do something crazy on one branch of a conditonal,
144 // like divide by zero or convert a huge float to an integer,
145 // but then harmlessly select the other side. That trips up N==1
146 // sanitizer builds, so we make if_then_else() a macro to avoid
147 // evaluating the unused side.
148
149 #if N == 1
150 #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
151 #else
152 template <typename C, typename T>
if_then_else(C cond,T t,T e)153 SI T if_then_else(C cond, T t, T e) {
154 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
155 (~cond & bit_pun<C>(e)) );
156 }
157 #endif
158
159
F_from_Half(U16 half)160 SI F F_from_Half(U16 half) {
161 #if defined(USING_NEON_FP16)
162 return bit_pun<F>(half);
163 #elif defined(USING_NEON_F16C)
164 return vcvt_f32_f16((float16x4_t)half);
165 #elif defined(USING_AVX512F)
166 return (F)_mm512_cvtph_ps((__m256i)half);
167 #elif defined(USING_AVX_F16C)
168 typedef int16_t __attribute__((vector_size(16))) I16;
169 return __builtin_ia32_vcvtph2ps256((I16)half);
170 #else
171 U32 wide = cast<U32>(half);
172 // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
173 U32 s = wide & 0x8000,
174 em = wide ^ s;
175
176 // Constructing the float is easy if the half is not denormalized.
177 F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
178
179 // Simply flush all denorm half floats to zero.
180 return if_then_else(em < 0x0400, F0, norm);
181 #endif
182 }
183
184 #if defined(__clang__)
185 // The -((127-15)<<10) underflows that side of the math when
186 // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
187 __attribute__((no_sanitize("unsigned-integer-overflow")))
188 #endif
Half_from_F(F f)189 SI U16 Half_from_F(F f) {
190 #if defined(USING_NEON_FP16)
191 return bit_pun<U16>(f);
192 #elif defined(USING_NEON_F16C)
193 return (U16)vcvt_f16_f32(f);
194 #elif defined(USING_AVX512F)
195 return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
196 #elif defined(USING_AVX_F16C)
197 return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
198 #else
199 // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
200 U32 sem = bit_pun<U32>(f),
201 s = sem & 0x80000000,
202 em = sem ^ s;
203
204 // For simplicity we flush denorm half floats (including all denorm floats) to zero.
205 return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
206 , (s>>16) + (em>>13) - ((127-15)<<10)));
207 #endif
208 }
209
210 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
211 #if defined(USING_NEON_FP16)
swap_endian_16(U16 v)212 SI U16 swap_endian_16(U16 v) {
213 return (U16)vrev16q_u8((uint8x16_t) v);
214 }
215 #elif defined(USING_NEON)
swap_endian_16(U16 v)216 SI U16 swap_endian_16(U16 v) {
217 return (U16)vrev16_u8((uint8x8_t) v);
218 }
219 #endif
220
swap_endian_16x4(const U64 & rgba)221 SI U64 swap_endian_16x4(const U64& rgba) {
222 return (rgba & 0x00ff00ff00ff00ff) << 8
223 | (rgba & 0xff00ff00ff00ff00) >> 8;
224 }
225
226 #if defined(USING_NEON_FP16)
min_(F x,F y)227 SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
max_(F x,F y)228 SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
229 #elif defined(USING_NEON)
min_(F x,F y)230 SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
max_(F x,F y)231 SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
232 #else
min_(F x,F y)233 SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
max_(F x,F y)234 SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
235 #endif
236
floor_(F x)237 SI F floor_(F x) {
238 #if N == 1
239 return floorf_(x);
240 #elif defined(USING_NEON_FP16)
241 return vrndmq_f16(x);
242 #elif defined(__aarch64__)
243 return vrndmq_f32(x);
244 #elif defined(USING_AVX512F)
245 // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
246 // and integer santizer catches that this implicit cast changes the
247 // value from -1 to 65535. We'll cast manually to work around it.
248 // Read this as `return _mm512_floor_ps(x)`.
249 return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
250 #elif defined(USING_AVX)
251 return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
252 #elif defined(__SSE4_1__)
253 return _mm_floor_ps(x);
254 #else
255 // Round trip through integers with a truncating cast.
256 F roundtrip = cast<F>(cast<I32>(x));
257 // If x is negative, truncating gives the ceiling instead of the floor.
258 return roundtrip - if_then_else(roundtrip > x, F1, F0);
259
260 // This implementation fails for values of x that are outside
261 // the range an integer can represent. We expect most x to be small.
262 #endif
263 }
264
approx_log2(F x)265 SI F approx_log2(F x) {
266 #if defined(USING_NEON_FP16)
267 // TODO(mtklein)
268 return x;
269 #else
270 // The first approximation of log2(x) is its exponent 'e', minus 127.
271 I32 bits = bit_pun<I32>(x);
272
273 F e = cast<F>(bits) * (1.0f / (1<<23));
274
275 // If we use the mantissa too we can refine the error signficantly.
276 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
277
278 return e - 124.225514990f
279 - 1.498030302f*m
280 - 1.725879990f/(0.3520887068f + m);
281 #endif
282 }
283
approx_log(F x)284 SI F approx_log(F x) {
285 const float ln2 = 0.69314718f;
286 return ln2 * approx_log2(x);
287 }
288
approx_exp2(F x)289 SI F approx_exp2(F x) {
290 #if defined(USING_NEON_FP16)
291 // TODO(mtklein)
292 return x;
293 #else
294 F fract = x - floor_(x);
295
296 F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
297 - 1.490129070f*fract
298 + 27.728023300f/(4.84252568f - fract));
299 I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
300
301 return bit_pun<F>(bits);
302 #endif
303 }
304
approx_pow(F x,float y)305 SI F approx_pow(F x, float y) {
306 return if_then_else((x == F0) | (x == F1), x
307 , approx_exp2(approx_log2(x) * y));
308 }
309
approx_exp(F x)310 SI F approx_exp(F x) {
311 const float log2_e = 1.4426950408889634074f;
312 return approx_exp2(log2_e * x);
313 }
314
315 // Return tf(x).
apply_tf(const skcms_TransferFunction * tf,F x)316 SI F apply_tf(const skcms_TransferFunction* tf, F x) {
317 #if defined(USING_NEON_FP16)
318 // TODO(mtklein)
319 (void)tf;
320 return x;
321 #else
322 // Peel off the sign bit and set x = |x|.
323 U32 bits = bit_pun<U32>(x),
324 sign = bits & 0x80000000;
325 x = bit_pun<F>(bits ^ sign);
326
327 // The transfer function has a linear part up to d, exponential at d and after.
328 F v = if_then_else(x < tf->d, tf->c*x + tf->f
329 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
330
331 // Tack the sign bit back on.
332 return bit_pun<F>(sign | bit_pun<U32>(v));
333 #endif
334 }
335
apply_pq(const skcms_TransferFunction * tf,F x)336 SI F apply_pq(const skcms_TransferFunction* tf, F x) {
337 #if defined(USING_NEON_FP16)
338 // TODO(mtklein)
339 (void)tf;
340 return x;
341 #else
342 U32 bits = bit_pun<U32>(x),
343 sign = bits & 0x80000000;
344 x = bit_pun<F>(bits ^ sign);
345
346 F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
347 / (tf->d + tf->e * approx_pow(x, tf->c)),
348 tf->f);
349
350 return bit_pun<F>(sign | bit_pun<U32>(v));
351 #endif
352 }
353
apply_hlg(const skcms_TransferFunction * tf,F x)354 SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
355 #if defined(USING_NEON_FP16)
356 // TODO(mtklein)
357 (void)tf;
358 return x;
359 #else
360 const float R = tf->a, G = tf->b,
361 a = tf->c, b = tf->d, c = tf->e,
362 K = tf->f + 1;
363 U32 bits = bit_pun<U32>(x),
364 sign = bits & 0x80000000;
365 x = bit_pun<F>(bits ^ sign);
366
367 F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
368 , approx_exp((x-c)*a) + b);
369
370 return K*bit_pun<F>(sign | bit_pun<U32>(v));
371 #endif
372 }
373
apply_hlginv(const skcms_TransferFunction * tf,F x)374 SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
375 #if defined(USING_NEON_FP16)
376 // TODO(mtklein)
377 (void)tf;
378 return x;
379 #else
380 const float R = tf->a, G = tf->b,
381 a = tf->c, b = tf->d, c = tf->e,
382 K = tf->f + 1;
383 U32 bits = bit_pun<U32>(x),
384 sign = bits & 0x80000000;
385 x = bit_pun<F>(bits ^ sign);
386 x /= K;
387
388 F v = if_then_else(x <= 1, R * approx_pow(x, G)
389 , a * approx_log(x - b) + c);
390
391 return bit_pun<F>(sign | bit_pun<U32>(v));
392 #endif
393 }
394
395
396 // Strided loads and stores of N values, starting from p.
397 template <typename T, typename P>
load_3(const P * p)398 SI T load_3(const P* p) {
399 #if N == 1
400 return (T)p[0];
401 #elif N == 4
402 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
403 #elif N == 8
404 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
405 #elif N == 16
406 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
407 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
408 #endif
409 }
410
411 template <typename T, typename P>
load_4(const P * p)412 SI T load_4(const P* p) {
413 #if N == 1
414 return (T)p[0];
415 #elif N == 4
416 return T{p[ 0],p[ 4],p[ 8],p[12]};
417 #elif N == 8
418 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
419 #elif N == 16
420 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
421 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
422 #endif
423 }
424
425 template <typename T, typename P>
store_3(P * p,const T & v)426 SI void store_3(P* p, const T& v) {
427 #if N == 1
428 p[0] = v;
429 #elif N == 4
430 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
431 #elif N == 8
432 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
433 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
434 #elif N == 16
435 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
436 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
437 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
438 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
439 #endif
440 }
441
442 template <typename T, typename P>
store_4(P * p,const T & v)443 SI void store_4(P* p, const T& v) {
444 #if N == 1
445 p[0] = v;
446 #elif N == 4
447 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
448 #elif N == 8
449 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
450 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
451 #elif N == 16
452 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
453 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
454 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
455 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
456 #endif
457 }
458
459
gather_8(const uint8_t * p,I32 ix)460 SI U8 gather_8(const uint8_t* p, I32 ix) {
461 #if N == 1
462 U8 v = p[ix];
463 #elif N == 4
464 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
465 #elif N == 8
466 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
467 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
468 #elif N == 16
469 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
470 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
471 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
472 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
473 #endif
474 return v;
475 }
476
gather_16(const uint8_t * p,I32 ix)477 SI U16 gather_16(const uint8_t* p, I32 ix) {
478 // Load the i'th 16-bit value from p.
479 auto load_16 = [p](int i) {
480 return load<uint16_t>(p + 2*i);
481 };
482 #if N == 1
483 U16 v = load_16(ix);
484 #elif N == 4
485 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
486 #elif N == 8
487 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
488 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
489 #elif N == 16
490 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
491 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
492 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
493 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
494 #endif
495 return v;
496 }
497
gather_32(const uint8_t * p,I32 ix)498 SI U32 gather_32(const uint8_t* p, I32 ix) {
499 // Load the i'th 32-bit value from p.
500 auto load_32 = [p](int i) {
501 return load<uint32_t>(p + 4*i);
502 };
503 #if N == 1
504 U32 v = load_32(ix);
505 #elif N == 4
506 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
507 #elif N == 8
508 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
509 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
510 #elif N == 16
511 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
512 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
513 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
514 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
515 #endif
516 // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
517 return v;
518 }
519
gather_24(const uint8_t * p,I32 ix)520 SI U32 gather_24(const uint8_t* p, I32 ix) {
521 // First, back up a byte. Any place we're gathering from has a safe junk byte to read
522 // in front of it, either a previous table value, or some tag metadata.
523 p -= 1;
524
525 // Load the i'th 24-bit value from p, and 1 extra byte.
526 auto load_24_32 = [p](int i) {
527 return load<uint32_t>(p + 3*i);
528 };
529
530 // Now load multiples of 4 bytes (a junk byte, then r,g,b).
531 #if N == 1
532 U32 v = load_24_32(ix);
533 #elif N == 4
534 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
535 #elif N == 8 && !defined(USING_AVX2)
536 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
537 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
538 #elif N == 8
539 (void)load_24_32;
540 // The gather instruction here doesn't need any particular alignment,
541 // but the intrinsic takes a const int*.
542 const int* p4 = bit_pun<const int*>(p);
543 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
544 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
545 #if defined(__clang__)
546 U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
547 #elif defined(__GNUC__)
548 U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
549 #endif
550 #elif N == 16
551 (void)load_24_32;
552 // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
553 // And AVX-512 swapped the order of arguments. :/
554 const int* p4 = bit_pun<const int*>(p);
555 U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
556 #endif
557
558 // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
559 return v >> 8;
560 }
561
562 #if !defined(__arm__)
gather_48(const uint8_t * p,I32 ix,U64 * v)563 SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
564 // As in gather_24(), with everything doubled.
565 p -= 2;
566
567 // Load the i'th 48-bit value from p, and 2 extra bytes.
568 auto load_48_64 = [p](int i) {
569 return load<uint64_t>(p + 6*i);
570 };
571
572 #if N == 1
573 *v = load_48_64(ix);
574 #elif N == 4
575 *v = U64{
576 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
577 };
578 #elif N == 8 && !defined(USING_AVX2)
579 *v = U64{
580 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
581 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
582 };
583 #elif N == 8
584 (void)load_48_64;
585 typedef int32_t __attribute__((vector_size(16))) Half_I32;
586 typedef long long __attribute__((vector_size(32))) Half_I64;
587
588 // The gather instruction here doesn't need any particular alignment,
589 // but the intrinsic takes a const long long*.
590 const long long int* p8 = bit_pun<const long long int*>(p);
591
592 Half_I64 zero = { 0, 0, 0, 0},
593 mask = {-1,-1,-1,-1};
594
595 ix *= 6;
596 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
597 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
598
599 #if defined(__clang__)
600 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
601 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
602 #elif defined(__GNUC__)
603 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
604 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
605 #endif
606 store((char*)v + 0, lo);
607 store((char*)v + 32, hi);
608 #elif N == 16
609 (void)load_48_64;
610 const long long int* p8 = bit_pun<const long long int*>(p);
611 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
612 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
613 store((char*)v + 0, lo);
614 store((char*)v + 64, hi);
615 #endif
616
617 *v >>= 16;
618 }
619 #endif
620
F_from_U8(U8 v)621 SI F F_from_U8(U8 v) {
622 return cast<F>(v) * (1/255.0f);
623 }
624
F_from_U16_BE(U16 v)625 SI F F_from_U16_BE(U16 v) {
626 // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
627 // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
628 U16 lo = (v >> 8),
629 hi = (v << 8) & 0xffff;
630 return cast<F>(lo|hi) * (1/65535.0f);
631 }
632
U16_from_F(F v)633 SI U16 U16_from_F(F v) {
634 // 65535 == inf in FP16, so promote to FP32 before converting.
635 return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
636 }
637
minus_1_ulp(F v)638 SI F minus_1_ulp(F v) {
639 #if defined(USING_NEON_FP16)
640 return bit_pun<F>( bit_pun<U16>(v) - 1 );
641 #else
642 return bit_pun<F>( bit_pun<U32>(v) - 1 );
643 #endif
644 }
645
table(const skcms_Curve * curve,F v)646 SI F table(const skcms_Curve* curve, F v) {
647 // Clamp the input to [0,1], then scale to a table index.
648 F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
649
650 // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
651 I32 lo = cast<I32>( ix ),
652 hi = cast<I32>(minus_1_ulp(ix+1.0f));
653 F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
654
655 // TODO: can we load l and h simultaneously? Each entry in 'h' is either
656 // the same as in 'l' or adjacent. We have a rough idea that's it'd always be safe
657 // to read adjacent entries and perhaps underflow the table by a byte or two
658 // (it'd be junk, but always safe to read). Not sure how to lerp yet.
659 F l,h;
660 if (curve->table_8) {
661 l = F_from_U8(gather_8(curve->table_8, lo));
662 h = F_from_U8(gather_8(curve->table_8, hi));
663 } else {
664 l = F_from_U16_BE(gather_16(curve->table_16, lo));
665 h = F_from_U16_BE(gather_16(curve->table_16, hi));
666 }
667 return l + (h-l)*t;
668 }
669
sample_clut_8(const uint8_t * grid_8,I32 ix,F * r,F * g,F * b)670 SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
671 U32 rgb = gather_24(grid_8, ix);
672
673 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
674 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
675 *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
676 }
677
sample_clut_8(const uint8_t * grid_8,I32 ix,F * r,F * g,F * b,F * a)678 SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
679 // TODO: don't forget to optimize gather_32().
680 U32 rgba = gather_32(grid_8, ix);
681
682 *r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
683 *g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
684 *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
685 *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
686 }
687
sample_clut_16(const uint8_t * grid_16,I32 ix,F * r,F * g,F * b)688 SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
689 #if defined(__arm__)
690 // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
691 *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
692 *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
693 *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
694 #else
695 // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
696 U64 rgb;
697 gather_48(grid_16, ix, &rgb);
698 rgb = swap_endian_16x4(rgb);
699
700 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
701 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
702 *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
703 #endif
704 }
705
sample_clut_16(const uint8_t * grid_16,I32 ix,F * r,F * g,F * b,F * a)706 SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
707 // TODO: gather_64()-based fast path?
708 *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
709 *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
710 *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
711 *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
712 }
713
clut(uint32_t input_channels,uint32_t output_channels,const uint8_t grid_points[4],const uint8_t * grid_8,const uint8_t * grid_16,F * r,F * g,F * b,F * a)714 static void clut(uint32_t input_channels, uint32_t output_channels,
715 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
716 F* r, F* g, F* b, F* a) {
717
718 const int dim = (int)input_channels;
719 assert (0 < dim && dim <= 4);
720 assert (output_channels == 3 ||
721 output_channels == 4);
722
723 // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
724 I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
725 F weight[8]; // Weight for each contribution, again first low, then high.
726
727 // O(dim) work first: calculate index,weight from r,g,b,a.
728 const F inputs[] = { *r,*g,*b,*a };
729 for (int i = dim-1, stride = 1; i >= 0; i--) {
730 // x is where we logically want to sample the grid in the i-th dimension.
731 F x = inputs[i] * (float)(grid_points[i] - 1);
732
733 // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
734 I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
735 hi = cast<I32>(minus_1_ulp(x+1.0f));
736 // Notice how we fold in the accumulated stride across previous dimensions here.
737 index[i+0] = lo * stride;
738 index[i+4] = hi * stride;
739 stride *= grid_points[i];
740
741 // We'll interpolate between those two integer grid points by t.
742 F t = x - cast<F>(lo); // i.e. fract(x)
743 weight[i+0] = 1-t;
744 weight[i+4] = t;
745 }
746
747 *r = *g = *b = F0;
748 if (output_channels == 4) {
749 *a = F0;
750 }
751
752 // We'll sample 2^dim == 1<<dim table entries per pixel,
753 // in all combinations of low and high in each dimension.
754 for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
755
756 // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
757 // where 0 selects the low index contribution and its weight 1-t,
758 // or 4 the high index contribution and its weight t.
759
760 // Since 0<dim≤4, we can always just start off with the 0-th channel,
761 // then handle the others conditionally.
762 I32 ix = index [0 + (combo&1)*4];
763 F w = weight[0 + (combo&1)*4];
764
765 switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
766 case 3: ix += index [3 + (combo&8)/2];
767 w *= weight[3 + (combo&8)/2];
768 FALLTHROUGH;
769 // fall through
770
771 case 2: ix += index [2 + (combo&4)*1];
772 w *= weight[2 + (combo&4)*1];
773 FALLTHROUGH;
774 // fall through
775
776 case 1: ix += index [1 + (combo&2)*2];
777 w *= weight[1 + (combo&2)*2];
778 }
779
780 F R,G,B,A=F0;
781 if (output_channels == 3) {
782 if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
783 else { sample_clut_16(grid_16,ix, &R,&G,&B); }
784 } else {
785 if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
786 else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
787 }
788 *r += w*R;
789 *g += w*G;
790 *b += w*B;
791 *a += w*A;
792 }
793 }
794
clut(const skcms_A2B * a2b,F * r,F * g,F * b,F a)795 static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
796 clut(a2b->input_channels, a2b->output_channels,
797 a2b->grid_points, a2b->grid_8, a2b->grid_16,
798 r,g,b,&a);
799 }
clut(const skcms_B2A * b2a,F * r,F * g,F * b,F * a)800 static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
801 clut(b2a->input_channels, b2a->output_channels,
802 b2a->grid_points, b2a->grid_8, b2a->grid_16,
803 r,g,b,a);
804 }
805
exec_ops(const Op * ops,const void ** args,const char * src,char * dst,int i)806 static void exec_ops(const Op* ops, const void** args,
807 const char* src, char* dst, int i) {
808 F r = F0, g = F0, b = F0, a = F1;
809 while (true) {
810 switch (*ops++) {
811 case Op_load_a8:{
812 a = F_from_U8(load<U8>(src + 1*i));
813 } break;
814
815 case Op_load_g8:{
816 r = g = b = F_from_U8(load<U8>(src + 1*i));
817 } break;
818
819 case Op_load_4444:{
820 U16 abgr = load<U16>(src + 2*i);
821
822 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
823 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
824 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
825 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
826 } break;
827
828 case Op_load_565:{
829 U16 rgb = load<U16>(src + 2*i);
830
831 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
832 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
833 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
834 } break;
835
836 case Op_load_888:{
837 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
838 #if defined(USING_NEON_FP16)
839 // See the explanation under USING_NEON below. This is that doubled up.
840 uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
841 v = vld3q_lane_u8(rgb+ 0, v, 0);
842 v = vld3q_lane_u8(rgb+ 3, v, 2);
843 v = vld3q_lane_u8(rgb+ 6, v, 4);
844 v = vld3q_lane_u8(rgb+ 9, v, 6);
845
846 v = vld3q_lane_u8(rgb+12, v, 8);
847 v = vld3q_lane_u8(rgb+15, v, 10);
848 v = vld3q_lane_u8(rgb+18, v, 12);
849 v = vld3q_lane_u8(rgb+21, v, 14);
850
851 r = cast<F>((U16)v.val[0]) * (1/255.0f);
852 g = cast<F>((U16)v.val[1]) * (1/255.0f);
853 b = cast<F>((U16)v.val[2]) * (1/255.0f);
854 #elif defined(USING_NEON)
855 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
856 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
857 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
858 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
859 v = vld3_lane_u8(rgb+0, v, 0);
860 v = vld3_lane_u8(rgb+3, v, 2);
861 v = vld3_lane_u8(rgb+6, v, 4);
862 v = vld3_lane_u8(rgb+9, v, 6);
863
864 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
865 // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
866 // ARMv7 and ARMv8 impls.)
867 r = cast<F>((U16)v.val[0]) * (1/255.0f);
868 g = cast<F>((U16)v.val[1]) * (1/255.0f);
869 b = cast<F>((U16)v.val[2]) * (1/255.0f);
870 #else
871 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
872 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
873 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
874 #endif
875 } break;
876
877 case Op_load_8888:{
878 U32 rgba = load<U32>(src + 4*i);
879
880 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
881 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
882 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
883 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
884 } break;
885
886 case Op_load_8888_palette8:{
887 const uint8_t* palette = (const uint8_t*) *args++;
888 I32 ix = cast<I32>(load<U8>(src + 1*i));
889 U32 rgba = gather_32(palette, ix);
890
891 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
892 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
893 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
894 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
895 } break;
896
897 case Op_load_1010102:{
898 U32 rgba = load<U32>(src + 4*i);
899
900 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
901 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
902 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
903 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
904 } break;
905
906 case Op_load_101010x_XR:{
907 static constexpr float min = -0.752941f;
908 static constexpr float max = 1.25098f;
909 static constexpr float range = max - min;
910 U32 rgba = load<U32>(src + 4*i);
911 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
912 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
913 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
914 } break;
915
916 case Op_load_161616LE:{
917 uintptr_t ptr = (uintptr_t)(src + 6*i);
918 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
919 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
920 #if defined(USING_NEON_FP16)
921 uint16x8x3_t v = vld3q_u16(rgb);
922 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
923 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
924 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
925 #elif defined(USING_NEON)
926 uint16x4x3_t v = vld3_u16(rgb);
927 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
928 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
929 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
930 #else
931 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
932 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
933 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
934 #endif
935 } break;
936
937 case Op_load_16161616LE:{
938 uintptr_t ptr = (uintptr_t)(src + 8*i);
939 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
940 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
941 #if defined(USING_NEON_FP16)
942 uint16x8x4_t v = vld4q_u16(rgba);
943 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
944 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
945 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
946 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
947 #elif defined(USING_NEON)
948 uint16x4x4_t v = vld4_u16(rgba);
949 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
950 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
951 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
952 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
953 #else
954 U64 px = load<U64>(rgba);
955
956 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
957 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
958 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
959 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
960 #endif
961 } break;
962
963 case Op_load_161616BE:{
964 uintptr_t ptr = (uintptr_t)(src + 6*i);
965 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
966 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
967 #if defined(USING_NEON_FP16)
968 uint16x8x3_t v = vld3q_u16(rgb);
969 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
970 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
971 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
972 #elif defined(USING_NEON)
973 uint16x4x3_t v = vld3_u16(rgb);
974 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
975 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
976 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
977 #else
978 U32 R = load_3<U32>(rgb+0),
979 G = load_3<U32>(rgb+1),
980 B = load_3<U32>(rgb+2);
981 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
982 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
983 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
984 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
985 #endif
986 } break;
987
988 case Op_load_16161616BE:{
989 uintptr_t ptr = (uintptr_t)(src + 8*i);
990 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
991 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
992 #if defined(USING_NEON_FP16)
993 uint16x8x4_t v = vld4q_u16(rgba);
994 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
995 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
996 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
997 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
998 #elif defined(USING_NEON)
999 uint16x4x4_t v = vld4_u16(rgba);
1000 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
1001 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
1002 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
1003 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
1004 #else
1005 U64 px = swap_endian_16x4(load<U64>(rgba));
1006
1007 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
1008 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
1009 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
1010 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
1011 #endif
1012 } break;
1013
1014 case Op_load_hhh:{
1015 uintptr_t ptr = (uintptr_t)(src + 6*i);
1016 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1017 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1018 #if defined(USING_NEON_FP16)
1019 uint16x8x3_t v = vld3q_u16(rgb);
1020 U16 R = (U16)v.val[0],
1021 G = (U16)v.val[1],
1022 B = (U16)v.val[2];
1023 #elif defined(USING_NEON)
1024 uint16x4x3_t v = vld3_u16(rgb);
1025 U16 R = (U16)v.val[0],
1026 G = (U16)v.val[1],
1027 B = (U16)v.val[2];
1028 #else
1029 U16 R = load_3<U16>(rgb+0),
1030 G = load_3<U16>(rgb+1),
1031 B = load_3<U16>(rgb+2);
1032 #endif
1033 r = F_from_Half(R);
1034 g = F_from_Half(G);
1035 b = F_from_Half(B);
1036 } break;
1037
1038 case Op_load_hhhh:{
1039 uintptr_t ptr = (uintptr_t)(src + 8*i);
1040 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1041 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1042 #if defined(USING_NEON_FP16)
1043 uint16x8x4_t v = vld4q_u16(rgba);
1044 U16 R = (U16)v.val[0],
1045 G = (U16)v.val[1],
1046 B = (U16)v.val[2],
1047 A = (U16)v.val[3];
1048 #elif defined(USING_NEON)
1049 uint16x4x4_t v = vld4_u16(rgba);
1050 U16 R = (U16)v.val[0],
1051 G = (U16)v.val[1],
1052 B = (U16)v.val[2],
1053 A = (U16)v.val[3];
1054 #else
1055 U64 px = load<U64>(rgba);
1056 U16 R = cast<U16>((px >> 0) & 0xffff),
1057 G = cast<U16>((px >> 16) & 0xffff),
1058 B = cast<U16>((px >> 32) & 0xffff),
1059 A = cast<U16>((px >> 48) & 0xffff);
1060 #endif
1061 r = F_from_Half(R);
1062 g = F_from_Half(G);
1063 b = F_from_Half(B);
1064 a = F_from_Half(A);
1065 } break;
1066
1067 case Op_load_fff:{
1068 uintptr_t ptr = (uintptr_t)(src + 12*i);
1069 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1070 const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1071 #if defined(USING_NEON_FP16)
1072 float32x4x3_t lo = vld3q_f32(rgb + 0),
1073 hi = vld3q_f32(rgb + 12);
1074 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1075 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1076 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1077 #elif defined(USING_NEON)
1078 float32x4x3_t v = vld3q_f32(rgb);
1079 r = (F)v.val[0];
1080 g = (F)v.val[1];
1081 b = (F)v.val[2];
1082 #else
1083 r = load_3<F>(rgb+0);
1084 g = load_3<F>(rgb+1);
1085 b = load_3<F>(rgb+2);
1086 #endif
1087 } break;
1088
1089 case Op_load_ffff:{
1090 uintptr_t ptr = (uintptr_t)(src + 16*i);
1091 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1092 const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1093 #if defined(USING_NEON_FP16)
1094 float32x4x4_t lo = vld4q_f32(rgba + 0),
1095 hi = vld4q_f32(rgba + 16);
1096 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1097 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1098 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1099 a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1100 #elif defined(USING_NEON)
1101 float32x4x4_t v = vld4q_f32(rgba);
1102 r = (F)v.val[0];
1103 g = (F)v.val[1];
1104 b = (F)v.val[2];
1105 a = (F)v.val[3];
1106 #else
1107 r = load_4<F>(rgba+0);
1108 g = load_4<F>(rgba+1);
1109 b = load_4<F>(rgba+2);
1110 a = load_4<F>(rgba+3);
1111 #endif
1112 } break;
1113
1114 case Op_swap_rb:{
1115 F t = r;
1116 r = b;
1117 b = t;
1118 } break;
1119
1120 case Op_clamp:{
1121 r = max_(F0, min_(r, F1));
1122 g = max_(F0, min_(g, F1));
1123 b = max_(F0, min_(b, F1));
1124 a = max_(F0, min_(a, F1));
1125 } break;
1126
1127 case Op_invert:{
1128 r = F1 - r;
1129 g = F1 - g;
1130 b = F1 - b;
1131 a = F1 - a;
1132 } break;
1133
1134 case Op_force_opaque:{
1135 a = F1;
1136 } break;
1137
1138 case Op_premul:{
1139 r *= a;
1140 g *= a;
1141 b *= a;
1142 } break;
1143
1144 case Op_unpremul:{
1145 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1146 r *= scale;
1147 g *= scale;
1148 b *= scale;
1149 } break;
1150
1151 case Op_matrix_3x3:{
1152 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1153 const float* m = &matrix->vals[0][0];
1154
1155 F R = m[0]*r + m[1]*g + m[2]*b,
1156 G = m[3]*r + m[4]*g + m[5]*b,
1157 B = m[6]*r + m[7]*g + m[8]*b;
1158
1159 r = R;
1160 g = G;
1161 b = B;
1162 } break;
1163
1164 case Op_matrix_3x4:{
1165 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1166 const float* m = &matrix->vals[0][0];
1167
1168 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1169 G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1170 B = m[8]*r + m[9]*g + m[10]*b + m[11];
1171
1172 r = R;
1173 g = G;
1174 b = B;
1175 } break;
1176
1177 case Op_lab_to_xyz:{
1178 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1179 F L = r * 100.0f,
1180 A = g * 255.0f - 128.0f,
1181 B = b * 255.0f - 128.0f;
1182
1183 // Convert to CIE XYZ.
1184 F Y = (L + 16.0f) * (1/116.0f),
1185 X = Y + A*(1/500.0f),
1186 Z = Y - B*(1/200.0f);
1187
1188 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1189 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1190 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1191
1192 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1193 r = X * 0.9642f;
1194 g = Y ;
1195 b = Z * 0.8249f;
1196 } break;
1197
1198 // As above, in reverse.
1199 case Op_xyz_to_lab:{
1200 F X = r * (1/0.9642f),
1201 Y = g,
1202 Z = b * (1/0.8249f);
1203
1204 X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1205 Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1206 Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1207
1208 F L = Y*116.0f - 16.0f,
1209 A = (X-Y)*500.0f,
1210 B = (Y-Z)*200.0f;
1211
1212 r = L * (1/100.f);
1213 g = (A + 128.0f) * (1/255.0f);
1214 b = (B + 128.0f) * (1/255.0f);
1215 } break;
1216
1217 case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1218 case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1219 case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1220 case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1221
1222 case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1223 case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1224 case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1225 case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1226
1227 case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1228 case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1229 case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1230 case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1231
1232 case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1233 case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1234 case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1235 case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1236
1237 case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1238 case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1239 case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1240 case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1241
1242 case Op_clut_A2B: {
1243 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1244 clut(a2b, &r,&g,&b,a);
1245
1246 if (a2b->input_channels == 4) {
1247 // CMYK is opaque.
1248 a = F1;
1249 }
1250 } break;
1251
1252 case Op_clut_B2A: {
1253 const skcms_B2A* b2a = (const skcms_B2A*) *args++;
1254 clut(b2a, &r,&g,&b,&a);
1255 } break;
1256
1257 // Notice, from here on down the store_ ops all return, ending the loop.
1258
1259 case Op_store_a8: {
1260 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1261 } return;
1262
1263 case Op_store_g8: {
1264 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1265 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1266 } return;
1267
1268 case Op_store_4444: {
1269 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1270 | cast<U16>(to_fixed(g * 15) << 8)
1271 | cast<U16>(to_fixed(b * 15) << 4)
1272 | cast<U16>(to_fixed(a * 15) << 0));
1273 } return;
1274
1275 case Op_store_565: {
1276 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1277 | cast<U16>(to_fixed(g * 63) << 5 )
1278 | cast<U16>(to_fixed(b * 31) << 11 ));
1279 } return;
1280
1281 case Op_store_888: {
1282 uint8_t* rgb = (uint8_t*)dst + 3*i;
1283 #if defined(USING_NEON_FP16)
1284 // See the explanation under USING_NEON below. This is that doubled up.
1285 U16 R = to_fixed(r * 255),
1286 G = to_fixed(g * 255),
1287 B = to_fixed(b * 255);
1288
1289 uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1290 vst3q_lane_u8(rgb+ 0, v, 0);
1291 vst3q_lane_u8(rgb+ 3, v, 2);
1292 vst3q_lane_u8(rgb+ 6, v, 4);
1293 vst3q_lane_u8(rgb+ 9, v, 6);
1294
1295 vst3q_lane_u8(rgb+12, v, 8);
1296 vst3q_lane_u8(rgb+15, v, 10);
1297 vst3q_lane_u8(rgb+18, v, 12);
1298 vst3q_lane_u8(rgb+21, v, 14);
1299 #elif defined(USING_NEON)
1300 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1301 // get there via U16 to save some instructions converting to float. And just
1302 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1303 U16 R = cast<U16>(to_fixed(r * 255)),
1304 G = cast<U16>(to_fixed(g * 255)),
1305 B = cast<U16>(to_fixed(b * 255));
1306
1307 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1308 vst3_lane_u8(rgb+0, v, 0);
1309 vst3_lane_u8(rgb+3, v, 2);
1310 vst3_lane_u8(rgb+6, v, 4);
1311 vst3_lane_u8(rgb+9, v, 6);
1312 #else
1313 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1314 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1315 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1316 #endif
1317 } return;
1318
1319 case Op_store_8888: {
1320 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1321 | cast<U32>(to_fixed(g * 255)) << 8
1322 | cast<U32>(to_fixed(b * 255)) << 16
1323 | cast<U32>(to_fixed(a * 255)) << 24);
1324 } return;
1325
1326 case Op_store_101010x_XR: {
1327 static constexpr float min = -0.752941f;
1328 static constexpr float max = 1.25098f;
1329 static constexpr float range = max - min;
1330 store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) << 0
1331 | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
1332 | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
1333 return;
1334 }
1335 case Op_store_1010102: {
1336 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1337 | cast<U32>(to_fixed(g * 1023)) << 10
1338 | cast<U32>(to_fixed(b * 1023)) << 20
1339 | cast<U32>(to_fixed(a * 3)) << 30);
1340 } return;
1341
1342 case Op_store_161616LE: {
1343 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1344 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1345 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1346 #if defined(USING_NEON_FP16)
1347 uint16x8x3_t v = {{
1348 (uint16x8_t)U16_from_F(r),
1349 (uint16x8_t)U16_from_F(g),
1350 (uint16x8_t)U16_from_F(b),
1351 }};
1352 vst3q_u16(rgb, v);
1353 #elif defined(USING_NEON)
1354 uint16x4x3_t v = {{
1355 (uint16x4_t)U16_from_F(r),
1356 (uint16x4_t)U16_from_F(g),
1357 (uint16x4_t)U16_from_F(b),
1358 }};
1359 vst3_u16(rgb, v);
1360 #else
1361 store_3(rgb+0, U16_from_F(r));
1362 store_3(rgb+1, U16_from_F(g));
1363 store_3(rgb+2, U16_from_F(b));
1364 #endif
1365
1366 } return;
1367
1368 case Op_store_16161616LE: {
1369 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1370 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1371 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1372 #if defined(USING_NEON_FP16)
1373 uint16x8x4_t v = {{
1374 (uint16x8_t)U16_from_F(r),
1375 (uint16x8_t)U16_from_F(g),
1376 (uint16x8_t)U16_from_F(b),
1377 (uint16x8_t)U16_from_F(a),
1378 }};
1379 vst4q_u16(rgba, v);
1380 #elif defined(USING_NEON)
1381 uint16x4x4_t v = {{
1382 (uint16x4_t)U16_from_F(r),
1383 (uint16x4_t)U16_from_F(g),
1384 (uint16x4_t)U16_from_F(b),
1385 (uint16x4_t)U16_from_F(a),
1386 }};
1387 vst4_u16(rgba, v);
1388 #else
1389 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1390 | cast<U64>(to_fixed(g * 65535)) << 16
1391 | cast<U64>(to_fixed(b * 65535)) << 32
1392 | cast<U64>(to_fixed(a * 65535)) << 48;
1393 store(rgba, px);
1394 #endif
1395 } return;
1396
1397 case Op_store_161616BE: {
1398 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1399 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1400 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1401 #if defined(USING_NEON_FP16)
1402 uint16x8x3_t v = {{
1403 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1404 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1405 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1406 }};
1407 vst3q_u16(rgb, v);
1408 #elif defined(USING_NEON)
1409 uint16x4x3_t v = {{
1410 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1411 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1412 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1413 }};
1414 vst3_u16(rgb, v);
1415 #else
1416 U32 R = to_fixed(r * 65535),
1417 G = to_fixed(g * 65535),
1418 B = to_fixed(b * 65535);
1419 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1420 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1421 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1422 #endif
1423
1424 } return;
1425
1426 case Op_store_16161616BE: {
1427 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1428 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1429 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1430 #if defined(USING_NEON_FP16)
1431 uint16x8x4_t v = {{
1432 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1433 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1434 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1435 (uint16x8_t)swap_endian_16(U16_from_F(a)),
1436 }};
1437 vst4q_u16(rgba, v);
1438 #elif defined(USING_NEON)
1439 uint16x4x4_t v = {{
1440 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1441 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1442 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1443 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1444 }};
1445 vst4_u16(rgba, v);
1446 #else
1447 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1448 | cast<U64>(to_fixed(g * 65535)) << 16
1449 | cast<U64>(to_fixed(b * 65535)) << 32
1450 | cast<U64>(to_fixed(a * 65535)) << 48;
1451 store(rgba, swap_endian_16x4(px));
1452 #endif
1453 } return;
1454
1455 case Op_store_hhh: {
1456 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1457 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1458 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1459
1460 U16 R = Half_from_F(r),
1461 G = Half_from_F(g),
1462 B = Half_from_F(b);
1463 #if defined(USING_NEON_FP16)
1464 uint16x8x3_t v = {{
1465 (uint16x8_t)R,
1466 (uint16x8_t)G,
1467 (uint16x8_t)B,
1468 }};
1469 vst3q_u16(rgb, v);
1470 #elif defined(USING_NEON)
1471 uint16x4x3_t v = {{
1472 (uint16x4_t)R,
1473 (uint16x4_t)G,
1474 (uint16x4_t)B,
1475 }};
1476 vst3_u16(rgb, v);
1477 #else
1478 store_3(rgb+0, R);
1479 store_3(rgb+1, G);
1480 store_3(rgb+2, B);
1481 #endif
1482 } return;
1483
1484 case Op_store_hhhh: {
1485 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1486 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1487 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1488
1489 U16 R = Half_from_F(r),
1490 G = Half_from_F(g),
1491 B = Half_from_F(b),
1492 A = Half_from_F(a);
1493 #if defined(USING_NEON_FP16)
1494 uint16x8x4_t v = {{
1495 (uint16x8_t)R,
1496 (uint16x8_t)G,
1497 (uint16x8_t)B,
1498 (uint16x8_t)A,
1499 }};
1500 vst4q_u16(rgba, v);
1501 #elif defined(USING_NEON)
1502 uint16x4x4_t v = {{
1503 (uint16x4_t)R,
1504 (uint16x4_t)G,
1505 (uint16x4_t)B,
1506 (uint16x4_t)A,
1507 }};
1508 vst4_u16(rgba, v);
1509 #else
1510 store(rgba, cast<U64>(R) << 0
1511 | cast<U64>(G) << 16
1512 | cast<U64>(B) << 32
1513 | cast<U64>(A) << 48);
1514 #endif
1515
1516 } return;
1517
1518 case Op_store_fff: {
1519 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1520 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1521 float* rgb = (float*)ptr; // for this cast to float* to be safe.
1522 #if defined(USING_NEON_FP16)
1523 float32x4x3_t lo = {{
1524 vcvt_f32_f16(vget_low_f16(r)),
1525 vcvt_f32_f16(vget_low_f16(g)),
1526 vcvt_f32_f16(vget_low_f16(b)),
1527 }}, hi = {{
1528 vcvt_f32_f16(vget_high_f16(r)),
1529 vcvt_f32_f16(vget_high_f16(g)),
1530 vcvt_f32_f16(vget_high_f16(b)),
1531 }};
1532 vst3q_f32(rgb + 0, lo);
1533 vst3q_f32(rgb + 12, hi);
1534 #elif defined(USING_NEON)
1535 float32x4x3_t v = {{
1536 (float32x4_t)r,
1537 (float32x4_t)g,
1538 (float32x4_t)b,
1539 }};
1540 vst3q_f32(rgb, v);
1541 #else
1542 store_3(rgb+0, r);
1543 store_3(rgb+1, g);
1544 store_3(rgb+2, b);
1545 #endif
1546 } return;
1547
1548 case Op_store_ffff: {
1549 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1550 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1551 float* rgba = (float*)ptr; // for this cast to float* to be safe.
1552 #if defined(USING_NEON_FP16)
1553 float32x4x4_t lo = {{
1554 vcvt_f32_f16(vget_low_f16(r)),
1555 vcvt_f32_f16(vget_low_f16(g)),
1556 vcvt_f32_f16(vget_low_f16(b)),
1557 vcvt_f32_f16(vget_low_f16(a)),
1558 }}, hi = {{
1559 vcvt_f32_f16(vget_high_f16(r)),
1560 vcvt_f32_f16(vget_high_f16(g)),
1561 vcvt_f32_f16(vget_high_f16(b)),
1562 vcvt_f32_f16(vget_high_f16(a)),
1563 }};
1564 vst4q_f32(rgba + 0, lo);
1565 vst4q_f32(rgba + 16, hi);
1566 #elif defined(USING_NEON)
1567 float32x4x4_t v = {{
1568 (float32x4_t)r,
1569 (float32x4_t)g,
1570 (float32x4_t)b,
1571 (float32x4_t)a,
1572 }};
1573 vst4q_f32(rgba, v);
1574 #else
1575 store_4(rgba+0, r);
1576 store_4(rgba+1, g);
1577 store_4(rgba+2, b);
1578 store_4(rgba+3, a);
1579 #endif
1580 } return;
1581 }
1582 }
1583 }
1584
1585
run_program(const Op * program,const void ** arguments,const char * src,char * dst,int n,const size_t src_bpp,const size_t dst_bpp)1586 static void run_program(const Op* program, const void** arguments,
1587 const char* src, char* dst, int n,
1588 const size_t src_bpp, const size_t dst_bpp) {
1589 int i = 0;
1590 while (n >= N) {
1591 exec_ops(program, arguments, src, dst, i);
1592 i += N;
1593 n -= N;
1594 }
1595 if (n > 0) {
1596 char tmp[4*4*N] = {0};
1597
1598 memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1599 exec_ops(program, arguments, tmp, tmp, 0);
1600 memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1601 }
1602 }
1603
1604 // Clean up any #defines we may have set so that we can be #included again.
1605 #if defined(USING_AVX)
1606 #undef USING_AVX
1607 #endif
1608 #if defined(USING_AVX_F16C)
1609 #undef USING_AVX_F16C
1610 #endif
1611 #if defined(USING_AVX2)
1612 #undef USING_AVX2
1613 #endif
1614 #if defined(USING_AVX512F)
1615 #undef USING_AVX512F
1616 #endif
1617
1618 #if defined(USING_NEON)
1619 #undef USING_NEON
1620 #endif
1621 #if defined(USING_NEON_F16C)
1622 #undef USING_NEON_F16C
1623 #endif
1624 #if defined(USING_NEON_FP16)
1625 #undef USING_NEON_FP16
1626 #endif
1627
1628 #undef FALLTHROUGH
1629