1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 // Intentionally NO #pragma once... included multiple times.
9
10 // This file is included from skcms.cc in a namespace with some pre-defines:
11 // - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
12 // - V<T>: a template to create a vector of N T's.
13
14 using F = V<Color>; // Called F for historic reasons... maybe rename C?
15 using I32 = V<int32_t>;
16 using U64 = V<uint64_t>;
17 using U32 = V<uint32_t>;
18 using U16 = V<uint16_t>;
19 using U8 = V<uint8_t>;
20
21
22 #if defined(__GNUC__) && !defined(__clang__)
23 // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24 static constexpr F F0 = F() + 0.0f,
25 F1 = F() + 1.0f;
26 #else
27 static constexpr F F0 = 0.0f,
28 F1 = 1.0f;
29 #endif
30
31 // Instead of checking __AVX__ below, we'll check USING_AVX.
32 // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
33 // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
34
35 #if !defined(USING_AVX) && N == 8 && defined(__AVX__)
36 #define USING_AVX
37 #endif
38 #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
39 #define USING AVX_F16C
40 #endif
41 #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
42 #define USING_AVX2
43 #endif
44 #if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
45 #define USING_AVX512F
46 #endif
47
48 // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
49 // This is more for organizational clarity... skcms.cc doesn't force these.
50 #if N > 1 && defined(__ARM_NEON)
51 #define USING_NEON
52 #if __ARM_FP & 2
53 #define USING_NEON_F16C
54 #endif
55 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
56 #define USING_NEON_FP16
57 #endif
58 #endif
59
60 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
61 // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
62 #if defined(USING_NEON) && defined(__clang__)
63 #pragma clang diagnostic ignored "-Wvector-conversion"
64 #endif
65
66 // GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
67 // You'd see warnings like, "using AVX even though AVX is not enabled".
68 // We stifle these warnings; our helpers that return U64 are always inlined.
69 #if defined(__SSE__) && defined(__GNUC__)
70 #pragma GCC diagnostic ignored "-Wpsabi"
71 #endif
72
73 #if defined(__clang__)
74 #define FALLTHROUGH [[clang::fallthrough]]
75 #else
76 #define FALLTHROUGH
77 #endif
78
79 // We tag most helper functions as SI, to enforce good code generation
80 // but also work around what we think is a bug in GCC: when targeting 32-bit
81 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
82 // MMX mm0 register, which seems to mess with unrelated code that later uses
83 // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
84 //
85 // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
86 #if defined(__clang__) || defined(__GNUC__)
87 #define SI static inline __attribute__((always_inline))
88 #else
89 #define SI static inline
90 #endif
91
92 template <typename T, typename P>
load(const P * ptr)93 SI T load(const P* ptr) {
94 T val;
95 small_memcpy(&val, ptr, sizeof(val));
96 return val;
97 }
98 template <typename T, typename P>
store(P * ptr,const T & val)99 SI void store(P* ptr, const T& val) {
100 small_memcpy(ptr, &val, sizeof(val));
101 }
102
103 // (T)v is a cast when N == 1 and a bit-pun when N>1,
104 // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
105 template <typename D, typename S>
cast(const S & v)106 SI D cast(const S& v) {
107 #if N == 1
108 return (D)v;
109 #elif defined(__clang__)
110 return __builtin_convertvector(v, D);
111 #else
112 D d;
113 for (int i = 0; i < N; i++) {
114 d[i] = v[i];
115 }
116 return d;
117 #endif
118 }
119
120 template <typename D, typename S>
bit_pun(const S & v)121 SI D bit_pun(const S& v) {
122 static_assert(sizeof(D) == sizeof(v), "");
123 return load<D>(&v);
124 }
125
126 // When we convert from float to fixed point, it's very common to want to round,
127 // and for some reason compilers generate better code when converting to int32_t.
128 // To serve both those ends, we use this function to_fixed() instead of direct cast().
129 #if defined(USING_NEON_FP16)
130 // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
to_fixed(F f)131 SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
132 #else
to_fixed(F f)133 SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
134 #endif
135
136
137 // Sometimes we do something crazy on one branch of a conditonal,
138 // like divide by zero or convert a huge float to an integer,
139 // but then harmlessly select the other side. That trips up N==1
140 // sanitizer builds, so we make if_then_else() a macro to avoid
141 // evaluating the unused side.
142
143 #if N == 1
144 #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
145 #else
146 template <typename C, typename T>
if_then_else(C cond,T t,T e)147 SI T if_then_else(C cond, T t, T e) {
148 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
149 (~cond & bit_pun<C>(e)) );
150 }
151 #endif
152
153
F_from_Half(U16 half)154 SI F F_from_Half(U16 half) {
155 #if defined(USING_NEON_FP16)
156 return bit_pun<F>(half);
157 #elif defined(USING_NEON_F16C)
158 return vcvt_f32_f16((float16x4_t)half);
159 #elif defined(USING_AVX512F)
160 return (F)_mm512_cvtph_ps((__m256i)half);
161 #elif defined(USING_AVX_F16C)
162 typedef int16_t __attribute__((vector_size(16))) I16;
163 return __builtin_ia32_vcvtph2ps256((I16)half);
164 #else
165 U32 wide = cast<U32>(half);
166 // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
167 U32 s = wide & 0x8000,
168 em = wide ^ s;
169
170 // Constructing the float is easy if the half is not denormalized.
171 F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
172
173 // Simply flush all denorm half floats to zero.
174 return if_then_else(em < 0x0400, F0, norm);
175 #endif
176 }
177
178 #if defined(__clang__)
179 // The -((127-15)<<10) underflows that side of the math when
180 // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
181 __attribute__((no_sanitize("unsigned-integer-overflow")))
182 #endif
Half_from_F(F f)183 SI U16 Half_from_F(F f) {
184 #if defined(USING_NEON_FP16)
185 return bit_pun<U16>(f);
186 #elif defined(USING_NEON_F16C)
187 return (U16)vcvt_f16_f32(f);
188 #elif defined(USING_AVX512F)
189 return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
190 #elif defined(USING_AVX_F16C)
191 return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
192 #else
193 // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
194 U32 sem = bit_pun<U32>(f),
195 s = sem & 0x80000000,
196 em = sem ^ s;
197
198 // For simplicity we flush denorm half floats (including all denorm floats) to zero.
199 return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
200 , (s>>16) + (em>>13) - ((127-15)<<10)));
201 #endif
202 }
203
204 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
205 #if defined(USING_NEON_FP16)
swap_endian_16(U16 v)206 SI U16 swap_endian_16(U16 v) {
207 return (U16)vrev16q_u8((uint8x16_t) v);
208 }
209 #elif defined(USING_NEON)
swap_endian_16(U16 v)210 SI U16 swap_endian_16(U16 v) {
211 return (U16)vrev16_u8((uint8x8_t) v);
212 }
213 #endif
214
swap_endian_16x4(const U64 & rgba)215 SI U64 swap_endian_16x4(const U64& rgba) {
216 return (rgba & 0x00ff00ff00ff00ff) << 8
217 | (rgba & 0xff00ff00ff00ff00) >> 8;
218 }
219
220 #if defined(USING_NEON_FP16)
min_(F x,F y)221 SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
max_(F x,F y)222 SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
223 #elif defined(USING_NEON)
min_(F x,F y)224 SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
max_(F x,F y)225 SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
226 #else
min_(F x,F y)227 SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
max_(F x,F y)228 SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
229 #endif
230
floor_(F x)231 SI F floor_(F x) {
232 #if N == 1
233 return floorf_(x);
234 #elif defined(USING_NEON_FP16)
235 return vrndmq_f16(x);
236 #elif defined(__aarch64__)
237 return vrndmq_f32(x);
238 #elif defined(USING_AVX512F)
239 // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
240 // and integer santizer catches that this implicit cast changes the
241 // value from -1 to 65535. We'll cast manually to work around it.
242 // Read this as `return _mm512_floor_ps(x)`.
243 return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
244 #elif defined(USING_AVX)
245 return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
246 #elif defined(__SSE4_1__)
247 return _mm_floor_ps(x);
248 #else
249 // Round trip through integers with a truncating cast.
250 F roundtrip = cast<F>(cast<I32>(x));
251 // If x is negative, truncating gives the ceiling instead of the floor.
252 return roundtrip - if_then_else(roundtrip > x, F1, F0);
253
254 // This implementation fails for values of x that are outside
255 // the range an integer can represent. We expect most x to be small.
256 #endif
257 }
258
approx_log2(F x)259 SI F approx_log2(F x) {
260 #if defined(USING_NEON_FP16)
261 // TODO(mtklein)
262 return x;
263 #else
264 // The first approximation of log2(x) is its exponent 'e', minus 127.
265 I32 bits = bit_pun<I32>(x);
266
267 F e = cast<F>(bits) * (1.0f / (1<<23));
268
269 // If we use the mantissa too we can refine the error signficantly.
270 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
271
272 return e - 124.225514990f
273 - 1.498030302f*m
274 - 1.725879990f/(0.3520887068f + m);
275 #endif
276 }
277
approx_log(F x)278 SI F approx_log(F x) {
279 const float ln2 = 0.69314718f;
280 return ln2 * approx_log2(x);
281 }
282
approx_exp2(F x)283 SI F approx_exp2(F x) {
284 #if defined(USING_NEON_FP16)
285 // TODO(mtklein)
286 return x;
287 #else
288 F fract = x - floor_(x);
289
290 F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
291 - 1.490129070f*fract
292 + 27.728023300f/(4.84252568f - fract));
293 I32 bits = cast<I32>(max_(fbits, F0));
294
295 return bit_pun<F>(bits);
296 #endif
297 }
298
approx_pow(F x,float y)299 SI F approx_pow(F x, float y) {
300 return if_then_else((x == F0) | (x == F1), x
301 , approx_exp2(approx_log2(x) * y));
302 }
303
approx_exp(F x)304 SI F approx_exp(F x) {
305 const float log2_e = 1.4426950408889634074f;
306 return approx_exp2(log2_e * x);
307 }
308
309 // Return tf(x).
apply_tf(const skcms_TransferFunction * tf,F x)310 SI F apply_tf(const skcms_TransferFunction* tf, F x) {
311 #if defined(USING_NEON_FP16)
312 // TODO(mtklein)
313 (void)tf;
314 return x;
315 #else
316 // Peel off the sign bit and set x = |x|.
317 U32 bits = bit_pun<U32>(x),
318 sign = bits & 0x80000000;
319 x = bit_pun<F>(bits ^ sign);
320
321 // The transfer function has a linear part up to d, exponential at d and after.
322 F v = if_then_else(x < tf->d, tf->c*x + tf->f
323 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
324
325 // Tack the sign bit back on.
326 return bit_pun<F>(sign | bit_pun<U32>(v));
327 #endif
328 }
329
apply_pq(const skcms_TransferFunction * tf,F x)330 SI F apply_pq(const skcms_TransferFunction* tf, F x) {
331 #if defined(USING_NEON_FP16)
332 // TODO(mtklein)
333 (void)tf;
334 return x;
335 #else
336 U32 bits = bit_pun<U32>(x),
337 sign = bits & 0x80000000;
338 x = bit_pun<F>(bits ^ sign);
339
340 F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
341 / (tf->d + tf->e * approx_pow(x, tf->c)),
342 tf->f);
343
344 return bit_pun<F>(sign | bit_pun<U32>(v));
345 #endif
346 }
347
apply_hlg(const skcms_TransferFunction * tf,F x)348 SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
349 #if defined(USING_NEON_FP16)
350 // TODO(mtklein)
351 (void)tf;
352 return x;
353 #else
354 const float R = tf->a, G = tf->b,
355 a = tf->c, b = tf->d, c = tf->e,
356 K = tf->f + 1;
357 U32 bits = bit_pun<U32>(x),
358 sign = bits & 0x80000000;
359 x = bit_pun<F>(bits ^ sign);
360
361 F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
362 , approx_exp((x-c)*a) + b);
363
364 return K*bit_pun<F>(sign | bit_pun<U32>(v));
365 #endif
366 }
367
apply_hlginv(const skcms_TransferFunction * tf,F x)368 SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
369 #if defined(USING_NEON_FP16)
370 // TODO(mtklein)
371 (void)tf;
372 return x;
373 #else
374 const float R = tf->a, G = tf->b,
375 a = tf->c, b = tf->d, c = tf->e,
376 K = tf->f + 1;
377 U32 bits = bit_pun<U32>(x),
378 sign = bits & 0x80000000;
379 x = bit_pun<F>(bits ^ sign);
380 x /= K;
381
382 F v = if_then_else(x <= 1, R * approx_pow(x, G)
383 , a * approx_log(x - b) + c);
384
385 return bit_pun<F>(sign | bit_pun<U32>(v));
386 #endif
387 }
388
389
390 // Strided loads and stores of N values, starting from p.
391 template <typename T, typename P>
load_3(const P * p)392 SI T load_3(const P* p) {
393 #if N == 1
394 return (T)p[0];
395 #elif N == 4
396 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
397 #elif N == 8
398 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
399 #elif N == 16
400 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
401 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
402 #endif
403 }
404
405 template <typename T, typename P>
load_4(const P * p)406 SI T load_4(const P* p) {
407 #if N == 1
408 return (T)p[0];
409 #elif N == 4
410 return T{p[ 0],p[ 4],p[ 8],p[12]};
411 #elif N == 8
412 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
413 #elif N == 16
414 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
415 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
416 #endif
417 }
418
419 template <typename T, typename P>
store_3(P * p,const T & v)420 SI void store_3(P* p, const T& v) {
421 #if N == 1
422 p[0] = v;
423 #elif N == 4
424 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
425 #elif N == 8
426 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
427 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
428 #elif N == 16
429 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
430 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
431 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
432 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
433 #endif
434 }
435
436 template <typename T, typename P>
store_4(P * p,const T & v)437 SI void store_4(P* p, const T& v) {
438 #if N == 1
439 p[0] = v;
440 #elif N == 4
441 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
442 #elif N == 8
443 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
444 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
445 #elif N == 16
446 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
447 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
448 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
449 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
450 #endif
451 }
452
453
gather_8(const uint8_t * p,I32 ix)454 SI U8 gather_8(const uint8_t* p, I32 ix) {
455 #if N == 1
456 U8 v = p[ix];
457 #elif N == 4
458 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
459 #elif N == 8
460 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
461 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
462 #elif N == 16
463 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
464 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
465 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
466 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
467 #endif
468 return v;
469 }
470
gather_16(const uint8_t * p,I32 ix)471 SI U16 gather_16(const uint8_t* p, I32 ix) {
472 // Load the i'th 16-bit value from p.
473 auto load_16 = [p](int i) {
474 return load<uint16_t>(p + 2*i);
475 };
476 #if N == 1
477 U16 v = load_16(ix);
478 #elif N == 4
479 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
480 #elif N == 8
481 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
482 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
483 #elif N == 16
484 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
485 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
486 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
487 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
488 #endif
489 return v;
490 }
491
gather_32(const uint8_t * p,I32 ix)492 SI U32 gather_32(const uint8_t* p, I32 ix) {
493 // Load the i'th 32-bit value from p.
494 auto load_32 = [p](int i) {
495 return load<uint32_t>(p + 4*i);
496 };
497 #if N == 1
498 U32 v = load_32(ix);
499 #elif N == 4
500 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
501 #elif N == 8
502 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
503 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
504 #elif N == 16
505 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
506 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
507 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
508 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
509 #endif
510 // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
511 return v;
512 }
513
gather_24(const uint8_t * p,I32 ix)514 SI U32 gather_24(const uint8_t* p, I32 ix) {
515 // First, back up a byte. Any place we're gathering from has a safe junk byte to read
516 // in front of it, either a previous table value, or some tag metadata.
517 p -= 1;
518
519 // Load the i'th 24-bit value from p, and 1 extra byte.
520 auto load_24_32 = [p](int i) {
521 return load<uint32_t>(p + 3*i);
522 };
523
524 // Now load multiples of 4 bytes (a junk byte, then r,g,b).
525 #if N == 1
526 U32 v = load_24_32(ix);
527 #elif N == 4
528 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
529 #elif N == 8 && !defined(USING_AVX2)
530 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
531 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
532 #elif N == 8
533 (void)load_24_32;
534 // The gather instruction here doesn't need any particular alignment,
535 // but the intrinsic takes a const int*.
536 const int* p4 = bit_pun<const int*>(p);
537 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
538 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
539 #if defined(__clang__)
540 U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
541 #elif defined(__GNUC__)
542 U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
543 #endif
544 #elif N == 16
545 (void)load_24_32;
546 // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
547 // And AVX-512 swapped the order of arguments. :/
548 const int* p4 = bit_pun<const int*>(p);
549 U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
550 #endif
551
552 // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
553 return v >> 8;
554 }
555
556 #if !defined(__arm__)
gather_48(const uint8_t * p,I32 ix,U64 * v)557 SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
558 // As in gather_24(), with everything doubled.
559 p -= 2;
560
561 // Load the i'th 48-bit value from p, and 2 extra bytes.
562 auto load_48_64 = [p](int i) {
563 return load<uint64_t>(p + 6*i);
564 };
565
566 #if N == 1
567 *v = load_48_64(ix);
568 #elif N == 4
569 *v = U64{
570 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
571 };
572 #elif N == 8 && !defined(USING_AVX2)
573 *v = U64{
574 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
575 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
576 };
577 #elif N == 8
578 (void)load_48_64;
579 typedef int32_t __attribute__((vector_size(16))) Half_I32;
580 typedef long long __attribute__((vector_size(32))) Half_I64;
581
582 // The gather instruction here doesn't need any particular alignment,
583 // but the intrinsic takes a const long long*.
584 const long long int* p8 = bit_pun<const long long int*>(p);
585
586 Half_I64 zero = { 0, 0, 0, 0},
587 mask = {-1,-1,-1,-1};
588
589 ix *= 6;
590 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
591 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
592
593 #if defined(__clang__)
594 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
595 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
596 #elif defined(__GNUC__)
597 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
598 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
599 #endif
600 store((char*)v + 0, lo);
601 store((char*)v + 32, hi);
602 #elif N == 16
603 (void)load_48_64;
604 const long long int* p8 = bit_pun<const long long int*>(p);
605 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
606 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
607 store((char*)v + 0, lo);
608 store((char*)v + 64, hi);
609 #endif
610
611 *v >>= 16;
612 }
613 #endif
614
F_from_U8(U8 v)615 SI F F_from_U8(U8 v) {
616 return cast<F>(v) * (1/255.0f);
617 }
618
F_from_U16_BE(U16 v)619 SI F F_from_U16_BE(U16 v) {
620 // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
621 // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
622 U16 lo = (v >> 8),
623 hi = (v << 8) & 0xffff;
624 return cast<F>(lo|hi) * (1/65535.0f);
625 }
626
U16_from_F(F v)627 SI U16 U16_from_F(F v) {
628 // 65535 == inf in FP16, so promote to FP32 before converting.
629 return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
630 }
631
minus_1_ulp(F v)632 SI F minus_1_ulp(F v) {
633 #if defined(USING_NEON_FP16)
634 return bit_pun<F>( bit_pun<U16>(v) - 1 );
635 #else
636 return bit_pun<F>( bit_pun<U32>(v) - 1 );
637 #endif
638 }
639
table(const skcms_Curve * curve,F v)640 SI F table(const skcms_Curve* curve, F v) {
641 // Clamp the input to [0,1], then scale to a table index.
642 F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
643
644 // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
645 I32 lo = cast<I32>( ix ),
646 hi = cast<I32>(minus_1_ulp(ix+1.0f));
647 F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
648
649 // TODO: can we load l and h simultaneously? Each entry in 'h' is either
650 // the same as in 'l' or adjacent. We have a rough idea that's it'd always be safe
651 // to read adjacent entries and perhaps underflow the table by a byte or two
652 // (it'd be junk, but always safe to read). Not sure how to lerp yet.
653 F l,h;
654 if (curve->table_8) {
655 l = F_from_U8(gather_8(curve->table_8, lo));
656 h = F_from_U8(gather_8(curve->table_8, hi));
657 } else {
658 l = F_from_U16_BE(gather_16(curve->table_16, lo));
659 h = F_from_U16_BE(gather_16(curve->table_16, hi));
660 }
661 return l + (h-l)*t;
662 }
663
sample_clut_8(const uint8_t * grid_8,I32 ix,F * r,F * g,F * b)664 SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
665 U32 rgb = gather_24(grid_8, ix);
666
667 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
668 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
669 *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
670 }
671
sample_clut_8(const uint8_t * grid_8,I32 ix,F * r,F * g,F * b,F * a)672 SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
673 // TODO: don't forget to optimize gather_32().
674 U32 rgba = gather_32(grid_8, ix);
675
676 *r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
677 *g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
678 *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
679 *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
680 }
681
sample_clut_16(const uint8_t * grid_16,I32 ix,F * r,F * g,F * b)682 SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
683 #if defined(__arm__)
684 // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
685 *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
686 *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
687 *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
688 #else
689 // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
690 U64 rgb;
691 gather_48(grid_16, ix, &rgb);
692 rgb = swap_endian_16x4(rgb);
693
694 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
695 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
696 *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
697 #endif
698 }
699
sample_clut_16(const uint8_t * grid_16,I32 ix,F * r,F * g,F * b,F * a)700 SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
701 // TODO: gather_64()-based fast path?
702 *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
703 *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
704 *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
705 *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
706 }
707
clut(uint32_t input_channels,uint32_t output_channels,const uint8_t grid_points[4],const uint8_t * grid_8,const uint8_t * grid_16,F * r,F * g,F * b,F * a)708 static void clut(uint32_t input_channels, uint32_t output_channels,
709 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
710 F* r, F* g, F* b, F* a) {
711
712 const int dim = (int)input_channels;
713 assert (0 < dim && dim <= 4);
714 assert (output_channels == 3 ||
715 output_channels == 4);
716
717 // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
718 I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
719 F weight[8]; // Weight for each contribution, again first low, then high.
720
721 // O(dim) work first: calculate index,weight from r,g,b,a.
722 const F inputs[] = { *r,*g,*b,*a };
723 for (int i = dim-1, stride = 1; i >= 0; i--) {
724 // x is where we logically want to sample the grid in the i-th dimension.
725 F x = inputs[i] * (float)(grid_points[i] - 1);
726
727 // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
728 I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
729 hi = cast<I32>(minus_1_ulp(x+1.0f));
730 // Notice how we fold in the accumulated stride across previous dimensions here.
731 index[i+0] = lo * stride;
732 index[i+4] = hi * stride;
733 stride *= grid_points[i];
734
735 // We'll interpolate between those two integer grid points by t.
736 F t = x - cast<F>(lo); // i.e. fract(x)
737 weight[i+0] = 1-t;
738 weight[i+4] = t;
739 }
740
741 *r = *g = *b = F0;
742 if (output_channels == 4) {
743 *a = F0;
744 }
745
746 // We'll sample 2^dim == 1<<dim table entries per pixel,
747 // in all combinations of low and high in each dimension.
748 for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
749
750 // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
751 // where 0 selects the low index contribution and its weight 1-t,
752 // or 4 the high index contribution and its weight t.
753
754 // Since 0<dim≤4, we can always just start off with the 0-th channel,
755 // then handle the others conditionally.
756 I32 ix = index [0 + (combo&1)*4];
757 F w = weight[0 + (combo&1)*4];
758
759 switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
760 case 3: ix += index [3 + (combo&8)/2];
761 w *= weight[3 + (combo&8)/2];
762 FALLTHROUGH;
763 // fall through
764
765 case 2: ix += index [2 + (combo&4)*1];
766 w *= weight[2 + (combo&4)*1];
767 FALLTHROUGH;
768 // fall through
769
770 case 1: ix += index [1 + (combo&2)*2];
771 w *= weight[1 + (combo&2)*2];
772 }
773
774 F R,G,B,A=F0;
775 if (output_channels == 3) {
776 if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
777 else { sample_clut_16(grid_16,ix, &R,&G,&B); }
778 } else {
779 if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
780 else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
781 }
782 *r += w*R;
783 *g += w*G;
784 *b += w*B;
785 *a += w*A;
786 }
787 }
788
clut(const skcms_A2B * a2b,F * r,F * g,F * b,F a)789 static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
790 clut(a2b->input_channels, a2b->output_channels,
791 a2b->grid_points, a2b->grid_8, a2b->grid_16,
792 r,g,b,&a);
793 }
clut(const skcms_B2A * b2a,F * r,F * g,F * b,F * a)794 static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
795 clut(b2a->input_channels, b2a->output_channels,
796 b2a->grid_points, b2a->grid_8, b2a->grid_16,
797 r,g,b,a);
798 }
799
exec_ops(const Op * ops,const void ** args,const char * src,char * dst,int i)800 static void exec_ops(const Op* ops, const void** args,
801 const char* src, char* dst, int i) {
802 F r = F0, g = F0, b = F0, a = F1;
803 while (true) {
804 switch (*ops++) {
805 case Op_load_a8:{
806 a = F_from_U8(load<U8>(src + 1*i));
807 } break;
808
809 case Op_load_g8:{
810 r = g = b = F_from_U8(load<U8>(src + 1*i));
811 } break;
812
813 case Op_load_4444:{
814 U16 abgr = load<U16>(src + 2*i);
815
816 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
817 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
818 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
819 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
820 } break;
821
822 case Op_load_565:{
823 U16 rgb = load<U16>(src + 2*i);
824
825 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
826 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
827 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
828 } break;
829
830 case Op_load_888:{
831 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
832 #if defined(USING_NEON_FP16)
833 // See the explanation under USING_NEON below. This is that doubled up.
834 uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
835 v = vld3q_lane_u8(rgb+ 0, v, 0);
836 v = vld3q_lane_u8(rgb+ 3, v, 2);
837 v = vld3q_lane_u8(rgb+ 6, v, 4);
838 v = vld3q_lane_u8(rgb+ 9, v, 6);
839
840 v = vld3q_lane_u8(rgb+12, v, 8);
841 v = vld3q_lane_u8(rgb+15, v, 10);
842 v = vld3q_lane_u8(rgb+18, v, 12);
843 v = vld3q_lane_u8(rgb+21, v, 14);
844
845 r = cast<F>((U16)v.val[0]) * (1/255.0f);
846 g = cast<F>((U16)v.val[1]) * (1/255.0f);
847 b = cast<F>((U16)v.val[2]) * (1/255.0f);
848 #elif defined(USING_NEON)
849 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
850 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
851 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
852 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
853 v = vld3_lane_u8(rgb+0, v, 0);
854 v = vld3_lane_u8(rgb+3, v, 2);
855 v = vld3_lane_u8(rgb+6, v, 4);
856 v = vld3_lane_u8(rgb+9, v, 6);
857
858 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
859 // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
860 // ARMv7 and ARMv8 impls.)
861 r = cast<F>((U16)v.val[0]) * (1/255.0f);
862 g = cast<F>((U16)v.val[1]) * (1/255.0f);
863 b = cast<F>((U16)v.val[2]) * (1/255.0f);
864 #else
865 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
866 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
867 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
868 #endif
869 } break;
870
871 case Op_load_8888:{
872 U32 rgba = load<U32>(src + 4*i);
873
874 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
875 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
876 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
877 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
878 } break;
879
880 case Op_load_8888_palette8:{
881 const uint8_t* palette = (const uint8_t*) *args++;
882 I32 ix = cast<I32>(load<U8>(src + 1*i));
883 U32 rgba = gather_32(palette, ix);
884
885 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
886 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
887 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
888 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
889 } break;
890
891 case Op_load_1010102:{
892 U32 rgba = load<U32>(src + 4*i);
893
894 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
895 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
896 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
897 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
898 } break;
899
900 case Op_load_161616LE:{
901 uintptr_t ptr = (uintptr_t)(src + 6*i);
902 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
903 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
904 #if defined(USING_NEON_FP16)
905 uint16x8x3_t v = vld3q_u16(rgb);
906 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
907 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
908 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
909 #elif defined(USING_NEON)
910 uint16x4x3_t v = vld3_u16(rgb);
911 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
912 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
913 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
914 #else
915 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
916 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
917 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
918 #endif
919 } break;
920
921 case Op_load_16161616LE:{
922 uintptr_t ptr = (uintptr_t)(src + 8*i);
923 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
924 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
925 #if defined(USING_NEON_FP16)
926 uint16x8x4_t v = vld4q_u16(rgba);
927 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
928 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
929 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
930 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
931 #elif defined(USING_NEON)
932 uint16x4x4_t v = vld4_u16(rgba);
933 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
934 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
935 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
936 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
937 #else
938 U64 px = load<U64>(rgba);
939
940 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
941 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
942 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
943 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
944 #endif
945 } break;
946
947 case Op_load_161616BE:{
948 uintptr_t ptr = (uintptr_t)(src + 6*i);
949 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
950 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
951 #if defined(USING_NEON_FP16)
952 uint16x8x3_t v = vld3q_u16(rgb);
953 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
954 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
955 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
956 #elif defined(USING_NEON)
957 uint16x4x3_t v = vld3_u16(rgb);
958 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
959 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
960 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
961 #else
962 U32 R = load_3<U32>(rgb+0),
963 G = load_3<U32>(rgb+1),
964 B = load_3<U32>(rgb+2);
965 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
966 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
967 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
968 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
969 #endif
970 } break;
971
972 case Op_load_16161616BE:{
973 uintptr_t ptr = (uintptr_t)(src + 8*i);
974 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
975 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
976 #if defined(USING_NEON_FP16)
977 uint16x8x4_t v = vld4q_u16(rgba);
978 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
979 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
980 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
981 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
982 #elif defined(USING_NEON)
983 uint16x4x4_t v = vld4_u16(rgba);
984 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
985 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
986 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
987 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
988 #else
989 U64 px = swap_endian_16x4(load<U64>(rgba));
990
991 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
992 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
993 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
994 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
995 #endif
996 } break;
997
998 case Op_load_hhh:{
999 uintptr_t ptr = (uintptr_t)(src + 6*i);
1000 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1001 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1002 #if defined(USING_NEON_FP16)
1003 uint16x8x3_t v = vld3q_u16(rgb);
1004 U16 R = (U16)v.val[0],
1005 G = (U16)v.val[1],
1006 B = (U16)v.val[2];
1007 #elif defined(USING_NEON)
1008 uint16x4x3_t v = vld3_u16(rgb);
1009 U16 R = (U16)v.val[0],
1010 G = (U16)v.val[1],
1011 B = (U16)v.val[2];
1012 #else
1013 U16 R = load_3<U16>(rgb+0),
1014 G = load_3<U16>(rgb+1),
1015 B = load_3<U16>(rgb+2);
1016 #endif
1017 r = F_from_Half(R);
1018 g = F_from_Half(G);
1019 b = F_from_Half(B);
1020 } break;
1021
1022 case Op_load_hhhh:{
1023 uintptr_t ptr = (uintptr_t)(src + 8*i);
1024 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1025 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1026 #if defined(USING_NEON_FP16)
1027 uint16x8x4_t v = vld4q_u16(rgba);
1028 U16 R = (U16)v.val[0],
1029 G = (U16)v.val[1],
1030 B = (U16)v.val[2],
1031 A = (U16)v.val[3];
1032 #elif defined(USING_NEON)
1033 uint16x4x4_t v = vld4_u16(rgba);
1034 U16 R = (U16)v.val[0],
1035 G = (U16)v.val[1],
1036 B = (U16)v.val[2],
1037 A = (U16)v.val[3];
1038 #else
1039 U64 px = load<U64>(rgba);
1040 U16 R = cast<U16>((px >> 0) & 0xffff),
1041 G = cast<U16>((px >> 16) & 0xffff),
1042 B = cast<U16>((px >> 32) & 0xffff),
1043 A = cast<U16>((px >> 48) & 0xffff);
1044 #endif
1045 r = F_from_Half(R);
1046 g = F_from_Half(G);
1047 b = F_from_Half(B);
1048 a = F_from_Half(A);
1049 } break;
1050
1051 case Op_load_fff:{
1052 uintptr_t ptr = (uintptr_t)(src + 12*i);
1053 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1054 const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1055 #if defined(USING_NEON_FP16)
1056 float32x4x3_t lo = vld3q_f32(rgb + 0),
1057 hi = vld3q_f32(rgb + 12);
1058 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1059 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1060 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1061 #elif defined(USING_NEON)
1062 float32x4x3_t v = vld3q_f32(rgb);
1063 r = (F)v.val[0];
1064 g = (F)v.val[1];
1065 b = (F)v.val[2];
1066 #else
1067 r = load_3<F>(rgb+0);
1068 g = load_3<F>(rgb+1);
1069 b = load_3<F>(rgb+2);
1070 #endif
1071 } break;
1072
1073 case Op_load_ffff:{
1074 uintptr_t ptr = (uintptr_t)(src + 16*i);
1075 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1076 const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1077 #if defined(USING_NEON_FP16)
1078 float32x4x4_t lo = vld4q_f32(rgba + 0),
1079 hi = vld4q_f32(rgba + 16);
1080 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1081 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1082 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1083 a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1084 #elif defined(USING_NEON)
1085 float32x4x4_t v = vld4q_f32(rgba);
1086 r = (F)v.val[0];
1087 g = (F)v.val[1];
1088 b = (F)v.val[2];
1089 a = (F)v.val[3];
1090 #else
1091 r = load_4<F>(rgba+0);
1092 g = load_4<F>(rgba+1);
1093 b = load_4<F>(rgba+2);
1094 a = load_4<F>(rgba+3);
1095 #endif
1096 } break;
1097
1098 case Op_swap_rb:{
1099 F t = r;
1100 r = b;
1101 b = t;
1102 } break;
1103
1104 case Op_clamp:{
1105 r = max_(F0, min_(r, F1));
1106 g = max_(F0, min_(g, F1));
1107 b = max_(F0, min_(b, F1));
1108 a = max_(F0, min_(a, F1));
1109 } break;
1110
1111 case Op_invert:{
1112 r = F1 - r;
1113 g = F1 - g;
1114 b = F1 - b;
1115 a = F1 - a;
1116 } break;
1117
1118 case Op_force_opaque:{
1119 a = F1;
1120 } break;
1121
1122 case Op_premul:{
1123 r *= a;
1124 g *= a;
1125 b *= a;
1126 } break;
1127
1128 case Op_unpremul:{
1129 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1130 r *= scale;
1131 g *= scale;
1132 b *= scale;
1133 } break;
1134
1135 case Op_matrix_3x3:{
1136 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1137 const float* m = &matrix->vals[0][0];
1138
1139 F R = m[0]*r + m[1]*g + m[2]*b,
1140 G = m[3]*r + m[4]*g + m[5]*b,
1141 B = m[6]*r + m[7]*g + m[8]*b;
1142
1143 r = R;
1144 g = G;
1145 b = B;
1146 } break;
1147
1148 case Op_matrix_3x4:{
1149 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1150 const float* m = &matrix->vals[0][0];
1151
1152 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1153 G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1154 B = m[8]*r + m[9]*g + m[10]*b + m[11];
1155
1156 r = R;
1157 g = G;
1158 b = B;
1159 } break;
1160
1161 case Op_lab_to_xyz:{
1162 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1163 F L = r * 100.0f,
1164 A = g * 255.0f - 128.0f,
1165 B = b * 255.0f - 128.0f;
1166
1167 // Convert to CIE XYZ.
1168 F Y = (L + 16.0f) * (1/116.0f),
1169 X = Y + A*(1/500.0f),
1170 Z = Y - B*(1/200.0f);
1171
1172 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1173 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1174 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1175
1176 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1177 r = X * 0.9642f;
1178 g = Y ;
1179 b = Z * 0.8249f;
1180 } break;
1181
1182 // As above, in reverse.
1183 case Op_xyz_to_lab:{
1184 F X = r * (1/0.9642f),
1185 Y = g,
1186 Z = b * (1/0.8249f);
1187
1188 X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1189 Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1190 Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1191
1192 F L = Y*116.0f - 16.0f,
1193 A = (X-Y)*500.0f,
1194 B = (Y-Z)*200.0f;
1195
1196 r = L * (1/100.f);
1197 g = (A + 128.0f) * (1/255.0f);
1198 b = (B + 128.0f) * (1/255.0f);
1199 } break;
1200
1201 case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1202 case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1203 case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1204 case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1205
1206 case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1207 case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1208 case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1209 case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1210
1211 case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1212 case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1213 case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1214 case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1215
1216 case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1217 case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1218 case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1219 case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1220
1221 case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1222 case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1223 case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1224 case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1225
1226 case Op_clut_A2B: {
1227 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1228 clut(a2b, &r,&g,&b,a);
1229
1230 if (a2b->input_channels == 4) {
1231 // CMYK is opaque.
1232 a = F1;
1233 }
1234 } break;
1235
1236 case Op_clut_B2A: {
1237 const skcms_B2A* b2a = (const skcms_B2A*) *args++;
1238 clut(b2a, &r,&g,&b,&a);
1239 } break;
1240
1241 // Notice, from here on down the store_ ops all return, ending the loop.
1242
1243 case Op_store_a8: {
1244 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1245 } return;
1246
1247 case Op_store_g8: {
1248 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1249 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1250 } return;
1251
1252 case Op_store_4444: {
1253 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1254 | cast<U16>(to_fixed(g * 15) << 8)
1255 | cast<U16>(to_fixed(b * 15) << 4)
1256 | cast<U16>(to_fixed(a * 15) << 0));
1257 } return;
1258
1259 case Op_store_565: {
1260 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1261 | cast<U16>(to_fixed(g * 63) << 5 )
1262 | cast<U16>(to_fixed(b * 31) << 11 ));
1263 } return;
1264
1265 case Op_store_888: {
1266 uint8_t* rgb = (uint8_t*)dst + 3*i;
1267 #if defined(USING_NEON_FP16)
1268 // See the explanation under USING_NEON below. This is that doubled up.
1269 U16 R = to_fixed(r * 255),
1270 G = to_fixed(g * 255),
1271 B = to_fixed(b * 255);
1272
1273 uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1274 vst3q_lane_u8(rgb+ 0, v, 0);
1275 vst3q_lane_u8(rgb+ 3, v, 2);
1276 vst3q_lane_u8(rgb+ 6, v, 4);
1277 vst3q_lane_u8(rgb+ 9, v, 6);
1278
1279 vst3q_lane_u8(rgb+12, v, 8);
1280 vst3q_lane_u8(rgb+15, v, 10);
1281 vst3q_lane_u8(rgb+18, v, 12);
1282 vst3q_lane_u8(rgb+21, v, 14);
1283 #elif defined(USING_NEON)
1284 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1285 // get there via U16 to save some instructions converting to float. And just
1286 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1287 U16 R = cast<U16>(to_fixed(r * 255)),
1288 G = cast<U16>(to_fixed(g * 255)),
1289 B = cast<U16>(to_fixed(b * 255));
1290
1291 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1292 vst3_lane_u8(rgb+0, v, 0);
1293 vst3_lane_u8(rgb+3, v, 2);
1294 vst3_lane_u8(rgb+6, v, 4);
1295 vst3_lane_u8(rgb+9, v, 6);
1296 #else
1297 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1298 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1299 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1300 #endif
1301 } return;
1302
1303 case Op_store_8888: {
1304 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1305 | cast<U32>(to_fixed(g * 255)) << 8
1306 | cast<U32>(to_fixed(b * 255)) << 16
1307 | cast<U32>(to_fixed(a * 255)) << 24);
1308 } return;
1309
1310 case Op_store_1010102: {
1311 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1312 | cast<U32>(to_fixed(g * 1023)) << 10
1313 | cast<U32>(to_fixed(b * 1023)) << 20
1314 | cast<U32>(to_fixed(a * 3)) << 30);
1315 } return;
1316
1317 case Op_store_161616LE: {
1318 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1319 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1320 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1321 #if defined(USING_NEON_FP16)
1322 uint16x8x3_t v = {{
1323 (uint16x8_t)U16_from_F(r),
1324 (uint16x8_t)U16_from_F(g),
1325 (uint16x8_t)U16_from_F(b),
1326 }};
1327 vst3q_u16(rgb, v);
1328 #elif defined(USING_NEON)
1329 uint16x4x3_t v = {{
1330 (uint16x4_t)U16_from_F(r),
1331 (uint16x4_t)U16_from_F(g),
1332 (uint16x4_t)U16_from_F(b),
1333 }};
1334 vst3_u16(rgb, v);
1335 #else
1336 store_3(rgb+0, U16_from_F(r));
1337 store_3(rgb+1, U16_from_F(g));
1338 store_3(rgb+2, U16_from_F(b));
1339 #endif
1340
1341 } return;
1342
1343 case Op_store_16161616LE: {
1344 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1345 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1346 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1347 #if defined(USING_NEON_FP16)
1348 uint16x8x4_t v = {{
1349 (uint16x8_t)U16_from_F(r),
1350 (uint16x8_t)U16_from_F(g),
1351 (uint16x8_t)U16_from_F(b),
1352 (uint16x8_t)U16_from_F(a),
1353 }};
1354 vst4q_u16(rgba, v);
1355 #elif defined(USING_NEON)
1356 uint16x4x4_t v = {{
1357 (uint16x4_t)U16_from_F(r),
1358 (uint16x4_t)U16_from_F(g),
1359 (uint16x4_t)U16_from_F(b),
1360 (uint16x4_t)U16_from_F(a),
1361 }};
1362 vst4_u16(rgba, v);
1363 #else
1364 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1365 | cast<U64>(to_fixed(g * 65535)) << 16
1366 | cast<U64>(to_fixed(b * 65535)) << 32
1367 | cast<U64>(to_fixed(a * 65535)) << 48;
1368 store(rgba, px);
1369 #endif
1370 } return;
1371
1372 case Op_store_161616BE: {
1373 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1374 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1375 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1376 #if defined(USING_NEON_FP16)
1377 uint16x8x3_t v = {{
1378 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1379 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1380 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1381 }};
1382 vst3q_u16(rgb, v);
1383 #elif defined(USING_NEON)
1384 uint16x4x3_t v = {{
1385 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1386 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1387 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1388 }};
1389 vst3_u16(rgb, v);
1390 #else
1391 U32 R = to_fixed(r * 65535),
1392 G = to_fixed(g * 65535),
1393 B = to_fixed(b * 65535);
1394 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1395 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1396 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1397 #endif
1398
1399 } return;
1400
1401 case Op_store_16161616BE: {
1402 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1403 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1404 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1405 #if defined(USING_NEON_FP16)
1406 uint16x8x4_t v = {{
1407 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1408 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1409 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1410 (uint16x8_t)swap_endian_16(U16_from_F(a)),
1411 }};
1412 vst4q_u16(rgba, v);
1413 #elif defined(USING_NEON)
1414 uint16x4x4_t v = {{
1415 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1416 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1417 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1418 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1419 }};
1420 vst4_u16(rgba, v);
1421 #else
1422 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1423 | cast<U64>(to_fixed(g * 65535)) << 16
1424 | cast<U64>(to_fixed(b * 65535)) << 32
1425 | cast<U64>(to_fixed(a * 65535)) << 48;
1426 store(rgba, swap_endian_16x4(px));
1427 #endif
1428 } return;
1429
1430 case Op_store_hhh: {
1431 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1432 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1433 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1434
1435 U16 R = Half_from_F(r),
1436 G = Half_from_F(g),
1437 B = Half_from_F(b);
1438 #if defined(USING_NEON_FP16)
1439 uint16x8x3_t v = {{
1440 (uint16x8_t)R,
1441 (uint16x8_t)G,
1442 (uint16x8_t)B,
1443 }};
1444 vst3q_u16(rgb, v);
1445 #elif defined(USING_NEON)
1446 uint16x4x3_t v = {{
1447 (uint16x4_t)R,
1448 (uint16x4_t)G,
1449 (uint16x4_t)B,
1450 }};
1451 vst3_u16(rgb, v);
1452 #else
1453 store_3(rgb+0, R);
1454 store_3(rgb+1, G);
1455 store_3(rgb+2, B);
1456 #endif
1457 } return;
1458
1459 case Op_store_hhhh: {
1460 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1461 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1462 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1463
1464 U16 R = Half_from_F(r),
1465 G = Half_from_F(g),
1466 B = Half_from_F(b),
1467 A = Half_from_F(a);
1468 #if defined(USING_NEON_FP16)
1469 uint16x8x4_t v = {{
1470 (uint16x8_t)R,
1471 (uint16x8_t)G,
1472 (uint16x8_t)B,
1473 (uint16x8_t)A,
1474 }};
1475 vst4q_u16(rgba, v);
1476 #elif defined(USING_NEON)
1477 uint16x4x4_t v = {{
1478 (uint16x4_t)R,
1479 (uint16x4_t)G,
1480 (uint16x4_t)B,
1481 (uint16x4_t)A,
1482 }};
1483 vst4_u16(rgba, v);
1484 #else
1485 store(rgba, cast<U64>(R) << 0
1486 | cast<U64>(G) << 16
1487 | cast<U64>(B) << 32
1488 | cast<U64>(A) << 48);
1489 #endif
1490
1491 } return;
1492
1493 case Op_store_fff: {
1494 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1495 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1496 float* rgb = (float*)ptr; // for this cast to float* to be safe.
1497 #if defined(USING_NEON_FP16)
1498 float32x4x3_t lo = {{
1499 vcvt_f32_f16(vget_low_f16(r)),
1500 vcvt_f32_f16(vget_low_f16(g)),
1501 vcvt_f32_f16(vget_low_f16(b)),
1502 }}, hi = {{
1503 vcvt_f32_f16(vget_high_f16(r)),
1504 vcvt_f32_f16(vget_high_f16(g)),
1505 vcvt_f32_f16(vget_high_f16(b)),
1506 }};
1507 vst3q_f32(rgb + 0, lo);
1508 vst3q_f32(rgb + 12, hi);
1509 #elif defined(USING_NEON)
1510 float32x4x3_t v = {{
1511 (float32x4_t)r,
1512 (float32x4_t)g,
1513 (float32x4_t)b,
1514 }};
1515 vst3q_f32(rgb, v);
1516 #else
1517 store_3(rgb+0, r);
1518 store_3(rgb+1, g);
1519 store_3(rgb+2, b);
1520 #endif
1521 } return;
1522
1523 case Op_store_ffff: {
1524 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1525 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1526 float* rgba = (float*)ptr; // for this cast to float* to be safe.
1527 #if defined(USING_NEON_FP16)
1528 float32x4x4_t lo = {{
1529 vcvt_f32_f16(vget_low_f16(r)),
1530 vcvt_f32_f16(vget_low_f16(g)),
1531 vcvt_f32_f16(vget_low_f16(b)),
1532 vcvt_f32_f16(vget_low_f16(a)),
1533 }}, hi = {{
1534 vcvt_f32_f16(vget_high_f16(r)),
1535 vcvt_f32_f16(vget_high_f16(g)),
1536 vcvt_f32_f16(vget_high_f16(b)),
1537 vcvt_f32_f16(vget_high_f16(a)),
1538 }};
1539 vst4q_f32(rgba + 0, lo);
1540 vst4q_f32(rgba + 16, hi);
1541 #elif defined(USING_NEON)
1542 float32x4x4_t v = {{
1543 (float32x4_t)r,
1544 (float32x4_t)g,
1545 (float32x4_t)b,
1546 (float32x4_t)a,
1547 }};
1548 vst4q_f32(rgba, v);
1549 #else
1550 store_4(rgba+0, r);
1551 store_4(rgba+1, g);
1552 store_4(rgba+2, b);
1553 store_4(rgba+3, a);
1554 #endif
1555 } return;
1556 }
1557 }
1558 }
1559
1560
run_program(const Op * program,const void ** arguments,const char * src,char * dst,int n,const size_t src_bpp,const size_t dst_bpp)1561 static void run_program(const Op* program, const void** arguments,
1562 const char* src, char* dst, int n,
1563 const size_t src_bpp, const size_t dst_bpp) {
1564 int i = 0;
1565 while (n >= N) {
1566 exec_ops(program, arguments, src, dst, i);
1567 i += N;
1568 n -= N;
1569 }
1570 if (n > 0) {
1571 char tmp[4*4*N] = {0};
1572
1573 memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1574 exec_ops(program, arguments, tmp, tmp, 0);
1575 memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1576 }
1577 }
1578
1579 // Clean up any #defines we may have set so that we can be #included again.
1580 #if defined(USING_AVX)
1581 #undef USING_AVX
1582 #endif
1583 #if defined(USING_AVX_F16C)
1584 #undef USING_AVX_F16C
1585 #endif
1586 #if defined(USING_AVX2)
1587 #undef USING_AVX2
1588 #endif
1589 #if defined(USING_AVX512F)
1590 #undef USING_AVX512F
1591 #endif
1592
1593 #if defined(USING_NEON)
1594 #undef USING_NEON
1595 #endif
1596 #if defined(USING_NEON_F16C)
1597 #undef USING_NEON_F16C
1598 #endif
1599 #if defined(USING_NEON_FP16)
1600 #undef USING_NEON_FP16
1601 #endif
1602
1603 #undef FALLTHROUGH
1604