1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 // Intentionally NO #pragma once... included multiple times.
9
10 // This file is included from skcms.cc with some pre-defined macros:
11 // N: depth of all vectors, 1,4,8, or 16
12 // and inside a namespace, with some types already defined:
13 // F: a vector of N float
14 // I32: a vector of N int32_t
15 // U64: a vector of N uint64_t
16 // U32: a vector of N uint32_t
17 // U16: a vector of N uint16_t
18 // U8: a vector of N uint8_t
19
20 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
21 // TODO(mtklein): this build supports FP16 compute
22 #endif
23
24 #if defined(__GNUC__) && !defined(__clang__)
25 // Once again, GCC is kind of weird, not allowing vector = scalar directly.
26 static constexpr F F0 = F() + 0.0f,
27 F1 = F() + 1.0f;
28 #else
29 static constexpr F F0 = 0.0f,
30 F1 = 1.0f;
31 #endif
32
33 // Instead of checking __AVX__ below, we'll check USING_AVX.
34 // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
35 // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
36
37 #if !defined(USING_AVX) && N == 8 && defined(__AVX__)
38 #define USING_AVX
39 #endif
40 #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
    #define USING_AVX_F16C
42 #endif
43 #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
44 #define USING_AVX2
45 #endif
46
47 // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
48 // This is more for organizational clarity... skcms.cc doesn't force these.
49 #if N == 4 && defined(__ARM_NEON)
50 #define USING_NEON
51 #if __ARM_FP & 2
52 #define USING_NEON_F16C
53 #endif
54 #endif
55
56 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
57 // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
58 #if defined(USING_NEON) && defined(__clang__)
59 #pragma clang diagnostic ignored "-Wvector-conversion"
60 #endif
61
62 // GCC warns us about returning U64 on x86 because it's larger than a register.
63 // You'd see warnings like, "using AVX even though AVX is not enabled".
64 // We stifle these warnings... our helpers that return U64 are always inlined.
65 #if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__)
66 #pragma GCC diagnostic ignored "-Wpsabi"
67 #endif
68
69 #if defined(__clang__)
70 #define FALLTHROUGH [[clang::fallthrough]]
71 #else
72 #define FALLTHROUGH
73 #endif
74
75 // We tag most helper functions as SI, to enforce good code generation
76 // but also work around what we think is a bug in GCC: when targeting 32-bit
77 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
78 // MMX mm0 register, which seems to mess with unrelated code that later uses
79 // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
80 //
81 // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
82 #if defined(__clang__) || defined(__GNUC__)
83 #define SI static inline __attribute__((always_inline))
84 #else
85 #define SI static inline
86 #endif
87
88 template <typename T, typename P>
SI T load(const P* ptr) {
90 T val;
91 small_memcpy(&val, ptr, sizeof(val));
92 return val;
93 }
94 template <typename T, typename P>
SI void store(P* ptr, const T& val) {
96 small_memcpy(ptr, &val, sizeof(val));
97 }
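
// [Editor's note] A minimal scalar sketch of why load()/store() go through memcpy rather than a
// pointer cast: copying bytes is well-defined for any alignment and any pair of types, which a
// cast like *(const uint32_t*)&f is not.  Not part of skcms; assumes <string.h>/<stdint.h> are
// already included by skcms.cc, and example_bits_of_float is a hypothetical name.
static inline uint32_t example_bits_of_float(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));   // e.g. example_bits_of_float(1.0f) == 0x3f800000
    return bits;
}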
98
99 // (T)v is a cast when N == 1 and a bit-pun when N>1,
100 // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
101 template <typename D, typename S>
SI D cast(const S& v) {
103 #if N == 1
104 return (D)v;
105 #elif defined(__clang__)
106 return __builtin_convertvector(v, D);
107 #elif N == 4
108 return D{v[0],v[1],v[2],v[3]};
109 #elif N == 8
110 return D{v[0],v[1],v[2],v[3], v[4],v[5],v[6],v[7]};
111 #elif N == 16
112 return D{v[0],v[1],v[ 2],v[ 3], v[ 4],v[ 5],v[ 6],v[ 7],
113 v[8],v[9],v[10],v[11], v[12],v[13],v[14],v[15]};
114 #endif
115 }
116
117 template <typename D, typename S>
SI D bit_pun(const S& v) {
119 static_assert(sizeof(D) == sizeof(v), "");
120 return load<D>(&v);
121 }
122
123 // When we convert from float to fixed point, it's very common to want to round,
124 // and for some reason compilers generate better code when converting to int32_t.
125 // To serve both those ends, we use this function to_fixed() instead of direct cast().
SI I32 to_fixed(F f) { return cast<I32>(f + 0.5f); }
127
128 template <typename T>
SI T if_then_else(I32 cond, T t, T e) {
130 #if N == 1
131 return cond ? t : e;
132 #else
133 return bit_pun<T>( ( cond & bit_pun<I32>(t)) |
134 (~cond & bit_pun<I32>(e)) );
135 #endif
136 }
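
// [Editor's note] A scalar sketch of the branch-free select above, not part of skcms.  A vector
// comparison leaves each I32 lane either all ones or all zeros, so ANDing with the mask and its
// complement picks t or e bit-by-bit; here one uint32_t stands in for a single lane.
static inline uint32_t example_select(uint32_t mask, uint32_t t, uint32_t e) {
    return (mask & t) | (~mask & e);   // mask == 0xffffffff -> t,  mask == 0 -> e
}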
137
SI F F_from_Half(U16 half) {
139 #if defined(USING_NEON_F16C)
140 return vcvt_f32_f16((float16x4_t)half);
141 #elif defined(__AVX512F__)
142 return (F)_mm512_cvtph_ps((__m256i)half);
143 #elif defined(USING_AVX_F16C)
144 typedef int16_t __attribute__((vector_size(16))) I16;
145 return __builtin_ia32_vcvtph2ps256((I16)half);
146 #else
147 U32 wide = cast<U32>(half);
148 // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
149 U32 s = wide & 0x8000,
150 em = wide ^ s;
151
152 // Constructing the float is easy if the half is not denormalized.
153 F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
154
155 // Simply flush all denorm half floats to zero.
156 return if_then_else(em < 0x0400, F0, norm);
157 #endif
158 }
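
// [Editor's note] A scalar sketch of the portable half->float path above, not part of skcms.
// A half is 1-5-10 with exponent bias 15; widening the 10-bit mantissa to 23 bits and adding
// (127-15) to the exponent produces the equivalent float, with denormal halves flushed to zero
// just as above.
static inline float example_half_to_float(uint16_t h) {
    uint32_t s  = (uint32_t)h & 0x8000,
             em = (uint32_t)h ^ s;
    if (em < 0x0400) {
        return 0.0f;                                // denormal (or zero) half
    }
    uint32_t bits = (s << 16) + (em << 13) + ((uint32_t)(127 - 15) << 23);
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;                                       // e.g. 0x3c00 (half 1.0) -> 1.0f
}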
159
160 #if defined(__clang__)
161 // The -((127-15)<<10) underflows that side of the math when
162 // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
163 __attribute__((no_sanitize("unsigned-integer-overflow")))
164 #endif
SI U16 Half_from_F(F f) {
166 #if defined(USING_NEON_F16C)
167 return (U16)vcvt_f16_f32(f);
168 #elif defined(__AVX512F__)
169 return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
170 #elif defined(USING_AVX_F16C)
171 return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
172 #else
173 // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
174 U32 sem = bit_pun<U32>(f),
175 s = sem & 0x80000000,
176 em = sem ^ s;
177
178 // For simplicity we flush denorm half floats (including all denorm floats) to zero.
179 return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
180 , (s>>16) + (em>>13) - ((127-15)<<10)));
181 #endif
182 }
183
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16_u8((uint8x8_t) v);
    }
#endif
190
SI U64 swap_endian_16x4(const U64& rgba) {
    return (rgba & 0x00ff00ff00ff00ff) << 8
         | (rgba & 0xff00ff00ff00ff00) >> 8;
}
195
#if defined(USING_NEON)
    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#else
    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
#endif
203
SI F floor_(F x) {
205 #if N == 1
206 return floorf_(x);
207 #elif defined(__aarch64__)
208 return vrndmq_f32(x);
209 #elif defined(__AVX512F__)
210 return _mm512_floor_ps(x);
211 #elif defined(USING_AVX)
212 return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
213 #elif defined(__SSE4_1__)
214 return _mm_floor_ps(x);
215 #else
216 // Round trip through integers with a truncating cast.
217 F roundtrip = cast<F>(cast<I32>(x));
218 // If x is negative, truncating gives the ceiling instead of the floor.
219 return roundtrip - if_then_else(roundtrip > x, F1, F0);
220
221 // This implementation fails for values of x that are outside
222 // the range an integer can represent. We expect most x to be small.
223 #endif
224 }
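
// [Editor's note] A worked scalar example of the truncate-then-fix-up floor above, not part of
// skcms.  Truncation rounds toward zero, so for negative non-integers it lands one above the
// floor and the comparison subtracts that 1 back off.
static inline float example_floor(float x) {
    float roundtrip = (float)(int32_t)x;                  // x = -1.25f  ->  roundtrip = -1.0f
    return roundtrip - (roundtrip > x ? 1.0f : 0.0f);     // -1.0f > -1.25f, so return -2.0f
}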
225
SI F approx_log2(F x) {
227 // The first approximation of log2(x) is its exponent 'e', minus 127.
228 I32 bits = bit_pun<I32>(x);
229
230 F e = cast<F>(bits) * (1.0f / (1<<23));
231
    // If we use the mantissa too we can refine the error significantly.
233 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
234
235 return e - 124.225514990f
236 - 1.498030302f*m
237 - 1.725879990f/(0.3520887068f + m);
238 }
239
SI F approx_exp2(F x) {
241 F fract = x - floor_(x);
242
243 I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
244 - 1.490129070f*fract
245 + 27.728023300f/(4.84252568f - fract)));
246 return bit_pun<F>(bits);
247 }
248
SI F approx_pow(F x, float y) {
250 return if_then_else((x == F0) | (x == F1), x
251 , approx_exp2(approx_log2(x) * y));
252 }
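
// [Editor's note] A scalar sketch of the bit trick behind approx_log2(), not part of skcms.
// Read as an integer, a float's bits are (exponent+127)*2^23 + mantissa, so dividing by 2^23
// already gives a coarse log2 (offset by the bias); the rational term in m, the mantissa remapped
// into [0.5, 1.0), trims the error.  approx_exp2() runs the same idea backwards, and approx_pow()
// is just exp2(y * log2(x)) with x == 0 and x == 1 passed through exactly.
static inline float example_approx_log2(float x) {
    uint32_t bits;  memcpy(&bits, &x, sizeof(bits));
    float e = (float)bits * (1.0f / (1 << 23));

    uint32_t mbits = (bits & 0x007fffff) | 0x3f000000;
    float m;        memcpy(&m, &mbits, sizeof(m));

    return e - 124.225514990f                       // e.g. example_approx_log2(4.0f) ~= 2.0f
             -   1.498030302f*m
             -   1.725879990f/(0.3520887068f + m);
}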
253
254 // Return tf(x).
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
256 // Peel off the sign bit and set x = |x|.
257 U32 bits = bit_pun<U32>(x),
258 sign = bits & 0x80000000;
259 x = bit_pun<F>(bits ^ sign);
260
261 // The transfer function has a linear part up to d, exponential at d and after.
262 F v = if_then_else(x < tf->d, tf->c*x + tf->f
263 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
264
265 // Tack the sign bit back on.
266 return bit_pun<F>(sign | bit_pun<U32>(v));
267 }
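
// [Editor's note] A scalar sketch of the parametric curve above, evaluated with sRGB's published
// coefficients as an example; not part of skcms, it skips the sign-mirroring done above and
// assumes <math.h> is available for powf.
static inline float example_srgb_to_linear(float x) {
    // sRGB as a skcms_TransferFunction: g=2.4, a=1/1.055, b=0.055/1.055, c=1/12.92, d=0.04045, e=f=0.
    const float g = 2.4f,     a = 1/1.055f, b = 0.055f/1.055f,
                c = 1/12.92f, d = 0.04045f, e = 0.0f,          f = 0.0f;
    return x < d ? c*x + f
                 : powf(a*x + b, g) + e;            // e.g. x = 1.0f  ->  ~1.0f
}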
268
269
270 // Strided loads and stores of N values, starting from p.
271 template <typename T, typename P>
SI T load_3(const P* p) {
273 #if N == 1
274 return (T)p[0];
275 #elif N == 4
276 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
277 #elif N == 8
278 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
279 #elif N == 16
280 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
281 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
282 #endif
283 }
284
285 template <typename T, typename P>
SI T load_4(const P* p) {
287 #if N == 1
288 return (T)p[0];
289 #elif N == 4
290 return T{p[ 0],p[ 4],p[ 8],p[12]};
291 #elif N == 8
292 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
293 #elif N == 16
294 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
295 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
296 #endif
297 }
298
299 template <typename T, typename P>
SI void store_3(P* p, const T& v) {
301 #if N == 1
302 p[0] = v;
303 #elif N == 4
304 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
305 #elif N == 8
306 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
307 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
308 #elif N == 16
309 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
310 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
311 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
312 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
313 #endif
314 }
315
316 template <typename T, typename P>
SI void store_4(P* p, const T& v) {
318 #if N == 1
319 p[0] = v;
320 #elif N == 4
321 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
322 #elif N == 8
323 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
324 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
325 #elif N == 16
326 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
327 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
328 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
329 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
330 #endif
331 }
332
333
SI U8 gather_8(const uint8_t* p, I32 ix) {
335 #if N == 1
336 U8 v = p[ix];
337 #elif N == 4
338 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
339 #elif N == 8
340 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
341 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
342 #elif N == 16
343 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
344 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
345 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
346 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
347 #endif
348 return v;
349 }
350
SI U16 gather_16(const uint8_t* p, I32 ix) {
352 // Load the i'th 16-bit value from p.
353 auto load_16 = [p](int i) {
354 return load<uint16_t>(p + 2*i);
355 };
356 #if N == 1
357 U16 v = load_16(ix);
358 #elif N == 4
359 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
360 #elif N == 8
361 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
362 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
363 #elif N == 16
364 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
365 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
366 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
367 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
368 #endif
369 return v;
370 }
371
SI U32 gather_32(const uint8_t* p, I32 ix) {
373 // Load the i'th 32-bit value from p.
374 auto load_32 = [p](int i) {
375 return load<uint32_t>(p + 4*i);
376 };
377 #if N == 1
378 U32 v = load_32(ix);
379 #elif N == 4
380 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
381 #elif N == 8
382 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
383 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
384 #elif N == 16
385 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
386 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
387 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
388 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
389 #endif
390 // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
391 return v;
392 }
393
SI U32 gather_24(const uint8_t* p, I32 ix) {
395 // First, back up a byte. Any place we're gathering from has a safe junk byte to read
396 // in front of it, either a previous table value, or some tag metadata.
397 p -= 1;
398
399 // Load the i'th 24-bit value from p, and 1 extra byte.
400 auto load_24_32 = [p](int i) {
401 return load<uint32_t>(p + 3*i);
402 };
403
404 // Now load multiples of 4 bytes (a junk byte, then r,g,b).
405 #if N == 1
406 U32 v = load_24_32(ix);
407 #elif N == 4
408 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
409 #elif N == 8 && !defined(USING_AVX2)
410 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
411 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
412 #elif N == 8
413 (void)load_24_32;
414 // The gather instruction here doesn't need any particular alignment,
415 // but the intrinsic takes a const int*.
416 const int* p4 = bit_pun<const int*>(p);
417 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
418 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
419 #if defined(__clang__)
420 U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
421 #elif defined(__GNUC__)
422 U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
423 #endif
424 #elif N == 16
425 (void)load_24_32;
426 // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
427 // And AVX-512 swapped the order of arguments. :/
428 const int* p4 = bit_pun<const int*>(p);
429 U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
430 #endif
431
432 // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
433 return v >> 8;
434 }
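
// [Editor's note] A scalar sketch of the junk-byte trick above for one index, not part of skcms
// and assuming a little-endian machine.  One 4-byte load starting a byte early reads
// [junk, r, g, b]; the shift drops the junk, leaving r in the low byte exactly as
// sample_clut_8() below expects.
static inline uint32_t example_gather_24_one(const uint8_t* table, int i) {
    uint32_t four;
    memcpy(&four, table + 3*i - 1, sizeof(four));   // needs one safe-to-read byte before entry 0
    return four >> 8;
}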
435
436 #if !defined(__arm__)
SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
438 // As in gather_24(), with everything doubled.
439 p -= 2;
440
441 // Load the i'th 48-bit value from p, and 2 extra bytes.
442 auto load_48_64 = [p](int i) {
443 return load<uint64_t>(p + 6*i);
444 };
445
446 #if N == 1
447 *v = load_48_64(ix);
448 #elif N == 4
449 *v = U64{
450 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
451 };
452 #elif N == 8 && !defined(USING_AVX2)
453 *v = U64{
454 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
455 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
456 };
457 #elif N == 8
458 (void)load_48_64;
459 typedef int32_t __attribute__((vector_size(16))) Half_I32;
460 typedef long long __attribute__((vector_size(32))) Half_I64;
461
462 // The gather instruction here doesn't need any particular alignment,
463 // but the intrinsic takes a const long long*.
464 const long long int* p8 = bit_pun<const long long int*>(p);
465
466 Half_I64 zero = { 0, 0, 0, 0},
467 mask = {-1,-1,-1,-1};
468
469 ix *= 6;
470 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
471 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
472
473 #if defined(__clang__)
474 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
475 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
476 #elif defined(__GNUC__)
477 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
478 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
479 #endif
480 store((char*)v + 0, lo);
481 store((char*)v + 32, hi);
482 #elif N == 16
483 (void)load_48_64;
484 const long long int* p8 = bit_pun<const long long int*>(p);
485 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
486 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
487 store((char*)v + 0, lo);
488 store((char*)v + 64, hi);
489 #endif
490
491 *v >>= 16;
492 }
493 #endif
494
SI F F_from_U8(U8 v) {
    return cast<F>(v) * (1/255.0f);
}
498
SI F F_from_U16_BE(U16 v) {
    // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
    // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
    v = (U16)( ((v<<8)|(v>>8)) & 0xffff );
    return cast<F>(v) * (1/65535.0f);
}
505
SI F minus_1_ulp(F v) {
    return bit_pun<F>( bit_pun<I32>(v) - 1 );
}
509
SI F table(const skcms_Curve* curve, F v) {
511 // Clamp the input to [0,1], then scale to a table index.
512 F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
513
514 // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
515 I32 lo = cast<I32>( ix ),
516 hi = cast<I32>(minus_1_ulp(ix+1.0f));
517 F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
518
519 // TODO: can we load l and h simultaneously? Each entry in 'h' is either
    // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
521 // to read adjacent entries and perhaps underflow the table by a byte or two
522 // (it'd be junk, but always safe to read). Not sure how to lerp yet.
523 F l,h;
524 if (curve->table_8) {
525 l = F_from_U8(gather_8(curve->table_8, lo));
526 h = F_from_U8(gather_8(curve->table_8, hi));
527 } else {
528 l = F_from_U16_BE(gather_16(curve->table_16, lo));
529 h = F_from_U16_BE(gather_16(curve->table_16, hi));
530 }
531 return l + (h-l)*t;
532 }
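
// [Editor's note] A scalar sketch of the lookup-and-lerp above using a tiny 8-bit table, not part
// of skcms.  For brevity it clamps hi directly instead of using the 1-ulp nudge; when ix is
// exactly integral, t is 0, so hi's value doesn't matter anyway.
static inline float example_table_8(const uint8_t* table, int n, float v) {
    v = v < 0 ? 0 : v > 1 ? 1 : v;
    float ix = v * (float)(n - 1);
    int   lo = (int)ix,
          hi = lo + 1 < n ? lo + 1 : n - 1;
    float t  = ix - (float)lo;
    float l  = table[lo] * (1/255.0f),
          h  = table[hi] * (1/255.0f);
    return l + (h - l)*t;            // e.g. table {0,128,255}, v = 0.25  ->  ~0.25
}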
533
SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
535 U32 rgb = gather_24(a2b->grid_8, ix);
536
537 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
538 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
539 *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
540 }
541
SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
543 #if defined(__arm__)
544 // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
545 *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
546 *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
547 *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
548 #else
549 // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
550 U64 rgb;
551 gather_48(a2b->grid_16, ix, &rgb);
552 rgb = swap_endian_16x4(rgb);
553
554 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
555 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
556 *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
557 #endif
558 }
559
560 // GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3)
561 // when targeting MIPS 64, I think attempting to inline clut() into exec_ops().
562 #if 1 && defined(__GNUC__) && !defined(__clang__) && defined(__mips64)
563 #define MAYBE_NOINLINE __attribute__((noinline))
564 #else
565 #define MAYBE_NOINLINE
566 #endif
567
568 MAYBE_NOINLINE
static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
570 const int dim = (int)a2b->input_channels;
571 assert (0 < dim && dim <= 4);
572
573 // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
574 I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
575 F weight[8]; // Weight for each contribution, again first low, then high.
576
577 // O(dim) work first: calculate index,weight from r,g,b,a.
578 const F inputs[] = { *r,*g,*b,a };
579 for (int i = dim-1, stride = 1; i >= 0; i--) {
580 // x is where we logically want to sample the grid in the i-th dimension.
581 F x = inputs[i] * (float)(a2b->grid_points[i] - 1);
582
583 // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
584 I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
585 hi = cast<I32>(minus_1_ulp(x+1.0f));
586 // Notice how we fold in the accumulated stride across previous dimensions here.
587 index[i+0] = lo * stride;
588 index[i+4] = hi * stride;
589 stride *= a2b->grid_points[i];
590
591 // We'll interpolate between those two integer grid points by t.
592 F t = x - cast<F>(lo); // i.e. fract(x)
593 weight[i+0] = 1-t;
594 weight[i+4] = t;
595 }
596
597 *r = *g = *b = F0;
598
599 // We'll sample 2^dim == 1<<dim table entries per pixel,
600 // in all combinations of low and high in each dimension.
601 for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
602
603 // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
604 // where 0 selects the low index contribution and its weight 1-t,
605 // or 4 the high index contribution and its weight t.
606
607 // Since 0<dim≤4, we can always just start off with the 0-th channel,
608 // then handle the others conditionally.
609 I32 ix = index [0 + (combo&1)*4];
610 F w = weight[0 + (combo&1)*4];
611
612 switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
613 case 3: ix += index [3 + (combo&8)/2];
614 w *= weight[3 + (combo&8)/2];
615 FALLTHROUGH;
616 // fall through
617
618 case 2: ix += index [2 + (combo&4)*1];
619 w *= weight[2 + (combo&4)*1];
620 FALLTHROUGH;
621 // fall through
622
623 case 1: ix += index [1 + (combo&2)*2];
624 w *= weight[1 + (combo&2)*2];
625 }
626
627 F R,G,B;
628 if (a2b->grid_8) {
629 sample_clut_8 (a2b,ix, &R,&G,&B);
630 } else {
631 sample_clut_16(a2b,ix, &R,&G,&B);
632 }
633
634 *r += w*R;
635 *g += w*G;
636 *b += w*B;
637 }
638 }
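
// [Editor's note] A 2-D scalar sketch of the scheme above, not part of skcms.  Per dimension we
// keep a low and a high grid index (with that dimension's stride already folded in) plus weights
// 1-t and t, then sum the value at each of the 2^dim corners scaled by the product of the chosen
// weights -- ordinary bilinear interpolation when dim == 2.
static inline float example_bilerp(const float* grid, int w, int h, float x, float y) {
    // x in [0, w-1], y in [0, h-1]; grid is h rows of w values.
    int   x0 = (int)x, x1 = x0 + 1 < w ? x0 + 1 : x0;
    int   y0 = (int)y, y1 = y0 + 1 < h ? y0 + 1 : y0;
    float tx = x - (float)x0,
          ty = y - (float)y0;

    int   index [4] = { x0,   x1, y0*w, y1*w };     // low/high per dimension, stride folded in
    float weight[4] = { 1-tx, tx, 1-ty, ty   };

    float out = 0;
    for (int combo = 0; combo < 4; combo++) {
        out += weight[0 + (combo&1)] * weight[2 + (combo&2)/2]
             * grid[ index[0 + (combo&1)] + index[2 + (combo&2)/2] ];
    }
    return out;
}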
639
static void exec_ops(const Op* ops, const void** args,
                     const char* src, char* dst, int i) {
642 F r = F0, g = F0, b = F0, a = F1;
643 while (true) {
644 switch (*ops++) {
645 case Op_load_a8:{
646 a = F_from_U8(load<U8>(src + 1*i));
647 } break;
648
649 case Op_load_g8:{
650 r = g = b = F_from_U8(load<U8>(src + 1*i));
651 } break;
652
653 case Op_load_4444:{
654 U16 abgr = load<U16>(src + 2*i);
655
656 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
657 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
658 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
659 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
660 } break;
661
662 case Op_load_565:{
663 U16 rgb = load<U16>(src + 2*i);
664
665 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
666 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
667 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
668 } break;
669
670 case Op_load_888:{
671 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
672 #if defined(USING_NEON)
673 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
674 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
675 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
676 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
677 v = vld3_lane_u8(rgb+0, v, 0);
678 v = vld3_lane_u8(rgb+3, v, 2);
679 v = vld3_lane_u8(rgb+6, v, 4);
680 v = vld3_lane_u8(rgb+9, v, 6);
681
682 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
                // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
684 // ARMv7 and ARMv8 impls.)
685 r = cast<F>((U16)v.val[0]) * (1/255.0f);
686 g = cast<F>((U16)v.val[1]) * (1/255.0f);
687 b = cast<F>((U16)v.val[2]) * (1/255.0f);
688 #else
689 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
690 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
691 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
692 #endif
693 } break;
694
695 case Op_load_8888:{
696 U32 rgba = load<U32>(src + 4*i);
697
698 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
699 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
700 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
701 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
702 } break;
703
704 case Op_load_8888_palette8:{
705 const uint8_t* palette = (const uint8_t*) *args++;
706 I32 ix = cast<I32>(load<U8>(src + 1*i));
707 U32 rgba = gather_32(palette, ix);
708
709 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
710 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
711 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
712 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
713 } break;
714
715 case Op_load_1010102:{
716 U32 rgba = load<U32>(src + 4*i);
717
718 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
719 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
720 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
721 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
722 } break;
723
724 case Op_load_161616LE:{
725 uintptr_t ptr = (uintptr_t)(src + 6*i);
726 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
727 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
728 #if defined(USING_NEON)
729 uint16x4x3_t v = vld3_u16(rgb);
730 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
731 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
732 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
733 #else
734 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
735 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
736 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
737 #endif
738 } break;
739
740 case Op_load_16161616LE:{
741 uintptr_t ptr = (uintptr_t)(src + 8*i);
742 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
743 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
744 #if defined(USING_NEON)
745 uint16x4x4_t v = vld4_u16(rgba);
746 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
747 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
748 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
749 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
750 #else
751 U64 px = load<U64>(rgba);
752
753 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
754 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
755 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
756 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
757 #endif
758 } break;
759
760 case Op_load_161616BE:{
761 uintptr_t ptr = (uintptr_t)(src + 6*i);
762 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
763 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
764 #if defined(USING_NEON)
765 uint16x4x3_t v = vld3_u16(rgb);
766 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
767 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
768 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
769 #else
770 U32 R = load_3<U32>(rgb+0),
771 G = load_3<U32>(rgb+1),
772 B = load_3<U32>(rgb+2);
773 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
774 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
775 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
776 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
777 #endif
778 } break;
779
780 case Op_load_16161616BE:{
781 uintptr_t ptr = (uintptr_t)(src + 8*i);
782 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
783 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
784 #if defined(USING_NEON)
785 uint16x4x4_t v = vld4_u16(rgba);
786 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
787 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
788 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
789 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
790 #else
791 U64 px = swap_endian_16x4(load<U64>(rgba));
792
793 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
794 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
795 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
796 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
797 #endif
798 } break;
799
800 case Op_load_hhh:{
801 uintptr_t ptr = (uintptr_t)(src + 6*i);
802 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
803 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
804 #if defined(USING_NEON)
805 uint16x4x3_t v = vld3_u16(rgb);
806 U16 R = (U16)v.val[0],
807 G = (U16)v.val[1],
808 B = (U16)v.val[2];
809 #else
810 U16 R = load_3<U16>(rgb+0),
811 G = load_3<U16>(rgb+1),
812 B = load_3<U16>(rgb+2);
813 #endif
814 r = F_from_Half(R);
815 g = F_from_Half(G);
816 b = F_from_Half(B);
817 } break;
818
819 case Op_load_hhhh:{
820 uintptr_t ptr = (uintptr_t)(src + 8*i);
821 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
822 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
823 #if defined(USING_NEON)
824 uint16x4x4_t v = vld4_u16(rgba);
825 U16 R = (U16)v.val[0],
826 G = (U16)v.val[1],
827 B = (U16)v.val[2],
828 A = (U16)v.val[3];
829 #else
830 U64 px = load<U64>(rgba);
831 U16 R = cast<U16>((px >> 0) & 0xffff),
832 G = cast<U16>((px >> 16) & 0xffff),
833 B = cast<U16>((px >> 32) & 0xffff),
834 A = cast<U16>((px >> 48) & 0xffff);
835 #endif
836 r = F_from_Half(R);
837 g = F_from_Half(G);
838 b = F_from_Half(B);
839 a = F_from_Half(A);
840 } break;
841
842 case Op_load_fff:{
843 uintptr_t ptr = (uintptr_t)(src + 12*i);
844 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
845 const float* rgb = (const float*)ptr; // cast to const float* to be safe.
846 #if defined(USING_NEON)
847 float32x4x3_t v = vld3q_f32(rgb);
848 r = (F)v.val[0];
849 g = (F)v.val[1];
850 b = (F)v.val[2];
851 #else
852 r = load_3<F>(rgb+0);
853 g = load_3<F>(rgb+1);
854 b = load_3<F>(rgb+2);
855 #endif
856 } break;
857
858 case Op_load_ffff:{
859 uintptr_t ptr = (uintptr_t)(src + 16*i);
860 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
861 const float* rgba = (const float*)ptr; // cast to const float* to be safe.
862 #if defined(USING_NEON)
863 float32x4x4_t v = vld4q_f32(rgba);
864 r = (F)v.val[0];
865 g = (F)v.val[1];
866 b = (F)v.val[2];
867 a = (F)v.val[3];
868 #else
869 r = load_4<F>(rgba+0);
870 g = load_4<F>(rgba+1);
871 b = load_4<F>(rgba+2);
872 a = load_4<F>(rgba+3);
873 #endif
874 } break;
875
876 case Op_swap_rb:{
877 F t = r;
878 r = b;
879 b = t;
880 } break;
881
882 case Op_clamp:{
883 r = max_(F0, min_(r, F1));
884 g = max_(F0, min_(g, F1));
885 b = max_(F0, min_(b, F1));
886 a = max_(F0, min_(a, F1));
887 } break;
888
889 case Op_invert:{
890 r = F1 - r;
891 g = F1 - g;
892 b = F1 - b;
893 a = F1 - a;
894 } break;
895
896 case Op_force_opaque:{
897 a = F1;
898 } break;
899
900 case Op_premul:{
901 r *= a;
902 g *= a;
903 b *= a;
904 } break;
905
906 case Op_unpremul:{
907 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
908 r *= scale;
909 g *= scale;
910 b *= scale;
911 } break;
912
913 case Op_matrix_3x3:{
914 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
915 const float* m = &matrix->vals[0][0];
916
917 F R = m[0]*r + m[1]*g + m[2]*b,
918 G = m[3]*r + m[4]*g + m[5]*b,
919 B = m[6]*r + m[7]*g + m[8]*b;
920
921 r = R;
922 g = G;
923 b = B;
924 } break;
925
926 case Op_matrix_3x4:{
927 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
928 const float* m = &matrix->vals[0][0];
929
930 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
931 G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
932 B = m[8]*r + m[9]*g + m[10]*b + m[11];
933
934 r = R;
935 g = G;
936 b = B;
937 } break;
938
939 case Op_lab_to_xyz:{
940 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
941 F L = r * 100.0f,
942 A = g * 255.0f - 128.0f,
943 B = b * 255.0f - 128.0f;
944
945 // Convert to CIE XYZ.
946 F Y = (L + 16.0f) * (1/116.0f),
947 X = Y + A*(1/500.0f),
948 Z = Y - B*(1/200.0f);
949
950 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
951 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
952 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
953
954 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
955 r = X * 0.9642f;
956 g = Y ;
957 b = Z * 0.8249f;
958 } break;
959
960 case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
961 case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
962 case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
963 case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
964
965 case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
966 case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
967 case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
968 case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
969
970 case Op_clut: {
971 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
972 clut(a2b, &r,&g,&b,a);
973
974 if (a2b->input_channels == 4) {
975 // CMYK is opaque.
976 a = F1;
977 }
978 } break;
979
980 // Notice, from here on down the store_ ops all return, ending the loop.
981
982 case Op_store_a8: {
983 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
984 } return;
985
986 case Op_store_g8: {
987 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
988 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
989 } return;
990
991 case Op_store_4444: {
992 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
993 | cast<U16>(to_fixed(g * 15) << 8)
994 | cast<U16>(to_fixed(b * 15) << 4)
995 | cast<U16>(to_fixed(a * 15) << 0));
996 } return;
997
998 case Op_store_565: {
999 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1000 | cast<U16>(to_fixed(g * 63) << 5 )
1001 | cast<U16>(to_fixed(b * 31) << 11 ));
1002 } return;
1003
1004 case Op_store_888: {
1005 uint8_t* rgb = (uint8_t*)dst + 3*i;
1006 #if defined(USING_NEON)
1007 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1008 // get there via U16 to save some instructions converting to float. And just
1009 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1010 U16 R = cast<U16>(to_fixed(r * 255)),
1011 G = cast<U16>(to_fixed(g * 255)),
1012 B = cast<U16>(to_fixed(b * 255));
1013
1014 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1015 vst3_lane_u8(rgb+0, v, 0);
1016 vst3_lane_u8(rgb+3, v, 2);
1017 vst3_lane_u8(rgb+6, v, 4);
1018 vst3_lane_u8(rgb+9, v, 6);
1019 #else
1020 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1021 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1022 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1023 #endif
1024 } return;
1025
1026 case Op_store_8888: {
1027 store(dst + 4*i, cast<U32>(to_fixed(r * 255) << 0)
1028 | cast<U32>(to_fixed(g * 255) << 8)
1029 | cast<U32>(to_fixed(b * 255) << 16)
1030 | cast<U32>(to_fixed(a * 255) << 24));
1031 } return;
1032
1033 case Op_store_1010102: {
1034 store(dst + 4*i, cast<U32>(to_fixed(r * 1023) << 0)
1035 | cast<U32>(to_fixed(g * 1023) << 10)
1036 | cast<U32>(to_fixed(b * 1023) << 20)
1037 | cast<U32>(to_fixed(a * 3) << 30));
1038 } return;
1039
1040 case Op_store_161616LE: {
1041 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1042 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1043 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1044 #if defined(USING_NEON)
1045 uint16x4x3_t v = {{
1046 (uint16x4_t)cast<U16>(to_fixed(r * 65535)),
1047 (uint16x4_t)cast<U16>(to_fixed(g * 65535)),
1048 (uint16x4_t)cast<U16>(to_fixed(b * 65535)),
1049 }};
1050 vst3_u16(rgb, v);
1051 #else
1052 store_3(rgb+0, cast<U16>(to_fixed(r * 65535)));
1053 store_3(rgb+1, cast<U16>(to_fixed(g * 65535)));
1054 store_3(rgb+2, cast<U16>(to_fixed(b * 65535)));
1055 #endif
1056
1057 } return;
1058
1059 case Op_store_16161616LE: {
1060 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1061 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1062 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1063 #if defined(USING_NEON)
1064 uint16x4x4_t v = {{
1065 (uint16x4_t)cast<U16>(to_fixed(r * 65535)),
1066 (uint16x4_t)cast<U16>(to_fixed(g * 65535)),
1067 (uint16x4_t)cast<U16>(to_fixed(b * 65535)),
1068 (uint16x4_t)cast<U16>(to_fixed(a * 65535)),
1069 }};
1070 vst4_u16(rgba, v);
1071 #else
1072 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1073 | cast<U64>(to_fixed(g * 65535)) << 16
1074 | cast<U64>(to_fixed(b * 65535)) << 32
1075 | cast<U64>(to_fixed(a * 65535)) << 48;
1076 store(rgba, px);
1077 #endif
1078 } return;
1079
1080 case Op_store_161616BE: {
1081 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1082 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1083 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1084 #if defined(USING_NEON)
1085 uint16x4x3_t v = {{
1086 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
1087 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
1088 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
1089 }};
1090 vst3_u16(rgb, v);
1091 #else
1092 I32 R = to_fixed(r * 65535),
1093 G = to_fixed(g * 65535),
1094 B = to_fixed(b * 65535);
1095 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1096 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1097 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1098 #endif
1099
1100 } return;
1101
1102 case Op_store_16161616BE: {
1103 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1104 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1105 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1106 #if defined(USING_NEON)
1107 uint16x4x4_t v = {{
1108 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(r * 65535))),
1109 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(g * 65535))),
1110 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(b * 65535))),
1111 (uint16x4_t)swap_endian_16(cast<U16>(to_fixed(a * 65535))),
1112 }};
1113 vst4_u16(rgba, v);
1114 #else
1115 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1116 | cast<U64>(to_fixed(g * 65535)) << 16
1117 | cast<U64>(to_fixed(b * 65535)) << 32
1118 | cast<U64>(to_fixed(a * 65535)) << 48;
1119 store(rgba, swap_endian_16x4(px));
1120 #endif
1121 } return;
1122
1123 case Op_store_hhh: {
1124 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1125 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1126 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1127
1128 U16 R = Half_from_F(r),
1129 G = Half_from_F(g),
1130 B = Half_from_F(b);
1131 #if defined(USING_NEON)
1132 uint16x4x3_t v = {{
1133 (uint16x4_t)R,
1134 (uint16x4_t)G,
1135 (uint16x4_t)B,
1136 }};
1137 vst3_u16(rgb, v);
1138 #else
1139 store_3(rgb+0, R);
1140 store_3(rgb+1, G);
1141 store_3(rgb+2, B);
1142 #endif
1143 } return;
1144
1145 case Op_store_hhhh: {
1146 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1147 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1148 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1149
1150 U16 R = Half_from_F(r),
1151 G = Half_from_F(g),
1152 B = Half_from_F(b),
1153 A = Half_from_F(a);
1154 #if defined(USING_NEON)
1155 uint16x4x4_t v = {{
1156 (uint16x4_t)R,
1157 (uint16x4_t)G,
1158 (uint16x4_t)B,
1159 (uint16x4_t)A,
1160 }};
1161 vst4_u16(rgba, v);
1162 #else
1163 store(rgba, cast<U64>(R) << 0
1164 | cast<U64>(G) << 16
1165 | cast<U64>(B) << 32
1166 | cast<U64>(A) << 48);
1167 #endif
1168
1169 } return;
1170
1171 case Op_store_fff: {
1172 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1173 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1174 float* rgb = (float*)ptr; // for this cast to float* to be safe.
1175 #if defined(USING_NEON)
1176 float32x4x3_t v = {{
1177 (float32x4_t)r,
1178 (float32x4_t)g,
1179 (float32x4_t)b,
1180 }};
1181 vst3q_f32(rgb, v);
1182 #else
1183 store_3(rgb+0, r);
1184 store_3(rgb+1, g);
1185 store_3(rgb+2, b);
1186 #endif
1187 } return;
1188
1189 case Op_store_ffff: {
1190 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1191 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1192 float* rgba = (float*)ptr; // for this cast to float* to be safe.
1193 #if defined(USING_NEON)
1194 float32x4x4_t v = {{
1195 (float32x4_t)r,
1196 (float32x4_t)g,
1197 (float32x4_t)b,
1198 (float32x4_t)a,
1199 }};
1200 vst4q_f32(rgba, v);
1201 #else
1202 store_4(rgba+0, r);
1203 store_4(rgba+1, g);
1204 store_4(rgba+2, b);
1205 store_4(rgba+3, a);
1206 #endif
1207 } return;
1208 }
1209 }
1210 }
1211
1212
static void run_program(const Op* program, const void** arguments,
                        const char* src, char* dst, int n,
                        const size_t src_bpp, const size_t dst_bpp) {
1216 int i = 0;
1217 while (n >= N) {
1218 exec_ops(program, arguments, src, dst, i);
1219 i += N;
1220 n -= N;
1221 }
1222 if (n > 0) {
1223 char tmp[4*4*N] = {0};
1224
1225 memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1226 exec_ops(program, arguments, tmp, tmp, 0);
1227 memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1228 }
1229 }
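
// [Editor's note] A small sketch of the tail handling above, not part of skcms.  The final n%N
// pixels are staged through a zeroed scratch buffer so exec_ops() can always read and write a
// full batch of N; 4*4*N bytes covers the widest format handled here (RGBA, 4 bytes per channel).
static inline void example_stage_tail(char* tmp, size_t tmp_size,
                                      const char* src, size_t src_bpp, int tail) {
    memset(tmp, 0, tmp_size);                       // zero the lanes past the real pixels
    memcpy(tmp, src, (size_t)tail * src_bpp);       // copy in only the pixels that exist
}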
1230
1231 // Clean up any #defines we may have set so that we can be #included again.
1232 #if defined(USING_AVX)
1233 #undef USING_AVX
1234 #endif
1235 #if defined(USING_AVX_F16C)
1236 #undef USING_AVX_F16C
1237 #endif
1238 #if defined(USING_AVX2)
1239 #undef USING_AVX2
1240 #endif
1241
1242 #if defined(USING_NEON)
1243 #undef USING_NEON
1244 #endif
1245 #if defined(USING_NEON_F16C)
1246 #undef USING_NEON_F16C
1247 #endif
1248
1249 #undef FALLTHROUGH
1250