1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2019-2021 Arm Limited
4 // Copyright 2008 Jose Fonseca
5 //
6 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
7 // use this file except in compliance with the License. You may obtain a copy
8 // of the License at:
9 //
10 // http://www.apache.org/licenses/LICENSE-2.0
11 //
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15 // License for the specific language governing permissions and limitations
16 // under the License.
17 // ----------------------------------------------------------------------------
18
19 /*
20 * This module implements vector support for floats, ints, and vector lane
21 * control masks. It provides access to both explicit vector width types, and
22 * flexible N-wide types where N can be determined at compile time.
23 *
24 * The design of this module encourages use of vector length agnostic code, via
25 * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
26 * with that is available at compile time. The current vector width is
27 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
28 *
29 * Explicit scalar types are acessible via the vint1, vfloat1, vmask1 types.
30 * These are provided primarily for prototyping and algorithm debug of VLA
31 * implementations.
32 *
33 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
34 * types. These are provided for use by VLA code, but are also expected to be
35 * used as a fixed-width type and will supported a reference C++ fallback for
36 * use on platforms without SIMD intrinsics.
37 *
38 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
39 * types. These are provide for use by VLA code, and are not expected to be
40 * used as a fixed-width type in normal code. No reference C implementation is
41 * provided on platforms without underlying SIMD intrinsics.
42 *
43 * With the current implementation ISA support is provided for:
44 *
45 * * 1-wide for scalar reference.
46 * * 4-wide for Armv8-A NEON.
47 * * 4-wide for x86-64 SSE2.
48 * * 4-wide for x86-64 SSE4.1.
49 * * 8-wide for x86-64 AVX2.
50 */
51
52 #ifndef ASTC_VECMATHLIB_H_INCLUDED
53 #define ASTC_VECMATHLIB_H_INCLUDED
54
55 #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
56 #include <immintrin.h>
57 #elif ASTCENC_NEON != 0
58 #include <arm_neon.h>
59 #endif
60
61 #if !defined(__clang__) && defined(_MSC_VER)
62 #define ASTCENC_SIMD_INLINE __forceinline
63 #elif defined(__GNUC__) && !defined(__clang__)
64 #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
65 #else
66 #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
67 #endif
68
69 #if ASTCENC_AVX >= 2
70 /* If we have AVX2 expose 8-wide VLA. */
71 #include "astcenc_vecmathlib_sse_4.h"
72 #include "astcenc_vecmathlib_common_4.h"
73 #include "astcenc_vecmathlib_avx2_8.h"
74
75 #define ASTCENC_SIMD_WIDTH 8
76
77 using vfloat = vfloat8;
78
79 #if defined(ASTCENC_NO_INVARIANCE)
80 using vfloatacc = vfloat8;
81 #else
82 using vfloatacc = vfloat4;
83 #endif
84
85 using vint = vint8;
86 using vmask = vmask8;
87
88 constexpr auto loada = vfloat8::loada;
89 constexpr auto load1 = vfloat8::load1;
90
91 #elif ASTCENC_SSE >= 20
92 /* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */
93 #include "astcenc_vecmathlib_sse_4.h"
94 #include "astcenc_vecmathlib_common_4.h"
95
96 #define ASTCENC_SIMD_WIDTH 4
97
98 using vfloat = vfloat4;
99 using vfloatacc = vfloat4;
100 using vint = vint4;
101 using vmask = vmask4;
102
103 constexpr auto loada = vfloat4::loada;
104 constexpr auto load1 = vfloat4::load1;
105
106 #elif ASTCENC_NEON > 0
107 /* If we have NEON expose 4-wide VLA. */
108 #include "astcenc_vecmathlib_neon_4.h"
109 #include "astcenc_vecmathlib_common_4.h"
110
111 #define ASTCENC_SIMD_WIDTH 4
112
113 using vfloat = vfloat4;
114 using vfloatacc = vfloat4;
115 using vint = vint4;
116 using vmask = vmask4;
117
118 constexpr auto loada = vfloat4::loada;
119 constexpr auto load1 = vfloat4::load1;
120
121 #else
122 // If we have nothing expose 4-wide VLA, and 4-wide fixed width.
123
124 // Note: We no longer expose the 1-wide scalar fallback because it is not
125 // invariant with the 4-wide path due to algorithms that use horizontal
126 // operations that accumulate a local vector sum before accumulating into
127 // a running sum.
128 //
129 // For 4 items adding into an accumulator using 1-wide vectors the sum is:
130 //
131 // result = ((((sum + l0) + l1) + l2) + l3)
132 //
133 // ... whereas the accumulator for a 4-wide vector sum is:
134 //
135 // result = sum + ((l0 + l2) + (l1 + l3))
136 //
137 // In "normal maths" this is the same, but the floating point reassociation
138 // differences mean that these will not produce the same result.
139
140 #include "astcenc_vecmathlib_none_4.h"
141 #include "astcenc_vecmathlib_common_4.h"
142
143 #define ASTCENC_SIMD_WIDTH 4
144
145 using vfloat = vfloat4;
146 using vfloatacc = vfloat4;
147 using vint = vint4;
148 using vmask = vmask4;
149
150 constexpr auto loada = vfloat4::loada;
151 constexpr auto load1 = vfloat4::load1;
152 #endif
153
154 /**
155 * @brief Round a count down to the largest multiple of 8.
156 *
157 * @param count The unrounded value.
158 *
159 * @return The rounded value.
160 */
round_down_to_simd_multiple_8(unsigned int count)161 ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count)
162 {
163 return count & ~(8 - 1);
164 }
165
166 /**
167 * @brief Round a count down to the largest multiple of 4.
168 *
169 * @param count The unrounded value.
170 *
171 * @return The rounded value.
172 */
round_down_to_simd_multiple_4(unsigned int count)173 ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
174 {
175 return count & ~(4 - 1);
176 }
177
178 /**
179 * @brief Round a count down to the largest multiple of the SIMD width.
180 *
181 * Assumption that the vector width is a power of two ...
182 *
183 * @param count The unrounded value.
184 *
185 * @return The rounded value.
186 */
round_down_to_simd_multiple_vla(unsigned int count)187 ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
188 {
189 return count & ~(ASTCENC_SIMD_WIDTH - 1);
190 }
191
192 /**
193 * @brief Round a count up to the largest multiple of the SIMD width.
194 *
195 * Assumption that the vector width is a power of two ...
196 *
197 * @param count The unrounded value.
198 *
199 * @return The rounded value.
200 */
round_up_to_simd_multiple_vla(unsigned int count)201 ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
202 {
203 int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
204 return multiples * ASTCENC_SIMD_WIDTH;
205 }
206
207 /**
208 * @brief Return @c a with lanes negated if the @c b lane is negative.
209 */
change_sign(vfloat a,vfloat b)210 ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
211 {
212 vint ia = float_as_int(a);
213 vint ib = float_as_int(b);
214 vint sign_mask(static_cast<int>(0x80000000));
215 vint r = ia ^ (ib & sign_mask);
216 return int_as_float(r);
217 }
218
219 /**
220 * @brief Return fast, but approximate, vector atan(x).
221 *
222 * Max error of this implementaiton is 0.004883.
223 */
atan(vfloat x)224 ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
225 {
226 vmask c = abs(x) > vfloat(1.0f);
227 vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
228 vfloat y = select(x, vfloat(1.0f) / x, c);
229 y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
230 return select(y, z - y, c);
231 }
232
233 /**
234 * @brief Return fast, but approximate, vector atan2(x, y).
235 */
atan2(vfloat y,vfloat x)236 ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
237 {
238 vfloat z = atan(abs(y / x));
239 vmask xmask = vmask(float_as_int(x).m);
240 return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
241 }
242
243 /*
244 * @brief Factory that returns a unit length 4 component vfloat4.
245 */
unit4()246 static ASTCENC_SIMD_INLINE vfloat4 unit4()
247 {
248 return vfloat4(0.5f);
249 }
250
251 /**
252 * @brief Factory that returns a unit length 3 component vfloat4.
253 */
unit3()254 static ASTCENC_SIMD_INLINE vfloat4 unit3()
255 {
256 float val = 0.577350258827209473f;
257 return vfloat4(val, val, val, 0.0f);
258 }
259
260 /**
261 * @brief Factory that returns a unit length 2 component vfloat4.
262 */
unit2()263 static ASTCENC_SIMD_INLINE vfloat4 unit2()
264 {
265 float val = 0.707106769084930420f;
266 return vfloat4(val, val, 0.0f, 0.0f);
267 }
268
269 /**
270 * @brief Factory that returns a 3 component vfloat4.
271 */
vfloat3(float a,float b,float c)272 static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
273 {
274 return vfloat4(a, b, c, 0.0f);
275 }
276
277 /**
278 * @brief Factory that returns a 2 component vfloat4.
279 */
vfloat2(float a,float b)280 static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
281 {
282 return vfloat4(a, b, 0.0f, 0.0f);
283 }
284
285 /**
286 * @brief Normalize a non-zero length vector to unit length.
287 */
normalize(vfloat4 a)288 static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
289 {
290 vfloat4 length = dot(a, a);
291 return a / sqrt(length);
292 }
293
294 /**
295 * @brief Normalize a vector, returning @c safe if len is zero.
296 */
normalize_safe(vfloat4 a,vfloat4 safe)297 static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
298 {
299 vfloat4 length = dot(a, a);
300 if (length.lane<0>() != 0.0f)
301 {
302 return a / sqrt(length);
303 }
304
305 return safe;
306 }
307
308
309
310 #define POLY0(x, c0) ( c0)
311 #define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
312 #define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
313 #define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
314 #define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
315 #define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
316
317 /**
318 * @brief Compute an approximate exp2(x) for each lane in the vector.
319 *
320 * Based on 5th degree minimax polynomials, ported from this blog
321 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
322 */
exp2(vfloat4 x)323 static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
324 {
325 x = clamp(-126.99999f, 129.0f, x);
326
327 vint4 ipart = float_to_int(x - 0.5f);
328 vfloat4 fpart = x - int_to_float(ipart);
329
330 // Integer contrib, using 1 << ipart
331 vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
332
333 // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
334 vfloat4 fexp = POLY5(fpart,
335 9.9999994e-1f,
336 6.9315308e-1f,
337 2.4015361e-1f,
338 5.5826318e-2f,
339 8.9893397e-3f,
340 1.8775767e-3f);
341
342 return iexp * fexp;
343 }
344
345 /**
346 * @brief Compute an approximate log2(x) for each lane in the vector.
347 *
348 * Based on 5th degree minimax polynomials, ported from this blog
349 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
350 */
log2(vfloat4 x)351 static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
352 {
353 vint4 exp(0x7F800000);
354 vint4 mant(0x007FFFFF);
355 vint4 one(0x3F800000);
356
357 vint4 i = float_as_int(x);
358
359 vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
360
361 vfloat4 m = int_as_float((i & mant) | one);
362
363 // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
364 vfloat4 p = POLY4(m,
365 2.8882704548164776201f,
366 -2.52074962577807006663f,
367 1.48116647521213171641f,
368 -0.465725644288844778798f,
369 0.0596515482674574969533f);
370
371 // Increases the polynomial degree, but ensures that log2(1) == 0
372 p = p * (m - 1.0f);
373
374 return p + e;
375 }
376
377 /**
378 * @brief Compute an approximate pow(x, y) for each lane in the vector.
379 *
380 * Power function based on the exp2(log2(x) * y) transform.
381 */
pow(vfloat4 x,vfloat4 y)382 static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
383 {
384 vmask4 zero_mask = y == vfloat4(0.0f);
385 vfloat4 estimate = exp2(log2(x) * y);
386
387 // Guarantee that y == 0 returns exactly 1.0f
388 return select(estimate, vfloat4(1.0f), zero_mask);
389 }
390
391 /**
392 * @brief Count the leading zeros for each lane in @c a.
393 *
394 * Valid for all data values of @c a; will return a per-lane value [0, 32].
395 */
clz(vint4 a)396 static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
397 {
398 // This function is a horrible abuse of floating point exponents to convert
399 // the original integer value into a 2^N encoding we can recover easily.
400
401 // Convert to float without risk of rounding up by keeping only top 8 bits.
402 // This trick is is guranteed to keep top 8 bits and clear the 9th.
403 a = (~lsr<8>(a)) & a;
404 a = float_as_int(int_to_float(a));
405
406 // Extract and unbias exponent
407 a = vint4(127 + 31) - lsr<23>(a);
408
409 // Clamp result to a valid 32-bit range
410 return clamp(0, 32, a);
411 }
412
413 /**
414 * @brief Return lanewise 2^a for each lane in @c a.
415 *
416 * Use of signed int mean that this is only valid for values in range [0, 31].
417 */
two_to_the_n(vint4 a)418 static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
419 {
420 // 2^30 is the largest signed number than can be represented
421 assert(all(a < vint4(31)));
422
423 // This function is a horrible abuse of floating point to use the exponent
424 // and float conversion to generate a 2^N multiple.
425
426 // Bias the exponent
427 vint4 exp = a + 127;
428 exp = lsl<23>(exp);
429
430 // Reinterpret the bits as a float, and then convert to an int
431 vfloat4 f = int_as_float(exp);
432 return float_to_int(f);
433 }
434
435 /**
436 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
437 */
unorm16_to_sf16(vint4 p)438 static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
439 {
440 vint4 fp16_one = vint4(0x3C00);
441 vint4 fp16_small = lsl<8>(p);
442
443 vmask4 is_one = p == vint4(0xFFFF);
444 vmask4 is_small = p < vint4(4);
445
446 // Manually inline clz() on Visual Studio to avoid release build codegen bug
447 // see https://github.com/ARM-software/astc-encoder/issues/259
448 #if !defined(__clang__) && defined(_MSC_VER)
449 vint4 a = (~lsr<8>(p)) & p;
450 a = float_as_int(int_to_float(a));
451 a = vint4(127 + 31) - lsr<23>(a);
452 vint4 lz = clamp(0, 32, a) - 16;
453 #else
454 vint4 lz = clz(p) - 16;
455 #endif
456
457 p = p * two_to_the_n(lz + 1);
458 p = p & vint4(0xFFFF);
459
460 p = lsr<6>(p);
461
462 p = p | lsl<10>(vint4(14) - lz);
463
464 vint4 r = select(p, fp16_one, is_one);
465 r = select(r, fp16_small, is_small);
466 return r;
467 }
468
469 /**
470 * @brief Convert 16-bit LNS to float16.
471 */
lns_to_sf16(vint4 p)472 static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
473 {
474 vint4 mc = p & 0x7FF;
475 vint4 ec = lsr<11>(p);
476
477 vint4 mc_512 = mc * 3;
478 vmask4 mask_512 = mc < vint4(512);
479
480 vint4 mc_1536 = mc * 4 - 512;
481 vmask4 mask_1536 = mc < vint4(1536);
482
483 vint4 mc_else = mc * 5 - 2048;
484
485 vint4 mt = mc_else;
486 mt = select(mt, mc_1536, mask_1536);
487 mt = select(mt, mc_512, mask_512);
488
489 vint4 res = lsl<10>(ec) | lsr<3>(mt);
490 return min(res, vint4(0x7BFF));
491 }
492
493 /**
494 * @brief Extract mantissa and exponent of a float value.
495 *
496 * @param a The input value.
497 * @param[out] exp The output exponent.
498 *
499 * @return The mantissa.
500 */
frexp(vfloat4 a,vint4 & exp)501 static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
502 {
503 // Interpret the bits as an integer
504 vint4 ai = float_as_int(a);
505
506 // Extract and unbias the exponent
507 exp = (lsr<23>(ai) & 0xFF) - 126;
508
509 // Extract and unbias the mantissa
510 vint4 manti = (ai & 0x807FFFFF) | 0x3F000000;
511 return int_as_float(manti);
512 }
513
514 /**
515 * @brief Convert float to 16-bit LNS.
516 */
float_to_lns(vfloat4 a)517 static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
518 {
519 vint4 exp;
520 vfloat4 mant = frexp(a, exp);
521
522 // Do these early before we start messing about ...
523 vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
524 vmask4 mask_infinity = a >= vfloat4(65536.0f);
525
526 // If input is smaller than 2^-14, multiply by 2^25 and don't bias.
527 vmask4 exp_lt_m13 = exp < vint4(-13);
528
529 vfloat4 a1a = a * 33554432.0f;
530 vint4 expa = vint4::zero();
531
532 vfloat4 a1b = (mant - 0.5f) * 4096;
533 vint4 expb = exp + 14;
534
535 a = select(a1b, a1a, exp_lt_m13);
536 exp = select(expb, expa, exp_lt_m13);
537
538 vmask4 a_lt_384 = a < vfloat4(384.0f);
539 vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
540
541 vfloat4 a2a = a * (4.0f / 3.0f);
542 vfloat4 a2b = a + 128.0f;
543 vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
544
545 a = a2c;
546 a = select(a, a2b, a_lt_1408);
547 a = select(a, a2a, a_lt_384);
548
549 a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
550
551 a = select(a, vfloat4(65535.0f), mask_infinity);
552 a = select(a, vfloat4::zero(), mask_underflow_nan);
553
554 return a;
555 }
556
557 namespace astc
558 {
559
pow(float x,float y)560 static ASTCENC_SIMD_INLINE float pow(float x, float y)
561 {
562 return pow(vfloat4(x), vfloat4(y)).lane<0>();
563 }
564
565 }
566
567 #endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
568