1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2021 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Soft-float library for IEEE-754.
20 */
21 #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
22
23 #include "astcenc_mathlib.h"
24
25 /* sized soft-float types. These are mapped to the sized integer
26 types of C99, instead of C's floating-point types; this is because
27 the library needs to maintain exact, bit-level control on all
28 operations on these data types. */
29 typedef uint16_t sf16;
30 typedef uint32_t sf32;
31
32 /******************************************
33 helper functions and their lookup tables
34 ******************************************/
35 /* count leading zeros functions. Only used when the input is nonzero. */
36
37 #if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
38 #elif defined(__arm__) && defined(__ARMCC_VERSION)
39 #elif defined(__arm__) && defined(__GNUC__)
40 #else
41 /* table used for the slow default versions. */
42 static const uint8_t clz_table[256] =
43 {
44 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
45 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
46 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
60 };
61 #endif
62
63 /*
64 32-bit count-leading-zeros function: use the Assembly instruction whenever possible. */
clz32(uint32_t inp)65 static uint32_t clz32(uint32_t inp)
66 {
67 #if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
68 uint32_t bsr;
69 __asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
70 return 31 - bsr;
71 #else
72 #if defined(__arm__) && defined(__ARMCC_VERSION)
73 return __clz(inp); /* armcc builtin */
74 #else
75 #if defined(__arm__) && defined(__GNUC__)
76 uint32_t lz;
77 __asm__("clz %0, %1": "=r"(lz):"r"(inp));
78 return lz;
79 #else
80 /* slow default version */
81 uint32_t summa = 24;
82 if (inp >= UINT32_C(0x10000))
83 {
84 inp >>= 16;
85 summa -= 16;
86 }
87 if (inp >= UINT32_C(0x100))
88 {
89 inp >>= 8;
90 summa -= 8;
91 }
92 return summa + clz_table[inp];
93 #endif
94 #endif
95 #endif
96 }
97
98 /* the five rounding modes that IEEE-754r defines */
99 typedef enum
100 {
101 SF_UP = 0, /* round towards positive infinity */
102 SF_DOWN = 1, /* round towards negative infinity */
103 SF_TOZERO = 2, /* round towards zero */
104 SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */
105 SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */
106 } roundmode;
107
108
rtne_shift32(uint32_t inp,uint32_t shamt)109 static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
110 {
111 uint32_t vl1 = UINT32_C(1) << shamt;
112 uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */
113 uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
114 msk--; /* negative if even, nonnegative if odd. */
115 inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */
116 inp2 >>= shamt;
117 return inp2;
118 }
119
rtna_shift32(uint32_t inp,uint32_t shamt)120 static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
121 {
122 uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
123 inp += vl1;
124 inp >>= shamt;
125 return inp;
126 }
127
rtup_shift32(uint32_t inp,uint32_t shamt)128 static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
129 {
130 uint32_t vl1 = UINT32_C(1) << shamt;
131 inp += vl1;
132 inp--;
133 inp >>= shamt;
134 return inp;
135 }
136
137 /* convert from FP16 to FP32. */
sf16_to_sf32(sf16 inp)138 static sf32 sf16_to_sf32(sf16 inp)
139 {
140 uint32_t inpx = inp;
141
142 /*
143 This table contains, for every FP16 sign/exponent value combination,
144 the difference between the input FP16 value and the value obtained
145 by shifting the correct FP32 result right by 13 bits.
146 This table allows us to handle every case except denormals and NaN
147 with just 1 table lookup, 2 shifts and 1 add.
148 */
149
150 #define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
151 static const uint32_t tbl[64] =
152 {
153 WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
154 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
155 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
156 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
157 WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
158 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
159 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
160 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
161 };
162
163 uint32_t res = tbl[inpx >> 10];
164 res += inpx;
165
166 /* Normal cases: MSB of 'res' not set. */
167 if ((res & WITH_MSB(0)) == 0)
168 {
169 return res << 13;
170 }
171
172 /* Infinity and Zero: 10 LSB of 'res' not set. */
173 if ((res & 0x3FF) == 0)
174 {
175 return res << 13;
176 }
177
178 /* NaN: the exponent field of 'inp' is non-zero. */
179 if ((inpx & 0x7C00) != 0)
180 {
181 /* All NaNs are quietened. */
182 return (res << 13) | 0x400000;
183 }
184
185 /* Denormal cases */
186 uint32_t sign = (inpx & 0x8000) << 16;
187 uint32_t mskval = inpx & 0x7FFF;
188 uint32_t leadingzeroes = clz32(mskval);
189 mskval <<= leadingzeroes;
190 return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
191 }
192
193 /* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
sf32_to_sf16(sf32 inp,roundmode rmode)194 static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
195 {
196 /* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
197 static const uint8_t tab[512] {
198 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
199 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
200 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
201 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
202 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
203 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
204 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
205 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
206 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
207 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
208 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
209 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
210 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
211 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
212 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
213 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
214
215 5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
216 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
217 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
218 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
219 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
220 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
221 15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
222 25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
223 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
224 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
225 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
226 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
227 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
228 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
229 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
230 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
231 };
232
233 /* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
234 size. */
235 static const uint32_t tabx[60] {
236 UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
237 UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
238 UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
239 UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
240 UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
241 UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
242 UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
243 UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
244 UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
245 };
246
247 uint32_t p;
248 uint32_t idx = rmode + tab[inp >> 23];
249 uint32_t vlx = tabx[idx];
250 switch (idx)
251 {
252 /*
253 Positive number which may be Infinity or NaN.
254 We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
255 (If we don't do this quieting, then a NaN that is distinguished only by having
256 its low-order bits set, would be turned into an INF. */
257 case 50:
258 case 51:
259 case 52:
260 case 53:
261 case 54:
262 case 55:
263 case 56:
264 case 57:
265 case 58:
266 case 59:
267 /*
268 the input value is 0x7F800000 or 0xFF800000 if it is INF.
269 By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
270 For NaNs, however, this operation will keep bit 23 with the value 1.
271 We can then extract bit 23, and logical-OR bit 9 of the result with this
272 bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
273 of the mantissa is set.)
274 */
275 p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
276 return ((inp + vlx) >> 13) | (p >> 14);
277 /*
278 positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
279 If it is, then return 0, else return 1 (the smallest representable nonzero number)
280 */
281 case 0:
282 /*
283 -inp will set the MSB if the input number is nonzero.
284 Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
285 */
286 return static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31;
287
288 /*
289 negative, exponent = , round-mode == DOWN, need to check whether number is
290 actually 0. If it is, return 0x8000 ( float -0.0 )
291 Else return the smallest negative number ( 0x8001 ) */
292 case 6:
293 /*
294 in this case 'vlx' is 0x80000000. By subtracting the input value from it,
295 we obtain a value that is 0 if the input value is in fact zero and has
296 the MSB set if it isn't. We then right-shift the value by 31 places to
297 get a value that is 0 if the input is -0.0 and 1 otherwise.
298 */
299 return ((vlx - inp) >> 31) + UINT32_C(0x8000);
300
301 /*
302 for all other cases involving underflow/overflow, we don't need to
303 do actual tests; we just return 'vlx'.
304 */
305 case 1:
306 case 2:
307 case 3:
308 case 4:
309 case 5:
310 case 7:
311 case 8:
312 case 9:
313 case 10:
314 case 11:
315 case 12:
316 case 13:
317 case 14:
318 case 15:
319 case 16:
320 case 17:
321 case 18:
322 case 19:
323 case 40:
324 case 41:
325 case 42:
326 case 43:
327 case 44:
328 case 45:
329 case 46:
330 case 47:
331 case 48:
332 case 49:
333 return vlx;
334
335 /*
336 for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
337 FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
338 baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
339 from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
340 for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
341 except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
342
343 /* normal number, all rounding modes except round-to-nearest-even: */
344 case 30:
345 case 31:
346 case 32:
347 case 34:
348 case 35:
349 case 36:
350 case 37:
351 case 39:
352 return (inp + vlx) >> 13;
353
354 /* normal number, round-to-nearest-even. */
355 case 33:
356 case 38:
357 p = inp + vlx;
358 p += (inp >> 13) & 1;
359 return p >> 13;
360
361 /*
362 the various denormal cases. These are not expected to be common, so their performance is a bit
363 less important. For each of these cases, we need to extract an exponent and a mantissa
364 (including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
365 depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
366 sign of the resulting denormal number.
367 */
368 case 21:
369 case 22:
370 case 25:
371 case 27:
372 /* denormal, round towards zero. */
373 p = 126 - ((inp >> 23) & 0xFF);
374 return (((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx;
375 case 20:
376 case 26:
377 /* denormal, round away from zero. */
378 p = 126 - ((inp >> 23) & 0xFF);
379 return rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
380 case 24:
381 case 29:
382 /* denormal, round to nearest-away */
383 p = 126 - ((inp >> 23) & 0xFF);
384 return rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
385 case 23:
386 case 28:
387 /* denormal, round to nearest-even. */
388 p = 126 - ((inp >> 23) & 0xFF);
389 return rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx;
390 }
391
392 return 0;
393 }
394
395 /* convert from soft-float to native-float */
sf16_to_float(uint16_t p)396 float sf16_to_float(uint16_t p)
397 {
398 if32 i;
399 i.u = sf16_to_sf32(p);
400 return i.f;
401 }
402
403 /* convert from native-float to soft-float */
float_to_sf16(float p)404 uint16_t float_to_sf16(float p)
405 {
406 if32 i;
407 i.f = p;
408 return sf32_to_sf16(i.u, SF_NEARESTEVEN);
409 }
410
411 #endif
412