• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * e_powf.c - single-precision power function
3  *
4  * Copyright (c) 2009-2018, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include <math.h>
9 #include <errno.h>
10 #include "math_private.h"
11 
12 float
powf(float x,float y)13 powf(float x, float y)
14 {
15     float logh, logl;
16     float rlogh, rlogl;
17     float sign = 1.0f;
18     int expadjust = 0;
19     unsigned ix, iy;
20 
21     ix = fai(x);
22     iy = fai(y);
23 
24     if (__builtin_expect((ix - 0x00800000) >= (0x7f800000 - 0x00800000) ||
25                          ((iy << 1) + 0x02000000) < 0x40000000, 0)) {
26         /*
27          * The above test rules out, as quickly as I can see how to,
28          * all possible inputs except for a normalised positive x
29          * being raised to the power of a normalised (and not
30          * excessively small) y. That's the fast-path case: if
31          * that's what the user wants, we can skip all of the
32          * difficult special-case handling.
33          *
34          * Now we must identify, as efficiently as we can, cases
35          * which will return to the fast path with a little tidying
36          * up. These are, in order of likelihood and hence of
37          * processing:
38          *
39          *  - a normalised _negative_ x raised to the power of a
40          *    non-zero finite y. Having identified this case, we
41          *    must categorise y into one of the three categories
42          *    'odd integer', 'even integer' and 'non-integer'; for
43          *    the last of these we return an error, while for the
44          *    other two we rejoin the main code path having rendered
45          *    x positive and stored an appropriate sign to append to
46          *    the eventual result.
47          *
48          *  - a _denormal_ x raised to the power of a non-zero
49          *    finite y, in which case we multiply it up by a power
50          *    of two to renormalise it, store an appropriate
51          *    adjustment for its base-2 logarithm, and depending on
52          *    the sign of y either return straight to the main code
53          *    path or go via the categorisation of y above.
54          *
55          *  - any of the above kinds of x raised to the power of a
56          *    zero, denormal, nearly-denormal or nearly-infinite y,
57          *    in which case we must do the checks on x as above but
58          *    otherwise the algorithm goes through basically
59          *    unchanged. Denormal and very tiny y values get scaled
60          *    up to something not in range of accidental underflow
61          *    when split into prec-and-a-half format; very large y
62          *    values get scaled down by a factor of two to prevent
63          *    CLEARBOTTOMHALF's round-up from overflowing them to
64          *    infinity. (Of course the _output_ will overflow either
65          *    way - the largest y value that can possibly yield a
66          *    finite result is well below this range anyway - so
67          *    this is a safe change.)
68          */
69         if (__builtin_expect(((iy << 1) + 0x02000000) >= 0x40000000, 1)) {   /* normalised and sensible y */
70             y_ok_check_x:
71 
72             if (__builtin_expect((ix - 0x80800000) < (0xff800000 - 0x80800000), 1)) {   /* normal but negative x */
73                 y_ok_x_negative:
74 
75                 x = fabsf(x);
76                 ix = fai(x);
77 
78                 /*
79                  * Determine the parity of y, if it's an integer at
80                  * all.
81                  */
82                 {
83                     int yexp, yunitsbit;
84 
85                     /*
86                      * Find the exponent of y.
87                      */
88                     yexp = (iy >> 23) & 0xFF;
89                     /*
90                      * Numbers with an exponent smaller than 0x7F
91                      * are strictly smaller than 1, and hence must
92                      * be fractional.
93                      */
94                     if (yexp < 0x7F)
95                         return MATHERR_POWF_NEGFRAC(x,y);
96                     /*
97                      * Numbers with an exponent at least 0x97 are by
98                      * definition even integers.
99                      */
100                     if (yexp >= 0x97)
101                         goto mainpath;  /* rejoin main code, giving positive result */
102                     /*
103                      * In between, we must check the mantissa.
104                      *
105                      * Note that this case includes yexp==0x7f,
106                      * which means 1 point something. In this case,
107                      * the 'units bit' we're testing is semantically
108                      * the lowest bit of the exponent field, not the
109                      * leading 1 on the mantissa - but fortunately,
110                      * that bit position will just happen to contain
111                      * the 1 that we would wish it to, because the
112                      * exponent describing that particular case just
113                      * happens to be odd.
114                      */
115                     yunitsbit = 0x96 - yexp;
116                     if (iy & ((1 << yunitsbit)-1))
117                         return MATHERR_POWF_NEGFRAC(x,y);
118                     else if (iy & (1 << yunitsbit))
119                         sign = -1.0f;  /* y is odd; result should be negative */
120                     goto mainpath;     /* now we can rejoin the main code */
121                 }
122             } else if (__builtin_expect((ix << 1) != 0 && (ix << 1) < 0x01000000, 0)) {   /* denormal x */
123                 /*
124                  * Renormalise x.
125                  */
126                 x *= 0x1p+27F;
127                 ix = fai(x);
128                 /*
129                  * Set expadjust to compensate for that.
130                  */
131                 expadjust = -27;
132 
133                 /* Now we need to handle negative x as above. */
134                 if (ix & 0x80000000)
135                     goto y_ok_x_negative;
136                 else
137                     goto mainpath;
138             } else if ((ix - 0x00800000) < (0x7f800000 - 0x00800000)) {
139                 /* normal positive x, back here from denormal-y case below */
140                 goto mainpath;
141             }
142         } else if (((iy << 1) + 0x02000000) >= 0x02000000) { /* denormal, nearly-denormal or zero y */
143             if (y == 0.0F) {
144                  /*
145                  * y == 0. Any finite x returns 1 here. (Quiet NaNs
146                  * do too, but we handle that below since we don't
147                  * mind doing them more slowly.)
148                  */
149                 if ((ix << 1) != 0 && (ix << 1) < 0xFF000000)
150                     return 1.0f;
151             } else {
152                 /*
153                  * Denormal or very very small y. In this situation
154                  * we have to be a bit careful, because when we
155                  * break up y into precision-and-a-half later on we
156                  * risk working with denormals and triggering
157                  * underflow exceptions within this function that
158                  * aren't related to the smallness of the output. So
159                  * here we convert all such y values into a standard
160                  * small-but-not-too-small value which will give the
161                  * same output.
162                  *
163                  * What value should that be? Well, we work in
164                  * 16*log2(x) below (equivalently, log to the base
165                  * 2^{1/16}). So the maximum magnitude of that for
166                  * any finite x is about 2416 (= 16 * (128+23), for
167                  * log of the smallest denormal x), i.e. certainly
168                  * less than 2^12. If multiplying that by y gives
169                  * anything of magnitude less than 2^-32 (and even
170                  * that's being generous), the final output will be
171                  * indistinguishable from 1. So any value of y with
172                  * magnitude less than 2^-(32+12) = 2^-44 is
173                  * completely indistinguishable from any other such
174                  * value. Hence we got here in the first place by
175                  * checking the exponent of y against 64 (i.e. -63,
176                  * counting the exponent bias), so we might as well
177                  * normalise all tiny y values to the same threshold
178                  * of 2^-64.
179                  */
180                 iy = 0x1f800000 | (iy & 0x80000000);   /* keep the sign; that's important */
181                 y = fhex(iy);
182             }
183             goto y_ok_check_x;
184         } else if (((iy << 1) + 0x02000000) < 0x01000000) { /* y in top finite exponent bracket */
185             y = fhex(fai(y) - 0x00800000);   /* scale down by a factor of 2 */
186             goto y_ok_check_x;
187         }
188 
189         /*
190          * Having dealt with the above cases, we now know that
191          * either x is zero, infinite or NaN, or y is infinite or
192          * NaN, or both. We can deal with all of those cases without
193          * ever rejoining the main code path.
194          */
195         if ((unsigned)(((ix & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001) ||
196             (unsigned)(((iy & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001)) {
197             /*
198              * At least one signalling NaN. Do a token arithmetic
199              * operation on the two operands to provoke an exception
200              * and return the appropriate QNaN.
201              */
202             return FLOAT_INFNAN2(x,y);
203         } else if (ix==0x3f800000 || (iy << 1)==0) {
204             /*
205              * C99 says that 1^anything and anything^0 should both
206              * return 1, _even for a NaN_. I modify that slightly to
207              * apply only to QNaNs (which doesn't violate C99, since
208              * C99 doesn't specify anything about SNaNs at all),
209              * because I can't bring myself not to throw an
210              * exception on an SNaN since its _entire purpose_ is to
211              * throw an exception whenever touched.
212              */
213             return 1.0f;
214         } else
215         if (((ix & 0x7FFFFFFF) > 0x7f800000) ||
216             ((iy & 0x7FFFFFFF) > 0x7f800000)) {
217             /*
218              * At least one QNaN. Do a token arithmetic operation on
219              * the two operands to get the right one to propagate to
220              * the output.
221              */
222             return FLOAT_INFNAN2(x,y);
223         } else if (ix == 0x7f800000) {
224             /*
225              * x = +infinity. Return +infinity for positive y, +0
226              * for negative y, and 1 for zero y.
227              */
228             if (!(iy << 1))
229                 return MATHERR_POWF_INF0(x,y);
230             else if (iy & 0x80000000)
231                 return 0.0f;
232             else
233                 return INFINITY;
234         } else {
235             /*
236              * Repeat the parity analysis of y above, returning 1
237              * (odd), 2 (even) or 0 (fraction).
238              */
239             int ypar, yexp, yunitsbit;
240             yexp = (iy >> 23) & 0xFF;
241             if (yexp < 0x7F)
242                 ypar = 0;
243             else if (yexp >= 0x97)
244                 ypar = 2;
245             else {
246                 yunitsbit = 0x96 - yexp;
247                 if (iy & ((1 << yunitsbit)-1))
248                     ypar = 0;
249                 else if (iy & (1 << yunitsbit))
250                     ypar = 1;
251                 else
252                     ypar = 2;
253             }
254 
255             if (ix == 0xff800000) {
256                 /*
257                  * x = -infinity. We return infinity or zero
258                  * depending on whether y is positive or negative,
259                  * and the sign is negative iff y is an odd integer.
260                  * (SGT: I don't like this, but it's what C99
261                  * mandates.)
262                  */
263                 if (!(iy & 0x80000000)) {
264                     if (ypar == 1)
265                         return -INFINITY;
266                     else
267                         return INFINITY;
268                 } else {
269                     if (ypar == 1)
270                         return -0.0f;
271                     else
272                         return +0.0f;
273                 }
274             } else if (ix == 0) {
275                 /*
276                  * x = +0. We return +0 for all positive y including
277                  * infinity; a divide-by-zero-like error for all
278                  * negative y including infinity; and an 0^0 error
279                  * for zero y.
280                  */
281                 if ((iy << 1) == 0)
282                     return MATHERR_POWF_00(x,y);
283                 else if (iy & 0x80000000)
284                     return MATHERR_POWF_0NEGEVEN(x,y);
285                 else
286                     return +0.0f;
287             } else if (ix == 0x80000000) {
288                 /*
289                  * x = -0. We return errors in almost all cases (the
290                  * exception being positive integer y, in which case
291                  * we return a zero of the appropriate sign), but
292                  * the errors are almost all different. Gah.
293                  */
294                 if ((iy << 1) == 0)
295                     return MATHERR_POWF_00(x,y);
296                 else if (iy == 0x7f800000)
297                     return MATHERR_POWF_NEG0FRAC(x,y);
298                 else if (iy == 0xff800000)
299                     return MATHERR_POWF_0NEG(x,y);
300                 else if (iy & 0x80000000)
301                     return (ypar == 0 ? MATHERR_POWF_0NEG(x,y) :
302                             ypar == 1 ? MATHERR_POWF_0NEGODD(x,y) :
303                             /* ypar == 2 ? */ MATHERR_POWF_0NEGEVEN(x,y));
304                 else
305                     return (ypar == 0 ? MATHERR_POWF_NEG0FRAC(x,y) :
306                             ypar == 1 ? -0.0f :
307                             /* ypar == 2 ? */ +0.0f);
308             } else {
309                 /*
310                  * Now we know y is an infinity of one sign or the
311                  * other and x is finite and nonzero. If x == -1 (+1
312                  * is already ruled out), we return +1; otherwise
313                  * C99 mandates that we return either +0 or +inf,
314                  * the former iff exactly one of |x| < 1 and y<0 is
315                  * true.
316                  */
317                 if (ix == 0xbf800000) {
318                     return +1.0f;
319                 } else if (!((ix << 1) < 0x7f000000) ^ !(iy & 0x80000000)) {
320                     return +0.0f;
321                 }
322                 else {
323                     return INFINITY;
324                 }
325             }
326         }
327     }
328 
329     mainpath:
330 
331 #define PHMULTIPLY(rh,rl, xh,xl, yh,yl) do { \
332     float tmph, tmpl; \
333     tmph = (xh) * (yh); \
334     tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \
335 /* printf("PHMULTIPLY: tmp=%08x+%08x\n", fai(tmph), fai(tmpl)); */ \
336     (rh) = CLEARBOTTOMHALF(tmph + tmpl); \
337     (rl) = tmpl + (tmph - (rh)); \
338 } while (0)
339 
340 /*
341  * Same as the PHMULTIPLY macro above, but bounds the absolute value
342  * of rh+rl. In multiplications uncontrolled enough that rh can go
343  * infinite, we can get an IVO exception from the subtraction tmph -
344  * rh, so we should spot that case in advance and avoid it.
345  */
346 #define PHMULTIPLY_SATURATE(rh,rl, xh,xl, yh,yl, bound) do {            \
347         float tmph, tmpl;                                               \
348         tmph = (xh) * (yh);                                             \
349         if (fabsf(tmph) > (bound)) {                                  \
350             (rh) = copysignf((bound),(tmph));                           \
351             (rl) = 0.0f;                                                \
352         } else {                                                        \
353             tmpl = (xh) * (yl) + (xl) * ((yh)+(yl));                    \
354             (rh) = CLEARBOTTOMHALF(tmph + tmpl);                        \
355             (rl) = tmpl + (tmph - (rh));                                \
356         }                                                               \
357     } while (0)
358 
359     /*
360      * Determine log2 of x to relative prec-and-a-half, as logh +
361      * logl.
362      *
363      * Well, we're actually computing 16*log2(x), so that it's the
364      * right size for the subsequently fiddly messing with powers of
365      * 2^(1/16) in the exp step at the end.
366      */
367     if (__builtin_expect((ix - 0x3f7ff000) <= (0x3f801000 - 0x3f7ff000), 0)) {
368         /*
369          * For x this close to 1, we write x = 1 + t and then
370          * compute t - t^2/2 + t^3/3 - t^4/4; and the neat bit is
371          * that t itself, being the bottom half of an input
372          * mantissa, is in half-precision already, so the output is
373          * naturally in canonical prec-and-a-half form.
374          */
375         float t = x - 1.0;
376         float lnh, lnl;
377         /*
378          * Compute natural log of x in prec-and-a-half.
379          */
380         lnh = t;
381         lnl = - (t * t) * ((1.0f/2.0f) - t * ((1.0f/3.0f) - t * (1.0f/4.0f)));
382 
383         /*
384          * Now we must scale by 16/log(2), still in prec-and-a-half,
385          * to turn this from natural log(x) into 16*log2(x).
386          */
387         PHMULTIPLY(logh, logl, lnh, lnl, 0x1.716p+4F, -0x1.7135a8p-9F);
388     } else {
389         /*
390          * For all other x, we start by normalising to [1,2), and
391          * then dividing that further into subintervals. For each
392          * subinterval we pick a number a in that interval, compute
393          * s = (x-a)/(x+a) in precision-and-a-half, and then find
394          * the log base 2 of (1+s)/(1-s), still in precision-and-a-
395          * half.
396          *
397          * Why would we do anything so silly? For two main reasons.
398          *
399          * Firstly, if s = (x-a)/(x+a), then a bit of algebra tells
400          * us that x = a * (1+s)/(1-s); so once we've got
401          * log2((1+s)/(1-s)), we need only add on log2(a) and then
402          * we've got log2(x). So this lets us treat all our
403          * subintervals in essentially the same way, rather than
404          * requiring a separate approximation for each one; the only
405          * correction factor we need is to store a table of the
406          * base-2 logs of all our values of a.
407          *
408          * Secondly, log2((1+s)/(1-s)) is a nice thing to compute,
409          * once we've got s. Switching to natural logarithms for the
410          * moment (it's only a scaling factor to sort that out at
411          * the end), we write it as the difference of two logs:
412          *
413          *   log((1+s)/(1-s)) = log(1+s) - log(1-s)
414          *
415          * Now recall that Taylor series expansion gives us
416          *
417          *   log(1+s) = s - s^2/2 + s^3/3 - ...
418          *
419          * and therefore we also have
420          *
421          *   log(1-s) = -s - s^2/2 - s^3/3 - ...
422          *
423          * These series are exactly the same except that the odd
424          * terms (s, s^3 etc) have flipped signs; so subtracting the
425          * latter from the former gives us
426          *
427          *   log(1+s) - log(1-s) = 2s + 2s^3/3 + 2s^5/5 + ...
428          *
429          * which requires only half as many terms to be computed
430          * before the powers of s get too small to see. Then, of
431          * course, we have to scale the result by 1/log(2) to
432          * convert natural logs into logs base 2.
433          *
434          * To compute the above series in precision-and-a-half, we
435          * first extract a factor of 2s (which we can multiply back
436          * in later) so that we're computing 1 + s^2/3 + s^4/5 + ...
437          * and then observe that if s starts off small enough to
438          * make s^2/3 at most 2^-12, we need only compute the first
439          * couple of terms in laborious prec-and-a-half, and can
440          * delegate everything after that to a simple polynomial
441          * approximation whose error will end up at the bottom of
442          * the low word of the result.
443          *
444          * How many subintervals does that mean we need?
445          *
446          * To go back to s = (x-a)/(x+a). Let x = a + e, for some
447          * positive e. Then |s| = |e| / |2a+e| <= |e/2a|. So suppose
448          * we have n subintervals of equal width covering the space
449          * from 1 to 2. If a is at the centre of each interval, then
450          * we have e at most 1/2n and a can equal any of 1, 1+1/n,
451          * 1+2/n, ... 1+(n-1)/n. In that case, clearly the largest
452          * value of |e/2a| is given by the largest e (i.e. 1/2n) and
453          * the smallest a (i.e. 1); so |s| <= 1/4n. Hence, when we
454          * know how big we're prepared to let s be, we simply make
455          * sure 1/4n is at most that.
456          *
457          * And if we want s^2/3 to be at most 2^-12, then that means
458          * s^2 is at most 3*2^-12, so that s is at most sqrt(3)*2^-6
459          * = 0.02706. To get 1/4n smaller than that, we need to have
460          * n>=9.23; so we'll set n=16 (for ease of bit-twiddling),
461          * and then s is at most 1/64.
462          */
463         int n, i;
464         float a, ax, sh, sl, lsh, lsl;
465 
466         /*
467          * Let ax be x normalised to a single exponent range.
468          * However, the exponent range in question is not a simple
469          * one like [1,2). What we do is to round up the top four
470          * bits of the mantissa, so that the top 1/32 of each
471          * natural exponent range rounds up to the next one and is
472          * treated as a displacement from the lowest a in that
473          * range.
474          *
475          * So this piece of bit-twiddling gets us our input exponent
476          * and our subinterval index.
477          */
478         n = (ix + 0x00040000) >> 19;
479         i = n & 15;
480         n = ((n >> 4) & 0xFF) - 0x7F;
481         ax = fhex(ix - (n << 23));
482         n += expadjust;
483 
484         /*
485          * Compute the subinterval centre a.
486          */
487         a = 1.0f + i * (1.0f/16.0f);
488 
489         /*
490          * Compute s = (ax-a)/(ax+a), in precision-and-a-half.
491          */
492         {
493             float u, vh, vl, vapprox, rvapprox;
494 
495             u = ax - a;                /* exact numerator */
496             vapprox = ax + a;          /* approximate denominator */
497             vh = CLEARBOTTOMHALF(vapprox);
498             vl = (a - vh) + ax;        /* vh+vl is exact denominator */
499             rvapprox = 1.0f/vapprox;   /* approximate reciprocal of denominator */
500 
501             sh = CLEARBOTTOMHALF(u * rvapprox);
502             sl = ((u - sh*vh) - sh*vl) * rvapprox;
503         }
504 
505         /*
506          * Now compute log2(1+s) - log2(1-s). We do this in several
507          * steps.
508          *
509          * By polynomial approximation, we compute
510          *
511          *        log(1+s) - log(1-s)
512          *    p = ------------------- - 1
513          *                2s
514          *
515          * in single precision only, using a single-precision
516          * approximation to s. This polynomial has s^2 as its
517          * lowest-order term, so we expect the result to be in
518          * [0,2^-12).
519          *
520          * Then we form a prec-and-a-half number out of 1 and p,
521          * which is therefore equal to (log(1+s) - log(1-s))/(2s).
522          *
523          * Finally, we do two prec-and-a-half multiplications: one
524          * by s itself, and one by the constant 32/log(2).
525          */
526         {
527             float s = sh + sl;
528             float s2 = s*s;
529             /*
530              * p is actually a polynomial in s^2, with the first
531              * term constrained to zero. In other words, treated on
532              * its own terms, we're computing p(s^2) such that p(x)
533              * is an approximation to the sum of the series 1/3 +
534              * x/5 + x^2/7 + ..., valid on the range [0, 1/40^2].
535              */
536             float p = s2 * (0.33333332920177422f + s2 * 0.20008275183621479f);
537             float th, tl;
538 
539             PHMULTIPLY(th,tl, 1.0f,p, sh,sl);
540             PHMULTIPLY(lsh,lsl, th,tl, 0x1.716p+5F,-0x1.7135a8p-8F);
541         }
542 
543         /*
544          * And our final answer for 16*log2(x) is equal to 16n (from
545          * the exponent), plus lsh+lsl (the result of the above
546          * computation), plus 16*log2(a) which we must look up in a
547          * table.
548          */
549         {
550             struct f2 { float h, l; };
551             static const struct f2 table[16] = {
552                 /*
553                  * When constructing this table, we have to be sure
554                  * that we produce the same values of a which will
555                  * be produced by the computation above. Ideally, I
556                  * would tell Perl to actually do its _arithmetic_
557                  * in single precision here; but I don't know a way
558                  * to do that, so instead I just scrupulously
559                  * convert every intermediate value to and from SP.
560                  */
561                 // perl -e 'for ($i=0; $i<16; $i++) { $v = unpack "f", pack "f", 1/16.0; $a = unpack "f", pack "f", $i * $v; $a = unpack "f", pack "f", $a+1.0; $l = 16*log($a)/log(2); $top = unpack "f", pack "f", int($l*256.0+0.5)/256.0; $bot = unpack "f", pack "f", $l - $top; printf "{0f_%08X,0f_%08X}, ", unpack "VV", pack "ff", $top, $bot; } print "\n"' | fold -s -w56 | sed 's/^/                /'
562                 {0x0p+0F,0x0p+0F}, {0x1.66p+0F,0x1.fb7d64p-11F},
563                 {0x1.5cp+1F,0x1.a39fbep-15F}, {0x1.fcp+1F,-0x1.f4a37ep-10F},
564                 {0x1.49cp+2F,-0x1.87b432p-10F}, {0x1.91cp+2F,-0x1.15db84p-12F},
565                 {0x1.d68p+2F,-0x1.583f9ap-11F}, {0x1.0c2p+3F,-0x1.f5fe54p-10F},
566                 {0x1.2b8p+3F,0x1.a39fbep-16F}, {0x1.49ap+3F,0x1.e12f34p-11F},
567                 {0x1.66ap+3F,0x1.1c8f12p-18F}, {0x1.828p+3F,0x1.3ab7cep-14F},
568                 {0x1.9d6p+3F,-0x1.30158p-12F}, {0x1.b74p+3F,0x1.291eaap-10F},
569                 {0x1.d06p+3F,-0x1.8125b4p-10F}, {0x1.e88p+3F,0x1.8d66c4p-10F},
570             };
571             float lah = table[i].h, lal = table[i].l;
572             float fn = 16*n;
573             logh = CLEARBOTTOMHALF(lsl + lal + lsh + lah + fn);
574             logl = lsl - ((((logh - fn) - lah) - lsh) - lal);
575         }
576     }
577 
578     /*
579      * Now we have 16*log2(x), multiply it by y in prec-and-a-half.
580      */
581     {
582         float yh, yl;
583         int savedexcepts;
584 
585         yh = CLEARBOTTOMHALF(y);
586         yl = y - yh;
587 
588         /* This multiplication could become infinite, so to avoid IVO
589          * in PHMULTIPLY we bound the output at 4096, which is big
590          * enough to allow any non-overflowing case through
591          * unmodified. Also, we must mask out the OVF exception, which
592          * we won't want left in the FP status word in the case where
593          * rlogh becomes huge and _negative_ (since that will be an
594          * underflow from the perspective of powf's return value, not
595          * an overflow). */
596         savedexcepts = __ieee_status(0,0) & (FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW);
597         PHMULTIPLY_SATURATE(rlogh, rlogl, logh, logl, yh, yl, 4096.0f);
598         __ieee_status(FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW, savedexcepts);
599     }
600 
601     /*
602      * And raise 2 to the power of whatever that gave. Again, this
603      * is done in three parts: the fractional part of our input is
604      * fed through a polynomial approximation, all but the bottom
605      * four bits of the integer part go straight into the exponent,
606      * and the bottom four bits of the integer part index into a
607      * lookup table of powers of 2^(1/16) in prec-and-a-half.
608      */
609     {
610         float rlog = rlogh + rlogl;
611         int i16 = (rlog + (rlog < 0 ? -0.5f : +0.5f));
612         float rlogi = i16 >> 4;
613 
614         float x = rlogl + (rlogh - i16);
615 
616         static const float powersof2to1over16top[16] = { 0x1p+0F, 0x1.0b4p+0F, 0x1.172p+0F, 0x1.238p+0F, 0x1.306p+0F, 0x1.3dep+0F, 0x1.4bep+0F, 0x1.5aap+0F, 0x1.6ap+0F, 0x1.7ap+0F, 0x1.8acp+0F, 0x1.9c4p+0F, 0x1.ae8p+0F, 0x1.c18p+0F, 0x1.d58p+0F, 0x1.ea4p+0F };
617         static const float powersof2to1over16bot[16] = { 0x0p+0F, 0x1.586cfap-12F, 0x1.7078fap-13F, 0x1.e9b9d6p-14F, 0x1.fc1464p-13F, 0x1.4c9824p-13F, 0x1.dad536p-12F, 0x1.07dd48p-12F, 0x1.3cccfep-13F, 0x1.1473ecp-12F, 0x1.ca8456p-13F, 0x1.230548p-13F, 0x1.3f32b6p-13F, 0x1.9bdd86p-12F, 0x1.8dcfbap-16F, 0x1.5f454ap-13F };
618         static const float powersof2to1over16all[16] = { 0x1p+0F, 0x1.0b5586p+0F, 0x1.172b84p+0F, 0x1.2387a6p+0F, 0x1.306fep+0F, 0x1.3dea64p+0F, 0x1.4bfdaep+0F, 0x1.5ab07ep+0F, 0x1.6a09e6p+0F, 0x1.7a1148p+0F, 0x1.8ace54p+0F, 0x1.9c4918p+0F, 0x1.ae89fap+0F, 0x1.c199bep+0F, 0x1.d5818ep+0F, 0x1.ea4afap+0F };
619         /*
620          * Coefficients generated using the command
621 
622 ./auxiliary/remez.jl --suffix=f -- '-1/BigFloat(2)' '+1/BigFloat(2)' 2 0 'expm1(x*log(BigFloat(2))/16)/x'
623 
624          */
625         float p = x * (
626             4.332169876512769231967668743345473181486157887703125512683507537369503902991722e-02f+x*(9.384123108485637159805511308039285411735300871134684682779057580789341719567367e-04f+x*(1.355120515540562256928614563584948866224035897564701496826514330445829352922309e-05f))
627             );
628         int index = (i16 & 15);
629         p = powersof2to1over16top[index] + (powersof2to1over16bot[index] + powersof2to1over16all[index]*p);
630 
631         if (
632             fabsf(rlogi) < 126.0f
633             ) {
634             return sign * p * fhex((unsigned)((127.0f+rlogi) * 8388608.0f));
635         } else if (
636                    fabsf(rlogi) < 192.0f
637                    ) {
638             int i = rlogi;
639             float ret;
640 
641             ret = sign * p *
642                 fhex((unsigned)((0x7f+i/2) * 8388608)) *
643                 fhex((unsigned)((0x7f+i-i/2) * 8388608));
644 
645             if ((fai(ret) << 1) == 0xFF000000)
646                 return MATHERR_POWF_OFL(x, y, sign);
647             else if ((fai(ret) << 1) == 0)
648                 return MATHERR_POWF_UFL(x, y, sign);
649             else
650                 return FLOAT_CHECKDENORM(ret);
651         } else {
652             if (rlogi < 0)
653                 return MATHERR_POWF_UFL(x, y, sign);
654             else
655                 return MATHERR_POWF_OFL(x, y, sign);
656         }
657     }
658 }
659