• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $");
50 #endif /* LIBC_SCCS and not lint */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 /*
60  * Conversions between floats as stored in memory and floats as
61  * SoftFloat uses them
62  */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a) (a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a)   (a)
68 #endif
69 
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 #ifndef set_float_rounding_mode
77 fp_rnd float_rounding_mode = float_round_nearest_even;
78 fp_except float_exception_flags = 0;
79 #endif
80 #ifndef set_float_exception_inexact_flag
81 #define set_float_exception_inexact_flag() \
82     ((void)(float_exception_flags |= float_flag_inexact))
83 #endif
84 #ifdef FLOATX80
85 int8 floatx80_rounding_precision = 80;
86 #endif
87 
88 /*
89 -------------------------------------------------------------------------------
90 Primitive arithmetic functions, including multi-word arithmetic, and
91 division and square root approximations.  (Can be specialized to target if
92 desired.)
93 -------------------------------------------------------------------------------
94 */
95 #include "softfloat-macros"
96 
97 /*
98 -------------------------------------------------------------------------------
99 Functions and definitions to determine:  (1) whether tininess for underflow
100 is detected before or after rounding by default, (2) what (if anything)
101 happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 are propagated from function inputs to output.  These details are target-
104 specific.
105 -------------------------------------------------------------------------------
106 */
107 #include "softfloat-specialize"
108 
109 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
110 /*
111 -------------------------------------------------------------------------------
112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
113 and 7, and returns the properly rounded 32-bit integer corresponding to the
114 input.  If `zSign' is 1, the input is negated before being converted to an
115 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
116 is simply rounded to an integer, with the inexact exception raised if the
117 input cannot be represented exactly as an integer.  However, if the fixed-
118 point input is too large, the invalid exception is raised and the largest
119 positive or negative integer is returned.
120 -------------------------------------------------------------------------------
121 */
roundAndPackInt32(flag zSign,bits64 absZ)122 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
123 {
124     int8 roundingMode;
125     flag roundNearestEven;
126     int8 roundIncrement, roundBits;
127     int32 z;
128 
129     roundingMode = float_rounding_mode;
130     roundNearestEven = ( roundingMode == float_round_nearest_even );
131     roundIncrement = 0x40;
132     if ( ! roundNearestEven ) {
133         if ( roundingMode == float_round_to_zero ) {
134             roundIncrement = 0;
135         }
136         else {
137             roundIncrement = 0x7F;
138             if ( zSign ) {
139                 if ( roundingMode == float_round_up ) roundIncrement = 0;
140             }
141             else {
142                 if ( roundingMode == float_round_down ) roundIncrement = 0;
143             }
144         }
145     }
146     roundBits = (int8)(absZ & 0x7F);
147     absZ = ( absZ + roundIncrement )>>7;
148     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
149     z = (int32)absZ;
150     if ( zSign ) z = - z;
151     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
152         float_raise( float_flag_invalid );
153         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
154     }
155     if ( roundBits ) set_float_exception_inexact_flag();
156     return z;
157 
158 }
159 
160 /*
161 -------------------------------------------------------------------------------
162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
163 `absZ1', with binary point between bits 63 and 64 (between the input words),
164 and returns the properly rounded 64-bit integer corresponding to the input.
165 If `zSign' is 1, the input is negated before being converted to an integer.
166 Ordinarily, the fixed-point input is simply rounded to an integer, with
167 the inexact exception raised if the input cannot be represented exactly as
168 an integer.  However, if the fixed-point input is too large, the invalid
169 exception is raised and the largest positive or negative integer is
170 returned.
171 -------------------------------------------------------------------------------
172 */
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
174 {
175     int8 roundingMode;
176     flag roundNearestEven, increment;
177     int64 z;
178 
179     roundingMode = float_rounding_mode;
180     roundNearestEven = ( roundingMode == float_round_nearest_even );
181     increment = ( (sbits64) absZ1 < 0 );
182     if ( ! roundNearestEven ) {
183         if ( roundingMode == float_round_to_zero ) {
184             increment = 0;
185         }
186         else {
187             if ( zSign ) {
188                 increment = ( roundingMode == float_round_down ) && absZ1;
189             }
190             else {
191                 increment = ( roundingMode == float_round_up ) && absZ1;
192             }
193         }
194     }
195     if ( increment ) {
196         ++absZ0;
197         if ( absZ0 == 0 ) goto overflow;
198         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
199     }
200     z = absZ0;
201     if ( zSign ) z = - z;
202     if ( z && ( ( z < 0 ) ^ zSign ) ) {
203  overflow:
204         float_raise( float_flag_invalid );
205         return
206               zSign ? (sbits64) LIT64( 0x8000000000000000 )
207             : LIT64( 0x7FFFFFFFFFFFFFFF );
208     }
209     if ( absZ1 ) set_float_exception_inexact_flag();
210     return z;
211 
212 }
213 #endif
214 
215 /*
216 -------------------------------------------------------------------------------
217 Returns the fraction bits of the single-precision floating-point value `a'.
218 -------------------------------------------------------------------------------
219 */
extractFloat32Frac(float32 a)220 INLINE bits32 extractFloat32Frac( float32 a )
221 {
222 
223     return a & 0x007FFFFF;
224 
225 }
226 
227 /*
228 -------------------------------------------------------------------------------
229 Returns the exponent bits of the single-precision floating-point value `a'.
230 -------------------------------------------------------------------------------
231 */
extractFloat32Exp(float32 a)232 INLINE int16 extractFloat32Exp( float32 a )
233 {
234 
235     return ( a>>23 ) & 0xFF;
236 
237 }
238 
239 /*
240 -------------------------------------------------------------------------------
241 Returns the sign bit of the single-precision floating-point value `a'.
242 -------------------------------------------------------------------------------
243 */
extractFloat32Sign(float32 a)244 INLINE flag extractFloat32Sign( float32 a )
245 {
246 
247     return a>>31;
248 
249 }
250 
251 /*
252 -------------------------------------------------------------------------------
253 Normalizes the subnormal single-precision floating-point value represented
254 by the denormalized significand `aSig'.  The normalized exponent and
255 significand are stored at the locations pointed to by `zExpPtr' and
256 `zSigPtr', respectively.
257 -------------------------------------------------------------------------------
258 */
259 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)260  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
261 {
262     int8 shiftCount;
263 
264     shiftCount = countLeadingZeros32( aSig ) - 8;
265     *zSigPtr = aSig<<shiftCount;
266     *zExpPtr = 1 - shiftCount;
267 
268 }
269 
270 /*
271 -------------------------------------------------------------------------------
272 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
273 single-precision floating-point value, returning the result.  After being
274 shifted into the proper positions, the three fields are simply added
275 together to form the result.  This means that any integer portion of `zSig'
276 will be added into the exponent.  Since a properly normalized significand
277 will have an integer portion equal to 1, the `zExp' input should be 1 less
278 than the desired result exponent whenever `zSig' is a complete, normalized
279 significand.
280 -------------------------------------------------------------------------------
281 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)282 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
283 {
284 
285     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
286 
287 }
288 
289 /*
290 -------------------------------------------------------------------------------
291 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
292 and significand `zSig', and returns the proper single-precision floating-
293 point value corresponding to the abstract input.  Ordinarily, the abstract
294 value is simply rounded and packed into the single-precision format, with
295 the inexact exception raised if the abstract input cannot be represented
296 exactly.  However, if the abstract value is too large, the overflow and
297 inexact exceptions are raised and an infinity or maximal finite value is
298 returned.  If the abstract value is too small, the input value is rounded to
299 a subnormal number, and the underflow and inexact exceptions are raised if
300 the abstract input cannot be represented exactly as a subnormal single-
301 precision floating-point number.
302     The input significand `zSig' has its binary point between bits 30
303 and 29, which is 7 bits to the left of the usual location.  This shifted
304 significand must be normalized or smaller.  If `zSig' is not normalized,
305 `zExp' must be 0; in that case, the result returned is a subnormal number,
306 and it must not require rounding.  In the usual case that `zSig' is
307 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
308 The handling of underflow and overflow follows the IEC/IEEE Standard for
309 Binary Floating-Point Arithmetic.
310 -------------------------------------------------------------------------------
311 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)312 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
313 {
314     int8 roundingMode;
315     flag roundNearestEven;
316     int8 roundIncrement, roundBits;
317     flag isTiny;
318 
319     roundingMode = float_rounding_mode;
320     roundNearestEven = ( roundingMode == float_round_nearest_even );
321     roundIncrement = 0x40;
322     if ( ! roundNearestEven ) {
323         if ( roundingMode == float_round_to_zero ) {
324             roundIncrement = 0;
325         }
326         else {
327             roundIncrement = 0x7F;
328             if ( zSign ) {
329                 if ( roundingMode == float_round_up ) roundIncrement = 0;
330             }
331             else {
332                 if ( roundingMode == float_round_down ) roundIncrement = 0;
333             }
334         }
335     }
336     roundBits = zSig & 0x7F;
337     if ( 0xFD <= (bits16) zExp ) {
338         if (    ( 0xFD < zExp )
339              || (    ( zExp == 0xFD )
340                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
341            ) {
342             float_raise( float_flag_overflow | float_flag_inexact );
343             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
344         }
345         if ( zExp < 0 ) {
346             isTiny =
347                    ( float_detect_tininess == float_tininess_before_rounding )
348                 || ( zExp < -1 )
349                 || ( zSig + roundIncrement < 0x80000000U );
350             shift32RightJamming( zSig, - zExp, &zSig );
351             zExp = 0;
352             roundBits = zSig & 0x7F;
353             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
354         }
355     }
356     if ( roundBits ) set_float_exception_inexact_flag();
357     zSig = ( zSig + roundIncrement )>>7;
358     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
359     if ( zSig == 0 ) zExp = 0;
360     return packFloat32( zSign, zExp, zSig );
361 
362 }
363 
364 /*
365 -------------------------------------------------------------------------------
366 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
367 and significand `zSig', and returns the proper single-precision floating-
368 point value corresponding to the abstract input.  This routine is just like
369 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
370 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
371 floating-point exponent.
372 -------------------------------------------------------------------------------
373 */
374 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)375  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
376 {
377     int8 shiftCount;
378 
379     shiftCount = countLeadingZeros32( zSig ) - 1;
380     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
381 
382 }
383 
384 /*
385 -------------------------------------------------------------------------------
386 Returns the fraction bits of the double-precision floating-point value `a'.
387 -------------------------------------------------------------------------------
388 */
extractFloat64Frac(float64 a)389 INLINE bits64 extractFloat64Frac( float64 a )
390 {
391 
392     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
393 
394 }
395 
396 /*
397 -------------------------------------------------------------------------------
398 Returns the exponent bits of the double-precision floating-point value `a'.
399 -------------------------------------------------------------------------------
400 */
extractFloat64Exp(float64 a)401 INLINE int16 extractFloat64Exp( float64 a )
402 {
403 
404     return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
405 
406 }
407 
408 /*
409 -------------------------------------------------------------------------------
410 Returns the sign bit of the double-precision floating-point value `a'.
411 -------------------------------------------------------------------------------
412 */
extractFloat64Sign(float64 a)413 INLINE flag extractFloat64Sign( float64 a )
414 {
415 
416     return (flag)(FLOAT64_DEMANGLE(a) >> 63);
417 
418 }
419 
420 /*
421 -------------------------------------------------------------------------------
422 Normalizes the subnormal double-precision floating-point value represented
423 by the denormalized significand `aSig'.  The normalized exponent and
424 significand are stored at the locations pointed to by `zExpPtr' and
425 `zSigPtr', respectively.
426 -------------------------------------------------------------------------------
427 */
428 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)429  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
430 {
431     int8 shiftCount;
432 
433     shiftCount = countLeadingZeros64( aSig ) - 11;
434     *zSigPtr = aSig<<shiftCount;
435     *zExpPtr = 1 - shiftCount;
436 
437 }
438 
439 /*
440 -------------------------------------------------------------------------------
441 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
442 double-precision floating-point value, returning the result.  After being
443 shifted into the proper positions, the three fields are simply added
444 together to form the result.  This means that any integer portion of `zSig'
445 will be added into the exponent.  Since a properly normalized significand
446 will have an integer portion equal to 1, the `zExp' input should be 1 less
447 than the desired result exponent whenever `zSig' is a complete, normalized
448 significand.
449 -------------------------------------------------------------------------------
450 */
packFloat64(flag zSign,int16 zExp,bits64 zSig)451 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
452 {
453 
454     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
455                            ( ( (bits64) zExp )<<52 ) + zSig );
456 
457 }
458 
459 /*
460 -------------------------------------------------------------------------------
461 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
462 and significand `zSig', and returns the proper double-precision floating-
463 point value corresponding to the abstract input.  Ordinarily, the abstract
464 value is simply rounded and packed into the double-precision format, with
465 the inexact exception raised if the abstract input cannot be represented
466 exactly.  However, if the abstract value is too large, the overflow and
467 inexact exceptions are raised and an infinity or maximal finite value is
468 returned.  If the abstract value is too small, the input value is rounded to
469 a subnormal number, and the underflow and inexact exceptions are raised if
470 the abstract input cannot be represented exactly as a subnormal double-
471 precision floating-point number.
472     The input significand `zSig' has its binary point between bits 62
473 and 61, which is 10 bits to the left of the usual location.  This shifted
474 significand must be normalized or smaller.  If `zSig' is not normalized,
475 `zExp' must be 0; in that case, the result returned is a subnormal number,
476 and it must not require rounding.  In the usual case that `zSig' is
477 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
478 The handling of underflow and overflow follows the IEC/IEEE Standard for
479 Binary Floating-Point Arithmetic.
480 -------------------------------------------------------------------------------
481 */
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)482 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
483 {
484     int8 roundingMode;
485     flag roundNearestEven;
486     int16 roundIncrement, roundBits;
487     flag isTiny;
488 
489     roundingMode = float_rounding_mode;
490     roundNearestEven = ( roundingMode == float_round_nearest_even );
491     roundIncrement = 0x200;
492     if ( ! roundNearestEven ) {
493         if ( roundingMode == float_round_to_zero ) {
494             roundIncrement = 0;
495         }
496         else {
497             roundIncrement = 0x3FF;
498             if ( zSign ) {
499                 if ( roundingMode == float_round_up ) roundIncrement = 0;
500             }
501             else {
502                 if ( roundingMode == float_round_down ) roundIncrement = 0;
503             }
504         }
505     }
506     roundBits = (int16)(zSig & 0x3FF);
507     if ( 0x7FD <= (bits16) zExp ) {
508         if (    ( 0x7FD < zExp )
509              || (    ( zExp == 0x7FD )
510                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
511            ) {
512             float_raise( float_flag_overflow | float_flag_inexact );
513             return FLOAT64_MANGLE(
514                        FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
515                        ( roundIncrement == 0 ));
516         }
517         if ( zExp < 0 ) {
518             isTiny =
519                    ( float_detect_tininess == float_tininess_before_rounding )
520                 || ( zExp < -1 )
521                 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
522             shift64RightJamming( zSig, - zExp, &zSig );
523             zExp = 0;
524             roundBits = (int16)(zSig & 0x3FF);
525             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
526         }
527     }
528     if ( roundBits ) set_float_exception_inexact_flag();
529     zSig = ( zSig + roundIncrement )>>10;
530     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
531     if ( zSig == 0 ) zExp = 0;
532     return packFloat64( zSign, zExp, zSig );
533 
534 }
535 
536 /*
537 -------------------------------------------------------------------------------
538 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 and significand `zSig', and returns the proper double-precision floating-
540 point value corresponding to the abstract input.  This routine is just like
541 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 floating-point exponent.
544 -------------------------------------------------------------------------------
545 */
546 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)547  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
548 {
549     int8 shiftCount;
550 
551     shiftCount = countLeadingZeros64( zSig ) - 1;
552     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
553 
554 }
555 
556 #ifdef FLOATX80
557 
558 /*
559 -------------------------------------------------------------------------------
560 Returns the fraction bits of the extended double-precision floating-point
561 value `a'.
562 -------------------------------------------------------------------------------
563 */
extractFloatx80Frac(floatx80 a)564 INLINE bits64 extractFloatx80Frac( floatx80 a )
565 {
566 
567     return a.low;
568 
569 }
570 
571 /*
572 -------------------------------------------------------------------------------
573 Returns the exponent bits of the extended double-precision floating-point
574 value `a'.
575 -------------------------------------------------------------------------------
576 */
extractFloatx80Exp(floatx80 a)577 INLINE int32 extractFloatx80Exp( floatx80 a )
578 {
579 
580     return a.high & 0x7FFF;
581 
582 }
583 
584 /*
585 -------------------------------------------------------------------------------
586 Returns the sign bit of the extended double-precision floating-point value
587 `a'.
588 -------------------------------------------------------------------------------
589 */
extractFloatx80Sign(floatx80 a)590 INLINE flag extractFloatx80Sign( floatx80 a )
591 {
592 
593     return a.high>>15;
594 
595 }
596 
597 /*
598 -------------------------------------------------------------------------------
599 Normalizes the subnormal extended double-precision floating-point value
600 represented by the denormalized significand `aSig'.  The normalized exponent
601 and significand are stored at the locations pointed to by `zExpPtr' and
602 `zSigPtr', respectively.
603 -------------------------------------------------------------------------------
604 */
605 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)606  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
607 {
608     int8 shiftCount;
609 
610     shiftCount = countLeadingZeros64( aSig );
611     *zSigPtr = aSig<<shiftCount;
612     *zExpPtr = 1 - shiftCount;
613 
614 }
615 
616 /*
617 -------------------------------------------------------------------------------
618 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 extended double-precision floating-point value, returning the result.
620 -------------------------------------------------------------------------------
621 */
packFloatx80(flag zSign,int32 zExp,bits64 zSig)622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
623 {
624     floatx80 z;
625 
626     z.low = zSig;
627     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
628     return z;
629 
630 }
631 
632 /*
633 -------------------------------------------------------------------------------
634 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
635 and extended significand formed by the concatenation of `zSig0' and `zSig1',
636 and returns the proper extended double-precision floating-point value
637 corresponding to the abstract input.  Ordinarily, the abstract value is
638 rounded and packed into the extended double-precision format, with the
639 inexact exception raised if the abstract input cannot be represented
640 exactly.  However, if the abstract value is too large, the overflow and
641 inexact exceptions are raised and an infinity or maximal finite value is
642 returned.  If the abstract value is too small, the input value is rounded to
643 a subnormal number, and the underflow and inexact exceptions are raised if
644 the abstract input cannot be represented exactly as a subnormal extended
645 double-precision floating-point number.
646     If `roundingPrecision' is 32 or 64, the result is rounded to the same
647 number of bits as single or double precision, respectively.  Otherwise, the
648 result is rounded to the full precision of the extended double-precision
649 format.
650     The input significand must be normalized or smaller.  If the input
651 significand is not normalized, `zExp' must be 0; in that case, the result
652 returned is a subnormal number, and it must not require rounding.  The
653 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
654 Floating-Point Arithmetic.
655 -------------------------------------------------------------------------------
656 */
657 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)658  roundAndPackFloatx80(
659      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
660  )
661 {
662     int8 roundingMode;
663     flag roundNearestEven, increment, isTiny;
664     int64 roundIncrement, roundMask, roundBits;
665 
666     roundingMode = float_rounding_mode;
667     roundNearestEven = ( roundingMode == float_round_nearest_even );
668     if ( roundingPrecision == 80 ) goto precision80;
669     if ( roundingPrecision == 64 ) {
670         roundIncrement = LIT64( 0x0000000000000400 );
671         roundMask = LIT64( 0x00000000000007FF );
672     }
673     else if ( roundingPrecision == 32 ) {
674         roundIncrement = LIT64( 0x0000008000000000 );
675         roundMask = LIT64( 0x000000FFFFFFFFFF );
676     }
677     else {
678         goto precision80;
679     }
680     zSig0 |= ( zSig1 != 0 );
681     if ( ! roundNearestEven ) {
682         if ( roundingMode == float_round_to_zero ) {
683             roundIncrement = 0;
684         }
685         else {
686             roundIncrement = roundMask;
687             if ( zSign ) {
688                 if ( roundingMode == float_round_up ) roundIncrement = 0;
689             }
690             else {
691                 if ( roundingMode == float_round_down ) roundIncrement = 0;
692             }
693         }
694     }
695     roundBits = zSig0 & roundMask;
696     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
697         if (    ( 0x7FFE < zExp )
698              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
699            ) {
700             goto overflow;
701         }
702         if ( zExp <= 0 ) {
703             isTiny =
704                    ( float_detect_tininess == float_tininess_before_rounding )
705                 || ( zExp < 0 )
706                 || ( zSig0 <= zSig0 + roundIncrement );
707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708             zExp = 0;
709             roundBits = zSig0 & roundMask;
710             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
711             if ( roundBits ) set_float_exception_inexact_flag();
712             zSig0 += roundIncrement;
713             if ( (sbits64) zSig0 < 0 ) zExp = 1;
714             roundIncrement = roundMask + 1;
715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716                 roundMask |= roundIncrement;
717             }
718             zSig0 &= ~ roundMask;
719             return packFloatx80( zSign, zExp, zSig0 );
720         }
721     }
722     if ( roundBits ) set_float_exception_inexact_flag();
723     zSig0 += roundIncrement;
724     if ( zSig0 < roundIncrement ) {
725         ++zExp;
726         zSig0 = LIT64( 0x8000000000000000 );
727     }
728     roundIncrement = roundMask + 1;
729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730         roundMask |= roundIncrement;
731     }
732     zSig0 &= ~ roundMask;
733     if ( zSig0 == 0 ) zExp = 0;
734     return packFloatx80( zSign, zExp, zSig0 );
735  precision80:
736     increment = ( (sbits64) zSig1 < 0 );
737     if ( ! roundNearestEven ) {
738         if ( roundingMode == float_round_to_zero ) {
739             increment = 0;
740         }
741         else {
742             if ( zSign ) {
743                 increment = ( roundingMode == float_round_down ) && zSig1;
744             }
745             else {
746                 increment = ( roundingMode == float_round_up ) && zSig1;
747             }
748         }
749     }
750     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
751         if (    ( 0x7FFE < zExp )
752              || (    ( zExp == 0x7FFE )
753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754                   && increment
755                 )
756            ) {
757             roundMask = 0;
758  overflow:
759             float_raise( float_flag_overflow | float_flag_inexact );
760             if (    ( roundingMode == float_round_to_zero )
761                  || ( zSign && ( roundingMode == float_round_up ) )
762                  || ( ! zSign && ( roundingMode == float_round_down ) )
763                ) {
764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765             }
766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767         }
768         if ( zExp <= 0 ) {
769             isTiny =
770                    ( float_detect_tininess == float_tininess_before_rounding )
771                 || ( zExp < 0 )
772                 || ! increment
773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775             zExp = 0;
776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
777             if ( zSig1 ) set_float_exception_inexact_flag();
778             if ( roundNearestEven ) {
779                 increment = ( (sbits64) zSig1 < 0 );
780             }
781             else {
782                 if ( zSign ) {
783                     increment = ( roundingMode == float_round_down ) && zSig1;
784                 }
785                 else {
786                     increment = ( roundingMode == float_round_up ) && zSig1;
787                 }
788             }
789             if ( increment ) {
790                 ++zSig0;
791                 zSig0 &=
792                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
794             }
795             return packFloatx80( zSign, zExp, zSig0 );
796         }
797     }
798     if ( zSig1 ) set_float_exception_inexact_flag();
799     if ( increment ) {
800         ++zSig0;
801         if ( zSig0 == 0 ) {
802             ++zExp;
803             zSig0 = LIT64( 0x8000000000000000 );
804         }
805         else {
806             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
807         }
808     }
809     else {
810         if ( zSig0 == 0 ) zExp = 0;
811     }
812     return packFloatx80( zSign, zExp, zSig0 );
813 
814 }
815 
816 /*
817 -------------------------------------------------------------------------------
818 Takes an abstract floating-point value having sign `zSign', exponent
819 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
820 and returns the proper extended double-precision floating-point value
821 corresponding to the abstract input.  This routine is just like
822 `roundAndPackFloatx80' except that the input significand does not have to be
823 normalized.
824 -------------------------------------------------------------------------------
825 */
826 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)827  normalizeRoundAndPackFloatx80(
828      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
829  )
830 {
831     int8 shiftCount;
832 
833     if ( zSig0 == 0 ) {
834         zSig0 = zSig1;
835         zSig1 = 0;
836         zExp -= 64;
837     }
838     shiftCount = countLeadingZeros64( zSig0 );
839     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
840     zExp -= shiftCount;
841     return
842         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
843 
844 }
845 
846 #endif
847 
848 #ifdef FLOAT128
849 
850 /*
851 -------------------------------------------------------------------------------
852 Returns the least-significant 64 fraction bits of the quadruple-precision
853 floating-point value `a'.
854 -------------------------------------------------------------------------------
855 */
extractFloat128Frac1(float128 a)856 INLINE bits64 extractFloat128Frac1( float128 a )
857 {
858 
859     return a.low;
860 
861 }
862 
863 /*
864 -------------------------------------------------------------------------------
865 Returns the most-significant 48 fraction bits of the quadruple-precision
866 floating-point value `a'.
867 -------------------------------------------------------------------------------
868 */
extractFloat128Frac0(float128 a)869 INLINE bits64 extractFloat128Frac0( float128 a )
870 {
871 
872     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
873 
874 }
875 
876 /*
877 -------------------------------------------------------------------------------
878 Returns the exponent bits of the quadruple-precision floating-point value
879 `a'.
880 -------------------------------------------------------------------------------
881 */
extractFloat128Exp(float128 a)882 INLINE int32 extractFloat128Exp( float128 a )
883 {
884 
885     return (int32)((a.high >> 48) & 0x7FFF);
886 
887 }
888 
889 /*
890 -------------------------------------------------------------------------------
891 Returns the sign bit of the quadruple-precision floating-point value `a'.
892 -------------------------------------------------------------------------------
893 */
extractFloat128Sign(float128 a)894 INLINE flag extractFloat128Sign( float128 a )
895 {
896 
897     return (flag)(a.high >> 63);
898 
899 }
900 
901 /*
902 -------------------------------------------------------------------------------
903 Normalizes the subnormal quadruple-precision floating-point value
904 represented by the denormalized significand formed by the concatenation of
905 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
906 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
907 significand are stored at the location pointed to by `zSig0Ptr', and the
908 least significant 64 bits of the normalized significand are stored at the
909 location pointed to by `zSig1Ptr'.
910 -------------------------------------------------------------------------------
911 */
912 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)913  normalizeFloat128Subnormal(
914      bits64 aSig0,
915      bits64 aSig1,
916      int32 *zExpPtr,
917      bits64 *zSig0Ptr,
918      bits64 *zSig1Ptr
919  )
920 {
921     int8 shiftCount;
922 
923     if ( aSig0 == 0 ) {
924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
925         if ( shiftCount < 0 ) {
926             *zSig0Ptr = aSig1>>( - shiftCount );
927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
928         }
929         else {
930             *zSig0Ptr = aSig1<<shiftCount;
931             *zSig1Ptr = 0;
932         }
933         *zExpPtr = - shiftCount - 63;
934     }
935     else {
936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938         *zExpPtr = 1 - shiftCount;
939     }
940 
941 }
942 
943 /*
944 -------------------------------------------------------------------------------
945 Packs the sign `zSign', the exponent `zExp', and the significand formed
946 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
947 floating-point value, returning the result.  After being shifted into the
948 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
949 added together to form the most significant 32 bits of the result.  This
950 means that any integer portion of `zSig0' will be added into the exponent.
951 Since a properly normalized significand will have an integer portion equal
952 to 1, the `zExp' input should be 1 less than the desired result exponent
953 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
954 significand.
955 -------------------------------------------------------------------------------
956 */
957 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)958  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
959 {
960     float128 z;
961 
962     z.low = zSig1;
963     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
964     return z;
965 
966 }
967 
968 /*
969 -------------------------------------------------------------------------------
970 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
971 and extended significand formed by the concatenation of `zSig0', `zSig1',
972 and `zSig2', and returns the proper quadruple-precision floating-point value
973 corresponding to the abstract input.  Ordinarily, the abstract value is
974 simply rounded and packed into the quadruple-precision format, with the
975 inexact exception raised if the abstract input cannot be represented
976 exactly.  However, if the abstract value is too large, the overflow and
977 inexact exceptions are raised and an infinity or maximal finite value is
978 returned.  If the abstract value is too small, the input value is rounded to
979 a subnormal number, and the underflow and inexact exceptions are raised if
980 the abstract input cannot be represented exactly as a subnormal quadruple-
981 precision floating-point number.
982     The input significand must be normalized or smaller.  If the input
983 significand is not normalized, `zExp' must be 0; in that case, the result
984 returned is a subnormal number, and it must not require rounding.  In the
985 usual case that the input significand is normalized, `zExp' must be 1 less
986 than the ``true'' floating-point exponent.  The handling of underflow and
987 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
988 -------------------------------------------------------------------------------
989 */
990 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)991  roundAndPackFloat128(
992      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
993 {
994     int8 roundingMode;
995     flag roundNearestEven, increment, isTiny;
996 
997     roundingMode = float_rounding_mode;
998     roundNearestEven = ( roundingMode == float_round_nearest_even );
999     increment = ( (sbits64) zSig2 < 0 );
1000     if ( ! roundNearestEven ) {
1001         if ( roundingMode == float_round_to_zero ) {
1002             increment = 0;
1003         }
1004         else {
1005             if ( zSign ) {
1006                 increment = ( roundingMode == float_round_down ) && zSig2;
1007             }
1008             else {
1009                 increment = ( roundingMode == float_round_up ) && zSig2;
1010             }
1011         }
1012     }
1013     if ( 0x7FFD <= (bits32) zExp ) {
1014         if (    ( 0x7FFD < zExp )
1015              || (    ( zExp == 0x7FFD )
1016                   && eq128(
1017                          LIT64( 0x0001FFFFFFFFFFFF ),
1018                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1019                          zSig0,
1020                          zSig1
1021                      )
1022                   && increment
1023                 )
1024            ) {
1025             float_raise( float_flag_overflow | float_flag_inexact );
1026             if (    ( roundingMode == float_round_to_zero )
1027                  || ( zSign && ( roundingMode == float_round_up ) )
1028                  || ( ! zSign && ( roundingMode == float_round_down ) )
1029                ) {
1030                 return
1031                     packFloat128(
1032                         zSign,
1033                         0x7FFE,
1034                         LIT64( 0x0000FFFFFFFFFFFF ),
1035                         LIT64( 0xFFFFFFFFFFFFFFFF )
1036                     );
1037             }
1038             return packFloat128( zSign, 0x7FFF, 0, 0 );
1039         }
1040         if ( zExp < 0 ) {
1041             isTiny =
1042                    ( float_detect_tininess == float_tininess_before_rounding )
1043                 || ( zExp < -1 )
1044                 || ! increment
1045                 || lt128(
1046                        zSig0,
1047                        zSig1,
1048                        LIT64( 0x0001FFFFFFFFFFFF ),
1049                        LIT64( 0xFFFFFFFFFFFFFFFF )
1050                    );
1051             shift128ExtraRightJamming(
1052                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1053             zExp = 0;
1054             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1055             if ( roundNearestEven ) {
1056                 increment = ( (sbits64) zSig2 < 0 );
1057             }
1058             else {
1059                 if ( zSign ) {
1060                     increment = ( roundingMode == float_round_down ) && zSig2;
1061                 }
1062                 else {
1063                     increment = ( roundingMode == float_round_up ) && zSig2;
1064                 }
1065             }
1066         }
1067     }
1068     if ( zSig2 ) set_float_exception_inexact_flag();
1069     if ( increment ) {
1070         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1071         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1072     }
1073     else {
1074         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1075     }
1076     return packFloat128( zSign, zExp, zSig0, zSig1 );
1077 
1078 }
1079 
1080 /*
1081 -------------------------------------------------------------------------------
1082 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1083 and significand formed by the concatenation of `zSig0' and `zSig1', and
1084 returns the proper quadruple-precision floating-point value corresponding
1085 to the abstract input.  This routine is just like `roundAndPackFloat128'
1086 except that the input significand has fewer bits and does not have to be
1087 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1088 point exponent.
1089 -------------------------------------------------------------------------------
1090 */
1091 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1092  normalizeRoundAndPackFloat128(
1093      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1094 {
1095     int8 shiftCount;
1096     bits64 zSig2;
1097 
1098     if ( zSig0 == 0 ) {
1099         zSig0 = zSig1;
1100         zSig1 = 0;
1101         zExp -= 64;
1102     }
1103     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1104     if ( 0 <= shiftCount ) {
1105         zSig2 = 0;
1106         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1107     }
1108     else {
1109         shift128ExtraRightJamming(
1110             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1111     }
1112     zExp -= shiftCount;
1113     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1114 
1115 }
1116 
1117 #endif
1118 
1119 /*
1120 -------------------------------------------------------------------------------
1121 Returns the result of converting the 32-bit two's complement integer `a'
1122 to the single-precision floating-point format.  The conversion is performed
1123 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1124 -------------------------------------------------------------------------------
1125 */
int32_to_float32(int32 a)1126 float32 int32_to_float32( int32 a )
1127 {
1128     flag zSign;
1129 
1130     if ( a == 0 ) return 0;
1131     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1132     zSign = ( a < 0 );
1133     return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1134 
1135 }
1136 
uint32_to_float32(uint32 a)1137 float32 uint32_to_float32( uint32 a )
1138 {
1139     if ( a == 0 ) return 0;
1140     if ( a & (bits32) 0x80000000 )
1141         return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1142     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1143 }
1144 
1145 
1146 /*
1147 -------------------------------------------------------------------------------
1148 Returns the result of converting the 32-bit two's complement integer `a'
1149 to the double-precision floating-point format.  The conversion is performed
1150 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 -------------------------------------------------------------------------------
1152 */
int32_to_float64(int32 a)1153 float64 int32_to_float64( int32 a )
1154 {
1155     flag zSign;
1156     uint32 absA;
1157     int8 shiftCount;
1158     bits64 zSig;
1159 
1160     if ( a == 0 ) return 0;
1161     zSign = ( a < 0 );
1162     absA = zSign ? - a : a;
1163     shiftCount = countLeadingZeros32( absA ) + 21;
1164     zSig = absA;
1165     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1166 
1167 }
1168 
uint32_to_float64(uint32 a)1169 float64 uint32_to_float64( uint32 a )
1170 {
1171     int8 shiftCount;
1172     bits64 zSig = a;
1173 
1174     if ( a == 0 ) return 0;
1175     shiftCount = countLeadingZeros32( a ) + 21;
1176     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1177 
1178 }
1179 
1180 #ifdef FLOATX80
1181 
1182 /*
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a'
1185 to the extended double-precision floating-point format.  The conversion
1186 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 Arithmetic.
1188 -------------------------------------------------------------------------------
1189 */
int32_to_floatx80(int32 a)1190 floatx80 int32_to_floatx80( int32 a )
1191 {
1192     flag zSign;
1193     uint32 absA;
1194     int8 shiftCount;
1195     bits64 zSig;
1196 
1197     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1198     zSign = ( a < 0 );
1199     absA = zSign ? - a : a;
1200     shiftCount = countLeadingZeros32( absA ) + 32;
1201     zSig = absA;
1202     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1203 
1204 }
1205 
uint32_to_floatx80(uint32 a)1206 floatx80 uint32_to_floatx80( uint32 a )
1207 {
1208     int8 shiftCount;
1209     bits64 zSig = a;
1210 
1211     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1212     shiftCount = countLeadingZeros32( a ) + 32;
1213     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1214 
1215 }
1216 
1217 #endif
1218 
1219 #ifdef FLOAT128
1220 
1221 /*
1222 -------------------------------------------------------------------------------
1223 Returns the result of converting the 32-bit two's complement integer `a' to
1224 the quadruple-precision floating-point format.  The conversion is performed
1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1226 -------------------------------------------------------------------------------
1227 */
int32_to_float128(int32 a)1228 float128 int32_to_float128( int32 a )
1229 {
1230     flag zSign;
1231     uint32 absA;
1232     int8 shiftCount;
1233     bits64 zSig0;
1234 
1235     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1236     zSign = ( a < 0 );
1237     absA = zSign ? - a : a;
1238     shiftCount = countLeadingZeros32( absA ) + 17;
1239     zSig0 = absA;
1240     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1241 
1242 }
1243 
uint32_to_float128(uint32 a)1244 float128 uint32_to_float128( uint32 a )
1245 {
1246     int8 shiftCount;
1247     bits64 zSig0 = a;
1248 
1249     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1250     shiftCount = countLeadingZeros32( a ) + 17;
1251     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1252 
1253 }
1254 
1255 #endif
1256 
1257 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1258 /*
1259 -------------------------------------------------------------------------------
1260 Returns the result of converting the 64-bit two's complement integer `a'
1261 to the single-precision floating-point format.  The conversion is performed
1262 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1263 -------------------------------------------------------------------------------
1264 */
int64_to_float32(int64 a)1265 float32 int64_to_float32( int64 a )
1266 {
1267     flag zSign;
1268     uint64 absA;
1269     int8 shiftCount;
1270 
1271     if ( a == 0 ) return 0;
1272     zSign = ( a < 0 );
1273     absA = zSign ? - a : a;
1274     shiftCount = countLeadingZeros64( absA ) - 40;
1275     if ( 0 <= shiftCount ) {
1276         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1277     }
1278     else {
1279         shiftCount += 7;
1280         if ( shiftCount < 0 ) {
1281             shift64RightJamming( absA, - shiftCount, &absA );
1282         }
1283         else {
1284             absA <<= shiftCount;
1285         }
1286         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1287     }
1288 
1289 }
1290 
1291 /*
1292 -------------------------------------------------------------------------------
1293 Returns the result of converting the 64-bit two's complement integer `a'
1294 to the double-precision floating-point format.  The conversion is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1297 */
int64_to_float64(int64 a)1298 float64 int64_to_float64( int64 a )
1299 {
1300     flag zSign;
1301 
1302     if ( a == 0 ) return 0;
1303     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1304         return packFloat64( 1, 0x43E, 0 );
1305     }
1306     zSign = ( a < 0 );
1307     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1308 
1309 }
1310 
1311 #ifdef FLOATX80
1312 
1313 /*
1314 -------------------------------------------------------------------------------
1315 Returns the result of converting the 64-bit two's complement integer `a'
1316 to the extended double-precision floating-point format.  The conversion
1317 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 Arithmetic.
1319 -------------------------------------------------------------------------------
1320 */
int64_to_floatx80(int64 a)1321 floatx80 int64_to_floatx80( int64 a )
1322 {
1323     flag zSign;
1324     uint64 absA;
1325     int8 shiftCount;
1326 
1327     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1328     zSign = ( a < 0 );
1329     absA = zSign ? - a : a;
1330     shiftCount = countLeadingZeros64( absA );
1331     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1332 
1333 }
1334 
1335 #endif
1336 
1337 #endif /* !SOFTFLOAT_FOR_GCC */
1338 
1339 #ifdef FLOAT128
1340 
1341 /*
1342 -------------------------------------------------------------------------------
1343 Returns the result of converting the 64-bit two's complement integer `a' to
1344 the quadruple-precision floating-point format.  The conversion is performed
1345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1346 -------------------------------------------------------------------------------
1347 */
int64_to_float128(int64 a)1348 float128 int64_to_float128( int64 a )
1349 {
1350     flag zSign;
1351     uint64 absA;
1352     int8 shiftCount;
1353     int32 zExp;
1354     bits64 zSig0, zSig1;
1355 
1356     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1357     zSign = ( a < 0 );
1358     absA = zSign ? - a : a;
1359     shiftCount = countLeadingZeros64( absA ) + 49;
1360     zExp = 0x406E - shiftCount;
1361     if ( 64 <= shiftCount ) {
1362         zSig1 = 0;
1363         zSig0 = absA;
1364         shiftCount -= 64;
1365     }
1366     else {
1367         zSig1 = absA;
1368         zSig0 = 0;
1369     }
1370     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1371     return packFloat128( zSign, zExp, zSig0, zSig1 );
1372 
1373 }
1374 
1375 #endif
1376 
1377 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1378 /*
1379 -------------------------------------------------------------------------------
1380 Returns the result of converting the single-precision floating-point value
1381 `a' to the 32-bit two's complement integer format.  The conversion is
1382 performed according to the IEC/IEEE Standard for Binary Floating-Point
1383 Arithmetic---which means in particular that the conversion is rounded
1384 according to the current rounding mode.  If `a' is a NaN, the largest
1385 positive integer is returned.  Otherwise, if the conversion overflows, the
1386 largest integer with the same sign as `a' is returned.
1387 -------------------------------------------------------------------------------
1388 */
float32_to_int32(float32 a)1389 int32 float32_to_int32( float32 a )
1390 {
1391     flag aSign;
1392     int16 aExp, shiftCount;
1393     bits32 aSig;
1394     bits64 aSig64;
1395 
1396     aSig = extractFloat32Frac( a );
1397     aExp = extractFloat32Exp( a );
1398     aSign = extractFloat32Sign( a );
1399     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1400     if ( aExp ) aSig |= 0x00800000;
1401     shiftCount = 0xAF - aExp;
1402     aSig64 = aSig;
1403     aSig64 <<= 32;
1404     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1405     return roundAndPackInt32( aSign, aSig64 );
1406 
1407 }
1408 #endif /* !SOFTFLOAT_FOR_GCC */
1409 
1410 /*
1411 -------------------------------------------------------------------------------
1412 Returns the result of converting the single-precision floating-point value
1413 `a' to the 32-bit two's complement integer format.  The conversion is
1414 performed according to the IEC/IEEE Standard for Binary Floating-Point
1415 Arithmetic, except that the conversion is always rounded toward zero.
1416 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1417 the conversion overflows, the largest integer with the same sign as `a' is
1418 returned.
1419 -------------------------------------------------------------------------------
1420 */
float32_to_int32_round_to_zero(float32 a)1421 int32 float32_to_int32_round_to_zero( float32 a )
1422 {
1423     flag aSign;
1424     int16 aExp, shiftCount;
1425     bits32 aSig;
1426     int32 z;
1427 
1428     aSig = extractFloat32Frac( a );
1429     aExp = extractFloat32Exp( a );
1430     aSign = extractFloat32Sign( a );
1431     shiftCount = aExp - 0x9E;
1432     if ( 0 <= shiftCount ) {
1433         if ( a != 0xCF000000 ) {
1434             float_raise( float_flag_invalid );
1435             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1436         }
1437         return (sbits32) 0x80000000;
1438     }
1439     else if ( aExp <= 0x7E ) {
1440         if ( aExp | aSig ) set_float_exception_inexact_flag();
1441         return 0;
1442     }
1443     aSig = ( aSig | 0x00800000 )<<8;
1444     z = aSig>>( - shiftCount );
1445     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1446         set_float_exception_inexact_flag();
1447     }
1448     if ( aSign ) z = - z;
1449     return z;
1450 
1451 }
1452 
1453 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1454 /*
1455 -------------------------------------------------------------------------------
1456 Returns the result of converting the single-precision floating-point value
1457 `a' to the 64-bit two's complement integer format.  The conversion is
1458 performed according to the IEC/IEEE Standard for Binary Floating-Point
1459 Arithmetic---which means in particular that the conversion is rounded
1460 according to the current rounding mode.  If `a' is a NaN, the largest
1461 positive integer is returned.  Otherwise, if the conversion overflows, the
1462 largest integer with the same sign as `a' is returned.
1463 -------------------------------------------------------------------------------
1464 */
float32_to_int64(float32 a)1465 int64 float32_to_int64( float32 a )
1466 {
1467     flag aSign;
1468     int16 aExp, shiftCount;
1469     bits32 aSig;
1470     bits64 aSig64, aSigExtra;
1471 
1472     aSig = extractFloat32Frac( a );
1473     aExp = extractFloat32Exp( a );
1474     aSign = extractFloat32Sign( a );
1475     shiftCount = 0xBE - aExp;
1476     if ( shiftCount < 0 ) {
1477         float_raise( float_flag_invalid );
1478         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1479             return LIT64( 0x7FFFFFFFFFFFFFFF );
1480         }
1481         return (sbits64) LIT64( 0x8000000000000000 );
1482     }
1483     if ( aExp ) aSig |= 0x00800000;
1484     aSig64 = aSig;
1485     aSig64 <<= 40;
1486     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1487     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1488 
1489 }
1490 
1491 /*
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the 64-bit two's complement integer format.  The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 Arithmetic, except that the conversion is always rounded toward zero.  If
1497 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1498 conversion overflows, the largest integer with the same sign as `a' is
1499 returned.
1500 -------------------------------------------------------------------------------
1501 */
float32_to_int64_round_to_zero(float32 a)1502 int64 float32_to_int64_round_to_zero( float32 a )
1503 {
1504     flag aSign;
1505     int16 aExp, shiftCount;
1506     bits32 aSig;
1507     bits64 aSig64;
1508     int64 z;
1509 
1510     aSig = extractFloat32Frac( a );
1511     aExp = extractFloat32Exp( a );
1512     aSign = extractFloat32Sign( a );
1513     shiftCount = aExp - 0xBE;
1514     if ( 0 <= shiftCount ) {
1515         if ( a != 0xDF000000 ) {
1516             float_raise( float_flag_invalid );
1517             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1518                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1519             }
1520         }
1521         return (sbits64) LIT64( 0x8000000000000000 );
1522     }
1523     else if ( aExp <= 0x7E ) {
1524         if ( aExp | aSig ) set_float_exception_inexact_flag();
1525         return 0;
1526     }
1527     aSig64 = aSig | 0x00800000;
1528     aSig64 <<= 40;
1529     z = aSig64>>( - shiftCount );
1530     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1531         set_float_exception_inexact_flag();
1532     }
1533     if ( aSign ) z = - z;
1534     return z;
1535 
1536 }
1537 #endif /* !SOFTFLOAT_FOR_GCC */
1538 
1539 /*
1540 -------------------------------------------------------------------------------
1541 Returns the result of converting the single-precision floating-point value
1542 `a' to the double-precision floating-point format.  The conversion is
1543 performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 Arithmetic.
1545 -------------------------------------------------------------------------------
1546 */
float32_to_float64(float32 a)1547 float64 float32_to_float64( float32 a )
1548 {
1549     flag aSign;
1550     int16 aExp;
1551     bits32 aSig;
1552 
1553     aSig = extractFloat32Frac( a );
1554     aExp = extractFloat32Exp( a );
1555     aSign = extractFloat32Sign( a );
1556     if ( aExp == 0xFF ) {
1557         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1558         return packFloat64( aSign, 0x7FF, 0 );
1559     }
1560     if ( aExp == 0 ) {
1561         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1562         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1563         --aExp;
1564     }
1565     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1566 
1567 }
1568 
1569 #ifdef FLOATX80
1570 
1571 /*
1572 -------------------------------------------------------------------------------
1573 Returns the result of converting the single-precision floating-point value
1574 `a' to the extended double-precision floating-point format.  The conversion
1575 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 Arithmetic.
1577 -------------------------------------------------------------------------------
1578 */
float32_to_floatx80(float32 a)1579 floatx80 float32_to_floatx80( float32 a )
1580 {
1581     flag aSign;
1582     int16 aExp;
1583     bits32 aSig;
1584 
1585     aSig = extractFloat32Frac( a );
1586     aExp = extractFloat32Exp( a );
1587     aSign = extractFloat32Sign( a );
1588     if ( aExp == 0xFF ) {
1589         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1590         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1591     }
1592     if ( aExp == 0 ) {
1593         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1594         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1595     }
1596     aSig |= 0x00800000;
1597     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1598 
1599 }
1600 
1601 #endif
1602 
1603 #ifdef FLOAT128
1604 
1605 /*
1606 -------------------------------------------------------------------------------
1607 Returns the result of converting the single-precision floating-point value
1608 `a' to the double-precision floating-point format.  The conversion is
1609 performed according to the IEC/IEEE Standard for Binary Floating-Point
1610 Arithmetic.
1611 -------------------------------------------------------------------------------
1612 */
float32_to_float128(float32 a)1613 float128 float32_to_float128( float32 a )
1614 {
1615     flag aSign;
1616     int16 aExp;
1617     bits32 aSig;
1618 
1619     aSig = extractFloat32Frac( a );
1620     aExp = extractFloat32Exp( a );
1621     aSign = extractFloat32Sign( a );
1622     if ( aExp == 0xFF ) {
1623         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1624         return packFloat128( aSign, 0x7FFF, 0, 0 );
1625     }
1626     if ( aExp == 0 ) {
1627         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1628         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1629         --aExp;
1630     }
1631     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1632 
1633 }
1634 
1635 #endif
1636 
1637 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1638 /*
1639 -------------------------------------------------------------------------------
1640 Rounds the single-precision floating-point value `a' to an integer, and
1641 returns the result as a single-precision floating-point value.  The
1642 operation is performed according to the IEC/IEEE Standard for Binary
1643 Floating-Point Arithmetic.
1644 -------------------------------------------------------------------------------
1645 */
float32_round_to_int(float32 a)1646 float32 float32_round_to_int( float32 a )
1647 {
1648     flag aSign;
1649     int16 aExp;
1650     bits32 lastBitMask, roundBitsMask;
1651     int8 roundingMode;
1652     float32 z;
1653 
1654     aExp = extractFloat32Exp( a );
1655     if ( 0x96 <= aExp ) {
1656         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1657             return propagateFloat32NaN( a, a );
1658         }
1659         return a;
1660     }
1661     if ( aExp <= 0x7E ) {
1662         if ( (bits32) ( a<<1 ) == 0 ) return a;
1663         set_float_exception_inexact_flag();
1664         aSign = extractFloat32Sign( a );
1665         switch ( float_rounding_mode ) {
1666          case float_round_nearest_even:
1667             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1668                 return packFloat32( aSign, 0x7F, 0 );
1669             }
1670             break;
1671          case float_round_to_zero:
1672             break;
1673          case float_round_down:
1674             return aSign ? 0xBF800000 : 0;
1675          case float_round_up:
1676             return aSign ? 0x80000000 : 0x3F800000;
1677         }
1678         return packFloat32( aSign, 0, 0 );
1679     }
1680     lastBitMask = 1;
1681     lastBitMask <<= 0x96 - aExp;
1682     roundBitsMask = lastBitMask - 1;
1683     z = a;
1684     roundingMode = float_rounding_mode;
1685     if ( roundingMode == float_round_nearest_even ) {
1686         z += lastBitMask>>1;
1687         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1688     }
1689     else if ( roundingMode != float_round_to_zero ) {
1690         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1691             z += roundBitsMask;
1692         }
1693     }
1694     z &= ~ roundBitsMask;
1695     if ( z != a ) set_float_exception_inexact_flag();
1696     return z;
1697 
1698 }
1699 #endif /* !SOFTFLOAT_FOR_GCC */
1700 
1701 /*
1702 -------------------------------------------------------------------------------
1703 Returns the result of adding the absolute values of the single-precision
1704 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1705 before being returned.  `zSign' is ignored if the result is a NaN.
1706 The addition is performed according to the IEC/IEEE Standard for Binary
1707 Floating-Point Arithmetic.
1708 -------------------------------------------------------------------------------
1709 */
addFloat32Sigs(float32 a,float32 b,flag zSign)1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1711 {
1712     int16 aExp, bExp, zExp;
1713     bits32 aSig, bSig, zSig;
1714     int16 expDiff;
1715 
1716     aSig = extractFloat32Frac( a );
1717     aExp = extractFloat32Exp( a );
1718     bSig = extractFloat32Frac( b );
1719     bExp = extractFloat32Exp( b );
1720     expDiff = aExp - bExp;
1721     aSig <<= 6;
1722     bSig <<= 6;
1723     if ( 0 < expDiff ) {
1724         if ( aExp == 0xFF ) {
1725             if ( aSig ) return propagateFloat32NaN( a, b );
1726             return a;
1727         }
1728         if ( bExp == 0 ) {
1729             --expDiff;
1730         }
1731         else {
1732             bSig |= 0x20000000;
1733         }
1734         shift32RightJamming( bSig, expDiff, &bSig );
1735         zExp = aExp;
1736     }
1737     else if ( expDiff < 0 ) {
1738         if ( bExp == 0xFF ) {
1739             if ( bSig ) return propagateFloat32NaN( a, b );
1740             return packFloat32( zSign, 0xFF, 0 );
1741         }
1742         if ( aExp == 0 ) {
1743             ++expDiff;
1744         }
1745         else {
1746             aSig |= 0x20000000;
1747         }
1748         shift32RightJamming( aSig, - expDiff, &aSig );
1749         zExp = bExp;
1750     }
1751     else {
1752         if ( aExp == 0xFF ) {
1753             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1754             return a;
1755         }
1756         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1757         zSig = 0x40000000 + aSig + bSig;
1758         zExp = aExp;
1759         goto roundAndPack;
1760     }
1761     aSig |= 0x20000000;
1762     zSig = ( aSig + bSig )<<1;
1763     --zExp;
1764     if ( (sbits32) zSig < 0 ) {
1765         zSig = aSig + bSig;
1766         ++zExp;
1767     }
1768  roundAndPack:
1769     return roundAndPackFloat32( zSign, zExp, zSig );
1770 
1771 }
1772 
1773 /*
1774 -------------------------------------------------------------------------------
1775 Returns the result of subtracting the absolute values of the single-
1776 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1777 difference is negated before being returned.  `zSign' is ignored if the
1778 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1779 Standard for Binary Floating-Point Arithmetic.
1780 -------------------------------------------------------------------------------
1781 */
subFloat32Sigs(float32 a,float32 b,flag zSign)1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1783 {
1784     int16 aExp, bExp, zExp;
1785     bits32 aSig, bSig, zSig;
1786     int16 expDiff;
1787 
1788     aSig = extractFloat32Frac( a );
1789     aExp = extractFloat32Exp( a );
1790     bSig = extractFloat32Frac( b );
1791     bExp = extractFloat32Exp( b );
1792     expDiff = aExp - bExp;
1793     aSig <<= 7;
1794     bSig <<= 7;
1795     if ( 0 < expDiff ) goto aExpBigger;
1796     if ( expDiff < 0 ) goto bExpBigger;
1797     if ( aExp == 0xFF ) {
1798         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1799         float_raise( float_flag_invalid );
1800         return float32_default_nan;
1801     }
1802     if ( aExp == 0 ) {
1803         aExp = 1;
1804         bExp = 1;
1805     }
1806     if ( bSig < aSig ) goto aBigger;
1807     if ( aSig < bSig ) goto bBigger;
1808     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1809  bExpBigger:
1810     if ( bExp == 0xFF ) {
1811         if ( bSig ) return propagateFloat32NaN( a, b );
1812         return packFloat32( zSign ^ 1, 0xFF, 0 );
1813     }
1814     if ( aExp == 0 ) {
1815         ++expDiff;
1816     }
1817     else {
1818         aSig |= 0x40000000;
1819     }
1820     shift32RightJamming( aSig, - expDiff, &aSig );
1821     bSig |= 0x40000000;
1822  bBigger:
1823     zSig = bSig - aSig;
1824     zExp = bExp;
1825     zSign ^= 1;
1826     goto normalizeRoundAndPack;
1827  aExpBigger:
1828     if ( aExp == 0xFF ) {
1829         if ( aSig ) return propagateFloat32NaN( a, b );
1830         return a;
1831     }
1832     if ( bExp == 0 ) {
1833         --expDiff;
1834     }
1835     else {
1836         bSig |= 0x40000000;
1837     }
1838     shift32RightJamming( bSig, expDiff, &bSig );
1839     aSig |= 0x40000000;
1840  aBigger:
1841     zSig = aSig - bSig;
1842     zExp = aExp;
1843  normalizeRoundAndPack:
1844     --zExp;
1845     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1846 
1847 }
1848 
1849 /*
1850 -------------------------------------------------------------------------------
1851 Returns the result of adding the single-precision floating-point values `a'
1852 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1853 Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1855 */
float32_add(float32 a,float32 b)1856 float32 float32_add( float32 a, float32 b )
1857 {
1858     flag aSign, bSign;
1859 
1860     aSign = extractFloat32Sign( a );
1861     bSign = extractFloat32Sign( b );
1862     if ( aSign == bSign ) {
1863         return addFloat32Sigs( a, b, aSign );
1864     }
1865     else {
1866         return subFloat32Sigs( a, b, aSign );
1867     }
1868 
1869 }
1870 
1871 /*
1872 -------------------------------------------------------------------------------
1873 Returns the result of subtracting the single-precision floating-point values
1874 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1875 for Binary Floating-Point Arithmetic.
1876 -------------------------------------------------------------------------------
1877 */
float32_sub(float32 a,float32 b)1878 float32 float32_sub( float32 a, float32 b )
1879 {
1880     flag aSign, bSign;
1881 
1882     aSign = extractFloat32Sign( a );
1883     bSign = extractFloat32Sign( b );
1884     if ( aSign == bSign ) {
1885         return subFloat32Sigs( a, b, aSign );
1886     }
1887     else {
1888         return addFloat32Sigs( a, b, aSign );
1889     }
1890 
1891 }
1892 
1893 /*
1894 -------------------------------------------------------------------------------
1895 Returns the result of multiplying the single-precision floating-point values
1896 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1897 for Binary Floating-Point Arithmetic.
1898 -------------------------------------------------------------------------------
1899 */
float32_mul(float32 a,float32 b)1900 float32 float32_mul( float32 a, float32 b )
1901 {
1902     flag aSign, bSign, zSign;
1903     int16 aExp, bExp, zExp;
1904     bits32 aSig, bSig;
1905     bits64 zSig64;
1906     bits32 zSig;
1907 
1908     aSig = extractFloat32Frac( a );
1909     aExp = extractFloat32Exp( a );
1910     aSign = extractFloat32Sign( a );
1911     bSig = extractFloat32Frac( b );
1912     bExp = extractFloat32Exp( b );
1913     bSign = extractFloat32Sign( b );
1914     zSign = aSign ^ bSign;
1915     if ( aExp == 0xFF ) {
1916         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1917             return propagateFloat32NaN( a, b );
1918         }
1919         if ( ( bExp | bSig ) == 0 ) {
1920             float_raise( float_flag_invalid );
1921             return float32_default_nan;
1922         }
1923         return packFloat32( zSign, 0xFF, 0 );
1924     }
1925     if ( bExp == 0xFF ) {
1926         if ( bSig ) return propagateFloat32NaN( a, b );
1927         if ( ( aExp | aSig ) == 0 ) {
1928             float_raise( float_flag_invalid );
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( aExp == 0 ) {
1934         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1935         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1936     }
1937     if ( bExp == 0 ) {
1938         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1939         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1940     }
1941     zExp = aExp + bExp - 0x7F;
1942     aSig = ( aSig | 0x00800000 )<<7;
1943     bSig = ( bSig | 0x00800000 )<<8;
1944     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1945     zSig = (bits32)zSig64;
1946     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1947         zSig <<= 1;
1948         --zExp;
1949     }
1950     return roundAndPackFloat32( zSign, zExp, zSig );
1951 
1952 }
1953 
1954 /*
1955 -------------------------------------------------------------------------------
1956 Returns the result of dividing the single-precision floating-point value `a'
1957 by the corresponding value `b'.  The operation is performed according to the
1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 -------------------------------------------------------------------------------
1960 */
float32_div(float32 a,float32 b)1961 float32 float32_div( float32 a, float32 b )
1962 {
1963     flag aSign, bSign, zSign;
1964     int16 aExp, bExp, zExp;
1965     bits32 aSig, bSig, zSig;
1966 
1967     aSig = extractFloat32Frac( a );
1968     aExp = extractFloat32Exp( a );
1969     aSign = extractFloat32Sign( a );
1970     bSig = extractFloat32Frac( b );
1971     bExp = extractFloat32Exp( b );
1972     bSign = extractFloat32Sign( b );
1973     zSign = aSign ^ bSign;
1974     if ( aExp == 0xFF ) {
1975         if ( aSig ) return propagateFloat32NaN( a, b );
1976         if ( bExp == 0xFF ) {
1977             if ( bSig ) return propagateFloat32NaN( a, b );
1978             float_raise( float_flag_invalid );
1979             return float32_default_nan;
1980         }
1981         return packFloat32( zSign, 0xFF, 0 );
1982     }
1983     if ( bExp == 0xFF ) {
1984         if ( bSig ) return propagateFloat32NaN( a, b );
1985         return packFloat32( zSign, 0, 0 );
1986     }
1987     if ( bExp == 0 ) {
1988         if ( bSig == 0 ) {
1989             if ( ( aExp | aSig ) == 0 ) {
1990                 float_raise( float_flag_invalid );
1991                 return float32_default_nan;
1992             }
1993             float_raise( float_flag_divbyzero );
1994             return packFloat32( zSign, 0xFF, 0 );
1995         }
1996         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1997     }
1998     if ( aExp == 0 ) {
1999         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2000         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2001     }
2002     zExp = aExp - bExp + 0x7D;
2003     aSig = ( aSig | 0x00800000 )<<7;
2004     bSig = ( bSig | 0x00800000 )<<8;
2005     if ( bSig <= ( aSig + aSig ) ) {
2006         aSig >>= 1;
2007         ++zExp;
2008     }
2009     zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2010     if ( ( zSig & 0x3F ) == 0 ) {
2011         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2012     }
2013     return roundAndPackFloat32( zSign, zExp, zSig );
2014 
2015 }
2016 
2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2018 /*
2019 -------------------------------------------------------------------------------
2020 Returns the remainder of the single-precision floating-point value `a'
2021 with respect to the corresponding value `b'.  The operation is performed
2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 -------------------------------------------------------------------------------
2024 */
float32_rem(float32 a,float32 b)2025 float32 float32_rem( float32 a, float32 b )
2026 {
2027     flag aSign, bSign, zSign;
2028     int16 aExp, bExp, expDiff;
2029     bits32 aSig, bSig;
2030     bits32 q;
2031     bits64 aSig64, bSig64, q64;
2032     bits32 alternateASig;
2033     sbits32 sigMean;
2034 
2035     aSig = extractFloat32Frac( a );
2036     aExp = extractFloat32Exp( a );
2037     aSign = extractFloat32Sign( a );
2038     bSig = extractFloat32Frac( b );
2039     bExp = extractFloat32Exp( b );
2040     bSign = extractFloat32Sign( b );
2041     if ( aExp == 0xFF ) {
2042         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2043             return propagateFloat32NaN( a, b );
2044         }
2045         float_raise( float_flag_invalid );
2046         return float32_default_nan;
2047     }
2048     if ( bExp == 0xFF ) {
2049         if ( bSig ) return propagateFloat32NaN( a, b );
2050         return a;
2051     }
2052     if ( bExp == 0 ) {
2053         if ( bSig == 0 ) {
2054             float_raise( float_flag_invalid );
2055             return float32_default_nan;
2056         }
2057         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2058     }
2059     if ( aExp == 0 ) {
2060         if ( aSig == 0 ) return a;
2061         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2062     }
2063     expDiff = aExp - bExp;
2064     aSig |= 0x00800000;
2065     bSig |= 0x00800000;
2066     if ( expDiff < 32 ) {
2067         aSig <<= 8;
2068         bSig <<= 8;
2069         if ( expDiff < 0 ) {
2070             if ( expDiff < -1 ) return a;
2071             aSig >>= 1;
2072         }
2073         q = ( bSig <= aSig );
2074         if ( q ) aSig -= bSig;
2075         if ( 0 < expDiff ) {
2076             q = ( ( (bits64) aSig )<<32 ) / bSig;
2077             q >>= 32 - expDiff;
2078             bSig >>= 2;
2079             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2080         }
2081         else {
2082             aSig >>= 2;
2083             bSig >>= 2;
2084         }
2085     }
2086     else {
2087         if ( bSig <= aSig ) aSig -= bSig;
2088         aSig64 = ( (bits64) aSig )<<40;
2089         bSig64 = ( (bits64) bSig )<<40;
2090         expDiff -= 64;
2091         while ( 0 < expDiff ) {
2092             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094             aSig64 = - ( ( bSig * q64 )<<38 );
2095             expDiff -= 62;
2096         }
2097         expDiff += 64;
2098         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2099         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2100         q = q64>>( 64 - expDiff );
2101         bSig <<= 6;
2102         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2103     }
2104     do {
2105         alternateASig = aSig;
2106         ++q;
2107         aSig -= bSig;
2108     } while ( 0 <= (sbits32) aSig );
2109     sigMean = aSig + alternateASig;
2110     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2111         aSig = alternateASig;
2112     }
2113     zSign = ( (sbits32) aSig < 0 );
2114     if ( zSign ) aSig = - aSig;
2115     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2116 
2117 }
2118 #endif /* !SOFTFLOAT_FOR_GCC */
2119 
2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2121 /*
2122 -------------------------------------------------------------------------------
2123 Returns the square root of the single-precision floating-point value `a'.
2124 The operation is performed according to the IEC/IEEE Standard for Binary
2125 Floating-Point Arithmetic.
2126 -------------------------------------------------------------------------------
2127 */
float32_sqrt(float32 a)2128 float32 float32_sqrt( float32 a )
2129 {
2130     flag aSign;
2131     int16 aExp, zExp;
2132     bits32 aSig, zSig;
2133     bits64 rem, term;
2134 
2135     aSig = extractFloat32Frac( a );
2136     aExp = extractFloat32Exp( a );
2137     aSign = extractFloat32Sign( a );
2138     if ( aExp == 0xFF ) {
2139         if ( aSig ) return propagateFloat32NaN( a, 0 );
2140         if ( ! aSign ) return a;
2141         float_raise( float_flag_invalid );
2142         return float32_default_nan;
2143     }
2144     if ( aSign ) {
2145         if ( ( aExp | aSig ) == 0 ) return a;
2146         float_raise( float_flag_invalid );
2147         return float32_default_nan;
2148     }
2149     if ( aExp == 0 ) {
2150         if ( aSig == 0 ) return 0;
2151         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2152     }
2153     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2154     aSig = ( aSig | 0x00800000 )<<8;
2155     zSig = estimateSqrt32( aExp, aSig ) + 2;
2156     if ( ( zSig & 0x7F ) <= 5 ) {
2157         if ( zSig < 2 ) {
2158             zSig = 0x7FFFFFFF;
2159             goto roundAndPack;
2160         }
2161         aSig >>= aExp & 1;
2162         term = ( (bits64) zSig ) * zSig;
2163         rem = ( ( (bits64) aSig )<<32 ) - term;
2164         while ( (sbits64) rem < 0 ) {
2165             --zSig;
2166             rem += ( ( (bits64) zSig )<<1 ) | 1;
2167         }
2168         zSig |= ( rem != 0 );
2169     }
2170     shift32RightJamming( zSig, 1, &zSig );
2171  roundAndPack:
2172     return roundAndPackFloat32( 0, zExp, zSig );
2173 
2174 }
2175 #endif /* !SOFTFLOAT_FOR_GCC */
2176 
2177 /*
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the single-precision floating-point value `a' is equal to
2180 the corresponding value `b', and 0 otherwise.  The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2183 */
float32_eq(float32 a,float32 b)2184 flag float32_eq( float32 a, float32 b )
2185 {
2186 
2187     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2189        ) {
2190         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2191             float_raise( float_flag_invalid );
2192         }
2193         return 0;
2194     }
2195     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2196 
2197 }
2198 
2199 /*
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is less than
2202 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2204 Arithmetic.
2205 -------------------------------------------------------------------------------
2206 */
float32_le(float32 a,float32 b)2207 flag float32_le( float32 a, float32 b )
2208 {
2209     flag aSign, bSign;
2210 
2211     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2212          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2213        ) {
2214         float_raise( float_flag_invalid );
2215         return 0;
2216     }
2217     aSign = extractFloat32Sign( a );
2218     bSign = extractFloat32Sign( b );
2219     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220     return ( a == b ) || ( aSign ^ ( a < b ) );
2221 
2222 }
2223 
2224 /*
2225 -------------------------------------------------------------------------------
2226 Returns 1 if the single-precision floating-point value `a' is less than
2227 the corresponding value `b', and 0 otherwise.  The comparison is performed
2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 -------------------------------------------------------------------------------
2230 */
float32_lt(float32 a,float32 b)2231 flag float32_lt( float32 a, float32 b )
2232 {
2233     flag aSign, bSign;
2234 
2235     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2237        ) {
2238         float_raise( float_flag_invalid );
2239         return 0;
2240     }
2241     aSign = extractFloat32Sign( a );
2242     bSign = extractFloat32Sign( b );
2243     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2244     return ( a != b ) && ( aSign ^ ( a < b ) );
2245 
2246 }
2247 
2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2249 /*
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is equal to
2252 the corresponding value `b', and 0 otherwise.  The invalid exception is
2253 raised if either operand is a NaN.  Otherwise, the comparison is performed
2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2256 */
float32_eq_signaling(float32 a,float32 b)2257 flag float32_eq_signaling( float32 a, float32 b )
2258 {
2259 
2260     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2261          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2262        ) {
2263         float_raise( float_flag_invalid );
2264         return 0;
2265     }
2266     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2267 
2268 }
2269 
2270 /*
2271 -------------------------------------------------------------------------------
2272 Returns 1 if the single-precision floating-point value `a' is less than or
2273 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2274 cause an exception.  Otherwise, the comparison is performed according to the
2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 -------------------------------------------------------------------------------
2277 */
float32_le_quiet(float32 a,float32 b)2278 flag float32_le_quiet( float32 a, float32 b )
2279 {
2280     flag aSign, bSign;
2281 
2282     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2283          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2284        ) {
2285         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2286             float_raise( float_flag_invalid );
2287         }
2288         return 0;
2289     }
2290     aSign = extractFloat32Sign( a );
2291     bSign = extractFloat32Sign( b );
2292     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2293     return ( a == b ) || ( aSign ^ ( a < b ) );
2294 
2295 }
2296 
2297 /*
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the single-precision floating-point value `a' is less than
2300 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2301 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2302 Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2304 */
float32_lt_quiet(float32 a,float32 b)2305 flag float32_lt_quiet( float32 a, float32 b )
2306 {
2307     flag aSign, bSign;
2308 
2309     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2310          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2311        ) {
2312         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2313             float_raise( float_flag_invalid );
2314         }
2315         return 0;
2316     }
2317     aSign = extractFloat32Sign( a );
2318     bSign = extractFloat32Sign( b );
2319     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2320     return ( a != b ) && ( aSign ^ ( a < b ) );
2321 
2322 }
2323 #endif /* !SOFTFLOAT_FOR_GCC */
2324 
2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2326 /*
2327 -------------------------------------------------------------------------------
2328 Returns the result of converting the double-precision floating-point value
2329 `a' to the 32-bit two's complement integer format.  The conversion is
2330 performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 Arithmetic---which means in particular that the conversion is rounded
2332 according to the current rounding mode.  If `a' is a NaN, the largest
2333 positive integer is returned.  Otherwise, if the conversion overflows, the
2334 largest integer with the same sign as `a' is returned.
2335 -------------------------------------------------------------------------------
2336 */
float64_to_int32(float64 a)2337 int32 float64_to_int32( float64 a )
2338 {
2339     flag aSign;
2340     int16 aExp, shiftCount;
2341     bits64 aSig;
2342 
2343     aSig = extractFloat64Frac( a );
2344     aExp = extractFloat64Exp( a );
2345     aSign = extractFloat64Sign( a );
2346     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2347     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2348     shiftCount = 0x42C - aExp;
2349     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2350     return roundAndPackInt32( aSign, aSig );
2351 
2352 }
2353 #endif /* !SOFTFLOAT_FOR_GCC */
2354 
2355 /*
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 32-bit two's complement integer format.  The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic, except that the conversion is always rounded toward zero.
2361 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2362 the conversion overflows, the largest integer with the same sign as `a' is
2363 returned.
2364 -------------------------------------------------------------------------------
2365 */
float64_to_int32_round_to_zero(float64 a)2366 int32 float64_to_int32_round_to_zero( float64 a )
2367 {
2368     flag aSign;
2369     int16 aExp, shiftCount;
2370     bits64 aSig, savedASig;
2371     int32 z;
2372 
2373     aSig = extractFloat64Frac( a );
2374     aExp = extractFloat64Exp( a );
2375     aSign = extractFloat64Sign( a );
2376     if ( 0x41E < aExp ) {
2377         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2378         goto invalid;
2379     }
2380     else if ( aExp < 0x3FF ) {
2381         if ( aExp || aSig ) set_float_exception_inexact_flag();
2382         return 0;
2383     }
2384     aSig |= LIT64( 0x0010000000000000 );
2385     shiftCount = 0x433 - aExp;
2386     savedASig = aSig;
2387     aSig >>= shiftCount;
2388     z = (int32)aSig;
2389     if ( aSign ) z = - z;
2390     if ( ( z < 0 ) ^ aSign ) {
2391  invalid:
2392         float_raise( float_flag_invalid );
2393         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2394     }
2395     if ( ( aSig<<shiftCount ) != savedASig ) {
2396         set_float_exception_inexact_flag();
2397     }
2398     return z;
2399 
2400 }
2401 
2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2403 /*
2404 -------------------------------------------------------------------------------
2405 Returns the result of converting the double-precision floating-point value
2406 `a' to the 64-bit two's complement integer format.  The conversion is
2407 performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 Arithmetic---which means in particular that the conversion is rounded
2409 according to the current rounding mode.  If `a' is a NaN, the largest
2410 positive integer is returned.  Otherwise, if the conversion overflows, the
2411 largest integer with the same sign as `a' is returned.
2412 -------------------------------------------------------------------------------
2413 */
float64_to_int64(float64 a)2414 int64 float64_to_int64( float64 a )
2415 {
2416     flag aSign;
2417     int16 aExp, shiftCount;
2418     bits64 aSig, aSigExtra;
2419 
2420     aSig = extractFloat64Frac( a );
2421     aExp = extractFloat64Exp( a );
2422     aSign = extractFloat64Sign( a );
2423     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424     shiftCount = 0x433 - aExp;
2425     if ( shiftCount <= 0 ) {
2426         if ( 0x43E < aExp ) {
2427             float_raise( float_flag_invalid );
2428             if (    ! aSign
2429                  || (    ( aExp == 0x7FF )
2430                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2431                ) {
2432                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2433             }
2434             return (sbits64) LIT64( 0x8000000000000000 );
2435         }
2436         aSigExtra = 0;
2437         aSig <<= - shiftCount;
2438     }
2439     else {
2440         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2441     }
2442     return roundAndPackInt64( aSign, aSig, aSigExtra );
2443 
2444 }
2445 
2446 /*
2447 -------------------------------------------------------------------------------
2448 Returns the result of converting the double-precision floating-point value
2449 `a' to the 64-bit two's complement integer format.  The conversion is
2450 performed according to the IEC/IEEE Standard for Binary Floating-Point
2451 Arithmetic, except that the conversion is always rounded toward zero.
2452 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2453 the conversion overflows, the largest integer with the same sign as `a' is
2454 returned.
2455 -------------------------------------------------------------------------------
2456 */
float64_to_int64_round_to_zero(float64 a)2457 int64 float64_to_int64_round_to_zero( float64 a )
2458 {
2459     flag aSign;
2460     int16 aExp, shiftCount;
2461     bits64 aSig;
2462     int64 z;
2463 
2464     aSig = extractFloat64Frac( a );
2465     aExp = extractFloat64Exp( a );
2466     aSign = extractFloat64Sign( a );
2467     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2468     shiftCount = aExp - 0x433;
2469     if ( 0 <= shiftCount ) {
2470         if ( 0x43E <= aExp ) {
2471             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2472                 float_raise( float_flag_invalid );
2473                 if (    ! aSign
2474                      || (    ( aExp == 0x7FF )
2475                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2476                    ) {
2477                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2478                 }
2479             }
2480             return (sbits64) LIT64( 0x8000000000000000 );
2481         }
2482         z = aSig<<shiftCount;
2483     }
2484     else {
2485         if ( aExp < 0x3FE ) {
2486             if ( aExp | aSig ) set_float_exception_inexact_flag();
2487             return 0;
2488         }
2489         z = aSig>>( - shiftCount );
2490         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2491             set_float_exception_inexact_flag();
2492         }
2493     }
2494     if ( aSign ) z = - z;
2495     return z;
2496 
2497 }
2498 #endif /* !SOFTFLOAT_FOR_GCC */
2499 
2500 /*
2501 -------------------------------------------------------------------------------
2502 Returns the result of converting the double-precision floating-point value
2503 `a' to the single-precision floating-point format.  The conversion is
2504 performed according to the IEC/IEEE Standard for Binary Floating-Point
2505 Arithmetic.
2506 -------------------------------------------------------------------------------
2507 */
float64_to_float32(float64 a)2508 float32 float64_to_float32( float64 a )
2509 {
2510     flag aSign;
2511     int16 aExp;
2512     bits64 aSig;
2513     bits32 zSig;
2514 
2515     aSig = extractFloat64Frac( a );
2516     aExp = extractFloat64Exp( a );
2517     aSign = extractFloat64Sign( a );
2518     if ( aExp == 0x7FF ) {
2519         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2520         return packFloat32( aSign, 0xFF, 0 );
2521     }
2522     shift64RightJamming( aSig, 22, &aSig );
2523     zSig = (bits32)aSig;
2524     if ( aExp || zSig ) {
2525         zSig |= 0x40000000;
2526         aExp -= 0x381;
2527     }
2528     return roundAndPackFloat32( aSign, aExp, zSig );
2529 
2530 }
2531 
2532 #ifdef FLOATX80
2533 
2534 /*
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the double-precision floating-point value
2537 `a' to the extended double-precision floating-point format.  The conversion
2538 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2539 Arithmetic.
2540 -------------------------------------------------------------------------------
2541 */
float64_to_floatx80(float64 a)2542 floatx80 float64_to_floatx80( float64 a )
2543 {
2544     flag aSign;
2545     int16 aExp;
2546     bits64 aSig;
2547 
2548     aSig = extractFloat64Frac( a );
2549     aExp = extractFloat64Exp( a );
2550     aSign = extractFloat64Sign( a );
2551     if ( aExp == 0x7FF ) {
2552         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2553         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2554     }
2555     if ( aExp == 0 ) {
2556         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2557         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2558     }
2559     return
2560         packFloatx80(
2561             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2562 
2563 }
2564 
2565 #endif
2566 
2567 #ifdef FLOAT128
2568 
2569 /*
2570 -------------------------------------------------------------------------------
2571 Returns the result of converting the double-precision floating-point value
2572 `a' to the quadruple-precision floating-point format.  The conversion is
2573 performed according to the IEC/IEEE Standard for Binary Floating-Point
2574 Arithmetic.
2575 -------------------------------------------------------------------------------
2576 */
float64_to_float128(float64 a)2577 float128 float64_to_float128( float64 a )
2578 {
2579     flag aSign;
2580     int16 aExp;
2581     bits64 aSig, zSig0, zSig1;
2582 
2583     aSig = extractFloat64Frac( a );
2584     aExp = extractFloat64Exp( a );
2585     aSign = extractFloat64Sign( a );
2586     if ( aExp == 0x7FF ) {
2587         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2588         return packFloat128( aSign, 0x7FFF, 0, 0 );
2589     }
2590     if ( aExp == 0 ) {
2591         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2592         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2593         --aExp;
2594     }
2595     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2596     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2597 
2598 }
2599 
2600 #endif
2601 
2602 #ifndef SOFTFLOAT_FOR_GCC
2603 /*
2604 -------------------------------------------------------------------------------
2605 Rounds the double-precision floating-point value `a' to an integer, and
2606 returns the result as a double-precision floating-point value.  The
2607 operation is performed according to the IEC/IEEE Standard for Binary
2608 Floating-Point Arithmetic.
2609 -------------------------------------------------------------------------------
2610 */
float64_round_to_int(float64 a)2611 float64 float64_round_to_int( float64 a )
2612 {
2613     flag aSign;
2614     int16 aExp;
2615     bits64 lastBitMask, roundBitsMask;
2616     int8 roundingMode;
2617     float64 z;
2618 
2619     aExp = extractFloat64Exp( a );
2620     if ( 0x433 <= aExp ) {
2621         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2622             return propagateFloat64NaN( a, a );
2623         }
2624         return a;
2625     }
2626     if ( aExp < 0x3FF ) {
2627         if ( (bits64) ( a<<1 ) == 0 ) return a;
2628         set_float_exception_inexact_flag();
2629         aSign = extractFloat64Sign( a );
2630         switch ( float_rounding_mode ) {
2631          case float_round_nearest_even:
2632             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2633                 return packFloat64( aSign, 0x3FF, 0 );
2634             }
2635             break;
2636          case float_round_to_zero:
2637             break;
2638          case float_round_down:
2639             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2640          case float_round_up:
2641             return
2642             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2643         }
2644         return packFloat64( aSign, 0, 0 );
2645     }
2646     lastBitMask = 1;
2647     lastBitMask <<= 0x433 - aExp;
2648     roundBitsMask = lastBitMask - 1;
2649     z = a;
2650     roundingMode = float_rounding_mode;
2651     if ( roundingMode == float_round_nearest_even ) {
2652         z += lastBitMask>>1;
2653         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2654     }
2655     else if ( roundingMode != float_round_to_zero ) {
2656         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2657             z += roundBitsMask;
2658         }
2659     }
2660     z &= ~ roundBitsMask;
2661     if ( z != a ) set_float_exception_inexact_flag();
2662     return z;
2663 
2664 }
2665 #endif
2666 
2667 /*
2668 -------------------------------------------------------------------------------
2669 Returns the result of adding the absolute values of the double-precision
2670 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2671 before being returned.  `zSign' is ignored if the result is a NaN.
2672 The addition is performed according to the IEC/IEEE Standard for Binary
2673 Floating-Point Arithmetic.
2674 -------------------------------------------------------------------------------
2675 */
addFloat64Sigs(float64 a,float64 b,flag zSign)2676 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2677 {
2678     int16 aExp, bExp, zExp;
2679     bits64 aSig, bSig, zSig;
2680     int16 expDiff;
2681 
2682     aSig = extractFloat64Frac( a );
2683     aExp = extractFloat64Exp( a );
2684     bSig = extractFloat64Frac( b );
2685     bExp = extractFloat64Exp( b );
2686     expDiff = aExp - bExp;
2687     aSig <<= 9;
2688     bSig <<= 9;
2689     if ( 0 < expDiff ) {
2690         if ( aExp == 0x7FF ) {
2691             if ( aSig ) return propagateFloat64NaN( a, b );
2692             return a;
2693         }
2694         if ( bExp == 0 ) {
2695             --expDiff;
2696         }
2697         else {
2698             bSig |= LIT64( 0x2000000000000000 );
2699         }
2700         shift64RightJamming( bSig, expDiff, &bSig );
2701         zExp = aExp;
2702     }
2703     else if ( expDiff < 0 ) {
2704         if ( bExp == 0x7FF ) {
2705             if ( bSig ) return propagateFloat64NaN( a, b );
2706             return packFloat64( zSign, 0x7FF, 0 );
2707         }
2708         if ( aExp == 0 ) {
2709             ++expDiff;
2710         }
2711         else {
2712             aSig |= LIT64( 0x2000000000000000 );
2713         }
2714         shift64RightJamming( aSig, - expDiff, &aSig );
2715         zExp = bExp;
2716     }
2717     else {
2718         if ( aExp == 0x7FF ) {
2719             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2720             return a;
2721         }
2722         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2723         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2724         zExp = aExp;
2725         goto roundAndPack;
2726     }
2727     aSig |= LIT64( 0x2000000000000000 );
2728     zSig = ( aSig + bSig )<<1;
2729     --zExp;
2730     if ( (sbits64) zSig < 0 ) {
2731         zSig = aSig + bSig;
2732         ++zExp;
2733     }
2734  roundAndPack:
2735     return roundAndPackFloat64( zSign, zExp, zSig );
2736 
2737 }
2738 
2739 /*
2740 -------------------------------------------------------------------------------
2741 Returns the result of subtracting the absolute values of the double-
2742 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2743 difference is negated before being returned.  `zSign' is ignored if the
2744 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2745 Standard for Binary Floating-Point Arithmetic.
2746 -------------------------------------------------------------------------------
2747 */
subFloat64Sigs(float64 a,float64 b,flag zSign)2748 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2749 {
2750     int16 aExp, bExp, zExp;
2751     bits64 aSig, bSig, zSig;
2752     int16 expDiff;
2753 
2754     aSig = extractFloat64Frac( a );
2755     aExp = extractFloat64Exp( a );
2756     bSig = extractFloat64Frac( b );
2757     bExp = extractFloat64Exp( b );
2758     expDiff = aExp - bExp;
2759     aSig <<= 10;
2760     bSig <<= 10;
2761     if ( 0 < expDiff ) goto aExpBigger;
2762     if ( expDiff < 0 ) goto bExpBigger;
2763     if ( aExp == 0x7FF ) {
2764         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2765         float_raise( float_flag_invalid );
2766         return float64_default_nan;
2767     }
2768     if ( aExp == 0 ) {
2769         aExp = 1;
2770         bExp = 1;
2771     }
2772     if ( bSig < aSig ) goto aBigger;
2773     if ( aSig < bSig ) goto bBigger;
2774     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2775  bExpBigger:
2776     if ( bExp == 0x7FF ) {
2777         if ( bSig ) return propagateFloat64NaN( a, b );
2778         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2779     }
2780     if ( aExp == 0 ) {
2781         ++expDiff;
2782     }
2783     else {
2784         aSig |= LIT64( 0x4000000000000000 );
2785     }
2786     shift64RightJamming( aSig, - expDiff, &aSig );
2787     bSig |= LIT64( 0x4000000000000000 );
2788  bBigger:
2789     zSig = bSig - aSig;
2790     zExp = bExp;
2791     zSign ^= 1;
2792     goto normalizeRoundAndPack;
2793  aExpBigger:
2794     if ( aExp == 0x7FF ) {
2795         if ( aSig ) return propagateFloat64NaN( a, b );
2796         return a;
2797     }
2798     if ( bExp == 0 ) {
2799         --expDiff;
2800     }
2801     else {
2802         bSig |= LIT64( 0x4000000000000000 );
2803     }
2804     shift64RightJamming( bSig, expDiff, &bSig );
2805     aSig |= LIT64( 0x4000000000000000 );
2806  aBigger:
2807     zSig = aSig - bSig;
2808     zExp = aExp;
2809  normalizeRoundAndPack:
2810     --zExp;
2811     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2812 
2813 }
2814 
2815 /*
2816 -------------------------------------------------------------------------------
2817 Returns the result of adding the double-precision floating-point values `a'
2818 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2819 Binary Floating-Point Arithmetic.
2820 -------------------------------------------------------------------------------
2821 */
float64_add(float64 a,float64 b)2822 float64 float64_add( float64 a, float64 b )
2823 {
2824     flag aSign, bSign;
2825 
2826     aSign = extractFloat64Sign( a );
2827     bSign = extractFloat64Sign( b );
2828     if ( aSign == bSign ) {
2829         return addFloat64Sigs( a, b, aSign );
2830     }
2831     else {
2832         return subFloat64Sigs( a, b, aSign );
2833     }
2834 
2835 }
2836 
2837 /*
2838 -------------------------------------------------------------------------------
2839 Returns the result of subtracting the double-precision floating-point values
2840 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2841 for Binary Floating-Point Arithmetic.
2842 -------------------------------------------------------------------------------
2843 */
float64_sub(float64 a,float64 b)2844 float64 float64_sub( float64 a, float64 b )
2845 {
2846     flag aSign, bSign;
2847 
2848     aSign = extractFloat64Sign( a );
2849     bSign = extractFloat64Sign( b );
2850     if ( aSign == bSign ) {
2851         return subFloat64Sigs( a, b, aSign );
2852     }
2853     else {
2854         return addFloat64Sigs( a, b, aSign );
2855     }
2856 
2857 }
2858 
2859 /*
2860 -------------------------------------------------------------------------------
2861 Returns the result of multiplying the double-precision floating-point values
2862 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2863 for Binary Floating-Point Arithmetic.
2864 -------------------------------------------------------------------------------
2865 */
float64_mul(float64 a,float64 b)2866 float64 float64_mul( float64 a, float64 b )
2867 {
2868     flag aSign, bSign, zSign;
2869     int16 aExp, bExp, zExp;
2870     bits64 aSig, bSig, zSig0, zSig1;
2871 
2872     aSig = extractFloat64Frac( a );
2873     aExp = extractFloat64Exp( a );
2874     aSign = extractFloat64Sign( a );
2875     bSig = extractFloat64Frac( b );
2876     bExp = extractFloat64Exp( b );
2877     bSign = extractFloat64Sign( b );
2878     zSign = aSign ^ bSign;
2879     if ( aExp == 0x7FF ) {
2880         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2881             return propagateFloat64NaN( a, b );
2882         }
2883         if ( ( bExp | bSig ) == 0 ) {
2884             float_raise( float_flag_invalid );
2885             return float64_default_nan;
2886         }
2887         return packFloat64( zSign, 0x7FF, 0 );
2888     }
2889     if ( bExp == 0x7FF ) {
2890         if ( bSig ) return propagateFloat64NaN( a, b );
2891         if ( ( aExp | aSig ) == 0 ) {
2892             float_raise( float_flag_invalid );
2893             return float64_default_nan;
2894         }
2895         return packFloat64( zSign, 0x7FF, 0 );
2896     }
2897     if ( aExp == 0 ) {
2898         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2899         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2900     }
2901     if ( bExp == 0 ) {
2902         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2903         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2904     }
2905     zExp = aExp + bExp - 0x3FF;
2906     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2907     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2908     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2909     zSig0 |= ( zSig1 != 0 );
2910     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2911         zSig0 <<= 1;
2912         --zExp;
2913     }
2914     return roundAndPackFloat64( zSign, zExp, zSig0 );
2915 
2916 }
2917 
2918 /*
2919 -------------------------------------------------------------------------------
2920 Returns the result of dividing the double-precision floating-point value `a'
2921 by the corresponding value `b'.  The operation is performed according to
2922 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 -------------------------------------------------------------------------------
2924 */
float64_div(float64 a,float64 b)2925 float64 float64_div( float64 a, float64 b )
2926 {
2927     flag aSign, bSign, zSign;
2928     int16 aExp, bExp, zExp;
2929     bits64 aSig, bSig, zSig;
2930     bits64 rem0, rem1;
2931     bits64 term0, term1;
2932 
2933     aSig = extractFloat64Frac( a );
2934     aExp = extractFloat64Exp( a );
2935     aSign = extractFloat64Sign( a );
2936     bSig = extractFloat64Frac( b );
2937     bExp = extractFloat64Exp( b );
2938     bSign = extractFloat64Sign( b );
2939     zSign = aSign ^ bSign;
2940     if ( aExp == 0x7FF ) {
2941         if ( aSig ) return propagateFloat64NaN( a, b );
2942         if ( bExp == 0x7FF ) {
2943             if ( bSig ) return propagateFloat64NaN( a, b );
2944             float_raise( float_flag_invalid );
2945             return float64_default_nan;
2946         }
2947         return packFloat64( zSign, 0x7FF, 0 );
2948     }
2949     if ( bExp == 0x7FF ) {
2950         if ( bSig ) return propagateFloat64NaN( a, b );
2951         return packFloat64( zSign, 0, 0 );
2952     }
2953     if ( bExp == 0 ) {
2954         if ( bSig == 0 ) {
2955             if ( ( aExp | aSig ) == 0 ) {
2956                 float_raise( float_flag_invalid );
2957                 return float64_default_nan;
2958             }
2959             float_raise( float_flag_divbyzero );
2960             return packFloat64( zSign, 0x7FF, 0 );
2961         }
2962         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2963     }
2964     if ( aExp == 0 ) {
2965         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2966         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2967     }
2968     zExp = aExp - bExp + 0x3FD;
2969     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2970     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2971     if ( bSig <= ( aSig + aSig ) ) {
2972         aSig >>= 1;
2973         ++zExp;
2974     }
2975     zSig = estimateDiv128To64( aSig, 0, bSig );
2976     if ( ( zSig & 0x1FF ) <= 2 ) {
2977         mul64To128( bSig, zSig, &term0, &term1 );
2978         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2979         while ( (sbits64) rem0 < 0 ) {
2980             --zSig;
2981             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2982         }
2983         zSig |= ( rem1 != 0 );
2984     }
2985     return roundAndPackFloat64( zSign, zExp, zSig );
2986 
2987 }
2988 
2989 #ifndef SOFTFLOAT_FOR_GCC
2990 /*
2991 -------------------------------------------------------------------------------
2992 Returns the remainder of the double-precision floating-point value `a'
2993 with respect to the corresponding value `b'.  The operation is performed
2994 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2995 -------------------------------------------------------------------------------
2996 */
float64_rem(float64 a,float64 b)2997 float64 float64_rem( float64 a, float64 b )
2998 {
2999     flag aSign, bSign, zSign;
3000     int16 aExp, bExp, expDiff;
3001     bits64 aSig, bSig;
3002     bits64 q, alternateASig;
3003     sbits64 sigMean;
3004 
3005     aSig = extractFloat64Frac( a );
3006     aExp = extractFloat64Exp( a );
3007     aSign = extractFloat64Sign( a );
3008     bSig = extractFloat64Frac( b );
3009     bExp = extractFloat64Exp( b );
3010     bSign = extractFloat64Sign( b );
3011     if ( aExp == 0x7FF ) {
3012         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3013             return propagateFloat64NaN( a, b );
3014         }
3015         float_raise( float_flag_invalid );
3016         return float64_default_nan;
3017     }
3018     if ( bExp == 0x7FF ) {
3019         if ( bSig ) return propagateFloat64NaN( a, b );
3020         return a;
3021     }
3022     if ( bExp == 0 ) {
3023         if ( bSig == 0 ) {
3024             float_raise( float_flag_invalid );
3025             return float64_default_nan;
3026         }
3027         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3028     }
3029     if ( aExp == 0 ) {
3030         if ( aSig == 0 ) return a;
3031         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3032     }
3033     expDiff = aExp - bExp;
3034     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3035     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3036     if ( expDiff < 0 ) {
3037         if ( expDiff < -1 ) return a;
3038         aSig >>= 1;
3039     }
3040     q = ( bSig <= aSig );
3041     if ( q ) aSig -= bSig;
3042     expDiff -= 64;
3043     while ( 0 < expDiff ) {
3044         q = estimateDiv128To64( aSig, 0, bSig );
3045         q = ( 2 < q ) ? q - 2 : 0;
3046         aSig = - ( ( bSig>>2 ) * q );
3047         expDiff -= 62;
3048     }
3049     expDiff += 64;
3050     if ( 0 < expDiff ) {
3051         q = estimateDiv128To64( aSig, 0, bSig );
3052         q = ( 2 < q ) ? q - 2 : 0;
3053         q >>= 64 - expDiff;
3054         bSig >>= 2;
3055         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3056     }
3057     else {
3058         aSig >>= 2;
3059         bSig >>= 2;
3060     }
3061     do {
3062         alternateASig = aSig;
3063         ++q;
3064         aSig -= bSig;
3065     } while ( 0 <= (sbits64) aSig );
3066     sigMean = aSig + alternateASig;
3067     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3068         aSig = alternateASig;
3069     }
3070     zSign = ( (sbits64) aSig < 0 );
3071     if ( zSign ) aSig = - aSig;
3072     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3073 
3074 }
3075 
3076 /*
3077 -------------------------------------------------------------------------------
3078 Returns the square root of the double-precision floating-point value `a'.
3079 The operation is performed according to the IEC/IEEE Standard for Binary
3080 Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3082 */
float64_sqrt(float64 a)3083 float64 float64_sqrt( float64 a )
3084 {
3085     flag aSign;
3086     int16 aExp, zExp;
3087     bits64 aSig, zSig, doubleZSig;
3088     bits64 rem0, rem1, term0, term1;
3089 
3090     aSig = extractFloat64Frac( a );
3091     aExp = extractFloat64Exp( a );
3092     aSign = extractFloat64Sign( a );
3093     if ( aExp == 0x7FF ) {
3094         if ( aSig ) return propagateFloat64NaN( a, a );
3095         if ( ! aSign ) return a;
3096         float_raise( float_flag_invalid );
3097         return float64_default_nan;
3098     }
3099     if ( aSign ) {
3100         if ( ( aExp | aSig ) == 0 ) return a;
3101         float_raise( float_flag_invalid );
3102         return float64_default_nan;
3103     }
3104     if ( aExp == 0 ) {
3105         if ( aSig == 0 ) return 0;
3106         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3107     }
3108     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3109     aSig |= LIT64( 0x0010000000000000 );
3110     zSig = estimateSqrt32( aExp, aSig>>21 );
3111     aSig <<= 9 - ( aExp & 1 );
3112     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3113     if ( ( zSig & 0x1FF ) <= 5 ) {
3114         doubleZSig = zSig<<1;
3115         mul64To128( zSig, zSig, &term0, &term1 );
3116         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3117         while ( (sbits64) rem0 < 0 ) {
3118             --zSig;
3119             doubleZSig -= 2;
3120             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3121         }
3122         zSig |= ( ( rem0 | rem1 ) != 0 );
3123     }
3124     return roundAndPackFloat64( 0, zExp, zSig );
3125 
3126 }
3127 #endif
3128 
3129 /*
3130 -------------------------------------------------------------------------------
3131 Returns 1 if the double-precision floating-point value `a' is equal to the
3132 corresponding value `b', and 0 otherwise.  The comparison is performed
3133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3134 -------------------------------------------------------------------------------
3135 */
float64_eq(float64 a,float64 b)3136 flag float64_eq( float64 a, float64 b )
3137 {
3138 
3139     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3140          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3141        ) {
3142         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3143             float_raise( float_flag_invalid );
3144         }
3145         return 0;
3146     }
3147     return ( a == b ) ||
3148         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3149 
3150 }
3151 
3152 /*
3153 -------------------------------------------------------------------------------
3154 Returns 1 if the double-precision floating-point value `a' is less than or
3155 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3156 performed according to the IEC/IEEE Standard for Binary Floating-Point
3157 Arithmetic.
3158 -------------------------------------------------------------------------------
3159 */
float64_le(float64 a,float64 b)3160 flag float64_le( float64 a, float64 b )
3161 {
3162     flag aSign, bSign;
3163 
3164     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3165          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3166        ) {
3167         float_raise( float_flag_invalid );
3168         return 0;
3169     }
3170     aSign = extractFloat64Sign( a );
3171     bSign = extractFloat64Sign( b );
3172     if ( aSign != bSign )
3173         return aSign ||
3174             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3175               0 );
3176     return ( a == b ) ||
3177         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3178 
3179 }
3180 
3181 /*
3182 -------------------------------------------------------------------------------
3183 Returns 1 if the double-precision floating-point value `a' is less than
3184 the corresponding value `b', and 0 otherwise.  The comparison is performed
3185 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3186 -------------------------------------------------------------------------------
3187 */
float64_lt(float64 a,float64 b)3188 flag float64_lt( float64 a, float64 b )
3189 {
3190     flag aSign, bSign;
3191 
3192     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3193          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3194        ) {
3195         float_raise( float_flag_invalid );
3196         return 0;
3197     }
3198     aSign = extractFloat64Sign( a );
3199     bSign = extractFloat64Sign( b );
3200     if ( aSign != bSign )
3201         return aSign &&
3202             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3203               0 );
3204     return ( a != b ) &&
3205         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3206 
3207 }
3208 
3209 #ifndef SOFTFLOAT_FOR_GCC
3210 /*
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is equal to the
3213 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3214 if either operand is a NaN.  Otherwise, the comparison is performed
3215 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3217 */
float64_eq_signaling(float64 a,float64 b)3218 flag float64_eq_signaling( float64 a, float64 b )
3219 {
3220 
3221     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3222          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3223        ) {
3224         float_raise( float_flag_invalid );
3225         return 0;
3226     }
3227     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3228 
3229 }
3230 
3231 /*
3232 -------------------------------------------------------------------------------
3233 Returns 1 if the double-precision floating-point value `a' is less than or
3234 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3235 cause an exception.  Otherwise, the comparison is performed according to the
3236 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3237 -------------------------------------------------------------------------------
3238 */
float64_le_quiet(float64 a,float64 b)3239 flag float64_le_quiet( float64 a, float64 b )
3240 {
3241     flag aSign, bSign;
3242 
3243     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3244          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3245        ) {
3246         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3247             float_raise( float_flag_invalid );
3248         }
3249         return 0;
3250     }
3251     aSign = extractFloat64Sign( a );
3252     bSign = extractFloat64Sign( b );
3253     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3254     return ( a == b ) || ( aSign ^ ( a < b ) );
3255 
3256 }
3257 
3258 /*
3259 -------------------------------------------------------------------------------
3260 Returns 1 if the double-precision floating-point value `a' is less than
3261 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3262 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3263 Standard for Binary Floating-Point Arithmetic.
3264 -------------------------------------------------------------------------------
3265 */
float64_lt_quiet(float64 a,float64 b)3266 flag float64_lt_quiet( float64 a, float64 b )
3267 {
3268     flag aSign, bSign;
3269 
3270     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3271          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3272        ) {
3273         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3274             float_raise( float_flag_invalid );
3275         }
3276         return 0;
3277     }
3278     aSign = extractFloat64Sign( a );
3279     bSign = extractFloat64Sign( b );
3280     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3281     return ( a != b ) && ( aSign ^ ( a < b ) );
3282 
3283 }
3284 #endif
3285 
3286 #ifdef FLOATX80
3287 
3288 /*
3289 -------------------------------------------------------------------------------
3290 Returns the result of converting the extended double-precision floating-
3291 point value `a' to the 32-bit two's complement integer format.  The
3292 conversion is performed according to the IEC/IEEE Standard for Binary
3293 Floating-Point Arithmetic---which means in particular that the conversion
3294 is rounded according to the current rounding mode.  If `a' is a NaN, the
3295 largest positive integer is returned.  Otherwise, if the conversion
3296 overflows, the largest integer with the same sign as `a' is returned.
3297 -------------------------------------------------------------------------------
3298 */
floatx80_to_int32(floatx80 a)3299 int32 floatx80_to_int32( floatx80 a )
3300 {
3301     flag aSign;
3302     int32 aExp, shiftCount;
3303     bits64 aSig;
3304 
3305     aSig = extractFloatx80Frac( a );
3306     aExp = extractFloatx80Exp( a );
3307     aSign = extractFloatx80Sign( a );
3308     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3309     shiftCount = 0x4037 - aExp;
3310     if ( shiftCount <= 0 ) shiftCount = 1;
3311     shift64RightJamming( aSig, shiftCount, &aSig );
3312     return roundAndPackInt32( aSign, aSig );
3313 
3314 }
3315 
3316 /*
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 32-bit two's complement integer format.  The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic, except that the conversion is always rounded
3322 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3323 Otherwise, if the conversion overflows, the largest integer with the same
3324 sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3326 */
floatx80_to_int32_round_to_zero(floatx80 a)3327 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3328 {
3329     flag aSign;
3330     int32 aExp, shiftCount;
3331     bits64 aSig, savedASig;
3332     int32 z;
3333 
3334     aSig = extractFloatx80Frac( a );
3335     aExp = extractFloatx80Exp( a );
3336     aSign = extractFloatx80Sign( a );
3337     if ( 0x401E < aExp ) {
3338         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3339         goto invalid;
3340     }
3341     else if ( aExp < 0x3FFF ) {
3342         if ( aExp || aSig ) set_float_exception_inexact_flag();
3343         return 0;
3344     }
3345     shiftCount = 0x403E - aExp;
3346     savedASig = aSig;
3347     aSig >>= shiftCount;
3348     z = aSig;
3349     if ( aSign ) z = - z;
3350     if ( ( z < 0 ) ^ aSign ) {
3351  invalid:
3352         float_raise( float_flag_invalid );
3353         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3354     }
3355     if ( ( aSig<<shiftCount ) != savedASig ) {
3356         set_float_exception_inexact_flag();
3357     }
3358     return z;
3359 
3360 }
3361 
3362 /*
3363 -------------------------------------------------------------------------------
3364 Returns the result of converting the extended double-precision floating-
3365 point value `a' to the 64-bit two's complement integer format.  The
3366 conversion is performed according to the IEC/IEEE Standard for Binary
3367 Floating-Point Arithmetic---which means in particular that the conversion
3368 is rounded according to the current rounding mode.  If `a' is a NaN,
3369 the largest positive integer is returned.  Otherwise, if the conversion
3370 overflows, the largest integer with the same sign as `a' is returned.
3371 -------------------------------------------------------------------------------
3372 */
floatx80_to_int64(floatx80 a)3373 int64 floatx80_to_int64( floatx80 a )
3374 {
3375     flag aSign;
3376     int32 aExp, shiftCount;
3377     bits64 aSig, aSigExtra;
3378 
3379     aSig = extractFloatx80Frac( a );
3380     aExp = extractFloatx80Exp( a );
3381     aSign = extractFloatx80Sign( a );
3382     shiftCount = 0x403E - aExp;
3383     if ( shiftCount <= 0 ) {
3384         if ( shiftCount ) {
3385             float_raise( float_flag_invalid );
3386             if (    ! aSign
3387                  || (    ( aExp == 0x7FFF )
3388                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3389                ) {
3390                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3391             }
3392             return (sbits64) LIT64( 0x8000000000000000 );
3393         }
3394         aSigExtra = 0;
3395     }
3396     else {
3397         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3398     }
3399     return roundAndPackInt64( aSign, aSig, aSigExtra );
3400 
3401 }
3402 
3403 /*
3404 -------------------------------------------------------------------------------
3405 Returns the result of converting the extended double-precision floating-
3406 point value `a' to the 64-bit two's complement integer format.  The
3407 conversion is performed according to the IEC/IEEE Standard for Binary
3408 Floating-Point Arithmetic, except that the conversion is always rounded
3409 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3410 Otherwise, if the conversion overflows, the largest integer with the same
3411 sign as `a' is returned.
3412 -------------------------------------------------------------------------------
3413 */
floatx80_to_int64_round_to_zero(floatx80 a)3414 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3415 {
3416     flag aSign;
3417     int32 aExp, shiftCount;
3418     bits64 aSig;
3419     int64 z;
3420 
3421     aSig = extractFloatx80Frac( a );
3422     aExp = extractFloatx80Exp( a );
3423     aSign = extractFloatx80Sign( a );
3424     shiftCount = aExp - 0x403E;
3425     if ( 0 <= shiftCount ) {
3426         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3427         if ( ( a.high != 0xC03E ) || aSig ) {
3428             float_raise( float_flag_invalid );
3429             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3430                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3431             }
3432         }
3433         return (sbits64) LIT64( 0x8000000000000000 );
3434     }
3435     else if ( aExp < 0x3FFF ) {
3436         if ( aExp | aSig ) set_float_exception_inexact_flag();
3437         return 0;
3438     }
3439     z = aSig>>( - shiftCount );
3440     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3441         set_float_exception_inexact_flag();
3442     }
3443     if ( aSign ) z = - z;
3444     return z;
3445 
3446 }
3447 
3448 /*
3449 -------------------------------------------------------------------------------
3450 Returns the result of converting the extended double-precision floating-
3451 point value `a' to the single-precision floating-point format.  The
3452 conversion is performed according to the IEC/IEEE Standard for Binary
3453 Floating-Point Arithmetic.
3454 -------------------------------------------------------------------------------
3455 */
floatx80_to_float32(floatx80 a)3456 float32 floatx80_to_float32( floatx80 a )
3457 {
3458     flag aSign;
3459     int32 aExp;
3460     bits64 aSig;
3461 
3462     aSig = extractFloatx80Frac( a );
3463     aExp = extractFloatx80Exp( a );
3464     aSign = extractFloatx80Sign( a );
3465     if ( aExp == 0x7FFF ) {
3466         if ( (bits64) ( aSig<<1 ) ) {
3467             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3468         }
3469         return packFloat32( aSign, 0xFF, 0 );
3470     }
3471     shift64RightJamming( aSig, 33, &aSig );
3472     if ( aExp || aSig ) aExp -= 0x3F81;
3473     return roundAndPackFloat32( aSign, aExp, aSig );
3474 
3475 }
3476 
3477 /*
3478 -------------------------------------------------------------------------------
3479 Returns the result of converting the extended double-precision floating-
3480 point value `a' to the double-precision floating-point format.  The
3481 conversion is performed according to the IEC/IEEE Standard for Binary
3482 Floating-Point Arithmetic.
3483 -------------------------------------------------------------------------------
3484 */
floatx80_to_float64(floatx80 a)3485 float64 floatx80_to_float64( floatx80 a )
3486 {
3487     flag aSign;
3488     int32 aExp;
3489     bits64 aSig, zSig;
3490 
3491     aSig = extractFloatx80Frac( a );
3492     aExp = extractFloatx80Exp( a );
3493     aSign = extractFloatx80Sign( a );
3494     if ( aExp == 0x7FFF ) {
3495         if ( (bits64) ( aSig<<1 ) ) {
3496             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3497         }
3498         return packFloat64( aSign, 0x7FF, 0 );
3499     }
3500     shift64RightJamming( aSig, 1, &zSig );
3501     if ( aExp || aSig ) aExp -= 0x3C01;
3502     return roundAndPackFloat64( aSign, aExp, zSig );
3503 
3504 }
3505 
3506 #ifdef FLOAT128
3507 
3508 /*
3509 -------------------------------------------------------------------------------
3510 Returns the result of converting the extended double-precision floating-
3511 point value `a' to the quadruple-precision floating-point format.  The
3512 conversion is performed according to the IEC/IEEE Standard for Binary
3513 Floating-Point Arithmetic.
3514 -------------------------------------------------------------------------------
3515 */
floatx80_to_float128(floatx80 a)3516 float128 floatx80_to_float128( floatx80 a )
3517 {
3518     flag aSign;
3519     int16 aExp;
3520     bits64 aSig, zSig0, zSig1;
3521 
3522     aSig = extractFloatx80Frac( a );
3523     aExp = extractFloatx80Exp( a );
3524     aSign = extractFloatx80Sign( a );
3525     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3526         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3527     }
3528     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3529     return packFloat128( aSign, aExp, zSig0, zSig1 );
3530 
3531 }
3532 
3533 #endif
3534 
3535 /*
3536 -------------------------------------------------------------------------------
3537 Rounds the extended double-precision floating-point value `a' to an integer,
3538 and returns the result as an extended quadruple-precision floating-point
3539 value.  The operation is performed according to the IEC/IEEE Standard for
3540 Binary Floating-Point Arithmetic.
3541 -------------------------------------------------------------------------------
3542 */
floatx80_round_to_int(floatx80 a)3543 floatx80 floatx80_round_to_int( floatx80 a )
3544 {
3545     flag aSign;
3546     int32 aExp;
3547     bits64 lastBitMask, roundBitsMask;
3548     int8 roundingMode;
3549     floatx80 z;
3550 
3551     aExp = extractFloatx80Exp( a );
3552     if ( 0x403E <= aExp ) {
3553         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3554             return propagateFloatx80NaN( a, a );
3555         }
3556         return a;
3557     }
3558     if ( aExp < 0x3FFF ) {
3559         if (    ( aExp == 0 )
3560              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3561             return a;
3562         }
3563         set_float_exception_inexact_flag();
3564         aSign = extractFloatx80Sign( a );
3565         switch ( float_rounding_mode ) {
3566          case float_round_nearest_even:
3567             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3568                ) {
3569                 return
3570                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3571             }
3572             break;
3573          case float_round_to_zero:
3574             break;
3575          case float_round_down:
3576             return
3577                   aSign ?
3578                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3579                 : packFloatx80( 0, 0, 0 );
3580          case float_round_up:
3581             return
3582                   aSign ? packFloatx80( 1, 0, 0 )
3583                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3584         }
3585         return packFloatx80( aSign, 0, 0 );
3586     }
3587     lastBitMask = 1;
3588     lastBitMask <<= 0x403E - aExp;
3589     roundBitsMask = lastBitMask - 1;
3590     z = a;
3591     roundingMode = float_rounding_mode;
3592     if ( roundingMode == float_round_nearest_even ) {
3593         z.low += lastBitMask>>1;
3594         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3595     }
3596     else if ( roundingMode != float_round_to_zero ) {
3597         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3598             z.low += roundBitsMask;
3599         }
3600     }
3601     z.low &= ~ roundBitsMask;
3602     if ( z.low == 0 ) {
3603         ++z.high;
3604         z.low = LIT64( 0x8000000000000000 );
3605     }
3606     if ( z.low != a.low ) set_float_exception_inexact_flag();
3607     return z;
3608 
3609 }
3610 
3611 /*
3612 -------------------------------------------------------------------------------
3613 Returns the result of adding the absolute values of the extended double-
3614 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3615 negated before being returned.  `zSign' is ignored if the result is a NaN.
3616 The addition is performed according to the IEC/IEEE Standard for Binary
3617 Floating-Point Arithmetic.
3618 -------------------------------------------------------------------------------
3619 */
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3620 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3621 {
3622     int32 aExp, bExp, zExp;
3623     bits64 aSig, bSig, zSig0, zSig1;
3624     int32 expDiff;
3625 
3626     aSig = extractFloatx80Frac( a );
3627     aExp = extractFloatx80Exp( a );
3628     bSig = extractFloatx80Frac( b );
3629     bExp = extractFloatx80Exp( b );
3630     expDiff = aExp - bExp;
3631     if ( 0 < expDiff ) {
3632         if ( aExp == 0x7FFF ) {
3633             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3634             return a;
3635         }
3636         if ( bExp == 0 ) --expDiff;
3637         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3638         zExp = aExp;
3639     }
3640     else if ( expDiff < 0 ) {
3641         if ( bExp == 0x7FFF ) {
3642             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3643             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3644         }
3645         if ( aExp == 0 ) ++expDiff;
3646         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3647         zExp = bExp;
3648     }
3649     else {
3650         if ( aExp == 0x7FFF ) {
3651             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3652                 return propagateFloatx80NaN( a, b );
3653             }
3654             return a;
3655         }
3656         zSig1 = 0;
3657         zSig0 = aSig + bSig;
3658         if ( aExp == 0 ) {
3659             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3660             goto roundAndPack;
3661         }
3662         zExp = aExp;
3663         goto shiftRight1;
3664     }
3665     zSig0 = aSig + bSig;
3666     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3667  shiftRight1:
3668     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3669     zSig0 |= LIT64( 0x8000000000000000 );
3670     ++zExp;
3671  roundAndPack:
3672     return
3673         roundAndPackFloatx80(
3674             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3675 
3676 }
3677 
3678 /*
3679 -------------------------------------------------------------------------------
3680 Returns the result of subtracting the absolute values of the extended
3681 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3682 difference is negated before being returned.  `zSign' is ignored if the
3683 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3684 Standard for Binary Floating-Point Arithmetic.
3685 -------------------------------------------------------------------------------
3686 */
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3687 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3688 {
3689     int32 aExp, bExp, zExp;
3690     bits64 aSig, bSig, zSig0, zSig1;
3691     int32 expDiff;
3692     floatx80 z;
3693 
3694     aSig = extractFloatx80Frac( a );
3695     aExp = extractFloatx80Exp( a );
3696     bSig = extractFloatx80Frac( b );
3697     bExp = extractFloatx80Exp( b );
3698     expDiff = aExp - bExp;
3699     if ( 0 < expDiff ) goto aExpBigger;
3700     if ( expDiff < 0 ) goto bExpBigger;
3701     if ( aExp == 0x7FFF ) {
3702         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3703             return propagateFloatx80NaN( a, b );
3704         }
3705         float_raise( float_flag_invalid );
3706         z.low = floatx80_default_nan_low;
3707         z.high = floatx80_default_nan_high;
3708         return z;
3709     }
3710     if ( aExp == 0 ) {
3711         aExp = 1;
3712         bExp = 1;
3713     }
3714     zSig1 = 0;
3715     if ( bSig < aSig ) goto aBigger;
3716     if ( aSig < bSig ) goto bBigger;
3717     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3718  bExpBigger:
3719     if ( bExp == 0x7FFF ) {
3720         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3721         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3722     }
3723     if ( aExp == 0 ) ++expDiff;
3724     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3725  bBigger:
3726     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3727     zExp = bExp;
3728     zSign ^= 1;
3729     goto normalizeRoundAndPack;
3730  aExpBigger:
3731     if ( aExp == 0x7FFF ) {
3732         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3733         return a;
3734     }
3735     if ( bExp == 0 ) --expDiff;
3736     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3737  aBigger:
3738     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3739     zExp = aExp;
3740  normalizeRoundAndPack:
3741     return
3742         normalizeRoundAndPackFloatx80(
3743             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3744 
3745 }
3746 
3747 /*
3748 -------------------------------------------------------------------------------
3749 Returns the result of adding the extended double-precision floating-point
3750 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3751 Standard for Binary Floating-Point Arithmetic.
3752 -------------------------------------------------------------------------------
3753 */
floatx80_add(floatx80 a,floatx80 b)3754 floatx80 floatx80_add( floatx80 a, floatx80 b )
3755 {
3756     flag aSign, bSign;
3757 
3758     aSign = extractFloatx80Sign( a );
3759     bSign = extractFloatx80Sign( b );
3760     if ( aSign == bSign ) {
3761         return addFloatx80Sigs( a, b, aSign );
3762     }
3763     else {
3764         return subFloatx80Sigs( a, b, aSign );
3765     }
3766 
3767 }
3768 
3769 /*
3770 -------------------------------------------------------------------------------
3771 Returns the result of subtracting the extended double-precision floating-
3772 point values `a' and `b'.  The operation is performed according to the
3773 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3774 -------------------------------------------------------------------------------
3775 */
floatx80_sub(floatx80 a,floatx80 b)3776 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3777 {
3778     flag aSign, bSign;
3779 
3780     aSign = extractFloatx80Sign( a );
3781     bSign = extractFloatx80Sign( b );
3782     if ( aSign == bSign ) {
3783         return subFloatx80Sigs( a, b, aSign );
3784     }
3785     else {
3786         return addFloatx80Sigs( a, b, aSign );
3787     }
3788 
3789 }
3790 
3791 /*
3792 -------------------------------------------------------------------------------
3793 Returns the result of multiplying the extended double-precision floating-
3794 point values `a' and `b'.  The operation is performed according to the
3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 -------------------------------------------------------------------------------
3797 */
floatx80_mul(floatx80 a,floatx80 b)3798 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3799 {
3800     flag aSign, bSign, zSign;
3801     int32 aExp, bExp, zExp;
3802     bits64 aSig, bSig, zSig0, zSig1;
3803     floatx80 z;
3804 
3805     aSig = extractFloatx80Frac( a );
3806     aExp = extractFloatx80Exp( a );
3807     aSign = extractFloatx80Sign( a );
3808     bSig = extractFloatx80Frac( b );
3809     bExp = extractFloatx80Exp( b );
3810     bSign = extractFloatx80Sign( b );
3811     zSign = aSign ^ bSign;
3812     if ( aExp == 0x7FFF ) {
3813         if (    (bits64) ( aSig<<1 )
3814              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3815             return propagateFloatx80NaN( a, b );
3816         }
3817         if ( ( bExp | bSig ) == 0 ) goto invalid;
3818         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3819     }
3820     if ( bExp == 0x7FFF ) {
3821         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3822         if ( ( aExp | aSig ) == 0 ) {
3823  invalid:
3824             float_raise( float_flag_invalid );
3825             z.low = floatx80_default_nan_low;
3826             z.high = floatx80_default_nan_high;
3827             return z;
3828         }
3829         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3830     }
3831     if ( aExp == 0 ) {
3832         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3833         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3834     }
3835     if ( bExp == 0 ) {
3836         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3837         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3838     }
3839     zExp = aExp + bExp - 0x3FFE;
3840     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3841     if ( 0 < (sbits64) zSig0 ) {
3842         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3843         --zExp;
3844     }
3845     return
3846         roundAndPackFloatx80(
3847             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3848 
3849 }
3850 
3851 /*
3852 -------------------------------------------------------------------------------
3853 Returns the result of dividing the extended double-precision floating-point
3854 value `a' by the corresponding value `b'.  The operation is performed
3855 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 -------------------------------------------------------------------------------
3857 */
floatx80_div(floatx80 a,floatx80 b)3858 floatx80 floatx80_div( floatx80 a, floatx80 b )
3859 {
3860     flag aSign, bSign, zSign;
3861     int32 aExp, bExp, zExp;
3862     bits64 aSig, bSig, zSig0, zSig1;
3863     bits64 rem0, rem1, rem2, term0, term1, term2;
3864     floatx80 z;
3865 
3866     aSig = extractFloatx80Frac( a );
3867     aExp = extractFloatx80Exp( a );
3868     aSign = extractFloatx80Sign( a );
3869     bSig = extractFloatx80Frac( b );
3870     bExp = extractFloatx80Exp( b );
3871     bSign = extractFloatx80Sign( b );
3872     zSign = aSign ^ bSign;
3873     if ( aExp == 0x7FFF ) {
3874         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875         if ( bExp == 0x7FFF ) {
3876             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877             goto invalid;
3878         }
3879         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3880     }
3881     if ( bExp == 0x7FFF ) {
3882         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3883         return packFloatx80( zSign, 0, 0 );
3884     }
3885     if ( bExp == 0 ) {
3886         if ( bSig == 0 ) {
3887             if ( ( aExp | aSig ) == 0 ) {
3888  invalid:
3889                 float_raise( float_flag_invalid );
3890                 z.low = floatx80_default_nan_low;
3891                 z.high = floatx80_default_nan_high;
3892                 return z;
3893             }
3894             float_raise( float_flag_divbyzero );
3895             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3896         }
3897         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3898     }
3899     if ( aExp == 0 ) {
3900         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3901         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3902     }
3903     zExp = aExp - bExp + 0x3FFE;
3904     rem1 = 0;
3905     if ( bSig <= aSig ) {
3906         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3907         ++zExp;
3908     }
3909     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3910     mul64To128( bSig, zSig0, &term0, &term1 );
3911     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3912     while ( (sbits64) rem0 < 0 ) {
3913         --zSig0;
3914         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3915     }
3916     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3917     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3918         mul64To128( bSig, zSig1, &term1, &term2 );
3919         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3920         while ( (sbits64) rem1 < 0 ) {
3921             --zSig1;
3922             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3923         }
3924         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3925     }
3926     return
3927         roundAndPackFloatx80(
3928             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3929 
3930 }
3931 
3932 /*
3933 -------------------------------------------------------------------------------
3934 Returns the remainder of the extended double-precision floating-point value
3935 `a' with respect to the corresponding value `b'.  The operation is performed
3936 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3937 -------------------------------------------------------------------------------
3938 */
floatx80_rem(floatx80 a,floatx80 b)3939 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3940 {
3941     flag aSign, bSign, zSign;
3942     int32 aExp, bExp, expDiff;
3943     bits64 aSig0, aSig1, bSig;
3944     bits64 q, term0, term1, alternateASig0, alternateASig1;
3945     floatx80 z;
3946 
3947     aSig0 = extractFloatx80Frac( a );
3948     aExp = extractFloatx80Exp( a );
3949     aSign = extractFloatx80Sign( a );
3950     bSig = extractFloatx80Frac( b );
3951     bExp = extractFloatx80Exp( b );
3952     bSign = extractFloatx80Sign( b );
3953     if ( aExp == 0x7FFF ) {
3954         if (    (bits64) ( aSig0<<1 )
3955              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3956             return propagateFloatx80NaN( a, b );
3957         }
3958         goto invalid;
3959     }
3960     if ( bExp == 0x7FFF ) {
3961         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3962         return a;
3963     }
3964     if ( bExp == 0 ) {
3965         if ( bSig == 0 ) {
3966  invalid:
3967             float_raise( float_flag_invalid );
3968             z.low = floatx80_default_nan_low;
3969             z.high = floatx80_default_nan_high;
3970             return z;
3971         }
3972         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3973     }
3974     if ( aExp == 0 ) {
3975         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3976         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3977     }
3978     bSig |= LIT64( 0x8000000000000000 );
3979     zSign = aSign;
3980     expDiff = aExp - bExp;
3981     aSig1 = 0;
3982     if ( expDiff < 0 ) {
3983         if ( expDiff < -1 ) return a;
3984         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3985         expDiff = 0;
3986     }
3987     q = ( bSig <= aSig0 );
3988     if ( q ) aSig0 -= bSig;
3989     expDiff -= 64;
3990     while ( 0 < expDiff ) {
3991         q = estimateDiv128To64( aSig0, aSig1, bSig );
3992         q = ( 2 < q ) ? q - 2 : 0;
3993         mul64To128( bSig, q, &term0, &term1 );
3994         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3995         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3996         expDiff -= 62;
3997     }
3998     expDiff += 64;
3999     if ( 0 < expDiff ) {
4000         q = estimateDiv128To64( aSig0, aSig1, bSig );
4001         q = ( 2 < q ) ? q - 2 : 0;
4002         q >>= 64 - expDiff;
4003         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4004         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4005         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4006         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4007             ++q;
4008             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4009         }
4010     }
4011     else {
4012         term1 = 0;
4013         term0 = bSig;
4014     }
4015     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4016     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4017          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4018               && ( q & 1 ) )
4019        ) {
4020         aSig0 = alternateASig0;
4021         aSig1 = alternateASig1;
4022         zSign = ! zSign;
4023     }
4024     return
4025         normalizeRoundAndPackFloatx80(
4026             80, zSign, bExp + expDiff, aSig0, aSig1 );
4027 
4028 }
4029 
4030 /*
4031 -------------------------------------------------------------------------------
4032 Returns the square root of the extended double-precision floating-point
4033 value `a'.  The operation is performed according to the IEC/IEEE Standard
4034 for Binary Floating-Point Arithmetic.
4035 -------------------------------------------------------------------------------
4036 */
floatx80_sqrt(floatx80 a)4037 floatx80 floatx80_sqrt( floatx80 a )
4038 {
4039     flag aSign;
4040     int32 aExp, zExp;
4041     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4042     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4043     floatx80 z;
4044 
4045     aSig0 = extractFloatx80Frac( a );
4046     aExp = extractFloatx80Exp( a );
4047     aSign = extractFloatx80Sign( a );
4048     if ( aExp == 0x7FFF ) {
4049         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4050         if ( ! aSign ) return a;
4051         goto invalid;
4052     }
4053     if ( aSign ) {
4054         if ( ( aExp | aSig0 ) == 0 ) return a;
4055  invalid:
4056         float_raise( float_flag_invalid );
4057         z.low = floatx80_default_nan_low;
4058         z.high = floatx80_default_nan_high;
4059         return z;
4060     }
4061     if ( aExp == 0 ) {
4062         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4063         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4064     }
4065     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4066     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4067     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4068     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4069     doubleZSig0 = zSig0<<1;
4070     mul64To128( zSig0, zSig0, &term0, &term1 );
4071     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4072     while ( (sbits64) rem0 < 0 ) {
4073         --zSig0;
4074         doubleZSig0 -= 2;
4075         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4076     }
4077     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4078     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4079         if ( zSig1 == 0 ) zSig1 = 1;
4080         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4081         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4082         mul64To128( zSig1, zSig1, &term2, &term3 );
4083         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4084         while ( (sbits64) rem1 < 0 ) {
4085             --zSig1;
4086             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4087             term3 |= 1;
4088             term2 |= doubleZSig0;
4089             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4090         }
4091         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4092     }
4093     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4094     zSig0 |= doubleZSig0;
4095     return
4096         roundAndPackFloatx80(
4097             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4098 
4099 }
4100 
4101 /*
4102 -------------------------------------------------------------------------------
4103 Returns 1 if the extended double-precision floating-point value `a' is
4104 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4105 performed according to the IEC/IEEE Standard for Binary Floating-Point
4106 Arithmetic.
4107 -------------------------------------------------------------------------------
4108 */
floatx80_eq(floatx80 a,floatx80 b)4109 flag floatx80_eq( floatx80 a, floatx80 b )
4110 {
4111 
4112     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4113               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4114          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4115               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4116        ) {
4117         if (    floatx80_is_signaling_nan( a )
4118              || floatx80_is_signaling_nan( b ) ) {
4119             float_raise( float_flag_invalid );
4120         }
4121         return 0;
4122     }
4123     return
4124            ( a.low == b.low )
4125         && (    ( a.high == b.high )
4126              || (    ( a.low == 0 )
4127                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4128            );
4129 
4130 }
4131 
4132 /*
4133 -------------------------------------------------------------------------------
4134 Returns 1 if the extended double-precision floating-point value `a' is
4135 less than or equal to the corresponding value `b', and 0 otherwise.  The
4136 comparison is performed according to the IEC/IEEE Standard for Binary
4137 Floating-Point Arithmetic.
4138 -------------------------------------------------------------------------------
4139 */
floatx80_le(floatx80 a,floatx80 b)4140 flag floatx80_le( floatx80 a, floatx80 b )
4141 {
4142     flag aSign, bSign;
4143 
4144     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4145               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4146          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4147               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4148        ) {
4149         float_raise( float_flag_invalid );
4150         return 0;
4151     }
4152     aSign = extractFloatx80Sign( a );
4153     bSign = extractFloatx80Sign( b );
4154     if ( aSign != bSign ) {
4155         return
4156                aSign
4157             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4158                  == 0 );
4159     }
4160     return
4161           aSign ? le128( b.high, b.low, a.high, a.low )
4162         : le128( a.high, a.low, b.high, b.low );
4163 
4164 }
4165 
4166 /*
4167 -------------------------------------------------------------------------------
4168 Returns 1 if the extended double-precision floating-point value `a' is
4169 less than the corresponding value `b', and 0 otherwise.  The comparison
4170 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4171 Arithmetic.
4172 -------------------------------------------------------------------------------
4173 */
floatx80_lt(floatx80 a,floatx80 b)4174 flag floatx80_lt( floatx80 a, floatx80 b )
4175 {
4176     flag aSign, bSign;
4177 
4178     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4179               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4180          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4181               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4182        ) {
4183         float_raise( float_flag_invalid );
4184         return 0;
4185     }
4186     aSign = extractFloatx80Sign( a );
4187     bSign = extractFloatx80Sign( b );
4188     if ( aSign != bSign ) {
4189         return
4190                aSign
4191             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4192                  != 0 );
4193     }
4194     return
4195           aSign ? lt128( b.high, b.low, a.high, a.low )
4196         : lt128( a.high, a.low, b.high, b.low );
4197 
4198 }
4199 
4200 /*
4201 -------------------------------------------------------------------------------
4202 Returns 1 if the extended double-precision floating-point value `a' is equal
4203 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4204 raised if either operand is a NaN.  Otherwise, the comparison is performed
4205 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4206 -------------------------------------------------------------------------------
4207 */
floatx80_eq_signaling(floatx80 a,floatx80 b)4208 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4209 {
4210 
4211     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4212               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4213          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4214               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4215        ) {
4216         float_raise( float_flag_invalid );
4217         return 0;
4218     }
4219     return
4220            ( a.low == b.low )
4221         && (    ( a.high == b.high )
4222              || (    ( a.low == 0 )
4223                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4224            );
4225 
4226 }
4227 
4228 /*
4229 -------------------------------------------------------------------------------
4230 Returns 1 if the extended double-precision floating-point value `a' is less
4231 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4232 do not cause an exception.  Otherwise, the comparison is performed according
4233 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 -------------------------------------------------------------------------------
4235 */
floatx80_le_quiet(floatx80 a,floatx80 b)4236 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4237 {
4238     flag aSign, bSign;
4239 
4240     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4241               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4242          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4243               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4244        ) {
4245         if (    floatx80_is_signaling_nan( a )
4246              || floatx80_is_signaling_nan( b ) ) {
4247             float_raise( float_flag_invalid );
4248         }
4249         return 0;
4250     }
4251     aSign = extractFloatx80Sign( a );
4252     bSign = extractFloatx80Sign( b );
4253     if ( aSign != bSign ) {
4254         return
4255                aSign
4256             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4257                  == 0 );
4258     }
4259     return
4260           aSign ? le128( b.high, b.low, a.high, a.low )
4261         : le128( a.high, a.low, b.high, b.low );
4262 
4263 }
4264 
4265 /*
4266 -------------------------------------------------------------------------------
4267 Returns 1 if the extended double-precision floating-point value `a' is less
4268 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4269 an exception.  Otherwise, the comparison is performed according to the
4270 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 -------------------------------------------------------------------------------
4272 */
floatx80_lt_quiet(floatx80 a,floatx80 b)4273 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4274 {
4275     flag aSign, bSign;
4276 
4277     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4278               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4279          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4280               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4281        ) {
4282         if (    floatx80_is_signaling_nan( a )
4283              || floatx80_is_signaling_nan( b ) ) {
4284             float_raise( float_flag_invalid );
4285         }
4286         return 0;
4287     }
4288     aSign = extractFloatx80Sign( a );
4289     bSign = extractFloatx80Sign( b );
4290     if ( aSign != bSign ) {
4291         return
4292                aSign
4293             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4294                  != 0 );
4295     }
4296     return
4297           aSign ? lt128( b.high, b.low, a.high, a.low )
4298         : lt128( a.high, a.low, b.high, b.low );
4299 
4300 }
4301 
4302 #endif
4303 
4304 #ifdef FLOAT128
4305 
4306 /*
4307 -------------------------------------------------------------------------------
4308 Returns the result of converting the quadruple-precision floating-point
4309 value `a' to the 32-bit two's complement integer format.  The conversion
4310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4311 Arithmetic---which means in particular that the conversion is rounded
4312 according to the current rounding mode.  If `a' is a NaN, the largest
4313 positive integer is returned.  Otherwise, if the conversion overflows, the
4314 largest integer with the same sign as `a' is returned.
4315 -------------------------------------------------------------------------------
4316 */
float128_to_int32(float128 a)4317 int32 float128_to_int32( float128 a )
4318 {
4319     flag aSign;
4320     int32 aExp, shiftCount;
4321     bits64 aSig0, aSig1;
4322 
4323     aSig1 = extractFloat128Frac1( a );
4324     aSig0 = extractFloat128Frac0( a );
4325     aExp = extractFloat128Exp( a );
4326     aSign = extractFloat128Sign( a );
4327     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4328     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4329     aSig0 |= ( aSig1 != 0 );
4330     shiftCount = 0x4028 - aExp;
4331     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4332     return roundAndPackInt32( aSign, aSig0 );
4333 
4334 }
4335 
4336 /*
4337 -------------------------------------------------------------------------------
4338 Returns the result of converting the quadruple-precision floating-point
4339 value `a' to the 32-bit two's complement integer format.  The conversion
4340 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4341 Arithmetic, except that the conversion is always rounded toward zero.  If
4342 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4343 conversion overflows, the largest integer with the same sign as `a' is
4344 returned.
4345 -------------------------------------------------------------------------------
4346 */
float128_to_int32_round_to_zero(float128 a)4347 int32 float128_to_int32_round_to_zero( float128 a )
4348 {
4349     flag aSign;
4350     int32 aExp, shiftCount;
4351     bits64 aSig0, aSig1, savedASig;
4352     int32 z;
4353 
4354     aSig1 = extractFloat128Frac1( a );
4355     aSig0 = extractFloat128Frac0( a );
4356     aExp = extractFloat128Exp( a );
4357     aSign = extractFloat128Sign( a );
4358     aSig0 |= ( aSig1 != 0 );
4359     if ( 0x401E < aExp ) {
4360         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4361         goto invalid;
4362     }
4363     else if ( aExp < 0x3FFF ) {
4364         if ( aExp || aSig0 ) set_float_exception_inexact_flag();
4365         return 0;
4366     }
4367     aSig0 |= LIT64( 0x0001000000000000 );
4368     shiftCount = 0x402F - aExp;
4369     savedASig = aSig0;
4370     aSig0 >>= shiftCount;
4371     z = (int32)aSig0;
4372     if ( aSign ) z = - z;
4373     if ( ( z < 0 ) ^ aSign ) {
4374  invalid:
4375         float_raise( float_flag_invalid );
4376         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4377     }
4378     if ( ( aSig0<<shiftCount ) != savedASig ) {
4379         set_float_exception_inexact_flag();
4380     }
4381     return z;
4382 
4383 }
4384 
4385 /*
4386 -------------------------------------------------------------------------------
4387 Returns the result of converting the quadruple-precision floating-point
4388 value `a' to the 64-bit two's complement integer format.  The conversion
4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4390 Arithmetic---which means in particular that the conversion is rounded
4391 according to the current rounding mode.  If `a' is a NaN, the largest
4392 positive integer is returned.  Otherwise, if the conversion overflows, the
4393 largest integer with the same sign as `a' is returned.
4394 -------------------------------------------------------------------------------
4395 */
float128_to_int64(float128 a)4396 int64 float128_to_int64( float128 a )
4397 {
4398     flag aSign;
4399     int32 aExp, shiftCount;
4400     bits64 aSig0, aSig1;
4401 
4402     aSig1 = extractFloat128Frac1( a );
4403     aSig0 = extractFloat128Frac0( a );
4404     aExp = extractFloat128Exp( a );
4405     aSign = extractFloat128Sign( a );
4406     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4407     shiftCount = 0x402F - aExp;
4408     if ( shiftCount <= 0 ) {
4409         if ( 0x403E < aExp ) {
4410             float_raise( float_flag_invalid );
4411             if (    ! aSign
4412                  || (    ( aExp == 0x7FFF )
4413                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4414                     )
4415                ) {
4416                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4417             }
4418             return (sbits64) LIT64( 0x8000000000000000 );
4419         }
4420         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4421     }
4422     else {
4423         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4424     }
4425     return roundAndPackInt64( aSign, aSig0, aSig1 );
4426 
4427 }
4428 
4429 /*
4430 -------------------------------------------------------------------------------
4431 Returns the result of converting the quadruple-precision floating-point
4432 value `a' to the 64-bit two's complement integer format.  The conversion
4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4434 Arithmetic, except that the conversion is always rounded toward zero.
4435 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4436 the conversion overflows, the largest integer with the same sign as `a' is
4437 returned.
4438 -------------------------------------------------------------------------------
4439 */
float128_to_int64_round_to_zero(float128 a)4440 int64 float128_to_int64_round_to_zero( float128 a )
4441 {
4442     flag aSign;
4443     int32 aExp, shiftCount;
4444     bits64 aSig0, aSig1;
4445     int64 z;
4446 
4447     aSig1 = extractFloat128Frac1( a );
4448     aSig0 = extractFloat128Frac0( a );
4449     aExp = extractFloat128Exp( a );
4450     aSign = extractFloat128Sign( a );
4451     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4452     shiftCount = aExp - 0x402F;
4453     if ( 0 < shiftCount ) {
4454         if ( 0x403E <= aExp ) {
4455             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4456             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4457                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4458                 if ( aSig1 ) set_float_exception_inexact_flag();
4459             }
4460             else {
4461                 float_raise( float_flag_invalid );
4462                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4463                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4464                 }
4465             }
4466             return (sbits64) LIT64( 0x8000000000000000 );
4467         }
4468         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4469         if ( (bits64) ( aSig1<<shiftCount ) ) {
4470             set_float_exception_inexact_flag();
4471         }
4472     }
4473     else {
4474         if ( aExp < 0x3FFF ) {
4475             if ( aExp | aSig0 | aSig1 ) {
4476                 set_float_exception_inexact_flag();
4477             }
4478             return 0;
4479         }
4480         z = aSig0>>( - shiftCount );
4481         if (    aSig1
4482              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4483             set_float_exception_inexact_flag();
4484         }
4485     }
4486     if ( aSign ) z = - z;
4487     return z;
4488 
4489 }
4490 
4491 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4492     && defined(SOFTFLOAT_NEED_FIXUNS)
4493 /*
4494  * just like above - but do not care for overflow of signed results
4495  */
float128_to_uint64_round_to_zero(float128 a)4496 uint64 float128_to_uint64_round_to_zero( float128 a )
4497 {
4498     flag aSign;
4499     int32 aExp, shiftCount;
4500     bits64 aSig0, aSig1;
4501     uint64 z;
4502 
4503     aSig1 = extractFloat128Frac1( a );
4504     aSig0 = extractFloat128Frac0( a );
4505     aExp = extractFloat128Exp( a );
4506     aSign = extractFloat128Sign( a );
4507     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4508     shiftCount = aExp - 0x402F;
4509     if ( 0 < shiftCount ) {
4510         if ( 0x403F <= aExp ) {
4511             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4512             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4513                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4514                 if ( aSig1 ) set_float_exception_inexact_flag();
4515             }
4516             else {
4517                 float_raise( float_flag_invalid );
4518             }
4519             return LIT64( 0xFFFFFFFFFFFFFFFF );
4520         }
4521         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4522         if ( (bits64) ( aSig1<<shiftCount ) ) {
4523             set_float_exception_inexact_flag();
4524         }
4525     }
4526     else {
4527         if ( aExp < 0x3FFF ) {
4528             if ( aExp | aSig0 | aSig1 ) {
4529                 set_float_exception_inexact_flag();
4530             }
4531             return 0;
4532         }
4533         z = aSig0>>( - shiftCount );
4534         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4535             set_float_exception_inexact_flag();
4536         }
4537     }
4538     if ( aSign ) z = - z;
4539     return z;
4540 
4541 }
4542 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4543 
4544 /*
4545 -------------------------------------------------------------------------------
4546 Returns the result of converting the quadruple-precision floating-point
4547 value `a' to the single-precision floating-point format.  The conversion
4548 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4549 Arithmetic.
4550 -------------------------------------------------------------------------------
4551 */
float128_to_float32(float128 a)4552 float32 float128_to_float32( float128 a )
4553 {
4554     flag aSign;
4555     int32 aExp;
4556     bits64 aSig0, aSig1;
4557     bits32 zSig;
4558 
4559     aSig1 = extractFloat128Frac1( a );
4560     aSig0 = extractFloat128Frac0( a );
4561     aExp = extractFloat128Exp( a );
4562     aSign = extractFloat128Sign( a );
4563     if ( aExp == 0x7FFF ) {
4564         if ( aSig0 | aSig1 ) {
4565             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4566         }
4567         return packFloat32( aSign, 0xFF, 0 );
4568     }
4569     aSig0 |= ( aSig1 != 0 );
4570     shift64RightJamming( aSig0, 18, &aSig0 );
4571     zSig = (bits32)aSig0;
4572     if ( aExp || zSig ) {
4573         zSig |= 0x40000000;
4574         aExp -= 0x3F81;
4575     }
4576     return roundAndPackFloat32( aSign, aExp, zSig );
4577 
4578 }
4579 
4580 /*
4581 -------------------------------------------------------------------------------
4582 Returns the result of converting the quadruple-precision floating-point
4583 value `a' to the double-precision floating-point format.  The conversion
4584 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4585 Arithmetic.
4586 -------------------------------------------------------------------------------
4587 */
float128_to_float64(float128 a)4588 float64 float128_to_float64( float128 a )
4589 {
4590     flag aSign;
4591     int32 aExp;
4592     bits64 aSig0, aSig1;
4593 
4594     aSig1 = extractFloat128Frac1( a );
4595     aSig0 = extractFloat128Frac0( a );
4596     aExp = extractFloat128Exp( a );
4597     aSign = extractFloat128Sign( a );
4598     if ( aExp == 0x7FFF ) {
4599         if ( aSig0 | aSig1 ) {
4600             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4601         }
4602         return packFloat64( aSign, 0x7FF, 0 );
4603     }
4604     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605     aSig0 |= ( aSig1 != 0 );
4606     if ( aExp || aSig0 ) {
4607         aSig0 |= LIT64( 0x4000000000000000 );
4608         aExp -= 0x3C01;
4609     }
4610     return roundAndPackFloat64( aSign, aExp, aSig0 );
4611 
4612 }
4613 
4614 #ifdef FLOATX80
4615 
4616 /*
4617 -------------------------------------------------------------------------------
4618 Returns the result of converting the quadruple-precision floating-point
4619 value `a' to the extended double-precision floating-point format.  The
4620 conversion is performed according to the IEC/IEEE Standard for Binary
4621 Floating-Point Arithmetic.
4622 -------------------------------------------------------------------------------
4623 */
float128_to_floatx80(float128 a)4624 floatx80 float128_to_floatx80( float128 a )
4625 {
4626     flag aSign;
4627     int32 aExp;
4628     bits64 aSig0, aSig1;
4629 
4630     aSig1 = extractFloat128Frac1( a );
4631     aSig0 = extractFloat128Frac0( a );
4632     aExp = extractFloat128Exp( a );
4633     aSign = extractFloat128Sign( a );
4634     if ( aExp == 0x7FFF ) {
4635         if ( aSig0 | aSig1 ) {
4636             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4637         }
4638         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4639     }
4640     if ( aExp == 0 ) {
4641         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4642         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4643     }
4644     else {
4645         aSig0 |= LIT64( 0x0001000000000000 );
4646     }
4647     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4648     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4649 
4650 }
4651 
4652 #endif
4653 
4654 /*
4655 -------------------------------------------------------------------------------
4656 Rounds the quadruple-precision floating-point value `a' to an integer, and
4657 returns the result as a quadruple-precision floating-point value.  The
4658 operation is performed according to the IEC/IEEE Standard for Binary
4659 Floating-Point Arithmetic.
4660 -------------------------------------------------------------------------------
4661 */
float128_round_to_int(float128 a)4662 float128 float128_round_to_int( float128 a )
4663 {
4664     flag aSign;
4665     int32 aExp;
4666     bits64 lastBitMask, roundBitsMask;
4667     int8 roundingMode;
4668     float128 z;
4669 
4670     aExp = extractFloat128Exp( a );
4671     if ( 0x402F <= aExp ) {
4672         if ( 0x406F <= aExp ) {
4673             if (    ( aExp == 0x7FFF )
4674                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4675                ) {
4676                 return propagateFloat128NaN( a, a );
4677             }
4678             return a;
4679         }
4680         lastBitMask = 1;
4681         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4682         roundBitsMask = lastBitMask - 1;
4683         z = a;
4684         roundingMode = float_rounding_mode;
4685         if ( roundingMode == float_round_nearest_even ) {
4686             if ( lastBitMask ) {
4687                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4688                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4689             }
4690             else {
4691                 if ( (sbits64) z.low < 0 ) {
4692                     ++z.high;
4693                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4694                 }
4695             }
4696         }
4697         else if ( roundingMode != float_round_to_zero ) {
4698             if (   extractFloat128Sign( z )
4699                  ^ ( roundingMode == float_round_up ) ) {
4700                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4701             }
4702         }
4703         z.low &= ~ roundBitsMask;
4704     }
4705     else {
4706         if ( aExp < 0x3FFF ) {
4707             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4708             set_float_exception_inexact_flag();
4709             aSign = extractFloat128Sign( a );
4710             switch ( float_rounding_mode ) {
4711              case float_round_nearest_even:
4712                 if (    ( aExp == 0x3FFE )
4713                      && (   extractFloat128Frac0( a )
4714                           | extractFloat128Frac1( a ) )
4715                    ) {
4716                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4717                 }
4718                 break;
4719              case float_round_to_zero:
4720                 break;
4721              case float_round_down:
4722                 return
4723                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4724                     : packFloat128( 0, 0, 0, 0 );
4725              case float_round_up:
4726                 return
4727                       aSign ? packFloat128( 1, 0, 0, 0 )
4728                     : packFloat128( 0, 0x3FFF, 0, 0 );
4729             }
4730             return packFloat128( aSign, 0, 0, 0 );
4731         }
4732         lastBitMask = 1;
4733         lastBitMask <<= 0x402F - aExp;
4734         roundBitsMask = lastBitMask - 1;
4735         z.low = 0;
4736         z.high = a.high;
4737         roundingMode = float_rounding_mode;
4738         if ( roundingMode == float_round_nearest_even ) {
4739             z.high += lastBitMask>>1;
4740             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4741                 z.high &= ~ lastBitMask;
4742             }
4743         }
4744         else if ( roundingMode != float_round_to_zero ) {
4745             if (   extractFloat128Sign( z )
4746                  ^ ( roundingMode == float_round_up ) ) {
4747                 z.high |= ( a.low != 0 );
4748                 z.high += roundBitsMask;
4749             }
4750         }
4751         z.high &= ~ roundBitsMask;
4752     }
4753     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4754         set_float_exception_inexact_flag();
4755     }
4756     return z;
4757 
4758 }
4759 
4760 /*
4761 -------------------------------------------------------------------------------
4762 Returns the result of adding the absolute values of the quadruple-precision
4763 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4764 before being returned.  `zSign' is ignored if the result is a NaN.
4765 The addition is performed according to the IEC/IEEE Standard for Binary
4766 Floating-Point Arithmetic.
4767 -------------------------------------------------------------------------------
4768 */
addFloat128Sigs(float128 a,float128 b,flag zSign)4769 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4770 {
4771     int32 aExp, bExp, zExp;
4772     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4773     int32 expDiff;
4774 
4775     aSig1 = extractFloat128Frac1( a );
4776     aSig0 = extractFloat128Frac0( a );
4777     aExp = extractFloat128Exp( a );
4778     bSig1 = extractFloat128Frac1( b );
4779     bSig0 = extractFloat128Frac0( b );
4780     bExp = extractFloat128Exp( b );
4781     expDiff = aExp - bExp;
4782     if ( 0 < expDiff ) {
4783         if ( aExp == 0x7FFF ) {
4784             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4785             return a;
4786         }
4787         if ( bExp == 0 ) {
4788             --expDiff;
4789         }
4790         else {
4791             bSig0 |= LIT64( 0x0001000000000000 );
4792         }
4793         shift128ExtraRightJamming(
4794             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4795         zExp = aExp;
4796     }
4797     else if ( expDiff < 0 ) {
4798         if ( bExp == 0x7FFF ) {
4799             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4800             return packFloat128( zSign, 0x7FFF, 0, 0 );
4801         }
4802         if ( aExp == 0 ) {
4803             ++expDiff;
4804         }
4805         else {
4806             aSig0 |= LIT64( 0x0001000000000000 );
4807         }
4808         shift128ExtraRightJamming(
4809             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4810         zExp = bExp;
4811     }
4812     else {
4813         if ( aExp == 0x7FFF ) {
4814             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4815                 return propagateFloat128NaN( a, b );
4816             }
4817             return a;
4818         }
4819         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4820         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4821         zSig2 = 0;
4822         zSig0 |= LIT64( 0x0002000000000000 );
4823         zExp = aExp;
4824         goto shiftRight1;
4825     }
4826     aSig0 |= LIT64( 0x0001000000000000 );
4827     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4828     --zExp;
4829     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4830     ++zExp;
4831  shiftRight1:
4832     shift128ExtraRightJamming(
4833         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4834  roundAndPack:
4835     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4836 
4837 }
4838 
4839 /*
4840 -------------------------------------------------------------------------------
4841 Returns the result of subtracting the absolute values of the quadruple-
4842 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4843 difference is negated before being returned.  `zSign' is ignored if the
4844 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4845 Standard for Binary Floating-Point Arithmetic.
4846 -------------------------------------------------------------------------------
4847 */
subFloat128Sigs(float128 a,float128 b,flag zSign)4848 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4849 {
4850     int32 aExp, bExp, zExp;
4851     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4852     int32 expDiff;
4853     float128 z;
4854 
4855     aSig1 = extractFloat128Frac1( a );
4856     aSig0 = extractFloat128Frac0( a );
4857     aExp = extractFloat128Exp( a );
4858     bSig1 = extractFloat128Frac1( b );
4859     bSig0 = extractFloat128Frac0( b );
4860     bExp = extractFloat128Exp( b );
4861     expDiff = aExp - bExp;
4862     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4863     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4864     if ( 0 < expDiff ) goto aExpBigger;
4865     if ( expDiff < 0 ) goto bExpBigger;
4866     if ( aExp == 0x7FFF ) {
4867         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4868             return propagateFloat128NaN( a, b );
4869         }
4870         float_raise( float_flag_invalid );
4871         z.low = float128_default_nan_low;
4872         z.high = float128_default_nan_high;
4873         return z;
4874     }
4875     if ( aExp == 0 ) {
4876         aExp = 1;
4877         bExp = 1;
4878     }
4879     if ( bSig0 < aSig0 ) goto aBigger;
4880     if ( aSig0 < bSig0 ) goto bBigger;
4881     if ( bSig1 < aSig1 ) goto aBigger;
4882     if ( aSig1 < bSig1 ) goto bBigger;
4883     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4884  bExpBigger:
4885     if ( bExp == 0x7FFF ) {
4886         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4887         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4888     }
4889     if ( aExp == 0 ) {
4890         ++expDiff;
4891     }
4892     else {
4893         aSig0 |= LIT64( 0x4000000000000000 );
4894     }
4895     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896     bSig0 |= LIT64( 0x4000000000000000 );
4897  bBigger:
4898     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4899     zExp = bExp;
4900     zSign ^= 1;
4901     goto normalizeRoundAndPack;
4902  aExpBigger:
4903     if ( aExp == 0x7FFF ) {
4904         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4905         return a;
4906     }
4907     if ( bExp == 0 ) {
4908         --expDiff;
4909     }
4910     else {
4911         bSig0 |= LIT64( 0x4000000000000000 );
4912     }
4913     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4914     aSig0 |= LIT64( 0x4000000000000000 );
4915  aBigger:
4916     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4917     zExp = aExp;
4918  normalizeRoundAndPack:
4919     --zExp;
4920     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4921 
4922 }
4923 
4924 /*
4925 -------------------------------------------------------------------------------
4926 Returns the result of adding the quadruple-precision floating-point values
4927 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4928 for Binary Floating-Point Arithmetic.
4929 -------------------------------------------------------------------------------
4930 */
float128_add(float128 a,float128 b)4931 float128 float128_add( float128 a, float128 b )
4932 {
4933     flag aSign, bSign;
4934 
4935     aSign = extractFloat128Sign( a );
4936     bSign = extractFloat128Sign( b );
4937     if ( aSign == bSign ) {
4938         return addFloat128Sigs( a, b, aSign );
4939     }
4940     else {
4941         return subFloat128Sigs( a, b, aSign );
4942     }
4943 
4944 }
4945 
4946 /*
4947 -------------------------------------------------------------------------------
4948 Returns the result of subtracting the quadruple-precision floating-point
4949 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4950 Standard for Binary Floating-Point Arithmetic.
4951 -------------------------------------------------------------------------------
4952 */
float128_sub(float128 a,float128 b)4953 float128 float128_sub( float128 a, float128 b )
4954 {
4955     flag aSign, bSign;
4956 
4957     aSign = extractFloat128Sign( a );
4958     bSign = extractFloat128Sign( b );
4959     if ( aSign == bSign ) {
4960         return subFloat128Sigs( a, b, aSign );
4961     }
4962     else {
4963         return addFloat128Sigs( a, b, aSign );
4964     }
4965 
4966 }
4967 
4968 /*
4969 -------------------------------------------------------------------------------
4970 Returns the result of multiplying the quadruple-precision floating-point
4971 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4972 Standard for Binary Floating-Point Arithmetic.
4973 -------------------------------------------------------------------------------
4974 */
float128_mul(float128 a,float128 b)4975 float128 float128_mul( float128 a, float128 b )
4976 {
4977     flag aSign, bSign, zSign;
4978     int32 aExp, bExp, zExp;
4979     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4980     float128 z;
4981 
4982     aSig1 = extractFloat128Frac1( a );
4983     aSig0 = extractFloat128Frac0( a );
4984     aExp = extractFloat128Exp( a );
4985     aSign = extractFloat128Sign( a );
4986     bSig1 = extractFloat128Frac1( b );
4987     bSig0 = extractFloat128Frac0( b );
4988     bExp = extractFloat128Exp( b );
4989     bSign = extractFloat128Sign( b );
4990     zSign = aSign ^ bSign;
4991     if ( aExp == 0x7FFF ) {
4992         if (    ( aSig0 | aSig1 )
4993              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4994             return propagateFloat128NaN( a, b );
4995         }
4996         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4997         return packFloat128( zSign, 0x7FFF, 0, 0 );
4998     }
4999     if ( bExp == 0x7FFF ) {
5000         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5001         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5002  invalid:
5003             float_raise( float_flag_invalid );
5004             z.low = float128_default_nan_low;
5005             z.high = float128_default_nan_high;
5006             return z;
5007         }
5008         return packFloat128( zSign, 0x7FFF, 0, 0 );
5009     }
5010     if ( aExp == 0 ) {
5011         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5012         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5013     }
5014     if ( bExp == 0 ) {
5015         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5016         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5017     }
5018     zExp = aExp + bExp - 0x4000;
5019     aSig0 |= LIT64( 0x0001000000000000 );
5020     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5021     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5022     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5023     zSig2 |= ( zSig3 != 0 );
5024     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5025         shift128ExtraRightJamming(
5026             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5027         ++zExp;
5028     }
5029     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5030 
5031 }
5032 
5033 /*
5034 -------------------------------------------------------------------------------
5035 Returns the result of dividing the quadruple-precision floating-point value
5036 `a' by the corresponding value `b'.  The operation is performed according to
5037 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5038 -------------------------------------------------------------------------------
5039 */
float128_div(float128 a,float128 b)5040 float128 float128_div( float128 a, float128 b )
5041 {
5042     flag aSign, bSign, zSign;
5043     int32 aExp, bExp, zExp;
5044     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5045     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5046     float128 z;
5047 
5048     aSig1 = extractFloat128Frac1( a );
5049     aSig0 = extractFloat128Frac0( a );
5050     aExp = extractFloat128Exp( a );
5051     aSign = extractFloat128Sign( a );
5052     bSig1 = extractFloat128Frac1( b );
5053     bSig0 = extractFloat128Frac0( b );
5054     bExp = extractFloat128Exp( b );
5055     bSign = extractFloat128Sign( b );
5056     zSign = aSign ^ bSign;
5057     if ( aExp == 0x7FFF ) {
5058         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5059         if ( bExp == 0x7FFF ) {
5060             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061             goto invalid;
5062         }
5063         return packFloat128( zSign, 0x7FFF, 0, 0 );
5064     }
5065     if ( bExp == 0x7FFF ) {
5066         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5067         return packFloat128( zSign, 0, 0, 0 );
5068     }
5069     if ( bExp == 0 ) {
5070         if ( ( bSig0 | bSig1 ) == 0 ) {
5071             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5072  invalid:
5073                 float_raise( float_flag_invalid );
5074                 z.low = float128_default_nan_low;
5075                 z.high = float128_default_nan_high;
5076                 return z;
5077             }
5078             float_raise( float_flag_divbyzero );
5079             return packFloat128( zSign, 0x7FFF, 0, 0 );
5080         }
5081         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5082     }
5083     if ( aExp == 0 ) {
5084         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5085         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5086     }
5087     zExp = aExp - bExp + 0x3FFD;
5088     shortShift128Left(
5089         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5090     shortShift128Left(
5091         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5092     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5093         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5094         ++zExp;
5095     }
5096     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5097     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5098     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5099     while ( (sbits64) rem0 < 0 ) {
5100         --zSig0;
5101         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5102     }
5103     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5104     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5105         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5106         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5107         while ( (sbits64) rem1 < 0 ) {
5108             --zSig1;
5109             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5110         }
5111         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5112     }
5113     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5115 
5116 }
5117 
5118 /*
5119 -------------------------------------------------------------------------------
5120 Returns the remainder of the quadruple-precision floating-point value `a'
5121 with respect to the corresponding value `b'.  The operation is performed
5122 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5123 -------------------------------------------------------------------------------
5124 */
float128_rem(float128 a,float128 b)5125 float128 float128_rem( float128 a, float128 b )
5126 {
5127     flag aSign, zSign;
5128     int32 aExp, bExp, expDiff;
5129     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5130     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5131     sbits64 sigMean0;
5132     float128 z;
5133 
5134     aSig1 = extractFloat128Frac1( a );
5135     aSig0 = extractFloat128Frac0( a );
5136     aExp = extractFloat128Exp( a );
5137     aSign = extractFloat128Sign( a );
5138     bSig1 = extractFloat128Frac1( b );
5139     bSig0 = extractFloat128Frac0( b );
5140     bExp = extractFloat128Exp( b );
5141     if ( aExp == 0x7FFF ) {
5142         if (    ( aSig0 | aSig1 )
5143              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5144             return propagateFloat128NaN( a, b );
5145         }
5146         goto invalid;
5147     }
5148     if ( bExp == 0x7FFF ) {
5149         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5150         return a;
5151     }
5152     if ( bExp == 0 ) {
5153         if ( ( bSig0 | bSig1 ) == 0 ) {
5154  invalid:
5155             float_raise( float_flag_invalid );
5156             z.low = float128_default_nan_low;
5157             z.high = float128_default_nan_high;
5158             return z;
5159         }
5160         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5161     }
5162     if ( aExp == 0 ) {
5163         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5164         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5165     }
5166     expDiff = aExp - bExp;
5167     if ( expDiff < -1 ) return a;
5168     shortShift128Left(
5169         aSig0 | LIT64( 0x0001000000000000 ),
5170         aSig1,
5171         15 - ( expDiff < 0 ),
5172         &aSig0,
5173         &aSig1
5174     );
5175     shortShift128Left(
5176         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5177     q = le128( bSig0, bSig1, aSig0, aSig1 );
5178     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5179     expDiff -= 64;
5180     while ( 0 < expDiff ) {
5181         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5182         q = ( 4 < q ) ? q - 4 : 0;
5183         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5184         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5185         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5186         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5187         expDiff -= 61;
5188     }
5189     if ( -64 < expDiff ) {
5190         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5191         q = ( 4 < q ) ? q - 4 : 0;
5192         q >>= - expDiff;
5193         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5194         expDiff += 52;
5195         if ( expDiff < 0 ) {
5196             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5197         }
5198         else {
5199             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5200         }
5201         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5202         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5203     }
5204     else {
5205         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5206         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5207     }
5208     do {
5209         alternateASig0 = aSig0;
5210         alternateASig1 = aSig1;
5211         ++q;
5212         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5213     } while ( 0 <= (sbits64) aSig0 );
5214     add128(
5215         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5216     if (    ( sigMean0 < 0 )
5217          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5218         aSig0 = alternateASig0;
5219         aSig1 = alternateASig1;
5220     }
5221     zSign = ( (sbits64) aSig0 < 0 );
5222     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5223     return
5224         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5225 
5226 }
5227 
5228 /*
5229 -------------------------------------------------------------------------------
5230 Returns the square root of the quadruple-precision floating-point value `a'.
5231 The operation is performed according to the IEC/IEEE Standard for Binary
5232 Floating-Point Arithmetic.
5233 -------------------------------------------------------------------------------
5234 */
float128_sqrt(float128 a)5235 float128 float128_sqrt( float128 a )
5236 {
5237     flag aSign;
5238     int32 aExp, zExp;
5239     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5240     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5241     float128 z;
5242 
5243     aSig1 = extractFloat128Frac1( a );
5244     aSig0 = extractFloat128Frac0( a );
5245     aExp = extractFloat128Exp( a );
5246     aSign = extractFloat128Sign( a );
5247     if ( aExp == 0x7FFF ) {
5248         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5249         if ( ! aSign ) return a;
5250         goto invalid;
5251     }
5252     if ( aSign ) {
5253         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5254  invalid:
5255         float_raise( float_flag_invalid );
5256         z.low = float128_default_nan_low;
5257         z.high = float128_default_nan_high;
5258         return z;
5259     }
5260     if ( aExp == 0 ) {
5261         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5262         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5263     }
5264     zExp = (int32) ( (aExp - 0x3FFF) >> 1) + 0x3FFE;
5265     aSig0 |= LIT64( 0x0001000000000000 );
5266     zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
5267     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5268     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5269     doubleZSig0 = zSig0<<1;
5270     mul64To128( zSig0, zSig0, &term0, &term1 );
5271     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5272     while ( (sbits64) rem0 < 0 ) {
5273         --zSig0;
5274         doubleZSig0 -= 2;
5275         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5276     }
5277     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5278     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5279         if ( zSig1 == 0 ) zSig1 = 1;
5280         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5281         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5282         mul64To128( zSig1, zSig1, &term2, &term3 );
5283         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5284         while ( (sbits64) rem1 < 0 ) {
5285             --zSig1;
5286             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5287             term3 |= 1;
5288             term2 |= doubleZSig0;
5289             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5290         }
5291         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5292     }
5293     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5294     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5295 
5296 }
5297 
5298 /*
5299 -------------------------------------------------------------------------------
5300 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5301 the corresponding value `b', and 0 otherwise.  The comparison is performed
5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 -------------------------------------------------------------------------------
5304 */
float128_eq(float128 a,float128 b)5305 flag float128_eq( float128 a, float128 b )
5306 {
5307 
5308     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5309               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5311               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312        ) {
5313         if (    float128_is_signaling_nan( a )
5314              || float128_is_signaling_nan( b ) ) {
5315             float_raise( float_flag_invalid );
5316         }
5317         return 0;
5318     }
5319     return
5320            ( a.low == b.low )
5321         && (    ( a.high == b.high )
5322              || (    ( a.low == 0 )
5323                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5324            );
5325 
5326 }
5327 
5328 /*
5329 -------------------------------------------------------------------------------
5330 Returns 1 if the quadruple-precision floating-point value `a' is less than
5331 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5332 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5333 Arithmetic.
5334 -------------------------------------------------------------------------------
5335 */
float128_le(float128 a,float128 b)5336 flag float128_le( float128 a, float128 b )
5337 {
5338     flag aSign, bSign;
5339 
5340     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5341               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5342          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5343               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5344        ) {
5345         float_raise( float_flag_invalid );
5346         return 0;
5347     }
5348     aSign = extractFloat128Sign( a );
5349     bSign = extractFloat128Sign( b );
5350     if ( aSign != bSign ) {
5351         return
5352                aSign
5353             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5354                  == 0 );
5355     }
5356     return
5357           aSign ? le128( b.high, b.low, a.high, a.low )
5358         : le128( a.high, a.low, b.high, b.low );
5359 
5360 }
5361 
5362 /*
5363 -------------------------------------------------------------------------------
5364 Returns 1 if the quadruple-precision floating-point value `a' is less than
5365 the corresponding value `b', and 0 otherwise.  The comparison is performed
5366 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5367 -------------------------------------------------------------------------------
5368 */
float128_lt(float128 a,float128 b)5369 flag float128_lt( float128 a, float128 b )
5370 {
5371     flag aSign, bSign;
5372 
5373     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5374               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5375          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5376               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5377        ) {
5378         float_raise( float_flag_invalid );
5379         return 0;
5380     }
5381     aSign = extractFloat128Sign( a );
5382     bSign = extractFloat128Sign( b );
5383     if ( aSign != bSign ) {
5384         return
5385                aSign
5386             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387                  != 0 );
5388     }
5389     return
5390           aSign ? lt128( b.high, b.low, a.high, a.low )
5391         : lt128( a.high, a.low, b.high, b.low );
5392 
5393 }
5394 
5395 /*
5396 -------------------------------------------------------------------------------
5397 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5398 the corresponding value `b', and 0 otherwise.  The invalid exception is
5399 raised if either operand is a NaN.  Otherwise, the comparison is performed
5400 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 -------------------------------------------------------------------------------
5402 */
float128_eq_signaling(float128 a,float128 b)5403 flag float128_eq_signaling( float128 a, float128 b )
5404 {
5405 
5406     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5407               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5408          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5409               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5410        ) {
5411         float_raise( float_flag_invalid );
5412         return 0;
5413     }
5414     return
5415            ( a.low == b.low )
5416         && (    ( a.high == b.high )
5417              || (    ( a.low == 0 )
5418                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5419            );
5420 
5421 }
5422 
5423 /*
5424 -------------------------------------------------------------------------------
5425 Returns 1 if the quadruple-precision floating-point value `a' is less than
5426 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5427 cause an exception.  Otherwise, the comparison is performed according to the
5428 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5429 -------------------------------------------------------------------------------
5430 */
float128_le_quiet(float128 a,float128 b)5431 flag float128_le_quiet( float128 a, float128 b )
5432 {
5433     flag aSign, bSign;
5434 
5435     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5436               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5437          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5438               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5439        ) {
5440         if (    float128_is_signaling_nan( a )
5441              || float128_is_signaling_nan( b ) ) {
5442             float_raise( float_flag_invalid );
5443         }
5444         return 0;
5445     }
5446     aSign = extractFloat128Sign( a );
5447     bSign = extractFloat128Sign( b );
5448     if ( aSign != bSign ) {
5449         return
5450                aSign
5451             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5452                  == 0 );
5453     }
5454     return
5455           aSign ? le128( b.high, b.low, a.high, a.low )
5456         : le128( a.high, a.low, b.high, b.low );
5457 
5458 }
5459 
5460 /*
5461 -------------------------------------------------------------------------------
5462 Returns 1 if the quadruple-precision floating-point value `a' is less than
5463 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5464 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5465 Standard for Binary Floating-Point Arithmetic.
5466 -------------------------------------------------------------------------------
5467 */
float128_lt_quiet(float128 a,float128 b)5468 flag float128_lt_quiet( float128 a, float128 b )
5469 {
5470     flag aSign, bSign;
5471 
5472     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5473               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5474          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5475               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5476        ) {
5477         if (    float128_is_signaling_nan( a )
5478              || float128_is_signaling_nan( b ) ) {
5479             float_raise( float_flag_invalid );
5480         }
5481         return 0;
5482     }
5483     aSign = extractFloat128Sign( a );
5484     bSign = extractFloat128Sign( b );
5485     if ( aSign != bSign ) {
5486         return
5487                aSign
5488             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5489                  != 0 );
5490     }
5491     return
5492           aSign ? lt128( b.high, b.low, a.high, a.low )
5493         : lt128( a.high, a.low, b.high, b.low );
5494 
5495 }
5496 
5497 #endif
5498 
5499 
5500 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5501 
5502 /*
5503  * These two routines are not part of the original softfloat distribution.
5504  *
5505  * They are based on the corresponding conversions to integer but return
5506  * unsigned numbers instead since these functions are required by GCC.
5507  *
5508  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5509  *
5510  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5511  */
5512 
5513 /*
5514 -------------------------------------------------------------------------------
5515 Returns the result of converting the double-precision floating-point value
5516 `a' to the 32-bit unsigned integer format.  The conversion is
5517 performed according to the IEC/IEEE Standard for Binary Floating-point
5518 Arithmetic, except that the conversion is always rounded toward zero.  If
5519 `a' is a NaN, the largest positive integer is returned.  If the conversion
5520 overflows, the largest integer positive is returned.
5521 -------------------------------------------------------------------------------
5522 */
float64_to_uint32_round_to_zero(float64 a)5523 uint32 float64_to_uint32_round_to_zero( float64 a )
5524 {
5525     flag aSign;
5526     int16 aExp, shiftCount;
5527     bits64 aSig, savedASig;
5528     uint32 z;
5529 
5530     aSig = extractFloat64Frac( a );
5531     aExp = extractFloat64Exp( a );
5532     aSign = extractFloat64Sign( a );
5533 
5534     if (aSign) {
5535         float_raise( float_flag_invalid );
5536         return(0);
5537     }
5538 
5539     if ( 0x41E < aExp ) {
5540         float_raise( float_flag_invalid );
5541         return 0xffffffff;
5542     }
5543     else if ( aExp < 0x3FF ) {
5544         if ( aExp || aSig ) set_float_exception_inexact_flag();
5545         return 0;
5546     }
5547     aSig |= LIT64( 0x0010000000000000 );
5548     shiftCount = 0x433 - aExp;
5549     savedASig = aSig;
5550     aSig >>= shiftCount;
5551     z = (uint32)aSig;
5552     if ( ( aSig<<shiftCount ) != savedASig ) {
5553         set_float_exception_inexact_flag();
5554     }
5555     return z;
5556 
5557 }
5558 
5559 /*
5560 -------------------------------------------------------------------------------
5561 Returns the result of converting the single-precision floating-point value
5562 `a' to the 32-bit unsigned integer format.  The conversion is
5563 performed according to the IEC/IEEE Standard for Binary Floating-point
5564 Arithmetic, except that the conversion is always rounded toward zero.  If
5565 `a' is a NaN, the largest positive integer is returned.  If the conversion
5566 overflows, the largest positive integer is returned.
5567 -------------------------------------------------------------------------------
5568 */
float32_to_uint32_round_to_zero(float32 a)5569 uint32 float32_to_uint32_round_to_zero( float32 a )
5570 {
5571     flag aSign;
5572     int16 aExp, shiftCount;
5573     bits32 aSig;
5574     uint32 z;
5575 
5576     aSig = extractFloat32Frac( a );
5577     aExp = extractFloat32Exp( a );
5578     aSign = extractFloat32Sign( a );
5579     shiftCount = aExp - 0x9E;
5580 
5581     if (aSign) {
5582         float_raise( float_flag_invalid );
5583         return(0);
5584     }
5585     if ( 0 < shiftCount ) {
5586         float_raise( float_flag_invalid );
5587         return 0xFFFFFFFF;
5588     }
5589     else if ( aExp <= 0x7E ) {
5590         if ( aExp | aSig ) set_float_exception_inexact_flag();
5591         return 0;
5592     }
5593     aSig = ( aSig | 0x800000 )<<8;
5594     z = aSig>>( - shiftCount );
5595     if ( aSig<<( shiftCount & 31 ) ) {
5596         set_float_exception_inexact_flag();
5597     }
5598     return z;
5599 
5600 }
5601 
5602 #endif
5603