1 // Copyright 2015, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include <cstdio>
28
29 #include "utils-vixl.h"
30
31 namespace vixl {
32
// The default NaN values (for FPCR.DN=1).
// Each encoding is a positive quiet NaN: sign bit clear, exponent field all
// ones, and only the most-significant mantissa bit set. The values are built
// at start-up by the RawbitsTo* helpers defined later in this file.
const double kFP64DefaultNaN = RawbitsToDouble(UINT64_C(0x7ff8000000000000));
const float kFP32DefaultNaN = RawbitsToFloat(0x7fc00000);
const Float16 kFP16DefaultNaN = RawbitsToFloat16(0x7e00);

// Floating-point zero values.
// The two half-precision zeros differ only in bit 15 (the sign bit).
const Float16 kFP16PositiveZero = RawbitsToFloat16(0x0);
const Float16 kFP16NegativeZero = RawbitsToFloat16(0x8000);

// Floating-point infinity values.
// Exponent field all ones with a zero mantissa; the negative variants
// additionally set the sign bit.
const Float16 kFP16PositiveInfinity = RawbitsToFloat16(0x7c00);
const Float16 kFP16NegativeInfinity = RawbitsToFloat16(0xfc00);
const float kFP32PositiveInfinity = RawbitsToFloat(0x7f800000);
const float kFP32NegativeInfinity = RawbitsToFloat(0xff800000);
const double kFP64PositiveInfinity =
    RawbitsToDouble(UINT64_C(0x7ff0000000000000));
const double kFP64NegativeInfinity =
    RawbitsToDouble(UINT64_C(0xfff0000000000000));
51
IsZero(Float16 value)52 bool IsZero(Float16 value) {
53 uint16_t bits = Float16ToRawbits(value);
54 return (bits == Float16ToRawbits(kFP16PositiveZero) ||
55 bits == Float16ToRawbits(kFP16NegativeZero));
56 }
57
Float16ToRawbits(Float16 value)58 uint16_t Float16ToRawbits(Float16 value) { return value.rawbits_; }
59
// Return the object representation of `value` as a 32-bit unsigned integer.
uint32_t FloatToRawbits(float value) {
  uint32_t raw = 0;
  // Copy the four bytes of the float into the integer; memcpy is used rather
  // than a pointer cast to reinterpret the bits.
  memcpy(&raw, &value, sizeof(raw));
  return raw;
}
65
66
// Return the object representation of `value` as a 64-bit unsigned integer.
uint64_t DoubleToRawbits(double value) {
  uint64_t raw = 0;
  // Copy the eight bytes of the double into the integer; memcpy is used
  // rather than a pointer cast to reinterpret the bits.
  memcpy(&raw, &value, sizeof(raw));
  return raw;
}
72
73
RawbitsToFloat16(uint16_t bits)74 Float16 RawbitsToFloat16(uint16_t bits) {
75 Float16 f;
76 f.rawbits_ = bits;
77 return f;
78 }
79
80
// Reinterpret the 32-bit pattern `bits` as a float.
float RawbitsToFloat(uint32_t bits) {
  float result = 0.0f;
  // Copy the four bytes of the integer into the float's storage.
  memcpy(&result, &bits, sizeof(result));
  return result;
}
86
87
// Reinterpret the 64-bit pattern `bits` as a double.
double RawbitsToDouble(uint64_t bits) {
  double result = 0.0;
  // Copy the eight bytes of the integer into the double's storage.
  memcpy(&result, &bits, sizeof(result));
  return result;
}
93
94
Float16Sign(internal::SimFloat16 val)95 uint32_t Float16Sign(internal::SimFloat16 val) {
96 uint16_t rawbits = Float16ToRawbits(val);
97 return ExtractUnsignedBitfield32(15, 15, rawbits);
98 }
99
100
Float16Exp(internal::SimFloat16 val)101 uint32_t Float16Exp(internal::SimFloat16 val) {
102 uint16_t rawbits = Float16ToRawbits(val);
103 return ExtractUnsignedBitfield32(14, 10, rawbits);
104 }
105
Float16Mantissa(internal::SimFloat16 val)106 uint32_t Float16Mantissa(internal::SimFloat16 val) {
107 uint16_t rawbits = Float16ToRawbits(val);
108 return ExtractUnsignedBitfield32(9, 0, rawbits);
109 }
110
111
FloatSign(float val)112 uint32_t FloatSign(float val) {
113 uint32_t rawbits = FloatToRawbits(val);
114 return ExtractUnsignedBitfield32(31, 31, rawbits);
115 }
116
117
FloatExp(float val)118 uint32_t FloatExp(float val) {
119 uint32_t rawbits = FloatToRawbits(val);
120 return ExtractUnsignedBitfield32(30, 23, rawbits);
121 }
122
123
FloatMantissa(float val)124 uint32_t FloatMantissa(float val) {
125 uint32_t rawbits = FloatToRawbits(val);
126 return ExtractUnsignedBitfield32(22, 0, rawbits);
127 }
128
129
DoubleSign(double val)130 uint32_t DoubleSign(double val) {
131 uint64_t rawbits = DoubleToRawbits(val);
132 return static_cast<uint32_t>(ExtractUnsignedBitfield64(63, 63, rawbits));
133 }
134
135
DoubleExp(double val)136 uint32_t DoubleExp(double val) {
137 uint64_t rawbits = DoubleToRawbits(val);
138 return static_cast<uint32_t>(ExtractUnsignedBitfield64(62, 52, rawbits));
139 }
140
141
DoubleMantissa(double val)142 uint64_t DoubleMantissa(double val) {
143 uint64_t rawbits = DoubleToRawbits(val);
144 return ExtractUnsignedBitfield64(51, 0, rawbits);
145 }
146
147
Float16Pack(uint16_t sign,uint16_t exp,uint16_t mantissa)148 internal::SimFloat16 Float16Pack(uint16_t sign,
149 uint16_t exp,
150 uint16_t mantissa) {
151 uint16_t bits = (sign << 15) | (exp << 10) | mantissa;
152 return RawbitsToFloat16(bits);
153 }
154
155
FloatPack(uint32_t sign,uint32_t exp,uint32_t mantissa)156 float FloatPack(uint32_t sign, uint32_t exp, uint32_t mantissa) {
157 uint32_t bits = (sign << 31) | (exp << 23) | mantissa;
158 return RawbitsToFloat(bits);
159 }
160
161
DoublePack(uint64_t sign,uint64_t exp,uint64_t mantissa)162 double DoublePack(uint64_t sign, uint64_t exp, uint64_t mantissa) {
163 uint64_t bits = (sign << 63) | (exp << 52) | mantissa;
164 return RawbitsToDouble(bits);
165 }
166
167
Float16Classify(Float16 value)168 int Float16Classify(Float16 value) {
169 uint16_t bits = Float16ToRawbits(value);
170 uint16_t exponent_max = (1 << 5) - 1;
171 uint16_t exponent_mask = exponent_max << 10;
172 uint16_t mantissa_mask = (1 << 10) - 1;
173
174 uint16_t exponent = (bits & exponent_mask) >> 10;
175 uint16_t mantissa = bits & mantissa_mask;
176 if (exponent == 0) {
177 if (mantissa == 0) {
178 return FP_ZERO;
179 }
180 return FP_SUBNORMAL;
181 } else if (exponent == exponent_max) {
182 if (mantissa == 0) {
183 return FP_INFINITE;
184 }
185 return FP_NAN;
186 }
187 return FP_NORMAL;
188 }
189
190
CountClearHalfWords(uint64_t imm,unsigned reg_size)191 unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size) {
192 VIXL_ASSERT((reg_size % 8) == 0);
193 int count = 0;
194 for (unsigned i = 0; i < (reg_size / 16); i++) {
195 if ((imm & 0xffff) == 0) {
196 count++;
197 }
198 imm >>= 16;
199 }
200 return count;
201 }
202
203
BitCount(uint64_t value)204 int BitCount(uint64_t value) { return CountSetBits(value); }
205
206 // Float16 definitions.
207
Float16(double dvalue)208 Float16::Float16(double dvalue) {
209 rawbits_ =
210 Float16ToRawbits(FPToFloat16(dvalue, FPTieEven, kIgnoreDefaultNaN));
211 }
212
213 namespace internal {
214
operator -() const215 SimFloat16 SimFloat16::operator-() const {
216 return RawbitsToFloat16(rawbits_ ^ 0x8000);
217 }
218
219 // SimFloat16 definitions.
operator +(SimFloat16 rhs) const220 SimFloat16 SimFloat16::operator+(SimFloat16 rhs) const {
221 return static_cast<double>(*this) + static_cast<double>(rhs);
222 }
223
operator -(SimFloat16 rhs) const224 SimFloat16 SimFloat16::operator-(SimFloat16 rhs) const {
225 return static_cast<double>(*this) - static_cast<double>(rhs);
226 }
227
operator *(SimFloat16 rhs) const228 SimFloat16 SimFloat16::operator*(SimFloat16 rhs) const {
229 return static_cast<double>(*this) * static_cast<double>(rhs);
230 }
231
operator /(SimFloat16 rhs) const232 SimFloat16 SimFloat16::operator/(SimFloat16 rhs) const {
233 return static_cast<double>(*this) / static_cast<double>(rhs);
234 }
235
operator <(SimFloat16 rhs) const236 bool SimFloat16::operator<(SimFloat16 rhs) const {
237 return static_cast<double>(*this) < static_cast<double>(rhs);
238 }
239
operator >(SimFloat16 rhs) const240 bool SimFloat16::operator>(SimFloat16 rhs) const {
241 return static_cast<double>(*this) > static_cast<double>(rhs);
242 }
243
operator ==(SimFloat16 rhs) const244 bool SimFloat16::operator==(SimFloat16 rhs) const {
245 if (IsNaN(*this) || IsNaN(rhs)) {
246 return false;
247 } else if (IsZero(rhs) && IsZero(*this)) {
248 // +0 and -0 should be treated as equal.
249 return true;
250 }
251 return this->rawbits_ == rhs.rawbits_;
252 }
253
operator !=(SimFloat16 rhs) const254 bool SimFloat16::operator!=(SimFloat16 rhs) const { return !(*this == rhs); }
255
operator ==(double rhs) const256 bool SimFloat16::operator==(double rhs) const {
257 return static_cast<double>(*this) == static_cast<double>(rhs);
258 }
259
operator double() const260 SimFloat16::operator double() const {
261 return FPToDouble(*this, kIgnoreDefaultNaN);
262 }
263
// Overload of BitCount for the Uint32 wrapper type: applies CountSetBits to
// the value returned by `value.Get()` and returns the result as an Int64.
Int64 BitCount(Uint32 value) { return CountSetBits(value.Get()); }
265
266 } // namespace internal
267
FPToFloat(Float16 value,UseDefaultNaN DN,bool * exception)268 float FPToFloat(Float16 value, UseDefaultNaN DN, bool* exception) {
269 uint16_t bits = Float16ToRawbits(value);
270 uint32_t sign = bits >> 15;
271 uint32_t exponent =
272 ExtractUnsignedBitfield32(kFloat16MantissaBits + kFloat16ExponentBits - 1,
273 kFloat16MantissaBits,
274 bits);
275 uint32_t mantissa =
276 ExtractUnsignedBitfield32(kFloat16MantissaBits - 1, 0, bits);
277
278 switch (Float16Classify(value)) {
279 case FP_ZERO:
280 return (sign == 0) ? 0.0f : -0.0f;
281
282 case FP_INFINITE:
283 return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
284
285 case FP_SUBNORMAL: {
286 // Calculate shift required to put mantissa into the most-significant bits
287 // of the destination mantissa.
288 int shift = CountLeadingZeros(mantissa << (32 - 10));
289
290 // Shift mantissa and discard implicit '1'.
291 mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
292 mantissa &= (1 << kFloatMantissaBits) - 1;
293
294 // Adjust the exponent for the shift applied, and rebias.
295 exponent = exponent - shift + (-15 + 127);
296 break;
297 }
298
299 case FP_NAN:
300 if (IsSignallingNaN(value)) {
301 if (exception != NULL) {
302 *exception = true;
303 }
304 }
305 if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
306
307 // Convert NaNs as the processor would:
308 // - The sign is propagated.
309 // - The payload (mantissa) is transferred entirely, except that the top
310 // bit is forced to '1', making the result a quiet NaN. The unused
311 // (low-order) payload bits are set to 0.
312 exponent = (1 << kFloatExponentBits) - 1;
313
314 // Increase bits in mantissa, making low-order bits 0.
315 mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
316 mantissa |= 1 << 22; // Force a quiet NaN.
317 break;
318
319 case FP_NORMAL:
320 // Increase bits in mantissa, making low-order bits 0.
321 mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
322
323 // Change exponent bias.
324 exponent += (-15 + 127);
325 break;
326
327 default:
328 VIXL_UNREACHABLE();
329 }
330 return RawbitsToFloat((sign << 31) | (exponent << kFloatMantissaBits) |
331 mantissa);
332 }
333
334
FPToFloat(double value,FPRounding round_mode,UseDefaultNaN DN,bool * exception)335 float FPToFloat(double value,
336 FPRounding round_mode,
337 UseDefaultNaN DN,
338 bool* exception) {
339 // Only the FPTieEven rounding mode is implemented.
340 VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
341 USE(round_mode);
342
343 switch (std::fpclassify(value)) {
344 case FP_NAN: {
345 if (IsSignallingNaN(value)) {
346 if (exception != NULL) {
347 *exception = true;
348 }
349 }
350 if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
351
352 // Convert NaNs as the processor would:
353 // - The sign is propagated.
354 // - The payload (mantissa) is transferred as much as possible, except
355 // that the top bit is forced to '1', making the result a quiet NaN.
356 uint64_t raw = DoubleToRawbits(value);
357
358 uint32_t sign = raw >> 63;
359 uint32_t exponent = (1 << 8) - 1;
360 uint32_t payload =
361 static_cast<uint32_t>(ExtractUnsignedBitfield64(50, 52 - 23, raw));
362 payload |= (1 << 22); // Force a quiet NaN.
363
364 return RawbitsToFloat((sign << 31) | (exponent << 23) | payload);
365 }
366
367 case FP_ZERO:
368 case FP_INFINITE: {
369 // In a C++ cast, any value representable in the target type will be
370 // unchanged. This is always the case for +/-0.0 and infinities.
371 return static_cast<float>(value);
372 }
373
374 case FP_NORMAL:
375 case FP_SUBNORMAL: {
376 // Convert double-to-float as the processor would, assuming that FPCR.FZ
377 // (flush-to-zero) is not set.
378 uint64_t raw = DoubleToRawbits(value);
379 // Extract the IEEE-754 double components.
380 uint32_t sign = raw >> 63;
381 // Extract the exponent and remove the IEEE-754 encoding bias.
382 int32_t exponent =
383 static_cast<int32_t>(ExtractUnsignedBitfield64(62, 52, raw)) - 1023;
384 // Extract the mantissa and add the implicit '1' bit.
385 uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
386 if (std::fpclassify(value) == FP_NORMAL) {
387 mantissa |= (UINT64_C(1) << 52);
388 }
389 return FPRoundToFloat(sign, exponent, mantissa, round_mode);
390 }
391 }
392
393 VIXL_UNREACHABLE();
394 return value;
395 }
396
397 // TODO: We should consider implementing a full FPToDouble(Float16)
398 // conversion function (for performance reasons).
FPToDouble(Float16 value,UseDefaultNaN DN,bool * exception)399 double FPToDouble(Float16 value, UseDefaultNaN DN, bool* exception) {
400 // We can rely on implicit float to double conversion here.
401 return FPToFloat(value, DN, exception);
402 }
403
404
FPToDouble(float value,UseDefaultNaN DN,bool * exception)405 double FPToDouble(float value, UseDefaultNaN DN, bool* exception) {
406 switch (std::fpclassify(value)) {
407 case FP_NAN: {
408 if (IsSignallingNaN(value)) {
409 if (exception != NULL) {
410 *exception = true;
411 }
412 }
413 if (DN == kUseDefaultNaN) return kFP64DefaultNaN;
414
415 // Convert NaNs as the processor would:
416 // - The sign is propagated.
417 // - The payload (mantissa) is transferred entirely, except that the top
418 // bit is forced to '1', making the result a quiet NaN. The unused
419 // (low-order) payload bits are set to 0.
420 uint32_t raw = FloatToRawbits(value);
421
422 uint64_t sign = raw >> 31;
423 uint64_t exponent = (1 << 11) - 1;
424 uint64_t payload = ExtractUnsignedBitfield64(21, 0, raw);
425 payload <<= (52 - 23); // The unused low-order bits should be 0.
426 payload |= (UINT64_C(1) << 51); // Force a quiet NaN.
427
428 return RawbitsToDouble((sign << 63) | (exponent << 52) | payload);
429 }
430
431 case FP_ZERO:
432 case FP_NORMAL:
433 case FP_SUBNORMAL:
434 case FP_INFINITE: {
435 // All other inputs are preserved in a standard cast, because every value
436 // representable using an IEEE-754 float is also representable using an
437 // IEEE-754 double.
438 return static_cast<double>(value);
439 }
440 }
441
442 VIXL_UNREACHABLE();
443 return static_cast<double>(value);
444 }
445
446
FPToFloat16(float value,FPRounding round_mode,UseDefaultNaN DN,bool * exception)447 Float16 FPToFloat16(float value,
448 FPRounding round_mode,
449 UseDefaultNaN DN,
450 bool* exception) {
451 // Only the FPTieEven rounding mode is implemented.
452 VIXL_ASSERT(round_mode == FPTieEven);
453 USE(round_mode);
454
455 uint32_t raw = FloatToRawbits(value);
456 int32_t sign = raw >> 31;
457 int32_t exponent = ExtractUnsignedBitfield32(30, 23, raw) - 127;
458 uint32_t mantissa = ExtractUnsignedBitfield32(22, 0, raw);
459
460 switch (std::fpclassify(value)) {
461 case FP_NAN: {
462 if (IsSignallingNaN(value)) {
463 if (exception != NULL) {
464 *exception = true;
465 }
466 }
467 if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
468
469 // Convert NaNs as the processor would:
470 // - The sign is propagated.
471 // - The payload (mantissa) is transferred as much as possible, except
472 // that the top bit is forced to '1', making the result a quiet NaN.
473 uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity)
474 : Float16ToRawbits(kFP16NegativeInfinity);
475 result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
476 result |= (1 << 9); // Force a quiet NaN;
477 return RawbitsToFloat16(result);
478 }
479
480 case FP_ZERO:
481 return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero;
482
483 case FP_INFINITE:
484 return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
485
486 case FP_NORMAL:
487 case FP_SUBNORMAL: {
488 // Convert float-to-half as the processor would, assuming that FPCR.FZ
489 // (flush-to-zero) is not set.
490
491 // Add the implicit '1' bit to the mantissa.
492 mantissa += (1 << 23);
493 return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
494 }
495 }
496
497 VIXL_UNREACHABLE();
498 return kFP16PositiveZero;
499 }
500
501
FPToFloat16(double value,FPRounding round_mode,UseDefaultNaN DN,bool * exception)502 Float16 FPToFloat16(double value,
503 FPRounding round_mode,
504 UseDefaultNaN DN,
505 bool* exception) {
506 // Only the FPTieEven rounding mode is implemented.
507 VIXL_ASSERT(round_mode == FPTieEven);
508 USE(round_mode);
509
510 uint64_t raw = DoubleToRawbits(value);
511 int32_t sign = raw >> 63;
512 int64_t exponent = ExtractUnsignedBitfield64(62, 52, raw) - 1023;
513 uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
514
515 switch (std::fpclassify(value)) {
516 case FP_NAN: {
517 if (IsSignallingNaN(value)) {
518 if (exception != NULL) {
519 *exception = true;
520 }
521 }
522 if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
523
524 // Convert NaNs as the processor would:
525 // - The sign is propagated.
526 // - The payload (mantissa) is transferred as much as possible, except
527 // that the top bit is forced to '1', making the result a quiet NaN.
528 uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity)
529 : Float16ToRawbits(kFP16NegativeInfinity);
530 result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
531 result |= (1 << 9); // Force a quiet NaN;
532 return RawbitsToFloat16(result);
533 }
534
535 case FP_ZERO:
536 return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero;
537
538 case FP_INFINITE:
539 return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
540 case FP_NORMAL:
541 case FP_SUBNORMAL: {
542 // Convert double-to-half as the processor would, assuming that FPCR.FZ
543 // (flush-to-zero) is not set.
544
545 // Add the implicit '1' bit to the mantissa.
546 mantissa += (UINT64_C(1) << 52);
547 return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
548 }
549 }
550
551 VIXL_UNREACHABLE();
552 return kFP16PositiveZero;
553 }
554
555 } // namespace vixl
556