framework/common/tcuFloat.hpp

#ifndef _TCUFLOAT_HPP
#define _TCUFLOAT_HPP
/*-------------------------------------------------------------------------
 * drawElements Quality Program Tester Core
 * ----------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Reconfigurable floating-point value template.
 *//*--------------------------------------------------------------------*/

#include "tcuDefs.hpp"

// For memcpy().
#include <string.h>

namespace tcu
{

enum FloatFlags
{
	FLOAT_HAS_SIGN			= (1<<0),
	FLOAT_SUPPORT_DENORM	= (1<<1)
};

enum RoundingDirection
{
	ROUND_TO_EVEN = 0,
	ROUND_DOWNWARD,		// Towards -Inf.
	ROUND_UPWARD,		// Towards +Inf.
};

/*--------------------------------------------------------------------*//*!
 * \brief Floating-point format template
 *
 * This template implements arbitrary floating-point handling. Template
 * can be used for conversion between different formats and checking
 * various properties of floating-point values.
 *//*--------------------------------------------------------------------*/
template <typename StorageType_, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
class Float
{
public:
	typedef StorageType_ StorageType;

	enum
	{
		EXPONENT_BITS	= ExponentBits,
		MANTISSA_BITS	= MantissaBits,
		EXPONENT_BIAS	= ExponentBias,
		FLAGS			= Flags,
	};

							Float			(void);
	explicit				Float			(StorageType value);
	explicit				Float			(float v, RoundingDirection rd = ROUND_TO_EVEN);
	explicit				Float			(double v, RoundingDirection rd = ROUND_TO_EVEN);

	template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias, deUint32 OtherFlags>
	static Float			convert			(const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags>& src, RoundingDirection rd = ROUND_TO_EVEN);

	static inline Float		convert			(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>& src, RoundingDirection = ROUND_TO_EVEN) { return src; }

	/*--------------------------------------------------------------------*//*!
	 * \brief Construct floating point value
	 * \param sign		Sign. Must be +1/-1
	 * \param exponent	Exponent in range [1-ExponentBias, ExponentBias+1]
	 * \param mantissa	Mantissa bits with implicit leading bit explicitly set
	 * \return The specified float
	 *
	 * This function constructs a floating point value from its inputs.
	 * The normally implicit leading bit of the mantissa must be explicitly set.
	 * The exponent normally used for zero/subnormals is an invalid input. Such
	 * values are specified with the leading mantissa bit of zero and the lowest
	 * normal exponent (1-ExponentBias). Additionally having both exponent and
	 * mantissa set to zero is a shorthand notation for the correctly signed
	 * floating point zero. Inf and NaN must be specified directly with an
	 * exponent of ExponentBias+1 and the appropriate mantissa (with leading
	 * bit set)
	 *//*--------------------------------------------------------------------*/
	static inline Float		construct		(int sign, int exponent, StorageType mantissa);

	/*--------------------------------------------------------------------*//*!
	 * \brief Construct floating point value. Explicit version
	 * \param sign		Sign. Must be +1/-1
	 * \param exponent	Exponent in range [-ExponentBias, ExponentBias+1]
	 * \param mantissa	Mantissa bits
	 * \return The specified float
	 *
	 * This function constructs a floating point value from its inputs with
	 * minimal intervention.
	 * The sign is turned into a sign bit and the exponent bias is added.
	 * See IEEE-754 for additional information on the inputs and
	 * the encoding of special values.
	 *//*--------------------------------------------------------------------*/
	static Float			constructBits	(int sign, int exponent, StorageType mantissaBits);

	StorageType				bits			(void) const	{ return m_value;															}
	float					asFloat			(void) const;
	double					asDouble		(void) const;

	inline int				signBit			(void) const	{ return (int)(m_value >> (ExponentBits+MantissaBits)) & 1;					}
	inline StorageType		exponentBits	(void) const	{ return (m_value >> MantissaBits) & ((StorageType(1)<<ExponentBits)-1);	}
	inline StorageType		mantissaBits	(void) const	{ return m_value & ((StorageType(1)<<MantissaBits)-1);						}

	inline int				sign			(void) const	{ return signBit() ? -1 : 1;																			}
	inline int				exponent		(void) const	{ return isDenorm() ? 1	- ExponentBias : (int)exponentBits() - ExponentBias;							}
	inline StorageType		mantissa		(void) const	{ return isZero() || isDenorm() ? mantissaBits() : (mantissaBits() | (StorageType(1)<<MantissaBits));	}

	inline bool				isInf			(void) const	{ return exponentBits() == ((1<<ExponentBits)-1)	&& mantissaBits() == 0;	}
	inline bool				isNaN			(void) const	{ return exponentBits() == ((1<<ExponentBits)-1)	&& mantissaBits() != 0;	}
	inline bool				isZero			(void) const	{ return exponentBits() == 0						&& mantissaBits() == 0;	}
	inline bool				isDenorm		(void) const	{ return exponentBits() == 0						&& mantissaBits() != 0;	}

	inline bool				operator<		(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>& other) const { return this->asDouble() < other.asDouble(); }

	static Float			zero			(int sign);
	static Float			inf				(int sign);
	static Float			nan				(void);

	static Float			largestNormal	(int sign);
	static Float			smallestNormal	(int sign);

private:
	StorageType				m_value;
} DE_WARN_UNUSED_TYPE;

// Common floating-point types.
typedef Float<deUint16,  5, 10,   15, FLOAT_HAS_SIGN|FLOAT_SUPPORT_DENORM>	Float16;	//!< IEEE 754-2008 16-bit floating-point value
typedef Float<deUint32,  8, 23,  127, FLOAT_HAS_SIGN|FLOAT_SUPPORT_DENORM>	Float32;	//!< IEEE 754 32-bit floating-point value
typedef Float<deUint64, 11, 52, 1023, FLOAT_HAS_SIGN|FLOAT_SUPPORT_DENORM>	Float64;	//!< IEEE 754 64-bit floating-point value

typedef Float<deUint16,  5, 10,   15, FLOAT_HAS_SIGN>	Float16Denormless;	//!< IEEE 754-2008 16-bit floating-point value without denormalized support

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float (void)
	: m_value(0)
{
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float (StorageType value)
	: m_value(value)
{
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float (float value, RoundingDirection rd)
	: m_value(0)
{
	deUint32 u32;
	memcpy(&u32, &value, sizeof(deUint32));
	*this = convert(Float32(u32), rd);
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float (double value, RoundingDirection rd)
	: m_value(0)
{
	deUint64 u64;
	memcpy(&u64, &value, sizeof(deUint64));
	*this = convert(Float64(u64), rd);
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline float Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asFloat (void) const
{
	float		v;
	deUint32	u32		= Float32::convert(*this).bits();
	memcpy(&v, &u32, sizeof(deUint32));
	return v;
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline double Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asDouble (void) const
{
	double		v;
	deUint64	u64		= Float64::convert(*this).bits();
	memcpy(&v, &u64, sizeof(deUint64));
	return v;
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::zero (int sign)
{
	DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
	return Float(StorageType((sign > 0 ? 0ull : 1ull) << (ExponentBits+MantissaBits)));
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::inf (int sign)
{
	DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
	return Float(StorageType(((sign > 0 ? 0ull : 1ull) << (ExponentBits+MantissaBits)) | (((1ull<<ExponentBits)-1) << MantissaBits)));
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::nan (void)
{
	return Float(StorageType((1ull<<(ExponentBits+MantissaBits))-1));
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::largestNormal (int sign)
{
	DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
	return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(sign, ExponentBias, (static_cast<StorageType>(1) << (MantissaBits + 1)) - 1);
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::smallestNormal (int sign)
{
	DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
	return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(sign, 1 - ExponentBias, (static_cast<StorageType>(1) << MantissaBits));
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>
Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct
	(int sign, int exponent, StorageType mantissa)
{
	// Repurpose this otherwise invalid input as a shorthand notation for zero (no need for caller to care about internal representation)
	const bool			isShorthandZero	= exponent == 0 && mantissa == 0;

	// Handles the typical notation for zero (min exponent, mantissa 0). Note that the exponent usually used exponent (-ExponentBias) for zero/subnormals is not used.
	// Instead zero/subnormals have the (normally implicit) leading mantissa bit set to zero.
	const bool			isDenormOrZero	= (exponent == 1 - ExponentBias) && (mantissa >> MantissaBits == 0);
	const StorageType	s				= StorageType((StorageType(sign < 0 ? 1 : 0)) << (StorageType(ExponentBits+MantissaBits)));
	const StorageType	exp				= (isShorthandZero  || isDenormOrZero) ? StorageType(0) : StorageType(exponent + ExponentBias);

	DE_ASSERT(sign == +1 || sign == -1);
	DE_ASSERT(isShorthandZero || isDenormOrZero || mantissa >> MantissaBits == 1);
	DE_ASSERT(exp >> ExponentBits == 0);

	return Float(StorageType(s | (exp << MantissaBits) | (mantissa & ((StorageType(1)<<MantissaBits)-1))));
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>
Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::constructBits
	(int sign, int exponent, StorageType mantissaBits)
{
	const StorageType signBit		= static_cast<StorageType>(sign < 0 ? 1 : 0);
	const StorageType exponentBits	= static_cast<StorageType>(exponent + ExponentBias);

	DE_ASSERT(sign == +1 || sign == -1 );
	DE_ASSERT(exponentBits >> ExponentBits == 0);
	DE_ASSERT(mantissaBits >> MantissaBits == 0);

	return Float(StorageType((signBit << (ExponentBits+MantissaBits)) | (exponentBits << MantissaBits) | (mantissaBits)));
}

template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, deUint32 Flags>
template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias, deUint32 OtherFlags>
Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>
Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::convert
	(const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags>& other, RoundingDirection rd)
{
	if (!(Flags & FLOAT_HAS_SIGN) && other.sign() < 0)
	{
		// Negative number, truncate to zero.
		return zero(+1);
	}

	if (other.isInf())
	{
		return inf(other.sign());
	}

	if (other.isNaN())
	{
		return nan();
	}

	if (other.isZero())
	{
		return zero(other.sign());
	}

	const int			eMin	= 1 - ExponentBias;
	const int			eMax	= ((1<<ExponentBits)-2) - ExponentBias;

	const StorageType	s		= StorageType((StorageType(other.signBit())) << (StorageType(ExponentBits+MantissaBits))); // \note Not sign, but sign bit.
	int					e		= other.exponent();
	deUint64			m		= other.mantissa();

	// Normalize denormalized values prior to conversion.
	while (!(m & (1ull<<OtherMantissaBits)))
	{
		m <<= 1;
		e  -= 1;
	}

	if (e < eMin)
	{
		// Underflow.
		if ((Flags & FLOAT_SUPPORT_DENORM) && (eMin-e-1 <= MantissaBits))
		{
			// Shift and round.
			int			bitDiff			= (OtherMantissaBits-MantissaBits) + (eMin-e);
			deUint64	lastBitsMask	= (1ull << bitDiff) - 1ull;
			deUint64	lastBits		= (static_cast<deUint64>(m) & lastBitsMask);
			deUint64	half			= (1ull << (bitDiff - 1)) - 1;
			deUint64	bias			= (m >> bitDiff) & 1;

			switch (rd)
			{
			case ROUND_TO_EVEN:
				return Float(StorageType(s | (m + half + bias) >> bitDiff));

			case ROUND_DOWNWARD:
				m = (m >> bitDiff);
				if (lastBits != 0ull && other.sign() < 0)
				{
					m += 1;
				}
				return Float(StorageType(s | m));

			case ROUND_UPWARD:
				m = (m >> bitDiff);
				if (lastBits != 0ull && other.sign() > 0)
				{
					m += 1;
				}
				return Float(StorageType(s | m));

			default:
				DE_ASSERT(false);
				break;
			}
		}

		return zero(other.sign());
	}

	// Remove leading 1.
	m = m & ~(1ull<<OtherMantissaBits);

	if (MantissaBits < OtherMantissaBits)
	{
		// Round mantissa.
		int			bitDiff			= OtherMantissaBits-MantissaBits;
		deUint64	lastBitsMask	= (1ull << bitDiff) - 1ull;
		deUint64	lastBits		= (static_cast<deUint64>(m) & lastBitsMask);
		deUint64	half			= (1ull << (bitDiff - 1)) - 1;
		deUint64	bias			= (m >> bitDiff) & 1;

		switch (rd)
		{
		case ROUND_TO_EVEN:
			m = (m + half + bias) >> bitDiff;
			break;

		case ROUND_DOWNWARD:
			m = (m >> bitDiff);
			if (lastBits != 0ull && other.sign() < 0)
			{
				m += 1;
			}
			break;

		case ROUND_UPWARD:
			m = (m >> bitDiff);
			if (lastBits != 0ull && other.sign() > 0)
			{
				m += 1;
			}
			break;

		default:
			DE_ASSERT(false);
			break;
		}

		if (m & (1ull<<MantissaBits))
		{
			// Overflow in mantissa.
			m  = 0;
			e += 1;
		}
	}
	else
	{
		int bitDiff = MantissaBits-OtherMantissaBits;
		m = m << bitDiff;
	}

	if (e > eMax)
	{
		// Overflow.
		return (((other.sign() < 0 && rd == ROUND_UPWARD) || (other.sign() > 0 && rd == ROUND_DOWNWARD)) ? largestNormal(other.sign()) : inf(other.sign()));
	}

	DE_ASSERT(de::inRange(e, eMin, eMax));
	DE_ASSERT(((e + ExponentBias) & ~((1ull<<ExponentBits)-1)) == 0);
	DE_ASSERT((m & ~((1ull<<MantissaBits)-1)) == 0);

	return Float(StorageType(s | (StorageType(e + ExponentBias) << MantissaBits) | m));
}

} // tcu

#endif // _TCUFLOAT_HPP