blob: 85422fecee2121f6cb483666af4b9571be9eb06f [file] [log] [blame]
/* ==========================================================================
* ieee754.h -- Conversion between half, double & single-precision floats
*
* Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*
* See BSD-3-Clause license in file named "LICENSE"
*
* Created on 7/23/18
* ========================================================================== */
#ifndef ieee754_h
#define ieee754_h
#include <stdint.h>
/** @file ieee754.h
*
* This implements floating-point conversion between half, single and
* double precision floating-point numbers, in particular conversion to
* smaller representation (e.g., double to single) that does not lose
* precision for CBOR preferred serialization.
*
* This also implements conversion of floats to whole numbers as
* is required for dCBOR.
*
* This implementation works entirely with shifts and masks and does
* not require any floating-point HW or library.
*
* This conforms to IEEE 754-2008, but note that it doesn't specify
* conversions, just the encodings.
*
* This is complete, supporting +/- infinity, +/- zero, subnormals and
* NaN payloads. NaN payloads are converted to smaller by dropping the
* right most bits if they are zero and shifting to the right. If the
* rightmost bits are not zero, the conversion is not performed. When
* converting from smaller to larger, the payload is shifted left and
* zero-padded. This is what is specified by CBOR preferred
* serialization and what modern HW conversion instructions do.
*
* There is no special handling of silent and quiet NaNs. It probably
* isn't necessary to transmit these special NaNs as their purpose is
* more for propagating errors up through some calculation. In many
* cases the handling of the NaN payload will work for silent and quiet
* NaNs.
*
* A previous version of this was usable as a general library for
* conversion. This version is reduced to what is needed for CBOR.
*/
#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
/**
* @brief Convert half-precision float to double-precision float.
*
* @param[in] uHalfPrecision Half-precision number to convert.
*
* @returns double-precision value.
*
* This is a lossless conversion because every half-precision value
* can be represented as a double. There is no error condition.
*
* There is no half-precision type in C, so it is represented here as
* a @c uint16_t. The bits of @c uHalfPrecision are as described for
* half-precision by IEEE 754.
*/
double
IEEE754_HalfToDouble(uint16_t uHalfPrecision);
/** Holds a floating-point value that could be half, single or
* double-precision. The value is in a @c uint64_t that may be copied
* to a float or double. Simply casting uValue will usually work but
* may generate compiler or static analyzer warnings. Using
* UsefulBufUtil_CopyUint64ToDouble() or
* UsefulBufUtil_CopyUint32ToFloat() will not (and will not generate
* any extra code).
*/
typedef struct {
enum {IEEE754_UNION_IS_HALF = 2,
IEEE754_UNION_IS_SINGLE = 4,
IEEE754_UNION_IS_DOUBLE = 8,
} uSize; /* Size of uValue */
uint64_t uValue;
} IEEE754_union;
/** Holds result of an attempt to convert a floating-point
* number to an int64_t or uint64_t.
*/
struct IEEE754_ToInt {
enum {IEEE754_ToInt_IS_INT,
IEEE754_ToInt_IS_UINT,
IEEE754_ToInt_IS_65BIT_NEG,
IEEE754_ToInt_NO_CONVERSION,
IEEE754_ToInt_NaN
} type;
union {
uint64_t un_signed;
int64_t is_signed;
} integer;
};
/**
* @brief Convert a double to either single or half-precision.
*
* @param[in] d The value to convert.
* @param[in] bAllowHalfPrecision If true, convert to either half or
* single precision.
*
* @returns Unconverted value, or value converted to single or half-precision.
*
* This always succeeds. If the value cannot be converted without the
* loss of precision, it is not converted.
*
* This handles all subnormals and NaN payloads.
*/
IEEE754_union
IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision, int bNoNaNPayload);
/**
* @brief Convert a single-precision float to half-precision.
*
* @param[in] f The value to convert.
*
* @returns Either unconverted value or value converted to half-precision.
*
* This always succeeds. If the value cannot be converted without the
* loss of precision, it is not converted.
*
* This handles all subnormals and NaN payloads.
*/
IEEE754_union
IEEE754_SingleToHalf(float f, int bNoNanPayloads);
/**
* @brief Convert a double-precision float to an integer if whole number
*
* @param[in] d The value to convert.
*
* @returns Either converted number or conversion status.
*
* If the value is a whole number that will fit either in a uint64_t
* or an int64_t, it is converted. If it is a NaN, then there is no
* conversion and the fact that it is a NaN is indicated in the
* returned structure. If it can't be converted, then that is
* indicated in the returned structure.
*
* This always returns positive numbers as a uint64_t even if they will
* fit in an int64_t.
*
* This never fails because of precision, but may fail because of range.
*/
struct IEEE754_ToInt
IEEE754_DoubleToInt(double d);
/**
* @brief Convert a single-precision float to an integer if whole number
*
* @param[in] f The value to convert.
*
* @returns Either converted number or conversion status.
*
* If the value is a whole number that will fit either in a uint64_t
* or an int64_t, it is converted. If it is a NaN, then there is no
* conversion and the fact that it is a NaN is indicated in the
* returned structure. If it can't be converted, then that is
* indicated in the returned structure.
*
* This always returns positive numbers as a uint64_t even if they will
* fit in an int64_t.
*
* This never fails because of precision, but may fail because of range.
*/
struct IEEE754_ToInt
IEEE754_SingleToInt(float f);
/**
* @brief Convert an unsigned integer to a double with no precision loss.
*
* @param[in] uInt The value to convert.
* @param[in] uIsNegative 0 if positive, 1 if negative.
*
* @returns Either the converted number or 0.5 if no conversion.
*
* The conversion will fail if the input can not be represented in the
* 52 bits or precision that a double has. 0.5 is returned to indicate
* no conversion. It is out-of-band from non-error results, because
* all non-error results are whole integers.
*/
#define IEEE754_UINT_TO_DOUBLE_OOB 0.5
double
IEEE754_UintToDouble(uint64_t uInt, int uIsNegative);
#endif /* ! QCBOR_DISABLE_PREFERRED_FLOAT */
/**
* @brief Tests whether NaN is "quiet" vs having a payload.
*
* @param[in] dNum Double number to test.
*
* @returns 0 if a quiet NaN, 1 if it has a payload.
*
* A quiet NaN is usually represented as 0x7ff8000000000000. That is
* the significand bits are 0x8000000000000. If the significand bits
* are other than 0x8000000000000 it is considered to have a NaN
* payload.
*
* Note that 0x7ff8000000000000 is not specified in a standard, but it
* is commonly implemented and chosen by CBOR as the best way to
* represent a NaN.
*/
int
IEEE754_DoubleHasNaNPayload(double dNum);
/**
* @brief Tests whether NaN is "quiet" vs having a payload.
*
* @param[in] fNum Float number to test.
*
* @returns 0 if a quiet NaN, 1 if it has a payload.
*
* See IEEE754_DoubleHasNaNPayload(). A single precision quiet NaN
* is 0x7fc00000.
*/
int
IEEE754_SingleHasNaNPayload(float fNum);
#endif /* ieee754_h */