blob: 17443680c1e2996759cf34ec79560e1521d179bb [file] [log] [blame]
/* ==========================================================================
* ieee754.c -- floating-point conversion between half, double & single-precision
*
* Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
* Copyright (c) 2021, Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*
* See BSD-3-Clause license in file named "LICENSE"
*
* Created on 7/23/18
* ========================================================================== */
/*
* Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
* QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
*/
#include "qcbor/qcbor_common.h"
#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
#include "ieee754.h"
#include <string.h> /* For memcpy() */
/*
* This code has long lines and is easier to read because of
* them. Some coding guidelines prefer 80 column lines (can they not
* afford big displays?).
*
* This code works solely using shifts and masks and thus has no
* dependency on any math libraries. It can even work if the CPU
* doesn't have any floating-point support, though that isn't the most
* useful thing to do.
*
* The memcpy() dependency is only for CopyFloatToUint32() and friends,
* which are needed only to avoid type punning when converting the
* actual float bits to an unsigned value so the bit shifts and masks
* can work.
*
* The references used to write this code:
*
* IEEE 754-2008, particularly section 3.6 and 6.2.1
*
* https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
*
* https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
*
* https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
*
* https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
*
* IEEE754_FloatToDouble(uint32_t uFloat) was created but is not
* needed. It can be retrieved from github history if needed.
*/
/* ----- Half Precision ----------- */
/* Field widths, shifts and masks for a 16-bit half-precision value.
 * From most to least significant bit: 1 sign bit, 5 exponent bits,
 * 10 significand bits. */
#define HALF_NUM_SIGNIFICAND_BITS (10)
#define HALF_NUM_EXPONENT_BITS (5)
#define HALF_NUM_SIGN_BITS (1)
#define HALF_SIGNIFICAND_SHIFT (0)
#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
#define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits
#define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
#define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
#define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
/* Biased (hex)  Biased (dec)  Unbiased  Use
 *   0x00             0          -15     0 and subnormal
 *   0x01             1          -14     Smallest normal exponent
 *   0x1e            30           15     Largest normal exponent
 *   0x1f            31           16     NaN and Infinity */
/* The exponent constants below are all unbiased values. */
#define HALF_EXPONENT_BIAS (15)
#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
/* ------ Single-Precision -------- */
/* Field widths, shifts and masks for a 32-bit single-precision value.
 * From most to least significant bit: 1 sign bit, 8 exponent bits,
 * 23 significand bits. */
#define SINGLE_NUM_SIGNIFICAND_BITS (23)
#define SINGLE_NUM_EXPONENT_BITS (8)
#define SINGLE_NUM_SIGN_BITS (1)
#define SINGLE_SIGNIFICAND_SHIFT (0)
#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
#define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits
#define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
#define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign
#define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))
/* Biased (hex)  Biased (dec)  Unbiased  Use
 *   0x00             0         -127     0 and subnormal
 *   0x01             1         -126     Smallest normal exponent
 *   0x7f           127            0     Exponent of the value 1.0
 *   0xfe           254          127     Largest normal exponent
 *   0xff           255          128     NaN and Infinity */
/* The exponent constants below are all unbiased values. */
#define SINGLE_EXPONENT_BIAS (127)
#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS)
#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1)
#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS)
#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1)
/* --------- Double-Precision ---------- */
/* Field widths, shifts and masks for a 64-bit double-precision value.
 * From most to least significant bit: 1 sign bit, 11 exponent bits,
 * 52 significand bits. The masks are ULL so shifts are done in 64 bits. */
#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
#define DOUBLE_NUM_EXPONENT_BITS (11)
#define DOUBLE_NUM_SIGN_BITS (1)
#define DOUBLE_SIGNIFICAND_SHIFT (0)
#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
/* Biased (hex)  Biased (dec)  Unbiased  Use
 *   0x000            0        -1023     0 and subnormal
 *   0x001            1        -1022     Smallest normal exponent
 *   0x7fe         2046         1023     Largest normal exponent
 *   0x7ff         2047         1024     NaN and Infinity */
/* The exponent constants below are all unbiased values. */
#define DOUBLE_EXPONENT_BIAS (1023)
#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS)
#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1)
#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS)
#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1)
/*
* Convenient functions to avoid type punning, compiler warnings and
* such. The optimizer reduces them to a simple assignment. This is a
* crusty corner of C. It shouldn't be this hard.
*
* These are also in UsefulBuf.h under a different name. They are copied
* here to avoid a dependency on UsefulBuf.h. There is no object code
* size impact because these always optimize down to a simple assignment.
*/
/* Return the raw bit pattern of a float as a uint32_t. The bytes are
 * copied with memcpy() rather than read through a cast pointer so no
 * type punning is involved. */
static inline uint32_t
CopyFloatToUint32(float f)
{
   uint32_t uBits;

   memcpy(&uBits, &f, sizeof(uBits));

   return uBits;
}
/* Return the raw bit pattern of a double as a uint64_t. The bytes are
 * copied with memcpy() rather than read through a cast pointer so no
 * type punning is involved. */
static inline uint64_t
CopyDoubleToUint64(double d)
{
   uint64_t uBits;

   memcpy(&uBits, &d, sizeof(uBits));

   return uBits;
}
/* Return the double whose raw bit pattern is the given uint64_t. The
 * bytes are copied with memcpy() rather than read through a cast
 * pointer so no type punning is involved. */
static inline double
CopyUint64ToDouble(uint64_t u64)
{
   double dValue;

   memcpy(&dValue, &u64, sizeof(u64));

   return dValue;
}
/* Return the float whose raw bit pattern is the given uint32_t. The
 * bytes are copied with memcpy() rather than read through a cast
 * pointer so no type punning is involved. */
static inline float
CopyUint32ToSingle(uint32_t u32)
{
   float fValue;

   memcpy(&fValue, &u32, sizeof(u32));

   return fValue;
}
/**
 * @brief Assemble sign, significand and exponent into a double-precision float.
 *
 * @param[in] uDoubleSign              0 if positive, 1 if negative.
 * @param[in] uDoubleSignificand       Bits of the significand, already
 *                                     positioned in the low 52 bits.
 * @param[in] nDoubleUnBiasedExponent  Unbiased exponent.
 *
 * @returns The double whose bit pattern is the three fields combined.
 *
 * The exponent bias is added here, then the biased exponent and the
 * sign are shifted into their positions and OR-ed with the significand.
 * No range checking is done on any of the inputs.
 */
static double
IEEE754_AssembleDouble(uint64_t uDoubleSign,
                       uint64_t uDoubleSignificand,
                       int64_t  nDoubleUnBiasedExponent)
{
   const uint64_t uExponentField = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS) << DOUBLE_EXPONENT_SHIFT;
   const uint64_t uSignField     = uDoubleSign << DOUBLE_SIGN_SHIFT;

   return CopyUint64ToDouble(uSignField | uExponentField | uDoubleSignificand);
}
/*
 * Convert a half-precision float, given as its 16 raw bits, to a
 * double. Every half-precision value (zero, subnormal, normal,
 * infinity and NaN with its payload) has a double representation, so
 * there is no failure case.
 */
double
IEEE754_HalfToDouble(uint16_t uHalfPrecision)
{
   uint64_t uNormalized;
   int64_t  nNormalizedExponent;

   /* Split the half into its three fields. Everything is widened to
    * 64 bits right away because that is the width of the result.
    */
   const uint64_t uSign      = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
   const uint64_t uBiasedExp = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT;
   const uint64_t uFraction  = uHalfPrecision & HALF_SIGNIFICAND_MASK;
   const int64_t  nExponent  = (int64_t)uBiasedExp - HALF_EXPONENT_BIAS;

   /* The half significand moved up to the top of a double significand.
    * Used as-is for normals and NaN payloads; it is zero for infinity.
    */
   const uint64_t uWidened = uFraction << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);

   if(nExponent == HALF_EXPONENT_INF_OR_NAN) {
      /* --- INFINITY (significand zero) or NaN (payload preserved) --- */
      return IEEE754_AssembleDouble(uSign, uWidened, DOUBLE_EXPONENT_INF_OR_NAN);
   }

   if(nExponent != HALF_EXPONENT_ZERO) {
      /* --- NORMAL NUMBER --- the exponent carries over unchanged */
      return IEEE754_AssembleDouble(uSign, uWidened, nExponent);
   }

   if(uFraction == 0) {
      /* --- ZERO --- */
      return IEEE754_AssembleDouble(uSign, 0, DOUBLE_EXPONENT_ZERO);
   }

   /* --- SUBNORMAL ---
    * A half subnormal is always representable as a normal double.
    * Start at the smallest normal half exponent and shift the
    * significand left, decrementing the exponent each time, until the
    * leading 1 reaches the implied-one position (bit 10). A subnormal
    * significand is non-zero and below that bit, so at least one shift
    * always happens.
    */
   uNormalized         = uFraction;
   nNormalizedExponent = HALF_EXPONENT_MIN;
   while((uNormalized & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0) {
      uNormalized <<= 1;
      nNormalizedExponent--;
   }

   /* Drop the leading 1; in a normal number it is implied rather than
    * stored. Only bit 10 and below can be set here, so masking to the
    * low 10 bits removes exactly that bit. */
   uNormalized &= HALF_SIGNIFICAND_MASK;

   /* Move into position for a double significand */
   uNormalized <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;

   return IEEE754_AssembleDouble(uSign, uNormalized, nNormalizedExponent);
}
/**
 * @brief Assemble sign, significand and exponent into half-precision bits.
 *
 * @param[in] uHalfSign              0 if positive, 1 if negative.
 * @param[in] uHalfSignificand       Bits of the significand, already
 *                                   positioned in the low 10 bits.
 * @param[in] nHalfUnBiasedExponent  Unbiased exponent.
 *
 * @returns The bits of the half-precision value in the low 16 bits of
 *          a uint32_t.
 *
 * The exponent bias is added here, then the biased exponent and the
 * sign are shifted into their positions and OR-ed with the significand.
 * No range checking is done on any of the inputs.
 */
static uint32_t
IEEE754_AssembleHalf(uint32_t uHalfSign,
                     uint32_t uHalfSignificand,
                     int32_t  nHalfUnBiasedExponent)
{
   const uint32_t uHalfBiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS);
   uint32_t       uHalfBits;

   uHalfBits  = uHalfSign << HALF_SIGN_SHIFT;
   uHalfBits |= uHalfBiasedExponent << HALF_EXPONENT_SHIFT;
   uHalfBits |= uHalfSignificand;

   return uHalfBits;
}
/* Public function; see ieee754.h */
IEEE754_union
IEEE754_SingleToHalf(float f)
{
IEEE754_union result;
uint32_t uDroppedBits;
int32_t nExponentDifference;
int32_t nShiftAmount;
uint32_t uHalfSignificand;
/* Pull the three parts out of the double-precision float Most work
* is done with uint32_t which helps avoid integer promotions and
* static analyzer complaints.
*/
const uint32_t uSingle = CopyFloatToUint32(f);
const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT;
const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS;
const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
if(uSingleSignificand == 0) {
/* --- IS ZERO --- */
result.uSize = IEEE754_UNION_IS_HALF;
result.uValue = IEEE754_AssembleHalf(uSingleSign,
0,
HALF_EXPONENT_ZERO);
} else {
/* --- IS SINGLE SUBNORMAL --- */
/* The largest single subnormal is slightly less than the
* largest single normal which is 2^-149 or
* 2.2040517676619426e-38. The smallest half subnormal is
* 2^-14 or 5.9604644775390625E-8. There is no overlap so
* single subnormals can't be converted to halfs of any sort.
*/
result.uSize = IEEE754_UNION_IS_SINGLE;
result.uValue = uSingle;
}
} else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
if(uSingleSignificand == 0) {
/* ---- IS INFINITY ---- */
result.uSize = IEEE754_UNION_IS_HALF;
result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN);
} else {
/* The NaN can only be converted if no payload bits are lost
* per RFC 8949 section 4.1 that defines Preferred
* Serializaton. Note that Deterministically Encode CBOR in
* section 4.2 allows for some variation of this rule, but at
* the moment this implementation is of Preferred
* Serialization, not CDE. As of December 2023, we are also
* expecting an update to CDE. This code may need to be
* updated for CDE.
*/
uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS);
if(uDroppedBits == 0) {
/* --- IS CONVERTABLE NAN --- */
uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
result.uSize = IEEE754_UNION_IS_HALF;
result.uValue = IEEE754_AssembleHalf(uSingleSign,
uHalfSignificand,
HALF_EXPONENT_INF_OR_NAN);
} else {
/* --- IS UNCONVERTABLE NAN --- */
result.uSize = IEEE754_UNION_IS_SINGLE;
result.uValue = uSingle;
}
}
} else {
/* ---- REGULAR NUMBER ---- */
/* A regular single can be converted to a regular half if the
* single's exponent is in the smaller range of a half and if no
* precision is lost in the significand.
*/
if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN &&
nSingleUnbiasedExponent <= HALF_EXPONENT_MAX &&
(uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) {
uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
/* --- CONVERT TO HALF NORMAL --- */
result.uSize = IEEE754_UNION_IS_HALF;
result.uValue = IEEE754_AssembleHalf(uSingleSign,
uHalfSignificand,
nSingleUnbiasedExponent);
} else {
/* Unable to convert to a half normal. See if it can be
* converted to a half subnormal. To do that, the exponent
* must be in range and no precision can be lost in the
* signficand.
*
* This is more complicated because the number is not
* normalized. The signficand must be shifted proprotionally
* to the exponent and 1 must be added in. See
* https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
*
* Exponents -14 to -24 map to a shift of 0 to 10 of the
* significand. The largest value of a half subnormal has an
* exponent of -14. Subnormals are not normalized like
* normals meaning they lose precision as the numbers get
* smaller. Normals don't lose precision because the exponent
* allows all the bits of the significand to be significant.
*/
/* The exponent of the largest possible half-precision
* subnormal is HALF_EXPONENT_MIN (-14). Exponents larger
* than this are normal and handled above. We're going to
* shift the significand right by at least this amount.
*/
nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
/* In addition to the shift based on the exponent's value,
* the single significand has to be shifted right to fit into
* a half-precision significand */
nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
/* Must add 1 in to the possible significand because there is
* an implied 1 for normal values and not for subnormal
* values. See equations here:
* https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
*/
uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
/* If only zero bits get shifted out, this can be converted
* to subnormal */
if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN &&
nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS &&
uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) {
/* --- CONVERTABLE TO HALF SUBNORMAL --- */
result.uSize = IEEE754_UNION_IS_HALF;
result.uValue = IEEE754_AssembleHalf(uSingleSign,
uHalfSignificand,
HALF_EXPONENT_ZERO);
} else {
/* --- DO NOT CONVERT --- */
result.uSize = IEEE754_UNION_IS_SINGLE;
result.uValue = uSingle;
}
}
}
return result;
}
/**
 * @brief Assemble sign, significand and exponent into single-precision bits.
 *
 * @param[in] uSingleSign              0 if positive, 1 if negative.
 * @param[in] uSingleSignificand       Bits of the significand, already
 *                                     positioned in the low 23 bits.
 * @param[in] nSingleUnBiasedExponent  Unbiased exponent.
 *
 * @returns The bits of the single-precision value (a binary32 as
 *          specified in IEEE 754) in the low 32 bits of a uint64_t.
 *
 * The parameters and result are 64-bit rather than 32-bit for
 * convenience of the caller, which works in 64 bits. No range checking
 * is done on any of the inputs.
 */
static uint64_t
IEEE754_AssembleSingle(uint64_t uSingleSign,
                       uint64_t uSingleSignificand,
                       int64_t  nSingleUnBiasedExponent)
{
   const uint64_t uExponentField = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS) << SINGLE_EXPONENT_SHIFT;
   const uint64_t uSignField     = uSingleSign << SINGLE_SIGN_SHIFT;

   return uSignField | uExponentField | uSingleSignificand;
}
/**
 * @brief Convert a double-precision float to single-precision.
 *
 * @param[in] d  The value to convert.
 *
 * @returns Either unconverted value or value converted to single-precision.
 *
 * This always succeeds. If the value cannot be converted without the
 * loss of precision, it is not converted. The uSize member of the
 * result is IEEE754_UNION_IS_SINGLE when uValue holds single-precision
 * bits and IEEE754_UNION_IS_DOUBLE when uValue holds the unchanged
 * double bits.
 *
 * This handles all subnormals and NaN payloads.
 */
static IEEE754_union
IEEE754_DoubleToSingle(double d)
{
   IEEE754_union Result;
   int64_t       nExponentDifference;
   int64_t       nShiftAmount;
   uint64_t      uSingleSignificand;
   uint64_t      uSignificandWithOne;
   uint64_t      uDroppedBits;

   /* Pull the three parts out of the double-precision float. Most
    * work is done with uint64_t which helps avoid integer promotions
    * and static analyzer complaints.
    */
   const uint64_t uDouble                 = CopyDoubleToUint64(d);
   const uint64_t uDoubleBiasedExponent   = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT;
   const int64_t  nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS;
   const uint64_t uDoubleSign             = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
   const uint64_t uDoubleSignificand      = uDouble & DOUBLE_SIGNIFICAND_MASK;

   if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
      if(uDoubleSignificand == 0) {
         /* --- IS ZERO --- */
         Result.uSize  = IEEE754_UNION_IS_SINGLE;
         Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
                                                0,
                                                SINGLE_EXPONENT_ZERO);
      } else {
         /* --- IS DOUBLE SUBNORMAL --- */
         /* The largest double subnormal is slightly less than the
          * smallest double normal which is 2^-1022 or
          * 2.2250738585072014e-308. The smallest single subnormal
          * is 2^-149 or 1.401298464324817e-45. There is no
          * overlap so double subnormals can't be converted to
          * singles of any sort.
          */
         Result.uSize  = IEEE754_UNION_IS_DOUBLE;
         Result.uValue = uDouble;
      }
   } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
      if(uDoubleSignificand == 0) {
         /* ---- IS INFINITY ---- */
         Result.uSize  = IEEE754_UNION_IS_SINGLE;
         Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
                                                0,
                                                SINGLE_EXPONENT_INF_OR_NAN);
      } else {
         /* The NaN can only be converted if no payload bits are
          * lost per RFC 8949 section 4.1 that defines Preferred
          * Serialization. Note that Deterministically Encoded CBOR
          * in section 4.2 allows for some variation of this rule,
          * but at the moment this implementation is of Preferred
          * Serialization, not CDE. As of December 2023, we are
          * also expecting an update to CDE. This code may need to
          * be updated for CDE.
          */
         uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
         if(uDroppedBits == 0) {
            /* --- IS CONVERTABLE NAN --- */
            uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
            Result.uSize  = IEEE754_UNION_IS_SINGLE;
            Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
                                                   uSingleSignificand,
                                                   SINGLE_EXPONENT_INF_OR_NAN);
         } else {
            /* --- IS UNCONVERTABLE NAN --- */
            Result.uSize  = IEEE754_UNION_IS_DOUBLE;
            Result.uValue = uDouble;
         }
      }
   } else {
      /* ---- REGULAR NUMBER ---- */
      /* A regular double can be converted to a regular single if
       * the double's exponent is in the smaller range of a single
       * and if no precision is lost in the significand (the low 29
       * bits that a single has no room for must all be zero).
       */
      uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
      if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN &&
         nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX &&
         uDroppedBits == 0) {
         /* --- IS CONVERTABLE TO SINGLE --- */
         uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
         Result.uSize  = IEEE754_UNION_IS_SINGLE;
         Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
                                                uSingleSignificand,
                                                nDoubleUnbiasedExponent);
      } else if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN &&
                nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS) {
         /* Unable to convert to a single normal, but the exponent is
          * in the range of single subnormals (-127 to -149). It
          * converts if no precision is lost in the significand.
          *
          * This is more complicated because a subnormal is not
          * normalized. The significand must be shifted
          * proportionally to the exponent and the implied 1 must be
          * added in. See
          * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
          *
          * The exponent range is checked BEFORE any shifting.
          * Outside this range the shift count computed below would
          * be negative or not less than the width of uint64_t, and
          * such a shift is undefined behavior in C.
          */

         /* Distance below the smallest normal single exponent: 1 to 23 */
         nExponentDifference = SINGLE_EXPONENT_MIN - nDoubleUnbiasedExponent;

         /* Plus the shift needed to fit a double significand into a
          * single significand. Total is 30 to 52. */
         nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);

         /* Add in the 1 that is implied for normals but not for
          * subnormals */
         uSignificandWithOne = uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
         uSingleSignificand  = uSignificandWithOne >> nShiftAmount;

         /* If only zero bits got shifted out, shifting back gives
          * the starting value and this can be converted to subnormal */
         if((uSingleSignificand << nShiftAmount) == uSignificandWithOne) {
            /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */
            Result.uSize  = IEEE754_UNION_IS_SINGLE;
            Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
                                                   uSingleSignificand,
                                                   SINGLE_EXPONENT_ZERO);
         } else {
            /* --- CAN NOT BE CONVERTED --- precision would be lost */
            Result.uSize  = IEEE754_UNION_IS_DOUBLE;
            Result.uValue = uDouble;
         }
      } else {
         /* --- CAN NOT BE CONVERTED --- */
         /* Either the exponent is out of range for any single, or it
          * is in the single normal range but significand bits would
          * be lost. */
         Result.uSize  = IEEE754_UNION_IS_DOUBLE;
         Result.uValue = uDouble;
      }
   }

   return Result;
}
/* Public function; see ieee754.h */
/*
 * Reduce a double to single-precision when that is possible without
 * loss and, if bAllowHalfPrecision is non-zero, go on to try reducing
 * that single to half-precision. The result of the last conversion
 * step that was attempted is returned.
 */
IEEE754_union
IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision)
{
   const IEEE754_union SingleAttempt = IEEE754_DoubleToSingle(d);

   if(!bAllowHalfPrecision || SingleAttempt.uSize != IEEE754_UNION_IS_SINGLE) {
      /* Either half is not wanted or the value stayed a double */
      return SingleAttempt;
   }

   /* Cast to uint32_t is OK, because the value was just successfully
    * converted to single. */
   return IEEE754_SingleToHalf(CopyUint32ToSingle((uint32_t)SingleAttempt.uValue));
}
#else /* QCBOR_DISABLE_PREFERRED_FLOAT */
/* Placeholder definition compiled only when QCBOR_DISABLE_PREFERRED_FLOAT
 * is defined, so that this file still contains one declaration.
 * NOTE(review): presumably here because ISO C does not permit an empty
 * translation unit -- confirm. */
int ieee754_dummy_place_holder;
#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */