| /* ========================================================================== |
| * ieee754.c -- floating-point conversion between half, double & single-precision |
| * |
| * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved. |
| * Copyright (c) 2021, Arm Limited. All rights reserved. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause |
| * |
| * See BSD-3-Clause license in file named "LICENSE" |
| * |
| * Created on 7/23/18 |
| * ========================================================================== */ |
| |
| /* |
| * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as |
| * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h |
| */ |
| #include "qcbor/qcbor_common.h" |
| |
| #ifndef QCBOR_DISABLE_PREFERRED_FLOAT |
| |
| #include "ieee754.h" |
| #include <string.h> /* For memcpy() */ |
| |
| |
| /* |
| * This code has long lines and is easier to read because of |
| * them. Some coding guidelines prefer 80 column lines (can they not |
| * afford big displays?). |
| * |
| * This code works solely using shifts and masks and thus has no |
| * dependency on any math libraries. It can even work if the CPU |
| * doesn't have any floating-point support, though that isn't the most |
| * useful thing to do. |
| * |
| * The memcpy() dependency is only for CopyFloatToUint32() and friends |
| * which only is needed to avoid type punning when converting the |
| * actual float bits to an unsigned value so the bit shifts and masks |
| * can work. |
| * |
| * The references used to write this code: |
| * |
| * IEEE 754-2008, particularly section 3.6 and 6.2.1 |
| * |
| * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages |
| * |
| * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values |
| * |
| * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules |
| * |
| * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be |
| * |
| * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not |
| * needed. It can be retrieved from github history if needed. |
| */ |
| |
| |
| |
| |
/* ----- Half-Precision ----------- */
| #define HALF_NUM_SIGNIFICAND_BITS (10) |
| #define HALF_NUM_EXPONENT_BITS (5) |
| #define HALF_NUM_SIGN_BITS (1) |
| |
| #define HALF_SIGNIFICAND_SHIFT (0) |
| #define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS) |
| #define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS) |
| |
| #define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits |
| #define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent |
| #define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign |
| #define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200 |
| |
| /* Biased Biased Unbiased Use |
| * 0x00 0 -15 0 and subnormal |
| * 0x01 1 -14 Smallest normal exponent |
| * 0x1e 30 15 Largest normal exponent |
| * 0x1F 31 16 NaN and Infinity */ |
| #define HALF_EXPONENT_BIAS (15) |
| #define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased |
| #define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased |
| #define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased |
| #define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased |
| |
| |
| /* ------ Single-Precision -------- */ |
| #define SINGLE_NUM_SIGNIFICAND_BITS (23) |
| #define SINGLE_NUM_EXPONENT_BITS (8) |
| #define SINGLE_NUM_SIGN_BITS (1) |
| |
| #define SINGLE_SIGNIFICAND_SHIFT (0) |
| #define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS) |
| #define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS) |
| |
| #define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits |
| #define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent |
| #define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign |
| #define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1)) |
| |
| /* Biased Biased Unbiased Use |
| * 0x0000 0 -127 0 and subnormal |
| * 0x0001 1 -126 Smallest normal exponent |
| * 0x7f 127 0 1 |
| * 0xfe 254 127 Largest normal exponent |
| * 0xff 255 128 NaN and Infinity */ |
| #define SINGLE_EXPONENT_BIAS (127) |
| #define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) |
| #define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) |
| #define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) |
| #define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) |
| |
| |
| /* --------- Double-Precision ---------- */ |
| #define DOUBLE_NUM_SIGNIFICAND_BITS (52) |
| #define DOUBLE_NUM_EXPONENT_BITS (11) |
| #define DOUBLE_NUM_SIGN_BITS (1) |
| |
| #define DOUBLE_SIGNIFICAND_SHIFT (0) |
| #define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS) |
| #define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS) |
| |
| #define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits |
| #define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent |
| #define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign |
| #define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1)) |
| |
| |
| /* Biased Biased Unbiased Use |
| * 0x00000000 0 -1023 0 and subnormal |
| * 0x00000001 1 -1022 Smallest normal exponent |
| * 0x000007fe 2046 1023 Largest normal exponent |
| * 0x000007ff 2047 1024 NaN and Infinity */ |
| #define DOUBLE_EXPONENT_BIAS (1023) |
| #define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) |
| #define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) |
| #define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) |
| #define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) |
| |
| |
| |
| |
/*
 * Convenience functions to avoid type punning, compiler warnings and
 * such. The optimizer reduces them to a simple assignment. This is a
 * crusty corner of C. It shouldn't be this hard.
 *
 * These are also in UsefulBuf.h under a different name. They are copied
 * here to avoid a dependency on UsefulBuf.h. There is no object code
 * size impact because these always optimize down to a simple assignment.
 */
/* Return the object representation of a float as a uint32_t. The
 * bytes are copied with memcpy() so no pointer cast (type punning)
 * is involved. */
static inline uint32_t
CopyFloatToUint32(float f)
{
   uint32_t uFloatBits;

   (void)memcpy(&uFloatBits, &f, sizeof uFloatBits);

   return uFloatBits;
}
| |
/* Return the object representation of a double as a uint64_t. The
 * bytes are copied with memcpy() so no pointer cast (type punning)
 * is involved. */
static inline uint64_t
CopyDoubleToUint64(double d)
{
   uint64_t uDoubleBits;

   (void)memcpy(&uDoubleBits, &d, sizeof uDoubleBits);

   return uDoubleBits;
}
| |
/* Reinterpret the 64 bits in u64 as a double. The bytes are copied
 * with memcpy() so no pointer cast (type punning) is involved. */
static inline double
CopyUint64ToDouble(uint64_t u64)
{
   double dFromBits;

   (void)memcpy(&dFromBits, &u64, sizeof u64);

   return dFromBits;
}
| |
/* Reinterpret the 32 bits in u32 as a float. The bytes are copied
 * with memcpy() so no pointer cast (type punning) is involved. */
static inline float
CopyUint32ToSingle(uint32_t u32)
{
   float fFromBits;

   (void)memcpy(&fFromBits, &u32, sizeof u32);

   return fFromBits;
}
| |
| |
| |
| |
| /** |
| * @brief Assemble sign, significand and exponent into single precision float. |
| * |
| * @param[in] uDoubleSign 0 if positive, 1 if negative |
| * @pararm[in] uDoubleSignificand Bits of the significand |
| * @param[in] nDoubleUnBiasedExponent Exponent |
| * |
| * This returns the bits for a single-precision float, a binary64 |
| * as specified in IEEE754. |
| */ |
| static double |
| IEEE754_AssembleDouble(uint64_t uDoubleSign, |
| uint64_t uDoubleSignificand, |
| int64_t nDoubleUnBiasedExponent) |
| { |
| uint64_t uDoubleBiasedExponent; |
| |
| uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS); |
| |
| return CopyUint64ToDouble(uDoubleSignificand | |
| (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) | |
| (uDoubleSign << DOUBLE_SIGN_SHIFT)); |
| } |
| |
| |
| double |
| IEEE754_HalfToDouble(uint16_t uHalfPrecision) |
| { |
| uint64_t uDoubleSignificand; |
| int64_t nDoubleUnBiasedExponent; |
| double dResult; |
| |
| /* Pull out the three parts of the half-precision float. Do all |
| * the work in 64 bits because that is what the end result is. It |
| * may give smaller code size and will keep static analyzers |
| * happier. |
| */ |
| const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK; |
| const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT; |
| const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS; |
| const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT; |
| |
| if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) { |
| /* 0 or subnormal */ |
| if(uHalfSignificand) { |
| /* --- SUBNORMAL --- */ |
| /* A half-precision subnormal can always be converted to a |
| * normal double-precision float because the ranges line up. |
| * The exponent of a subnormal starts out at the min exponent |
| * for a normal. As the sub normal significand bits are |
| * shifted, left to normalize, the exponent is |
| * decremented. Shifting continues until fully normalized. |
| */ |
| nDoubleUnBiasedExponent = HALF_EXPONENT_MIN; |
| uDoubleSignificand = uHalfSignificand; |
| do { |
| uDoubleSignificand <<= 1; |
| nDoubleUnBiasedExponent--; |
| } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0); |
| /* A normal has an implied 1 in the most significant |
| * position that a subnormal doesn't. */ |
| uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS; |
| /* Must shift into place for a double significand */ |
| uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS; |
| |
| dResult = IEEE754_AssembleDouble(uHalfSign, |
| uDoubleSignificand, |
| nDoubleUnBiasedExponent); |
| } else { |
| /* --- ZERO --- */ |
| dResult = IEEE754_AssembleDouble(uHalfSign, |
| 0, |
| DOUBLE_EXPONENT_ZERO); |
| } |
| } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) { |
| /* NaN or Inifinity */ |
| if(uHalfSignificand) { |
| /* --- NaN --- */ |
| /* Half-precision payloads always fit into double precision |
| * payloads. They are shifted left the same as a normal |
| * number significand. |
| */ |
| uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| dResult = IEEE754_AssembleDouble(uHalfSign, |
| uDoubleSignificand, |
| DOUBLE_EXPONENT_INF_OR_NAN); |
| } else { |
| /* --- INFINITY --- */ |
| dResult = IEEE754_AssembleDouble(uHalfSign, |
| 0, |
| DOUBLE_EXPONENT_INF_OR_NAN); |
| } |
| } else { |
| /* --- NORMAL NUMBER --- */ |
| uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| dResult = IEEE754_AssembleDouble(uHalfSign, |
| uDoubleSignificand, |
| nHalfUnBiasedExponent); |
| } |
| |
| return dResult; |
| } |
| |
| |
| /** |
| * @brief Assemble sign, significand and exponent into single precision float. |
| * |
| * @param[in] uHalfSign 0 if positive, 1 if negative |
| * @pararm[in] uHalfSignificand Bits of the significand |
| * @param[in] nHalfUnBiasedExponent Exponent |
| * |
| * This returns the bits for a single-precision float, a binary32 as |
| * specified in IEEE754. It is returned as a uint64_t rather than a |
| * uint32_t or a float for convenience of usage. |
| */ |
| static uint32_t |
| IEEE754_AssembleHalf(uint32_t uHalfSign, |
| uint32_t uHalfSignificand, |
| int32_t nHalfUnBiasedExponent) |
| { |
| uint32_t uHalfUnbiasedExponent; |
| |
| uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS); |
| |
| return uHalfSignificand | |
| (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) | |
| (uHalfSign << HALF_SIGN_SHIFT); |
| } |
| |
| |
| /* Public function; see ieee754.h */ |
| IEEE754_union |
| IEEE754_SingleToHalf(float f) |
| { |
| IEEE754_union result; |
| uint32_t uDroppedBits; |
| int32_t nExponentDifference; |
| int32_t nShiftAmount; |
| uint32_t uHalfSignificand; |
| |
| /* Pull the three parts out of the double-precision float Most work |
| * is done with uint32_t which helps avoid integer promotions and |
| * static analyzer complaints. |
| */ |
| const uint32_t uSingle = CopyFloatToUint32(f); |
| const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT; |
| const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS; |
| const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK; |
| const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT; |
| |
| if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) { |
| if(uSingleSignificand == 0) { |
| /* --- IS ZERO --- */ |
| result.uSize = IEEE754_UNION_IS_HALF; |
| result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 0, |
| HALF_EXPONENT_ZERO); |
| } else { |
| /* --- IS SINGLE SUBNORMAL --- */ |
| /* The largest single subnormal is slightly less than the |
| * largest single normal which is 2^-149 or |
| * 2.2040517676619426e-38. The smallest half subnormal is |
| * 2^-14 or 5.9604644775390625E-8. There is no overlap so |
| * single subnormals can't be converted to halfs of any sort. |
| */ |
| result.uSize = IEEE754_UNION_IS_SINGLE; |
| result.uValue = uSingle; |
| } |
| } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) { |
| if(uSingleSignificand == 0) { |
| /* ---- IS INFINITY ---- */ |
| result.uSize = IEEE754_UNION_IS_HALF; |
| result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN); |
| } else { |
| /* The NaN can only be converted if no payload bits are lost |
| * per RFC 8949 section 4.1 that defines Preferred |
| * Serializaton. Note that Deterministically Encode CBOR in |
| * section 4.2 allows for some variation of this rule, but at |
| * the moment this implementation is of Preferred |
| * Serialization, not CDE. As of December 2023, we are also |
| * expecting an update to CDE. This code may need to be |
| * updated for CDE. |
| */ |
| uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS); |
| if(uDroppedBits == 0) { |
| /* --- IS CONVERTABLE NAN --- */ |
| uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| result.uSize = IEEE754_UNION_IS_HALF; |
| result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| uHalfSignificand, |
| HALF_EXPONENT_INF_OR_NAN); |
| |
| } else { |
| /* --- IS UNCONVERTABLE NAN --- */ |
| result.uSize = IEEE754_UNION_IS_SINGLE; |
| result.uValue = uSingle; |
| } |
| } |
| } else { |
| /* ---- REGULAR NUMBER ---- */ |
| /* A regular single can be converted to a regular half if the |
| * single's exponent is in the smaller range of a half and if no |
| * precision is lost in the significand. |
| */ |
| if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN && |
| nSingleUnbiasedExponent <= HALF_EXPONENT_MAX && |
| (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) { |
| uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| |
| /* --- CONVERT TO HALF NORMAL --- */ |
| result.uSize = IEEE754_UNION_IS_HALF; |
| result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| uHalfSignificand, |
| nSingleUnbiasedExponent); |
| } else { |
| /* Unable to convert to a half normal. See if it can be |
| * converted to a half subnormal. To do that, the exponent |
| * must be in range and no precision can be lost in the |
| * signficand. |
| * |
| * This is more complicated because the number is not |
| * normalized. The signficand must be shifted proprotionally |
| * to the exponent and 1 must be added in. See |
| * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| * |
| * Exponents -14 to -24 map to a shift of 0 to 10 of the |
| * significand. The largest value of a half subnormal has an |
| * exponent of -14. Subnormals are not normalized like |
| * normals meaning they lose precision as the numbers get |
| * smaller. Normals don't lose precision because the exponent |
| * allows all the bits of the significand to be significant. |
| */ |
| /* The exponent of the largest possible half-precision |
| * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger |
| * than this are normal and handled above. We're going to |
| * shift the significand right by at least this amount. |
| */ |
| nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN); |
| |
| /* In addition to the shift based on the exponent's value, |
| * the single significand has to be shifted right to fit into |
| * a half-precision significand */ |
| nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| |
| /* Must add 1 in to the possible significand because there is |
| * an implied 1 for normal values and not for subnormal |
| * values. See equations here: |
| * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| */ |
| uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; |
| |
| /* If only zero bits get shifted out, this can be converted |
| * to subnormal */ |
| if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN && |
| nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS && |
| uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) { |
| /* --- CONVERTABLE TO HALF SUBNORMAL --- */ |
| result.uSize = IEEE754_UNION_IS_HALF; |
| result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| uHalfSignificand, |
| HALF_EXPONENT_ZERO); |
| } else { |
| /* --- DO NOT CONVERT --- */ |
| result.uSize = IEEE754_UNION_IS_SINGLE; |
| result.uValue = uSingle; |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| |
| /** |
| * @brief Assemble sign, significand and exponent into single precision float. |
| * |
| * @param[in] uSingleSign 0 if positive, 1 if negative |
| * @pararm[in] uSingleSignificand Bits of the significand |
| * @param[in] nSingleUnBiasedExponent Exponent |
| * |
| * This returns the bits for a single-precision float, a binary32 as |
| * specified in IEEE754. It is returned as a uint64_t rather than a |
| * uint32_t or a float for convenience of usage. |
| */ |
| static uint64_t |
| IEEE754_AssembleSingle(uint64_t uSingleSign, |
| uint64_t uSingleSignificand, |
| int64_t nSingleUnBiasedExponent) |
| { |
| uint64_t uSingleBiasedExponent; |
| |
| uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS); |
| |
| return uSingleSignificand | |
| (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) | |
| (uSingleSign << SINGLE_SIGN_SHIFT); |
| } |
| |
| |
| /** |
| * @brief Convert a double-precision float to single-precision. |
| * |
| * @param[in] d The value to convert. |
| * |
| * @returns Either unconverted value or value converted to single-precision. |
| * |
| * This always succeeds. If the value cannot be converted without the |
| * loss of precision, it is not converted. |
| * |
| * This handles all subnormals and NaN payloads. |
| */ |
| static IEEE754_union |
| IEEE754_DoubleToSingle(double d) |
| { |
| IEEE754_union Result; |
| int64_t nExponentDifference; |
| int64_t nShiftAmount; |
| uint64_t uSingleSignificand; |
| uint64_t uDroppedBits; |
| |
| |
| /* Pull the three parts out of the double-precision float. Most |
| * work is done with uint64_t which helps avoid integer promotions |
| * and static analyzer complaints. |
| */ |
| const uint64_t uDouble = CopyDoubleToUint64(d); |
| const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT; |
| const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS; |
| const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT; |
| const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK; |
| |
| |
| if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) { |
| if(uDoubleSignificand == 0) { |
| /* --- IS ZERO --- */ |
| Result.uSize = IEEE754_UNION_IS_SINGLE; |
| Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 0, |
| SINGLE_EXPONENT_ZERO); |
| } else { |
| /* --- IS DOUBLE SUBNORMAL --- */ |
| /* The largest double subnormal is slightly less than the |
| * largest double normal which is 2^-1022 or |
| * 2.2250738585072014e-308. The smallest single subnormal |
| * is 2^-149 or 1.401298464324817e-45. There is no |
| * overlap so double subnormals can't be converted to |
| * singles of any sort. |
| */ |
| Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| Result.uValue = uDouble; |
| } |
| } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) { |
| if(uDoubleSignificand == 0) { |
| /* ---- IS INFINITY ---- */ |
| Result.uSize = IEEE754_UNION_IS_SINGLE; |
| Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 0, |
| SINGLE_EXPONENT_INF_OR_NAN); |
| } else { |
| /* The NaN can only be converted if no payload bits are |
| * lost per RFC 8949 section 4.1 that defines Preferred |
| * Serializaton. Note that Deterministically Encode CBOR |
| * in section 4.2 allows for some variation of this rule, |
| * but at the moment this implementation is of Preferred |
| * Serialization, not CDE. As of December 2023, we are |
| * also expecting an update to CDE. This code may need to |
| * be updated for CDE. |
| */ |
| uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); |
| if(uDroppedBits == 0) { |
| /* --- IS CONVERTABLE NAN --- */ |
| uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| Result.uSize = IEEE754_UNION_IS_SINGLE; |
| Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| uSingleSignificand, |
| SINGLE_EXPONENT_INF_OR_NAN); |
| } else { |
| /* --- IS UNCONVERTABLE NAN --- */ |
| Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| Result.uValue = uDouble; |
| } |
| } |
| } else { |
| /* ---- REGULAR NUMBER ---- */ |
| /* A regular double can be converted to a regular single if |
| * the double's exponent is in the smaller range of a single |
| * and if no precision is lost in the significand. |
| */ |
| uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); |
| if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN && |
| nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX && |
| uDroppedBits == 0) { |
| /* --- IS CONVERTABLE TO SINGLE --- */ |
| uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| Result.uSize = IEEE754_UNION_IS_SINGLE; |
| Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| uSingleSignificand, |
| nDoubleUnbiasedExponent); |
| } else { |
| /* Unable to convert to a single normal. See if it can be |
| * converted to a single subnormal. To do that, the |
| * exponent must be in range and no precision can be lost |
| * in the signficand. |
| * |
| * This is more complicated because the number is not |
| * normalized. The signficand must be shifted |
| * proprotionally to the exponent and 1 must be added |
| * in. See |
| * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| */ |
| nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN); |
| nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; |
| |
| if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN && |
| nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS && |
| uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) { |
| /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */ |
| Result.uSize = IEEE754_UNION_IS_SINGLE; |
| Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| uSingleSignificand, |
| SINGLE_EXPONENT_ZERO); |
| } else { |
| /* --- CAN NOT BE CONVERTED --- */ |
| Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| Result.uValue = uDouble; |
| } |
| } |
| } |
| |
| return Result; |
| } |
| |
| |
| /* Public function; see ieee754.h */ |
| IEEE754_union |
| IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision) |
| { |
| IEEE754_union result; |
| |
| result = IEEE754_DoubleToSingle(d); |
| |
| if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) { |
| /* Cast to uint32_t is OK, because value was just successfully |
| * converted to single. */ |
| float uSingle = CopyUint32ToSingle((uint32_t)result.uValue); |
| result = IEEE754_SingleToHalf(uSingle); |
| } |
| |
| return result; |
| } |
| |
| |
| #else /* QCBOR_DISABLE_PREFERRED_FLOAT */ |
| |
| int ieee754_dummy_place_holder; |
| |
| #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */ |