Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 1 | /* ========================================================================== |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 2 | * ieee754.c -- floating-point conversion for half, double & single-precision |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 3 | * |
| 4 | * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved. |
| 5 | * Copyright (c) 2021, Arm Limited. All rights reserved. |
| 6 | * |
| 7 | * SPDX-License-Identifier: BSD-3-Clause |
| 8 | * |
| 9 | * See BSD-3-Clause license in README.md |
| 10 | * |
| 11 | * Created on 7/23/18 |
| 12 | * ========================================================================== */ |
Laurence Lundblade | cc2ed34 | 2018-09-22 17:29:55 -0700 | [diff] [blame] | 13 | |
Máté Tóth-Pál | ef5f07a | 2021-09-17 19:31:37 +0200 | [diff] [blame] | 14 | #include "qcbor/qcbor_common.h" |
| 15 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 16 | #include "ieee754.h" |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 17 | #include <string.h> /* For memcpy() */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 18 | |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 19 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 20 | /* |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 21 | * This has long lines and is easier to read because of |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 22 | * them. Some coding guidelines prefer 80 column lines (can they not |
| 23 | * afford big displays?). |
| 24 | * |
| 25 | * This code works solely using shifts and masks and thus has no |
| 26 | * dependency on any math libraries. It can even work if the CPU |
| 27 | * doesn't have any floating-point support, though that isn't the most |
| 28 | * useful thing to do. |
| 29 | * |
| 30 | * The memcpy() dependency is only for CopyFloatToUint32() and friends |
| 31 | * which only is needed to avoid type punning when converting the |
| 32 | * actual float bits to an unsigned value so the bit shifts and masks |
| 33 | * can work. |
| 34 | * |
| 35 | * The references used to write this code: |
| 36 | * |
| 37 | * IEEE 754-2008, particularly section 3.6 and 6.2.1 |
| 38 | * |
| 39 | * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages |
| 40 | * |
| 41 | * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values |
| 42 | * |
| 43 | * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules |
| 44 | * |
| 45 | * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be |
| 46 | * |
| 47 | * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not |
| 48 | * needed. It can be retrieved from github history if needed. |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 49 | */ |
| 50 | |
| 51 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 52 | |
| 53 | |
/* ----- Half-Precision (binary16) ----------- */

/* Width of each field in bits */
#define HALF_NUM_SIGNIFICAND_BITS (10)
#define HALF_NUM_EXPONENT_BITS    (5)
#define HALF_NUM_SIGN_BITS        (1)

/* Bit position of the least-significant bit of each field */
#define HALF_SIGNIFICAND_SHIFT (0)
#define HALF_EXPONENT_SHIFT    (HALF_NUM_SIGNIFICAND_BITS)
#define HALF_SIGN_SHIFT        (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)

/* Masks selecting each field in place; derived from the widths above */
#define HALF_SIGNIFICAND_MASK ((1U << HALF_NUM_SIGNIFICAND_BITS) - 1U) // 0x03ff The lower 10 bits
#define HALF_EXPONENT_MASK    (((1U << HALF_NUM_EXPONENT_BITS) - 1U) << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
#define HALF_SIGN_MASK        (1U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
#define HALF_QUIET_NAN_BIT    (1U << (HALF_NUM_SIGNIFICAND_BITS - 1)) // 0x0200 Top bit of the significand

/* Biased (hex)  Biased (dec)  Unbiased  Use
 *    0x00            0          -15     0 and subnormal
 *    0x01            1          -14     Smallest normal exponent
 *    0x1e           30           15     Largest normal exponent
 *    0x1f           31           16     NaN and Infinity */
#define HALF_EXPONENT_BIAS       (15)
#define HALF_EXPONENT_MAX        (HALF_EXPONENT_BIAS)     //  15 Unbiased
#define HALF_EXPONENT_MIN        (1 - HALF_EXPONENT_BIAS) // -14 Unbiased
#define HALF_EXPONENT_ZERO       (-HALF_EXPONENT_BIAS)    // -15 Unbiased
#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS + 1) //  16 Unbiased
| 78 | |
| 79 | |
/* ------ Single-Precision (binary32) -------- */

/* Width of each field in bits */
#define SINGLE_NUM_SIGNIFICAND_BITS (23)
#define SINGLE_NUM_EXPONENT_BITS    (8)
#define SINGLE_NUM_SIGN_BITS        (1)

/* Bit position of the least-significant bit of each field */
#define SINGLE_SIGNIFICAND_SHIFT (0)
#define SINGLE_EXPONENT_SHIFT    (SINGLE_NUM_SIGNIFICAND_BITS)
#define SINGLE_SIGN_SHIFT        (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)

/* Masks selecting each field in place; derived from the widths above */
#define SINGLE_SIGNIFICAND_MASK ((1U << SINGLE_NUM_SIGNIFICAND_BITS) - 1U) // The lower 23 bits
#define SINGLE_EXPONENT_MASK    (((1U << SINGLE_NUM_EXPONENT_BITS) - 1U) << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
#define SINGLE_SIGN_MASK        (1U << SINGLE_SIGN_SHIFT) // 1 bit of sign
#define SINGLE_QUIET_NAN_BIT    (1U << (SINGLE_NUM_SIGNIFICAND_BITS - 1)) // Top bit of the significand

/* Biased (hex)  Biased (dec)  Unbiased  Use
 *    0x00            0         -127     0 and subnormal
 *    0x01            1         -126     Smallest normal exponent
 *    0x7f          127            0     1
 *    0xfe          254          127     Largest normal exponent
 *    0xff          255          128     NaN and Infinity */
#define SINGLE_EXPONENT_BIAS       (127)
#define SINGLE_EXPONENT_MAX        (SINGLE_EXPONENT_BIAS)     //  127 Unbiased
#define SINGLE_EXPONENT_MIN        (1 - SINGLE_EXPONENT_BIAS) // -126 Unbiased
#define SINGLE_EXPONENT_ZERO       (-SINGLE_EXPONENT_BIAS)    // -127 Unbiased
#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS + 1) //  128 Unbiased
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 105 | |
| 106 | |
/* --------- Double-Precision (binary64) ---------- */

/* Width of each field in bits */
#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
#define DOUBLE_NUM_EXPONENT_BITS    (11)
#define DOUBLE_NUM_SIGN_BITS        (1)

/* Bit position of the least-significant bit of each field */
#define DOUBLE_SIGNIFICAND_SHIFT (0)
#define DOUBLE_EXPONENT_SHIFT    (DOUBLE_NUM_SIGNIFICAND_BITS)
#define DOUBLE_SIGN_SHIFT        (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)

/* Masks selecting each field in place; derived from the widths above */
#define DOUBLE_SIGNIFICAND_MASK ((1ULL << DOUBLE_NUM_SIGNIFICAND_BITS) - 1ULL) // The lower 52 bits
#define DOUBLE_EXPONENT_MASK    (((1ULL << DOUBLE_NUM_EXPONENT_BITS) - 1ULL) << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
#define DOUBLE_SIGN_MASK        (1ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
#define DOUBLE_QUIET_NAN_BIT    (1ULL << (DOUBLE_NUM_SIGNIFICAND_BITS - 1)) // Top bit of the significand


/* Biased (hex)  Biased (dec)  Unbiased  Use
 *    0x000           0        -1023     0 and subnormal
 *    0x001           1        -1022     Smallest normal exponent
 *    0x7fe        2046         1023     Largest normal exponent
 *    0x7ff        2047         1024     NaN and Infinity */
#define DOUBLE_EXPONENT_BIAS       (1023)
#define DOUBLE_EXPONENT_MAX        (DOUBLE_EXPONENT_BIAS)     //  1023 Unbiased
#define DOUBLE_EXPONENT_MIN        (1 - DOUBLE_EXPONENT_BIAS) // -1022 Unbiased
#define DOUBLE_EXPONENT_ZERO       (-DOUBLE_EXPONENT_BIAS)    // -1023 Unbiased
#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS + 1) //  1024 Unbiased
| 132 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 133 | |
| 134 | |
| 135 | |
| 136 | /* |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 137 | * Convenient functions to avoid type punning, compiler warnings and |
| 138 | * such. The optimizer reduces them to a simple assignment. This is a |
| 139 | * crusty corner of C. It shouldn't be this hard. |
| 140 | * |
| 141 | * These are also in UsefulBuf.h under a different name. They are copied |
| 142 | * here to avoid a dependency on UsefulBuf.h. There is no object code |
| 143 | * size impact because these always optimize down to a simple assignment. |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 144 | */ |
/* Return the object representation of the float f as a uint32_t.
 * The bytes are moved with memcpy() rather than a pointer cast so
 * there is no type punning. */
static inline uint32_t
CopyFloatToUint32(float f)
{
   uint32_t uBits;

   memcpy(&uBits, &f, sizeof(uBits));

   return uBits;
}
| 152 | |
/* Return the object representation of the double d as a uint64_t.
 * The bytes are moved with memcpy() rather than a pointer cast so
 * there is no type punning. */
static inline uint64_t
CopyDoubleToUint64(double d)
{
   uint64_t uBits;

   memcpy(&uBits, &d, sizeof(uBits));

   return uBits;
}
| 160 | |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 161 | |
| 162 | #ifndef QCBOR_DISABLE_PREFERRED_FLOAT |
| 163 | |
| 164 | |
/* Return the double whose object representation is the 64 bits in
 * u64. The bytes are moved with memcpy() rather than a pointer cast
 * so there is no type punning. */
static inline double
CopyUint64ToDouble(uint64_t u64)
{
   double dValue;

   memcpy(&dValue, &u64, sizeof(u64));

   return dValue;
}
| 172 | |
/* Return the float whose object representation is the 32 bits in
 * u32. The bytes are moved with memcpy() rather than a pointer cast
 * so there is no type punning. */
static inline float
CopyUint32ToSingle(uint32_t u32)
{
   float fValue;

   memcpy(&fValue, &u32, sizeof(u32));

   return fValue;
}
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 180 | |
| 181 | |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 182 | |
| 183 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 184 | /** |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 185 | * @brief Assemble sign, significand and exponent into double precision float. |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 186 | * |
| 187 | * @param[in] uDoubleSign 0 if positive, 1 if negative |
| 188 | * @pararm[in] uDoubleSignificand Bits of the significand |
| 189 | * @param[in] nDoubleUnBiasedExponent Exponent |
| 190 | * |
| 191 | * This returns the bits for a single-precision float, a binary64 |
| 192 | * as specified in IEEE754. |
Laurence Lundblade | fe09bbf | 2020-07-16 12:14:51 -0700 | [diff] [blame] | 193 | */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 194 | static double |
| 195 | IEEE754_AssembleDouble(uint64_t uDoubleSign, |
| 196 | uint64_t uDoubleSignificand, |
| 197 | int64_t nDoubleUnBiasedExponent) |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 198 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 199 | uint64_t uDoubleBiasedExponent; |
| 200 | |
| 201 | uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS); |
| 202 | |
| 203 | return CopyUint64ToDouble(uDoubleSignificand | |
| 204 | (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) | |
| 205 | (uDoubleSign << DOUBLE_SIGN_SHIFT)); |
| 206 | } |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 207 | |
| 208 | |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 209 | /* Public function; see ieee754.h */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 210 | double |
| 211 | IEEE754_HalfToDouble(uint16_t uHalfPrecision) |
| 212 | { |
| 213 | uint64_t uDoubleSignificand; |
| 214 | int64_t nDoubleUnBiasedExponent; |
| 215 | double dResult; |
| 216 | |
| 217 | /* Pull out the three parts of the half-precision float. Do all |
| 218 | * the work in 64 bits because that is what the end result is. It |
| 219 | * may give smaller code size and will keep static analyzers |
| 220 | * happier. |
| 221 | */ |
| 222 | const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK; |
| 223 | const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT; |
| 224 | const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS; |
| 225 | const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT; |
| 226 | |
| 227 | if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) { |
| 228 | /* 0 or subnormal */ |
| 229 | if(uHalfSignificand) { |
| 230 | /* --- SUBNORMAL --- */ |
| 231 | /* A half-precision subnormal can always be converted to a |
| 232 | * normal double-precision float because the ranges line up. |
| 233 | * The exponent of a subnormal starts out at the min exponent |
| 234 | * for a normal. As the sub normal significand bits are |
| 235 | * shifted, left to normalize, the exponent is |
| 236 | * decremented. Shifting continues until fully normalized. |
| 237 | */ |
| 238 | nDoubleUnBiasedExponent = HALF_EXPONENT_MIN; |
| 239 | uDoubleSignificand = uHalfSignificand; |
| 240 | do { |
| 241 | uDoubleSignificand <<= 1; |
| 242 | nDoubleUnBiasedExponent--; |
| 243 | } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0); |
| 244 | /* A normal has an implied 1 in the most significant |
| 245 | * position that a subnormal doesn't. */ |
| 246 | uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS; |
| 247 | /* Must shift into place for a double significand */ |
| 248 | uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS; |
| 249 | |
| 250 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 251 | uDoubleSignificand, |
| 252 | nDoubleUnBiasedExponent); |
| 253 | } else { |
| 254 | /* --- ZERO --- */ |
| 255 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 256 | 0, |
| 257 | DOUBLE_EXPONENT_ZERO); |
| 258 | } |
| 259 | } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) { |
| 260 | /* NaN or Inifinity */ |
| 261 | if(uHalfSignificand) { |
| 262 | /* --- NaN --- */ |
| 263 | /* Half-precision payloads always fit into double precision |
| 264 | * payloads. They are shifted left the same as a normal |
| 265 | * number significand. |
| 266 | */ |
| 267 | uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 268 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 269 | uDoubleSignificand, |
| 270 | DOUBLE_EXPONENT_INF_OR_NAN); |
| 271 | } else { |
| 272 | /* --- INFINITY --- */ |
| 273 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 274 | 0, |
| 275 | DOUBLE_EXPONENT_INF_OR_NAN); |
| 276 | } |
| 277 | } else { |
| 278 | /* --- NORMAL NUMBER --- */ |
| 279 | uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 280 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 281 | uDoubleSignificand, |
| 282 | nHalfUnBiasedExponent); |
| 283 | } |
| 284 | |
| 285 | return dResult; |
| 286 | } |
| 287 | |
| 288 | |
| 289 | /** |
| 290 | * @brief Assemble sign, significand and exponent into single precision float. |
| 291 | * |
| 292 | * @param[in] uHalfSign 0 if positive, 1 if negative |
| 293 | * @pararm[in] uHalfSignificand Bits of the significand |
| 294 | * @param[in] nHalfUnBiasedExponent Exponent |
| 295 | * |
| 296 | * This returns the bits for a single-precision float, a binary32 as |
| 297 | * specified in IEEE754. It is returned as a uint64_t rather than a |
| 298 | * uint32_t or a float for convenience of usage. |
| 299 | */ |
| 300 | static uint32_t |
| 301 | IEEE754_AssembleHalf(uint32_t uHalfSign, |
| 302 | uint32_t uHalfSignificand, |
| 303 | int32_t nHalfUnBiasedExponent) |
| 304 | { |
| 305 | uint32_t uHalfUnbiasedExponent; |
| 306 | |
| 307 | uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS); |
| 308 | |
| 309 | return uHalfSignificand | |
| 310 | (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) | |
| 311 | (uHalfSign << HALF_SIGN_SHIFT); |
| 312 | } |
| 313 | |
| 314 | |
| 315 | /* Public function; see ieee754.h */ |
| 316 | IEEE754_union |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 317 | IEEE754_SingleToHalf(const float f, const int bNoNaNPayload) |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 318 | { |
| 319 | IEEE754_union result; |
| 320 | uint32_t uDroppedBits; |
| 321 | int32_t nExponentDifference; |
| 322 | int32_t nShiftAmount; |
| 323 | uint32_t uHalfSignificand; |
| 324 | |
| 325 | /* Pull the three parts out of the double-precision float Most work |
| 326 | * is done with uint32_t which helps avoid integer promotions and |
| 327 | * static analyzer complaints. |
| 328 | */ |
| 329 | const uint32_t uSingle = CopyFloatToUint32(f); |
| 330 | const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT; |
| 331 | const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS; |
| 332 | const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK; |
| 333 | const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT; |
| 334 | |
| 335 | if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) { |
| 336 | if(uSingleSignificand == 0) { |
| 337 | /* --- IS ZERO --- */ |
| 338 | result.uSize = IEEE754_UNION_IS_HALF; |
| 339 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 340 | 0, |
| 341 | HALF_EXPONENT_ZERO); |
| 342 | } else { |
| 343 | /* --- IS SINGLE SUBNORMAL --- */ |
| 344 | /* The largest single subnormal is slightly less than the |
| 345 | * largest single normal which is 2^-149 or |
| 346 | * 2.2040517676619426e-38. The smallest half subnormal is |
| 347 | * 2^-14 or 5.9604644775390625E-8. There is no overlap so |
| 348 | * single subnormals can't be converted to halfs of any sort. |
| 349 | */ |
| 350 | result.uSize = IEEE754_UNION_IS_SINGLE; |
| 351 | result.uValue = uSingle; |
| 352 | } |
| 353 | } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) { |
| 354 | if(uSingleSignificand == 0) { |
| 355 | /* ---- IS INFINITY ---- */ |
| 356 | result.uSize = IEEE754_UNION_IS_HALF; |
| 357 | result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN); |
| 358 | } else { |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 359 | if(bNoNaNPayload) { |
| 360 | /* --- REQUIRE CANNONICAL NAN --- */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 361 | result.uSize = IEEE754_UNION_IS_HALF; |
| 362 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 363 | HALF_QUIET_NAN_BIT, |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 364 | HALF_EXPONENT_INF_OR_NAN); |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 365 | } else { |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 366 | /* The NaN can only be converted if no payload bits are lost |
| 367 | * per RFC 8949 section 4.1 that defines Preferred |
| 368 | * Serializaton. Note that Deterministically Encode CBOR in |
| 369 | * section 4.2 allows for some variation of this rule, but at |
| 370 | * the moment this implementation is of Preferred |
| 371 | * Serialization, not CDE. As of December 2023, we are also |
| 372 | * expecting an update to CDE. This code may need to be |
| 373 | * updated for CDE. |
| 374 | */ |
| 375 | uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS); |
| 376 | if(uDroppedBits == 0) { |
| 377 | /* --- IS CONVERTABLE NAN --- */ |
| 378 | uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 379 | result.uSize = IEEE754_UNION_IS_HALF; |
| 380 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 381 | uHalfSignificand, |
| 382 | HALF_EXPONENT_INF_OR_NAN); |
| 383 | |
| 384 | } else { |
| 385 | /* --- IS UNCONVERTABLE NAN --- */ |
| 386 | result.uSize = IEEE754_UNION_IS_SINGLE; |
| 387 | result.uValue = uSingle; |
| 388 | } |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 389 | } |
| 390 | } |
| 391 | } else { |
| 392 | /* ---- REGULAR NUMBER ---- */ |
| 393 | /* A regular single can be converted to a regular half if the |
| 394 | * single's exponent is in the smaller range of a half and if no |
| 395 | * precision is lost in the significand. |
| 396 | */ |
| 397 | if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN && |
| 398 | nSingleUnbiasedExponent <= HALF_EXPONENT_MAX && |
| 399 | (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) { |
| 400 | uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 401 | |
| 402 | /* --- CONVERT TO HALF NORMAL --- */ |
| 403 | result.uSize = IEEE754_UNION_IS_HALF; |
| 404 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 405 | uHalfSignificand, |
| 406 | nSingleUnbiasedExponent); |
| 407 | } else { |
| 408 | /* Unable to convert to a half normal. See if it can be |
| 409 | * converted to a half subnormal. To do that, the exponent |
| 410 | * must be in range and no precision can be lost in the |
| 411 | * signficand. |
| 412 | * |
| 413 | * This is more complicated because the number is not |
| 414 | * normalized. The signficand must be shifted proprotionally |
| 415 | * to the exponent and 1 must be added in. See |
| 416 | * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| 417 | * |
| 418 | * Exponents -14 to -24 map to a shift of 0 to 10 of the |
| 419 | * significand. The largest value of a half subnormal has an |
| 420 | * exponent of -14. Subnormals are not normalized like |
| 421 | * normals meaning they lose precision as the numbers get |
| 422 | * smaller. Normals don't lose precision because the exponent |
| 423 | * allows all the bits of the significand to be significant. |
| 424 | */ |
| 425 | /* The exponent of the largest possible half-precision |
| 426 | * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger |
| 427 | * than this are normal and handled above. We're going to |
| 428 | * shift the significand right by at least this amount. |
| 429 | */ |
| 430 | nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN); |
| 431 | |
| 432 | /* In addition to the shift based on the exponent's value, |
| 433 | * the single significand has to be shifted right to fit into |
| 434 | * a half-precision significand */ |
| 435 | nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 436 | |
| 437 | /* Must add 1 in to the possible significand because there is |
| 438 | * an implied 1 for normal values and not for subnormal |
| 439 | * values. See equations here: |
| 440 | * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| 441 | */ |
| 442 | uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; |
| 443 | |
| 444 | /* If only zero bits get shifted out, this can be converted |
| 445 | * to subnormal */ |
| 446 | if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN && |
| 447 | nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS && |
| 448 | uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) { |
| 449 | /* --- CONVERTABLE TO HALF SUBNORMAL --- */ |
| 450 | result.uSize = IEEE754_UNION_IS_HALF; |
| 451 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 452 | uHalfSignificand, |
| 453 | HALF_EXPONENT_ZERO); |
| 454 | } else { |
| 455 | /* --- DO NOT CONVERT --- */ |
| 456 | result.uSize = IEEE754_UNION_IS_SINGLE; |
| 457 | result.uValue = uSingle; |
| 458 | } |
| 459 | } |
| 460 | } |
| 461 | |
| 462 | return result; |
| 463 | } |
| 464 | |
| 465 | |
| 466 | /** |
| 467 | * @brief Assemble sign, significand and exponent into single precision float. |
| 468 | * |
| 469 | * @param[in] uSingleSign 0 if positive, 1 if negative |
| 470 | * @pararm[in] uSingleSignificand Bits of the significand |
| 471 | * @param[in] nSingleUnBiasedExponent Exponent |
| 472 | * |
| 473 | * This returns the bits for a single-precision float, a binary32 as |
| 474 | * specified in IEEE754. It is returned as a uint64_t rather than a |
| 475 | * uint32_t or a float for convenience of usage. |
| 476 | */ |
| 477 | static uint64_t |
| 478 | IEEE754_AssembleSingle(uint64_t uSingleSign, |
| 479 | uint64_t uSingleSignificand, |
| 480 | int64_t nSingleUnBiasedExponent) |
| 481 | { |
| 482 | uint64_t uSingleBiasedExponent; |
| 483 | |
| 484 | uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS); |
| 485 | |
| 486 | return uSingleSignificand | |
| 487 | (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) | |
| 488 | (uSingleSign << SINGLE_SIGN_SHIFT); |
| 489 | } |
| 490 | |
| 491 | |
| 492 | /** |
| 493 | * @brief Convert a double-precision float to single-precision. |
| 494 | * |
| 495 | * @param[in] d The value to convert. |
| 496 | * |
| 497 | * @returns Either unconverted value or value converted to single-precision. |
| 498 | * |
| 499 | * This always succeeds. If the value cannot be converted without the |
| 500 | * loss of precision, it is not converted. |
| 501 | * |
| 502 | * This handles all subnormals and NaN payloads. |
| 503 | */ |
| 504 | static IEEE754_union |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 505 | IEEE754_DoubleToSingle(const double d) |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 506 | { |
| 507 | IEEE754_union Result; |
| 508 | int64_t nExponentDifference; |
| 509 | int64_t nShiftAmount; |
| 510 | uint64_t uSingleSignificand; |
| 511 | uint64_t uDroppedBits; |
| 512 | |
| 513 | |
| 514 | /* Pull the three parts out of the double-precision float. Most |
| 515 | * work is done with uint64_t which helps avoid integer promotions |
| 516 | * and static analyzer complaints. |
| 517 | */ |
| 518 | const uint64_t uDouble = CopyDoubleToUint64(d); |
| 519 | const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT; |
| 520 | const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS; |
| 521 | const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT; |
| 522 | const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK; |
| 523 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 524 | if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) { |
| 525 | if(uDoubleSignificand == 0) { |
| 526 | /* --- IS ZERO --- */ |
| 527 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 528 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 529 | 0, |
| 530 | SINGLE_EXPONENT_ZERO); |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 531 | } else { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 532 | /* --- IS DOUBLE SUBNORMAL --- */ |
| 533 | /* The largest double subnormal is slightly less than the |
| 534 | * largest double normal which is 2^-1022 or |
| 535 | * 2.2250738585072014e-308. The smallest single subnormal |
| 536 | * is 2^-149 or 1.401298464324817e-45. There is no |
| 537 | * overlap so double subnormals can't be converted to |
| 538 | * singles of any sort. |
| 539 | */ |
| 540 | Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| 541 | Result.uValue = uDouble; |
| 542 | } |
| 543 | } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) { |
| 544 | if(uDoubleSignificand == 0) { |
| 545 | /* ---- IS INFINITY ---- */ |
| 546 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 547 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 548 | 0, |
| 549 | SINGLE_EXPONENT_INF_OR_NAN); |
| 550 | } else { |
| 551 | /* The NaN can only be converted if no payload bits are |
| 552 | * lost per RFC 8949 section 4.1 that defines Preferred |
| 553 | * Serializaton. Note that Deterministically Encode CBOR |
| 554 | * in section 4.2 allows for some variation of this rule, |
| 555 | * but at the moment this implementation is of Preferred |
| 556 | * Serialization, not CDE. As of December 2023, we are |
| 557 | * also expecting an update to CDE. This code may need to |
| 558 | * be updated for CDE. |
| 559 | */ |
| 560 | uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); |
| 561 | if(uDroppedBits == 0) { |
| 562 | /* --- IS CONVERTABLE NAN --- */ |
| 563 | uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| 564 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 565 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 566 | uSingleSignificand, |
| 567 | SINGLE_EXPONENT_INF_OR_NAN); |
| 568 | } else { |
| 569 | /* --- IS UNCONVERTABLE NAN --- */ |
| 570 | Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| 571 | Result.uValue = uDouble; |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 572 | } |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 573 | } |
| 574 | } else { |
| 575 | /* ---- REGULAR NUMBER ---- */ |
| 576 | /* A regular double can be converted to a regular single if |
| 577 | * the double's exponent is in the smaller range of a single |
| 578 | * and if no precision is lost in the significand. |
| 579 | */ |
| 580 | uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); |
| 581 | if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN && |
| 582 | nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX && |
| 583 | uDroppedBits == 0) { |
| 584 | /* --- IS CONVERTABLE TO SINGLE --- */ |
| 585 | uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| 586 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 587 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 588 | uSingleSignificand, |
| 589 | nDoubleUnbiasedExponent); |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 590 | } else { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 591 | /* Unable to convert to a single normal. See if it can be |
| 592 | * converted to a single subnormal. To do that, the |
| 593 | * exponent must be in range and no precision can be lost |
| 594 | * in the significand.
| 595 | * |
| 596 | * This is more complicated because the number is not |
| 597 | * normalized. The significand must be shifted
| 598 | * proportionally to the exponent and 1 must be added
| 599 | * in. See |
| 600 | * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| 601 | */ |
| 602 | nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN); |
| 603 | nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| 604 | uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; |
| 605 | |
| 606 | if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN && |
| 607 | nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS && |
| 608 | uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) { |
| 609 | /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */ |
| 610 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 611 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 612 | uSingleSignificand, |
| 613 | SINGLE_EXPONENT_ZERO); |
| 614 | } else { |
| 615 | /* --- CAN NOT BE CONVERTED --- */ |
| 616 | Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| 617 | Result.uValue = uDouble; |
| 618 | } |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 619 | } |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 620 | } |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 621 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 622 | return Result; |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 623 | } |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 624 | |
| 625 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 626 | /* Public function; see ieee754.h */ |
| 627 | IEEE754_union |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 628 | IEEE754_DoubleToSmaller(const double d, |
| 629 | const int bAllowHalfPrecision, |
| 630 | const int bNoNanPayload) |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 631 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 632 | IEEE754_union result; |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 633 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 634 | result = IEEE754_DoubleToSingle(d); |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 635 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 636 | if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) { |
| 637 | /* Cast to uint32_t is OK, because value was just successfully |
| 638 | * converted to single. */ |
| 639 | float uSingle = CopyUint32ToSingle((uint32_t)result.uValue); |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 640 | result = IEEE754_SingleToHalf(uSingle, bNoNanPayload); |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 641 | } |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 642 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 643 | return result; |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 644 | } |
| 645 | |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 646 | |
/*
 * Report how many of the low nMax bits of uTarget are "in use",
 * counted from the top of that nMax-bit field down to and including
 * the least significant set bit.
 *
 * The scan starts at bit 0 and moves towards bit nMax - 1. When the
 * first set bit is found at zero-based position p the result is
 * nMax - p. If none of the low nMax bits is set the result is 0.
 * Bits at position nMax and above are never examined.
 *
 * nMax     Width in bits of the field to examine; must be in 0..64.
 * uTarget  The bits to examine.
 *
 * The mask is built from a 64-bit one. A plain unsigned long is only
 * 32 bits wide on some targets and this function is called with
 * nMax of 52, which would make the shift undefined there.
 */
static int
IEEE754_Private_CountNonZeroBits(int nMax, uint64_t uTarget)
{
   int      nNonZeroBitsCount;
   uint64_t uMask;

   for(nNonZeroBitsCount = nMax; nNonZeroBitsCount > 0; nNonZeroBitsCount--) {
      /* Shift distance runs 0, 1, ... nMax - 1 as the count goes down */
      uMask = 1ULL << (nMax - nNonZeroBitsCount);
      if(uMask & uTarget) {
         break;
      }
   }

   return nNonZeroBitsCount;
}
| 661 | |
| 662 | |
| 663 | /* Public function; see ieee754.h */ |
| 664 | struct IEEE754_ToInt |
| 665 | IEEE754_DoubleToInt(const double d) |
| 666 | { |
| 667 | int64_t nNonZeroBitsCount; |
| 668 | struct IEEE754_ToInt Result; |
| 669 | uint64_t uInteger; |
| 670 | |
| 671 | /* Pull the three parts out of the double-precision float. Most |
| 672 | * work is done with uint64_t which helps avoid integer promotions |
| 673 | * and static analyzer complaints. |
| 674 | */ |
| 675 | const uint64_t uDouble = CopyDoubleToUint64(d); |
| 676 | const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT; |
| 677 | /* Cast safe because of mask above; exponents < DOUBLE_EXPONENT_MAX */ |
| 678 | const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS; |
| 679 | const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK; |
| 680 | |
| 681 | if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) { |
| 682 | if(uDoubleSignificand == 0) { |
| 683 | /* --- POSITIVE AND NEGATIVE ZERO --- */ |
| 684 | Result.integer.un_signed = 0; |
| 685 | Result.type = IEEE754_ToInt_IS_UINT; |
| 686 | } else { |
| 687 | /* --- SUBNORMAL --- */ |
| 688 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 689 | } |
| 690 | } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) { |
| 691 | if(uDoubleSignificand != 0) { |
| 692 | /* --- NAN --- */ |
| 693 | Result.type = IEEE754_ToInt_NaN; /* dCBOR doesn't care about payload */ |
| 694 | } else { |
| 695 | /* --- INIFINITY --- */ |
| 696 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 697 | } |
| 698 | } else if(nDoubleUnbiasedExponent < 0 || |
| 699 | (nDoubleUnbiasedExponent >= ((uDouble & DOUBLE_SIGN_MASK) ? 63 : 64))) { |
| 700 | /* --- Exponent out of range --- */ |
| 701 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 702 | } else { |
| 703 | /* Count down from 52 to the number of bits that are not zero in |
| 704 | * the significand. This counts from the least significant bit |
| 705 | * until a non-zero bit is found to know if it is a whole |
| 706 | * number. |
| 707 | * |
| 708 | * Conversion only fails when the input is too large or is not a |
| 709 | * whole number, never because of lack of precision because |
| 710 | * 64-bit integers always have more precision than the 52-bits |
| 711 | * of a double. |
| 712 | */ |
| 713 | nNonZeroBitsCount = IEEE754_Private_CountNonZeroBits(DOUBLE_NUM_SIGNIFICAND_BITS, uDoubleSignificand); |
| 714 | |
| 715 | if(nNonZeroBitsCount && nNonZeroBitsCount > nDoubleUnbiasedExponent) { |
| 716 | /* --- Not a whole number --- */ |
| 717 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 718 | } else { |
| 719 | /* --- CONVERTABLE WHOLE NUMBER --- */ |
| 720 | /* Add in the one that is implied in normal floats */ |
| 721 | uInteger = uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS); |
| 722 | /* Factor in the exponent */ |
| 723 | if(nDoubleUnbiasedExponent < DOUBLE_NUM_SIGNIFICAND_BITS) { |
| 724 | /* Numbers less than 2^52 with up to 52 significant bits */ |
| 725 | uInteger >>= DOUBLE_NUM_SIGNIFICAND_BITS - nDoubleUnbiasedExponent; |
| 726 | } else { |
| 727 | /* Numbers greater than 2^52 with at most 52 significant bits */ |
| 728 | uInteger <<= nDoubleUnbiasedExponent - DOUBLE_NUM_SIGNIFICAND_BITS; |
| 729 | } |
| 730 | if(uDouble & DOUBLE_SIGN_MASK) { |
| 731 | /* Cast safe because exponent range check above */ |
| 732 | Result.integer.is_signed = -((int64_t)uInteger); |
| 733 | Result.type = IEEE754_ToInt_IS_INT; |
| 734 | } else { |
| 735 | Result.integer.un_signed = uInteger; |
| 736 | Result.type = IEEE754_ToInt_IS_UINT; |
| 737 | } |
| 738 | } |
| 739 | } |
| 740 | |
| 741 | return Result; |
| 742 | } |
| 743 | |
| 744 | |
| 745 | /* Public function; see ieee754.h */ |
| 746 | struct IEEE754_ToInt |
| 747 | IEEE754_SingleToInt(const float f) |
| 748 | { |
| 749 | int32_t nNonZeroBitsCount; |
| 750 | struct IEEE754_ToInt Result; |
| 751 | uint64_t uInteger; |
| 752 | |
| 753 | /* Pull the three parts out of the single-precision float. Most |
| 754 | * work is done with uint32_t which helps avoid integer promotions |
| 755 | * and static analyzer complaints. |
| 756 | */ |
| 757 | const uint32_t uSingle = CopyFloatToUint32(f); |
| 758 | const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT; |
| 759 | /* Cast safe because of mask above; exponents < SINGLE_EXPONENT_MAX */ |
| 760 | const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS; |
| 761 | const uint32_t uSingleleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK; |
| 762 | |
| 763 | if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) { |
| 764 | if(uSingleleSignificand == 0 && !(uSingle & SINGLE_SIGN_MASK)) { |
| 765 | /* --- POSITIVE AND NEGATIVE ZERO --- */ |
| 766 | Result.integer.un_signed = 0; |
| 767 | Result.type = IEEE754_ToInt_IS_UINT; |
| 768 | } else { |
| 769 | /* --- Subnormal --- */ |
| 770 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 771 | } |
| 772 | } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) { |
| 773 | /* --- NAN or INFINITY --- */ |
| 774 | if(uSingleleSignificand != 0) { |
| 775 | Result.type = IEEE754_ToInt_NaN; /* dCBOR doesn't care about payload */ |
| 776 | } else { |
| 777 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 778 | } |
| 779 | } else if(nSingleUnbiasedExponent < 0 || |
| 780 | (nSingleUnbiasedExponent >= ((uSingle & SINGLE_SIGN_MASK) ? 63 : 64))) { |
| 781 | /* --- Exponent out of range --- */ |
| 782 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 783 | } else { |
| 784 | /* Count down from 23 to the number of bits that are not zero in |
| 785 | * the significand. This counts from the least significant bit |
| 786 | * until a non-zero bit is found. |
| 787 | * |
| 788 | * Conversion only fails when the input is too large or is not a |
| 789 | * whole number, never because of lack of precision because |
| 790 | * 64-bit integers always have more precision than the 52-bits |
| 791 | * of a double. |
| 792 | */ |
| 793 | nNonZeroBitsCount = IEEE754_Private_CountNonZeroBits(SINGLE_NUM_SIGNIFICAND_BITS, uSingleleSignificand); |
| 794 | |
| 795 | if(nNonZeroBitsCount && nNonZeroBitsCount > nSingleUnbiasedExponent) { |
| 796 | /* --- Not a whole number --- */ |
| 797 | Result.type = IEEE754_ToInt_NO_CONVERSION; |
| 798 | } else { |
| 799 | /* --- CONVERTABLE WHOLE NUMBER --- */ |
| 800 | /* Add in the one that is implied in normal floats */ |
| 801 | uInteger = uSingleleSignificand + (1ULL << SINGLE_NUM_SIGNIFICAND_BITS); |
| 802 | /* Factor in the exponent */ |
| 803 | if(nSingleUnbiasedExponent < SINGLE_NUM_SIGNIFICAND_BITS) { |
| 804 | /* Numbers less than 2^23 with up to 23 significant bits */ |
| 805 | uInteger >>= SINGLE_NUM_SIGNIFICAND_BITS - nSingleUnbiasedExponent; |
| 806 | } else { |
| 807 | /* Numbers greater than 2^23 with at most 23 significant bits*/ |
| 808 | uInteger <<= nSingleUnbiasedExponent - SINGLE_NUM_SIGNIFICAND_BITS; |
| 809 | } |
| 810 | if(uSingle & SINGLE_SIGN_MASK) { |
| 811 | Result.integer.is_signed = -((int64_t)uInteger); |
| 812 | Result.type = IEEE754_ToInt_IS_INT; |
| 813 | } else { |
| 814 | Result.integer.un_signed = uInteger; |
| 815 | Result.type = IEEE754_ToInt_IS_UINT; |
| 816 | } |
| 817 | } |
| 818 | } |
| 819 | |
| 820 | return Result; |
| 821 | } |
Laurence Lundblade | fe09bbf | 2020-07-16 12:14:51 -0700 | [diff] [blame] | 822 | |
Laurence Lundblade | b275cdc | 2020-07-12 12:34:38 -0700 | [diff] [blame] | 823 | #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */ |
Laurence Lundblade | eb3cdef | 2024-02-17 20:38:55 -0800 | [diff] [blame] | 824 | |
| 825 | |
| 826 | |
| 827 | /* Public function; see ieee754.h */ |
| 828 | int |
| 829 | IEEE754_IsNotStandardDoubleNaN(const double d) |
| 830 | { |
| 831 | const uint64_t uDouble = CopyDoubleToUint64(d); |
| 832 | const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT; |
| 833 | /* Cast safe because of mask above; exponents < DOUBLE_EXPONENT_MAX */ |
| 834 | const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS; |
| 835 | const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK; |
| 836 | |
| 837 | if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN && |
| 838 | uDoubleSignificand != 0 && |
| 839 | uDoubleSignificand != DOUBLE_QUIET_NAN_BIT) { |
| 840 | return 1; |
| 841 | } else { |
| 842 | return 0; |
| 843 | } |
| 844 | } |
| 845 | |
| 846 | |
| 847 | /* Public function; see ieee754.h */ |
| 848 | int |
| 849 | IEEE754_IsNotStandardSingleNaN(const float f) |
| 850 | { |
| 851 | const uint32_t uSingle = CopyFloatToUint32(f); |
| 852 | const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT; |
| 853 | /* Cast safe because of mask above; exponents < SINGLE_EXPONENT_MAX */ |
| 854 | const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS; |
| 855 | const uint32_t uSingleleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK; |
| 856 | |
| 857 | if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN && |
| 858 | uSingleleSignificand != 0 && |
| 859 | uSingleleSignificand != SINGLE_QUIET_NAN_BIT) { |
| 860 | return 1; |
| 861 | } else { |
| 862 | return 0; |
| 863 | } |
| 864 | } |