Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 1 | /* ========================================================================== |
| 2 | * ieee754.c -- floating-point conversion between half, double & single-precision |
| 3 | * |
| 4 | * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved. |
| 5 | * Copyright (c) 2021, Arm Limited. All rights reserved. |
| 6 | * |
| 7 | * SPDX-License-Identifier: BSD-3-Clause |
| 8 | * |
| 9 | * See BSD-3-Clause license in README.md |
| 10 | * |
| 11 | * Created on 7/23/18 |
| 12 | * ========================================================================== */ |
Laurence Lundblade | cc2ed34 | 2018-09-22 17:29:55 -0700 | [diff] [blame] | 13 | |
Máté Tóth-Pál | ef5f07a | 2021-09-17 19:31:37 +0200 | [diff] [blame] | 14 | /* |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 15 | * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as |
| 16 | * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h |
Máté Tóth-Pál | ef5f07a | 2021-09-17 19:31:37 +0200 | [diff] [blame] | 17 | */ |
| 18 | #include "qcbor/qcbor_common.h" |
| 19 | |
Laurence Lundblade | b275cdc | 2020-07-12 12:34:38 -0700 | [diff] [blame] | 20 | #ifndef QCBOR_DISABLE_PREFERRED_FLOAT |
Laurence Lundblade | 9682a53 | 2020-06-06 18:33:04 -0700 | [diff] [blame] | 21 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 22 | #include "ieee754.h" |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 23 | #include <string.h> /* For memcpy() */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 24 | |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 25 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 26 | /* |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 27 | * This code has long lines and is easier to read because of |
| 28 | * them. Some coding guidelines prefer 80 column lines (can they not |
| 29 | * afford big displays?). |
| 30 | * |
| 31 | * This code works solely using shifts and masks and thus has no |
| 32 | * dependency on any math libraries. It can even work if the CPU |
| 33 | * doesn't have any floating-point support, though that isn't the most |
| 34 | * useful thing to do. |
| 35 | * |
| 36 | * The memcpy() dependency is only for CopyFloatToUint32() and friends |
| 37 | * which only is needed to avoid type punning when converting the |
| 38 | * actual float bits to an unsigned value so the bit shifts and masks |
| 39 | * can work. |
| 40 | * |
| 41 | * The references used to write this code: |
| 42 | * |
| 43 | * IEEE 754-2008, particularly section 3.6 and 6.2.1 |
| 44 | * |
| 45 | * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages |
| 46 | * |
| 47 | * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values |
| 48 | * |
| 49 | * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules |
| 50 | * |
| 51 | * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be |
| 52 | * |
| 53 | * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not |
| 54 | * needed. It can be retrieved from github history if needed. |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 55 | */ |
| 56 | |
| 57 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 58 | |
| 59 | |
| 60 | /* ----- Half Precsion ----------- */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 61 | #define HALF_NUM_SIGNIFICAND_BITS (10) |
| 62 | #define HALF_NUM_EXPONENT_BITS (5) |
| 63 | #define HALF_NUM_SIGN_BITS (1) |
| 64 | |
| 65 | #define HALF_SIGNIFICAND_SHIFT (0) |
| 66 | #define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS) |
| 67 | #define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS) |
| 68 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 69 | #define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits |
Laurence Lundblade | 06350ea | 2020-01-27 19:32:40 -0800 | [diff] [blame] | 70 | #define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 71 | #define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign |
Laurence Lundblade | 06350ea | 2020-01-27 19:32:40 -0800 | [diff] [blame] | 72 | #define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200 |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 73 | |
| 74 | /* Biased Biased Unbiased Use |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 75 | * 0x00 0 -15 0 and subnormal |
| 76 | * 0x01 1 -14 Smallest normal exponent |
| 77 | * 0x1e 30 15 Largest normal exponent |
| 78 | * 0x1F 31 16 NaN and Infinity */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 79 | #define HALF_EXPONENT_BIAS (15) |
| 80 | #define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased |
| 81 | #define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased |
| 82 | #define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased |
| 83 | #define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased |
| 84 | |
| 85 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 86 | /* ------ Single-Precision -------- */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 87 | #define SINGLE_NUM_SIGNIFICAND_BITS (23) |
| 88 | #define SINGLE_NUM_EXPONENT_BITS (8) |
| 89 | #define SINGLE_NUM_SIGN_BITS (1) |
| 90 | |
| 91 | #define SINGLE_SIGNIFICAND_SHIFT (0) |
| 92 | #define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS) |
| 93 | #define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS) |
| 94 | |
Laurence Lundblade | 06350ea | 2020-01-27 19:32:40 -0800 | [diff] [blame] | 95 | #define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits |
| 96 | #define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent |
| 97 | #define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign |
| 98 | #define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1)) |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 99 | |
| 100 | /* Biased Biased Unbiased Use |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 101 | * 0x0000 0 -127 0 and subnormal |
| 102 | * 0x0001 1 -126 Smallest normal exponent |
| 103 | * 0x7f 127 0 1 |
| 104 | * 0xfe 254 127 Largest normal exponent |
| 105 | * 0xff 255 128 NaN and Infinity */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 106 | #define SINGLE_EXPONENT_BIAS (127) |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 107 | #define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) |
| 108 | #define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) |
| 109 | #define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) |
| 110 | #define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 111 | |
| 112 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 113 | /* --------- Double-Precision ---------- */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 114 | #define DOUBLE_NUM_SIGNIFICAND_BITS (52) |
| 115 | #define DOUBLE_NUM_EXPONENT_BITS (11) |
| 116 | #define DOUBLE_NUM_SIGN_BITS (1) |
| 117 | |
| 118 | #define DOUBLE_SIGNIFICAND_SHIFT (0) |
| 119 | #define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS) |
| 120 | #define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS) |
| 121 | |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 122 | #define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits |
| 123 | #define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent |
| 124 | #define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign |
| 125 | #define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1)) |
| 126 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 127 | |
| 128 | /* Biased Biased Unbiased Use |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 129 | * 0x00000000 0 -1023 0 and subnormal |
| 130 | * 0x00000001 1 -1022 Smallest normal exponent |
| 131 | * 0x000007fe 2046 1023 Largest normal exponent |
| 132 | * 0x000007ff 2047 1024 NaN and Infinity */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 133 | #define DOUBLE_EXPONENT_BIAS (1023) |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 134 | #define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) |
| 135 | #define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) |
| 136 | #define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) |
| 137 | #define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) |
| 138 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 139 | |
| 140 | |
| 141 | |
| 142 | /* |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 143 | * Convenient functions to avoid type punning, compiler warnings and |
| 144 | * such. The optimizer reduces them to a simple assignment. This is a |
| 145 | * crusty corner of C. It shouldn't be this hard. |
| 146 | * |
| 147 | * These are also in UsefulBuf.h under a different name. They are copied |
| 148 | * here to avoid a dependency on UsefulBuf.h. There is no object code |
| 149 | * size impact because these always optimze down to a simple assignment. |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 150 | */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 151 | static inline uint32_t |
| 152 | CopyFloatToUint32(float f) |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 153 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 154 | uint32_t u32; |
| 155 | memcpy(&u32, &f, sizeof(uint32_t)); |
| 156 | return u32; |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 157 | } |
| 158 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 159 | static inline uint64_t |
| 160 | CopyDoubleToUint64(double d) |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 161 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 162 | uint64_t u64; |
| 163 | memcpy(&u64, &d, sizeof(uint64_t)); |
| 164 | return u64; |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 165 | } |
| 166 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 167 | static inline double |
| 168 | CopyUint64ToDouble(uint64_t u64) |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 169 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 170 | double d; |
| 171 | memcpy(&d, &u64, sizeof(uint64_t)); |
| 172 | return d; |
| 173 | } |
| 174 | |
| 175 | static inline float |
| 176 | CopyUint32ToSingle(uint32_t u32) |
| 177 | { |
| 178 | float f; |
| 179 | memcpy(&f, &u32, sizeof(uint32_t)); |
| 180 | return f; |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 181 | } |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 182 | |
| 183 | |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 184 | |
| 185 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 186 | /** |
| 187 | * @brief Assemble sign, significand and exponent into single precision float. |
| 188 | * |
| 189 | * @param[in] uDoubleSign 0 if positive, 1 if negative |
| 190 | * @pararm[in] uDoubleSignificand Bits of the significand |
| 191 | * @param[in] nDoubleUnBiasedExponent Exponent |
| 192 | * |
| 193 | * This returns the bits for a single-precision float, a binary64 |
| 194 | * as specified in IEEE754. |
Laurence Lundblade | fe09bbf | 2020-07-16 12:14:51 -0700 | [diff] [blame] | 195 | */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 196 | static double |
| 197 | IEEE754_AssembleDouble(uint64_t uDoubleSign, |
| 198 | uint64_t uDoubleSignificand, |
| 199 | int64_t nDoubleUnBiasedExponent) |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 200 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 201 | uint64_t uDoubleBiasedExponent; |
| 202 | |
| 203 | uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS); |
| 204 | |
| 205 | return CopyUint64ToDouble(uDoubleSignificand | |
| 206 | (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) | |
| 207 | (uDoubleSign << DOUBLE_SIGN_SHIFT)); |
| 208 | } |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 209 | |
| 210 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 211 | double |
| 212 | IEEE754_HalfToDouble(uint16_t uHalfPrecision) |
| 213 | { |
| 214 | uint64_t uDoubleSignificand; |
| 215 | int64_t nDoubleUnBiasedExponent; |
| 216 | double dResult; |
| 217 | |
| 218 | /* Pull out the three parts of the half-precision float. Do all |
| 219 | * the work in 64 bits because that is what the end result is. It |
| 220 | * may give smaller code size and will keep static analyzers |
| 221 | * happier. |
| 222 | */ |
| 223 | const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK; |
| 224 | const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT; |
| 225 | const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS; |
| 226 | const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT; |
| 227 | |
| 228 | if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) { |
| 229 | /* 0 or subnormal */ |
| 230 | if(uHalfSignificand) { |
| 231 | /* --- SUBNORMAL --- */ |
| 232 | /* A half-precision subnormal can always be converted to a |
| 233 | * normal double-precision float because the ranges line up. |
| 234 | * The exponent of a subnormal starts out at the min exponent |
| 235 | * for a normal. As the sub normal significand bits are |
| 236 | * shifted, left to normalize, the exponent is |
| 237 | * decremented. Shifting continues until fully normalized. |
| 238 | */ |
| 239 | nDoubleUnBiasedExponent = HALF_EXPONENT_MIN; |
| 240 | uDoubleSignificand = uHalfSignificand; |
| 241 | do { |
| 242 | uDoubleSignificand <<= 1; |
| 243 | nDoubleUnBiasedExponent--; |
| 244 | } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0); |
| 245 | /* A normal has an implied 1 in the most significant |
| 246 | * position that a subnormal doesn't. */ |
| 247 | uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS; |
| 248 | /* Must shift into place for a double significand */ |
| 249 | uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS; |
| 250 | |
| 251 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 252 | uDoubleSignificand, |
| 253 | nDoubleUnBiasedExponent); |
| 254 | } else { |
| 255 | /* --- ZERO --- */ |
| 256 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 257 | 0, |
| 258 | DOUBLE_EXPONENT_ZERO); |
| 259 | } |
| 260 | } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) { |
| 261 | /* NaN or Inifinity */ |
| 262 | if(uHalfSignificand) { |
| 263 | /* --- NaN --- */ |
| 264 | /* Half-precision payloads always fit into double precision |
| 265 | * payloads. They are shifted left the same as a normal |
| 266 | * number significand. |
| 267 | */ |
| 268 | uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 269 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 270 | uDoubleSignificand, |
| 271 | DOUBLE_EXPONENT_INF_OR_NAN); |
| 272 | } else { |
| 273 | /* --- INFINITY --- */ |
| 274 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 275 | 0, |
| 276 | DOUBLE_EXPONENT_INF_OR_NAN); |
| 277 | } |
| 278 | } else { |
| 279 | /* --- NORMAL NUMBER --- */ |
| 280 | uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 281 | dResult = IEEE754_AssembleDouble(uHalfSign, |
| 282 | uDoubleSignificand, |
| 283 | nHalfUnBiasedExponent); |
| 284 | } |
| 285 | |
| 286 | return dResult; |
| 287 | } |
| 288 | |
| 289 | |
| 290 | /** |
| 291 | * @brief Assemble sign, significand and exponent into single precision float. |
| 292 | * |
| 293 | * @param[in] uHalfSign 0 if positive, 1 if negative |
| 294 | * @pararm[in] uHalfSignificand Bits of the significand |
| 295 | * @param[in] nHalfUnBiasedExponent Exponent |
| 296 | * |
| 297 | * This returns the bits for a single-precision float, a binary32 as |
| 298 | * specified in IEEE754. It is returned as a uint64_t rather than a |
| 299 | * uint32_t or a float for convenience of usage. |
| 300 | */ |
| 301 | static uint32_t |
| 302 | IEEE754_AssembleHalf(uint32_t uHalfSign, |
| 303 | uint32_t uHalfSignificand, |
| 304 | int32_t nHalfUnBiasedExponent) |
| 305 | { |
| 306 | uint32_t uHalfUnbiasedExponent; |
| 307 | |
| 308 | uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS); |
| 309 | |
| 310 | return uHalfSignificand | |
| 311 | (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) | |
| 312 | (uHalfSign << HALF_SIGN_SHIFT); |
| 313 | } |
| 314 | |
| 315 | |
| 316 | /* Public function; see ieee754.h */ |
| 317 | IEEE754_union |
| 318 | IEEE754_SingleToHalf(float f) |
| 319 | { |
| 320 | IEEE754_union result; |
| 321 | uint32_t uDroppedBits; |
| 322 | int32_t nExponentDifference; |
| 323 | int32_t nShiftAmount; |
| 324 | uint32_t uHalfSignificand; |
| 325 | |
| 326 | /* Pull the three parts out of the double-precision float Most work |
| 327 | * is done with uint32_t which helps avoid integer promotions and |
| 328 | * static analyzer complaints. |
| 329 | */ |
| 330 | const uint32_t uSingle = CopyFloatToUint32(f); |
| 331 | const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT; |
| 332 | const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS; |
| 333 | const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK; |
| 334 | const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT; |
| 335 | |
| 336 | if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) { |
| 337 | if(uSingleSignificand == 0) { |
| 338 | /* --- IS ZERO --- */ |
| 339 | result.uSize = IEEE754_UNION_IS_HALF; |
| 340 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 341 | 0, |
| 342 | HALF_EXPONENT_ZERO); |
| 343 | } else { |
| 344 | /* --- IS SINGLE SUBNORMAL --- */ |
| 345 | /* The largest single subnormal is slightly less than the |
| 346 | * largest single normal which is 2^-149 or |
| 347 | * 2.2040517676619426e-38. The smallest half subnormal is |
| 348 | * 2^-14 or 5.9604644775390625E-8. There is no overlap so |
| 349 | * single subnormals can't be converted to halfs of any sort. |
| 350 | */ |
| 351 | result.uSize = IEEE754_UNION_IS_SINGLE; |
| 352 | result.uValue = uSingle; |
| 353 | } |
| 354 | } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) { |
| 355 | if(uSingleSignificand == 0) { |
| 356 | /* ---- IS INFINITY ---- */ |
| 357 | result.uSize = IEEE754_UNION_IS_HALF; |
| 358 | result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN); |
| 359 | } else { |
| 360 | /* The NaN can only be converted if no payload bits are lost |
| 361 | * per RFC 8949 section 4.1 that defines Preferred |
| 362 | * Serializaton. Note that Deterministically Encode CBOR in |
| 363 | * section 4.2 allows for some variation of this rule, but at |
| 364 | * the moment this implementation is of Preferred |
| 365 | * Serialization, not CDE. As of December 2023, we are also |
| 366 | * expecting an update to CDE. This code may need to be |
| 367 | * updated for CDE. |
| 368 | */ |
| 369 | uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS); |
| 370 | if(uDroppedBits == 0) { |
| 371 | /* --- IS CONVERTABLE NAN --- */ |
| 372 | uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 373 | result.uSize = IEEE754_UNION_IS_HALF; |
| 374 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 375 | uHalfSignificand, |
| 376 | HALF_EXPONENT_INF_OR_NAN); |
| 377 | |
| 378 | } else { |
| 379 | /* --- IS UNCONVERTABLE NAN --- */ |
| 380 | result.uSize = IEEE754_UNION_IS_SINGLE; |
| 381 | result.uValue = uSingle; |
| 382 | } |
| 383 | } |
| 384 | } else { |
| 385 | /* ---- REGULAR NUMBER ---- */ |
| 386 | /* A regular single can be converted to a regular half if the |
| 387 | * single's exponent is in the smaller range of a half and if no |
| 388 | * precision is lost in the significand. |
| 389 | */ |
| 390 | if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN && |
| 391 | nSingleUnbiasedExponent <= HALF_EXPONENT_MAX && |
| 392 | (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) { |
| 393 | uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 394 | |
| 395 | /* --- CONVERT TO HALF NORMAL --- */ |
| 396 | result.uSize = IEEE754_UNION_IS_HALF; |
| 397 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 398 | uHalfSignificand, |
| 399 | nSingleUnbiasedExponent); |
| 400 | } else { |
| 401 | /* Unable to convert to a half normal. See if it can be |
| 402 | * converted to a half subnormal. To do that, the exponent |
| 403 | * must be in range and no precision can be lost in the |
| 404 | * signficand. |
| 405 | * |
| 406 | * This is more complicated because the number is not |
| 407 | * normalized. The signficand must be shifted proprotionally |
| 408 | * to the exponent and 1 must be added in. See |
| 409 | * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| 410 | * |
| 411 | * Exponents -14 to -24 map to a shift of 0 to 10 of the |
| 412 | * significand. The largest value of a half subnormal has an |
| 413 | * exponent of -14. Subnormals are not normalized like |
| 414 | * normals meaning they lose precision as the numbers get |
| 415 | * smaller. Normals don't lose precision because the exponent |
| 416 | * allows all the bits of the significand to be significant. |
| 417 | */ |
| 418 | /* The exponent of the largest possible half-precision |
| 419 | * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger |
| 420 | * than this are normal and handled above. We're going to |
| 421 | * shift the significand right by at least this amount. |
| 422 | */ |
| 423 | nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN); |
| 424 | |
| 425 | /* In addition to the shift based on the exponent's value, |
| 426 | * the single significand has to be shifted right to fit into |
| 427 | * a half-precision significand */ |
| 428 | nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); |
| 429 | |
| 430 | /* Must add 1 in to the possible significand because there is |
| 431 | * an implied 1 for normal values and not for subnormal |
| 432 | * values. See equations here: |
| 433 | * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| 434 | */ |
| 435 | uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; |
| 436 | |
| 437 | /* If only zero bits get shifted out, this can be converted |
| 438 | * to subnormal */ |
| 439 | if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN && |
| 440 | nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS && |
| 441 | uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) { |
| 442 | /* --- CONVERTABLE TO HALF SUBNORMAL --- */ |
| 443 | result.uSize = IEEE754_UNION_IS_HALF; |
| 444 | result.uValue = IEEE754_AssembleHalf(uSingleSign, |
| 445 | uHalfSignificand, |
| 446 | HALF_EXPONENT_ZERO); |
| 447 | } else { |
| 448 | /* --- DO NOT CONVERT --- */ |
| 449 | result.uSize = IEEE754_UNION_IS_SINGLE; |
| 450 | result.uValue = uSingle; |
| 451 | } |
| 452 | } |
| 453 | } |
| 454 | |
| 455 | return result; |
| 456 | } |
| 457 | |
| 458 | |
| 459 | /** |
| 460 | * @brief Assemble sign, significand and exponent into single precision float. |
| 461 | * |
| 462 | * @param[in] uSingleSign 0 if positive, 1 if negative |
| 463 | * @pararm[in] uSingleSignificand Bits of the significand |
| 464 | * @param[in] nSingleUnBiasedExponent Exponent |
| 465 | * |
| 466 | * This returns the bits for a single-precision float, a binary32 as |
| 467 | * specified in IEEE754. It is returned as a uint64_t rather than a |
| 468 | * uint32_t or a float for convenience of usage. |
| 469 | */ |
| 470 | static uint64_t |
| 471 | IEEE754_AssembleSingle(uint64_t uSingleSign, |
| 472 | uint64_t uSingleSignificand, |
| 473 | int64_t nSingleUnBiasedExponent) |
| 474 | { |
| 475 | uint64_t uSingleBiasedExponent; |
| 476 | |
| 477 | uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS); |
| 478 | |
| 479 | return uSingleSignificand | |
| 480 | (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) | |
| 481 | (uSingleSign << SINGLE_SIGN_SHIFT); |
| 482 | } |
| 483 | |
| 484 | |
| 485 | /** |
| 486 | * @brief Convert a double-precision float to single-precision. |
| 487 | * |
| 488 | * @param[in] d The value to convert. |
| 489 | * |
| 490 | * @returns Either unconverted value or value converted to single-precision. |
| 491 | * |
| 492 | * This always succeeds. If the value cannot be converted without the |
| 493 | * loss of precision, it is not converted. |
| 494 | * |
| 495 | * This handles all subnormals and NaN payloads. |
| 496 | */ |
| 497 | static IEEE754_union |
| 498 | IEEE754_DoubleToSingle(double d) |
| 499 | { |
| 500 | IEEE754_union Result; |
| 501 | int64_t nExponentDifference; |
| 502 | int64_t nShiftAmount; |
| 503 | uint64_t uSingleSignificand; |
| 504 | uint64_t uDroppedBits; |
| 505 | |
| 506 | |
| 507 | /* Pull the three parts out of the double-precision float. Most |
| 508 | * work is done with uint64_t which helps avoid integer promotions |
| 509 | * and static analyzer complaints. |
| 510 | */ |
| 511 | const uint64_t uDouble = CopyDoubleToUint64(d); |
| 512 | const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT; |
| 513 | const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS; |
| 514 | const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT; |
| 515 | const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK; |
| 516 | |
| 517 | |
| 518 | if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) { |
| 519 | if(uDoubleSignificand == 0) { |
| 520 | /* --- IS ZERO --- */ |
| 521 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 522 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 523 | 0, |
| 524 | SINGLE_EXPONENT_ZERO); |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 525 | } else { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 526 | /* --- IS DOUBLE SUBNORMAL --- */ |
| 527 | /* The largest double subnormal is slightly less than the |
| 528 | * largest double normal which is 2^-1022 or |
| 529 | * 2.2250738585072014e-308. The smallest single subnormal |
| 530 | * is 2^-149 or 1.401298464324817e-45. There is no |
| 531 | * overlap so double subnormals can't be converted to |
| 532 | * singles of any sort. |
| 533 | */ |
| 534 | Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| 535 | Result.uValue = uDouble; |
| 536 | } |
| 537 | } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) { |
| 538 | if(uDoubleSignificand == 0) { |
| 539 | /* ---- IS INFINITY ---- */ |
| 540 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 541 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 542 | 0, |
| 543 | SINGLE_EXPONENT_INF_OR_NAN); |
| 544 | } else { |
| 545 | /* The NaN can only be converted if no payload bits are |
| 546 | * lost per RFC 8949 section 4.1 that defines Preferred |
| 547 | * Serializaton. Note that Deterministically Encode CBOR |
| 548 | * in section 4.2 allows for some variation of this rule, |
| 549 | * but at the moment this implementation is of Preferred |
| 550 | * Serialization, not CDE. As of December 2023, we are |
| 551 | * also expecting an update to CDE. This code may need to |
| 552 | * be updated for CDE. |
| 553 | */ |
| 554 | uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); |
| 555 | if(uDroppedBits == 0) { |
| 556 | /* --- IS CONVERTABLE NAN --- */ |
| 557 | uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| 558 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 559 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 560 | uSingleSignificand, |
| 561 | SINGLE_EXPONENT_INF_OR_NAN); |
| 562 | } else { |
| 563 | /* --- IS UNCONVERTABLE NAN --- */ |
| 564 | Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| 565 | Result.uValue = uDouble; |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 566 | } |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 567 | } |
| 568 | } else { |
| 569 | /* ---- REGULAR NUMBER ---- */ |
| 570 | /* A regular double can be converted to a regular single if |
| 571 | * the double's exponent is in the smaller range of a single |
| 572 | * and if no precision is lost in the significand. |
| 573 | */ |
| 574 | uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); |
| 575 | if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN && |
| 576 | nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX && |
| 577 | uDroppedBits == 0) { |
| 578 | /* --- IS CONVERTABLE TO SINGLE --- */ |
| 579 | uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| 580 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 581 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 582 | uSingleSignificand, |
| 583 | nDoubleUnbiasedExponent); |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 584 | } else { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 585 | /* Unable to convert to a single normal. See if it can be |
| 586 | * converted to a single subnormal. To do that, the |
| 587 | * exponent must be in range and no precision can be lost |
| 588 | * in the signficand. |
| 589 | * |
| 590 | * This is more complicated because the number is not |
| 591 | * normalized. The signficand must be shifted |
| 592 | * proprotionally to the exponent and 1 must be added |
| 593 | * in. See |
| 594 | * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding |
| 595 | */ |
| 596 | nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN); |
| 597 | nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); |
| 598 | uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; |
| 599 | |
| 600 | if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN && |
| 601 | nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS && |
| 602 | uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) { |
| 603 | /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */ |
| 604 | Result.uSize = IEEE754_UNION_IS_SINGLE; |
| 605 | Result.uValue = IEEE754_AssembleSingle(uDoubleSign, |
| 606 | uSingleSignificand, |
| 607 | SINGLE_EXPONENT_ZERO); |
| 608 | } else { |
| 609 | /* --- CAN NOT BE CONVERTED --- */ |
| 610 | Result.uSize = IEEE754_UNION_IS_DOUBLE; |
| 611 | Result.uValue = uDouble; |
| 612 | } |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 613 | } |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 614 | } |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 615 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 616 | return Result; |
Laurence Lundblade | 67bd551 | 2018-11-02 21:44:06 +0700 | [diff] [blame] | 617 | } |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 618 | |
| 619 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 620 | /* Public function; see ieee754.h */ |
| 621 | IEEE754_union |
| 622 | IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision) |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 623 | { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 624 | IEEE754_union result; |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 625 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 626 | result = IEEE754_DoubleToSingle(d); |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 627 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 628 | if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) { |
| 629 | /* Cast to uint32_t is OK, because value was just successfully |
| 630 | * converted to single. */ |
| 631 | float uSingle = CopyUint32ToSingle((uint32_t)result.uValue); |
| 632 | result = IEEE754_SingleToHalf(uSingle); |
| 633 | } |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 634 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 635 | return result; |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 636 | } |
| 637 | |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 638 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 639 | #else /* QCBOR_DISABLE_PREFERRED_FLOAT */ |
Laurence Lundblade | 3aee3a3 | 2018-12-17 16:17:45 -0800 | [diff] [blame] | 640 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame^] | 641 | int ieee754_dummy_place_holder; |
Laurence Lundblade | fe09bbf | 2020-07-16 12:14:51 -0700 | [diff] [blame] | 642 | |
Laurence Lundblade | b275cdc | 2020-07-12 12:34:38 -0700 | [diff] [blame] | 643 | #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */ |