blob: 2d981590dc5cee1f4cd9c2b8028f034571e86e3d [file] [log] [blame]
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -07001/* ==========================================================================
2 * ieee754.c -- floating-point conversion between half, double & single-precision
3 *
4 * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
5 * Copyright (c) 2021, Arm Limited. All rights reserved.
6 *
7 * SPDX-License-Identifier: BSD-3-Clause
8 *
9 * See BSD-3-Clause license in README.md
10 *
11 * Created on 7/23/18
12 * ========================================================================== */
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070013
Máté Tóth-Pálef5f07a2021-09-17 19:31:37 +020014/*
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070015 * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
16 * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
Máté Tóth-Pálef5f07a2021-09-17 19:31:37 +020017 */
18#include "qcbor/qcbor_common.h"
19
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -070020#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
Laurence Lundblade9682a532020-06-06 18:33:04 -070021
Laurence Lundblade12d32c52018-09-19 11:25:27 -070022#include "ieee754.h"
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070023#include <string.h> /* For memcpy() */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070024
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070025
Laurence Lundblade12d32c52018-09-19 11:25:27 -070026/*
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070027 * This code has long lines and is easier to read because of
28 * them. Some coding guidelines prefer 80 column lines (can they not
29 * afford big displays?).
30 *
31 * This code works solely using shifts and masks and thus has no
32 * dependency on any math libraries. It can even work if the CPU
33 * doesn't have any floating-point support, though that isn't the most
34 * useful thing to do.
35 *
36 * The memcpy() dependency is only for CopyFloatToUint32() and friends
37 * which only is needed to avoid type punning when converting the
38 * actual float bits to an unsigned value so the bit shifts and masks
39 * can work.
40 *
41 * The references used to write this code:
42 *
43 * IEEE 754-2008, particularly section 3.6 and 6.2.1
44 *
45 * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
46 *
47 * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
48 *
49 * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
50 *
51 * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
52 *
53 * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not
54 * needed. It can be retrieved from github history if needed.
Laurence Lundblade12d32c52018-09-19 11:25:27 -070055 */
56
57
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070058
59
60/* ----- Half Precsion ----------- */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070061#define HALF_NUM_SIGNIFICAND_BITS (10)
62#define HALF_NUM_EXPONENT_BITS (5)
63#define HALF_NUM_SIGN_BITS (1)
64
65#define HALF_SIGNIFICAND_SHIFT (0)
66#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
67#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
68
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070069#define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits
Laurence Lundblade06350ea2020-01-27 19:32:40 -080070#define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070071#define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
Laurence Lundblade06350ea2020-01-27 19:32:40 -080072#define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
Laurence Lundblade12d32c52018-09-19 11:25:27 -070073
74/* Biased Biased Unbiased Use
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070075 * 0x00 0 -15 0 and subnormal
76 * 0x01 1 -14 Smallest normal exponent
77 * 0x1e 30 15 Largest normal exponent
78 * 0x1F 31 16 NaN and Infinity */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070079#define HALF_EXPONENT_BIAS (15)
80#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
81#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
82#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
83#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
84
85
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070086/* ------ Single-Precision -------- */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070087#define SINGLE_NUM_SIGNIFICAND_BITS (23)
88#define SINGLE_NUM_EXPONENT_BITS (8)
89#define SINGLE_NUM_SIGN_BITS (1)
90
91#define SINGLE_SIGNIFICAND_SHIFT (0)
92#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
93#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
94
Laurence Lundblade06350ea2020-01-27 19:32:40 -080095#define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits
96#define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
97#define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign
98#define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -070099
100/* Biased Biased Unbiased Use
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700101 * 0x0000 0 -127 0 and subnormal
102 * 0x0001 1 -126 Smallest normal exponent
103 * 0x7f 127 0 1
104 * 0xfe 254 127 Largest normal exponent
105 * 0xff 255 128 NaN and Infinity */
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700106#define SINGLE_EXPONENT_BIAS (127)
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700107#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS)
108#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1)
109#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS)
110#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700111
112
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700113/* --------- Double-Precision ---------- */
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700114#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
115#define DOUBLE_NUM_EXPONENT_BITS (11)
116#define DOUBLE_NUM_SIGN_BITS (1)
117
118#define DOUBLE_SIGNIFICAND_SHIFT (0)
119#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
120#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
121
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700122#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
123#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
124#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
125#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
126
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700127
128/* Biased Biased Unbiased Use
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700129 * 0x00000000 0 -1023 0 and subnormal
130 * 0x00000001 1 -1022 Smallest normal exponent
131 * 0x000007fe 2046 1023 Largest normal exponent
132 * 0x000007ff 2047 1024 NaN and Infinity */
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700133#define DOUBLE_EXPONENT_BIAS (1023)
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700134#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS)
135#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1)
136#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS)
137#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1)
138
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700139
140
141
142/*
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700143 * Convenient functions to avoid type punning, compiler warnings and
144 * such. The optimizer reduces them to a simple assignment. This is a
145 * crusty corner of C. It shouldn't be this hard.
146 *
147 * These are also in UsefulBuf.h under a different name. They are copied
148 * here to avoid a dependency on UsefulBuf.h. There is no object code
149 * size impact because these always optimze down to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700150 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700151static inline uint32_t
152CopyFloatToUint32(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700153{
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700154 uint32_t u32;
155 memcpy(&u32, &f, sizeof(uint32_t));
156 return u32;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700157}
158
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700159static inline uint64_t
160CopyDoubleToUint64(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700161{
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700162 uint64_t u64;
163 memcpy(&u64, &d, sizeof(uint64_t));
164 return u64;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700165}
166
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700167static inline double
168CopyUint64ToDouble(uint64_t u64)
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700169{
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700170 double d;
171 memcpy(&d, &u64, sizeof(uint64_t));
172 return d;
173}
174
175static inline float
176CopyUint32ToSingle(uint32_t u32)
177{
178 float f;
179 memcpy(&f, &u32, sizeof(uint32_t));
180 return f;
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700181}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700182
183
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800184
185
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700186/**
187 * @brief Assemble sign, significand and exponent into single precision float.
188 *
189 * @param[in] uDoubleSign 0 if positive, 1 if negative
190 * @pararm[in] uDoubleSignificand Bits of the significand
191 * @param[in] nDoubleUnBiasedExponent Exponent
192 *
193 * This returns the bits for a single-precision float, a binary64
194 * as specified in IEEE754.
Laurence Lundbladefe09bbf2020-07-16 12:14:51 -0700195 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700196static double
197IEEE754_AssembleDouble(uint64_t uDoubleSign,
198 uint64_t uDoubleSignificand,
199 int64_t nDoubleUnBiasedExponent)
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700200{
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700201 uint64_t uDoubleBiasedExponent;
202
203 uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
204
205 return CopyUint64ToDouble(uDoubleSignificand |
206 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
207 (uDoubleSign << DOUBLE_SIGN_SHIFT));
208}
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800209
210
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700211double
212IEEE754_HalfToDouble(uint16_t uHalfPrecision)
213{
214 uint64_t uDoubleSignificand;
215 int64_t nDoubleUnBiasedExponent;
216 double dResult;
217
218 /* Pull out the three parts of the half-precision float. Do all
219 * the work in 64 bits because that is what the end result is. It
220 * may give smaller code size and will keep static analyzers
221 * happier.
222 */
223 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
224 const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT;
225 const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS;
226 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
227
228 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
229 /* 0 or subnormal */
230 if(uHalfSignificand) {
231 /* --- SUBNORMAL --- */
232 /* A half-precision subnormal can always be converted to a
233 * normal double-precision float because the ranges line up.
234 * The exponent of a subnormal starts out at the min exponent
235 * for a normal. As the sub normal significand bits are
236 * shifted, left to normalize, the exponent is
237 * decremented. Shifting continues until fully normalized.
238 */
239 nDoubleUnBiasedExponent = HALF_EXPONENT_MIN;
240 uDoubleSignificand = uHalfSignificand;
241 do {
242 uDoubleSignificand <<= 1;
243 nDoubleUnBiasedExponent--;
244 } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0);
245 /* A normal has an implied 1 in the most significant
246 * position that a subnormal doesn't. */
247 uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS;
248 /* Must shift into place for a double significand */
249 uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
250
251 dResult = IEEE754_AssembleDouble(uHalfSign,
252 uDoubleSignificand,
253 nDoubleUnBiasedExponent);
254 } else {
255 /* --- ZERO --- */
256 dResult = IEEE754_AssembleDouble(uHalfSign,
257 0,
258 DOUBLE_EXPONENT_ZERO);
259 }
260 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
261 /* NaN or Inifinity */
262 if(uHalfSignificand) {
263 /* --- NaN --- */
264 /* Half-precision payloads always fit into double precision
265 * payloads. They are shifted left the same as a normal
266 * number significand.
267 */
268 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
269 dResult = IEEE754_AssembleDouble(uHalfSign,
270 uDoubleSignificand,
271 DOUBLE_EXPONENT_INF_OR_NAN);
272 } else {
273 /* --- INFINITY --- */
274 dResult = IEEE754_AssembleDouble(uHalfSign,
275 0,
276 DOUBLE_EXPONENT_INF_OR_NAN);
277 }
278 } else {
279 /* --- NORMAL NUMBER --- */
280 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
281 dResult = IEEE754_AssembleDouble(uHalfSign,
282 uDoubleSignificand,
283 nHalfUnBiasedExponent);
284 }
285
286 return dResult;
287}
288
289
290/**
291 * @brief Assemble sign, significand and exponent into single precision float.
292 *
293 * @param[in] uHalfSign 0 if positive, 1 if negative
294 * @pararm[in] uHalfSignificand Bits of the significand
295 * @param[in] nHalfUnBiasedExponent Exponent
296 *
297 * This returns the bits for a single-precision float, a binary32 as
298 * specified in IEEE754. It is returned as a uint64_t rather than a
299 * uint32_t or a float for convenience of usage.
300 */
301static uint32_t
302IEEE754_AssembleHalf(uint32_t uHalfSign,
303 uint32_t uHalfSignificand,
304 int32_t nHalfUnBiasedExponent)
305{
306 uint32_t uHalfUnbiasedExponent;
307
308 uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS);
309
310 return uHalfSignificand |
311 (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) |
312 (uHalfSign << HALF_SIGN_SHIFT);
313}
314
315
316/* Public function; see ieee754.h */
317IEEE754_union
318IEEE754_SingleToHalf(float f)
319{
320 IEEE754_union result;
321 uint32_t uDroppedBits;
322 int32_t nExponentDifference;
323 int32_t nShiftAmount;
324 uint32_t uHalfSignificand;
325
326 /* Pull the three parts out of the double-precision float Most work
327 * is done with uint32_t which helps avoid integer promotions and
328 * static analyzer complaints.
329 */
330 const uint32_t uSingle = CopyFloatToUint32(f);
331 const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT;
332 const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS;
333 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
334 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
335
336 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
337 if(uSingleSignificand == 0) {
338 /* --- IS ZERO --- */
339 result.uSize = IEEE754_UNION_IS_HALF;
340 result.uValue = IEEE754_AssembleHalf(uSingleSign,
341 0,
342 HALF_EXPONENT_ZERO);
343 } else {
344 /* --- IS SINGLE SUBNORMAL --- */
345 /* The largest single subnormal is slightly less than the
346 * largest single normal which is 2^-149 or
347 * 2.2040517676619426e-38. The smallest half subnormal is
348 * 2^-14 or 5.9604644775390625E-8. There is no overlap so
349 * single subnormals can't be converted to halfs of any sort.
350 */
351 result.uSize = IEEE754_UNION_IS_SINGLE;
352 result.uValue = uSingle;
353 }
354 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
355 if(uSingleSignificand == 0) {
356 /* ---- IS INFINITY ---- */
357 result.uSize = IEEE754_UNION_IS_HALF;
358 result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN);
359 } else {
360 /* The NaN can only be converted if no payload bits are lost
361 * per RFC 8949 section 4.1 that defines Preferred
362 * Serializaton. Note that Deterministically Encode CBOR in
363 * section 4.2 allows for some variation of this rule, but at
364 * the moment this implementation is of Preferred
365 * Serialization, not CDE. As of December 2023, we are also
366 * expecting an update to CDE. This code may need to be
367 * updated for CDE.
368 */
369 uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS);
370 if(uDroppedBits == 0) {
371 /* --- IS CONVERTABLE NAN --- */
372 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
373 result.uSize = IEEE754_UNION_IS_HALF;
374 result.uValue = IEEE754_AssembleHalf(uSingleSign,
375 uHalfSignificand,
376 HALF_EXPONENT_INF_OR_NAN);
377
378 } else {
379 /* --- IS UNCONVERTABLE NAN --- */
380 result.uSize = IEEE754_UNION_IS_SINGLE;
381 result.uValue = uSingle;
382 }
383 }
384 } else {
385 /* ---- REGULAR NUMBER ---- */
386 /* A regular single can be converted to a regular half if the
387 * single's exponent is in the smaller range of a half and if no
388 * precision is lost in the significand.
389 */
390 if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN &&
391 nSingleUnbiasedExponent <= HALF_EXPONENT_MAX &&
392 (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) {
393 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
394
395 /* --- CONVERT TO HALF NORMAL --- */
396 result.uSize = IEEE754_UNION_IS_HALF;
397 result.uValue = IEEE754_AssembleHalf(uSingleSign,
398 uHalfSignificand,
399 nSingleUnbiasedExponent);
400 } else {
401 /* Unable to convert to a half normal. See if it can be
402 * converted to a half subnormal. To do that, the exponent
403 * must be in range and no precision can be lost in the
404 * signficand.
405 *
406 * This is more complicated because the number is not
407 * normalized. The signficand must be shifted proprotionally
408 * to the exponent and 1 must be added in. See
409 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
410 *
411 * Exponents -14 to -24 map to a shift of 0 to 10 of the
412 * significand. The largest value of a half subnormal has an
413 * exponent of -14. Subnormals are not normalized like
414 * normals meaning they lose precision as the numbers get
415 * smaller. Normals don't lose precision because the exponent
416 * allows all the bits of the significand to be significant.
417 */
418 /* The exponent of the largest possible half-precision
419 * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger
420 * than this are normal and handled above. We're going to
421 * shift the significand right by at least this amount.
422 */
423 nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
424
425 /* In addition to the shift based on the exponent's value,
426 * the single significand has to be shifted right to fit into
427 * a half-precision significand */
428 nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
429
430 /* Must add 1 in to the possible significand because there is
431 * an implied 1 for normal values and not for subnormal
432 * values. See equations here:
433 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
434 */
435 uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
436
437 /* If only zero bits get shifted out, this can be converted
438 * to subnormal */
439 if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN &&
440 nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS &&
441 uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) {
442 /* --- CONVERTABLE TO HALF SUBNORMAL --- */
443 result.uSize = IEEE754_UNION_IS_HALF;
444 result.uValue = IEEE754_AssembleHalf(uSingleSign,
445 uHalfSignificand,
446 HALF_EXPONENT_ZERO);
447 } else {
448 /* --- DO NOT CONVERT --- */
449 result.uSize = IEEE754_UNION_IS_SINGLE;
450 result.uValue = uSingle;
451 }
452 }
453 }
454
455 return result;
456}
457
458
459/**
460 * @brief Assemble sign, significand and exponent into single precision float.
461 *
462 * @param[in] uSingleSign 0 if positive, 1 if negative
463 * @pararm[in] uSingleSignificand Bits of the significand
464 * @param[in] nSingleUnBiasedExponent Exponent
465 *
466 * This returns the bits for a single-precision float, a binary32 as
467 * specified in IEEE754. It is returned as a uint64_t rather than a
468 * uint32_t or a float for convenience of usage.
469 */
470static uint64_t
471IEEE754_AssembleSingle(uint64_t uSingleSign,
472 uint64_t uSingleSignificand,
473 int64_t nSingleUnBiasedExponent)
474{
475 uint64_t uSingleBiasedExponent;
476
477 uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS);
478
479 return uSingleSignificand |
480 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
481 (uSingleSign << SINGLE_SIGN_SHIFT);
482}
483
484
485/**
486 * @brief Convert a double-precision float to single-precision.
487 *
488 * @param[in] d The value to convert.
489 *
490 * @returns Either unconverted value or value converted to single-precision.
491 *
492 * This always succeeds. If the value cannot be converted without the
493 * loss of precision, it is not converted.
494 *
495 * This handles all subnormals and NaN payloads.
496 */
497static IEEE754_union
498IEEE754_DoubleToSingle(double d)
499{
500 IEEE754_union Result;
501 int64_t nExponentDifference;
502 int64_t nShiftAmount;
503 uint64_t uSingleSignificand;
504 uint64_t uDroppedBits;
505
506
507 /* Pull the three parts out of the double-precision float. Most
508 * work is done with uint64_t which helps avoid integer promotions
509 * and static analyzer complaints.
510 */
511 const uint64_t uDouble = CopyDoubleToUint64(d);
512 const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT;
513 const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS;
514 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
515 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
516
517
518 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
519 if(uDoubleSignificand == 0) {
520 /* --- IS ZERO --- */
521 Result.uSize = IEEE754_UNION_IS_SINGLE;
522 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
523 0,
524 SINGLE_EXPONENT_ZERO);
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700525 } else {
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700526 /* --- IS DOUBLE SUBNORMAL --- */
527 /* The largest double subnormal is slightly less than the
528 * largest double normal which is 2^-1022 or
529 * 2.2250738585072014e-308. The smallest single subnormal
530 * is 2^-149 or 1.401298464324817e-45. There is no
531 * overlap so double subnormals can't be converted to
532 * singles of any sort.
533 */
534 Result.uSize = IEEE754_UNION_IS_DOUBLE;
535 Result.uValue = uDouble;
536 }
537 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
538 if(uDoubleSignificand == 0) {
539 /* ---- IS INFINITY ---- */
540 Result.uSize = IEEE754_UNION_IS_SINGLE;
541 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
542 0,
543 SINGLE_EXPONENT_INF_OR_NAN);
544 } else {
545 /* The NaN can only be converted if no payload bits are
546 * lost per RFC 8949 section 4.1 that defines Preferred
547 * Serializaton. Note that Deterministically Encode CBOR
548 * in section 4.2 allows for some variation of this rule,
549 * but at the moment this implementation is of Preferred
550 * Serialization, not CDE. As of December 2023, we are
551 * also expecting an update to CDE. This code may need to
552 * be updated for CDE.
553 */
554 uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
555 if(uDroppedBits == 0) {
556 /* --- IS CONVERTABLE NAN --- */
557 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
558 Result.uSize = IEEE754_UNION_IS_SINGLE;
559 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
560 uSingleSignificand,
561 SINGLE_EXPONENT_INF_OR_NAN);
562 } else {
563 /* --- IS UNCONVERTABLE NAN --- */
564 Result.uSize = IEEE754_UNION_IS_DOUBLE;
565 Result.uValue = uDouble;
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700566 }
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700567 }
568 } else {
569 /* ---- REGULAR NUMBER ---- */
570 /* A regular double can be converted to a regular single if
571 * the double's exponent is in the smaller range of a single
572 * and if no precision is lost in the significand.
573 */
574 uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
575 if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN &&
576 nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX &&
577 uDroppedBits == 0) {
578 /* --- IS CONVERTABLE TO SINGLE --- */
579 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
580 Result.uSize = IEEE754_UNION_IS_SINGLE;
581 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
582 uSingleSignificand,
583 nDoubleUnbiasedExponent);
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700584 } else {
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700585 /* Unable to convert to a single normal. See if it can be
586 * converted to a single subnormal. To do that, the
587 * exponent must be in range and no precision can be lost
588 * in the signficand.
589 *
590 * This is more complicated because the number is not
591 * normalized. The signficand must be shifted
592 * proprotionally to the exponent and 1 must be added
593 * in. See
594 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
595 */
596 nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN);
597 nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
598 uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
599
600 if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN &&
601 nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS &&
602 uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) {
603 /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */
604 Result.uSize = IEEE754_UNION_IS_SINGLE;
605 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
606 uSingleSignificand,
607 SINGLE_EXPONENT_ZERO);
608 } else {
609 /* --- CAN NOT BE CONVERTED --- */
610 Result.uSize = IEEE754_UNION_IS_DOUBLE;
611 Result.uValue = uDouble;
612 }
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700613 }
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700614 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800615
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700616 return Result;
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700617}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700618
619
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700620/* Public function; see ieee754.h */
621IEEE754_union
622IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700623{
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700624 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800625
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700626 result = IEEE754_DoubleToSingle(d);
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800627
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700628 if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) {
629 /* Cast to uint32_t is OK, because value was just successfully
630 * converted to single. */
631 float uSingle = CopyUint32ToSingle((uint32_t)result.uValue);
632 result = IEEE754_SingleToHalf(uSingle);
633 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700634
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700635 return result;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700636}
637
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800638
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700639#else /* QCBOR_DISABLE_PREFERRED_FLOAT */
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800640
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700641int ieee754_dummy_place_holder;
Laurence Lundbladefe09bbf2020-07-16 12:14:51 -0700642
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -0700643#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */