blob: 226282a07191b2bf6b2147f9cf0756cc335b026b [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
Laurence Lundbladeee851742020-01-08 08:37:05 -08002 ieee754.c -- floating-point conversion between half, double & single-precision
Laurence Lundblade035bd782019-01-21 17:01:31 -08003
Laurence Lundbladeee851742020-01-08 08:37:05 -08004 Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -08005
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08006 SPDX-License-Identifier: BSD-3-Clause
Laurence Lundblade035bd782019-01-21 17:01:31 -08007
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08008 See BSD-3-Clause license in README.md
Laurence Lundblade035bd782019-01-21 17:01:31 -08009
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -080010 Created on 7/23/18
Laurence Lundbladeee851742020-01-08 08:37:05 -080011 =============================================================================*/
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070012
Laurence Lundblade9682a532020-06-06 18:33:04 -070013#ifndef QCBOR_CONFIG_DISABLE_ENCODE_IEEE754
14
Laurence Lundblade12d32c52018-09-19 11:25:27 -070015#include "ieee754.h"
16#include <string.h> // For memcpy()
17
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070018
Laurence Lundblade12d32c52018-09-19 11:25:27 -070019/*
Laurence Lundbladeee851742020-01-08 08:37:05 -080020 This code is written for clarity and verifiability, not for size, on
21 the assumption that the optimizer will do a good job. The LLVM
22 optimizer, -Os, does seem to do the job and the resulting object code
23 is smaller from combining code for the many different cases (normal,
Laurence Lundbladec5fef682020-01-25 11:38:45 -080024 subnormal, infinity, zero...) for the conversions. GCC is no where near
25 as good.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080026
Laurence Lundbladeee851742020-01-08 08:37:05 -080027 This code has really long lines and is much easier to read because of
28 them. Some coding guidelines prefer 80 column lines (can they not afford
29 big displays?). It would make this code much worse even to wrap at 120
30 columns.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080031
Laurence Lundbladeee851742020-01-08 08:37:05 -080032 Dead stripping is also really helpful to get code size down when
Laurence Lundbladec5fef682020-01-25 11:38:45 -080033 floating-point encoding is not needed. (If this is put in a library
34 and linking is against the library, then dead stripping is automatic).
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080035
Laurence Lundbladeee851742020-01-08 08:37:05 -080036 This code works solely using shifts and masks and thus has no
37 dependency on any math libraries. It can even work if the CPU doesn't
38 have any floating-point support, though that isn't the most useful
39 thing to do.
40
41 The memcpy() dependency is only for CopyFloatToUint32() and friends
42 which only is needed to avoid type punning when converting the actual
43 float bits to an unsigned value so the bit shifts and masks can work.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070044 */
45
46/*
47 The references used to write this code:
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080048
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070049 - IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080050
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070051 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080052
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070053 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundbladec5fef682020-01-25 11:38:45 -080054
55 - https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
56
57 - https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
Laurence Lundblade12d32c52018-09-19 11:25:27 -070058 */
59
60
61// ----- Half Precsion -----------
62#define HALF_NUM_SIGNIFICAND_BITS (10)
63#define HALF_NUM_EXPONENT_BITS (5)
64#define HALF_NUM_SIGN_BITS (1)
65
66#define HALF_SIGNIFICAND_SHIFT (0)
67#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
68#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
69
Laurence Lundblade06350ea2020-01-27 19:32:40 -080070#define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits // 0x03ff
71#define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
72#define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // // 0x8000 1 bit of sign
73#define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
Laurence Lundblade12d32c52018-09-19 11:25:27 -070074
75/* Biased Biased Unbiased Use
76 0x00 0 -15 0 and subnormal
77 0x01 1 -14 Smallest normal exponent
78 0x1e 30 15 Largest normal exponent
79 0x1F 31 16 NaN and Infinity */
80#define HALF_EXPONENT_BIAS (15)
81#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
82#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
83#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
84#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
85
86
Laurence Lundbladeee851742020-01-08 08:37:05 -080087// ------ Single-Precision --------
Laurence Lundblade12d32c52018-09-19 11:25:27 -070088#define SINGLE_NUM_SIGNIFICAND_BITS (23)
89#define SINGLE_NUM_EXPONENT_BITS (8)
90#define SINGLE_NUM_SIGN_BITS (1)
91
92#define SINGLE_SIGNIFICAND_SHIFT (0)
93#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
94#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
95
Laurence Lundblade06350ea2020-01-27 19:32:40 -080096#define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits
97#define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
98#define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign
99#define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700100
101/* Biased Biased Unbiased Use
102 0x0000 0 -127 0 and subnormal
103 0x0001 1 -126 Smallest normal exponent
104 0x7f 127 0 1
105 0xfe 254 127 Largest normal exponent
106 0xff 255 128 NaN and Infinity */
107#define SINGLE_EXPONENT_BIAS (127)
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800108#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700109#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
110#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800111#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700112
113
Laurence Lundbladeee851742020-01-08 08:37:05 -0800114// --------- Double-Precision ----------
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700115#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
116#define DOUBLE_NUM_EXPONENT_BITS (11)
117#define DOUBLE_NUM_SIGN_BITS (1)
118
119#define DOUBLE_SIGNIFICAND_SHIFT (0)
120#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
121#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
122
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700123#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
124#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
125#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
126#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
127
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700128
129/* Biased Biased Unbiased Use
130 0x00000000 0 -1023 0 and subnormal
131 0x00000001 1 -1022 Smallest normal exponent
132 0x000007fe 2046 1023 Largest normal exponent
133 0x000007ff 2047 1024 NaN and Infinity */
134#define DOUBLE_EXPONENT_BIAS (1023)
135#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
136#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
137#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
138#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
139
140
141
142/*
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800143 Convenient functions to avoid type punning, compiler warnings and
144 such. The optimizer reduces them to a simple assignment. This is a
145 crusty corner of C. It shouldn't be this hard.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800146
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700147 These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800148 here to avoid a dependency on UsefulBuf.h. There is no object code
149 size impact because these always optimze down to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700150 */
151static inline uint32_t CopyFloatToUint32(float f)
152{
153 uint32_t u32;
154 memcpy(&u32, &f, sizeof(uint32_t));
155 return u32;
156}
157
158static inline uint64_t CopyDoubleToUint64(double d)
159{
160 uint64_t u64;
161 memcpy(&u64, &d, sizeof(uint64_t));
162 return u64;
163}
164
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700165static inline float CopyUint32ToFloat(uint32_t u32)
166{
167 float f;
168 memcpy(&f, &u32, sizeof(uint32_t));
169 return f;
170}
171
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700172static inline double CopyUint64ToDouble(uint64_t u64)
173{
174 double d;
175 memcpy(&d, &u64, sizeof(uint64_t));
176 return d;
177}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700178
179
180// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700181uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700182{
183 // Pull the three parts out of the single-precision float
184 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800185 const int32_t nSingleUnbiasedExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
186 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
187 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800188
189
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700190 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800191
192 // All works is done on uint32_t with conversion to uint16_t at the end.
193 // This avoids integer promotions that static analyzers complain about and
194 // reduces code size.
195 uint32_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
196
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700197 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
198 // +/- Infinity and NaNs -- single biased exponent is 0xff
199 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
200 if(!uSingleSignificand) {
201 // Infinity
202 uHalfSignificand = 0;
203 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800204 // Copy the LSBs of the NaN payload that will fit from the single to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700205 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
206 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
207 // It's a qNaN; copy the qNaN bit
208 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700209 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800210 // It's an sNaN; make sure the significand is not zero so it stays a NaN
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700211 // This is needed because not all significand bits are copied from single
212 if(!uHalfSignificand) {
213 // Set the LSB. This is what wikipedia shows for sNAN.
214 uHalfSignificand |= 0x01;
215 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700216 }
217 }
218 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
219 // 0 or a subnormal number -- singled biased exponent is 0
220 uHalfBiasedExponent = 0;
221 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
222 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
223 // Exponent is too large to express in half-precision; round up to infinity
224 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
225 uHalfSignificand = 0;
226 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
227 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800228 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700229 // Difference between single normal exponent and the base exponent of a half subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800230 const uint32_t uExpDiff = (uint32_t)-(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700231 // Also have to shift the significand by the difference in number of bits between a single and a half significand
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800232 const uint32_t uSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700233 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundblade06350ea2020-01-27 19:32:40 -0800234 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01U << SINGLE_NUM_SIGNIFICAND_BITS);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800235 uHalfSignificand = uSingleSignificandSubnormal >> (uExpDiff + uSignificandBitsDiff);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700236 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800237 // The normal case, exponent is in range for half-precision
238 uHalfBiasedExponent = (uint32_t)(nSingleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700239 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
240 }
241 uHalfSign = uSingleSign;
242
243 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800244 const uint32_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700245 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
246 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800247 // Cast is safe because all the masks and shifts above work to make
248 // a half precision value which is only 16 bits.
249 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700250}
251
252
253// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700254uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700255{
256 // Pull the three parts out of the double-precision float
257 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800258 const int64_t nDoubleUnbiasedExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
259 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
260 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800261
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700262 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800263
264 // All works is done on uint64_t with conversion to uint16_t at the end.
265 // This avoids integer promotions that static analyzers complain about.
266 // Other options are for these to be unsigned int or fast_int16_t. Code
267 // size doesn't vary much between all these options for 64-bit LLVM,
268 // 64-bit GCC and 32-bit Armv7 LLVM.
269 uint64_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
270
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700271 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
272 // +/- Infinity and NaNs -- single biased exponent is 0xff
273 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
274 if(!uDoubleSignificand) {
275 // Infinity
276 uHalfSignificand = 0;
277 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800278 // Copy the LSBs of the NaN payload that will fit from the double to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700279 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
280 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
281 // It's a qNaN; copy the qNaN bit
282 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700283 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700284 // It's an sNaN; make sure the significand is not zero so it stays a NaN
285 // This is needed because not all significand bits are copied from single
286 if(!uHalfSignificand) {
287 // Set the LSB. This is what wikipedia shows for sNAN.
288 uHalfSignificand |= 0x01;
289 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700290 }
291 }
292 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
293 // 0 or a subnormal number -- double biased exponent is 0
294 uHalfBiasedExponent = 0;
295 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
296 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
297 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
298 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
299 uHalfSignificand = 0;
300 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
301 // Exponent is too small to express in half-precision; round down to zero
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800302 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700303 // Difference between double normal exponent and the base exponent of a half subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800304 const uint64_t uExpDiff = (uint64_t)-(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700305 // Also have to shift the significand by the difference in number of bits between a double and a half significand
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800306 const uint64_t uSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700307 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundbladef2801952018-12-17 10:40:29 -0800308 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800309 uHalfSignificand = uDoubleSignificandSubnormal >> (uExpDiff + uSignificandBitsDiff);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700310 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800311 // The normal case, exponent is in range for half-precision
312 uHalfBiasedExponent = (uint32_t)(nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700313 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
314 }
315 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800316
317
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700318 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800319 const uint64_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700320 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
321 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800322 // Cast is safe because all the masks and shifts above work to make
323 // a half precision value which is only 16 bits.
324 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700325}
326
327
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800328
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700329// Public function; see ieee754.h
330float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
331{
332 // Pull out the three parts of the half-precision float
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800333 // Do all the work in 32 bits because that is what the end result is
334 // may give smaller code size and will keep static analyzers happier.
335 const uint32_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
336 const int32_t nHalfUnBiasedExponent = (int32_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
337 const uint32_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800338
339
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700340 // Make the three parts of the single-precision number
341 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
342 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
343 // 0 or subnormal
344 if(uHalfSignificand) {
345 // Subnormal case
346 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
347 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
348 uSingleSignificand = uHalfSignificand;
349 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
350 do {
351 uSingleSignificand <<= 1;
352 uSingleBiasedExponent--;
353 } while ((uSingleSignificand & 0x400) == 0);
354 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
355 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
356 } else {
357 // Just zero
358 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
359 uSingleSignificand = 0;
360 }
361 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
362 // NaN or Inifinity
363 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
364 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700365 // NaN
366 // First preserve the NaN payload from half to single
367 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
368 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
369 // Next, set qNaN if needed since half qNaN bit is not copied above
370 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
371 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700372 } else {
373 // Infinity
374 uSingleSignificand = 0;
375 }
376 } else {
377 // Normal number
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800378 uSingleBiasedExponent = (uint32_t)(nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700379 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
380 }
381 uSingleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800382
Laurence Lundbladeee851742020-01-08 08:37:05 -0800383 // Shift the three parts of the single-precision into place
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700384 const uint32_t uSinglePrecision = uSingleSignificand |
385 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
386 (uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800387
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700388 return CopyUint32ToFloat(uSinglePrecision);
389}
390
391
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700392// Public function; see ieee754.h
393double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
394{
395 // Pull out the three parts of the half-precision float
Laurence Lundblade9682a532020-06-06 18:33:04 -0700396 // Do all the work in 64 bits because that is what the end result is.
397 // It may give smaller code size and will keep static analyzers happier.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800398 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
399 const int64_t nHalfUnBiasedExponent = (int64_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
400 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800401
402
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700403 // Make the three parts of hte single-precision number
404 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
405 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
406 // 0 or subnormal
407 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
408 if(uHalfSignificand) {
409 // Subnormal case
410 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
411 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
412 uDoubleSignificand = uHalfSignificand;
413 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
414 do {
415 uDoubleSignificand <<= 1;
416 uDoubleBiasedExponent--;
417 } while ((uDoubleSignificand & 0x400) == 0);
418 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
419 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
420 } else {
421 // Just zero
422 uDoubleSignificand = 0;
423 }
424 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
425 // NaN or Inifinity
426 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
427 if(uHalfSignificand) {
428 // NaN
429 // First preserve the NaN payload from half to single
430 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
431 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
432 // Next, set qNaN if needed since half qNaN bit is not copied above
433 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
434 }
435 } else {
436 // Infinity
437 uDoubleSignificand = 0;
438 }
439 } else {
440 // Normal number
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800441 uDoubleBiasedExponent = (uint64_t)(nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
442 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700443 }
444 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800445
446
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700447 // Shift the 3 parts into place as a double-precision
448 const uint64_t uDouble = uDoubleSignificand |
449 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
450 (uDoubleSign << DOUBLE_SIGN_SHIFT);
451 return CopyUint64ToDouble(uDouble);
452}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700453
454
455// Public function; see ieee754.h
Laurence Lundblade8fa7d5d2020-07-11 16:30:47 -0700456double IEEE754_FloatToDouble(uint32_t uFloat)
Laurence Lundblade9682a532020-06-06 18:33:04 -0700457{
Laurence Lundblade9682a532020-06-06 18:33:04 -0700458 // Pull out the three parts of the single-precision float
459 // Do all the work in 64 bits because that is what the end result is.
460 // It may give smaller code size and will keep static analyzers happier.
461 const uint64_t uSingleSignificand = uFloat & SINGLE_SIGNIFICAND_MASK;
462 const int64_t nSingleUnBiasedExponent = (int64_t)((uFloat & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
463 const uint64_t uSingleSign = (uFloat & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
464
465
466 // Make the three parts of hte single-precision number
467 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
468 if(nSingleUnBiasedExponent == SINGLE_EXPONENT_ZERO) {
469 // 0 or subnormal
470 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
471 if(uSingleSignificand) {
472 // Subnormal case
473 uDoubleBiasedExponent = -SINGLE_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS + 1;
474 // A single-precision subnormal can always be converted to a normal double-precision float because the ranges line up
475 uDoubleSignificand = uSingleSignificand;
476 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
477 do {
478 uDoubleSignificand <<= 1;
479 uDoubleBiasedExponent--;
480 // TODO: is this right? Where does 0x400 come from?
481 } while ((uDoubleSignificand & 0x400) == 0);
482 uDoubleSignificand &= SINGLE_SIGNIFICAND_MASK;
483 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
484 } else {
485 // Just zero
486 uDoubleSignificand = 0;
487 }
488 } else if(nSingleUnBiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
489 // NaN or Inifinity
490 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
491 if(uSingleSignificand) {
492 // NaN
493 // First preserve the NaN payload from half to single
494 // TODO: check this
495 uDoubleSignificand = uSingleSignificand & ~SINGLE_QUIET_NAN_BIT;
496 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
497 // Next, set qNaN if needed since half qNaN bit is not copied above
498 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
499 }
500 } else {
501 // Infinity
502 uDoubleSignificand = 0;
503 }
504 } else {
505 // Normal number
506 uDoubleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
507 uDoubleSignificand = uSingleSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
508 }
509 uDoubleSign = uSingleSign;
510
511
512 // Shift the 3 parts into place as a double-precision
513 const uint64_t uDouble = uDoubleSignificand |
514 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
515 (uDoubleSign << DOUBLE_SIGN_SHIFT);
516 return CopyUint64ToDouble(uDouble);
517}
518
519
520
521
522// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700523IEEE754_union IEEE754_FloatToSmallest(float f)
524{
525 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800526
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700527 // Pull the neeed two parts out of the single-precision float
528 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800529 const int32_t nSingleExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700530 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800531
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700532 // Bit mask that is the significand bits that would be lost when converting
533 // from single-precision to half-precision
534 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
535
536 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
537 if(uSingle == 0) {
538 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700539 result.uSize = IEEE754_UNION_IS_HALF;
540 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700541 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
542 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700543 result.uSize = IEEE754_UNION_IS_HALF;
544 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700545 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
546 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700547 result.uSize = IEEE754_UNION_IS_HALF;
548 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700549 } else {
550 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700551 result.uSize = IEEE754_UNION_IS_SINGLE;
552 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700553 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800554
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700555 return result;
556}
557
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700558// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700559IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
560{
561 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800562
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700563 // Pull the needed two parts out of the double-precision float
564 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800565 const int64_t nDoubleExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700566 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800567
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700568 // Masks to check whether dropped significand bits are zero or not
569 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
570 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800571
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700572 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700573 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700574 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700575 result.uSize = IEEE754_UNION_IS_HALF;
576 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700577 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
578 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700579 result.uSize = IEEE754_UNION_IS_HALF;
580 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700581 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
582 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700583 result.uSize = IEEE754_UNION_IS_HALF;
584 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700585 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
586 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700587 result.uSize = IEEE754_UNION_IS_SINGLE;
588 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700589 } else {
590 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700591 result.uSize = IEEE754_UNION_IS_DOUBLE;
592 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700593 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800594
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700595 return result;
596}
597
Laurence Lundblade9682a532020-06-06 18:33:04 -0700598#endif /* QCBOR_CONFIG_DISABLE_ENCODE_IEEE754 */