blob: ef0adef0f8c08334bb10b9d2e34d8eea82a68dfa [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
Laurence Lundbladeee851742020-01-08 08:37:05 -08002 ieee754.c -- floating-point conversion between half, double & single-precision
Laurence Lundblade035bd782019-01-21 17:01:31 -08003
Laurence Lundbladeee851742020-01-08 08:37:05 -08004 Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -08005
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08006 SPDX-License-Identifier: BSD-3-Clause
Laurence Lundblade035bd782019-01-21 17:01:31 -08007
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08008 See BSD-3-Clause license in README.md
Laurence Lundblade035bd782019-01-21 17:01:31 -08009
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -080010 Created on 7/23/18
Laurence Lundbladeee851742020-01-08 08:37:05 -080011 =============================================================================*/
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070012
Laurence Lundblade12d32c52018-09-19 11:25:27 -070013#include "ieee754.h"
14#include <string.h> // For memcpy()
15
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070016
Laurence Lundblade12d32c52018-09-19 11:25:27 -070017/*
Laurence Lundbladeee851742020-01-08 08:37:05 -080018 This code is written for clarity and verifiability, not for size, on
19 the assumption that the optimizer will do a good job. The LLVM
20 optimizer, -Os, does seem to do the job and the resulting object code
21 is smaller from combining code for the many different cases (normal,
Laurence Lundbladec5fef682020-01-25 11:38:45 -080022 subnormal, infinity, zero...) for the conversions. GCC is no where near
23 as good.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080024
Laurence Lundbladeee851742020-01-08 08:37:05 -080025 This code has really long lines and is much easier to read because of
26 them. Some coding guidelines prefer 80 column lines (can they not afford
27 big displays?). It would make this code much worse even to wrap at 120
28 columns.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080029
Laurence Lundbladeee851742020-01-08 08:37:05 -080030 Dead stripping is also really helpful to get code size down when
Laurence Lundbladec5fef682020-01-25 11:38:45 -080031 floating-point encoding is not needed. (If this is put in a library
32 and linking is against the library, then dead stripping is automatic).
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080033
Laurence Lundbladeee851742020-01-08 08:37:05 -080034 This code works solely using shifts and masks and thus has no
35 dependency on any math libraries. It can even work if the CPU doesn't
36 have any floating-point support, though that isn't the most useful
37 thing to do.
38
39 The memcpy() dependency is only for CopyFloatToUint32() and friends
40 which only is needed to avoid type punning when converting the actual
41 float bits to an unsigned value so the bit shifts and masks can work.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070042 */
43
44/*
45 The references used to write this code:
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080046
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070047 - IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080048
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070049 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080050
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070051 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundbladec5fef682020-01-25 11:38:45 -080052
53 - https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
54
55 - https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
Laurence Lundblade12d32c52018-09-19 11:25:27 -070056 */
57
58
59// ----- Half Precsion -----------
60#define HALF_NUM_SIGNIFICAND_BITS (10)
61#define HALF_NUM_EXPONENT_BITS (5)
62#define HALF_NUM_SIGN_BITS (1)
63
64#define HALF_SIGNIFICAND_SHIFT (0)
65#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
66#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
67
68#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
69#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
70#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
71#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
72
73/* Biased Biased Unbiased Use
74 0x00 0 -15 0 and subnormal
75 0x01 1 -14 Smallest normal exponent
76 0x1e 30 15 Largest normal exponent
77 0x1F 31 16 NaN and Infinity */
78#define HALF_EXPONENT_BIAS (15)
79#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
80#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
81#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
82#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
83
84
Laurence Lundbladeee851742020-01-08 08:37:05 -080085// ------ Single-Precision --------
Laurence Lundblade12d32c52018-09-19 11:25:27 -070086#define SINGLE_NUM_SIGNIFICAND_BITS (23)
87#define SINGLE_NUM_EXPONENT_BITS (8)
88#define SINGLE_NUM_SIGN_BITS (1)
89
90#define SINGLE_SIGNIFICAND_SHIFT (0)
91#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
92#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
93
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070094#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
95#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
96#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
97#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -070098
99/* Biased Biased Unbiased Use
100 0x0000 0 -127 0 and subnormal
101 0x0001 1 -126 Smallest normal exponent
102 0x7f 127 0 1
103 0xfe 254 127 Largest normal exponent
104 0xff 255 128 NaN and Infinity */
105#define SINGLE_EXPONENT_BIAS (127)
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800106#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700107#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
108#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800109#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700110
111
Laurence Lundbladeee851742020-01-08 08:37:05 -0800112// --------- Double-Precision ----------
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700113#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
114#define DOUBLE_NUM_EXPONENT_BITS (11)
115#define DOUBLE_NUM_SIGN_BITS (1)
116
117#define DOUBLE_SIGNIFICAND_SHIFT (0)
118#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
119#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
120
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700121#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
122#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
123#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
124#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
125
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700126
127/* Biased Biased Unbiased Use
128 0x00000000 0 -1023 0 and subnormal
129 0x00000001 1 -1022 Smallest normal exponent
130 0x000007fe 2046 1023 Largest normal exponent
131 0x000007ff 2047 1024 NaN and Infinity */
132#define DOUBLE_EXPONENT_BIAS (1023)
133#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
134#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
135#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
136#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
137
138
139
140/*
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800141 Convenient functions to avoid type punning, compiler warnings and
142 such. The optimizer reduces them to a simple assignment. This is a
143 crusty corner of C. It shouldn't be this hard.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800144
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700145 These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800146 here to avoid a dependency on UsefulBuf.h. There is no object code
147 size impact because these always optimze down to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700148 */
149static inline uint32_t CopyFloatToUint32(float f)
150{
151 uint32_t u32;
152 memcpy(&u32, &f, sizeof(uint32_t));
153 return u32;
154}
155
156static inline uint64_t CopyDoubleToUint64(double d)
157{
158 uint64_t u64;
159 memcpy(&u64, &d, sizeof(uint64_t));
160 return u64;
161}
162
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700163static inline float CopyUint32ToFloat(uint32_t u32)
164{
165 float f;
166 memcpy(&f, &u32, sizeof(uint32_t));
167 return f;
168}
169
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700170static inline double CopyUint64ToDouble(uint64_t u64)
171{
172 double d;
173 memcpy(&d, &u64, sizeof(uint64_t));
174 return d;
175}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700176
177
178// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700179uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700180{
181 // Pull the three parts out of the single-precision float
182 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800183 const int32_t nSingleUnbiasedExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
184 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
185 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800186
187
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700188 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800189
190 // All works is done on uint32_t with conversion to uint16_t at the end.
191 // This avoids integer promotions that static analyzers complain about and
192 // reduces code size.
193 uint32_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
194
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700195 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
196 // +/- Infinity and NaNs -- single biased exponent is 0xff
197 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
198 if(!uSingleSignificand) {
199 // Infinity
200 uHalfSignificand = 0;
201 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800202 // Copy the LSBs of the NaN payload that will fit from the single to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700203 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
204 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
205 // It's a qNaN; copy the qNaN bit
206 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700207 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800208 // It's an sNaN; make sure the significand is not zero so it stays a NaN
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700209 // This is needed because not all significand bits are copied from single
210 if(!uHalfSignificand) {
211 // Set the LSB. This is what wikipedia shows for sNAN.
212 uHalfSignificand |= 0x01;
213 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700214 }
215 }
216 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
217 // 0 or a subnormal number -- singled biased exponent is 0
218 uHalfBiasedExponent = 0;
219 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
220 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
221 // Exponent is too large to express in half-precision; round up to infinity
222 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
223 uHalfSignificand = 0;
224 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
225 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800226 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700227 // Difference between single normal exponent and the base exponent of a half subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800228 const uint32_t uExpDiff = (uint32_t)-(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700229 // Also have to shift the significand by the difference in number of bits between a single and a half significand
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800230 const uint32_t uSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700231 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800232 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01UL << SINGLE_NUM_SIGNIFICAND_BITS);
233 uHalfSignificand = uSingleSignificandSubnormal >> (uExpDiff + uSignificandBitsDiff);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700234 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800235 // The normal case, exponent is in range for half-precision
236 uHalfBiasedExponent = (uint32_t)(nSingleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700237 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
238 }
239 uHalfSign = uSingleSign;
240
241 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800242 const uint32_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700243 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
244 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800245 // Cast is safe because all the masks and shifts above work to make
246 // a half precision value which is only 16 bits.
247 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700248}
249
250
251// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700252uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700253{
254 // Pull the three parts out of the double-precision float
255 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800256 const int64_t nDoubleUnbiasedExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
257 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
258 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800259
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700260 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800261
262 // All works is done on uint64_t with conversion to uint16_t at the end.
263 // This avoids integer promotions that static analyzers complain about.
264 // Other options are for these to be unsigned int or fast_int16_t. Code
265 // size doesn't vary much between all these options for 64-bit LLVM,
266 // 64-bit GCC and 32-bit Armv7 LLVM.
267 uint64_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
268
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700269 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
270 // +/- Infinity and NaNs -- single biased exponent is 0xff
271 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
272 if(!uDoubleSignificand) {
273 // Infinity
274 uHalfSignificand = 0;
275 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800276 // Copy the LSBs of the NaN payload that will fit from the double to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700277 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
278 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
279 // It's a qNaN; copy the qNaN bit
280 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700281 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700282 // It's an sNaN; make sure the significand is not zero so it stays a NaN
283 // This is needed because not all significand bits are copied from single
284 if(!uHalfSignificand) {
285 // Set the LSB. This is what wikipedia shows for sNAN.
286 uHalfSignificand |= 0x01;
287 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700288 }
289 }
290 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
291 // 0 or a subnormal number -- double biased exponent is 0
292 uHalfBiasedExponent = 0;
293 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
294 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
295 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
296 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
297 uHalfSignificand = 0;
298 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
299 // Exponent is too small to express in half-precision; round down to zero
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800300 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700301 // Difference between double normal exponent and the base exponent of a half subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800302 const uint64_t uExpDiff = (uint64_t)-(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700303 // Also have to shift the significand by the difference in number of bits between a double and a half significand
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800304 const uint64_t uSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700305 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundbladef2801952018-12-17 10:40:29 -0800306 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800307 uHalfSignificand = uDoubleSignificandSubnormal >> (uExpDiff + uSignificandBitsDiff);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700308 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800309 // The normal case, exponent is in range for half-precision
310 uHalfBiasedExponent = (uint32_t)(nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700311 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
312 }
313 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800314
315
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700316 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800317 const uint64_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700318 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
319 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800320 // Cast is safe because all the masks and shifts above work to make
321 // a half precision value which is only 16 bits.
322 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700323}
324
325
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800326
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700327// Public function; see ieee754.h
328float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
329{
330 // Pull out the three parts of the half-precision float
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800331 // Do all the work in 32 bits because that is what the end result is
332 // may give smaller code size and will keep static analyzers happier.
333 const uint32_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
334 const int32_t nHalfUnBiasedExponent = (int32_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
335 const uint32_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800336
337
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700338 // Make the three parts of the single-precision number
339 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
340 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
341 // 0 or subnormal
342 if(uHalfSignificand) {
343 // Subnormal case
344 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
345 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
346 uSingleSignificand = uHalfSignificand;
347 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
348 do {
349 uSingleSignificand <<= 1;
350 uSingleBiasedExponent--;
351 } while ((uSingleSignificand & 0x400) == 0);
352 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
353 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
354 } else {
355 // Just zero
356 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
357 uSingleSignificand = 0;
358 }
359 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
360 // NaN or Inifinity
361 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
362 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700363 // NaN
364 // First preserve the NaN payload from half to single
365 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
366 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
367 // Next, set qNaN if needed since half qNaN bit is not copied above
368 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
369 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700370 } else {
371 // Infinity
372 uSingleSignificand = 0;
373 }
374 } else {
375 // Normal number
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800376 uSingleBiasedExponent = (uint32_t)(nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700377 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
378 }
379 uSingleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800380
Laurence Lundbladeee851742020-01-08 08:37:05 -0800381 // Shift the three parts of the single-precision into place
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700382 const uint32_t uSinglePrecision = uSingleSignificand |
383 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
384 (uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800385
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700386 return CopyUint32ToFloat(uSinglePrecision);
387}
388
389
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700390// Public function; see ieee754.h
391double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
392{
393 // Pull out the three parts of the half-precision float
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800394 // Do all the work in 64 bits because that is what the end result is
395 // may give smaller code size and will keep static analyzers happier.
396 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
397 const int64_t nHalfUnBiasedExponent = (int64_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
398 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800399
400
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700401 // Make the three parts of hte single-precision number
402 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
403 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
404 // 0 or subnormal
405 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
406 if(uHalfSignificand) {
407 // Subnormal case
408 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
409 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
410 uDoubleSignificand = uHalfSignificand;
411 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
412 do {
413 uDoubleSignificand <<= 1;
414 uDoubleBiasedExponent--;
415 } while ((uDoubleSignificand & 0x400) == 0);
416 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
417 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
418 } else {
419 // Just zero
420 uDoubleSignificand = 0;
421 }
422 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
423 // NaN or Inifinity
424 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
425 if(uHalfSignificand) {
426 // NaN
427 // First preserve the NaN payload from half to single
428 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
429 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
430 // Next, set qNaN if needed since half qNaN bit is not copied above
431 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
432 }
433 } else {
434 // Infinity
435 uDoubleSignificand = 0;
436 }
437 } else {
438 // Normal number
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800439 uDoubleBiasedExponent = (uint64_t)(nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
440 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700441 }
442 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800443
444
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700445 // Shift the 3 parts into place as a double-precision
446 const uint64_t uDouble = uDoubleSignificand |
447 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
448 (uDoubleSign << DOUBLE_SIGN_SHIFT);
449 return CopyUint64ToDouble(uDouble);
450}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700451
452
453// Public function; see ieee754.h
454IEEE754_union IEEE754_FloatToSmallest(float f)
455{
456 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800457
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700458 // Pull the neeed two parts out of the single-precision float
459 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800460 const int32_t nSingleExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700461 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800462
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700463 // Bit mask that is the significand bits that would be lost when converting
464 // from single-precision to half-precision
465 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
466
467 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
468 if(uSingle == 0) {
469 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700470 result.uSize = IEEE754_UNION_IS_HALF;
471 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700472 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
473 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700474 result.uSize = IEEE754_UNION_IS_HALF;
475 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700476 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
477 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700478 result.uSize = IEEE754_UNION_IS_HALF;
479 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700480 } else {
481 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700482 result.uSize = IEEE754_UNION_IS_SINGLE;
483 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700484 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800485
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700486 return result;
487}
488
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700489// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700490IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
491{
492 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800493
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700494 // Pull the needed two parts out of the double-precision float
495 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800496 const int64_t nDoubleExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700497 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800498
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700499 // Masks to check whether dropped significand bits are zero or not
500 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
501 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800502
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700503 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700504 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700505 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700506 result.uSize = IEEE754_UNION_IS_HALF;
507 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700508 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
509 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700510 result.uSize = IEEE754_UNION_IS_HALF;
511 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700512 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
513 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700514 result.uSize = IEEE754_UNION_IS_HALF;
515 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700516 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
517 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700518 result.uSize = IEEE754_UNION_IS_SINGLE;
519 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700520 } else {
521 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700522 result.uSize = IEEE754_UNION_IS_DOUBLE;
523 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700524 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800525
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700526 return result;
527}
528