blob: ea44e1dbda3a7367fc42122e3adcc35a521c089b [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
Laurence Lundbladeee851742020-01-08 08:37:05 -08002 ieee754.c -- floating-point conversion between half, double & single-precision
Laurence Lundblade035bd782019-01-21 17:01:31 -08003
Laurence Lundbladeee851742020-01-08 08:37:05 -08004 Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -08005
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08006 SPDX-License-Identifier: BSD-3-Clause
Laurence Lundblade035bd782019-01-21 17:01:31 -08007
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08008 See BSD-3-Clause license in README.md
Laurence Lundblade035bd782019-01-21 17:01:31 -08009
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -080010 Created on 7/23/18
Laurence Lundbladeee851742020-01-08 08:37:05 -080011 =============================================================================*/
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070012
Laurence Lundblade12d32c52018-09-19 11:25:27 -070013#include "ieee754.h"
14#include <string.h> // For memcpy()
15
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070016
Laurence Lundblade12d32c52018-09-19 11:25:27 -070017/*
Laurence Lundbladeee851742020-01-08 08:37:05 -080018 This code is written for clarity and verifiability, not for size, on
19 the assumption that the optimizer will do a good job. The LLVM
20 optimizer, -Os, does seem to do the job and the resulting object code
21 is smaller from combining code for the many different cases (normal,
22 subnormal, infinity, zero...) for the conversions.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080023
Laurence Lundbladeee851742020-01-08 08:37:05 -080024 This code has really long lines and is much easier to read because of
25 them. Some coding guidelines prefer 80 column lines (can they not afford
26 big displays?). It would make this code much worse even to wrap at 120
27 columns.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080028
Laurence Lundbladeee851742020-01-08 08:37:05 -080029 Dead stripping is also really helpful to get code size down when
30 floating-point encoding is not needed.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080031
Laurence Lundbladeee851742020-01-08 08:37:05 -080032 This code works solely using shifts and masks and thus has no
33 dependency on any math libraries. It can even work if the CPU doesn't
34 have any floating-point support, though that isn't the most useful
35 thing to do.
36
37 The memcpy() dependency is only for CopyFloatToUint32() and friends
38 which only is needed to avoid type punning when converting the actual
39 float bits to an unsigned value so the bit shifts and masks can work.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070040 */
41
42/*
43 The references used to write this code:
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080044
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070045 - IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080046
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070047 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080048
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070049 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade12d32c52018-09-19 11:25:27 -070050 */
51
52
53// ----- Half Precsion -----------
54#define HALF_NUM_SIGNIFICAND_BITS (10)
55#define HALF_NUM_EXPONENT_BITS (5)
56#define HALF_NUM_SIGN_BITS (1)
57
58#define HALF_SIGNIFICAND_SHIFT (0)
59#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
60#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
61
62#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
63#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
64#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
65#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
66
67/* Biased Biased Unbiased Use
68 0x00 0 -15 0 and subnormal
69 0x01 1 -14 Smallest normal exponent
70 0x1e 30 15 Largest normal exponent
71 0x1F 31 16 NaN and Infinity */
72#define HALF_EXPONENT_BIAS (15)
73#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
74#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
75#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
76#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
77
78
Laurence Lundbladeee851742020-01-08 08:37:05 -080079// ------ Single-Precision --------
Laurence Lundblade12d32c52018-09-19 11:25:27 -070080#define SINGLE_NUM_SIGNIFICAND_BITS (23)
81#define SINGLE_NUM_EXPONENT_BITS (8)
82#define SINGLE_NUM_SIGN_BITS (1)
83
84#define SINGLE_SIGNIFICAND_SHIFT (0)
85#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
86#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
87
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070088#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
89#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
90#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
91#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -070092
93/* Biased Biased Unbiased Use
94 0x0000 0 -127 0 and subnormal
95 0x0001 1 -126 Smallest normal exponent
96 0x7f 127 0 1
97 0xfe 254 127 Largest normal exponent
98 0xff 255 128 NaN and Infinity */
99#define SINGLE_EXPONENT_BIAS (127)
100#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
101#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
102#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
103#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
104
105
Laurence Lundbladeee851742020-01-08 08:37:05 -0800106// --------- Double-Precision ----------
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700107#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
108#define DOUBLE_NUM_EXPONENT_BITS (11)
109#define DOUBLE_NUM_SIGN_BITS (1)
110
111#define DOUBLE_SIGNIFICAND_SHIFT (0)
112#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
113#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
114
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700115#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
116#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
117#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
118#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
119
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700120
121/* Biased Biased Unbiased Use
122 0x00000000 0 -1023 0 and subnormal
123 0x00000001 1 -1022 Smallest normal exponent
124 0x000007fe 2046 1023 Largest normal exponent
125 0x000007ff 2047 1024 NaN and Infinity */
126#define DOUBLE_EXPONENT_BIAS (1023)
127#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
128#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
129#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
130#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
131
132
133
134/*
135 Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700136 The optimizer reduces them to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700137 This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800138
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700139 These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundblade3df8c7e2018-11-02 13:12:41 +0700140 here to avoid a dependency on UsefulBuf.h. There is no
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700141 object code size impact because these always optimze down to a
142 simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700143 */
144static inline uint32_t CopyFloatToUint32(float f)
145{
146 uint32_t u32;
147 memcpy(&u32, &f, sizeof(uint32_t));
148 return u32;
149}
150
151static inline uint64_t CopyDoubleToUint64(double d)
152{
153 uint64_t u64;
154 memcpy(&u64, &d, sizeof(uint64_t));
155 return u64;
156}
157
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700158static inline float CopyUint32ToFloat(uint32_t u32)
159{
160 float f;
161 memcpy(&f, &u32, sizeof(uint32_t));
162 return f;
163}
164
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700165static inline double CopyUint64ToDouble(uint64_t u64)
166{
167 double d;
168 memcpy(&d, &u64, sizeof(uint64_t));
169 return d;
170}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700171
172
173// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700174uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700175{
176 // Pull the three parts out of the single-precision float
177 const uint32_t uSingle = CopyFloatToUint32(f);
178 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
179 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
180 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800181
182
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700183 // Now convert the three parts to half-precision.
184 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
185 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
186 // +/- Infinity and NaNs -- single biased exponent is 0xff
187 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
188 if(!uSingleSignificand) {
189 // Infinity
190 uHalfSignificand = 0;
191 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700192 // Copy the LBSs of the NaN payload that will fit from the single to the half
193 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
194 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
195 // It's a qNaN; copy the qNaN bit
196 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700197 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700198 // It's a sNaN; make sure the significand is not zero so it stays a NaN
199 // This is needed because not all significand bits are copied from single
200 if(!uHalfSignificand) {
201 // Set the LSB. This is what wikipedia shows for sNAN.
202 uHalfSignificand |= 0x01;
203 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700204 }
205 }
206 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
207 // 0 or a subnormal number -- singled biased exponent is 0
208 uHalfBiasedExponent = 0;
209 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
210 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
211 // Exponent is too large to express in half-precision; round up to infinity
212 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
213 uHalfSignificand = 0;
214 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
215 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
216 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
217 // Difference between single normal exponent and the base exponent of a half subnormal
218 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
219 // Also have to shift the significand by the difference in number of bits between a single and a half significand
220 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
221 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
222 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
223 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
224 } else {
225 // The normal case
226 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
227 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
228 }
229 uHalfSign = uSingleSign;
230
231 // Put the 3 values in the right place for a half precision
232 const uint16_t uHalfPrecision = uHalfSignificand |
233 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
234 (uHalfSign << HALF_SIGN_SHIFT);
235 return uHalfPrecision;
236}
237
238
239// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700240uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700241{
242 // Pull the three parts out of the double-precision float
243 const uint64_t uDouble = CopyDoubleToUint64(d);
244 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
245 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
246 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
247
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800248
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700249 // Now convert the three parts to half-precision.
250 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
251 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
252 // +/- Infinity and NaNs -- single biased exponent is 0xff
253 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
254 if(!uDoubleSignificand) {
255 // Infinity
256 uHalfSignificand = 0;
257 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700258 // Copy the LBSs of the NaN payload that will fit from the double to the half
259 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
260 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
261 // It's a qNaN; copy the qNaN bit
262 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700263 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700264 // It's an sNaN; make sure the significand is not zero so it stays a NaN
265 // This is needed because not all significand bits are copied from single
266 if(!uHalfSignificand) {
267 // Set the LSB. This is what wikipedia shows for sNAN.
268 uHalfSignificand |= 0x01;
269 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700270 }
271 }
272 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
273 // 0 or a subnormal number -- double biased exponent is 0
274 uHalfBiasedExponent = 0;
275 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
276 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
277 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
278 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
279 uHalfSignificand = 0;
280 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
281 // Exponent is too small to express in half-precision; round down to zero
282 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
283 // Difference between double normal exponent and the base exponent of a half subnormal
284 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
285 // Also have to shift the significand by the difference in number of bits between a double and a half significand
286 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
287 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundbladef2801952018-12-17 10:40:29 -0800288 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700289 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
290 } else {
291 // The normal case
292 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
293 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
294 }
295 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800296
297
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700298 // Put the 3 values in the right place for a half precision
299 const uint16_t uHalfPrecision = uHalfSignificand |
300 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
301 (uHalfSign << HALF_SIGN_SHIFT);
302 return uHalfPrecision;
303}
304
305
306// Public function; see ieee754.h
307float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
308{
309 // Pull out the three parts of the half-precision float
310 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
311 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
312 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800313
314
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700315 // Make the three parts of the single-precision number
316 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
317 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
318 // 0 or subnormal
319 if(uHalfSignificand) {
320 // Subnormal case
321 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
322 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
323 uSingleSignificand = uHalfSignificand;
324 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
325 do {
326 uSingleSignificand <<= 1;
327 uSingleBiasedExponent--;
328 } while ((uSingleSignificand & 0x400) == 0);
329 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
330 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
331 } else {
332 // Just zero
333 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
334 uSingleSignificand = 0;
335 }
336 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
337 // NaN or Inifinity
338 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
339 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700340 // NaN
341 // First preserve the NaN payload from half to single
342 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
343 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
344 // Next, set qNaN if needed since half qNaN bit is not copied above
345 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
346 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700347 } else {
348 // Infinity
349 uSingleSignificand = 0;
350 }
351 } else {
352 // Normal number
353 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
354 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
355 }
356 uSingleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800357
358
Laurence Lundbladeee851742020-01-08 08:37:05 -0800359 // Shift the three parts of the single-precision into place
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700360 const uint32_t uSinglePrecision = uSingleSignificand |
361 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
362 (uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800363
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700364 return CopyUint32ToFloat(uSinglePrecision);
365}
366
367
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700368// Public function; see ieee754.h
369double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
370{
371 // Pull out the three parts of the half-precision float
372 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
373 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
374 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800375
376
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700377 // Make the three parts of hte single-precision number
378 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
379 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
380 // 0 or subnormal
381 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
382 if(uHalfSignificand) {
383 // Subnormal case
384 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
385 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
386 uDoubleSignificand = uHalfSignificand;
387 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
388 do {
389 uDoubleSignificand <<= 1;
390 uDoubleBiasedExponent--;
391 } while ((uDoubleSignificand & 0x400) == 0);
392 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
393 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
394 } else {
395 // Just zero
396 uDoubleSignificand = 0;
397 }
398 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
399 // NaN or Inifinity
400 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
401 if(uHalfSignificand) {
402 // NaN
403 // First preserve the NaN payload from half to single
404 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
405 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
406 // Next, set qNaN if needed since half qNaN bit is not copied above
407 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
408 }
409 } else {
410 // Infinity
411 uDoubleSignificand = 0;
412 }
413 } else {
414 // Normal number
415 uDoubleBiasedExponent = nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS;
416 uDoubleSignificand = (uint64_t)uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
417 }
418 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800419
420
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700421 // Shift the 3 parts into place as a double-precision
422 const uint64_t uDouble = uDoubleSignificand |
423 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
424 (uDoubleSign << DOUBLE_SIGN_SHIFT);
425 return CopyUint64ToDouble(uDouble);
426}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700427
428
429// Public function; see ieee754.h
430IEEE754_union IEEE754_FloatToSmallest(float f)
431{
432 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800433
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700434 // Pull the neeed two parts out of the single-precision float
435 const uint32_t uSingle = CopyFloatToUint32(f);
436 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
437 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800438
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700439 // Bit mask that is the significand bits that would be lost when converting
440 // from single-precision to half-precision
441 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
442
443 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
444 if(uSingle == 0) {
445 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700446 result.uSize = IEEE754_UNION_IS_HALF;
447 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700448 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
449 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700450 result.uSize = IEEE754_UNION_IS_HALF;
451 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700452 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
453 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700454 result.uSize = IEEE754_UNION_IS_HALF;
455 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700456 } else {
457 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700458 result.uSize = IEEE754_UNION_IS_SINGLE;
459 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700460 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800461
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700462 return result;
463}
464
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700465// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700466IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
467{
468 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800469
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700470 // Pull the needed two parts out of the double-precision float
471 const uint64_t uDouble = CopyDoubleToUint64(d);
472 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
473 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800474
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700475 // Masks to check whether dropped significand bits are zero or not
476 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
477 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800478
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700479 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700480 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700481 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700482 result.uSize = IEEE754_UNION_IS_HALF;
483 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700484 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
485 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700486 result.uSize = IEEE754_UNION_IS_HALF;
487 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700488 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
489 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700490 result.uSize = IEEE754_UNION_IS_HALF;
491 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700492 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
493 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700494 result.uSize = IEEE754_UNION_IS_SINGLE;
495 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700496 } else {
497 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700498 result.uSize = IEEE754_UNION_IS_DOUBLE;
499 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700500 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800501
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700502 return result;
503}
504