blob: 4ae8dd63417502421c6a40b24afa0dc1ea7a6b5e [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08002 ieee754.c -- floating point conversion between half, double and single precision
3
4 Copyright (c) 2018-2019, Laurence Lundblade. All rights reserved.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -08005
Laurence Lundbladea3fd49f2019-01-21 10:16:22 -08006 SPDX-License-Identifier: BSD-3-Clause
7
8 See BSD-3-Clause license in README.md
9
10 Created on 7/23/18
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070011 ==============================================================================*/
12
Laurence Lundblade12d32c52018-09-19 11:25:27 -070013#include "ieee754.h"
14#include <string.h> // For memcpy()
15
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070016
Laurence Lundblade12d32c52018-09-19 11:25:27 -070017/*
Laurence Lundblade12d32c52018-09-19 11:25:27 -070018 This code is written for clarity and verifiability, not for size, on the assumption
19 that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070020 job and the resulting object code is smaller from combining code for the many different
Laurence Lundblade12d32c52018-09-19 11:25:27 -070021 cases (normal, subnormal, infinity, zero...) for the conversions.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080022
Laurence Lundblade570fab52018-10-13 18:28:27 +080023 Dead stripping is also really helpful to get code size down when floating point
24 encoding is not needed.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080025
Laurence Lundblade570fab52018-10-13 18:28:27 +080026 This code works solely using shifts and masks and thus has no dependency on
27 any math libraries. It can even work if the CPU doesn't have any floating
28 point support, though that isn't the most useful thing to do.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080029
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070030 The memcpy() dependency is only for CopyFloatToUint32() and friends which only
31 is needed to avoid type punning when converting the actual float bits to
32 an unsigned value so the bit shifts and masks can work.
33 */
34
35/*
36 The references used to write this code:
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080037
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070038 - IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080039
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070040 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080041
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070042 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade12d32c52018-09-19 11:25:27 -070043 */
44
45
46// ----- Half Precsion -----------
47#define HALF_NUM_SIGNIFICAND_BITS (10)
48#define HALF_NUM_EXPONENT_BITS (5)
49#define HALF_NUM_SIGN_BITS (1)
50
51#define HALF_SIGNIFICAND_SHIFT (0)
52#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
53#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
54
55#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
56#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
57#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
58#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
59
60/* Biased Biased Unbiased Use
61 0x00 0 -15 0 and subnormal
62 0x01 1 -14 Smallest normal exponent
63 0x1e 30 15 Largest normal exponent
64 0x1F 31 16 NaN and Infinity */
65#define HALF_EXPONENT_BIAS (15)
66#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
67#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
68#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
69#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
70
71
72// ------ Single Precision --------
73#define SINGLE_NUM_SIGNIFICAND_BITS (23)
74#define SINGLE_NUM_EXPONENT_BITS (8)
75#define SINGLE_NUM_SIGN_BITS (1)
76
77#define SINGLE_SIGNIFICAND_SHIFT (0)
78#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
79#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
80
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070081#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
82#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
83#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
84#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -070085
86/* Biased Biased Unbiased Use
87 0x0000 0 -127 0 and subnormal
88 0x0001 1 -126 Smallest normal exponent
89 0x7f 127 0 1
90 0xfe 254 127 Largest normal exponent
91 0xff 255 128 NaN and Infinity */
92#define SINGLE_EXPONENT_BIAS (127)
93#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
94#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
95#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
96#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
97
98
99// --------- Double Precision ----------
100#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
101#define DOUBLE_NUM_EXPONENT_BITS (11)
102#define DOUBLE_NUM_SIGN_BITS (1)
103
104#define DOUBLE_SIGNIFICAND_SHIFT (0)
105#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
106#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
107
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700108#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
109#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
110#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
111#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
112
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700113
114/* Biased Biased Unbiased Use
115 0x00000000 0 -1023 0 and subnormal
116 0x00000001 1 -1022 Smallest normal exponent
117 0x000007fe 2046 1023 Largest normal exponent
118 0x000007ff 2047 1024 NaN and Infinity */
119#define DOUBLE_EXPONENT_BIAS (1023)
120#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
121#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
122#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
123#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
124
125
126
127/*
128 Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700129 The optimizer reduces them to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700130 This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800131
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700132 These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundblade3df8c7e2018-11-02 13:12:41 +0700133 here to avoid a dependency on UsefulBuf.h. There is no
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700134 object code size impact because these always optimze down to a
135 simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700136 */
137static inline uint32_t CopyFloatToUint32(float f)
138{
139 uint32_t u32;
140 memcpy(&u32, &f, sizeof(uint32_t));
141 return u32;
142}
143
144static inline uint64_t CopyDoubleToUint64(double d)
145{
146 uint64_t u64;
147 memcpy(&u64, &d, sizeof(uint64_t));
148 return u64;
149}
150
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700151static inline float CopyUint32ToFloat(uint32_t u32)
152{
153 float f;
154 memcpy(&f, &u32, sizeof(uint32_t));
155 return f;
156}
157
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700158static inline double CopyUint64ToDouble(uint64_t u64)
159{
160 double d;
161 memcpy(&d, &u64, sizeof(uint64_t));
162 return d;
163}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700164
165
166// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700167uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700168{
169 // Pull the three parts out of the single-precision float
170 const uint32_t uSingle = CopyFloatToUint32(f);
171 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
172 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
173 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800174
175
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700176 // Now convert the three parts to half-precision.
177 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
178 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
179 // +/- Infinity and NaNs -- single biased exponent is 0xff
180 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
181 if(!uSingleSignificand) {
182 // Infinity
183 uHalfSignificand = 0;
184 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700185 // Copy the LBSs of the NaN payload that will fit from the single to the half
186 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
187 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
188 // It's a qNaN; copy the qNaN bit
189 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700190 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700191 // It's a sNaN; make sure the significand is not zero so it stays a NaN
192 // This is needed because not all significand bits are copied from single
193 if(!uHalfSignificand) {
194 // Set the LSB. This is what wikipedia shows for sNAN.
195 uHalfSignificand |= 0x01;
196 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700197 }
198 }
199 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
200 // 0 or a subnormal number -- singled biased exponent is 0
201 uHalfBiasedExponent = 0;
202 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
203 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
204 // Exponent is too large to express in half-precision; round up to infinity
205 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
206 uHalfSignificand = 0;
207 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
208 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
209 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
210 // Difference between single normal exponent and the base exponent of a half subnormal
211 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
212 // Also have to shift the significand by the difference in number of bits between a single and a half significand
213 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
214 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
215 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
216 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
217 } else {
218 // The normal case
219 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
220 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
221 }
222 uHalfSign = uSingleSign;
223
224 // Put the 3 values in the right place for a half precision
225 const uint16_t uHalfPrecision = uHalfSignificand |
226 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
227 (uHalfSign << HALF_SIGN_SHIFT);
228 return uHalfPrecision;
229}
230
231
232// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700233uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700234{
235 // Pull the three parts out of the double-precision float
236 const uint64_t uDouble = CopyDoubleToUint64(d);
237 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
238 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
239 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
240
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800241
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700242 // Now convert the three parts to half-precision.
243 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
244 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
245 // +/- Infinity and NaNs -- single biased exponent is 0xff
246 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
247 if(!uDoubleSignificand) {
248 // Infinity
249 uHalfSignificand = 0;
250 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700251 // Copy the LBSs of the NaN payload that will fit from the double to the half
252 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
253 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
254 // It's a qNaN; copy the qNaN bit
255 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700256 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700257 // It's an sNaN; make sure the significand is not zero so it stays a NaN
258 // This is needed because not all significand bits are copied from single
259 if(!uHalfSignificand) {
260 // Set the LSB. This is what wikipedia shows for sNAN.
261 uHalfSignificand |= 0x01;
262 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700263 }
264 }
265 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
266 // 0 or a subnormal number -- double biased exponent is 0
267 uHalfBiasedExponent = 0;
268 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
269 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
270 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
271 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
272 uHalfSignificand = 0;
273 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
274 // Exponent is too small to express in half-precision; round down to zero
275 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
276 // Difference between double normal exponent and the base exponent of a half subnormal
277 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
278 // Also have to shift the significand by the difference in number of bits between a double and a half significand
279 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
280 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundbladef2801952018-12-17 10:40:29 -0800281 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700282 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
283 } else {
284 // The normal case
285 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
286 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
287 }
288 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800289
290
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700291 // Put the 3 values in the right place for a half precision
292 const uint16_t uHalfPrecision = uHalfSignificand |
293 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
294 (uHalfSign << HALF_SIGN_SHIFT);
295 return uHalfPrecision;
296}
297
298
299// Public function; see ieee754.h
300float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
301{
302 // Pull out the three parts of the half-precision float
303 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
304 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
305 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800306
307
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700308 // Make the three parts of the single-precision number
309 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
310 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
311 // 0 or subnormal
312 if(uHalfSignificand) {
313 // Subnormal case
314 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
315 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
316 uSingleSignificand = uHalfSignificand;
317 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
318 do {
319 uSingleSignificand <<= 1;
320 uSingleBiasedExponent--;
321 } while ((uSingleSignificand & 0x400) == 0);
322 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
323 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
324 } else {
325 // Just zero
326 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
327 uSingleSignificand = 0;
328 }
329 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
330 // NaN or Inifinity
331 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
332 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700333 // NaN
334 // First preserve the NaN payload from half to single
335 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
336 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
337 // Next, set qNaN if needed since half qNaN bit is not copied above
338 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
339 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700340 } else {
341 // Infinity
342 uSingleSignificand = 0;
343 }
344 } else {
345 // Normal number
346 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
347 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
348 }
349 uSingleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800350
351
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700352 // Shift the three parts of the single precision into place
353 const uint32_t uSinglePrecision = uSingleSignificand |
354 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
355 (uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800356
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700357 return CopyUint32ToFloat(uSinglePrecision);
358}
359
360
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700361// Public function; see ieee754.h
362double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
363{
364 // Pull out the three parts of the half-precision float
365 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
366 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
367 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800368
369
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700370 // Make the three parts of hte single-precision number
371 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
372 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
373 // 0 or subnormal
374 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
375 if(uHalfSignificand) {
376 // Subnormal case
377 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
378 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
379 uDoubleSignificand = uHalfSignificand;
380 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
381 do {
382 uDoubleSignificand <<= 1;
383 uDoubleBiasedExponent--;
384 } while ((uDoubleSignificand & 0x400) == 0);
385 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
386 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
387 } else {
388 // Just zero
389 uDoubleSignificand = 0;
390 }
391 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
392 // NaN or Inifinity
393 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
394 if(uHalfSignificand) {
395 // NaN
396 // First preserve the NaN payload from half to single
397 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
398 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
399 // Next, set qNaN if needed since half qNaN bit is not copied above
400 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
401 }
402 } else {
403 // Infinity
404 uDoubleSignificand = 0;
405 }
406 } else {
407 // Normal number
408 uDoubleBiasedExponent = nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS;
409 uDoubleSignificand = (uint64_t)uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
410 }
411 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800412
413
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700414 // Shift the 3 parts into place as a double-precision
415 const uint64_t uDouble = uDoubleSignificand |
416 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
417 (uDoubleSign << DOUBLE_SIGN_SHIFT);
418 return CopyUint64ToDouble(uDouble);
419}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700420
421
422// Public function; see ieee754.h
423IEEE754_union IEEE754_FloatToSmallest(float f)
424{
425 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800426
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700427 // Pull the neeed two parts out of the single-precision float
428 const uint32_t uSingle = CopyFloatToUint32(f);
429 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
430 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800431
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700432 // Bit mask that is the significand bits that would be lost when converting
433 // from single-precision to half-precision
434 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
435
436 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
437 if(uSingle == 0) {
438 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700439 result.uSize = IEEE754_UNION_IS_HALF;
440 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700441 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
442 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700443 result.uSize = IEEE754_UNION_IS_HALF;
444 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700445 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
446 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700447 result.uSize = IEEE754_UNION_IS_HALF;
448 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700449 } else {
450 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700451 result.uSize = IEEE754_UNION_IS_SINGLE;
452 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700453 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800454
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700455 return result;
456}
457
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700458// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700459IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
460{
461 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800462
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700463 // Pull the needed two parts out of the double-precision float
464 const uint64_t uDouble = CopyDoubleToUint64(d);
465 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
466 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800467
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700468 // Masks to check whether dropped significand bits are zero or not
469 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
470 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800471
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700472 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700473 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700474 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700475 result.uSize = IEEE754_UNION_IS_HALF;
476 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700477 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
478 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700479 result.uSize = IEEE754_UNION_IS_HALF;
480 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700481 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
482 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700483 result.uSize = IEEE754_UNION_IS_HALF;
484 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700485 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
486 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700487 result.uSize = IEEE754_UNION_IS_SINGLE;
488 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700489 } else {
490 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700491 result.uSize = IEEE754_UNION_IS_DOUBLE;
492 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700493 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800494
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700495 return result;
496}
497