blob: b4497ccaf163be70b6b87cbdf03d0d6b0d713cbd [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
2 Copyright 2018 Laurence Lundblade
3
4 Permission is hereby granted, free of charge, to any person obtaining
5 a copy of this software and associated documentation files (the
6 "Software"), to deal in the Software without restriction, including
7 without limitation the rights to use, copy, modify, merge, publish,
8 distribute, sublicense, and/or sell copies of the Software, and to
9 permit persons to whom the Software is furnished to do so, subject to
10 the following conditions:
11
12 The above copyright notice and this permission notice shall be included
13 in all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 SOFTWARE.
23
24 (This is the MIT license)
25 ==============================================================================*/
26
Laurence Lundblade12d32c52018-09-19 11:25:27 -070027//
28// ieee754.c
29// Indefinite
30//
31// Created by Laurence Lundblade on 7/23/18.
32// Copyright © 2018 Laurence Lundblade. All rights reserved.
33//
34
35#include "ieee754.h"
36#include <string.h> // For memcpy()
37
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070038
Laurence Lundblade12d32c52018-09-19 11:25:27 -070039/*
Laurence Lundblade12d32c52018-09-19 11:25:27 -070040 This code is written for clarity and verifiability, not for size, on the assumption
41 that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070042 job and the resulting object code is smaller from combining code for the many different
Laurence Lundblade12d32c52018-09-19 11:25:27 -070043 cases (normal, subnormal, infinity, zero...) for the conversions.
44
Laurence Lundblade570fab52018-10-13 18:28:27 +080045 Dead stripping is also really helpful to get code size down when floating point
46 encoding is not needed.
Laurence Lundblade12d32c52018-09-19 11:25:27 -070047
Laurence Lundblade570fab52018-10-13 18:28:27 +080048 This code works solely using shifts and masks and thus has no dependency on
49 any math libraries. It can even work if the CPU doesn't have any floating
50 point support, though that isn't the most useful thing to do.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070051
52 The memcpy() dependency is only for CopyFloatToUint32() and friends which only
53 is needed to avoid type punning when converting the actual float bits to
54 an unsigned value so the bit shifts and masks can work.
55 */
56
57/*
58 The references used to write this code:
59
60 - IEEE 754-2008, particularly section 3.6 and 6.2.1
61
62 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
63
64 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade12d32c52018-09-19 11:25:27 -070065 */
66
67
68// ----- Half Precsion -----------
69#define HALF_NUM_SIGNIFICAND_BITS (10)
70#define HALF_NUM_EXPONENT_BITS (5)
71#define HALF_NUM_SIGN_BITS (1)
72
73#define HALF_SIGNIFICAND_SHIFT (0)
74#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
75#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
76
77#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
78#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
79#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
80#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
81
82/* Biased Biased Unbiased Use
83 0x00 0 -15 0 and subnormal
84 0x01 1 -14 Smallest normal exponent
85 0x1e 30 15 Largest normal exponent
86 0x1F 31 16 NaN and Infinity */
87#define HALF_EXPONENT_BIAS (15)
88#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
89#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
90#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
91#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
92
93
94// ------ Single Precision --------
95#define SINGLE_NUM_SIGNIFICAND_BITS (23)
96#define SINGLE_NUM_EXPONENT_BITS (8)
97#define SINGLE_NUM_SIGN_BITS (1)
98
99#define SINGLE_SIGNIFICAND_SHIFT (0)
100#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
101#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
102
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700103#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
104#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
105#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
106#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700107
108/* Biased Biased Unbiased Use
109 0x0000 0 -127 0 and subnormal
110 0x0001 1 -126 Smallest normal exponent
111 0x7f 127 0 1
112 0xfe 254 127 Largest normal exponent
113 0xff 255 128 NaN and Infinity */
114#define SINGLE_EXPONENT_BIAS (127)
115#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
116#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
117#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
118#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
119
120
121// --------- Double Precision ----------
122#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
123#define DOUBLE_NUM_EXPONENT_BITS (11)
124#define DOUBLE_NUM_SIGN_BITS (1)
125
126#define DOUBLE_SIGNIFICAND_SHIFT (0)
127#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
128#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
129
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700130#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
131#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
132#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
133#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
134
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700135
136/* Biased Biased Unbiased Use
137 0x00000000 0 -1023 0 and subnormal
138 0x00000001 1 -1022 Smallest normal exponent
139 0x000007fe 2046 1023 Largest normal exponent
140 0x000007ff 2047 1024 NaN and Infinity */
141#define DOUBLE_EXPONENT_BIAS (1023)
142#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
143#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
144#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
145#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
146
147
148
149/*
150 Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700151 The optimizer reduces them to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700152 This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700153
154 These are also in UsefulBuf.h under a different name. They are copied
155 here because to avoid a dependency on UsefulBuf.h. There is no
156 object code size impact because these always optimze down to a
157 simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700158 */
159static inline uint32_t CopyFloatToUint32(float f)
160{
161 uint32_t u32;
162 memcpy(&u32, &f, sizeof(uint32_t));
163 return u32;
164}
165
166static inline uint64_t CopyDoubleToUint64(double d)
167{
168 uint64_t u64;
169 memcpy(&u64, &d, sizeof(uint64_t));
170 return u64;
171}
172
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700173static inline float CopyUint32ToFloat(uint32_t u32)
174{
175 float f;
176 memcpy(&f, &u32, sizeof(uint32_t));
177 return f;
178}
179
180
181
182// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700183uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700184{
185 // Pull the three parts out of the single-precision float
186 const uint32_t uSingle = CopyFloatToUint32(f);
187 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
188 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
189 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
190
191
192 // Now convert the three parts to half-precision.
193 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
194 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
195 // +/- Infinity and NaNs -- single biased exponent is 0xff
196 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
197 if(!uSingleSignificand) {
198 // Infinity
199 uHalfSignificand = 0;
200 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700201 // Copy the LBSs of the NaN payload that will fit from the single to the half
202 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
203 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
204 // It's a qNaN; copy the qNaN bit
205 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700206 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700207 // It's a sNaN; make sure the significand is not zero so it stays a NaN
208 // This is needed because not all significand bits are copied from single
209 if(!uHalfSignificand) {
210 // Set the LSB. This is what wikipedia shows for sNAN.
211 uHalfSignificand |= 0x01;
212 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700213 }
214 }
215 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
216 // 0 or a subnormal number -- singled biased exponent is 0
217 uHalfBiasedExponent = 0;
218 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
219 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
220 // Exponent is too large to express in half-precision; round up to infinity
221 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
222 uHalfSignificand = 0;
223 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
224 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
225 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
226 // Difference between single normal exponent and the base exponent of a half subnormal
227 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
228 // Also have to shift the significand by the difference in number of bits between a single and a half significand
229 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
230 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
231 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
232 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
233 } else {
234 // The normal case
235 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
236 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
237 }
238 uHalfSign = uSingleSign;
239
240 // Put the 3 values in the right place for a half precision
241 const uint16_t uHalfPrecision = uHalfSignificand |
242 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
243 (uHalfSign << HALF_SIGN_SHIFT);
244 return uHalfPrecision;
245}
246
247
248// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700249uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700250{
251 // Pull the three parts out of the double-precision float
252 const uint64_t uDouble = CopyDoubleToUint64(d);
253 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
254 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
255 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
256
257
258 // Now convert the three parts to half-precision.
259 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
260 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
261 // +/- Infinity and NaNs -- single biased exponent is 0xff
262 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
263 if(!uDoubleSignificand) {
264 // Infinity
265 uHalfSignificand = 0;
266 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700267 // Copy the LBSs of the NaN payload that will fit from the double to the half
268 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
269 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
270 // It's a qNaN; copy the qNaN bit
271 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700272 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700273 // It's an sNaN; make sure the significand is not zero so it stays a NaN
274 // This is needed because not all significand bits are copied from single
275 if(!uHalfSignificand) {
276 // Set the LSB. This is what wikipedia shows for sNAN.
277 uHalfSignificand |= 0x01;
278 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700279 }
280 }
281 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
282 // 0 or a subnormal number -- double biased exponent is 0
283 uHalfBiasedExponent = 0;
284 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
285 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
286 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
287 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
288 uHalfSignificand = 0;
289 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
290 // Exponent is too small to express in half-precision; round down to zero
291 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
292 // Difference between double normal exponent and the base exponent of a half subnormal
293 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
294 // Also have to shift the significand by the difference in number of bits between a double and a half significand
295 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
296 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
297 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01L << DOUBLE_NUM_SIGNIFICAND_BITS);
298 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
299 } else {
300 // The normal case
301 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
302 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
303 }
304 uHalfSign = uDoubleSign;
305
306
307 // Put the 3 values in the right place for a half precision
308 const uint16_t uHalfPrecision = uHalfSignificand |
309 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
310 (uHalfSign << HALF_SIGN_SHIFT);
311 return uHalfPrecision;
312}
313
314
315// Public function; see ieee754.h
316float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
317{
318 // Pull out the three parts of the half-precision float
319 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
320 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
321 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
322
323
324 // Make the three parts of the single-precision number
325 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
326 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
327 // 0 or subnormal
328 if(uHalfSignificand) {
329 // Subnormal case
330 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
331 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
332 uSingleSignificand = uHalfSignificand;
333 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
334 do {
335 uSingleSignificand <<= 1;
336 uSingleBiasedExponent--;
337 } while ((uSingleSignificand & 0x400) == 0);
338 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
339 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
340 } else {
341 // Just zero
342 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
343 uSingleSignificand = 0;
344 }
345 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
346 // NaN or Inifinity
347 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
348 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700349 // NaN
350 // First preserve the NaN payload from half to single
351 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
352 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
353 // Next, set qNaN if needed since half qNaN bit is not copied above
354 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
355 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700356 } else {
357 // Infinity
358 uSingleSignificand = 0;
359 }
360 } else {
361 // Normal number
362 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
363 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
364 }
365 uSingleSign = uHalfSign;
366
367
368 // Shift the three parts of the single precision into place
369 const uint32_t uSinglePrecision = uSingleSignificand |
370 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
371 (uSingleSign << SINGLE_SIGN_SHIFT);
372
373 return CopyUint32ToFloat(uSinglePrecision);
374}
375
376
Laurence Lundblade781fd822018-10-01 09:37:52 -0700377/*
378 double IEEE754_HalfToDouble(uint16_t uHalfPrecision) is not needed
379*/
380
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700381
382
383// Public function; see ieee754.h
384IEEE754_union IEEE754_FloatToSmallest(float f)
385{
386 IEEE754_union result;
387
388 // Pull the neeed two parts out of the single-precision float
389 const uint32_t uSingle = CopyFloatToUint32(f);
390 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
391 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
392
393 // Bit mask that is the significand bits that would be lost when converting
394 // from single-precision to half-precision
395 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
396
397 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
398 if(uSingle == 0) {
399 // Value is 0.0000, not a a subnormal
400 result.uTag = IEEE754_UNION_IS_HALF;
401 result.u16 = IEEE754_FloatToHalf(f);
402 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
403 // NaN, +/- infinity
404 result.uTag = IEEE754_UNION_IS_HALF;
405 result.u16 = IEEE754_FloatToHalf(f);
406 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
407 // Normal number in exponent range and precision won't be lost
408 result.uTag = IEEE754_UNION_IS_HALF;
409 result.u16 = IEEE754_FloatToHalf(f);
410 } else {
411 // Subnormal, exponent out of range, or precision will be lost
412 result.uTag = IEEE754_UNION_IS_SINGLE;
413 result.u32 = uSingle;
414 }
415
416 return result;
417}
418
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700419// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700420IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
421{
422 IEEE754_union result;
423
424 // Pull the needed two parts out of the double-precision float
425 const uint64_t uDouble = CopyDoubleToUint64(d);
426 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
427 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
428
429 // Masks to check whether dropped significand bits are zero or not
430 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
431 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
432
433 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700434 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700435 // Value is 0.0000, not a a subnormal
436 result.uTag = IEEE754_UNION_IS_HALF;
437 result.u16 = IEEE754_DoubleToHalf(d);
438 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
439 // NaN, +/- infinity
440 result.uTag = IEEE754_UNION_IS_HALF;
441 result.u16 = IEEE754_DoubleToHalf(d);
442 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
443 // Can convert to half without precision loss
444 result.uTag = IEEE754_UNION_IS_HALF;
445 result.u16 = IEEE754_DoubleToHalf(d);
446 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
447 // Can convert to single without precision loss
448 result.uTag = IEEE754_UNION_IS_SINGLE;
449 result.u32 = CopyFloatToUint32((float)d);
450 } else {
451 // Can't convert without precision loss
452 result.uTag = IEEE754_UNION_IS_DOUBLE;
453 result.u64 = uDouble;
454 }
455
456 return result;
457}
458