blob: 82ba659dc4282af3829469320469102ca22d0d4e [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
2 Copyright 2018 Laurence Lundblade
3
4 Permission is hereby granted, free of charge, to any person obtaining
5 a copy of this software and associated documentation files (the
6 "Software"), to deal in the Software without restriction, including
7 without limitation the rights to use, copy, modify, merge, publish,
8 distribute, sublicense, and/or sell copies of the Software, and to
9 permit persons to whom the Software is furnished to do so, subject to
10 the following conditions:
11
12 The above copyright notice and this permission notice shall be included
13 in all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 SOFTWARE.
23
24 (This is the MIT license)
25 ==============================================================================*/
26
Laurence Lundblade12d32c52018-09-19 11:25:27 -070027//
28// ieee754.c
29// Indefinite
30//
31// Created by Laurence Lundblade on 7/23/18.
32// Copyright © 2018 Laurence Lundblade. All rights reserved.
33//
34
35#include "ieee754.h"
36#include <string.h> // For memcpy()
37
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070038
Laurence Lundblade12d32c52018-09-19 11:25:27 -070039/*
Laurence Lundblade12d32c52018-09-19 11:25:27 -070040 This code is written for clarity and verifiability, not for size, on the assumption
41 that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070042 job and the resulting object code is smaller from combining code for the many different
Laurence Lundblade12d32c52018-09-19 11:25:27 -070043 cases (normal, subnormal, infinity, zero...) for the conversions.
44
45 Dead stripping is also really helpful to get code size down.
46
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070047 This code also works solely using shifts and masks and thus has no dependency on
48 any math libraries. It will even work if the CPU doesn't have any floating
49 point support.
50
51 The memcpy() dependency is only for CopyFloatToUint32() and friends which only
52 is needed to avoid type punning when converting the actual float bits to
53 an unsigned value so the bit shifts and masks can work.
54 */
55
56/*
57 The references used to write this code:
58
59 - IEEE 754-2008, particularly section 3.6 and 6.2.1
60
61 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
62
63 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade12d32c52018-09-19 11:25:27 -070064 */
65
66
67// ----- Half Precsion -----------
68#define HALF_NUM_SIGNIFICAND_BITS (10)
69#define HALF_NUM_EXPONENT_BITS (5)
70#define HALF_NUM_SIGN_BITS (1)
71
72#define HALF_SIGNIFICAND_SHIFT (0)
73#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
74#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
75
76#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
77#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
78#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
79#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
80
81/* Biased Biased Unbiased Use
82 0x00 0 -15 0 and subnormal
83 0x01 1 -14 Smallest normal exponent
84 0x1e 30 15 Largest normal exponent
85 0x1F 31 16 NaN and Infinity */
86#define HALF_EXPONENT_BIAS (15)
87#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
88#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
89#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
90#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
91
92
93// ------ Single Precision --------
94#define SINGLE_NUM_SIGNIFICAND_BITS (23)
95#define SINGLE_NUM_EXPONENT_BITS (8)
96#define SINGLE_NUM_SIGN_BITS (1)
97
98#define SINGLE_SIGNIFICAND_SHIFT (0)
99#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
100#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
101
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700102#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
103#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
104#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
105#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700106
107/* Biased Biased Unbiased Use
108 0x0000 0 -127 0 and subnormal
109 0x0001 1 -126 Smallest normal exponent
110 0x7f 127 0 1
111 0xfe 254 127 Largest normal exponent
112 0xff 255 128 NaN and Infinity */
113#define SINGLE_EXPONENT_BIAS (127)
114#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
115#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
116#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
117#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
118
119
120// --------- Double Precision ----------
121#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
122#define DOUBLE_NUM_EXPONENT_BITS (11)
123#define DOUBLE_NUM_SIGN_BITS (1)
124
125#define DOUBLE_SIGNIFICAND_SHIFT (0)
126#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
127#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
128
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700129#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
130#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
131#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
132#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
133
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700134
135/* Biased Biased Unbiased Use
136 0x00000000 0 -1023 0 and subnormal
137 0x00000001 1 -1022 Smallest normal exponent
138 0x000007fe 2046 1023 Largest normal exponent
139 0x000007ff 2047 1024 NaN and Infinity */
140#define DOUBLE_EXPONENT_BIAS (1023)
141#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
142#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
143#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
144#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
145
146
147
148/*
149 Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700150 The optimizer reduces them to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700151 This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700152
153 These are also in UsefulBuf.h under a different name. They are copied
154 here because to avoid a dependency on UsefulBuf.h. There is no
155 object code size impact because these always optimze down to a
156 simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700157 */
158static inline uint32_t CopyFloatToUint32(float f)
159{
160 uint32_t u32;
161 memcpy(&u32, &f, sizeof(uint32_t));
162 return u32;
163}
164
165static inline uint64_t CopyDoubleToUint64(double d)
166{
167 uint64_t u64;
168 memcpy(&u64, &d, sizeof(uint64_t));
169 return u64;
170}
171
172static inline double CopyUint64ToDouble(uint64_t u64)
173{
174 double d;
175 memcpy(&d, &u64, sizeof(uint64_t));
176 return d;
177}
178
179static inline float CopyUint32ToFloat(uint32_t u32)
180{
181 float f;
182 memcpy(&f, &u32, sizeof(uint32_t));
183 return f;
184}
185
186
187
188// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700189uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700190{
191 // Pull the three parts out of the single-precision float
192 const uint32_t uSingle = CopyFloatToUint32(f);
193 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
194 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
195 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
196
197
198 // Now convert the three parts to half-precision.
199 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
200 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
201 // +/- Infinity and NaNs -- single biased exponent is 0xff
202 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
203 if(!uSingleSignificand) {
204 // Infinity
205 uHalfSignificand = 0;
206 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700207 // Copy the LBSs of the NaN payload that will fit from the single to the half
208 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
209 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
210 // It's a qNaN; copy the qNaN bit
211 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700212 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700213 // It's a sNaN; make sure the significand is not zero so it stays a NaN
214 // This is needed because not all significand bits are copied from single
215 if(!uHalfSignificand) {
216 // Set the LSB. This is what wikipedia shows for sNAN.
217 uHalfSignificand |= 0x01;
218 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700219 }
220 }
221 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
222 // 0 or a subnormal number -- singled biased exponent is 0
223 uHalfBiasedExponent = 0;
224 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
225 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
226 // Exponent is too large to express in half-precision; round up to infinity
227 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
228 uHalfSignificand = 0;
229 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
230 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
231 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
232 // Difference between single normal exponent and the base exponent of a half subnormal
233 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
234 // Also have to shift the significand by the difference in number of bits between a single and a half significand
235 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
236 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
237 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
238 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
239 } else {
240 // The normal case
241 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
242 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
243 }
244 uHalfSign = uSingleSign;
245
246 // Put the 3 values in the right place for a half precision
247 const uint16_t uHalfPrecision = uHalfSignificand |
248 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
249 (uHalfSign << HALF_SIGN_SHIFT);
250 return uHalfPrecision;
251}
252
253
254// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700255uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700256{
257 // Pull the three parts out of the double-precision float
258 const uint64_t uDouble = CopyDoubleToUint64(d);
259 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
260 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
261 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
262
263
264 // Now convert the three parts to half-precision.
265 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
266 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
267 // +/- Infinity and NaNs -- single biased exponent is 0xff
268 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
269 if(!uDoubleSignificand) {
270 // Infinity
271 uHalfSignificand = 0;
272 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700273 // Copy the LBSs of the NaN payload that will fit from the double to the half
274 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
275 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
276 // It's a qNaN; copy the qNaN bit
277 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700278 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700279 // It's an sNaN; make sure the significand is not zero so it stays a NaN
280 // This is needed because not all significand bits are copied from single
281 if(!uHalfSignificand) {
282 // Set the LSB. This is what wikipedia shows for sNAN.
283 uHalfSignificand |= 0x01;
284 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700285 }
286 }
287 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
288 // 0 or a subnormal number -- double biased exponent is 0
289 uHalfBiasedExponent = 0;
290 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
291 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
292 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
293 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
294 uHalfSignificand = 0;
295 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
296 // Exponent is too small to express in half-precision; round down to zero
297 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
298 // Difference between double normal exponent and the base exponent of a half subnormal
299 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
300 // Also have to shift the significand by the difference in number of bits between a double and a half significand
301 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
302 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
303 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01L << DOUBLE_NUM_SIGNIFICAND_BITS);
304 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
305 } else {
306 // The normal case
307 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
308 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
309 }
310 uHalfSign = uDoubleSign;
311
312
313 // Put the 3 values in the right place for a half precision
314 const uint16_t uHalfPrecision = uHalfSignificand |
315 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
316 (uHalfSign << HALF_SIGN_SHIFT);
317 return uHalfPrecision;
318}
319
320
321// Public function; see ieee754.h
322float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
323{
324 // Pull out the three parts of the half-precision float
325 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
326 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
327 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
328
329
330 // Make the three parts of the single-precision number
331 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
332 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
333 // 0 or subnormal
334 if(uHalfSignificand) {
335 // Subnormal case
336 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
337 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
338 uSingleSignificand = uHalfSignificand;
339 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
340 do {
341 uSingleSignificand <<= 1;
342 uSingleBiasedExponent--;
343 } while ((uSingleSignificand & 0x400) == 0);
344 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
345 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
346 } else {
347 // Just zero
348 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
349 uSingleSignificand = 0;
350 }
351 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
352 // NaN or Inifinity
353 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
354 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700355 // NaN
356 // First preserve the NaN payload from half to single
357 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
358 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
359 // Next, set qNaN if needed since half qNaN bit is not copied above
360 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
361 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700362 } else {
363 // Infinity
364 uSingleSignificand = 0;
365 }
366 } else {
367 // Normal number
368 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
369 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
370 }
371 uSingleSign = uHalfSign;
372
373
374 // Shift the three parts of the single precision into place
375 const uint32_t uSinglePrecision = uSingleSignificand |
376 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
377 (uSingleSign << SINGLE_SIGN_SHIFT);
378
379 return CopyUint32ToFloat(uSinglePrecision);
380}
381
382
383// Public function; see ieee754.h
384double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
385{
386 // Pull out the three parts of the half-precision float
387 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
388 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
389 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
390
391
392 // Make the three parts of hte single-precision number
393 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
394 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
395 // 0 or subnormal
396 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
397 if(uHalfSignificand) {
398 // Subnormal case
399 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
400 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
401 uDoubleSignificand = uHalfSignificand;
402 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
403 do {
404 uDoubleSignificand <<= 1;
405 uDoubleBiasedExponent--;
406 } while ((uDoubleSignificand & 0x400) == 0);
407 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
408 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
409 } else {
410 // Just zero
411 uDoubleSignificand = 0;
412 }
413 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
414 // NaN or Inifinity
415 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
416 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700417 // NaN
418 // First preserve the NaN payload from half to single
419 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
420 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
421 // Next, set qNaN if needed since half qNaN bit is not copied above
422 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
423 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700424 } else {
425 // Infinity
426 uDoubleSignificand = 0;
427 }
428 } else {
429 // Normal number
430 uDoubleBiasedExponent = nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS;
431 uDoubleSignificand = (uint64_t)uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
432 }
433 uDoubleSign = uHalfSign;
434
435
436 // Shift the 3 parts into place as a double-precision
437 const uint64_t uDouble = uDoubleSignificand |
438 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
439 (uDoubleSign << DOUBLE_SIGN_SHIFT);
440 return CopyUint64ToDouble(uDouble);
441}
442
443
444// Public function; see ieee754.h
445IEEE754_union IEEE754_FloatToSmallest(float f)
446{
447 IEEE754_union result;
448
449 // Pull the neeed two parts out of the single-precision float
450 const uint32_t uSingle = CopyFloatToUint32(f);
451 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
452 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
453
454 // Bit mask that is the significand bits that would be lost when converting
455 // from single-precision to half-precision
456 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
457
458 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
459 if(uSingle == 0) {
460 // Value is 0.0000, not a a subnormal
461 result.uTag = IEEE754_UNION_IS_HALF;
462 result.u16 = IEEE754_FloatToHalf(f);
463 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
464 // NaN, +/- infinity
465 result.uTag = IEEE754_UNION_IS_HALF;
466 result.u16 = IEEE754_FloatToHalf(f);
467 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
468 // Normal number in exponent range and precision won't be lost
469 result.uTag = IEEE754_UNION_IS_HALF;
470 result.u16 = IEEE754_FloatToHalf(f);
471 } else {
472 // Subnormal, exponent out of range, or precision will be lost
473 result.uTag = IEEE754_UNION_IS_SINGLE;
474 result.u32 = uSingle;
475 }
476
477 return result;
478}
479
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700480// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700481IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
482{
483 IEEE754_union result;
484
485 // Pull the needed two parts out of the double-precision float
486 const uint64_t uDouble = CopyDoubleToUint64(d);
487 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
488 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
489
490 // Masks to check whether dropped significand bits are zero or not
491 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
492 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
493
494 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700495 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700496 // Value is 0.0000, not a a subnormal
497 result.uTag = IEEE754_UNION_IS_HALF;
498 result.u16 = IEEE754_DoubleToHalf(d);
499 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
500 // NaN, +/- infinity
501 result.uTag = IEEE754_UNION_IS_HALF;
502 result.u16 = IEEE754_DoubleToHalf(d);
503 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
504 // Can convert to half without precision loss
505 result.uTag = IEEE754_UNION_IS_HALF;
506 result.u16 = IEEE754_DoubleToHalf(d);
507 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
508 // Can convert to single without precision loss
509 result.uTag = IEEE754_UNION_IS_SINGLE;
510 result.u32 = CopyFloatToUint32((float)d);
511 } else {
512 // Can't convert without precision loss
513 result.uTag = IEEE754_UNION_IS_DOUBLE;
514 result.u64 = uDouble;
515 }
516
517 return result;
518}
519