blob: 12877bdf63f61b29af86c80e40badcfe208e6d4c [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
2 Copyright 2018 Laurence Lundblade
3
4 Permission is hereby granted, free of charge, to any person obtaining
5 a copy of this software and associated documentation files (the
6 "Software"), to deal in the Software without restriction, including
7 without limitation the rights to use, copy, modify, merge, publish,
8 distribute, sublicense, and/or sell copies of the Software, and to
9 permit persons to whom the Software is furnished to do so, subject to
10 the following conditions:
11
12 The above copyright notice and this permission notice shall be included
13 in all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 SOFTWARE.
23
24 (This is the MIT license)
25 ==============================================================================*/
26
Laurence Lundblade12d32c52018-09-19 11:25:27 -070027//
28// ieee754.c
29// Indefinite
30//
31// Created by Laurence Lundblade on 7/23/18.
32// Copyright © 2018 Laurence Lundblade. All rights reserved.
33//
34
35#include "ieee754.h"
36#include <string.h> // For memcpy()
37
38/*
39
40 https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
41
42 These values come from IEEE 754-2008 section 3.6
43
44 This code is written for clarity and verifiability, not for size, on the assumption
45 that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
46 job and the resulting object code is smaller from combing code for the many different
47 cases (normal, subnormal, infinity, zero...) for the conversions.
48
49 Dead stripping is also really helpful to get code size down.
50
51 */
52
53
54// ----- Half Precsion -----------
55#define HALF_NUM_SIGNIFICAND_BITS (10)
56#define HALF_NUM_EXPONENT_BITS (5)
57#define HALF_NUM_SIGN_BITS (1)
58
59#define HALF_SIGNIFICAND_SHIFT (0)
60#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
61#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
62
63#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
64#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
65#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
66#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
67
68/* Biased Biased Unbiased Use
69 0x00 0 -15 0 and subnormal
70 0x01 1 -14 Smallest normal exponent
71 0x1e 30 15 Largest normal exponent
72 0x1F 31 16 NaN and Infinity */
73#define HALF_EXPONENT_BIAS (15)
74#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
75#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
76#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
77#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
78
79
80// ------ Single Precision --------
81#define SINGLE_NUM_SIGNIFICAND_BITS (23)
82#define SINGLE_NUM_EXPONENT_BITS (8)
83#define SINGLE_NUM_SIGN_BITS (1)
84
85#define SINGLE_SIGNIFICAND_SHIFT (0)
86#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
87#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
88
89#define SINGLE_SIGNIFICAND_MASK (0x7fffff) // The lower 23 bits
90#define SINGLE_EXPONENT_MASK (0xff << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
91#define SINGLE_SIGN_MASK (0x01 << SINGLE_SIGN_SHIFT) // 1 bit of sign
92
93/* Biased Biased Unbiased Use
94 0x0000 0 -127 0 and subnormal
95 0x0001 1 -126 Smallest normal exponent
96 0x7f 127 0 1
97 0xfe 254 127 Largest normal exponent
98 0xff 255 128 NaN and Infinity */
99#define SINGLE_EXPONENT_BIAS (127)
100#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
101#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
102#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
103#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
104
105
106// --------- Double Precision ----------
107#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
108#define DOUBLE_NUM_EXPONENT_BITS (11)
109#define DOUBLE_NUM_SIGN_BITS (1)
110
111#define DOUBLE_SIGNIFICAND_SHIFT (0)
112#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
113#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
114
115#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffLL) // The lower 52 bits
116#define DOUBLE_EXPONENT_MASK (0x7ffLL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
117#define DOUBLE_SIGN_MASK (0x01LL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
118
119/* Biased Biased Unbiased Use
120 0x00000000 0 -1023 0 and subnormal
121 0x00000001 1 -1022 Smallest normal exponent
122 0x000007fe 2046 1023 Largest normal exponent
123 0x000007ff 2047 1024 NaN and Infinity */
124#define DOUBLE_EXPONENT_BIAS (1023)
125#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
126#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
127#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
128#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
129
130
131
132/*
133 Convenient functions to avoid type punning, compiler warnings and such
134 The optimizer reduces them to a simple assignment
135 This is a crusty corner of C. It shouldn't be this hard.
136 */
137static inline uint32_t CopyFloatToUint32(float f)
138{
139 uint32_t u32;
140 memcpy(&u32, &f, sizeof(uint32_t));
141 return u32;
142}
143
144static inline uint64_t CopyDoubleToUint64(double d)
145{
146 uint64_t u64;
147 memcpy(&u64, &d, sizeof(uint64_t));
148 return u64;
149}
150
151static inline double CopyUint64ToDouble(uint64_t u64)
152{
153 double d;
154 memcpy(&d, &u64, sizeof(uint64_t));
155 return d;
156}
157
158static inline float CopyUint32ToFloat(uint32_t u32)
159{
160 float f;
161 memcpy(&f, &u32, sizeof(uint32_t));
162 return f;
163}
164
165
166
167// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700168uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700169{
170 // Pull the three parts out of the single-precision float
171 const uint32_t uSingle = CopyFloatToUint32(f);
172 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
173 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
174 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
175
176
177 // Now convert the three parts to half-precision.
178 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
179 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
180 // +/- Infinity and NaNs -- single biased exponent is 0xff
181 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
182 if(!uSingleSignificand) {
183 // Infinity
184 uHalfSignificand = 0;
185 } else {
186 // NaN; significand has to be non-zero
187 if(!(uSingleSignificand & HALF_SIGNIFICAND_MASK)) {
188 // NaN payload bits that can't be carried; convert to a quite NaN
189 // since this has to be non-zero to still be a NaN
190 uHalfSignificand = HALF_QUIET_NAN_BIT; // standard qNaN;
191 } else {
192 // The LSBs are preserved, but not the MSBs
193 // This preservation allows some limited form of NaN payloads / boxing
194 // Would be good to find out what other implementations do for
195 // this kind of conversion of NaN
196 uHalfSignificand = uSingleSignificand & HALF_SIGNIFICAND_MASK;
197 }
198 }
199 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
200 // 0 or a subnormal number -- singled biased exponent is 0
201 uHalfBiasedExponent = 0;
202 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
203 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
204 // Exponent is too large to express in half-precision; round up to infinity
205 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
206 uHalfSignificand = 0;
207 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
208 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
209 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
210 // Difference between single normal exponent and the base exponent of a half subnormal
211 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
212 // Also have to shift the significand by the difference in number of bits between a single and a half significand
213 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
214 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
215 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
216 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
217 } else {
218 // The normal case
219 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
220 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
221 }
222 uHalfSign = uSingleSign;
223
224 // Put the 3 values in the right place for a half precision
225 const uint16_t uHalfPrecision = uHalfSignificand |
226 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
227 (uHalfSign << HALF_SIGN_SHIFT);
228 return uHalfPrecision;
229}
230
231
232// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700233uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700234{
235 // Pull the three parts out of the double-precision float
236 const uint64_t uDouble = CopyDoubleToUint64(d);
237 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
238 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
239 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
240
241
242 // Now convert the three parts to half-precision.
243 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
244 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
245 // +/- Infinity and NaNs -- single biased exponent is 0xff
246 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
247 if(!uDoubleSignificand) {
248 // Infinity
249 uHalfSignificand = 0;
250 } else {
251 // NaN; significand has to be non-zero
252 if(!(uDoubleSignificand & HALF_SIGNIFICAND_MASK)) {
253 // NaN payload bits that can't be carried; convert to a quite NaN
254 // since this has to be non-zero to still be a NaN
255 uHalfSignificand = HALF_QUIET_NAN_BIT; // standard qNaN;
256 } else {
257 // The LSBs are preserved, but not the MSBs
258 // This preservation allows some limited form of NaN payloads / boxing
259 // Would be good to find out what other implementations do for
260 // this kind of conversion of NaN
261 uHalfSignificand = uDoubleSignificand & HALF_SIGNIFICAND_MASK;
262 }
263 }
264 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
265 // 0 or a subnormal number -- double biased exponent is 0
266 uHalfBiasedExponent = 0;
267 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
268 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
269 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
270 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
271 uHalfSignificand = 0;
272 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
273 // Exponent is too small to express in half-precision; round down to zero
274 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
275 // Difference between double normal exponent and the base exponent of a half subnormal
276 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
277 // Also have to shift the significand by the difference in number of bits between a double and a half significand
278 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
279 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
280 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01L << DOUBLE_NUM_SIGNIFICAND_BITS);
281 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
282 } else {
283 // The normal case
284 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
285 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
286 }
287 uHalfSign = uDoubleSign;
288
289
290 // Put the 3 values in the right place for a half precision
291 const uint16_t uHalfPrecision = uHalfSignificand |
292 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
293 (uHalfSign << HALF_SIGN_SHIFT);
294 return uHalfPrecision;
295}
296
297
298// Public function; see ieee754.h
299float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
300{
301 // Pull out the three parts of the half-precision float
302 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
303 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
304 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
305
306
307 // Make the three parts of the single-precision number
308 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
309 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
310 // 0 or subnormal
311 if(uHalfSignificand) {
312 // Subnormal case
313 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
314 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
315 uSingleSignificand = uHalfSignificand;
316 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
317 do {
318 uSingleSignificand <<= 1;
319 uSingleBiasedExponent--;
320 } while ((uSingleSignificand & 0x400) == 0);
321 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
322 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
323 } else {
324 // Just zero
325 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
326 uSingleSignificand = 0;
327 }
328 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
329 // NaN or Inifinity
330 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
331 if(uHalfSignificand) {
332 // Preserve NaN payload for NaN boxing
333 uSingleSignificand = uHalfSignificand;
334 } else {
335 // Infinity
336 uSingleSignificand = 0;
337 }
338 } else {
339 // Normal number
340 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
341 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
342 }
343 uSingleSign = uHalfSign;
344
345
346 // Shift the three parts of the single precision into place
347 const uint32_t uSinglePrecision = uSingleSignificand |
348 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
349 (uSingleSign << SINGLE_SIGN_SHIFT);
350
351 return CopyUint32ToFloat(uSinglePrecision);
352}
353
354
355// Public function; see ieee754.h
356double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
357{
358 // Pull out the three parts of the half-precision float
359 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
360 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
361 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
362
363
364 // Make the three parts of hte single-precision number
365 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
366 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
367 // 0 or subnormal
368 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
369 if(uHalfSignificand) {
370 // Subnormal case
371 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
372 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
373 uDoubleSignificand = uHalfSignificand;
374 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
375 do {
376 uDoubleSignificand <<= 1;
377 uDoubleBiasedExponent--;
378 } while ((uDoubleSignificand & 0x400) == 0);
379 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
380 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
381 } else {
382 // Just zero
383 uDoubleSignificand = 0;
384 }
385 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
386 // NaN or Inifinity
387 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
388 if(uHalfSignificand) {
389 // Preserve NaN payload for NaN boxing
390 uDoubleSignificand = uHalfSignificand;
391 } else {
392 // Infinity
393 uDoubleSignificand = 0;
394 }
395 } else {
396 // Normal number
397 uDoubleBiasedExponent = nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS;
398 uDoubleSignificand = (uint64_t)uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
399 }
400 uDoubleSign = uHalfSign;
401
402
403 // Shift the 3 parts into place as a double-precision
404 const uint64_t uDouble = uDoubleSignificand |
405 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
406 (uDoubleSign << DOUBLE_SIGN_SHIFT);
407 return CopyUint64ToDouble(uDouble);
408}
409
410
411// Public function; see ieee754.h
412IEEE754_union IEEE754_FloatToSmallest(float f)
413{
414 IEEE754_union result;
415
416 // Pull the neeed two parts out of the single-precision float
417 const uint32_t uSingle = CopyFloatToUint32(f);
418 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
419 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
420
421 // Bit mask that is the significand bits that would be lost when converting
422 // from single-precision to half-precision
423 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
424
425 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
426 if(uSingle == 0) {
427 // Value is 0.0000, not a a subnormal
428 result.uTag = IEEE754_UNION_IS_HALF;
429 result.u16 = IEEE754_FloatToHalf(f);
430 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
431 // NaN, +/- infinity
432 result.uTag = IEEE754_UNION_IS_HALF;
433 result.u16 = IEEE754_FloatToHalf(f);
434 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
435 // Normal number in exponent range and precision won't be lost
436 result.uTag = IEEE754_UNION_IS_HALF;
437 result.u16 = IEEE754_FloatToHalf(f);
438 } else {
439 // Subnormal, exponent out of range, or precision will be lost
440 result.uTag = IEEE754_UNION_IS_SINGLE;
441 result.u32 = uSingle;
442 }
443
444 return result;
445}
446
447
448IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
449{
450 IEEE754_union result;
451
452 // Pull the needed two parts out of the double-precision float
453 const uint64_t uDouble = CopyDoubleToUint64(d);
454 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
455 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
456
457 // Masks to check whether dropped significand bits are zero or not
458 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
459 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
460
461 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700462 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700463 // Value is 0.0000, not a a subnormal
464 result.uTag = IEEE754_UNION_IS_HALF;
465 result.u16 = IEEE754_DoubleToHalf(d);
466 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
467 // NaN, +/- infinity
468 result.uTag = IEEE754_UNION_IS_HALF;
469 result.u16 = IEEE754_DoubleToHalf(d);
470 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
471 // Can convert to half without precision loss
472 result.uTag = IEEE754_UNION_IS_HALF;
473 result.u16 = IEEE754_DoubleToHalf(d);
474 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
475 // Can convert to single without precision loss
476 result.uTag = IEEE754_UNION_IS_SINGLE;
477 result.u32 = CopyFloatToUint32((float)d);
478 } else {
479 // Can't convert without precision loss
480 result.uTag = IEEE754_UNION_IS_DOUBLE;
481 result.u64 = uDouble;
482 }
483
484 return result;
485}
486