blob: e221f3ddeb8085faf8f7aaff5dd65537fd1d53c0 [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07002
Laurence Lundbladed92a6162018-11-01 11:38:35 +07003 Copyright (c) 2018, Laurence Lundblade.
4 All rights reserved.
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07005
Laurence Lundbladed92a6162018-11-01 11:38:35 +07006 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are
8 met:
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11 * Redistributions in binary form must reproduce the above
12 copyright notice, this list of conditions and the following
13 disclaimer in the documentation and/or other materials provided
14 with the distribution.
15 * Neither the name of The Linux Foundation nor the names of its
16 contributors, nor the name "Laurence Lundblade" may be used to
17 endorse or promote products derived from this software without
18 specific prior written permission.
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070019
Laurence Lundbladed92a6162018-11-01 11:38:35 +070020 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
21 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
24 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
27 BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
29 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
30 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070031 ==============================================================================*/
32
Laurence Lundblade12d32c52018-09-19 11:25:27 -070033//
34// ieee754.c
35// Indefinite
36//
37// Created by Laurence Lundblade on 7/23/18.
38// Copyright © 2018 Laurence Lundblade. All rights reserved.
39//
40
41#include "ieee754.h"
42#include <string.h> // For memcpy()
43
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070044
/*
 This code is written for clarity and verifiability, not for size, on the assumption
 that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
 job and the resulting object code is smaller from combining code for the many different
 cases (normal, subnormal, infinity, zero...) for the conversions.

 Dead stripping is also really helpful to get code size down when floating point
 encoding is not needed.

 This code works solely using shifts and masks and thus has no dependency on
 any math libraries. It can even work if the CPU doesn't have any floating
 point support, though that isn't the most useful thing to do.

 The memcpy() dependency is only for CopyFloatToUint32() and friends, which are
 only needed to avoid type punning when converting the actual float bits to
 an unsigned value so the bit shifts and masks can work.
 */
62
63/*
64 The references used to write this code:
65
66 - IEEE 754-2008, particularly section 3.6 and 6.2.1
67
68 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
69
70 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade12d32c52018-09-19 11:25:27 -070071 */
72
73
// ----- Half Precision -----------
// Layout of an IEEE 754 binary16: 1 sign bit, 5 exponent bits, 10 significand bits.
#define HALF_NUM_SIGNIFICAND_BITS (10)
#define HALF_NUM_EXPONENT_BITS    (5)
#define HALF_NUM_SIGN_BITS        (1)

// Bit position of the least significant bit of each field
#define HALF_SIGNIFICAND_SHIFT    (0)
#define HALF_EXPONENT_SHIFT       (HALF_NUM_SIGNIFICAND_BITS)
#define HALF_SIGN_SHIFT           (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)

#define HALF_SIGNIFICAND_MASK     (0x3ff) // 0x03ff, the lower 10 bits
#define HALF_EXPONENT_MASK        (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00, 5 bits of exponent
#define HALF_SIGN_MASK            (0x01 << HALF_SIGN_SHIFT) // 0x8000, 1 bit of sign
#define HALF_QUIET_NAN_BIT        (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200, top significand bit

/*  Biased (hex)  Biased (dec)  Unbiased  Use
    0x00          0             -15       0 and subnormal
    0x01          1             -14       Smallest normal exponent
    0x1e          30             15       Largest normal exponent
    0x1f          31             16       NaN and Infinity */
#define HALF_EXPONENT_BIAS        (15)
#define HALF_EXPONENT_MAX         (HALF_EXPONENT_BIAS)    //  15 Unbiased
#define HALF_EXPONENT_MIN         (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
#define HALF_EXPONENT_ZERO        (-HALF_EXPONENT_BIAS)   // -15 Unbiased
#define HALF_EXPONENT_INF_OR_NAN  (HALF_EXPONENT_BIAS+1)  //  16 Unbiased
98
99
// ------ Single Precision --------
// Layout of an IEEE 754 binary32: 1 sign bit, 8 exponent bits, 23 significand bits.
#define SINGLE_NUM_SIGNIFICAND_BITS (23)
#define SINGLE_NUM_EXPONENT_BITS    (8)
#define SINGLE_NUM_SIGN_BITS        (1)

// Bit position of the least significant bit of each field
#define SINGLE_SIGNIFICAND_SHIFT    (0)
#define SINGLE_EXPONENT_SHIFT       (SINGLE_NUM_SIGNIFICAND_BITS)
#define SINGLE_SIGN_SHIFT           (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)

#define SINGLE_SIGNIFICAND_MASK     (0x7fffffUL) // The lower 23 bits
#define SINGLE_EXPONENT_MASK        (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
#define SINGLE_SIGN_MASK            (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
#define SINGLE_QUIET_NAN_BIT        (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1)) // top significand bit

/*  Biased (hex)  Biased (dec)  Unbiased  Use
    0x00          0             -127      0 and subnormal
    0x01          1             -126      Smallest normal exponent
    0x7f          127            0        1
    0xfe          254            127      Largest normal exponent
    0xff          255            128      NaN and Infinity */
#define SINGLE_EXPONENT_BIAS        (127)
#define SINGLE_EXPONENT_MAX         (SINGLE_EXPONENT_BIAS)    //  127 unbiased
#define SINGLE_EXPONENT_MIN         (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
#define SINGLE_EXPONENT_ZERO        (-SINGLE_EXPONENT_BIAS)   // -127 unbiased
#define SINGLE_EXPONENT_INF_OR_NAN  (SINGLE_EXPONENT_BIAS+1)  //  128 unbiased
125
126
// --------- Double Precision ----------
// Layout of an IEEE 754 binary64: 1 sign bit, 11 exponent bits, 52 significand bits.
#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
#define DOUBLE_NUM_EXPONENT_BITS    (11)
#define DOUBLE_NUM_SIGN_BITS        (1)

// Bit position of the least significant bit of each field
#define DOUBLE_SIGNIFICAND_SHIFT    (0)
#define DOUBLE_EXPONENT_SHIFT       (DOUBLE_NUM_SIGNIFICAND_BITS)
#define DOUBLE_SIGN_SHIFT           (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)

#define DOUBLE_SIGNIFICAND_MASK     (0xfffffffffffffULL) // The lower 52 bits
#define DOUBLE_EXPONENT_MASK        (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
#define DOUBLE_SIGN_MASK            (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
#define DOUBLE_QUIET_NAN_BIT        (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1)) // top significand bit


/*  Biased (hex)  Biased (dec)  Unbiased  Use
    0x000         0             -1023     0 and subnormal
    0x001         1             -1022     Smallest normal exponent
    0x7fe         2046           1023     Largest normal exponent
    0x7ff         2047           1024     NaN and Infinity */
#define DOUBLE_EXPONENT_BIAS        (1023)
#define DOUBLE_EXPONENT_MAX         (DOUBLE_EXPONENT_BIAS)    //  1023 unbiased
#define DOUBLE_EXPONENT_MIN         (-DOUBLE_EXPONENT_BIAS+1) // -1022 unbiased
#define DOUBLE_EXPONENT_ZERO        (-DOUBLE_EXPONENT_BIAS)   // -1023 unbiased
#define DOUBLE_EXPONENT_INF_OR_NAN  (DOUBLE_EXPONENT_BIAS+1)  //  1024 unbiased
152
153
154
/*
 Convenient functions to avoid type punning, compiler warnings and such.
 The optimizer reduces them to a simple assignment.
 This is a crusty corner of C. It shouldn't be this hard.

 These are also in UsefulBuf.h under a different name. They are copied
 here to avoid a dependency on UsefulBuf.h. There is no
 object code size impact because these always optimize down to a
 simple assignment.
 */
/*
 Return the raw bits of a float as a uint32_t.

 memcpy() is used rather than a pointer cast so that the object is not
 accessed through an incompatible type. The number of bytes copied is the
 size of the destination.
 */
static inline uint32_t CopyFloatToUint32(float f)
{
    uint32_t u32;
    memcpy(&u32, &f, sizeof u32);
    return u32;
}
171
/*
 Return the raw bits of a double as a uint64_t.

 memcpy() is used rather than a pointer cast so that the object is not
 accessed through an incompatible type. The number of bytes copied is the
 size of the destination.
 */
static inline uint64_t CopyDoubleToUint64(double d)
{
    uint64_t u64;
    memcpy(&u64, &d, sizeof u64);
    return u64;
}
178
/*
 Return the float whose raw bits are those of the given uint32_t.

 memcpy() is used rather than a pointer cast so that the object is not
 accessed through an incompatible type. The number of bytes copied is the
 size of the source.
 */
static inline float CopyUint32ToFloat(uint32_t u32)
{
    float f;
    memcpy(&f, &u32, sizeof u32);
    return f;
}
185
186
187
188// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700189uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700190{
191 // Pull the three parts out of the single-precision float
192 const uint32_t uSingle = CopyFloatToUint32(f);
193 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
194 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
195 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
196
197
198 // Now convert the three parts to half-precision.
199 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
200 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
201 // +/- Infinity and NaNs -- single biased exponent is 0xff
202 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
203 if(!uSingleSignificand) {
204 // Infinity
205 uHalfSignificand = 0;
206 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700207 // Copy the LBSs of the NaN payload that will fit from the single to the half
208 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
209 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
210 // It's a qNaN; copy the qNaN bit
211 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700212 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700213 // It's a sNaN; make sure the significand is not zero so it stays a NaN
214 // This is needed because not all significand bits are copied from single
215 if(!uHalfSignificand) {
216 // Set the LSB. This is what wikipedia shows for sNAN.
217 uHalfSignificand |= 0x01;
218 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700219 }
220 }
221 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
222 // 0 or a subnormal number -- singled biased exponent is 0
223 uHalfBiasedExponent = 0;
224 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
225 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
226 // Exponent is too large to express in half-precision; round up to infinity
227 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
228 uHalfSignificand = 0;
229 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
230 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
231 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
232 // Difference between single normal exponent and the base exponent of a half subnormal
233 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
234 // Also have to shift the significand by the difference in number of bits between a single and a half significand
235 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
236 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
237 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
238 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
239 } else {
240 // The normal case
241 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
242 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
243 }
244 uHalfSign = uSingleSign;
245
246 // Put the 3 values in the right place for a half precision
247 const uint16_t uHalfPrecision = uHalfSignificand |
248 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
249 (uHalfSign << HALF_SIGN_SHIFT);
250 return uHalfPrecision;
251}
252
253
254// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700255uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700256{
257 // Pull the three parts out of the double-precision float
258 const uint64_t uDouble = CopyDoubleToUint64(d);
259 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
260 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
261 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
262
263
264 // Now convert the three parts to half-precision.
265 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
266 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
267 // +/- Infinity and NaNs -- single biased exponent is 0xff
268 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
269 if(!uDoubleSignificand) {
270 // Infinity
271 uHalfSignificand = 0;
272 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700273 // Copy the LBSs of the NaN payload that will fit from the double to the half
274 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
275 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
276 // It's a qNaN; copy the qNaN bit
277 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700278 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700279 // It's an sNaN; make sure the significand is not zero so it stays a NaN
280 // This is needed because not all significand bits are copied from single
281 if(!uHalfSignificand) {
282 // Set the LSB. This is what wikipedia shows for sNAN.
283 uHalfSignificand |= 0x01;
284 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700285 }
286 }
287 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
288 // 0 or a subnormal number -- double biased exponent is 0
289 uHalfBiasedExponent = 0;
290 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
291 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
292 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
293 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
294 uHalfSignificand = 0;
295 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
296 // Exponent is too small to express in half-precision; round down to zero
297 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
298 // Difference between double normal exponent and the base exponent of a half subnormal
299 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
300 // Also have to shift the significand by the difference in number of bits between a double and a half significand
301 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
302 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
303 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01L << DOUBLE_NUM_SIGNIFICAND_BITS);
304 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
305 } else {
306 // The normal case
307 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
308 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
309 }
310 uHalfSign = uDoubleSign;
311
312
313 // Put the 3 values in the right place for a half precision
314 const uint16_t uHalfPrecision = uHalfSignificand |
315 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
316 (uHalfSign << HALF_SIGN_SHIFT);
317 return uHalfPrecision;
318}
319
320
321// Public function; see ieee754.h
322float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
323{
324 // Pull out the three parts of the half-precision float
325 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
326 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
327 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
328
329
330 // Make the three parts of the single-precision number
331 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
332 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
333 // 0 or subnormal
334 if(uHalfSignificand) {
335 // Subnormal case
336 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
337 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
338 uSingleSignificand = uHalfSignificand;
339 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
340 do {
341 uSingleSignificand <<= 1;
342 uSingleBiasedExponent--;
343 } while ((uSingleSignificand & 0x400) == 0);
344 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
345 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
346 } else {
347 // Just zero
348 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
349 uSingleSignificand = 0;
350 }
351 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
352 // NaN or Inifinity
353 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
354 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700355 // NaN
356 // First preserve the NaN payload from half to single
357 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
358 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
359 // Next, set qNaN if needed since half qNaN bit is not copied above
360 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
361 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700362 } else {
363 // Infinity
364 uSingleSignificand = 0;
365 }
366 } else {
367 // Normal number
368 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
369 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
370 }
371 uSingleSign = uHalfSign;
372
373
374 // Shift the three parts of the single precision into place
375 const uint32_t uSinglePrecision = uSingleSignificand |
376 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
377 (uSingleSign << SINGLE_SIGN_SHIFT);
378
379 return CopyUint32ToFloat(uSinglePrecision);
380}
381
382
Laurence Lundblade781fd822018-10-01 09:37:52 -0700383/*
384 double IEEE754_HalfToDouble(uint16_t uHalfPrecision) is not needed
385*/
386
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700387
388
389// Public function; see ieee754.h
390IEEE754_union IEEE754_FloatToSmallest(float f)
391{
392 IEEE754_union result;
393
394 // Pull the neeed two parts out of the single-precision float
395 const uint32_t uSingle = CopyFloatToUint32(f);
396 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
397 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
398
399 // Bit mask that is the significand bits that would be lost when converting
400 // from single-precision to half-precision
401 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
402
403 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
404 if(uSingle == 0) {
405 // Value is 0.0000, not a a subnormal
406 result.uTag = IEEE754_UNION_IS_HALF;
407 result.u16 = IEEE754_FloatToHalf(f);
408 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
409 // NaN, +/- infinity
410 result.uTag = IEEE754_UNION_IS_HALF;
411 result.u16 = IEEE754_FloatToHalf(f);
412 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
413 // Normal number in exponent range and precision won't be lost
414 result.uTag = IEEE754_UNION_IS_HALF;
415 result.u16 = IEEE754_FloatToHalf(f);
416 } else {
417 // Subnormal, exponent out of range, or precision will be lost
418 result.uTag = IEEE754_UNION_IS_SINGLE;
419 result.u32 = uSingle;
420 }
421
422 return result;
423}
424
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700425// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700426IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
427{
428 IEEE754_union result;
429
430 // Pull the needed two parts out of the double-precision float
431 const uint64_t uDouble = CopyDoubleToUint64(d);
432 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
433 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
434
435 // Masks to check whether dropped significand bits are zero or not
436 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
437 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
438
439 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700440 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700441 // Value is 0.0000, not a a subnormal
442 result.uTag = IEEE754_UNION_IS_HALF;
443 result.u16 = IEEE754_DoubleToHalf(d);
444 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
445 // NaN, +/- infinity
446 result.uTag = IEEE754_UNION_IS_HALF;
447 result.u16 = IEEE754_DoubleToHalf(d);
448 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
449 // Can convert to half without precision loss
450 result.uTag = IEEE754_UNION_IS_HALF;
451 result.u16 = IEEE754_DoubleToHalf(d);
452 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
453 // Can convert to single without precision loss
454 result.uTag = IEEE754_UNION_IS_SINGLE;
455 result.u32 = CopyFloatToUint32((float)d);
456 } else {
457 // Can't convert without precision loss
458 result.uTag = IEEE754_UNION_IS_DOUBLE;
459 result.u64 = uDouble;
460 }
461
462 return result;
463}
464