blob: c52f6eb2d3503bdd5789d655b4c3f7cad07e2783 [file] [log] [blame]
Laurence Lundbladecc2ed342018-09-22 17:29:55 -07001/*==============================================================================
Laurence Lundblade3aee3a32018-12-17 16:17:45 -08002
Laurence Lundbladed92a6162018-11-01 11:38:35 +07003 Copyright (c) 2018, Laurence Lundblade.
4 All rights reserved.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -08005
Laurence Lundblade0dbc9172018-11-01 14:17:21 +07006Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are
8met:
9 * Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11 * Redistributions in binary form must reproduce the above
12 copyright notice, this list of conditions and the following
13 disclaimer in the documentation and/or other materials provided
14 with the distribution.
15 * The name "Laurence Lundblade" may not be used to
16 endorse or promote products derived from this software without
17 specific prior written permission.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080018
Laurence Lundblade0dbc9172018-11-01 14:17:21 +070019THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
20WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
22ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
23BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
29IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070030 ==============================================================================*/
31
Laurence Lundblade12d32c52018-09-19 11:25:27 -070032//
33// ieee754.c
34// Indefinite
35//
36// Created by Laurence Lundblade on 7/23/18.
37// Copyright © 2018 Laurence Lundblade. All rights reserved.
38//
39
40#include "ieee754.h"
41#include <string.h> // For memcpy()
42
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070043
Laurence Lundblade12d32c52018-09-19 11:25:27 -070044/*
Laurence Lundblade12d32c52018-09-19 11:25:27 -070045 This code is written for clarity and verifiability, not for size, on the assumption
46 that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070047 job and the resulting object code is smaller from combining code for the many different
Laurence Lundblade12d32c52018-09-19 11:25:27 -070048 cases (normal, subnormal, infinity, zero...) for the conversions.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080049
Laurence Lundblade570fab52018-10-13 18:28:27 +080050 Dead stripping is also really helpful to get code size down when floating point
51 encoding is not needed.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080052
Laurence Lundblade570fab52018-10-13 18:28:27 +080053 This code works solely using shifts and masks and thus has no dependency on
54 any math libraries. It can even work if the CPU doesn't have any floating
55 point support, though that isn't the most useful thing to do.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080056
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070057 The memcpy() dependency is only for CopyFloatToUint32() and friends which only
58 is needed to avoid type punning when converting the actual float bits to
59 an unsigned value so the bit shifts and masks can work.
60 */
61
62/*
63 The references used to write this code:
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080064
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070065 - IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080066
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070067 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade3aee3a32018-12-17 16:17:45 -080068
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070069 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade12d32c52018-09-19 11:25:27 -070070 */
71
72
73// ----- Half Precsion -----------
74#define HALF_NUM_SIGNIFICAND_BITS (10)
75#define HALF_NUM_EXPONENT_BITS (5)
76#define HALF_NUM_SIGN_BITS (1)
77
78#define HALF_SIGNIFICAND_SHIFT (0)
79#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
80#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
81
82#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
83#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
84#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
85#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
86
87/* Biased Biased Unbiased Use
88 0x00 0 -15 0 and subnormal
89 0x01 1 -14 Smallest normal exponent
90 0x1e 30 15 Largest normal exponent
91 0x1F 31 16 NaN and Infinity */
92#define HALF_EXPONENT_BIAS (15)
93#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
94#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
95#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
96#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
97
98
99// ------ Single Precision --------
100#define SINGLE_NUM_SIGNIFICAND_BITS (23)
101#define SINGLE_NUM_EXPONENT_BITS (8)
102#define SINGLE_NUM_SIGN_BITS (1)
103
104#define SINGLE_SIGNIFICAND_SHIFT (0)
105#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
106#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
107
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700108#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
109#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
110#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
111#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700112
113/* Biased Biased Unbiased Use
114 0x0000 0 -127 0 and subnormal
115 0x0001 1 -126 Smallest normal exponent
116 0x7f 127 0 1
117 0xfe 254 127 Largest normal exponent
118 0xff 255 128 NaN and Infinity */
119#define SINGLE_EXPONENT_BIAS (127)
120#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
121#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
122#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
123#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
124
125
126// --------- Double Precision ----------
127#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
128#define DOUBLE_NUM_EXPONENT_BITS (11)
129#define DOUBLE_NUM_SIGN_BITS (1)
130
131#define DOUBLE_SIGNIFICAND_SHIFT (0)
132#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
133#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
134
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700135#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
136#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
137#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
138#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
139
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700140
141/* Biased Biased Unbiased Use
142 0x00000000 0 -1023 0 and subnormal
143 0x00000001 1 -1022 Smallest normal exponent
144 0x000007fe 2046 1023 Largest normal exponent
145 0x000007ff 2047 1024 NaN and Infinity */
146#define DOUBLE_EXPONENT_BIAS (1023)
147#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
148#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
149#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
150#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
151
152
153
154/*
155 Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700156 The optimizer reduces them to a simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700157 This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800158
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700159 These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundblade3df8c7e2018-11-02 13:12:41 +0700160 here to avoid a dependency on UsefulBuf.h. There is no
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700161 object code size impact because these always optimze down to a
162 simple assignment.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700163 */
164static inline uint32_t CopyFloatToUint32(float f)
165{
166 uint32_t u32;
167 memcpy(&u32, &f, sizeof(uint32_t));
168 return u32;
169}
170
171static inline uint64_t CopyDoubleToUint64(double d)
172{
173 uint64_t u64;
174 memcpy(&u64, &d, sizeof(uint64_t));
175 return u64;
176}
177
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700178static inline float CopyUint32ToFloat(uint32_t u32)
179{
180 float f;
181 memcpy(&f, &u32, sizeof(uint32_t));
182 return f;
183}
184
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700185static inline double CopyUint64ToDouble(uint64_t u64)
186{
187 double d;
188 memcpy(&d, &u64, sizeof(uint64_t));
189 return d;
190}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700191
192
193// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700194uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700195{
196 // Pull the three parts out of the single-precision float
197 const uint32_t uSingle = CopyFloatToUint32(f);
198 const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
199 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
200 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800201
202
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700203 // Now convert the three parts to half-precision.
204 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
205 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
206 // +/- Infinity and NaNs -- single biased exponent is 0xff
207 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
208 if(!uSingleSignificand) {
209 // Infinity
210 uHalfSignificand = 0;
211 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700212 // Copy the LBSs of the NaN payload that will fit from the single to the half
213 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
214 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
215 // It's a qNaN; copy the qNaN bit
216 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700217 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700218 // It's a sNaN; make sure the significand is not zero so it stays a NaN
219 // This is needed because not all significand bits are copied from single
220 if(!uHalfSignificand) {
221 // Set the LSB. This is what wikipedia shows for sNAN.
222 uHalfSignificand |= 0x01;
223 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700224 }
225 }
226 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
227 // 0 or a subnormal number -- singled biased exponent is 0
228 uHalfBiasedExponent = 0;
229 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
230 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
231 // Exponent is too large to express in half-precision; round up to infinity
232 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
233 uHalfSignificand = 0;
234 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
235 // Exponent is too small to express in half-precision normal; make it a half-precision subnormal
236 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
237 // Difference between single normal exponent and the base exponent of a half subnormal
238 const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
239 // Also have to shift the significand by the difference in number of bits between a single and a half significand
240 const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
241 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
242 const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
243 uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
244 } else {
245 // The normal case
246 uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
247 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
248 }
249 uHalfSign = uSingleSign;
250
251 // Put the 3 values in the right place for a half precision
252 const uint16_t uHalfPrecision = uHalfSignificand |
253 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
254 (uHalfSign << HALF_SIGN_SHIFT);
255 return uHalfPrecision;
256}
257
258
259// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700260uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700261{
262 // Pull the three parts out of the double-precision float
263 const uint64_t uDouble = CopyDoubleToUint64(d);
264 const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
265 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
266 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
267
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800268
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700269 // Now convert the three parts to half-precision.
270 uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
271 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
272 // +/- Infinity and NaNs -- single biased exponent is 0xff
273 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
274 if(!uDoubleSignificand) {
275 // Infinity
276 uHalfSignificand = 0;
277 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700278 // Copy the LBSs of the NaN payload that will fit from the double to the half
279 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
280 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
281 // It's a qNaN; copy the qNaN bit
282 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700283 } else {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700284 // It's an sNaN; make sure the significand is not zero so it stays a NaN
285 // This is needed because not all significand bits are copied from single
286 if(!uHalfSignificand) {
287 // Set the LSB. This is what wikipedia shows for sNAN.
288 uHalfSignificand |= 0x01;
289 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700290 }
291 }
292 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
293 // 0 or a subnormal number -- double biased exponent is 0
294 uHalfBiasedExponent = 0;
295 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
296 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
297 // Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
298 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
299 uHalfSignificand = 0;
300 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
301 // Exponent is too small to express in half-precision; round down to zero
302 uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
303 // Difference between double normal exponent and the base exponent of a half subnormal
304 const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
305 // Also have to shift the significand by the difference in number of bits between a double and a half significand
306 const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
307 // Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundbladef2801952018-12-17 10:40:29 -0800308 const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700309 uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
310 } else {
311 // The normal case
312 uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
313 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
314 }
315 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800316
317
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700318 // Put the 3 values in the right place for a half precision
319 const uint16_t uHalfPrecision = uHalfSignificand |
320 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
321 (uHalfSign << HALF_SIGN_SHIFT);
322 return uHalfPrecision;
323}
324
325
326// Public function; see ieee754.h
327float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
328{
329 // Pull out the three parts of the half-precision float
330 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
331 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
332 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800333
334
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700335 // Make the three parts of the single-precision number
336 uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
337 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
338 // 0 or subnormal
339 if(uHalfSignificand) {
340 // Subnormal case
341 uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
342 // A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
343 uSingleSignificand = uHalfSignificand;
344 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
345 do {
346 uSingleSignificand <<= 1;
347 uSingleBiasedExponent--;
348 } while ((uSingleSignificand & 0x400) == 0);
349 uSingleSignificand &= HALF_SIGNIFICAND_MASK;
350 uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
351 } else {
352 // Just zero
353 uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
354 uSingleSignificand = 0;
355 }
356 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
357 // NaN or Inifinity
358 uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
359 if(uHalfSignificand) {
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700360 // NaN
361 // First preserve the NaN payload from half to single
362 uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
363 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
364 // Next, set qNaN if needed since half qNaN bit is not copied above
365 uSingleSignificand |= SINGLE_QUIET_NAN_BIT;
366 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700367 } else {
368 // Infinity
369 uSingleSignificand = 0;
370 }
371 } else {
372 // Normal number
373 uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
374 uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
375 }
376 uSingleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800377
378
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700379 // Shift the three parts of the single precision into place
380 const uint32_t uSinglePrecision = uSingleSignificand |
381 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
382 (uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800383
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700384 return CopyUint32ToFloat(uSinglePrecision);
385}
386
387
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700388// Public function; see ieee754.h
389double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
390{
391 // Pull out the three parts of the half-precision float
392 const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
393 const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
394 const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800395
396
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700397 // Make the three parts of hte single-precision number
398 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
399 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
400 // 0 or subnormal
401 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
402 if(uHalfSignificand) {
403 // Subnormal case
404 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
405 // A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
406 uDoubleSignificand = uHalfSignificand;
407 // Shift bits from right of the decimal to left, reducing the exponent by 1 each time
408 do {
409 uDoubleSignificand <<= 1;
410 uDoubleBiasedExponent--;
411 } while ((uDoubleSignificand & 0x400) == 0);
412 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
413 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
414 } else {
415 // Just zero
416 uDoubleSignificand = 0;
417 }
418 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
419 // NaN or Inifinity
420 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
421 if(uHalfSignificand) {
422 // NaN
423 // First preserve the NaN payload from half to single
424 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
425 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
426 // Next, set qNaN if needed since half qNaN bit is not copied above
427 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
428 }
429 } else {
430 // Infinity
431 uDoubleSignificand = 0;
432 }
433 } else {
434 // Normal number
435 uDoubleBiasedExponent = nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS;
436 uDoubleSignificand = (uint64_t)uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
437 }
438 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800439
440
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700441 // Shift the 3 parts into place as a double-precision
442 const uint64_t uDouble = uDoubleSignificand |
443 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
444 (uDoubleSign << DOUBLE_SIGN_SHIFT);
445 return CopyUint64ToDouble(uDouble);
446}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700447
448
449// Public function; see ieee754.h
450IEEE754_union IEEE754_FloatToSmallest(float f)
451{
452 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800453
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700454 // Pull the neeed two parts out of the single-precision float
455 const uint32_t uSingle = CopyFloatToUint32(f);
456 const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
457 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800458
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700459 // Bit mask that is the significand bits that would be lost when converting
460 // from single-precision to half-precision
461 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
462
463 // Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
464 if(uSingle == 0) {
465 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700466 result.uSize = IEEE754_UNION_IS_HALF;
467 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700468 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
469 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700470 result.uSize = IEEE754_UNION_IS_HALF;
471 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700472 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
473 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700474 result.uSize = IEEE754_UNION_IS_HALF;
475 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700476 } else {
477 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700478 result.uSize = IEEE754_UNION_IS_SINGLE;
479 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700480 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800481
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700482 return result;
483}
484
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700485// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700486IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
487{
488 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800489
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700490 // Pull the needed two parts out of the double-precision float
491 const uint64_t uDouble = CopyDoubleToUint64(d);
492 const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
493 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800494
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700495 // Masks to check whether dropped significand bits are zero or not
496 const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
497 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800498
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700499 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700500 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700501 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700502 result.uSize = IEEE754_UNION_IS_HALF;
503 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700504 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
505 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700506 result.uSize = IEEE754_UNION_IS_HALF;
507 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700508 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
509 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700510 result.uSize = IEEE754_UNION_IS_HALF;
511 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700512 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
513 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700514 result.uSize = IEEE754_UNION_IS_SINGLE;
515 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700516 } else {
517 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700518 result.uSize = IEEE754_UNION_IS_DOUBLE;
519 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700520 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800521
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700522 return result;
523}
524