/*==============================================================================
 ieee754.c -- floating-point conversion between half, double & single-precision

 Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.

 SPDX-License-Identifier: BSD-3-Clause

 See BSD-3-Clause license in README.md

 Created on 7/23/18
 =============================================================================*/

#ifndef QCBOR_DISABLE_PREFERRED_FLOAT

#include "ieee754.h"
#include <string.h> // For memcpy()


/*
 This code is written for clarity and verifiability, not for size, on
 the assumption that the optimizer will do a good job. The LLVM
 optimizer, -Os, does seem to do the job and the resulting object code
 is smaller from combining code for the many different cases (normal,
 subnormal, infinity, zero...) for the conversions. GCC is nowhere near
 as good.

 This code has really long lines and is much easier to read because of
 them. Some coding guidelines prefer 80 column lines (can they not afford
 big displays?). It would make this code much worse even to wrap at 120
 columns.

 Dead stripping is also really helpful to get code size down when
 floating-point encoding is not needed. (If this is put in a library
 and linking is against the library, then dead stripping is automatic).

 This code works solely using shifts and masks and thus has no
 dependency on any math libraries. It can even work if the CPU doesn't
 have any floating-point support, though that isn't the most useful
 thing to do.

 The memcpy() dependency is only for CopyFloatToUint32() and friends,
 which are needed only to avoid type punning when converting the actual
 float bits to an unsigned value so the bit shifts and masks can work.
 */

/*
 The references used to write this code:

 - IEEE 754-2008, particularly section 3.6 and 6.2.1

 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages

 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values

 - https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules

 - https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
 */


// ----- Half Precision -----------
// Field widths, in bits, of an IEEE 754 half-precision (binary16) value
#define HALF_NUM_SIGNIFICAND_BITS (10)
#define HALF_NUM_EXPONENT_BITS    (5)
#define HALF_NUM_SIGN_BITS        (1)

// Bit position of the least significant bit of each field
#define HALF_SIGNIFICAND_SHIFT    (0)
#define HALF_EXPONENT_SHIFT       (HALF_NUM_SIGNIFICAND_BITS)
#define HALF_SIGN_SHIFT           (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)

// In-place masks for each field
#define HALF_SIGNIFICAND_MASK     (0x3ffU) // 0x03ff The lower 10 bits
#define HALF_EXPONENT_MASK        (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
#define HALF_SIGN_MASK            (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
#define HALF_QUIET_NAN_BIT        (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200

/* Biased  Biased  Unbiased  Use
   (hex)   (dec)
    0x00       0     -15     0 and subnormal
    0x01       1     -14     Smallest normal exponent
    0x1e      30      15     Largest normal exponent
    0x1F      31      16     NaN and Infinity  */
#define HALF_EXPONENT_BIAS        (15)
#define HALF_EXPONENT_MAX         (HALF_EXPONENT_BIAS)    //  15 Unbiased
#define HALF_EXPONENT_MIN         (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
#define HALF_EXPONENT_ZERO        (-HALF_EXPONENT_BIAS)   // -15 Unbiased
#define HALF_EXPONENT_INF_OR_NAN  (HALF_EXPONENT_BIAS+1)  //  16 Unbiased


// ------ Single-Precision --------
// Field widths, in bits, of an IEEE 754 single-precision (binary32) value
#define SINGLE_NUM_SIGNIFICAND_BITS (23)
#define SINGLE_NUM_EXPONENT_BITS    (8)
#define SINGLE_NUM_SIGN_BITS        (1)

// Bit position of the least significant bit of each field
#define SINGLE_SIGNIFICAND_SHIFT    (0)
#define SINGLE_EXPONENT_SHIFT       (SINGLE_NUM_SIGNIFICAND_BITS)
#define SINGLE_SIGN_SHIFT           (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)

// In-place masks for each field
#define SINGLE_SIGNIFICAND_MASK     (0x7fffffU) // The lower 23 bits
#define SINGLE_EXPONENT_MASK        (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
#define SINGLE_SIGN_MASK            (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign
#define SINGLE_QUIET_NAN_BIT        (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))

/* Biased  Biased  Unbiased  Use
   (hex)   (dec)
    0x00       0    -127     0 and subnormal
    0x01       1    -126     Smallest normal exponent
    0x7f     127       0     Exponent of 1.0
    0xfe     254     127     Largest normal exponent
    0xff     255     128     NaN and Infinity  */
#define SINGLE_EXPONENT_BIAS        (127)
#define SINGLE_EXPONENT_MAX         (SINGLE_EXPONENT_BIAS)    //  127 unbiased
#define SINGLE_EXPONENT_MIN         (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
#define SINGLE_EXPONENT_ZERO        (-SINGLE_EXPONENT_BIAS)   // -127 unbiased
#define SINGLE_EXPONENT_INF_OR_NAN  (SINGLE_EXPONENT_BIAS+1)  //  128 unbiased


// --------- Double-Precision ----------
// Field widths, in bits, of an IEEE 754 double-precision (binary64) value
#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
#define DOUBLE_NUM_EXPONENT_BITS    (11)
#define DOUBLE_NUM_SIGN_BITS        (1)

// Bit position of the least significant bit of each field
#define DOUBLE_SIGNIFICAND_SHIFT    (0)
#define DOUBLE_EXPONENT_SHIFT       (DOUBLE_NUM_SIGNIFICAND_BITS)
#define DOUBLE_SIGN_SHIFT           (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)

// In-place masks for each field; ULL so the shifts happen in 64 bits
#define DOUBLE_SIGNIFICAND_MASK     (0xfffffffffffffULL) // The lower 52 bits
#define DOUBLE_EXPONENT_MASK        (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
#define DOUBLE_SIGN_MASK            (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
#define DOUBLE_QUIET_NAN_BIT        (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))


/* Biased  Biased  Unbiased  Use
   (hex)   (dec)
    0x000      0   -1023     0 and subnormal
    0x001      1   -1022     Smallest normal exponent
    0x7fe   2046    1023     Largest normal exponent
    0x7ff   2047    1024     NaN and Infinity  */
#define DOUBLE_EXPONENT_BIAS        (1023)
#define DOUBLE_EXPONENT_MAX         (DOUBLE_EXPONENT_BIAS)    //  1023 unbiased
#define DOUBLE_EXPONENT_MIN         (-DOUBLE_EXPONENT_BIAS+1) // -1022 unbiased
#define DOUBLE_EXPONENT_ZERO        (-DOUBLE_EXPONENT_BIAS)   // -1023 unbiased
#define DOUBLE_EXPONENT_INF_OR_NAN  (DOUBLE_EXPONENT_BIAS+1)  //  1024 unbiased



/*
 Convenient functions to avoid type punning, compiler warnings and
 such. The optimizer reduces them to a simple assignment. This is a
 crusty corner of C. It shouldn't be this hard.

 These are also in UsefulBuf.h under a different name. They are copied
 here to avoid a dependency on UsefulBuf.h. There is no object code
 size impact because these always optimize down to a simple assignment.
 */
// Return the raw bits of a float as a uint32_t. memcpy() is used
// rather than a pointer cast to avoid type punning.
static inline uint32_t CopyFloatToUint32(float f)
{
   uint32_t u32;
   memcpy(&u32, &f, sizeof(u32));
   return u32;
}

// Return the raw bits of a double as a uint64_t. memcpy() is used
// rather than a pointer cast to avoid type punning.
static inline uint64_t CopyDoubleToUint64(double d)
{
   uint64_t u64;
   memcpy(&u64, &d, sizeof(u64));
   return u64;
}

// Reinterpret the bits in a uint64_t as a double. memcpy() is used
// rather than a pointer cast to avoid type punning.
static inline double CopyUint64ToDouble(uint64_t u64)
{
   double d;
   memcpy(&d, &u64, sizeof(d));
   return d;
}


173// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700174uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700175{
176 // Pull the three parts out of the single-precision float
177 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800178 const int32_t nSingleUnbiasedExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
179 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
180 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800181
182
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700183 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800184
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700185 // All works is done on uint32_t with conversion to uint16_t at
186 // the end. This avoids integer promotions that static analyzers
187 // complain about and reduces code size.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800188 uint32_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
189
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700190 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
191 // +/- Infinity and NaNs -- single biased exponent is 0xff
192 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
193 if(!uSingleSignificand) {
194 // Infinity
195 uHalfSignificand = 0;
196 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700197 // Copy the LSBs of the NaN payload that will fit from the
198 // single to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700199 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
200 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
201 // It's a qNaN; copy the qNaN bit
202 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700203 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700204 // It's an sNaN; make sure the significand is not zero
205 // so it stays a NaN This is needed because not all
206 // significand bits are copied from single
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700207 if(!uHalfSignificand) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700208 // Set the LSB. This is what wikipedia shows for
209 // sNAN.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700210 uHalfSignificand |= 0x01;
211 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700212 }
213 }
214 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700215 // 0 or a subnormal number -- singled biased exponent is 0
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700216 uHalfBiasedExponent = 0;
217 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
218 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700219 // Exponent is too large to express in half-precision; round
220 // up to infinity
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700221 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
222 uHalfSignificand = 0;
223 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700224 // Exponent is too small to express in half-precision normal;
225 // make it a half-precision subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800226 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700227 uHalfSignificand = 0;
228 // Could convert some of these values to a half-precision
229 // subnormal, but the layer above this will never use it. See
230 // layer above. There is code to do this in github history
231 // for this file, but it was removed because it was never
232 // invoked.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700233 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800234 // The normal case, exponent is in range for half-precision
235 uHalfBiasedExponent = (uint32_t)(nSingleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700236 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
237 }
238 uHalfSign = uSingleSign;
239
240 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800241 const uint32_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700242 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
243 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700244 // Cast is safe because all the masks and shifts above work to
245 // make a half precision value which is only 16 bits.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800246 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700247}
248
249
250// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700251uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700252{
253 // Pull the three parts out of the double-precision float
254 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800255 const int64_t nDoubleUnbiasedExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
256 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
257 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800258
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700259 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800260
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700261 // All works is done on uint64_t with conversion to uint16_t at
262 // the end. This avoids integer promotions that static analyzers
263 // complain about. Other options are for these to be unsigned int
264 // or fast_int16_t. Code size doesn't vary much between all these
265 // options for 64-bit LLVM, 64-bit GCC and 32-bit Armv7 LLVM.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800266 uint64_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
267
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700268 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
269 // +/- Infinity and NaNs -- single biased exponent is 0xff
270 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
271 if(!uDoubleSignificand) {
272 // Infinity
273 uHalfSignificand = 0;
274 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700275 // Copy the LSBs of the NaN payload that will fit from the
276 // double to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700277 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
278 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
279 // It's a qNaN; copy the qNaN bit
280 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700281 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700282 // It's an sNaN; make sure the significand is not zero
283 // so it stays a NaN This is needed because not all
284 // significand bits are copied from single
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700285 if(!uHalfSignificand) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700286 // Set the LSB. This is what wikipedia shows for
287 // sNAN.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700288 uHalfSignificand |= 0x01;
289 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700290 }
291 }
292 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700293 // 0 or a subnormal number -- double biased exponent is 0
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700294 uHalfBiasedExponent = 0;
295 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
296 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700297 // Exponent is too large to express in half-precision; round
298 // up to infinity; TODO, is this really true?
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700299 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
300 uHalfSignificand = 0;
301 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700302 // Exponent is too small to express in half-precision; round
303 // down to zero
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800304 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700305 uHalfSignificand = 0;
306 // Could convert some of these values to a half-precision
307 // subnormal, but the layer above this will never use it. See
308 // layer above. There is code to do this in github history
309 // for this file, but it was removed because it was never
310 // invoked.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700311 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800312 // The normal case, exponent is in range for half-precision
313 uHalfBiasedExponent = (uint32_t)(nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700314 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
315 }
316 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800317
318
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700319 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800320 const uint64_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700321 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
322 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700323 // Cast is safe because all the masks and shifts above work to
324 // make a half precision value which is only 16 bits.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800325 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700326}
327
328
/*
 IEEE754_HalfToFloat() was created but is not needed. It can be retrieved from
 github history if needed.
 */


Laurence Lundblade67bd5512018-11-02 21:44:06 +0700335// Public function; see ieee754.h
336double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
337{
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700338 // Pull out the three parts of the half-precision float. Do all
339 // the work in 64 bits because that is what the end result is. It
340 // may give smaller code size and will keep static analyzers
341 // happier.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800342 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
343 const int64_t nHalfUnBiasedExponent = (int64_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
344 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800345
346
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700347 // Make the three parts of hte single-precision number
348 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
349 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
350 // 0 or subnormal
351 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
352 if(uHalfSignificand) {
353 // Subnormal case
354 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700355 // A half-precision subnormal can always be converted to a
356 // normal double-precision float because the ranges line
357 // up
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700358 uDoubleSignificand = uHalfSignificand;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700359 // Shift bits from right of the decimal to left, reducing
360 // the exponent by 1 each time
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700361 do {
362 uDoubleSignificand <<= 1;
363 uDoubleBiasedExponent--;
364 } while ((uDoubleSignificand & 0x400) == 0);
365 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
366 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
367 } else {
368 // Just zero
369 uDoubleSignificand = 0;
370 }
371 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
372 // NaN or Inifinity
373 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
374 if(uHalfSignificand) {
375 // NaN
376 // First preserve the NaN payload from half to single
377 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
378 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700379 // Next, set qNaN if needed since half qNaN bit is not
380 // copied above
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700381 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
382 }
383 } else {
384 // Infinity
385 uDoubleSignificand = 0;
386 }
387 } else {
388 // Normal number
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800389 uDoubleBiasedExponent = (uint64_t)(nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
390 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700391 }
392 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800393
394
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700395 // Shift the 3 parts into place as a double-precision
396 const uint64_t uDouble = uDoubleSignificand |
397 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
398 (uDoubleSign << DOUBLE_SIGN_SHIFT);
399 return CopyUint64ToDouble(uDouble);
400}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700401
402
Laurence Lundblade9682a532020-06-06 18:33:04 -0700403
/*
 IEEE754_FloatToDouble(uint32_t uFloat) was created but is not needed. It can be retrieved from
 github history if needed.
 */



411// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700412IEEE754_union IEEE754_FloatToSmallest(float f)
413{
414 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800415
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700416 // Pull the neeed two parts out of the single-precision float
417 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800418 const int32_t nSingleExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700419 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800420
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700421 // Bit mask that is the significand bits that would be lost when
422 // converting from single-precision to half-precision
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700423 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
424
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700425 // Optimizer will re organize so there is only one call to
426 // IEEE754_FloatToHalf() in the final code.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700427 if(uSingle == 0) {
428 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700429 result.uSize = IEEE754_UNION_IS_HALF;
430 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700431 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
432 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700433 result.uSize = IEEE754_UNION_IS_HALF;
434 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700435 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
436 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700437 result.uSize = IEEE754_UNION_IS_HALF;
438 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700439 } else {
440 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700441 result.uSize = IEEE754_UNION_IS_SINGLE;
442 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700443 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800444
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700445 return result;
446}
447
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700448// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700449IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
450{
451 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800452
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700453 // Pull the needed two parts out of the double-precision float
454 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800455 const int64_t nDoubleExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
Laurence Lundbladee17e2d72020-07-16 19:15:26 -0700456 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800457
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700458 // Masks to check whether dropped significand bits are zero or not
Laurence Lundbladeb992fdb2020-07-20 22:44:11 -0700459 const uint64_t uDroppedHalfBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700460 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800461
Laurence Lundblade29ec4642020-07-21 21:11:45 -0700462 // This will not convert to half-precion or single-precision
463 // subnormals. Values that could be converted will be output as
464 // the double they are or occasionally to a normal single. This
465 // could be implemented, but it is more code and would rarely be
466 // used and rarely reduce the output size.
Laurence Lundbladeb992fdb2020-07-20 22:44:11 -0700467
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700468 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700469 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700470 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700471 result.uSize = IEEE754_UNION_IS_HALF;
472 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700473 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
474 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700475 result.uSize = IEEE754_UNION_IS_HALF;
476 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundbladeb992fdb2020-07-20 22:44:11 -0700477 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedHalfBits))) {
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700478 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700479 result.uSize = IEEE754_UNION_IS_HALF;
480 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700481 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
482 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700483 result.uSize = IEEE754_UNION_IS_SINGLE;
484 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700485 } else {
486 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700487 result.uSize = IEEE754_UNION_IS_DOUBLE;
488 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700489 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800490
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700491 return result;
492}
493
Laurence Lundbladefe09bbf2020-07-16 12:14:51 -0700494#else
495
496int x;
497
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -0700498#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */