blob: a8079f8cff0c9c22a963bc6e7c7e4c8b01508333 [file] [log] [blame]
/*==============================================================================
 ieee754.c -- floating-point conversion between half, double & single-precision

 Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
 Copyright (c) 2021, Arm Limited. All rights reserved.

 SPDX-License-Identifier: BSD-3-Clause

 See BSD-3-Clause license in README.md

 Created on 7/23/18
 =============================================================================*/
Laurence Lundbladecc2ed342018-09-22 17:29:55 -070013
/*
 Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
 QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
 */
18#include "qcbor/qcbor_common.h"
19
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -070020#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
Laurence Lundblade9682a532020-06-06 18:33:04 -070021
Laurence Lundblade12d32c52018-09-19 11:25:27 -070022#include "ieee754.h"
23#include <string.h> // For memcpy()
24
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070025
/*
 This code is written for clarity and verifiability, not for size, on
 the assumption that the optimizer will do a good job. The LLVM
 optimizer, -Os, does seem to do the job and the resulting object code
 is smaller from combining code for the many different cases (normal,
 subnormal, infinity, zero...) for the conversions. GCC is nowhere near
 as good.

 This code has really long lines and is much easier to read because of
 them. Some coding guidelines prefer 80 column lines (can they not afford
 big displays?). It would make this code much worse even to wrap at 120
 columns.

 Dead stripping is also really helpful to get code size down when
 floating-point encoding is not needed. (If this is put in a library
 and linking is against the library, then dead stripping is automatic).

 This code works solely using shifts and masks and thus has no
 dependency on any math libraries. It can even work if the CPU doesn't
 have any floating-point support, though that isn't the most useful
 thing to do.

 The memcpy() dependency is only for CopyFloatToUint32() and friends,
 which are only needed to avoid type punning when converting the actual
 float bits to an unsigned value so the bit shifts and masks can work.
 */
52
/*
 The references used to write this code:

 - IEEE 754-2008, particularly section 3.6 and 6.2.1

 - https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages

 - https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values

 - https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules

 - https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
 */
66
67
// ----- Half-Precision -----------
// Field widths, shifts and masks for the IEEE 754 binary16 layout:
// 1 sign bit, 5 exponent bits, 10 significand bits.
#define HALF_NUM_SIGNIFICAND_BITS (10)
#define HALF_NUM_EXPONENT_BITS    (5)
#define HALF_NUM_SIGN_BITS        (1)

#define HALF_SIGNIFICAND_SHIFT    (0)
#define HALF_EXPONENT_SHIFT       (HALF_NUM_SIGNIFICAND_BITS)
#define HALF_SIGN_SHIFT           (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)

#define HALF_SIGNIFICAND_MASK     (0x3ffU)                                 // 0x03ff  The lower 10 bits
#define HALF_EXPONENT_MASK        (0x1fU << HALF_EXPONENT_SHIFT)           // 0x7c00  5 bits of exponent
#define HALF_SIGN_MASK            (0x01U << HALF_SIGN_SHIFT)               // 0x8000  1 bit of sign
#define HALF_QUIET_NAN_BIT        (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200  MSB of the significand

/* Biased  Biased  Unbiased  Use
    0x00       0      -15    0 and subnormal
    0x01       1      -14    Smallest normal exponent
    0x1e      30       15    Largest normal exponent
    0x1F      31       16    NaN and Infinity  */
#define HALF_EXPONENT_BIAS        (15)
#define HALF_EXPONENT_MAX         (HALF_EXPONENT_BIAS)    //  15 Unbiased
#define HALF_EXPONENT_MIN         (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
#define HALF_EXPONENT_ZERO        (-HALF_EXPONENT_BIAS)   // -15 Unbiased
#define HALF_EXPONENT_INF_OR_NAN  (HALF_EXPONENT_BIAS+1)  //  16 Unbiased
92
93
// ------ Single-Precision --------
// Field widths, shifts and masks for the IEEE 754 binary32 layout:
// 1 sign bit, 8 exponent bits, 23 significand bits.
#define SINGLE_NUM_SIGNIFICAND_BITS (23)
#define SINGLE_NUM_EXPONENT_BITS    (8)
#define SINGLE_NUM_SIGN_BITS        (1)

#define SINGLE_SIGNIFICAND_SHIFT    (0)
#define SINGLE_EXPONENT_SHIFT       (SINGLE_NUM_SIGNIFICAND_BITS)
#define SINGLE_SIGN_SHIFT           (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)

#define SINGLE_SIGNIFICAND_MASK     (0x7fffffU)                                // The lower 23 bits
#define SINGLE_EXPONENT_MASK        (0xffU << SINGLE_EXPONENT_SHIFT)           // 8 bits of exponent
#define SINGLE_SIGN_MASK            (0x01U << SINGLE_SIGN_SHIFT)               // 1 bit of sign
#define SINGLE_QUIET_NAN_BIT        (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1)) // MSB of the significand

/* Biased  Biased  Unbiased  Use
    0x0000     0     -127    0 and subnormal
    0x0001     1     -126    Smallest normal exponent
    0x7f     127        0    1
    0xfe     254      127    Largest normal exponent
    0xff     255      128    NaN and Infinity  */
#define SINGLE_EXPONENT_BIAS        (127)
#define SINGLE_EXPONENT_MAX         (SINGLE_EXPONENT_BIAS)    //  127 unbiased
#define SINGLE_EXPONENT_MIN         (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
#define SINGLE_EXPONENT_ZERO        (-SINGLE_EXPONENT_BIAS)   // -127 unbiased
#define SINGLE_EXPONENT_INF_OR_NAN  (SINGLE_EXPONENT_BIAS+1)  //  128 unbiased
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700119
120
// --------- Double-Precision ----------
// Field widths, shifts and masks for the IEEE 754 binary64 layout:
// 1 sign bit, 11 exponent bits, 52 significand bits.
#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
#define DOUBLE_NUM_EXPONENT_BITS    (11)
#define DOUBLE_NUM_SIGN_BITS        (1)

#define DOUBLE_SIGNIFICAND_SHIFT    (0)
#define DOUBLE_EXPONENT_SHIFT       (DOUBLE_NUM_SIGNIFICAND_BITS)
#define DOUBLE_SIGN_SHIFT           (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)

#define DOUBLE_SIGNIFICAND_MASK     (0xfffffffffffffULL)                         // The lower 52 bits
#define DOUBLE_EXPONENT_MASK        (0x7ffULL << DOUBLE_EXPONENT_SHIFT)          // 11 bits of exponent
#define DOUBLE_SIGN_MASK            (0x01ULL << DOUBLE_SIGN_SHIFT)               // 1 bit of sign
#define DOUBLE_QUIET_NAN_BIT        (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1)) // MSB of the significand


/* Biased      Biased   Unbiased  Use
   0x00000000     0      -1023    0 and subnormal
   0x00000001     1      -1022    Smallest normal exponent
   0x000007fe  2046       1023    Largest normal exponent
   0x000007ff  2047       1024    NaN and Infinity  */
#define DOUBLE_EXPONENT_BIAS        (1023)
#define DOUBLE_EXPONENT_MAX         (DOUBLE_EXPONENT_BIAS)    //  1023 unbiased
#define DOUBLE_EXPONENT_MIN         (-DOUBLE_EXPONENT_BIAS+1) // -1022 unbiased
#define DOUBLE_EXPONENT_ZERO        (-DOUBLE_EXPONENT_BIAS)   // -1023 unbiased
#define DOUBLE_EXPONENT_INF_OR_NAN  (DOUBLE_EXPONENT_BIAS+1)  //  1024 unbiased
146
147
148
/*
 Convenient functions to avoid type punning, compiler warnings and
 such. The optimizer reduces them to a simple assignment. This is a
 crusty corner of C. It shouldn't be this hard.

 These are also in UsefulBuf.h under a different name. They are copied
 here to avoid a dependency on UsefulBuf.h. There is no object code
 size impact because these always optimize down to a simple assignment.
 */
/*
 Return the raw bits of a float as a uint32_t.

 The bytes are copied with memcpy() rather than read through a cast
 pointer, so no type punning is involved. The copy length is taken
 from the destination so it can never write past u32.
 */
static inline uint32_t CopyFloatToUint32(float f)
{
    uint32_t u32;
    memcpy(&u32, &f, sizeof u32);
    return u32;
}
164
/*
 Return the raw bits of a double as a uint64_t.

 The bytes are copied with memcpy() rather than read through a cast
 pointer, so no type punning is involved. The copy length is taken
 from the destination so it can never write past u64.
 */
static inline uint64_t CopyDoubleToUint64(double d)
{
    uint64_t u64;
    memcpy(&u64, &d, sizeof u64);
    return u64;
}
171
/*
 Build a double from its raw bits held in a uint64_t.

 The bytes are copied with memcpy() rather than written through a cast
 pointer, so no type punning is involved. The copy length is taken
 from the destination so it can never write past d.
 */
static inline double CopyUint64ToDouble(uint64_t u64)
{
    double d;
    memcpy(&d, &u64, sizeof d);
    return d;
}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700178
179
180// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700181uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700182{
183 // Pull the three parts out of the single-precision float
184 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800185 const int32_t nSingleUnbiasedExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
186 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
187 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800188
189
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700190 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800191
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700192 // All works is done on uint32_t with conversion to uint16_t at
193 // the end. This avoids integer promotions that static analyzers
194 // complain about and reduces code size.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800195 uint32_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
196
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700197 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
198 // +/- Infinity and NaNs -- single biased exponent is 0xff
199 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
200 if(!uSingleSignificand) {
201 // Infinity
202 uHalfSignificand = 0;
203 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700204 // Copy the LSBs of the NaN payload that will fit from the
205 // single to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700206 uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
207 if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
208 // It's a qNaN; copy the qNaN bit
209 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700210 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700211 // It's an sNaN; make sure the significand is not zero
212 // so it stays a NaN This is needed because not all
213 // significand bits are copied from single
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700214 if(!uHalfSignificand) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700215 // Set the LSB. This is what wikipedia shows for
216 // sNAN.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700217 uHalfSignificand |= 0x01;
218 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700219 }
220 }
221 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700222 // 0 or a subnormal number -- singled biased exponent is 0
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700223 uHalfBiasedExponent = 0;
224 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
225 } else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700226 // Exponent is too large to express in half-precision; round
227 // up to infinity
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700228 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
229 uHalfSignificand = 0;
230 } else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700231 // Exponent is too small to express in half-precision normal;
232 // make it a half-precision subnormal
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800233 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700234 uHalfSignificand = 0;
235 // Could convert some of these values to a half-precision
236 // subnormal, but the layer above this will never use it. See
237 // layer above. There is code to do this in github history
238 // for this file, but it was removed because it was never
239 // invoked.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700240 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800241 // The normal case, exponent is in range for half-precision
242 uHalfBiasedExponent = (uint32_t)(nSingleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700243 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
244 }
245 uHalfSign = uSingleSign;
246
247 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800248 const uint32_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700249 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
250 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700251 // Cast is safe because all the masks and shifts above work to
252 // make a half precision value which is only 16 bits.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800253 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700254}
255
256
257// Public function; see ieee754.h
Laurence Lundbladecc2ed342018-09-22 17:29:55 -0700258uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700259{
260 // Pull the three parts out of the double-precision float
261 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800262 const int64_t nDoubleUnbiasedExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
263 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
264 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800265
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700266 // Now convert the three parts to half-precision.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800267
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700268 // All works is done on uint64_t with conversion to uint16_t at
269 // the end. This avoids integer promotions that static analyzers
270 // complain about. Other options are for these to be unsigned int
271 // or fast_int16_t. Code size doesn't vary much between all these
272 // options for 64-bit LLVM, 64-bit GCC and 32-bit Armv7 LLVM.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800273 uint64_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
274
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700275 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
276 // +/- Infinity and NaNs -- single biased exponent is 0xff
277 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
278 if(!uDoubleSignificand) {
279 // Infinity
280 uHalfSignificand = 0;
281 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700282 // Copy the LSBs of the NaN payload that will fit from the
283 // double to the half
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700284 uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
285 if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
286 // It's a qNaN; copy the qNaN bit
287 uHalfSignificand |= HALF_QUIET_NAN_BIT;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700288 } else {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700289 // It's an sNaN; make sure the significand is not zero
290 // so it stays a NaN This is needed because not all
291 // significand bits are copied from single
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700292 if(!uHalfSignificand) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700293 // Set the LSB. This is what wikipedia shows for
294 // sNAN.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700295 uHalfSignificand |= 0x01;
296 }
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700297 }
298 }
299 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700300 // 0 or a subnormal number -- double biased exponent is 0
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700301 uHalfBiasedExponent = 0;
302 uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
303 } else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700304 // Exponent is too large to express in half-precision; round
305 // up to infinity; TODO, is this really true?
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700306 uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
307 uHalfSignificand = 0;
308 } else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700309 // Exponent is too small to express in half-precision; round
310 // down to zero
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800311 uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700312 uHalfSignificand = 0;
313 // Could convert some of these values to a half-precision
314 // subnormal, but the layer above this will never use it. See
315 // layer above. There is code to do this in github history
316 // for this file, but it was removed because it was never
317 // invoked.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700318 } else {
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800319 // The normal case, exponent is in range for half-precision
320 uHalfBiasedExponent = (uint32_t)(nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700321 uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
322 }
323 uHalfSign = uDoubleSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800324
325
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700326 // Put the 3 values in the right place for a half precision
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800327 const uint64_t uHalfPrecision = uHalfSignificand |
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700328 (uHalfBiasedExponent << HALF_EXPONENT_SHIFT) |
329 (uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700330 // Cast is safe because all the masks and shifts above work to
331 // make a half precision value which is only 16 bits.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800332 return (uint16_t)uHalfPrecision;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700333}
334
335
/*
 IEEE754_HalfToFloat() was created but is not needed. It can be retrieved from
 github history if needed.
 */
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700340
341
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700342// Public function; see ieee754.h
343double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
344{
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700345 // Pull out the three parts of the half-precision float. Do all
346 // the work in 64 bits because that is what the end result is. It
347 // may give smaller code size and will keep static analyzers
348 // happier.
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800349 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
350 const int64_t nHalfUnBiasedExponent = (int64_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
351 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800352
353
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700354 // Make the three parts of hte single-precision number
355 uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
356 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
357 // 0 or subnormal
358 uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
359 if(uHalfSignificand) {
360 // Subnormal case
361 uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700362 // A half-precision subnormal can always be converted to a
363 // normal double-precision float because the ranges line
364 // up
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700365 uDoubleSignificand = uHalfSignificand;
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700366 // Shift bits from right of the decimal to left, reducing
367 // the exponent by 1 each time
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700368 do {
369 uDoubleSignificand <<= 1;
370 uDoubleBiasedExponent--;
371 } while ((uDoubleSignificand & 0x400) == 0);
372 uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
373 uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
374 } else {
375 // Just zero
376 uDoubleSignificand = 0;
377 }
378 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
379 // NaN or Inifinity
380 uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
381 if(uHalfSignificand) {
382 // NaN
383 // First preserve the NaN payload from half to single
384 uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
385 if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700386 // Next, set qNaN if needed since half qNaN bit is not
387 // copied above
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700388 uDoubleSignificand |= DOUBLE_QUIET_NAN_BIT;
389 }
390 } else {
391 // Infinity
392 uDoubleSignificand = 0;
393 }
394 } else {
395 // Normal number
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800396 uDoubleBiasedExponent = (uint64_t)(nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
397 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700398 }
399 uDoubleSign = uHalfSign;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800400
401
Laurence Lundblade67bd5512018-11-02 21:44:06 +0700402 // Shift the 3 parts into place as a double-precision
403 const uint64_t uDouble = uDoubleSignificand |
404 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
405 (uDoubleSign << DOUBLE_SIGN_SHIFT);
406 return CopyUint64ToDouble(uDouble);
407}
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700408
409
Laurence Lundblade9682a532020-06-06 18:33:04 -0700410
/*
 IEEE754_FloatToDouble(uint32_t uFloat) was created but is not needed. It can
 be retrieved from github history if needed.
 */
Laurence Lundblade9682a532020-06-06 18:33:04 -0700415
416
417
418// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700419IEEE754_union IEEE754_FloatToSmallest(float f)
420{
421 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800422
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700423 // Pull the neeed two parts out of the single-precision float
424 const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800425 const int32_t nSingleExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700426 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800427
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700428 // Bit mask that is the significand bits that would be lost when
429 // converting from single-precision to half-precision
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700430 const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
431
Laurence Lundblade576aa0c2020-07-21 21:36:52 -0700432 // Optimizer will re organize so there is only one call to
433 // IEEE754_FloatToHalf() in the final code.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700434 if(uSingle == 0) {
435 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700436 result.uSize = IEEE754_UNION_IS_HALF;
437 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700438 } else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
439 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700440 result.uSize = IEEE754_UNION_IS_HALF;
441 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700442 } else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
443 // Normal number in exponent range and precision won't be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700444 result.uSize = IEEE754_UNION_IS_HALF;
445 result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700446 } else {
447 // Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade577d8212018-11-01 14:04:08 +0700448 result.uSize = IEEE754_UNION_IS_SINGLE;
449 result.uValue = uSingle;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700450 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800451
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700452 return result;
453}
454
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700455// Public function; see ieee754.h
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700456IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
457{
458 IEEE754_union result;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800459
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700460 // Pull the needed two parts out of the double-precision float
461 const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundbladec5fef682020-01-25 11:38:45 -0800462 const int64_t nDoubleExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
Laurence Lundbladee17e2d72020-07-16 19:15:26 -0700463 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800464
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700465 // Masks to check whether dropped significand bits are zero or not
Laurence Lundbladeb992fdb2020-07-20 22:44:11 -0700466 const uint64_t uDroppedHalfBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700467 const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800468
Laurence Lundblade29ec4642020-07-21 21:11:45 -0700469 // This will not convert to half-precion or single-precision
470 // subnormals. Values that could be converted will be output as
471 // the double they are or occasionally to a normal single. This
472 // could be implemented, but it is more code and would rarely be
473 // used and rarely reduce the output size.
Laurence Lundbladeb992fdb2020-07-20 22:44:11 -0700474
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700475 // The various cases
Laurence Lundbladed711fb22018-09-26 14:35:22 -0700476 if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700477 // Value is 0.0000, not a a subnormal
Laurence Lundblade577d8212018-11-01 14:04:08 +0700478 result.uSize = IEEE754_UNION_IS_HALF;
479 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700480 } else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
481 // NaN, +/- infinity
Laurence Lundblade577d8212018-11-01 14:04:08 +0700482 result.uSize = IEEE754_UNION_IS_HALF;
483 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundbladeb992fdb2020-07-20 22:44:11 -0700484 } else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedHalfBits))) {
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700485 // Can convert to half without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700486 result.uSize = IEEE754_UNION_IS_HALF;
487 result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700488 } else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
489 // Can convert to single without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700490 result.uSize = IEEE754_UNION_IS_SINGLE;
491 result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700492 } else {
493 // Can't convert without precision loss
Laurence Lundblade577d8212018-11-01 14:04:08 +0700494 result.uSize = IEEE754_UNION_IS_DOUBLE;
495 result.uValue = uDouble;
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700496 }
Laurence Lundblade3aee3a32018-12-17 16:17:45 -0800497
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700498 return result;
499}
500
Laurence Lundbladefe09bbf2020-07-16 12:14:51 -0700501#else
502
// Only compiled when QCBOR_DISABLE_PREFERRED_FLOAT is defined.
// NOTE(review): presumably a placeholder so this translation unit is
// not empty in that configuration -- confirm nothing references it.
int x;
504
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -0700505#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */