blob: 85422fecee2121f6cb483666af4b9571be9eb06f [file] [log] [blame]
/* ==========================================================================
 * ieee754.h -- Conversion between half, double & single-precision floats
 *
 * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * See BSD-3-Clause license in file named "LICENSE"
 *
 * Created on 7/23/18
 * ========================================================================== */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070012
Laurence Lundblade9682a532020-06-06 18:33:04 -070013
Laurence Lundblade12d32c52018-09-19 11:25:27 -070014#ifndef ieee754_h
15#define ieee754_h
16
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -080017
Laurence Lundblade12d32c52018-09-19 11:25:27 -070018#include <stdint.h>
19
20
/** @file ieee754.h
 *
 * This implements floating-point conversion between half, single and
 * double precision floating-point numbers, in particular conversion to
 * a smaller representation (e.g., double to single) that does not lose
 * precision, as needed for CBOR preferred serialization.
 *
 * This also implements conversion of floats to whole numbers as
 * is required for dCBOR.
 *
 * This implementation works entirely with shifts and masks and does
 * not require any floating-point HW or library.
 *
 * This conforms to IEEE 754-2008, but note that it doesn't specify
 * conversions, just the encodings.
 *
 * This is complete, supporting +/- infinity, +/- zero, subnormals and
 * NaN payloads. NaN payloads are converted to smaller by dropping the
 * rightmost bits if they are zero and shifting to the right. If the
 * rightmost bits are not zero, the conversion is not performed. When
 * converting from smaller to larger, the payload is shifted left and
 * zero-padded. This is what is specified by CBOR preferred
 * serialization and what modern HW conversion instructions do.
 *
 * There is no special handling of signaling and quiet NaNs. It probably
 * isn't necessary to transmit these special NaNs as their purpose is
 * more for propagating errors up through some calculation. In many
 * cases the handling of the NaN payload will work for signaling and
 * quiet NaNs.
 *
 * A previous version of this was usable as a general library for
 * conversion. This version is reduced to what is needed for CBOR.
 */
54
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -080055#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070056
/**
 * @brief Convert half-precision float to double-precision float.
 *
 * @param[in] uHalfPrecision  Half-precision number to convert, given as
 *                            its raw IEEE 754 bit pattern.
 *
 * @returns The double-precision value.
 *
 * This is a lossless conversion because every half-precision value
 * can be represented as a double. There is no error condition.
 *
 * There is no half-precision type in C, so it is represented here as
 * a @c uint16_t. The bits of @c uHalfPrecision are as described for
 * half-precision by IEEE 754.
 */
double
IEEE754_HalfToDouble(uint16_t uHalfPrecision);
Laurence Lundblade12d32c52018-09-19 11:25:27 -070073
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070074
/**
 * Holds a floating-point value that could be half, single or
 * double-precision. The value is in a @c uint64_t that may be copied
 * to a float or double. Simply casting uValue will usually work but
 * may generate compiler or static analyzer warnings. Using
 * UsefulBufUtil_CopyUint64ToDouble() or
 * UsefulBufUtil_CopyUint32ToFloat() will not (and will not generate
 * any extra code).
 */
typedef struct {
   /* Which precision uValue holds. Each enumerator's numeric value is
    * the size of that representation in bytes (2, 4 or 8). */
   enum {IEEE754_UNION_IS_HALF   = 2,
         IEEE754_UNION_IS_SINGLE = 4,
         IEEE754_UNION_IS_DOUBLE = 8,
   } uSize; /* Size of uValue */
   /* The bits of the floating-point value. */
   uint64_t uValue;
} IEEE754_union;
90
91
/**
 * Holds result of an attempt to convert a floating-point
 * number to an int64_t or uint64_t.
 *
 * @c type says which, if any, member of @c integer is meaningful.
 */
struct IEEE754_ToInt {
   enum {
      /* Converted; result is a signed integer. */
      IEEE754_ToInt_IS_INT,
      /* Converted; result is an unsigned integer. */
      IEEE754_ToInt_IS_UINT,
      /* NOTE(review): presumably a negative value whose magnitude does
       * not fit in an int64_t (a CBOR "65-bit" negative) -- confirm
       * against the implementation. */
      IEEE754_ToInt_IS_65BIT_NEG,
      /* The value could not be converted to an integer. */
      IEEE754_ToInt_NO_CONVERSION,
      /* The input was a NaN; no conversion was performed. */
      IEEE754_ToInt_NaN
   } type;
   union {
      uint64_t un_signed;
      int64_t  is_signed;
   } integer;
};
107
108
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700109/**
110 * @brief Convert a double to either single or half-precision.
111 *
112 * @param[in] d The value to convert.
113 * @param[in] bAllowHalfPrecision If true, convert to either half or
114 * single precision.
115 *
116 * @returns Unconverted value, or value converted to single or half-precision.
117 *
118 * This always succeeds. If the value cannot be converted without the
119 * loss of precision, it is not converted.
120 *
121 * This handles all subnormals and NaN payloads.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700122 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700123IEEE754_union
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -0800124IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision, int bNoNaNPayload);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700125
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700126
127/**
128 * @brief Convert a single-precision float to half-precision.
129 *
130 * @param[in] f The value to convert.
131 *
132 * @returns Either unconverted value or value converted to half-precision.
133 *
134 * This always succeeds. If the value cannot be converted without the
135 * loss of precision, it is not converted.
136 *
137 * This handles all subnormals and NaN payloads.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700138 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700139IEEE754_union
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -0800140IEEE754_SingleToHalf(float f, int bNoNanPayloads);
141
142
/**
 * @brief Convert a double-precision float to an integer if whole number.
 *
 * @param[in] d  The value to convert.
 *
 * @returns Either converted number or conversion status.
 *
 * If the value is a whole number that will fit either in a uint64_t
 * or an int64_t, it is converted. If it is a NaN, then there is no
 * conversion and the fact that it is a NaN is indicated in the
 * returned structure. If it can't be converted, then that is
 * indicated in the returned structure.
 *
 * This always returns positive numbers as a uint64_t even if they will
 * fit in an int64_t.
 *
 * This never fails because of precision, but may fail because of range.
 */
struct IEEE754_ToInt
IEEE754_DoubleToInt(double d);
163
164
/**
 * @brief Convert a single-precision float to an integer if whole number.
 *
 * @param[in] f  The value to convert.
 *
 * @returns Either converted number or conversion status.
 *
 * If the value is a whole number that will fit either in a uint64_t
 * or an int64_t, it is converted. If it is a NaN, then there is no
 * conversion and the fact that it is a NaN is indicated in the
 * returned structure. If it can't be converted, then that is
 * indicated in the returned structure.
 *
 * This always returns positive numbers as a uint64_t even if they will
 * fit in an int64_t.
 *
 * This never fails because of precision, but may fail because of range.
 */
struct IEEE754_ToInt
IEEE754_SingleToInt(float f);
185
Laurence Lundbladed883ad32024-03-23 22:37:37 -0700186
/**
 * Out-of-band value returned by IEEE754_UintToDouble() to indicate that
 * no conversion was performed. It cannot collide with a successful
 * result because all successful results are whole numbers.
 */
#define IEEE754_UINT_TO_DOUBLE_OOB 0.5

/**
 * @brief Convert an unsigned integer to a double with no precision loss.
 *
 * @param[in] uInt         The value to convert.
 * @param[in] uIsNegative  0 if positive, 1 if negative.
 *
 * @returns Either the converted number or 0.5
 *          (@ref IEEE754_UINT_TO_DOUBLE_OOB) if no conversion.
 *
 * The conversion will fail if the input can not be represented in the
 * 52 bits of precision that a double has. 0.5 is returned to indicate
 * no conversion. It is out-of-band from non-error results, because
 * all non-error results are whole integers.
 *
 * NOTE(review): an IEEE 754 double stores 52 significand bits plus an
 * implicit leading bit; confirm the exact limit against the
 * implementation.
 */
double
IEEE754_UintToDouble(uint64_t uInt, int uIsNegative);
203
204
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -0800205#endif /* ! QCBOR_DISABLE_PREFERRED_FLOAT */
206
207
/**
 * @brief Tests whether NaN is "quiet" vs having a payload.
 *
 * @param[in] dNum  Double number to test.
 *
 * @returns 0 if a quiet NaN, 1 if it has a payload.
 *
 * A quiet NaN is usually represented as 0x7ff8000000000000. That is,
 * the significand bits are 0x8000000000000. If the significand bits
 * are other than 0x8000000000000 it is considered to have a NaN
 * payload.
 *
 * Note that 0x7ff8000000000000 is not specified in a standard, but it
 * is commonly implemented and chosen by CBOR as the best way to
 * represent a NaN.
 *
 * NOTE(review): the result for an input that is not a NaN is not
 * described here -- confirm against the implementation.
 */
int
IEEE754_DoubleHasNaNPayload(double dNum);
Laurence Lundbladeeb3cdef2024-02-17 20:38:55 -0800226
227
228
/**
 * @brief Tests whether NaN is "quiet" vs having a payload.
 *
 * @param[in] fNum  Float number to test.
 *
 * @returns 0 if a quiet NaN, 1 if it has a payload.
 *
 * See IEEE754_DoubleHasNaNPayload(). A single-precision quiet NaN
 * is 0x7fc00000.
 */
int
IEEE754_SingleHasNaNPayload(float fNum);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700241
242
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700243#endif /* ieee754_h */
244