blob: 863019b2bf42e099513d57c99bfbbf16629864dd [file] [log] [blame]
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -07001/* ==========================================================================
2 * ieee754.h -- Conversion between half, double & single-precision floats
3 *
4 * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
5 *
6 * SPDX-License-Identifier: BSD-3-Clause
7 *
8 * See BSD-3-Clause license in README.md
9 *
10 * Created on 7/23/18
11 * ========================================================================== */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070012
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -070013#ifndef QCBOR_DISABLE_PREFERRED_FLOAT
Laurence Lundblade9682a532020-06-06 18:33:04 -070014
Laurence Lundblade12d32c52018-09-19 11:25:27 -070015#ifndef ieee754_h
16#define ieee754_h
17
18#include <stdint.h>
19
20
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070021/** @file ieee754.h
22 *
23 * This implements floating-point conversion between half, single and
24 * double precision floating-point numbers, in particular conversion to
25 * smaller representation (e.g., double to single) that does not lose
26 * precision for CBOR preferred serialization.
27 *
28 * This implementation works entirely with shifts and masks and does
29 * not require any floating-point HW or library.
30 *
31 * This conforms to IEEE 754-2008, but note that it doesn't specify
32 * conversions, just the encodings.
33 *
34 * This is complete, supporting +/- infinity, +/- zero, subnormals and
35 * NaN payloads. NaN payloads are converted to smaller by dropping the
36 * rightmost bits if they are zero and shifting to the right. If the
37 * rightmost bits are not zero the conversion is not performed. When
38 * converting from smaller to larger, the payload is shifted left and
39 * zero-padded. This is what is specified by CBOR preferred
40 * serialization and what modern HW conversion instructions do. CBOR
41 * CDE handling for NaN is not clearly specified, but upcoming
42 * documents may clarify this.
43 *
44 * There is no special handling of signaling and quiet NaNs. It probably
45 * isn't necessary to transmit these special NaNs as their purpose is
46 * more for propagating errors up through some calculation. In many
47 * cases the handling of the NaN payload will work for signaling and quiet
48 * NaNs.
49 *
50 * A previous version of this was usable as a general library for
51 * conversion. This version is reduced to what is needed for CBOR.
Laurence Lundblade12d32c52018-09-19 11:25:27 -070052 */
53
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070054
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070055/**
56 * @brief Convert half-precision float to double-precision float.
57 *
58 * @param[in] uHalfPrecision Half-precision number to convert.
59 *
60 * @returns double-precision value.
61 *
62 * This is a lossless conversion because every half-precision value
63 * can be represented as a double. There is no error condition.
64 *
65 * There is no half-precision type in C, so it is represented here as
66 * a @c uint16_t. The bits of @c uHalfPrecision are as described for
67 * half-precision by IEEE 754.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070068 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070069double
70IEEE754_HalfToDouble(uint16_t uHalfPrecision);
Laurence Lundblade12d32c52018-09-19 11:25:27 -070071
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070072
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070073/** Holds a floating-point value that could be half, single or
74 * double-precision. The value is in a @c uint64_t that may be copied
75 * to a float or double. Simply casting uValue will usually work but
76 * may generate compiler or static analyzer warnings. Using
77 * UsefulBufUtil_CopyUint64ToDouble() or
78 * UsefulBufUtil_CopyUint32ToFloat() avoids those warnings (and does
79 * not generate any extra code). @c uSize tells which format is held.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070080 */
Laurence Lundblade12d32c52018-09-19 11:25:27 -070081typedef struct {
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070082 enum {IEEE754_UNION_IS_HALF = 2,
83 IEEE754_UNION_IS_SINGLE = 4,
84 IEEE754_UNION_IS_DOUBLE = 8,
85 } uSize; /* Size in bytes of the floating-point format held in uValue */
86 uint64_t uValue;
Laurence Lundblade12d32c52018-09-19 11:25:27 -070087} IEEE754_union;
88
89
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -070090/**
91 * @brief Convert a double to either single or half-precision.
92 *
93 * @param[in] d The value to convert.
94 * @param[in] bAllowHalfPrecision If true, convert to either half or
95 * single precision.
96 *
97 * @returns Unconverted value, or value converted to single or half-precision.
98 *
99 * This always succeeds. If the value cannot be converted without
100 * loss of precision, it is returned unconverted. The @c uSize member
101 * of the result gives the size of the value in @c uValue. This
102 * handles all subnormals and NaN payloads.
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700103 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700104IEEE754_union
105IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700106
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700107
108/**
109 * @brief Convert a single-precision float to half-precision.
110 *
111 * @param[in] f The value to convert.
112 *
113 * @returns Either unconverted value or value converted to half-precision.
114 *
115 * This always succeeds. If the value cannot be converted without
116 * loss of precision, it is returned unconverted. The @c uSize member
117 * of the result gives the size of the value in @c uValue. This
118 * handles all subnormals and NaN payloads.
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700119 */
Laurence Lundblade83dbf5c2024-01-07 19:17:52 -0700120IEEE754_union
121IEEE754_SingleToHalf(float f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700122
123
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700124#endif /* ieee754_h */
125
Laurence Lundbladeb275cdc2020-07-12 12:34:38 -0700126#endif /* QCBOR_DISABLE_PREFERRED_FLOAT */