Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 1 | /* ========================================================================== |
| 2 | * ieee754.h -- Conversion between half, double & single-precision floats |
| 3 | * |
| 4 | * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved. |
| 5 | * |
| 6 | * SPDX-License-Identifier: BSD-3-Clause |
| 7 | * |
Laurence Lundblade | 4903e55 | 2024-08-21 11:13:48 -0700 | [diff] [blame] | 8 | * See BSD-3-Clause license in file named "LICENSE" |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 9 | * |
| 10 | * Created on 7/23/18 |
| 11 | * ========================================================================== */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 12 | |
Laurence Lundblade | b275cdc | 2020-07-12 12:34:38 -0700 | [diff] [blame] | 13 | #ifndef QCBOR_DISABLE_PREFERRED_FLOAT |
Laurence Lundblade | 9682a53 | 2020-06-06 18:33:04 -0700 | [diff] [blame] | 14 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 15 | #ifndef ieee754_h |
| 16 | #define ieee754_h |
| 17 | |
| 18 | #include <stdint.h> |
| 19 | |
| 20 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 21 | /** @file ieee754.h |
| 22 | * |
| 23 | * This implements floating-point conversion between half, single and |
| 24 | * double precision floating-point numbers, in particular conversion to |
| 25 | * smaller representation (e.g., double to single) that does not lose |
| 26 | * precision for CBOR preferred serialization. |
| 27 | * |
| 28 | * This implementation works entirely with shifts and masks and does |
| 29 | * not require any floating-point HW or library. |
| 30 | * |
| 31 | * This conforms to IEEE 754-2008, but note that it doesn't specify |
| 32 | * conversions, just the encodings. |
| 33 | * |
| 34 | * This is complete, supporting +/- infinity, +/- zero, subnormals and |
| 35 | * NaN payloads. NaN payloads are converted to smaller by dropping the |
| 36 | * right most bits if they are zero and shifting to the right. If the |
| 37 | * rightmost bits are not zero the conversion is not performed. When |
| 38 | * converting from smaller to larger, the payload is shifted left and |
| 39 | * zero-padded. This is what is specified by CBOR preferred |
| 40 | * serialization and what modern HW conversion instructions do. CBOR |
| 41 | * CDE handling for NaN is not clearly specified, but upcoming |
| 42 | * documents may clarify this. |
| 43 | * |
| 44 | * There is no special handling of signaling and quiet NaNs. It probably |
| 45 | * isn't necessary to transmit these special NaNs as their purpose is |
| 46 | * more for propagating errors up through some calculation. In many |
| 47 | * cases the handling of the NaN payload will work for signaling and quiet |
| 48 | * NaNs. |
| 49 | * |
| 50 | * A previous version of this was usable as a general library for |
| 51 | * conversion. This version is reduced to what is needed for CBOR. |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 52 | */ |
| 53 | |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 54 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 55 | /** |
| 56 | * @brief Convert half-precision float to double-precision float. |
| 57 | * |
| 58 | * @param[in] uHalfPrecision Half-precision number to convert. |
| 59 | * |
| 60 | * @returns Double-precision value. |
| 61 | * |
| 62 | * This is a lossless conversion because every half-precision value |
| 63 | * can be represented as a double. There is no error condition. |
| 64 | * |
| 65 | * There is no half-precision type in C, so it is represented here as |
| 66 | * a @c uint16_t. The bits of @c uHalfPrecision are as described for |
| 67 | * half-precision by IEEE 754. |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 68 | */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 69 | double |
| 70 | IEEE754_HalfToDouble(uint16_t uHalfPrecision); |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 71 | |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 72 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 73 | /** Holds a floating-point value that could be half, single or |
| 74 | * double-precision; @c uSize records which. The value is in a |
| 75 | * @c uint64_t that may be copied to a float or double. Simply casting |
| 76 | * uValue will usually work but may generate compiler or static |
| 77 | * analyzer warnings. Using UsefulBufUtil_CopyUint64ToDouble() or |
| 78 | * UsefulBufUtil_CopyUint32ToFloat() will not (and will not generate |
| 79 | * any extra code). |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 80 | */ |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 81 | typedef struct { |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 82 | enum {IEEE754_UNION_IS_HALF = 2, |
| 83 | IEEE754_UNION_IS_SINGLE = 4, |
| 84 | IEEE754_UNION_IS_DOUBLE = 8, |
| 85 | } uSize; /* Size in bytes of the float in uValue */ |
| 86 | uint64_t uValue; |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 87 | } IEEE754_union; |
| 88 | |
| 89 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 90 | /** |
| 91 | * @brief Convert a double to either single or half-precision. |
| 92 | * |
| 93 | * @param[in] d The value to convert. |
| 94 | * @param[in] bAllowHalfPrecision If true, convert to either half or |
| 95 | * single precision. NOTE(review): presumably only single is tried when false -- confirm. |
| 96 | * |
| 97 | * @returns Unconverted value, or value converted to single or half-precision; @c uSize in the result gives its size. |
| 98 | * |
| 99 | * This always succeeds. If the value cannot be converted without the |
| 100 | * loss of precision, it is not converted. |
| 101 | * |
| 102 | * This handles all subnormals and NaN payloads. |
Laurence Lundblade | 8db3d3e | 2018-09-29 11:46:37 -0700 | [diff] [blame] | 103 | */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 104 | IEEE754_union |
| 105 | IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision); |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 106 | |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 107 | |
| 108 | /** |
| 109 | * @brief Convert a single-precision float to half-precision. |
| 110 | * |
| 111 | * @param[in] f The value to convert. |
| 112 | * |
| 113 | * @returns Either unconverted value or value converted to half-precision; @c uSize in the result gives its size. |
| 114 | * |
| 115 | * This always succeeds. If the value cannot be converted without the |
| 116 | * loss of precision, it is not converted. |
| 117 | * |
| 118 | * This handles all subnormals and NaN payloads. |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 119 | */ |
Laurence Lundblade | 83dbf5c | 2024-01-07 19:17:52 -0700 | [diff] [blame] | 120 | IEEE754_union |
| 121 | IEEE754_SingleToHalf(float f); |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 122 | |
| 123 | |
Laurence Lundblade | 12d32c5 | 2018-09-19 11:25:27 -0700 | [diff] [blame] | 124 | #endif /* ieee754_h */ |
| 125 | |
Laurence Lundblade | b275cdc | 2020-07-12 12:34:38 -0700 | [diff] [blame] | 126 | #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */ |