blob: dfe45fb0606acf9782e5d00ff3f968129af8e285 [file] [log] [blame]
/*==============================================================================
 ieee754.h -- floating point conversion between half, double and single precision

 Copyright (c) 2018-2019, Laurence Lundblade. All rights reserved.

 SPDX-License-Identifier: BSD-3-Clause

 See BSD-3-Clause license in README.md

 Created on 7/23/18
 ==============================================================================*/
Laurence Lundblade12d32c52018-09-19 11:25:27 -070012
13#ifndef ieee754_h
14#define ieee754_h
15
16#include <stdint.h>
17
18
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070019
/*
 General comments

 This is a complete implementation in that it handles all conversion
 cases including +/- infinity, +/- zero, subnormal numbers, qNaN, sNaN
 and NaN payloads.

 This conforms to IEEE 754-2008, but note that the standard doesn't
 specify conversions, just the encodings.

 NaN payloads are preserved with alignment on the LSB. The
 qNaN bit is handled differently and explicitly copied. It
 is always the MSB of the significand. The NaN payload MSBs
 (except the qNaN bit) are truncated when going from
 double or single to half.

 TODO: what does the C cast do with NaN payloads from
 double to single?
 */
42
/*
 Most simply, just explicitly encode the type you want, single or double.
 This works easily everywhere since standard C supports both
 these types and so does qcbor. This encoder also supports
 half precision and there are a few ways to use it to encode
 floating point numbers in less space.

 Without losing precision, you can encode a single or double
 such that the special values of 0, NaN and Infinity encode
 as half-precision. This CBOR decoder and most others
 should handle this properly.

 If you don't mind losing precision, then you can use half-precision.
 One way to do this is to set up your environment to use
 __fp16. Some compilers and CPUs support it even though it is not
 standard C. What is nice about this is that your program
 will use less memory and floating point operations like
 multiplying, adding and such will be faster.

 Another way to make use of half-precision is to represent
 the values in your program as single or double, but encode
 them in CBOR as half-precision. This cuts the size
 of the encoded messages by a factor of 2 or 4, but doesn't reduce
 memory needs or speed because you are still using
 single or double in your code.


 encode:
 - float as float
 - double as double
 - half as half
 - float as half_precision, for environments that don't support a half-precision type
 - double as half_precision, for environments that don't support a half-precision type
 - float with NaN, Infinity and 0 as half
 - double with NaN, Infinity and 0 as half
 */
83
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070084
85
/*
 Convert a single-precision float to a half-precision float.

 f: the single-precision value to convert.

 Returns the half-precision value, carried in a uint16_t because
 standard C has no half-precision type.

 Precision and NaN payload bits will be lost. Values too large
 for half-precision round up to infinity and values too small
 round to zero.
 */
uint16_t IEEE754_FloatToHalf(float f);
Laurence Lundblade12d32c52018-09-19 11:25:27 -070092
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -070093
/*
 Convert a half-precision float to a single-precision float.

 uHalfPrecision: the half-precision value, carried in a uint16_t.

 Returns the equivalent single-precision value.
 This is a loss-less conversion.
 */
float IEEE754_HalfToFloat(uint16_t uHalfPrecision);
99
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700100
/*
 Convert a double-precision float to a half-precision float.

 d: the double-precision value to convert.

 Returns the half-precision value, carried in a uint16_t because
 standard C has no half-precision type.

 Precision and NaN payload bits will be lost. Values too large
 for half-precision round up to infinity and values too small
 round to zero.
 */
uint16_t IEEE754_DoubleToHalf(double d);
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700107
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700108
/*
 Convert a half-precision float to a double-precision float.

 uHalfPrecision: the half-precision value, carried in a uint16_t.

 Returns the equivalent double-precision value.
 This is a loss-less conversion.
 */
double IEEE754_HalfToDouble(uint16_t uHalfPrecision);
114
115
116
/*
 Tags for IEEE754_union.uSize. Each constant both tags the
 precision of the value and gives its size in bytes.
 */
#define IEEE754_UNION_IS_HALF   2
#define IEEE754_UNION_IS_SINGLE 4
#define IEEE754_UNION_IS_DOUBLE 8
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700121
/*
 A floating-point value tagged with its precision. Despite the
 name, this is a struct rather than a C union.
 */
typedef struct {
    uint8_t  uSize;  // One of IEEE754_UNION_IS_xxxx
    uint64_t uValue; // The value; presumably the raw bits of the encoding
                     // selected by uSize -- confirm against ieee754.c
} IEEE754_union;
126
127
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700128/*
129 Converts double-precision to single-precision or half-precision if possible without
130 loss of precisions. If not, leaves it as a double. Only converts to single-precision
131 unless bAllowHalfPrecision is set.
132 */
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700133IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision);
134
135/*
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700136 Converts double-precision to single-precision if possible without
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700137 loss of precision. If not, leaves it as a double.
138 */
139static inline IEEE754_union IEEE754_DoubleToSmall(double d)
140{
141 return IEEE754_DoubleToSmallestInternal(d, 0);
142}
143
144
145/*
Laurence Lundblade8db3d3e2018-09-29 11:46:37 -0700146 Converts double-precision to single-precision or half-precision if possible without
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700147 loss of precisions. If not, leaves it as a double.
148 */
149static inline IEEE754_union IEEE754_DoubleToSmallest(double d)
150{
151 return IEEE754_DoubleToSmallestInternal(d, 1);
152}
153
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700154/*
155 Converts single-precision to half-precision if possible without
156 loss of precision. If not leaves as single-precision.
157 */
158IEEE754_union IEEE754_FloatToSmallest(float f);
159
160
Laurence Lundblade12d32c52018-09-19 11:25:27 -0700161#endif /* ieee754_h */
162
163
164
165
166
167
168