blob: 705ef62e731162f8e915761f706075f7eebf2b99 [file] [log] [blame]
Laurence Lundblade6ed34222018-12-18 09:46:23 -08001/*==============================================================================
Laurence Lundbladeac515e52020-01-30 10:44:06 -08002 ieee754.c -- floating-point conversion between half, double & single-precision
Laurence Lundblade6ed34222018-12-18 09:46:23 -08003
Laurence Lundbladeac515e52020-01-30 10:44:06 -08004 Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
Laurence Lundblade6ed34222018-12-18 09:46:23 -08005
6 SPDX-License-Identifier: BSD-3-Clause
7
8 See BSD-3-Clause license in README.md
9
10 Created on 7/23/18
Laurence Lundbladeac515e52020-01-30 10:44:06 -080011 =============================================================================*/
Laurence Lundblade6ed34222018-12-18 09:46:23 -080012
13#ifndef ieee754_h
14#define ieee754_h
15
16#include <stdint.h>
17
18
19
20/*
21 General comments
22
Laurence Lundbladeac515e52020-01-30 10:44:06 -080023 This is a complete implementation in that it handles all conversion cases including
24 +/- infinity, +/- zero, subnormal numbers, qNaN, sNaN and NaN
25 payloads.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080026
Laurence Lundbladeac515e52020-01-30 10:44:06 -080027 This conforms to IEEE 754-2008, but note that this doesn't specify
28 conversions, just the encodings.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080029
Laurence Lundbladeac515e52020-01-30 10:44:06 -080030 NaN payloads are preserved with alignment on the LSB. The qNaN bit is
31 handled differently and explicitly copied. It is always the MSB of the
32 significand. The NaN payload MSBs (except the qNaN bit) are truncated
33 when going from double or single to half.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080034
35 TODO: what does the C cast do with NaN payloads from
Laurence Lundbladeac515e52020-01-30 10:44:06 -080036 double to single? It probably depends entirely on the
37 CPU.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080038
39 */
40
41/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -080042 Most simply, just explicitly encode the type you want, single or
43 double. This works easily everywhere since standard C supports both
44 these types and so does qcbor. This encoder also supports half
45 precision and there's a few ways to use it to encode floating-point
46 numbers in less space.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080047
Laurence Lundbladeac515e52020-01-30 10:44:06 -080048 Without losing precision, you can encode a single or double such that
49 the special values of 0, NaN and Infinity encode as half-precision.
50 This CBOR decoder and most others should handle this properly.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080051
52 If you don't mind losing precision, then you can use half-precision.
53 One way to do this is to set up your environment to use
54 __fp16. Some compilers and CPUs support it even though it is not
Laurence Lundbladeac515e52020-01-30 10:44:06 -080055 standard C. What is nice about this is that your program will use
56 less memory and floating-point operations like multiplying, adding
57 and such will be faster.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080058
Laurence Lundbladeac515e52020-01-30 10:44:06 -080059 Another way to make use of half-precision is to represent the values
60 in your program as single or double, but encode them in CBOR as
61 half-precision. This cuts the size of the encoded messages by a factor of 2 or 4,
62 but doesn't reduce memory needs or speed because you are still using
Laurence Lundblade6ed34222018-12-18 09:46:23 -080063 single or double in your code.
64
Laurence Lundblade6ed34222018-12-18 09:46:23 -080065 */
66
67
68
69/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -080070 Convert single-precision float to half-precision float. Precision
71 and NaN payload bits may be lost. Too-large values will round up to
72 infinity and too small to zero.

 f: the single-precision value to convert.

 Returns the half-precision result carried in a uint16_t (presumably
 the raw half-precision bit pattern -- confirm against ieee754.c).
Laurence Lundblade6ed34222018-12-18 09:46:23 -080073 */
74uint16_t IEEE754_FloatToHalf(float f);
75
76
77/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -080078 Convert half-precision float to single-precision float. This is a
79 lossless conversion.

 uHalfPrecision: the half-precision value to convert, carried in a
 uint16_t (presumably its raw bit pattern -- confirm against
 ieee754.c).

 Returns the equivalent value as a float.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080080 */
81float IEEE754_HalfToFloat(uint16_t uHalfPrecision);
82
83
84/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -080085 Convert double-precision float to half-precision float. Precision
86 and NaN payload bits may be lost. Too-large values will round up to
87 infinity and too small to zero.

 d: the double-precision value to convert.

 Returns the half-precision result carried in a uint16_t (presumably
 the raw half-precision bit pattern -- confirm against ieee754.c).
Laurence Lundblade6ed34222018-12-18 09:46:23 -080088 */
89uint16_t IEEE754_DoubleToHalf(double d);
90
91
92/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -080093 Convert half-precision float to double-precision float.
Laurence Lundblade6ed34222018-12-18 09:46:23 -080094 This is a lossless conversion.

 uHalfPrecision: the half-precision value to convert, carried in a
 uint16_t (presumably its raw bit pattern -- confirm against
 ieee754.c).

 Returns the equivalent value as a double.
95 */
96double IEEE754_HalfToDouble(uint16_t uHalfPrecision);
97
98
99
100// Each constant both tags the format held in an IEEE754_union and gives
// its size: 2, 4 and 8 are the byte widths of half, single and double.
101#define IEEE754_UNION_IS_HALF 2
102#define IEEE754_UNION_IS_SINGLE 4
103#define IEEE754_UNION_IS_DOUBLE 8
104
105typedef struct {
106 uint8_t uSize; // One of IEEE754_UNION_IS_xxxx
107 uint64_t uValue; // The value; presumably the raw bits of the format named by uSize -- confirm in ieee754.c
108} IEEE754_union;
109
110
111/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -0800112 Converts double-precision to single-precision or half-precision if
113 possible without loss of precision. If not, leaves it as a
114 double. Only converts to single-precision unless bAllowHalfPrecision
115 is set.

 d: the double-precision value to convert.
 bAllowHalfPrecision: flag permitting a half-precision result. The
 inline wrappers IEEE754_DoubleToSmall() and
 IEEE754_DoubleToSmallest() in this header pass 0 and 1 respectively.

 Returns an IEEE754_union whose uSize is one of the
 IEEE754_UNION_IS_xxxx constants.
Laurence Lundblade6ed34222018-12-18 09:46:23 -0800116 */
117IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision);
118
119/*
120 Converts double-precision to single-precision if possible without
121 loss of precision. If not, leaves it as a double.

 This is a thin wrapper that calls IEEE754_DoubleToSmallestInternal()
 with bAllowHalfPrecision set to 0, so only single-precision is
 considered as the smaller format.

 d: the double-precision value to convert.

 Returns the IEEE754_union produced by
 IEEE754_DoubleToSmallestInternal().
122 */
123static inline IEEE754_union IEEE754_DoubleToSmall(double d)
124{
125 return IEEE754_DoubleToSmallestInternal(d, 0);
126}
127
128
129/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -0800130 Converts double-precision to single-precision or half-precision if
131 possible without loss of precision. If not, leaves it as a double.

 This is a thin wrapper that calls IEEE754_DoubleToSmallestInternal()
 with bAllowHalfPrecision set to 1, so half-precision is also
 considered.

 d: the double-precision value to convert.

 Returns the IEEE754_union produced by
 IEEE754_DoubleToSmallestInternal().
Laurence Lundblade6ed34222018-12-18 09:46:23 -0800132 */
133static inline IEEE754_union IEEE754_DoubleToSmallest(double d)
134{
135 return IEEE754_DoubleToSmallestInternal(d, 1);
136}
137
Laurence Lundbladeac515e52020-01-30 10:44:06 -0800138
Laurence Lundblade6ed34222018-12-18 09:46:23 -0800139/*
Laurence Lundbladeac515e52020-01-30 10:44:06 -0800140 Converts single-precision to half-precision if possible without loss
141 of precision. If not, leaves it as single-precision.

 f: the single-precision value to convert.

 Returns an IEEE754_union whose uSize is one of the
 IEEE754_UNION_IS_xxxx constants.
Laurence Lundblade6ed34222018-12-18 09:46:23 -0800142 */
143IEEE754_union IEEE754_FloatToSmallest(float f);
144
145
146#endif /* ieee754_h */
147
148
149
150
151
152
153