Blame - src/ieee754.c - mirror/QCBOR - TrustedFirmware Git Browser

blob: ea44e1dbda3a7367fc42122e3adcc35a521c089b [file] [log] [blame]

Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	1	/*==============================================================================
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	2	ieee754.c -- floating-point conversion between half, double & single-precision
Laurence Lundblade	035bd78	2019-01-21 17:01:31 -0800	[diff] [blame]	3
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	4	Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	5
Laurence Lundblade	a3fd49f	2019-01-21 10:16:22 -0800	[diff] [blame]	6	SPDX-License-Identifier: BSD-3-Clause
Laurence Lundblade	035bd78	2019-01-21 17:01:31 -0800	[diff] [blame]	7
Laurence Lundblade	a3fd49f	2019-01-21 10:16:22 -0800	[diff] [blame]	8	See BSD-3-Clause license in README.md
Laurence Lundblade	035bd78	2019-01-21 17:01:31 -0800	[diff] [blame]	9
Laurence Lundblade	a3fd49f	2019-01-21 10:16:22 -0800	[diff] [blame]	10	Created on 7/23/18
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	11	=============================================================================*/
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	12
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	13	#include "ieee754.h"
				14	#include <string.h> // For memcpy()
				15
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	16
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	17	/*
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	18	This code is written for clarity and verifiability, not for size, on
				19	the assumption that the optimizer will do a good job. The LLVM
				20	optimizer, -Os, does seem to do the job and the resulting object code
				21	is smaller from combining code for the many different cases (normal,
				22	subnormal, infinity, zero...) for the conversions.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	23
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	24	This code has really long lines and is much easier to read because of
				25	them. Some coding guidelines prefer 80 column lines (can they not afford
				26	big displays?). It would make this code much worse even to wrap at 120
				27	columns.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	28
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	29	Dead stripping is also really helpful to get code size down when
				30	floating-point encoding is not needed.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	31
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	32	This code works solely using shifts and masks and thus has no
				33	dependency on any math libraries. It can even work if the CPU doesn't
				34	have any floating-point support, though that isn't the most useful
				35	thing to do.
				36
				37	The memcpy() dependency is only for CopyFloatToUint32() and friends
				38	which only is needed to avoid type punning when converting the actual
				39	float bits to an unsigned value so the bit shifts and masks can work.
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	40	*/
				41
				42	/*
				43	The references used to write this code:
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	44
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	45	- IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	46
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	47	- https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	48
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	49	- https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	50	*/
				51
				52
				53	// ----- Half Precsion -----------
				54	#define HALF_NUM_SIGNIFICAND_BITS (10)
				55	#define HALF_NUM_EXPONENT_BITS (5)
				56	#define HALF_NUM_SIGN_BITS (1)
				57
				58	#define HALF_SIGNIFICAND_SHIFT (0)
				59	#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
				60	#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
				61
				62	#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
				63	#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
				64	#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
				65	#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
				66
				67	/* Biased Biased Unbiased Use
				68	0x00 0 -15 0 and subnormal
				69	0x01 1 -14 Smallest normal exponent
				70	0x1e 30 15 Largest normal exponent
				71	0x1F 31 16 NaN and Infinity */
				72	#define HALF_EXPONENT_BIAS (15)
				73	#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
				74	#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
				75	#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
				76	#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
				77
				78
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	79	// ------ Single-Precision --------
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	80	#define SINGLE_NUM_SIGNIFICAND_BITS (23)
				81	#define SINGLE_NUM_EXPONENT_BITS (8)
				82	#define SINGLE_NUM_SIGN_BITS (1)
				83
				84	#define SINGLE_SIGNIFICAND_SHIFT (0)
				85	#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
				86	#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
				87
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	88	#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
				89	#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
				90	#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
				91	#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	92
				93	/* Biased Biased Unbiased Use
				94	0x0000 0 -127 0 and subnormal
				95	0x0001 1 -126 Smallest normal exponent
				96	0x7f 127 0 1
				97	0xfe 254 127 Largest normal exponent
				98	0xff 255 128 NaN and Infinity */
				99	#define SINGLE_EXPONENT_BIAS (127)
				100	#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
				101	#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
				102	#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
				103	#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
				104
				105
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	106	// --------- Double-Precision ----------
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	107	#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
				108	#define DOUBLE_NUM_EXPONENT_BITS (11)
				109	#define DOUBLE_NUM_SIGN_BITS (1)
				110
				111	#define DOUBLE_SIGNIFICAND_SHIFT (0)
				112	#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
				113	#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
				114
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	115	#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
				116	#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
				117	#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
				118	#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
				119
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	120
				121	/* Biased Biased Unbiased Use
				122	0x00000000 0 -1023 0 and subnormal
				123	0x00000001 1 -1022 Smallest normal exponent
				124	0x000007fe 2046 1023 Largest normal exponent
				125	0x000007ff 2047 1024 NaN and Infinity */
				126	#define DOUBLE_EXPONENT_BIAS (1023)
				127	#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
				128	#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
				129	#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
				130	#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
				131
				132
				133
				134	/*
				135	Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	136	The optimizer reduces them to a simple assignment.
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	137	This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	138
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	139	These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundblade	3df8c7e	2018-11-02 13:12:41 +0700	[diff] [blame]	140	here to avoid a dependency on UsefulBuf.h. There is no
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	141	object code size impact because these always optimze down to a
				142	simple assignment.
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	143	*/
				144	static inline uint32_t CopyFloatToUint32(float f)
				145	{
				146	uint32_t u32;
				147	memcpy(&u32, &f, sizeof(uint32_t));
				148	return u32;
				149	}
				150
				151	static inline uint64_t CopyDoubleToUint64(double d)
				152	{
				153	uint64_t u64;
				154	memcpy(&u64, &d, sizeof(uint64_t));
				155	return u64;
				156	}
				157
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	158	static inline float CopyUint32ToFloat(uint32_t u32)
				159	{
				160	float f;
				161	memcpy(&f, &u32, sizeof(uint32_t));
				162	return f;
				163	}
				164
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	165	static inline double CopyUint64ToDouble(uint64_t u64)
				166	{
				167	double d;
				168	memcpy(&d, &u64, sizeof(uint64_t));
				169	return d;
				170	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	171
				172
				173	// Public function; see ieee754.h
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	174	uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	175	{
				176	// Pull the three parts out of the single-precision float
				177	const uint32_t uSingle = CopyFloatToUint32(f);
				178	const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
				179	const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
				180	const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	181
				182
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	183	// Now convert the three parts to half-precision.
				184	uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
				185	if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
				186	// +/- Infinity and NaNs -- single biased exponent is 0xff
				187	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				188	if(!uSingleSignificand) {
				189	// Infinity
				190	uHalfSignificand = 0;
				191	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	192	// Copy the LBSs of the NaN payload that will fit from the single to the half
				193	uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
				194	if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
				195	// It's a qNaN; copy the qNaN bit
				196	uHalfSignificand \|= HALF_QUIET_NAN_BIT;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	197	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	198	// It's a sNaN; make sure the significand is not zero so it stays a NaN
				199	// This is needed because not all significand bits are copied from single
				200	if(!uHalfSignificand) {
				201	// Set the LSB. This is what wikipedia shows for sNAN.
				202	uHalfSignificand \|= 0x01;
				203	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	204	}
				205	}
				206	} else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
				207	// 0 or a subnormal number -- singled biased exponent is 0
				208	uHalfBiasedExponent = 0;
				209	uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
				210	} else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
				211	// Exponent is too large to express in half-precision; round up to infinity
				212	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				213	uHalfSignificand = 0;
				214	} else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
				215	// Exponent is too small to express in half-precision normal; make it a half-precision subnormal
				216	uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
				217	// Difference between single normal exponent and the base exponent of a half subnormal
				218	const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
				219	// Also have to shift the significand by the difference in number of bits between a single and a half significand
				220	const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
				221	// Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
				222	const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
				223	uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
				224	} else {
				225	// The normal case
				226	uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
				227	uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				228	}
				229	uHalfSign = uSingleSign;
				230
				231	// Put the 3 values in the right place for a half precision
				232	const uint16_t uHalfPrecision = uHalfSignificand \|
				233	(uHalfBiasedExponent << HALF_EXPONENT_SHIFT) \|
				234	(uHalfSign << HALF_SIGN_SHIFT);
				235	return uHalfPrecision;
				236	}
				237
				238
				239	// Public function; see ieee754.h
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	240	uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	241	{
				242	// Pull the three parts out of the double-precision float
				243	const uint64_t uDouble = CopyDoubleToUint64(d);
				244	const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
				245	const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
				246	const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
				247
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	248
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	249	// Now convert the three parts to half-precision.
				250	uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
				251	if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
				252	// +/- Infinity and NaNs -- single biased exponent is 0xff
				253	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				254	if(!uDoubleSignificand) {
				255	// Infinity
				256	uHalfSignificand = 0;
				257	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	258	// Copy the LBSs of the NaN payload that will fit from the double to the half
				259	uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
				260	if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
				261	// It's a qNaN; copy the qNaN bit
				262	uHalfSignificand \|= HALF_QUIET_NAN_BIT;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	263	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	264	// It's an sNaN; make sure the significand is not zero so it stays a NaN
				265	// This is needed because not all significand bits are copied from single
				266	if(!uHalfSignificand) {
				267	// Set the LSB. This is what wikipedia shows for sNAN.
				268	uHalfSignificand \|= 0x01;
				269	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	270	}
				271	}
				272	} else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
				273	// 0 or a subnormal number -- double biased exponent is 0
				274	uHalfBiasedExponent = 0;
				275	uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
				276	} else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
				277	// Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
				278	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				279	uHalfSignificand = 0;
				280	} else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
				281	// Exponent is too small to express in half-precision; round down to zero
				282	uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
				283	// Difference between double normal exponent and the base exponent of a half subnormal
				284	const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
				285	// Also have to shift the significand by the difference in number of bits between a double and a half significand
				286	const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
				287	// Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundblade	f280195	2018-12-17 10:40:29 -0800	[diff] [blame]	288	const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	289	uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
				290	} else {
				291	// The normal case
				292	uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
				293	uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				294	}
				295	uHalfSign = uDoubleSign;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	296
				297
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	298	// Put the 3 values in the right place for a half precision
				299	const uint16_t uHalfPrecision = uHalfSignificand \|
				300	(uHalfBiasedExponent << HALF_EXPONENT_SHIFT) \|
				301	(uHalfSign << HALF_SIGN_SHIFT);
				302	return uHalfPrecision;
				303	}
				304
				305
				306	// Public function; see ieee754.h
				307	float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
				308	{
				309	// Pull out the three parts of the half-precision float
				310	const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
				311	const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
				312	const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	313
				314
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	315	// Make the three parts of the single-precision number
				316	uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
				317	if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
				318	// 0 or subnormal
				319	if(uHalfSignificand) {
				320	// Subnormal case
				321	uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
				322	// A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
				323	uSingleSignificand = uHalfSignificand;
				324	// Shift bits from right of the decimal to left, reducing the exponent by 1 each time
				325	do {
				326	uSingleSignificand <<= 1;
				327	uSingleBiasedExponent--;
				328	} while ((uSingleSignificand & 0x400) == 0);
				329	uSingleSignificand &= HALF_SIGNIFICAND_MASK;
				330	uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				331	} else {
				332	// Just zero
				333	uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
				334	uSingleSignificand = 0;
				335	}
				336	} else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
				337	// NaN or Inifinity
				338	uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
				339	if(uHalfSignificand) {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	340	// NaN
				341	// First preserve the NaN payload from half to single
				342	uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
				343	if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
				344	// Next, set qNaN if needed since half qNaN bit is not copied above
				345	uSingleSignificand \|= SINGLE_QUIET_NAN_BIT;
				346	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	347	} else {
				348	// Infinity
				349	uSingleSignificand = 0;
				350	}
				351	} else {
				352	// Normal number
				353	uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
				354	uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				355	}
				356	uSingleSign = uHalfSign;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	357
				358
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	359	// Shift the three parts of the single-precision into place
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	360	const uint32_t uSinglePrecision = uSingleSignificand \|
				361	(uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) \|
				362	(uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	363
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	364	return CopyUint32ToFloat(uSinglePrecision);
				365	}
				366
				367
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	368	// Public function; see ieee754.h
				369	double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
				370	{
				371	// Pull out the three parts of the half-precision float
				372	const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
				373	const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
				374	const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	375
				376
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	377	// Make the three parts of hte single-precision number
				378	uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
				379	if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
				380	// 0 or subnormal
				381	uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
				382	if(uHalfSignificand) {
				383	// Subnormal case
				384	uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
				385	// A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
				386	uDoubleSignificand = uHalfSignificand;
				387	// Shift bits from right of the decimal to left, reducing the exponent by 1 each time
				388	do {
				389	uDoubleSignificand <<= 1;
				390	uDoubleBiasedExponent--;
				391	} while ((uDoubleSignificand & 0x400) == 0);
				392	uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
				393	uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				394	} else {
				395	// Just zero
				396	uDoubleSignificand = 0;
				397	}
				398	} else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
				399	// NaN or Inifinity
				400	uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
				401	if(uHalfSignificand) {
				402	// NaN
				403	// First preserve the NaN payload from half to single
				404	uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
				405	if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
				406	// Next, set qNaN if needed since half qNaN bit is not copied above
				407	uDoubleSignificand \|= DOUBLE_QUIET_NAN_BIT;
				408	}
				409	} else {
				410	// Infinity
				411	uDoubleSignificand = 0;
				412	}
				413	} else {
				414	// Normal number
				415	uDoubleBiasedExponent = nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS;
				416	uDoubleSignificand = (uint64_t)uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				417	}
				418	uDoubleSign = uHalfSign;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	419
				420
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	421	// Shift the 3 parts into place as a double-precision
				422	const uint64_t uDouble = uDoubleSignificand \|
				423	(uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) \|
				424	(uDoubleSign << DOUBLE_SIGN_SHIFT);
				425	return CopyUint64ToDouble(uDouble);
				426	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	427
				428
				429	// Public function; see ieee754.h
				430	IEEE754_union IEEE754_FloatToSmallest(float f)
				431	{
				432	IEEE754_union result;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	433
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	434	// Pull the neeed two parts out of the single-precision float
				435	const uint32_t uSingle = CopyFloatToUint32(f);
				436	const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
				437	const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	438
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	439	// Bit mask that is the significand bits that would be lost when converting
				440	// from single-precision to half-precision
				441	const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
				442
				443	// Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
				444	if(uSingle == 0) {
				445	// Value is 0.0000, not a a subnormal
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	446	result.uSize = IEEE754_UNION_IS_HALF;
				447	result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	448	} else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
				449	// NaN, +/- infinity
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	450	result.uSize = IEEE754_UNION_IS_HALF;
				451	result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	452	} else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
				453	// Normal number in exponent range and precision won't be lost
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	454	result.uSize = IEEE754_UNION_IS_HALF;
				455	result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	456	} else {
				457	// Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	458	result.uSize = IEEE754_UNION_IS_SINGLE;
				459	result.uValue = uSingle;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	460	}
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	461
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	462	return result;
				463	}
				464
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	465	// Public function; see ieee754.h
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	466	IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
				467	{
				468	IEEE754_union result;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	469
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	470	// Pull the needed two parts out of the double-precision float
				471	const uint64_t uDouble = CopyDoubleToUint64(d);
				472	const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
				473	const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	474
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	475	// Masks to check whether dropped significand bits are zero or not
				476	const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
				477	const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	478
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	479	// The various cases
Laurence Lundblade	d711fb2	2018-09-26 14:35:22 -0700	[diff] [blame]	480	if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	481	// Value is 0.0000, not a a subnormal
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	482	result.uSize = IEEE754_UNION_IS_HALF;
				483	result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	484	} else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
				485	// NaN, +/- infinity
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	486	result.uSize = IEEE754_UNION_IS_HALF;
				487	result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	488	} else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
				489	// Can convert to half without precision loss
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	490	result.uSize = IEEE754_UNION_IS_HALF;
				491	result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	492	} else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
				493	// Can convert to single without precision loss
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	494	result.uSize = IEEE754_UNION_IS_SINGLE;
				495	result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	496	} else {
				497	// Can't convert without precision loss
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	498	result.uSize = IEEE754_UNION_IS_DOUBLE;
				499	result.uValue = uDouble;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	500	}
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	501
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	502	return result;
				503	}
				504