Blame - src/ieee754.c - mirror/QCBOR - TrustedFirmware Git Browser

blob: ef0adef0f8c08334bb10b9d2e34d8eea82a68dfa [file] [log] [blame]

Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	1	/*==============================================================================
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	2	ieee754.c -- floating-point conversion between half, double & single-precision
Laurence Lundblade	035bd78	2019-01-21 17:01:31 -0800	[diff] [blame]	3
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	4	Copyright (c) 2018-2020, Laurence Lundblade. All rights reserved.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	5
Laurence Lundblade	a3fd49f	2019-01-21 10:16:22 -0800	[diff] [blame]	6	SPDX-License-Identifier: BSD-3-Clause
Laurence Lundblade	035bd78	2019-01-21 17:01:31 -0800	[diff] [blame]	7
Laurence Lundblade	a3fd49f	2019-01-21 10:16:22 -0800	[diff] [blame]	8	See BSD-3-Clause license in README.md
Laurence Lundblade	035bd78	2019-01-21 17:01:31 -0800	[diff] [blame]	9
Laurence Lundblade	a3fd49f	2019-01-21 10:16:22 -0800	[diff] [blame]	10	Created on 7/23/18
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	11	=============================================================================*/
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	12
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	13	#include "ieee754.h"
				14	#include <string.h> // For memcpy()
				15
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	16
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	17	/*
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	18	This code is written for clarity and verifiability, not for size, on
				19	the assumption that the optimizer will do a good job. The LLVM
				20	optimizer, -Os, does seem to do the job and the resulting object code
				21	is smaller from combining code for the many different cases (normal,
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	22	subnormal, infinity, zero...) for the conversions. GCC is no where near
				23	as good.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	24
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	25	This code has really long lines and is much easier to read because of
				26	them. Some coding guidelines prefer 80 column lines (can they not afford
				27	big displays?). It would make this code much worse even to wrap at 120
				28	columns.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	29
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	30	Dead stripping is also really helpful to get code size down when
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	31	floating-point encoding is not needed. (If this is put in a library
				32	and linking is against the library, then dead stripping is automatic).
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	33
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	34	This code works solely using shifts and masks and thus has no
				35	dependency on any math libraries. It can even work if the CPU doesn't
				36	have any floating-point support, though that isn't the most useful
				37	thing to do.
				38
				39	The memcpy() dependency is only for CopyFloatToUint32() and friends
				40	which only is needed to avoid type punning when converting the actual
				41	float bits to an unsigned value so the bit shifts and masks can work.
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	42	*/
				43
				44	/*
				45	The references used to write this code:
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	46
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	47	- IEEE 754-2008, particularly section 3.6 and 6.2.1
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	48
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	49	- https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	50
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	51	- https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	52
				53	- https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
				54
				55	- https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	56	*/
				57
				58
				59	// ----- Half Precsion -----------
				60	#define HALF_NUM_SIGNIFICAND_BITS (10)
				61	#define HALF_NUM_EXPONENT_BITS (5)
				62	#define HALF_NUM_SIGN_BITS (1)
				63
				64	#define HALF_SIGNIFICAND_SHIFT (0)
				65	#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
				66	#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
				67
				68	#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
				69	#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
				70	#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
				71	#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
				72
				73	/* Biased Biased Unbiased Use
				74	0x00 0 -15 0 and subnormal
				75	0x01 1 -14 Smallest normal exponent
				76	0x1e 30 15 Largest normal exponent
				77	0x1F 31 16 NaN and Infinity */
				78	#define HALF_EXPONENT_BIAS (15)
				79	#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
				80	#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
				81	#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
				82	#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
				83
				84
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	85	// ------ Single-Precision --------
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	86	#define SINGLE_NUM_SIGNIFICAND_BITS (23)
				87	#define SINGLE_NUM_EXPONENT_BITS (8)
				88	#define SINGLE_NUM_SIGN_BITS (1)
				89
				90	#define SINGLE_SIGNIFICAND_SHIFT (0)
				91	#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
				92	#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
				93
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	94	#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
				95	#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
				96	#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
				97	#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	98
				99	/* Biased Biased Unbiased Use
				100	0x0000 0 -127 0 and subnormal
				101	0x0001 1 -126 Smallest normal exponent
				102	0x7f 127 0 1
				103	0xfe 254 127 Largest normal exponent
				104	0xff 255 128 NaN and Infinity */
				105	#define SINGLE_EXPONENT_BIAS (127)
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	106	#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	107	#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
				108	#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	109	#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	110
				111
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	112	// --------- Double-Precision ----------
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	113	#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
				114	#define DOUBLE_NUM_EXPONENT_BITS (11)
				115	#define DOUBLE_NUM_SIGN_BITS (1)
				116
				117	#define DOUBLE_SIGNIFICAND_SHIFT (0)
				118	#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
				119	#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
				120
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	121	#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
				122	#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
				123	#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
				124	#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
				125
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	126
				127	/* Biased Biased Unbiased Use
				128	0x00000000 0 -1023 0 and subnormal
				129	0x00000001 1 -1022 Smallest normal exponent
				130	0x000007fe 2046 1023 Largest normal exponent
				131	0x000007ff 2047 1024 NaN and Infinity */
				132	#define DOUBLE_EXPONENT_BIAS (1023)
				133	#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
				134	#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
				135	#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
				136	#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
				137
				138
				139
				140	/*
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	141	Convenient functions to avoid type punning, compiler warnings and
				142	such. The optimizer reduces them to a simple assignment. This is a
				143	crusty corner of C. It shouldn't be this hard.
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	144
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	145	These are also in UsefulBuf.h under a different name. They are copied
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	146	here to avoid a dependency on UsefulBuf.h. There is no object code
				147	size impact because these always optimze down to a simple assignment.
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	148	*/
				149	static inline uint32_t CopyFloatToUint32(float f)
				150	{
				151	uint32_t u32;
				152	memcpy(&u32, &f, sizeof(uint32_t));
				153	return u32;
				154	}
				155
				156	static inline uint64_t CopyDoubleToUint64(double d)
				157	{
				158	uint64_t u64;
				159	memcpy(&u64, &d, sizeof(uint64_t));
				160	return u64;
				161	}
				162
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	163	static inline float CopyUint32ToFloat(uint32_t u32)
				164	{
				165	float f;
				166	memcpy(&f, &u32, sizeof(uint32_t));
				167	return f;
				168	}
				169
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	170	static inline double CopyUint64ToDouble(uint64_t u64)
				171	{
				172	double d;
				173	memcpy(&d, &u64, sizeof(uint64_t));
				174	return d;
				175	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	176
				177
				178	// Public function; see ieee754.h
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	179	uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	180	{
				181	// Pull the three parts out of the single-precision float
				182	const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	183	const int32_t nSingleUnbiasedExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
				184	const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
				185	const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	186
				187
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	188	// Now convert the three parts to half-precision.
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	189
				190	// All works is done on uint32_t with conversion to uint16_t at the end.
				191	// This avoids integer promotions that static analyzers complain about and
				192	// reduces code size.
				193	uint32_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
				194
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	195	if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
				196	// +/- Infinity and NaNs -- single biased exponent is 0xff
				197	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				198	if(!uSingleSignificand) {
				199	// Infinity
				200	uHalfSignificand = 0;
				201	} else {
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	202	// Copy the LSBs of the NaN payload that will fit from the single to the half
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	203	uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
				204	if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
				205	// It's a qNaN; copy the qNaN bit
				206	uHalfSignificand \|= HALF_QUIET_NAN_BIT;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	207	} else {
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	208	// It's an sNaN; make sure the significand is not zero so it stays a NaN
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	209	// This is needed because not all significand bits are copied from single
				210	if(!uHalfSignificand) {
				211	// Set the LSB. This is what wikipedia shows for sNAN.
				212	uHalfSignificand \|= 0x01;
				213	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	214	}
				215	}
				216	} else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
				217	// 0 or a subnormal number -- singled biased exponent is 0
				218	uHalfBiasedExponent = 0;
				219	uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
				220	} else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
				221	// Exponent is too large to express in half-precision; round up to infinity
				222	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				223	uHalfSignificand = 0;
				224	} else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
				225	// Exponent is too small to express in half-precision normal; make it a half-precision subnormal
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	226	uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	227	// Difference between single normal exponent and the base exponent of a half subnormal
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	228	const uint32_t uExpDiff = (uint32_t)-(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	229	// Also have to shift the significand by the difference in number of bits between a single and a half significand
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	230	const uint32_t uSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	231	// Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	232	const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01UL << SINGLE_NUM_SIGNIFICAND_BITS);
				233	uHalfSignificand = uSingleSignificandSubnormal >> (uExpDiff + uSignificandBitsDiff);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	234	} else {
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	235	// The normal case, exponent is in range for half-precision
				236	uHalfBiasedExponent = (uint32_t)(nSingleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	237	uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				238	}
				239	uHalfSign = uSingleSign;
				240
				241	// Put the 3 values in the right place for a half precision
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	242	const uint32_t uHalfPrecision = uHalfSignificand \|
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	243	(uHalfBiasedExponent << HALF_EXPONENT_SHIFT) \|
				244	(uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	245	// Cast is safe because all the masks and shifts above work to make
				246	// a half precision value which is only 16 bits.
				247	return (uint16_t)uHalfPrecision;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	248	}
				249
				250
				251	// Public function; see ieee754.h
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	252	uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	253	{
				254	// Pull the three parts out of the double-precision float
				255	const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	256	const int64_t nDoubleUnbiasedExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
				257	const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
				258	const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	259
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	260	// Now convert the three parts to half-precision.
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	261
				262	// All works is done on uint64_t with conversion to uint16_t at the end.
				263	// This avoids integer promotions that static analyzers complain about.
				264	// Other options are for these to be unsigned int or fast_int16_t. Code
				265	// size doesn't vary much between all these options for 64-bit LLVM,
				266	// 64-bit GCC and 32-bit Armv7 LLVM.
				267	uint64_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
				268
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	269	if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
				270	// +/- Infinity and NaNs -- single biased exponent is 0xff
				271	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				272	if(!uDoubleSignificand) {
				273	// Infinity
				274	uHalfSignificand = 0;
				275	} else {
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	276	// Copy the LSBs of the NaN payload that will fit from the double to the half
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	277	uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
				278	if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
				279	// It's a qNaN; copy the qNaN bit
				280	uHalfSignificand \|= HALF_QUIET_NAN_BIT;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	281	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	282	// It's an sNaN; make sure the significand is not zero so it stays a NaN
				283	// This is needed because not all significand bits are copied from single
				284	if(!uHalfSignificand) {
				285	// Set the LSB. This is what wikipedia shows for sNAN.
				286	uHalfSignificand \|= 0x01;
				287	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	288	}
				289	}
				290	} else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
				291	// 0 or a subnormal number -- double biased exponent is 0
				292	uHalfBiasedExponent = 0;
				293	uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
				294	} else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
				295	// Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
				296	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				297	uHalfSignificand = 0;
				298	} else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
				299	// Exponent is too small to express in half-precision; round down to zero
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	300	uHalfBiasedExponent = HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	301	// Difference between double normal exponent and the base exponent of a half subnormal
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	302	const uint64_t uExpDiff = (uint64_t)-(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	303	// Also have to shift the significand by the difference in number of bits between a double and a half significand
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	304	const uint64_t uSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	305	// Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
Laurence Lundblade	f280195	2018-12-17 10:40:29 -0800	[diff] [blame]	306	const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01ULL << DOUBLE_NUM_SIGNIFICAND_BITS);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	307	uHalfSignificand = uDoubleSignificandSubnormal >> (uExpDiff + uSignificandBitsDiff);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	308	} else {
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	309	// The normal case, exponent is in range for half-precision
				310	uHalfBiasedExponent = (uint32_t)(nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	311	uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				312	}
				313	uHalfSign = uDoubleSign;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	314
				315
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	316	// Put the 3 values in the right place for a half precision
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	317	const uint64_t uHalfPrecision = uHalfSignificand \|
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	318	(uHalfBiasedExponent << HALF_EXPONENT_SHIFT) \|
				319	(uHalfSign << HALF_SIGN_SHIFT);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	320	// Cast is safe because all the masks and shifts above work to make
				321	// a half precision value which is only 16 bits.
				322	return (uint16_t)uHalfPrecision;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	323	}
				324
				325
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	326
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	327	// Public function; see ieee754.h
				328	float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
				329	{
				330	// Pull out the three parts of the half-precision float
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	331	// Do all the work in 32 bits because that is what the end result is
				332	// may give smaller code size and will keep static analyzers happier.
				333	const uint32_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
				334	const int32_t nHalfUnBiasedExponent = (int32_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
				335	const uint32_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	336
				337
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	338	// Make the three parts of the single-precision number
				339	uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
				340	if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
				341	// 0 or subnormal
				342	if(uHalfSignificand) {
				343	// Subnormal case
				344	uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
				345	// A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
				346	uSingleSignificand = uHalfSignificand;
				347	// Shift bits from right of the decimal to left, reducing the exponent by 1 each time
				348	do {
				349	uSingleSignificand <<= 1;
				350	uSingleBiasedExponent--;
				351	} while ((uSingleSignificand & 0x400) == 0);
				352	uSingleSignificand &= HALF_SIGNIFICAND_MASK;
				353	uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				354	} else {
				355	// Just zero
				356	uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
				357	uSingleSignificand = 0;
				358	}
				359	} else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
				360	// NaN or Inifinity
				361	uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
				362	if(uHalfSignificand) {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	363	// NaN
				364	// First preserve the NaN payload from half to single
				365	uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
				366	if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
				367	// Next, set qNaN if needed since half qNaN bit is not copied above
				368	uSingleSignificand \|= SINGLE_QUIET_NAN_BIT;
				369	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	370	} else {
				371	// Infinity
				372	uSingleSignificand = 0;
				373	}
				374	} else {
				375	// Normal number
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	376	uSingleBiasedExponent = (uint32_t)(nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	377	uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				378	}
				379	uSingleSign = uHalfSign;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	380
Laurence Lundblade	ee85174	2020-01-08 08:37:05 -0800	[diff] [blame]	381	// Shift the three parts of the single-precision into place
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	382	const uint32_t uSinglePrecision = uSingleSignificand \|
				383	(uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) \|
				384	(uSingleSign << SINGLE_SIGN_SHIFT);
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	385
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	386	return CopyUint32ToFloat(uSinglePrecision);
				387	}
				388
				389
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	390	// Public function; see ieee754.h
				391	double IEEE754_HalfToDouble(uint16_t uHalfPrecision)
				392	{
				393	// Pull out the three parts of the half-precision float
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	394	// Do all the work in 64 bits because that is what the end result is
				395	// may give smaller code size and will keep static analyzers happier.
				396	const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
				397	const int64_t nHalfUnBiasedExponent = (int64_t)((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
				398	const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	399
				400
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	401	// Make the three parts of hte single-precision number
				402	uint64_t uDoubleSignificand, uDoubleSign, uDoubleBiasedExponent;
				403	if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
				404	// 0 or subnormal
				405	uDoubleBiasedExponent = DOUBLE_EXPONENT_ZERO + DOUBLE_EXPONENT_BIAS;
				406	if(uHalfSignificand) {
				407	// Subnormal case
				408	uDoubleBiasedExponent = -HALF_EXPONENT_BIAS + DOUBLE_EXPONENT_BIAS +1;
				409	// A half-precision subnormal can always be converted to a normal double-precision float because the ranges line up
				410	uDoubleSignificand = uHalfSignificand;
				411	// Shift bits from right of the decimal to left, reducing the exponent by 1 each time
				412	do {
				413	uDoubleSignificand <<= 1;
				414	uDoubleBiasedExponent--;
				415	} while ((uDoubleSignificand & 0x400) == 0);
				416	uDoubleSignificand &= HALF_SIGNIFICAND_MASK;
				417	uDoubleSignificand <<= (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				418	} else {
				419	// Just zero
				420	uDoubleSignificand = 0;
				421	}
				422	} else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
				423	// NaN or Inifinity
				424	uDoubleBiasedExponent = DOUBLE_EXPONENT_INF_OR_NAN + DOUBLE_EXPONENT_BIAS;
				425	if(uHalfSignificand) {
				426	// NaN
				427	// First preserve the NaN payload from half to single
				428	uDoubleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
				429	if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
				430	// Next, set qNaN if needed since half qNaN bit is not copied above
				431	uDoubleSignificand \|= DOUBLE_QUIET_NAN_BIT;
				432	}
				433	} else {
				434	// Infinity
				435	uDoubleSignificand = 0;
				436	}
				437	} else {
				438	// Normal number
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	439	uDoubleBiasedExponent = (uint64_t)(nHalfUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
				440	uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	441	}
				442	uDoubleSign = uHalfSign;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	443
				444
Laurence Lundblade	67bd551	2018-11-02 21:44:06 +0700	[diff] [blame]	445	// Shift the 3 parts into place as a double-precision
				446	const uint64_t uDouble = uDoubleSignificand \|
				447	(uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) \|
				448	(uDoubleSign << DOUBLE_SIGN_SHIFT);
				449	return CopyUint64ToDouble(uDouble);
				450	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	451
				452
				453	// Public function; see ieee754.h
				454	IEEE754_union IEEE754_FloatToSmallest(float f)
				455	{
				456	IEEE754_union result;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	457
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	458	// Pull the neeed two parts out of the single-precision float
				459	const uint32_t uSingle = CopyFloatToUint32(f);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	460	const int32_t nSingleExponent = (int32_t)((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	461	const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	462
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	463	// Bit mask that is the significand bits that would be lost when converting
				464	// from single-precision to half-precision
				465	const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
				466
				467	// Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
				468	if(uSingle == 0) {
				469	// Value is 0.0000, not a a subnormal
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	470	result.uSize = IEEE754_UNION_IS_HALF;
				471	result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	472	} else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
				473	// NaN, +/- infinity
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	474	result.uSize = IEEE754_UNION_IS_HALF;
				475	result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	476	} else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
				477	// Normal number in exponent range and precision won't be lost
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	478	result.uSize = IEEE754_UNION_IS_HALF;
				479	result.uValue = IEEE754_FloatToHalf(f);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	480	} else {
				481	// Subnormal, exponent out of range, or precision will be lost
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	482	result.uSize = IEEE754_UNION_IS_SINGLE;
				483	result.uValue = uSingle;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	484	}
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	485
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	486	return result;
				487	}
				488
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	489	// Public function; see ieee754.h
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	490	IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
				491	{
				492	IEEE754_union result;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	493
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	494	// Pull the needed two parts out of the double-precision float
				495	const uint64_t uDouble = CopyDoubleToUint64(d);
Laurence Lundblade	c5fef68	2020-01-25 11:38:45 -0800	[diff] [blame^]	496	const int64_t nDoubleExponent = (int64_t)((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	497	const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	498
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	499	// Masks to check whether dropped significand bits are zero or not
				500	const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
				501	const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	502
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	503	// The various cases
Laurence Lundblade	d711fb2	2018-09-26 14:35:22 -0700	[diff] [blame]	504	if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	505	// Value is 0.0000, not a a subnormal
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	506	result.uSize = IEEE754_UNION_IS_HALF;
				507	result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	508	} else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
				509	// NaN, +/- infinity
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	510	result.uSize = IEEE754_UNION_IS_HALF;
				511	result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	512	} else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
				513	// Can convert to half without precision loss
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	514	result.uSize = IEEE754_UNION_IS_HALF;
				515	result.uValue = IEEE754_DoubleToHalf(d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	516	} else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
				517	// Can convert to single without precision loss
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	518	result.uSize = IEEE754_UNION_IS_SINGLE;
				519	result.uValue = CopyFloatToUint32((float)d);
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	520	} else {
				521	// Can't convert without precision loss
Laurence Lundblade	577d821	2018-11-01 14:04:08 +0700	[diff] [blame]	522	result.uSize = IEEE754_UNION_IS_DOUBLE;
				523	result.uValue = uDouble;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	524	}
Laurence Lundblade	3aee3a3	2018-12-17 16:17:45 -0800	[diff] [blame]	525
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	526	return result;
				527	}
				528