Blame - src/ieee754.c - mirror/QCBOR - TrustedFirmware Git Browser

blob: b4497ccaf163be70b6b87cbdf03d0d6b0d713cbd [file] [log] [blame]

Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	1	/*==============================================================================
				2	Copyright 2018 Laurence Lundblade
				3
				4	Permission is hereby granted, free of charge, to any person obtaining
				5	a copy of this software and associated documentation files (the
				6	"Software"), to deal in the Software without restriction, including
				7	without limitation the rights to use, copy, modify, merge, publish,
				8	distribute, sublicense, and/or sell copies of the Software, and to
				9	permit persons to whom the Software is furnished to do so, subject to
				10	the following conditions:
				11
				12	The above copyright notice and this permission notice shall be included
				13	in all copies or substantial portions of the Software.
				14
				15	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				16	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
				17	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				18	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
				19	BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
				20	ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				21	CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	SOFTWARE.
				23
				24	(This is the MIT license)
				25	==============================================================================*/
				26
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	27	//
				28	// ieee754.c
				29	// Indefinite
				30	//
				31	// Created by Laurence Lundblade on 7/23/18.
				32	// Copyright © 2018 Laurence Lundblade. All rights reserved.
				33	//
				34
				35	#include "ieee754.h"
				36	#include <string.h> // For memcpy()
				37
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	38
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	39	/*
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	40	This code is written for clarity and verifiability, not for size, on the assumption
				41	that the optimizer will do a good job. The LLVM optimizer, -Os, does seem to do the
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	42	job and the resulting object code is smaller from combining code for the many different
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	43	cases (normal, subnormal, infinity, zero...) for the conversions.
				44
Laurence Lundblade	570fab5	2018-10-13 18:28:27 +0800	[diff] [blame]	45	Dead stripping is also really helpful to get code size down when floating point
				46	encoding is not needed.
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	47
Laurence Lundblade	570fab5	2018-10-13 18:28:27 +0800	[diff] [blame]	48	This code works solely using shifts and masks and thus has no dependency on
				49	any math libraries. It can even work if the CPU doesn't have any floating
				50	point support, though that isn't the most useful thing to do.
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	51
				52	The memcpy() dependency is only for CopyFloatToUint32() and friends which only
				53	is needed to avoid type punning when converting the actual float bits to
				54	an unsigned value so the bit shifts and masks can work.
				55	*/
				56
				57	/*
				58	The references used to write this code:
				59
				60	- IEEE 754-2008, particularly section 3.6 and 6.2.1
				61
				62	- https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
				63
				64	- https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	65	*/
				66
				67
				68	// ----- Half Precsion -----------
				69	#define HALF_NUM_SIGNIFICAND_BITS (10)
				70	#define HALF_NUM_EXPONENT_BITS (5)
				71	#define HALF_NUM_SIGN_BITS (1)
				72
				73	#define HALF_SIGNIFICAND_SHIFT (0)
				74	#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
				75	#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
				76
				77	#define HALF_SIGNIFICAND_MASK (0x3ff) // The lower 10 bits // 0x03ff
				78	#define HALF_EXPONENT_MASK (0x1f << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
				79	#define HALF_SIGN_MASK (0x01 << HALF_SIGN_SHIFT) // // 0x80001 bit of sign
				80	#define HALF_QUIET_NAN_BIT (0x01 << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
				81
				82	/* Biased Biased Unbiased Use
				83	0x00 0 -15 0 and subnormal
				84	0x01 1 -14 Smallest normal exponent
				85	0x1e 30 15 Largest normal exponent
				86	0x1F 31 16 NaN and Infinity */
				87	#define HALF_EXPONENT_BIAS (15)
				88	#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
				89	#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
				90	#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
				91	#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
				92
				93
				94	// ------ Single Precision --------
				95	#define SINGLE_NUM_SIGNIFICAND_BITS (23)
				96	#define SINGLE_NUM_EXPONENT_BITS (8)
				97	#define SINGLE_NUM_SIGN_BITS (1)
				98
				99	#define SINGLE_SIGNIFICAND_SHIFT (0)
				100	#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
				101	#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
				102
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	103	#define SINGLE_SIGNIFICAND_MASK (0x7fffffUL) // The lower 23 bits
				104	#define SINGLE_EXPONENT_MASK (0xffUL << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
				105	#define SINGLE_SIGN_MASK (0x01UL << SINGLE_SIGN_SHIFT) // 1 bit of sign
				106	#define SINGLE_QUIET_NAN_BIT (0x01UL << (SINGLE_NUM_SIGNIFICAND_BITS-1))
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	107
				108	/* Biased Biased Unbiased Use
				109	0x0000 0 -127 0 and subnormal
				110	0x0001 1 -126 Smallest normal exponent
				111	0x7f 127 0 1
				112	0xfe 254 127 Largest normal exponent
				113	0xff 255 128 NaN and Infinity */
				114	#define SINGLE_EXPONENT_BIAS (127)
				115	#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) // 127 unbiased
				116	#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) // -126 unbiased
				117	#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) // -127 unbiased
				118	#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) // 128 unbiased
				119
				120
				121	// --------- Double Precision ----------
				122	#define DOUBLE_NUM_SIGNIFICAND_BITS (52)
				123	#define DOUBLE_NUM_EXPONENT_BITS (11)
				124	#define DOUBLE_NUM_SIGN_BITS (1)
				125
				126	#define DOUBLE_SIGNIFICAND_SHIFT (0)
				127	#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
				128	#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
				129
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	130	#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
				131	#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
				132	#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
				133	#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
				134
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	135
				136	/* Biased Biased Unbiased Use
				137	0x00000000 0 -1023 0 and subnormal
				138	0x00000001 1 -1022 Smallest normal exponent
				139	0x000007fe 2046 1023 Largest normal exponent
				140	0x000007ff 2047 1024 NaN and Infinity */
				141	#define DOUBLE_EXPONENT_BIAS (1023)
				142	#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) // unbiased
				143	#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) // unbiased
				144	#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) // unbiased
				145	#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) // unbiased
				146
				147
				148
				149	/*
				150	Convenient functions to avoid type punning, compiler warnings and such
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	151	The optimizer reduces them to a simple assignment.
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	152	This is a crusty corner of C. It shouldn't be this hard.
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	153
				154	These are also in UsefulBuf.h under a different name. They are copied
				155	here because to avoid a dependency on UsefulBuf.h. There is no
				156	object code size impact because these always optimze down to a
				157	simple assignment.
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	158	*/
				159	static inline uint32_t CopyFloatToUint32(float f)
				160	{
				161	uint32_t u32;
				162	memcpy(&u32, &f, sizeof(uint32_t));
				163	return u32;
				164	}
				165
				166	static inline uint64_t CopyDoubleToUint64(double d)
				167	{
				168	uint64_t u64;
				169	memcpy(&u64, &d, sizeof(uint64_t));
				170	return u64;
				171	}
				172
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	173	static inline float CopyUint32ToFloat(uint32_t u32)
				174	{
				175	float f;
				176	memcpy(&f, &u32, sizeof(uint32_t));
				177	return f;
				178	}
				179
				180
				181
				182	// Public function; see ieee754.h
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	183	uint16_t IEEE754_FloatToHalf(float f)
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	184	{
				185	// Pull the three parts out of the single-precision float
				186	const uint32_t uSingle = CopyFloatToUint32(f);
				187	const int32_t nSingleUnbiasedExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
				188	const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
				189	const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
				190
				191
				192	// Now convert the three parts to half-precision.
				193	uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
				194	if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
				195	// +/- Infinity and NaNs -- single biased exponent is 0xff
				196	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				197	if(!uSingleSignificand) {
				198	// Infinity
				199	uHalfSignificand = 0;
				200	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	201	// Copy the LBSs of the NaN payload that will fit from the single to the half
				202	uHalfSignificand = uSingleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
				203	if(uSingleSignificand & SINGLE_QUIET_NAN_BIT) {
				204	// It's a qNaN; copy the qNaN bit
				205	uHalfSignificand \|= HALF_QUIET_NAN_BIT;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	206	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	207	// It's a sNaN; make sure the significand is not zero so it stays a NaN
				208	// This is needed because not all significand bits are copied from single
				209	if(!uHalfSignificand) {
				210	// Set the LSB. This is what wikipedia shows for sNAN.
				211	uHalfSignificand \|= 0x01;
				212	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	213	}
				214	}
				215	} else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
				216	// 0 or a subnormal number -- singled biased exponent is 0
				217	uHalfBiasedExponent = 0;
				218	uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision
				219	} else if(nSingleUnbiasedExponent > HALF_EXPONENT_MAX) {
				220	// Exponent is too large to express in half-precision; round up to infinity
				221	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				222	uHalfSignificand = 0;
				223	} else if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN) {
				224	// Exponent is too small to express in half-precision normal; make it a half-precision subnormal
				225	uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
				226	// Difference between single normal exponent and the base exponent of a half subnormal
				227	const uint32_t nExpDiff = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
				228	// Also have to shift the significand by the difference in number of bits between a single and a half significand
				229	const int32_t nSignificandBitsDiff = SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
				230	// Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
				231	const uint32_t uSingleSignificandSubnormal = uSingleSignificand + (0x01L << SINGLE_NUM_SIGNIFICAND_BITS);
				232	uHalfSignificand = uSingleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
				233	} else {
				234	// The normal case
				235	uHalfBiasedExponent = nSingleUnbiasedExponent + HALF_EXPONENT_BIAS;
				236	uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				237	}
				238	uHalfSign = uSingleSign;
				239
				240	// Put the 3 values in the right place for a half precision
				241	const uint16_t uHalfPrecision = uHalfSignificand \|
				242	(uHalfBiasedExponent << HALF_EXPONENT_SHIFT) \|
				243	(uHalfSign << HALF_SIGN_SHIFT);
				244	return uHalfPrecision;
				245	}
				246
				247
				248	// Public function; see ieee754.h
Laurence Lundblade	cc2ed34	2018-09-22 17:29:55 -0700	[diff] [blame]	249	uint16_t IEEE754_DoubleToHalf(double d)
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	250	{
				251	// Pull the three parts out of the double-precision float
				252	const uint64_t uDouble = CopyDoubleToUint64(d);
				253	const int64_t nDoubleUnbiasedExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
				254	const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
				255	const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
				256
				257
				258	// Now convert the three parts to half-precision.
				259	uint16_t uHalfSign, uHalfSignificand, uHalfBiasedExponent;
				260	if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
				261	// +/- Infinity and NaNs -- single biased exponent is 0xff
				262	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				263	if(!uDoubleSignificand) {
				264	// Infinity
				265	uHalfSignificand = 0;
				266	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	267	// Copy the LBSs of the NaN payload that will fit from the double to the half
				268	uHalfSignificand = uDoubleSignificand & (HALF_SIGNIFICAND_MASK & ~HALF_QUIET_NAN_BIT);
				269	if(uDoubleSignificand & DOUBLE_QUIET_NAN_BIT) {
				270	// It's a qNaN; copy the qNaN bit
				271	uHalfSignificand \|= HALF_QUIET_NAN_BIT;
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	272	} else {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	273	// It's an sNaN; make sure the significand is not zero so it stays a NaN
				274	// This is needed because not all significand bits are copied from single
				275	if(!uHalfSignificand) {
				276	// Set the LSB. This is what wikipedia shows for sNAN.
				277	uHalfSignificand \|= 0x01;
				278	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	279	}
				280	}
				281	} else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
				282	// 0 or a subnormal number -- double biased exponent is 0
				283	uHalfBiasedExponent = 0;
				284	uHalfSignificand = 0; // Any subnormal single will be too small to express as a half precision; TODO, is this really true?
				285	} else if(nDoubleUnbiasedExponent > HALF_EXPONENT_MAX) {
				286	// Exponent is too large to express in half-precision; round up to infinity; TODO, is this really true?
				287	uHalfBiasedExponent = HALF_EXPONENT_INF_OR_NAN + HALF_EXPONENT_BIAS;
				288	uHalfSignificand = 0;
				289	} else if(nDoubleUnbiasedExponent < HALF_EXPONENT_MIN) {
				290	// Exponent is too small to express in half-precision; round down to zero
				291	uHalfBiasedExponent = (uint16_t)(HALF_EXPONENT_ZERO + HALF_EXPONENT_BIAS);
				292	// Difference between double normal exponent and the base exponent of a half subnormal
				293	const uint64_t nExpDiff = -(nDoubleUnbiasedExponent - HALF_EXPONENT_MIN);
				294	// Also have to shift the significand by the difference in number of bits between a double and a half significand
				295	const int64_t nSignificandBitsDiff = DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
				296	// Add in the 1 that is implied in the significand of a normal number; it needs to be present in a subnormal
				297	const uint64_t uDoubleSignificandSubnormal = uDoubleSignificand + (0x01L << DOUBLE_NUM_SIGNIFICAND_BITS);
				298	uHalfSignificand = uDoubleSignificandSubnormal >> (nExpDiff + nSignificandBitsDiff);
				299	} else {
				300	// The normal case
				301	uHalfBiasedExponent = nDoubleUnbiasedExponent + HALF_EXPONENT_BIAS;
				302	uHalfSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				303	}
				304	uHalfSign = uDoubleSign;
				305
				306
				307	// Put the 3 values in the right place for a half precision
				308	const uint16_t uHalfPrecision = uHalfSignificand \|
				309	(uHalfBiasedExponent << HALF_EXPONENT_SHIFT) \|
				310	(uHalfSign << HALF_SIGN_SHIFT);
				311	return uHalfPrecision;
				312	}
				313
				314
				315	// Public function; see ieee754.h
				316	float IEEE754_HalfToFloat(uint16_t uHalfPrecision)
				317	{
				318	// Pull out the three parts of the half-precision float
				319	const uint16_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
				320	const int16_t nHalfUnBiasedExponent = ((uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT) - HALF_EXPONENT_BIAS;
				321	const uint16_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
				322
				323
				324	// Make the three parts of the single-precision number
				325	uint32_t uSingleSignificand, uSingleSign, uSingleBiasedExponent;
				326	if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
				327	// 0 or subnormal
				328	if(uHalfSignificand) {
				329	// Subnormal case
				330	uSingleBiasedExponent = -HALF_EXPONENT_BIAS + SINGLE_EXPONENT_BIAS +1;
				331	// A half-precision subnormal can always be converted to a normal single-precision float because the ranges line up
				332	uSingleSignificand = uHalfSignificand;
				333	// Shift bits from right of the decimal to left, reducing the exponent by 1 each time
				334	do {
				335	uSingleSignificand <<= 1;
				336	uSingleBiasedExponent--;
				337	} while ((uSingleSignificand & 0x400) == 0);
				338	uSingleSignificand &= HALF_SIGNIFICAND_MASK;
				339	uSingleSignificand <<= (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				340	} else {
				341	// Just zero
				342	uSingleBiasedExponent = SINGLE_EXPONENT_ZERO + SINGLE_EXPONENT_BIAS;
				343	uSingleSignificand = 0;
				344	}
				345	} else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
				346	// NaN or Inifinity
				347	uSingleBiasedExponent = SINGLE_EXPONENT_INF_OR_NAN + SINGLE_EXPONENT_BIAS;
				348	if(uHalfSignificand) {
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	349	// NaN
				350	// First preserve the NaN payload from half to single
				351	uSingleSignificand = uHalfSignificand & ~HALF_QUIET_NAN_BIT;
				352	if(uHalfSignificand & HALF_QUIET_NAN_BIT) {
				353	// Next, set qNaN if needed since half qNaN bit is not copied above
				354	uSingleSignificand \|= SINGLE_QUIET_NAN_BIT;
				355	}
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	356	} else {
				357	// Infinity
				358	uSingleSignificand = 0;
				359	}
				360	} else {
				361	// Normal number
				362	uSingleBiasedExponent = nHalfUnBiasedExponent + SINGLE_EXPONENT_BIAS;
				363	uSingleSignificand = uHalfSignificand << (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
				364	}
				365	uSingleSign = uHalfSign;
				366
				367
				368	// Shift the three parts of the single precision into place
				369	const uint32_t uSinglePrecision = uSingleSignificand \|
				370	(uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) \|
				371	(uSingleSign << SINGLE_SIGN_SHIFT);
				372
				373	return CopyUint32ToFloat(uSinglePrecision);
				374	}
				375
				376
Laurence Lundblade	781fd82	2018-10-01 09:37:52 -0700	[diff] [blame]	377	/*
				378	double IEEE754_HalfToDouble(uint16_t uHalfPrecision) is not needed
				379	*/
				380
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	381
				382
				383	// Public function; see ieee754.h
				384	IEEE754_union IEEE754_FloatToSmallest(float f)
				385	{
				386	IEEE754_union result;
				387
				388	// Pull the neeed two parts out of the single-precision float
				389	const uint32_t uSingle = CopyFloatToUint32(f);
				390	const int32_t nSingleExponent = ((uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT) - SINGLE_EXPONENT_BIAS;
				391	const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
				392
				393	// Bit mask that is the significand bits that would be lost when converting
				394	// from single-precision to half-precision
				395	const uint64_t uDroppedSingleBits = SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
				396
				397	// Optimizer will re organize so there is only one call to IEEE754_FloatToHalf()
				398	if(uSingle == 0) {
				399	// Value is 0.0000, not a a subnormal
				400	result.uTag = IEEE754_UNION_IS_HALF;
				401	result.u16 = IEEE754_FloatToHalf(f);
				402	} else if(nSingleExponent == SINGLE_EXPONENT_INF_OR_NAN) {
				403	// NaN, +/- infinity
				404	result.uTag = IEEE754_UNION_IS_HALF;
				405	result.u16 = IEEE754_FloatToHalf(f);
				406	} else if((nSingleExponent >= HALF_EXPONENT_MIN) && nSingleExponent <= HALF_EXPONENT_MAX && (!(uSingleSignificand & uDroppedSingleBits))) {
				407	// Normal number in exponent range and precision won't be lost
				408	result.uTag = IEEE754_UNION_IS_HALF;
				409	result.u16 = IEEE754_FloatToHalf(f);
				410	} else {
				411	// Subnormal, exponent out of range, or precision will be lost
				412	result.uTag = IEEE754_UNION_IS_SINGLE;
				413	result.u32 = uSingle;
				414	}
				415
				416	return result;
				417	}
				418
Laurence Lundblade	8db3d3e	2018-09-29 11:46:37 -0700	[diff] [blame]	419	// Public function; see ieee754.h
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	420	IEEE754_union IEEE754_DoubleToSmallestInternal(double d, int bAllowHalfPrecision)
				421	{
				422	IEEE754_union result;
				423
				424	// Pull the needed two parts out of the double-precision float
				425	const uint64_t uDouble = CopyDoubleToUint64(d);
				426	const int64_t nDoubleExponent = ((uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT) - DOUBLE_EXPONENT_BIAS;
				427	const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
				428
				429	// Masks to check whether dropped significand bits are zero or not
				430	const uint64_t uDroppedDoubleBits = DOUBLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS;
				431	const uint64_t uDroppedSingleBits = DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS;
				432
				433	// The various cases
Laurence Lundblade	d711fb2	2018-09-26 14:35:22 -0700	[diff] [blame]	434	if(d == 0.0) { // Take care of positive and negative zero
Laurence Lundblade	12d32c5	2018-09-19 11:25:27 -0700	[diff] [blame]	435	// Value is 0.0000, not a a subnormal
				436	result.uTag = IEEE754_UNION_IS_HALF;
				437	result.u16 = IEEE754_DoubleToHalf(d);
				438	} else if(nDoubleExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
				439	// NaN, +/- infinity
				440	result.uTag = IEEE754_UNION_IS_HALF;
				441	result.u16 = IEEE754_DoubleToHalf(d);
				442	} else if(bAllowHalfPrecision && (nDoubleExponent >= HALF_EXPONENT_MIN) && nDoubleExponent <= HALF_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedDoubleBits))) {
				443	// Can convert to half without precision loss
				444	result.uTag = IEEE754_UNION_IS_HALF;
				445	result.u16 = IEEE754_DoubleToHalf(d);
				446	} else if((nDoubleExponent >= SINGLE_EXPONENT_MIN) && nDoubleExponent <= SINGLE_EXPONENT_MAX && (!(uDoubleSignificand & uDroppedSingleBits))) {
				447	// Can convert to single without precision loss
				448	result.uTag = IEEE754_UNION_IS_SINGLE;
				449	result.u32 = CopyFloatToUint32((float)d);
				450	} else {
				451	// Can't convert without precision loss
				452	result.uTag = IEEE754_UNION_IS_DOUBLE;
				453	result.u64 = uDouble;
				454	}
				455
				456	return result;
				457	}
				458