Blame - src/ieee754.c - mirror/QCBOR - TrustedFirmware Git Browser

2024-01-07 19:17:52 -0700

[diff] [blame^]

1

/* ==========================================================================

2

* ieee754.c -- floating-point conversion between half, double & single-precision

*

*

* SPDX-License-Identifier: BSD-3-Clause

8

*

9

* See BSD-3-Clause license in README.md

10

*

11

* Created on 7/23/18

12

* ========================================================================== */

Laurence Lundblade

cc2ed34

2018-09-22 17:29:55 -0700

[diff] [blame]

13

Máté Tóth-Pál

ef5f07a

2021-09-17 19:31:37 +0200

[diff] [blame]

14

/*

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

15

* Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as

16

* QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h

Máté Tóth-Pál

ef5f07a

2021-09-17 19:31:37 +0200

[diff] [blame]

17

*/

18

#include "qcbor/qcbor_common.h"

19

Laurence Lundblade

b275cdc

2020-07-12 12:34:38 -0700

[diff] [blame]

20

#ifndef QCBOR_DISABLE_PREFERRED_FLOAT

Laurence Lundblade

9682a53

2020-06-06 18:33:04 -0700

[diff] [blame]

21

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

22

#include "ieee754.h"

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

23

#include <string.h> /* For memcpy() */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

24

Laurence Lundblade

8db3d3e

2018-09-29 11:46:37 -0700

[diff] [blame]

25

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

26

/*

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

27

* This code has long lines and is easier to read because of

28

* them. Some coding guidelines prefer 80 column lines (can they not

29

* afford big displays?).

30

*

31

* This code works solely using shifts and masks and thus has no

32

* dependency on any math libraries. It can even work if the CPU

33

* doesn't have any floating-point support, though that isn't the most

34

* useful thing to do.

35

*

36

* The memcpy() dependency is only for CopyFloatToUint32() and friends

37

* which only is needed to avoid type punning when converting the

38

* actual float bits to an unsigned value so the bit shifts and masks

39

* can work.

40

*

41

* The references used to write this code:

42

*

43

* IEEE 754-2008, particularly section 3.6 and 6.2.1

44

*

45

* https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages

46

*

47

* https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values

48

*

49

* https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules

50

*

51

* https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be

52

*

53

* IEEE754_FloatToDouble(uint32_t uFloat) was created but is not

54

* needed. It can be retrieved from github history if needed.

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

*/

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

58

59

60

/* ----- Half Precsion ----------- */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

61

#define HALF_NUM_SIGNIFICAND_BITS (10)

62

#define HALF_NUM_EXPONENT_BITS (5)

63

#define HALF_NUM_SIGN_BITS (1)

64

65

#define HALF_SIGNIFICAND_SHIFT (0)

66

#define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)

67

#define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)

68

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

69

#define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits

Laurence Lundblade

06350ea

2020-01-27 19:32:40 -0800

[diff] [blame]

70

#define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

71

#define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign

Laurence Lundblade

06350ea

2020-01-27 19:32:40 -0800

[diff] [blame]

72

#define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

73

74

/* Biased Biased Unbiased Use

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

75

* 0x00 0 -15 0 and subnormal

76

* 0x01 1 -14 Smallest normal exponent

77

* 0x1e 30 15 Largest normal exponent

78

* 0x1F 31 16 NaN and Infinity */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

79

#define HALF_EXPONENT_BIAS (15)

80

#define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased

81

#define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased

82

#define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased

83

#define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased

84

85

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

86

/* ------ Single-Precision -------- */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

87

#define SINGLE_NUM_SIGNIFICAND_BITS (23)

88

#define SINGLE_NUM_EXPONENT_BITS (8)

89

#define SINGLE_NUM_SIGN_BITS (1)

90

91

#define SINGLE_SIGNIFICAND_SHIFT (0)

92

#define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)

93

#define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)

94

Laurence Lundblade

06350ea

2020-01-27 19:32:40 -0800

[diff] [blame]

95

#define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits

96

#define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent

97

#define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign

98

#define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

99

100

/* Biased Biased Unbiased Use

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

101

* 0x0000 0 -127 0 and subnormal

102

* 0x0001 1 -126 Smallest normal exponent

103

* 0x7f 127 0 1

104

* 0xfe 254 127 Largest normal exponent

105

* 0xff 255 128 NaN and Infinity */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

106

#define SINGLE_EXPONENT_BIAS (127)

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

107

#define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS)

108

#define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1)

109

#define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS)

110

#define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1)

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

111

112

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

113

/* --------- Double-Precision ---------- */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

114

#define DOUBLE_NUM_SIGNIFICAND_BITS (52)

115

#define DOUBLE_NUM_EXPONENT_BITS (11)

116

#define DOUBLE_NUM_SIGN_BITS (1)

117

118

#define DOUBLE_SIGNIFICAND_SHIFT (0)

119

#define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)

120

#define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)

121

Laurence Lundblade

8db3d3e

2018-09-29 11:46:37 -0700

[diff] [blame]

122

#define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits

123

#define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent

124

#define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign

125

#define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))

126

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

127

128

/* Biased Biased Unbiased Use

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

129

* 0x00000000 0 -1023 0 and subnormal

130

* 0x00000001 1 -1022 Smallest normal exponent

131

* 0x000007fe 2046 1023 Largest normal exponent

132

* 0x000007ff 2047 1024 NaN and Infinity */

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

133

#define DOUBLE_EXPONENT_BIAS (1023)

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

134

#define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS)

135

#define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1)

136

#define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS)

137

#define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1)

138

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

/*

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

143

* Convenient functions to avoid type punning, compiler warnings and

144

* such. The optimizer reduces them to a simple assignment. This is a

145

* crusty corner of C. It shouldn't be this hard.

146

*

147

* These are also in UsefulBuf.h under a different name. They are copied

148

* here to avoid a dependency on UsefulBuf.h. There is no object code

149

* size impact because these always optimze down to a simple assignment.

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

150

*/

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

151

static inline uint32_t

152

CopyFloatToUint32(float f)

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

153

{

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

154

uint32_t u32;

155

memcpy(&u32, &f, sizeof(uint32_t));

156

return u32;

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

157

}

158

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

159

static inline uint64_t

160

CopyDoubleToUint64(double d)

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

161

{

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

162

uint64_t u64;

163

memcpy(&u64, &d, sizeof(uint64_t));

164

return u64;

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

165

}

166

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

167

static inline double

168

CopyUint64ToDouble(uint64_t u64)

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

169

{

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

170

double d;

171

memcpy(&d, &u64, sizeof(uint64_t));

return d;

}

static inline float

CopyUint32ToSingle(uint32_t u32)

177

{

178

float f;

179

memcpy(&f, &u32, sizeof(uint32_t));

180

return f;

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

181

}

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

182

183

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

184

185

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

186

/**

187

* @brief Assemble sign, significand and exponent into single precision float.

188

*

189

* @param[in] uDoubleSign 0 if positive, 1 if negative

190

* @pararm[in] uDoubleSignificand Bits of the significand

191

* @param[in] nDoubleUnBiasedExponent Exponent

192

*

193

* This returns the bits for a single-precision float, a binary64

194

* as specified in IEEE754.

Laurence Lundblade

fe09bbf

2020-07-16 12:14:51 -0700

[diff] [blame]

195

*/

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

196

static double

197

IEEE754_AssembleDouble(uint64_t uDoubleSign,

198

uint64_t uDoubleSignificand,

199

int64_t nDoubleUnBiasedExponent)

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

200

{

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

201

uint64_t uDoubleBiasedExponent;

202

203

uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS);

204

205

return CopyUint64ToDouble(uDoubleSignificand |

206

(uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |

207

(uDoubleSign << DOUBLE_SIGN_SHIFT));

208

}

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

209

210

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

211

double

212

IEEE754_HalfToDouble(uint16_t uHalfPrecision)

213

{

214

uint64_t uDoubleSignificand;

215

int64_t nDoubleUnBiasedExponent;

216

double dResult;

217

218

/* Pull out the three parts of the half-precision float. Do all

219

* the work in 64 bits because that is what the end result is. It

220

* may give smaller code size and will keep static analyzers

221

* happier.

222

*/

223

const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;

224

const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT;

225

const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS;

226

const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;

227

228

if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {

229

/* 0 or subnormal */

230

if(uHalfSignificand) {

231

/* --- SUBNORMAL --- */

232

/* A half-precision subnormal can always be converted to a

233

* normal double-precision float because the ranges line up.

234

* The exponent of a subnormal starts out at the min exponent

235

* for a normal. As the sub normal significand bits are

236

* shifted, left to normalize, the exponent is

237

* decremented. Shifting continues until fully normalized.

238

*/

239

nDoubleUnBiasedExponent = HALF_EXPONENT_MIN;

240

uDoubleSignificand = uHalfSignificand;

241

do {

242

uDoubleSignificand <<= 1;

243

nDoubleUnBiasedExponent--;

244

} while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0);

245

/* A normal has an implied 1 in the most significant

246

* position that a subnormal doesn't. */

247

uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS;

248

/* Must shift into place for a double significand */

249

uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;

250

251

dResult = IEEE754_AssembleDouble(uHalfSign,

252

uDoubleSignificand,

253

nDoubleUnBiasedExponent);

254

} else {

255

/* --- ZERO --- */

256

dResult = IEEE754_AssembleDouble(uHalfSign,

257

0,

258

DOUBLE_EXPONENT_ZERO);

259

}

260

} else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {

261

/* NaN or Inifinity */

262

if(uHalfSignificand) {

263

/* --- NaN --- */

264

/* Half-precision payloads always fit into double precision

265

* payloads. They are shifted left the same as a normal

266

* number significand.

267

*/

268

uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);

269

dResult = IEEE754_AssembleDouble(uHalfSign,

270

uDoubleSignificand,

271

DOUBLE_EXPONENT_INF_OR_NAN);

272

} else {

273

/* --- INFINITY --- */

274

dResult = IEEE754_AssembleDouble(uHalfSign,

275

0,

276

DOUBLE_EXPONENT_INF_OR_NAN);

277

}

278

} else {

279

/* --- NORMAL NUMBER --- */

280

uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);

281

dResult = IEEE754_AssembleDouble(uHalfSign,

282

uDoubleSignificand,

283

nHalfUnBiasedExponent);

}

return dResult;

}

/**

* @brief Assemble sign, significand and exponent into single precision float.

292

*

293

* @param[in] uHalfSign 0 if positive, 1 if negative

294

* @pararm[in] uHalfSignificand Bits of the significand

295

* @param[in] nHalfUnBiasedExponent Exponent

296

*

297

* This returns the bits for a single-precision float, a binary32 as

298

* specified in IEEE754. It is returned as a uint64_t rather than a

299

* uint32_t or a float for convenience of usage.

300

*/

301

static uint32_t

302

IEEE754_AssembleHalf(uint32_t uHalfSign,

303

uint32_t uHalfSignificand,

304

int32_t nHalfUnBiasedExponent)

305

{

306

uint32_t uHalfUnbiasedExponent;

307

308

uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS);

309

310

return uHalfSignificand |

311

(uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) |

312

(uHalfSign << HALF_SIGN_SHIFT);

}

/* Public function; see ieee754.h */

317

IEEE754_union

318

IEEE754_SingleToHalf(float f)

319

{

320

IEEE754_union result;

321

uint32_t uDroppedBits;

322

int32_t nExponentDifference;

323

int32_t nShiftAmount;

324

uint32_t uHalfSignificand;

325

326

/* Pull the three parts out of the double-precision float Most work

327

* is done with uint32_t which helps avoid integer promotions and

328

* static analyzer complaints.

329

*/

330

const uint32_t uSingle = CopyFloatToUint32(f);

331

const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT;

332

const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS;

333

const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;

334

const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;

335

336

if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {

337

if(uSingleSignificand == 0) {

338

/* --- IS ZERO --- */

339

result.uSize = IEEE754_UNION_IS_HALF;

340

result.uValue = IEEE754_AssembleHalf(uSingleSign,

0,

HALF_EXPONENT_ZERO);

} else {

/* --- IS SINGLE SUBNORMAL --- */

345

/* The largest single subnormal is slightly less than the

346

* largest single normal which is 2^-149 or

347

* 2.2040517676619426e-38. The smallest half subnormal is

348

* 2^-14 or 5.9604644775390625E-8. There is no overlap so

349

* single subnormals can't be converted to halfs of any sort.

350

*/

351

result.uSize = IEEE754_UNION_IS_SINGLE;

352

result.uValue = uSingle;

353

}

354

} else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {

355

if(uSingleSignificand == 0) {

356

/* ---- IS INFINITY ---- */

357

result.uSize = IEEE754_UNION_IS_HALF;

358

result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN);

359

} else {

360

/* The NaN can only be converted if no payload bits are lost

361

* per RFC 8949 section 4.1 that defines Preferred

362

* Serializaton. Note that Deterministically Encode CBOR in

363

* section 4.2 allows for some variation of this rule, but at

364

* the moment this implementation is of Preferred

365

* Serialization, not CDE. As of December 2023, we are also

366

* expecting an update to CDE. This code may need to be

367

* updated for CDE.

368

*/

369

uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS);

370

if(uDroppedBits == 0) {

371

/* --- IS CONVERTABLE NAN --- */

372

uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);

373

result.uSize = IEEE754_UNION_IS_HALF;

374

result.uValue = IEEE754_AssembleHalf(uSingleSign,

375

uHalfSignificand,

376

HALF_EXPONENT_INF_OR_NAN);

377

378

} else {

379

/* --- IS UNCONVERTABLE NAN --- */

380

result.uSize = IEEE754_UNION_IS_SINGLE;

381

result.uValue = uSingle;

}

}

} else {

/* ---- REGULAR NUMBER ---- */

386

/* A regular single can be converted to a regular half if the

387

* single's exponent is in the smaller range of a half and if no

388

* precision is lost in the significand.

389

*/

390

if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN &&

391

nSingleUnbiasedExponent <= HALF_EXPONENT_MAX &&

392

(uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) {

393

uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);

394

395

/* --- CONVERT TO HALF NORMAL --- */

396

result.uSize = IEEE754_UNION_IS_HALF;

397

result.uValue = IEEE754_AssembleHalf(uSingleSign,

398

uHalfSignificand,

399

nSingleUnbiasedExponent);

400

} else {

401

/* Unable to convert to a half normal. See if it can be

402

* converted to a half subnormal. To do that, the exponent

403

* must be in range and no precision can be lost in the

404

* signficand.

405

*

406

* This is more complicated because the number is not

407

* normalized. The signficand must be shifted proprotionally

408

* to the exponent and 1 must be added in. See

409

* https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding

410

*

411

* Exponents -14 to -24 map to a shift of 0 to 10 of the

412

* significand. The largest value of a half subnormal has an

413

* exponent of -14. Subnormals are not normalized like

414

* normals meaning they lose precision as the numbers get

415

* smaller. Normals don't lose precision because the exponent

416

* allows all the bits of the significand to be significant.

417

*/

418

/* The exponent of the largest possible half-precision

419

* subnormal is HALF_EXPONENT_MIN (-14). Exponents larger

420

* than this are normal and handled above. We're going to

421

* shift the significand right by at least this amount.

422

*/

423

nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);

424

425

/* In addition to the shift based on the exponent's value,

426

* the single significand has to be shifted right to fit into

427

* a half-precision significand */

428

nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);

429

430

/* Must add 1 in to the possible significand because there is

431

* an implied 1 for normal values and not for subnormal

432

* values. See equations here:

433

* https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding

434

*/

435

uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;

436

437

/* If only zero bits get shifted out, this can be converted

438

* to subnormal */

439

if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN &&

440

nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS &&

441

uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) {

442

/* --- CONVERTABLE TO HALF SUBNORMAL --- */

443

result.uSize = IEEE754_UNION_IS_HALF;

444

result.uValue = IEEE754_AssembleHalf(uSingleSign,

uHalfSignificand,

HALF_EXPONENT_ZERO);

} else {

/* --- DO NOT CONVERT --- */

449

result.uSize = IEEE754_UNION_IS_SINGLE;

450

result.uValue = uSingle;

}

}

}

return result;

}

/**

* @brief Assemble sign, significand and exponent into single precision float.

461

*

462

* @param[in] uSingleSign 0 if positive, 1 if negative

463

* @pararm[in] uSingleSignificand Bits of the significand

464

* @param[in] nSingleUnBiasedExponent Exponent

465

*

466

* This returns the bits for a single-precision float, a binary32 as

467

* specified in IEEE754. It is returned as a uint64_t rather than a

468

* uint32_t or a float for convenience of usage.

469

*/

470

static uint64_t

471

IEEE754_AssembleSingle(uint64_t uSingleSign,

472

uint64_t uSingleSignificand,

473

int64_t nSingleUnBiasedExponent)

474

{

475

uint64_t uSingleBiasedExponent;

476

477

uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS);

478

479

return uSingleSignificand |

480

(uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |

481

(uSingleSign << SINGLE_SIGN_SHIFT);

}

/**

* @brief Convert a double-precision float to single-precision.

487

*

488

* @param[in] d The value to convert.

489

*

490

* @returns Either unconverted value or value converted to single-precision.

491

*

492

* This always succeeds. If the value cannot be converted without the

493

* loss of precision, it is not converted.

494

*

495

* This handles all subnormals and NaN payloads.

496

*/

497

static IEEE754_union

498

IEEE754_DoubleToSingle(double d)

499

{

500

IEEE754_union Result;

501

int64_t nExponentDifference;

502

int64_t nShiftAmount;

503

uint64_t uSingleSignificand;

504

uint64_t uDroppedBits;

505

506

507

/* Pull the three parts out of the double-precision float. Most

508

* work is done with uint64_t which helps avoid integer promotions

509

* and static analyzer complaints.

510

*/

511

const uint64_t uDouble = CopyDoubleToUint64(d);

512

const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT;

513

const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS;

514

const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;

515

const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;

516

517

518

if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {

519

if(uDoubleSignificand == 0) {

520

/* --- IS ZERO --- */

521

Result.uSize = IEEE754_UNION_IS_SINGLE;

522

Result.uValue = IEEE754_AssembleSingle(uDoubleSign,

523

0,

524

SINGLE_EXPONENT_ZERO);

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

525

} else {

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

526

/* --- IS DOUBLE SUBNORMAL --- */

527

/* The largest double subnormal is slightly less than the

528

* largest double normal which is 2^-1022 or

529

* 2.2250738585072014e-308. The smallest single subnormal

530

* is 2^-149 or 1.401298464324817e-45. There is no

531

* overlap so double subnormals can't be converted to

532

* singles of any sort.

533

*/

534

Result.uSize = IEEE754_UNION_IS_DOUBLE;

535

Result.uValue = uDouble;

536

}

537

} else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {

538

if(uDoubleSignificand == 0) {

539

/* ---- IS INFINITY ---- */

540

Result.uSize = IEEE754_UNION_IS_SINGLE;

541

Result.uValue = IEEE754_AssembleSingle(uDoubleSign,

542

0,

543

SINGLE_EXPONENT_INF_OR_NAN);

544

} else {

545

/* The NaN can only be converted if no payload bits are

546

* lost per RFC 8949 section 4.1 that defines Preferred

547

* Serializaton. Note that Deterministically Encode CBOR

548

* in section 4.2 allows for some variation of this rule,

549

* but at the moment this implementation is of Preferred

550

* Serialization, not CDE. As of December 2023, we are

551

* also expecting an update to CDE. This code may need to

552

* be updated for CDE.

553

*/

554

uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);

555

if(uDroppedBits == 0) {

556

/* --- IS CONVERTABLE NAN --- */

557

uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);

558

Result.uSize = IEEE754_UNION_IS_SINGLE;

559

Result.uValue = IEEE754_AssembleSingle(uDoubleSign,

560

uSingleSignificand,

561

SINGLE_EXPONENT_INF_OR_NAN);

562

} else {

563

/* --- IS UNCONVERTABLE NAN --- */

564

Result.uSize = IEEE754_UNION_IS_DOUBLE;

565

Result.uValue = uDouble;

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

566

}

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

567

}

568

} else {

569

/* ---- REGULAR NUMBER ---- */

570

/* A regular double can be converted to a regular single if

571

* the double's exponent is in the smaller range of a single

572

* and if no precision is lost in the significand.

573

*/

574

uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);

575

if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN &&

576

nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX &&

577

uDroppedBits == 0) {

578

/* --- IS CONVERTABLE TO SINGLE --- */

579

uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);

580

Result.uSize = IEEE754_UNION_IS_SINGLE;

581

Result.uValue = IEEE754_AssembleSingle(uDoubleSign,

582

uSingleSignificand,

583

nDoubleUnbiasedExponent);

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

584

} else {

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

585

/* Unable to convert to a single normal. See if it can be

586

* converted to a single subnormal. To do that, the

587

* exponent must be in range and no precision can be lost

588

* in the signficand.

589

*

590

* This is more complicated because the number is not

591

* normalized. The signficand must be shifted

592

* proprotionally to the exponent and 1 must be added

593

* in. See

594

* https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding

595

*/

596

nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN);

597

nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);

598

uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;

599

600

if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN &&

601

nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS &&

602

uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) {

603

/* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */

604

Result.uSize = IEEE754_UNION_IS_SINGLE;

605

Result.uValue = IEEE754_AssembleSingle(uDoubleSign,

606

uSingleSignificand,

607

SINGLE_EXPONENT_ZERO);

608

} else {

609

/* --- CAN NOT BE CONVERTED --- */

610

Result.uSize = IEEE754_UNION_IS_DOUBLE;

611

Result.uValue = uDouble;

612

}

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

613

}

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

614

}

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

615

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

616

return Result;

Laurence Lundblade

2018-11-02 21:44:06 +0700

[diff] [blame]

617

}

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

618

619

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

620

/* Public function; see ieee754.h */

621

IEEE754_union

622

IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision)

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

623

{

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

624

IEEE754_union result;

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

625

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

626

result = IEEE754_DoubleToSingle(d);

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

627

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

628

if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) {

629

/* Cast to uint32_t is OK, because value was just successfully

630

* converted to single. */

631

float uSingle = CopyUint32ToSingle((uint32_t)result.uValue);

632

result = IEEE754_SingleToHalf(uSingle);

633

}

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

634

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

635

return result;

Laurence Lundblade

2018-09-19 11:25:27 -0700

[diff] [blame]

636

}

637

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

638

Laurence Lundblade

2024-01-07 19:17:52 -0700

[diff] [blame^]

639

#else /* QCBOR_DISABLE_PREFERRED_FLOAT */

Laurence Lundblade

2018-12-17 16:17:45 -0800

[diff] [blame]

640

Laurence Lundblade