-
Notifications
You must be signed in to change notification settings - Fork 8
/
float16.go
310 lines (255 loc) · 9.77 KB
/
float16.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker
//
// Special thanks to Kathryn Long for her Rust implementation
// of float16 at github.com/starkat99/half-rs (MIT license)
// Package float16 defines support for half-precision floating-point numbers.
package float16
import (
"math"
"strconv"
)
// Float16 represents IEEE 754 half-precision floating-point numbers (binary16).
type Float16 uint16
// Precision indicates whether the conversion to Float16 is
// exact, subnormal without dropped bits, inexact, underflow, or overflow.
type Precision int
const (
// PrecisionExact is for non-subnormals that don't drop bits during conversion.
// All of these can round-trip. Should always convert to float16.
PrecisionExact Precision = iota
// PrecisionUnknown is for subnormals that don't drop bits during conversion but
// not all of these can round-trip so precision is unknown without more effort.
// Only 2046 of these can round-trip and the rest cannot round-trip.
PrecisionUnknown
// PrecisionInexact is for dropped significand bits and cannot round-trip.
// Some of these are subnormals. Cannot round-trip float32->float16->float32.
PrecisionInexact
// PrecisionUnderflow is for Underflows. Cannot round-trip float32->float16->float32.
PrecisionUnderflow
// PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32.
PrecisionOverflow
)
// SmallestNonzero is the smallest nonzero denormal value for float16 (0.000000059604645).
// It's the float16 equivalent for [math.SmallestNonzeroFloat32] and [math.SmallestNonzeroFloat64].
// For context, [math.SmallestNonzeroFloat32] used the formula 1 / 2**(127 - 1 + 23) to produce
// the smallest denormal value for float32 (1.401298464324817070923729583289916131280e-45).
// The equivalent formula for float16 is 1 / 2**(15 - 1 + 10). We use Float16(0x0001) to compile as const.
const SmallestNonzero = Float16(0x0001) // 5.9604645e-08 (effectively 0x1p-14 * 0x1p-10)
// PrecisionFromfloat32 returns Precision without performing
// the conversion. Conversions from both Infinity and NaN
// values will always report PrecisionExact even if NaN payload
// or NaN-Quiet-Bit is lost. This function is kept simple to
// allow inlining and run < 0.5 ns/op, to serve as a fast filter.
func PrecisionFromfloat32(f32 float32) Precision {
u32 := math.Float32bits(f32)
if u32 == 0 || u32 == 0x80000000 {
// +- zero will always be exact conversion
return PrecisionExact
}
const COEFMASK uint32 = 0x7fffff // 23 least significant bits
const EXPSHIFT uint32 = 23
const EXPBIAS uint32 = 127
const EXPMASK uint32 = uint32(0xff) << EXPSHIFT
const DROPMASK uint32 = COEFMASK >> 10
exp := int32(((u32 & EXPMASK) >> EXPSHIFT) - EXPBIAS)
coef := u32 & COEFMASK
if exp == 128 {
// +- infinity or NaN
// apps may want to do extra checks for NaN separately
return PrecisionExact
}
// https://en.wikipedia.org/wiki/Half-precision_floating-point_format says,
// "Decimals between 2^−24 (minimum positive subnormal) and 2^−14 (maximum subnormal): fixed interval 2^−24"
if exp < -24 {
return PrecisionUnderflow
}
if exp > 15 {
return PrecisionOverflow
}
if (coef & DROPMASK) != uint32(0) {
// these include subnormals and non-subnormals that dropped bits
return PrecisionInexact
}
if exp < -14 {
// Subnormals. Caller may want to test these further.
// There are 2046 subnormals that can successfully round-trip f32->f16->f32
// and 20 of those 2046 have 32-bit input coef == 0.
// RFC 7049 and 7049bis Draft 12 don't precisely define "preserves value"
// so some protocols and libraries will choose to handle subnormals differently
// when deciding to encode them to CBOR float32 vs float16.
return PrecisionUnknown
}
return PrecisionExact
}
// Frombits returns the float16 number corresponding to the IEEE 754 binary16
// representation u16, with the sign bit of u16 and the result in the same bit
// position. Frombits(Bits(x)) == x.
func Frombits(u16 uint16) Float16 {
return Float16(u16)
}
// Fromfloat32 returns a Float16 value converted from f32. Conversion uses
// IEEE default rounding (nearest int, with ties to even).
func Fromfloat32(f32 float32) Float16 {
return Float16(f32bitsToF16bits(math.Float32bits(f32)))
}
// ErrInvalidNaNValue indicates a NaN was not received.
const ErrInvalidNaNValue = float16Error("float16: invalid NaN value, expected IEEE 754 NaN")
type float16Error string
func (e float16Error) Error() string { return string(e) }
// FromNaN32ps converts nan to IEEE binary16 NaN while preserving both
// signaling and payload. Unlike Fromfloat32(), which can only return
// qNaN because it sets quiet bit = 1, this can return both sNaN and qNaN.
// If the result is infinity (sNaN with empty payload), then the
// lowest bit of payload is set to make the result a NaN.
// Returns ErrInvalidNaNValue and 0x7c01 (sNaN) if nan isn't IEEE 754 NaN.
// This function was kept simple to be able to inline.
func FromNaN32ps(nan float32) (Float16, error) {
const SNAN = Float16(uint16(0x7c01)) // signaling NaN
u32 := math.Float32bits(nan)
sign := u32 & 0x80000000
exp := u32 & 0x7f800000
coef := u32 & 0x007fffff
if (exp != 0x7f800000) || (coef == 0) {
return SNAN, ErrInvalidNaNValue
}
u16 := uint16((sign >> 16) | uint32(0x7c00) | (coef >> 13))
if (u16 & 0x03ff) == 0 {
// result became infinity, make it NaN by setting lowest bit in payload
u16 |= 0x0001
}
return Float16(u16), nil
}
// NaN returns a Float16 of IEEE 754 binary16 not-a-number (NaN).
// Returned NaN value 0x7e01 has all exponent bits = 1 with the
// first and last bits = 1 in the significand. This is consistent
// with Go's 64-bit math.NaN(). Canonical CBOR in RFC 7049 uses 0x7e00.
func NaN() Float16 {
return Float16(0x7e01)
}
// Inf returns a Float16 with an infinity value with the specified sign.
// A sign >= returns positive infinity.
// A sign < 0 returns negative infinity.
func Inf(sign int) Float16 {
if sign >= 0 {
return Float16(0x7c00)
}
return Float16(0x8000 | 0x7c00)
}
// Float32 returns a float32 converted from f (Float16).
// This is a lossless conversion.
func (f Float16) Float32() float32 {
u32 := f16bitsToF32bits(uint16(f))
return math.Float32frombits(u32)
}
// Bits returns the IEEE 754 binary16 representation of f, with the sign bit
// of f and the result in the same bit position. Bits(Frombits(x)) == x.
func (f Float16) Bits() uint16 {
return uint16(f)
}
// IsNaN reports whether f is an IEEE 754 binary16 “not-a-number” value.
func (f Float16) IsNaN() bool {
return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0)
}
// IsQuietNaN reports whether f is a quiet (non-signaling) IEEE 754 binary16
// “not-a-number” value.
func (f Float16) IsQuietNaN() bool {
return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) && (f&0x0200 != 0)
}
// IsInf reports whether f is an infinity (inf).
// A sign > 0 reports whether f is positive inf.
// A sign < 0 reports whether f is negative inf.
// A sign == 0 reports whether f is either inf.
func (f Float16) IsInf(sign int) bool {
return ((f == 0x7c00) && sign >= 0) ||
(f == 0xfc00 && sign <= 0)
}
// IsFinite returns true if f is neither infinite nor NaN.
func (f Float16) IsFinite() bool {
return (uint16(f) & uint16(0x7c00)) != uint16(0x7c00)
}
// IsNormal returns true if f is neither zero, infinite, subnormal, or NaN.
func (f Float16) IsNormal() bool {
exp := uint16(f) & uint16(0x7c00)
return (exp != uint16(0x7c00)) && (exp != 0)
}
// Signbit reports whether f is negative or negative zero.
func (f Float16) Signbit() bool {
return (uint16(f) & uint16(0x8000)) != 0
}
// String satisfies the fmt.Stringer interface.
func (f Float16) String() string {
return strconv.FormatFloat(float64(f.Float32()), 'f', -1, 32)
}
// f16bitsToF32bits returns uint32 (float32 bits) converted from specified uint16.
func f16bitsToF32bits(in uint16) uint32 {
// All 65536 conversions with this were confirmed to be correct
// by Montgomery Edwards⁴⁴⁸ (github.com/x448).
sign := uint32(in&0x8000) << 16 // sign for 32-bit
exp := uint32(in&0x7c00) >> 10 // exponenent for 16-bit
coef := uint32(in&0x03ff) << 13 // significand for 32-bit
if exp == 0x1f {
if coef == 0 {
// infinity
return sign | 0x7f800000 | coef
}
// NaN
return sign | 0x7fc00000 | coef
}
if exp == 0 {
if coef == 0 {
// zero
return sign
}
// normalize subnormal numbers
exp++
for coef&0x7f800000 == 0 {
coef <<= 1
exp--
}
coef &= 0x007fffff
}
return sign | ((exp + (0x7f - 0xf)) << 23) | coef
}
// f32bitsToF16bits returns uint16 (Float16 bits) converted from the specified float32.
// Conversion rounds to nearest integer with ties to even.
func f32bitsToF16bits(u32 uint32) uint16 {
// Translated from Rust to Go by Montgomery Edwards⁴⁴⁸ (github.com/x448).
// All 4294967296 conversions with this were confirmed to be correct by x448.
// Original Rust implementation is by Kathryn Long (github.com/starkat99) with MIT license.
sign := u32 & 0x80000000
exp := u32 & 0x7f800000
coef := u32 & 0x007fffff
if exp == 0x7f800000 {
// NaN or Infinity
nanBit := uint32(0)
if coef != 0 {
nanBit = uint32(0x0200)
}
return uint16((sign >> 16) | uint32(0x7c00) | nanBit | (coef >> 13))
}
halfSign := sign >> 16
unbiasedExp := int32(exp>>23) - 127
halfExp := unbiasedExp + 15
if halfExp >= 0x1f {
return uint16(halfSign | uint32(0x7c00))
}
if halfExp <= 0 {
if 14-halfExp > 24 {
return uint16(halfSign)
}
c := coef | uint32(0x00800000)
halfCoef := c >> uint32(14-halfExp)
roundBit := uint32(1) << uint32(13-halfExp)
if (c&roundBit) != 0 && (c&(3*roundBit-1)) != 0 {
halfCoef++
}
return uint16(halfSign | halfCoef)
}
uHalfExp := uint32(halfExp) << 10
halfCoef := coef >> 13
roundBit := uint32(0x00001000)
if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 {
return uint16((halfSign | uHalfExp | halfCoef) + 1)
}
return uint16(halfSign | uHalfExp | halfCoef)
}