diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2025-01-02 11:58:29 +0100 |
|---|---|---|
| committer | Aleksandr Nogikh <nogikh@google.com> | 2025-01-22 13:17:53 +0000 |
| commit | 7512e6e7738143bd302d9b20cb1fd0d1d7af9643 (patch) | |
| tree | 67988d580d111bacbd009acfc0057f89aafa6522 /vendor/github.com/x448/float16/float16.go | |
| parent | 44f2ad31190603135f4ac758273f26111ca6003c (diff) | |
vendor: fetch the dependencies
Diffstat (limited to 'vendor/github.com/x448/float16/float16.go')
| -rw-r--r-- | vendor/github.com/x448/float16/float16.go | 302 |
1 files changed, 302 insertions, 0 deletions
diff --git a/vendor/github.com/x448/float16/float16.go b/vendor/github.com/x448/float16/float16.go new file mode 100644 index 000000000..1a0e6dad0 --- /dev/null +++ b/vendor/github.com/x448/float16/float16.go @@ -0,0 +1,302 @@ +// Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker +// +// Special thanks to Kathryn Long for her Rust implementation +// of float16 at github.com/starkat99/half-rs (MIT license) + +package float16 + +import ( + "math" + "strconv" +) + +// Float16 represents IEEE 754 half-precision floating-point numbers (binary16). +type Float16 uint16 + +// Precision indicates whether the conversion to Float16 is +// exact, subnormal without dropped bits, inexact, underflow, or overflow. +type Precision int + +const ( + + // PrecisionExact is for non-subnormals that don't drop bits during conversion. + // All of these can round-trip. Should always convert to float16. + PrecisionExact Precision = iota + + // PrecisionUnknown is for subnormals that don't drop bits during conversion but + // not all of these can round-trip so precision is unknown without more effort. + // Only 2046 of these can round-trip and the rest cannot round-trip. + PrecisionUnknown + + // PrecisionInexact is for dropped significand bits and cannot round-trip. + // Some of these are subnormals. Cannot round-trip float32->float16->float32. + PrecisionInexact + + // PrecisionUnderflow is for Underflows. Cannot round-trip float32->float16->float32. + PrecisionUnderflow + + // PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32. + PrecisionOverflow +) + +// PrecisionFromfloat32 returns Precision without performing +// the conversion. Conversions from both Infinity and NaN +// values will always report PrecisionExact even if NaN payload +// or NaN-Quiet-Bit is lost. This function is kept simple to +// allow inlining and run < 0.5 ns/op, to serve as a fast filter. +func PrecisionFromfloat32(f32 float32) Precision { + u32 := math.Float32bits(f32) + + if u32 == 0 || u32 == 0x80000000 { + // +- zero will always be exact conversion + return PrecisionExact + } + + const COEFMASK uint32 = 0x7fffff // 23 least significant bits + const EXPSHIFT uint32 = 23 + const EXPBIAS uint32 = 127 + const EXPMASK uint32 = uint32(0xff) << EXPSHIFT + const DROPMASK uint32 = COEFMASK >> 10 + + exp := int32(((u32 & EXPMASK) >> EXPSHIFT) - EXPBIAS) + coef := u32 & COEFMASK + + if exp == 128 { + // +- infinity or NaN + // apps may want to do extra checks for NaN separately + return PrecisionExact + } + + // https://en.wikipedia.org/wiki/Half-precision_floating-point_format says, + // "Decimals between 2^−24 (minimum positive subnormal) and 2^−14 (maximum subnormal): fixed interval 2^−24" + if exp < -24 { + return PrecisionUnderflow + } + if exp > 15 { + return PrecisionOverflow + } + if (coef & DROPMASK) != uint32(0) { + // these include subnormals and non-subnormals that dropped bits + return PrecisionInexact + } + + if exp < -14 { + // Subnormals. Caller may want to test these further. + // There are 2046 subnormals that can successfully round-trip f32->f16->f32 + // and 20 of those 2046 have 32-bit input coef == 0. + // RFC 7049 and 7049bis Draft 12 don't precisely define "preserves value" + // so some protocols and libraries will choose to handle subnormals differently + // when deciding to encode them to CBOR float32 vs float16. + return PrecisionUnknown + } + + return PrecisionExact +} + +// Frombits returns the float16 number corresponding to the IEEE 754 binary16 +// representation u16, with the sign bit of u16 and the result in the same bit +// position. Frombits(Bits(x)) == x. +func Frombits(u16 uint16) Float16 { + return Float16(u16) +} + +// Fromfloat32 returns a Float16 value converted from f32. Conversion uses +// IEEE default rounding (nearest int, with ties to even). +func Fromfloat32(f32 float32) Float16 { + return Float16(f32bitsToF16bits(math.Float32bits(f32))) +} + +// ErrInvalidNaNValue indicates a NaN was not received. +const ErrInvalidNaNValue = float16Error("float16: invalid NaN value, expected IEEE 754 NaN") + +type float16Error string + +func (e float16Error) Error() string { return string(e) } + +// FromNaN32ps converts nan to IEEE binary16 NaN while preserving both +// signaling and payload. Unlike Fromfloat32(), which can only return +// qNaN because it sets quiet bit = 1, this can return both sNaN and qNaN. +// If the result is infinity (sNaN with empty payload), then the +// lowest bit of payload is set to make the result a NaN. +// Returns ErrInvalidNaNValue and 0x7c01 (sNaN) if nan isn't IEEE 754 NaN. +// This function was kept simple to be able to inline. +func FromNaN32ps(nan float32) (Float16, error) { + const SNAN = Float16(uint16(0x7c01)) // signalling NaN + + u32 := math.Float32bits(nan) + sign := u32 & 0x80000000 + exp := u32 & 0x7f800000 + coef := u32 & 0x007fffff + + if (exp != 0x7f800000) || (coef == 0) { + return SNAN, ErrInvalidNaNValue + } + + u16 := uint16((sign >> 16) | uint32(0x7c00) | (coef >> 13)) + + if (u16 & 0x03ff) == 0 { + // result became infinity, make it NaN by setting lowest bit in payload + u16 = u16 | 0x0001 + } + + return Float16(u16), nil +} + +// NaN returns a Float16 of IEEE 754 binary16 not-a-number (NaN). +// Returned NaN value 0x7e01 has all exponent bits = 1 with the +// first and last bits = 1 in the significand. This is consistent +// with Go's 64-bit math.NaN(). Canonical CBOR in RFC 7049 uses 0x7e00. +func NaN() Float16 { + return Float16(0x7e01) +} + +// Inf returns a Float16 with an infinity value with the specified sign. +// A sign >= returns positive infinity. +// A sign < 0 returns negative infinity. +func Inf(sign int) Float16 { + if sign >= 0 { + return Float16(0x7c00) + } + return Float16(0x8000 | 0x7c00) +} + +// Float32 returns a float32 converted from f (Float16). +// This is a lossless conversion. +func (f Float16) Float32() float32 { + u32 := f16bitsToF32bits(uint16(f)) + return math.Float32frombits(u32) +} + +// Bits returns the IEEE 754 binary16 representation of f, with the sign bit +// of f and the result in the same bit position. Bits(Frombits(x)) == x. +func (f Float16) Bits() uint16 { + return uint16(f) +} + +// IsNaN reports whether f is an IEEE 754 binary16 “not-a-number” value. +func (f Float16) IsNaN() bool { + return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) +} + +// IsQuietNaN reports whether f is a quiet (non-signaling) IEEE 754 binary16 +// “not-a-number” value. +func (f Float16) IsQuietNaN() bool { + return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) && (f&0x0200 != 0) +} + +// IsInf reports whether f is an infinity (inf). +// A sign > 0 reports whether f is positive inf. +// A sign < 0 reports whether f is negative inf. +// A sign == 0 reports whether f is either inf. +func (f Float16) IsInf(sign int) bool { + return ((f == 0x7c00) && sign >= 0) || + (f == 0xfc00 && sign <= 0) +} + +// IsFinite returns true if f is neither infinite nor NaN. +func (f Float16) IsFinite() bool { + return (uint16(f) & uint16(0x7c00)) != uint16(0x7c00) +} + +// IsNormal returns true if f is neither zero, infinite, subnormal, or NaN. +func (f Float16) IsNormal() bool { + exp := uint16(f) & uint16(0x7c00) + return (exp != uint16(0x7c00)) && (exp != 0) +} + +// Signbit reports whether f is negative or negative zero. +func (f Float16) Signbit() bool { + return (uint16(f) & uint16(0x8000)) != 0 +} + +// String satisfies the fmt.Stringer interface. +func (f Float16) String() string { + return strconv.FormatFloat(float64(f.Float32()), 'f', -1, 32) +} + +// f16bitsToF32bits returns uint32 (float32 bits) converted from specified uint16. +func f16bitsToF32bits(in uint16) uint32 { + // All 65536 conversions with this were confirmed to be correct + // by Montgomery Edwards⁴⁴⁸ (github.com/x448). + + sign := uint32(in&0x8000) << 16 // sign for 32-bit + exp := uint32(in&0x7c00) >> 10 // exponenent for 16-bit + coef := uint32(in&0x03ff) << 13 // significand for 32-bit + + if exp == 0x1f { + if coef == 0 { + // infinity + return sign | 0x7f800000 | coef + } + // NaN + return sign | 0x7fc00000 | coef + } + + if exp == 0 { + if coef == 0 { + // zero + return sign + } + + // normalize subnormal numbers + exp++ + for coef&0x7f800000 == 0 { + coef <<= 1 + exp-- + } + coef &= 0x007fffff + } + + return sign | ((exp + (0x7f - 0xf)) << 23) | coef +} + +// f32bitsToF16bits returns uint16 (Float16 bits) converted from the specified float32. +// Conversion rounds to nearest integer with ties to even. +func f32bitsToF16bits(u32 uint32) uint16 { + // Translated from Rust to Go by Montgomery Edwards⁴⁴⁸ (github.com/x448). + // All 4294967296 conversions with this were confirmed to be correct by x448. + // Original Rust implementation is by Kathryn Long (github.com/starkat99) with MIT license. + + sign := u32 & 0x80000000 + exp := u32 & 0x7f800000 + coef := u32 & 0x007fffff + + if exp == 0x7f800000 { + // NaN or Infinity + nanBit := uint32(0) + if coef != 0 { + nanBit = uint32(0x0200) + } + return uint16((sign >> 16) | uint32(0x7c00) | nanBit | (coef >> 13)) + } + + halfSign := sign >> 16 + + unbiasedExp := int32(exp>>23) - 127 + halfExp := unbiasedExp + 15 + + if halfExp >= 0x1f { + return uint16(halfSign | uint32(0x7c00)) + } + + if halfExp <= 0 { + if 14-halfExp > 24 { + return uint16(halfSign) + } + coef := coef | uint32(0x00800000) + halfCoef := coef >> uint32(14-halfExp) + roundBit := uint32(1) << uint32(13-halfExp) + if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 { + halfCoef++ + } + return uint16(halfSign | halfCoef) + } + + uHalfExp := uint32(halfExp) << 10 + halfCoef := coef >> 13 + roundBit := uint32(0x00001000) + if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 { + return uint16((halfSign | uHalfExp | halfCoef) + 1) + } + return uint16(halfSign | uHalfExp | halfCoef) +} |
