/src/serenity/AK/FloatingPoint.h

Source
/*
 * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/BitCast.h>
#include <AK/StdLibExtras.h>
#include <AK/Types.h>

namespace AK {

template<typename T>
struct FloatExtractor;

#ifdef AK_HAS_FLOAT_128
// Bit-field view of an f128 with a 112-bit mantissa, a 15-bit exponent and a single sign bit.
// The static constants describe the field widths and the largest value each field can hold.
template<>
struct FloatExtractor<f128> {
    using ComponentType = unsigned __int128;

    static constexpr int mantissa_bits = 112;
    static constexpr int exponent_bits = 15;
    static constexpr int exponent_bias = (1 << (exponent_bits - 1)) - 1;
    static constexpr ComponentType mantissa_max = (static_cast<ComponentType>(1) << mantissa_bits) - 1;
    static constexpr unsigned exponent_max = (1u << exponent_bits) - 1;

    // Reinterprets the object representation of `f` as an extractor.
    static constexpr FloatExtractor from_float(f128 f)
    {
        return bit_cast<FloatExtractor>(f);
    }

    // Reinterprets the three fields as an f128 again.
    constexpr f128 to_float() const
    {
        return bit_cast<f128>(*this);
    }

    // Field order and the shared ComponentType determine the layout; do not reorder.
    ComponentType mantissa : mantissa_bits;
    ComponentType exponent : exponent_bits;
    ComponentType sign : 1;
};
// Validate that f128 and the FloatExtractor struct are 128 bits.
static_assert(AssertSize<f128, 16>());
static_assert(AssertSize<FloatExtractor<f128>, sizeof(f128)>());
#endif

#ifdef AK_HAS_FLOAT_80
// Bit-field view of an f80, modelled as a 64-bit mantissa, a 15-bit exponent and a single sign bit.
template<>
struct FloatExtractor<f80> {
    using ComponentType = unsigned long long;

    static constexpr int mantissa_bits = 64;
    static constexpr int exponent_bits = 15;
    static constexpr int exponent_bias = (1 << (exponent_bits - 1)) - 1;
    // The mantissa fills the whole ComponentType, so its maximum is the all-ones pattern.
    static constexpr ComponentType mantissa_max = ~static_cast<ComponentType>(0);
    static constexpr unsigned exponent_max = (1u << exponent_bits) - 1;

    // Reinterprets the object representation of `f` as an extractor.
    static constexpr FloatExtractor from_float(f80 f)
    {
        return bit_cast<FloatExtractor>(f);
    }

    // Reinterprets the three fields as an f80 again.
    constexpr f80 to_float() const
    {
        return bit_cast<f80>(*this);
    }

    // NOTE: Strictly speaking this layout is inaccurate. Extended precision values carry only 63 mantissa
    //       bits plus an explicit "integer bit" whose behaviour is odd and not IEEE-754 conformant.
    //       Bit-fiddling float code assumes IEEE floats throughout and cannot deal with that bit, so we
    //       treat f80 as an IEEE float with a 64-bit mantissa instead. Nearly everything works that way,
    //       and only a few special cases are needed.
    ComponentType mantissa : mantissa_bits;
    ComponentType exponent : exponent_bits;
    ComponentType sign : 1;
};
static_assert(AssertSize<FloatExtractor<f80>, sizeof(f80)>());
#endif

// Bit-field view of an f64 with a 52-bit mantissa, an 11-bit exponent and a single sign bit.
template<>
struct FloatExtractor<f64> {
    using ComponentType = unsigned long long;

    static constexpr int mantissa_bits = 52;
    static constexpr int exponent_bits = 11;
    static constexpr int exponent_bias = (1 << (exponent_bits - 1)) - 1;
    static constexpr ComponentType mantissa_max = (static_cast<ComponentType>(1) << mantissa_bits) - 1;
    static constexpr unsigned exponent_max = (1u << exponent_bits) - 1;

    // Reinterprets the object representation of `f` as an extractor.
    static constexpr FloatExtractor from_float(f64 f)
    {
        return bit_cast<FloatExtractor>(f);
    }

    // Reinterprets the three fields as an f64 again.
    constexpr f64 to_float() const
    {
        return bit_cast<f64>(*this);
    }

    // FIXME: All three fields must share one underlying type. With mixed types the struct
    //        stops being a bitfield that mirrors the f64 layout and turns into a multibyte
    //        mess on windows. '-mno-ms-bitfields' is meant to switch that behaviour off,
    //        but it has no effect with the msvc ABI.
    //        See <https://github.com/llvm/llvm-project/issues/24757>
    ComponentType mantissa : mantissa_bits;
    ComponentType exponent : exponent_bits;
    ComponentType sign : 1;
};
static_assert(AssertSize<FloatExtractor<f64>, sizeof(f64)>());

// Bit-field view of an f32 with a 23-bit mantissa, an 8-bit exponent and a single sign bit.
template<>
struct FloatExtractor<f32> {
    using ComponentType = unsigned;

    static constexpr int mantissa_bits = 23;
    static constexpr int exponent_bits = 8;
    static constexpr int exponent_bias = (1 << (exponent_bits - 1)) - 1;
    static constexpr ComponentType mantissa_max = (static_cast<ComponentType>(1) << mantissa_bits) - 1;
    static constexpr ComponentType exponent_max = (static_cast<ComponentType>(1) << exponent_bits) - 1;

    // Reinterprets the object representation of `f` as an extractor.
    static constexpr FloatExtractor from_float(f32 f)
    {
        return bit_cast<FloatExtractor>(f);
    }

    // Reinterprets the three fields as an f32 again.
    constexpr f32 to_float() const
    {
        return bit_cast<f32>(*this);
    }

    // Field order and the shared ComponentType determine the layout; do not reorder.
    ComponentType mantissa : mantissa_bits;
    ComponentType exponent : exponent_bits;
    ComponentType sign : 1;
};
static_assert(AssertSize<FloatExtractor<f32>, sizeof(f32)>());

/**
 * Holds the raw bit pattern of a floating point value that is described by S sign bits (0 or 1),
 * E exponent bits and M mantissa bits. The pattern is kept in a u64; the class only stores and
 * hands out bits, it never interprets them itself.
 */
template<size_t S, size_t E, size_t M>
requires(S <= 1 && E >= 1 && M >= 1 && (S + E + M) <= 64) class FloatingPointBits final {
public:
    // constexpr (and thereby implicitly inline) so the constants can be odr-used, e.g. bound to a
    // reference, without requiring an out-of-class definition.
    static constexpr size_t signbit = S;
    static constexpr size_t exponentbits = E;
    static constexpr size_t mantissabits = M;

    // Adopts an already encoded bit pattern from any unsigned integer of at most 64 bits.
    template<typename T>
    requires(IsIntegral<T> && IsUnsigned<T> && sizeof(T) <= 8) constexpr FloatingPointBits(T bits)
        : m_bits(bits)
    {
    }

    // Stores the 64-bit object representation of `value`.
    // NOTE: This constructor is available for every S/E/M combination and always stores all 64 bits.
    constexpr FloatingPointBits(double value)
        : m_bits(bit_cast<u64>(value))
    {
    }

    // Stores the 32-bit object representation of `value` in the low half of the pattern.
    // NOTE: This constructor is available for every S/E/M combination as well.
    constexpr FloatingPointBits(float value)
        : m_bits(bit_cast<u32>(value))
    {
    }

    // Reinterprets the stored pattern as a double; only offered for the 1/11/52 layout.
    // constexpr so that constexpr callers (convert_to_native_double) can be constant-evaluated.
    constexpr double as_double() const
    requires(S == 1 && E == 11 && M == 52)
    {
        return bit_cast<double>(m_bits);
    }

    // Reinterprets the low 32 bits of the stored pattern as a float; only offered for the 1/8/23 layout.
    // constexpr so that constexpr callers (convert_to_native_float) can be constant-evaluated.
    constexpr float as_float() const
    requires(S == 1 && E == 8 && M == 23)
    {
        return bit_cast<float>(static_cast<u32>(m_bits));
    }

    // Returns the stored bit pattern unchanged.
    // constexpr so that float_to_float() can be constant-evaluated.
    constexpr u64 bits() const { return m_bits; }

private:
    u64 m_bits;
};

// Bit layouts with 1 sign bit and 8/23 respectively 11/52 exponent/mantissa bits; these are the
// layouts for which FloatingPointBits offers as_float() and as_double().
using SingleFloatingPointBits = FloatingPointBits<1, 8, 23>;
using DoubleFloatingPointBits = FloatingPointBits<1, 11, 52>;

/**
 * Convert between two IEEE 754 floating point types in any arrangement of sign, exponent and mantissa bits.
 */
template<typename To, typename From>
constexpr To float_to_float(From const input)
{
    constexpr u64 from_exponent_nonnumber = (1ull << From::exponentbits) - 1;
    constexpr u64 from_exponent_bias = (1ull << (From::exponentbits - 1)) - 1;
    constexpr u64 to_exponent_nonnumber = (1ull << To::exponentbits) - 1;
    constexpr u64 to_exponent_bias = (1ull << (To::exponentbits - 1)) - 1;
    constexpr u64 to_exponent_max = (1ull << To::exponentbits) - 2;

    // Deconstruct input bits to float components
    u64 from_sign = (input.bits() >> (From::exponentbits + From::mantissabits)) & From::signbit;
    u64 from_exponent = (input.bits() >> From::mantissabits) & ((1ull << From::exponentbits) - 1);
    u64 from_mantissa = input.bits() & ((1ull << From::mantissabits) - 1);

    u64 to_sign = from_sign & To::signbit;
    u64 to_exponent;
    u64 to_mantissa;
    auto target_value = [&to_sign, &to_exponent, &to_mantissa]() {
        return To((to_sign << (To::exponentbits + To::mantissabits)) | (to_exponent << To::mantissabits) | to_mantissa);
    };

    auto shift_mantissa = [](u64 mantissa) -> u64 {
        if constexpr (From::mantissabits < To::mantissabits)
            return mantissa << (To::mantissabits - From::mantissabits);
        else
            return mantissa >> (From::mantissabits - To::mantissabits);
    };

    // If target is unsigned and source is negative, clamp to 0 or keep NaN
    if constexpr (To::signbit == 0) {
        if (from_sign == 1) {
            if (from_exponent == from_exponent_nonnumber && from_mantissa > 0) {
                to_exponent = to_exponent_nonnumber;
                to_mantissa = 1;
            } else {
                to_exponent = 0;
                to_mantissa = 0;
            }
            return target_value();
        }
    }

    // If the source floating point is denormalized;
    if (from_exponent == 0) {
        // If the source mantissa is 0, the value is +/-0
        if (from_mantissa == 0) {
            to_exponent = 0;
            to_mantissa = 0;
            return target_value();
        }

        // If the source has more exponent bits than the target, then the largest possible
        // source mantissa still cannot be represented in the target denormalized value.
        if constexpr (From::exponentbits > To::exponentbits) {
            to_exponent = 0;
            to_mantissa = 0;
            return target_value();
        }

        // If the source and target have the same number of exponent bits, we only need to
        // shift the mantissa.
        if constexpr (From::exponentbits == To::exponentbits) {
            to_exponent = 0;
            to_mantissa = shift_mantissa(from_mantissa);
            return target_value();
        }

        // The target has more exponent bits, so our denormalized value can be represented
        // as a normalized value in the target floating point. Normalized values have an
        // implicit leading 1, so we shift the mantissa left until we find our explicit
        // leading 1 which is then dropped.
        int adjust_exponent = -1;
        to_mantissa = from_mantissa;
        do {
            ++adjust_exponent;
            to_mantissa <<= 1;
        } while ((to_mantissa & (1ull << From::mantissabits)) == 0);
        to_exponent = to_exponent_bias - from_exponent_bias - adjust_exponent;

        // Drop the most significant bit from the mantissa
        to_mantissa &= (1ull << From::mantissabits) - 1;
        to_mantissa = shift_mantissa(to_mantissa);
        return target_value();
    }

    // If the source is NaN or +/-Inf, keep it that way
    if (from_exponent == from_exponent_nonnumber) {
        to_exponent = to_exponent_nonnumber;
        to_mantissa = (from_mantissa == 0) ? 0 : 1;
        return target_value();
    }

    // Determine the target exponent
    to_exponent = to_exponent_bias - from_exponent_bias + from_exponent;

    // If the calculated exponent exceeds the target's capacity, clamp both the exponent and the
    // mantissa to their maximum values.
    if (to_exponent > to_exponent_max) {
        to_exponent = to_exponent_max;
        to_mantissa = (1ull << To::mantissabits) - 1;
        return target_value();
    }

    // If the new exponent is less than 1, we can only represent this value as a denormalized number
    if (to_exponent < 1) {
        to_exponent = 0;

        // Add a leading 1 and shift the mantissa right
        int adjust_exponent = 1 - to_exponent_bias - from_exponent + from_exponent_bias;
        to_mantissa = ((1ull << From::mantissabits) | from_mantissa) >> adjust_exponent;
        to_mantissa = shift_mantissa(to_mantissa);
        return target_value();
    }

    // New exponent fits; shift the mantissa to fit as well
    to_mantissa = shift_mantissa(from_mantissa);
    return target_value();
}

template<typename O>
constexpr O convert_from_native_double(double input) { return float_to_float<O>(DoubleFloatingPointBits(input)); }

template<typename O>
constexpr O convert_from_native_float(float input) { return float_to_float<O>(SingleFloatingPointBits(input)); }

// Converts a value in the floating point layout I to the 1/11/52 layout by way of
// float_to_float() and hands the result back as a native double.
template<typename I>
constexpr double convert_to_native_double(I input)
{
    auto const converted = float_to_float<DoubleFloatingPointBits>(input);
    return converted.as_double();
}

// Converts a value in the floating point layout I to the 1/8/23 layout by way of
// float_to_float() and hands the result back as a native float.
template<typename I>
constexpr float convert_to_native_float(I input)
{
    auto const converted = float_to_float<SingleFloatingPointBits>(input);
    return converted.as_float();
}

}

#if USING_AK_GLOBALLY
using AK::DoubleFloatingPointBits;
using AK::FloatExtractor;
using AK::FloatingPointBits;
using AK::SingleFloatingPointBits;

using AK::convert_from_native_double;
using AK::convert_from_native_float;
using AK::convert_to_native_double;
using AK::convert_to_native_float;
using AK::float_to_float;
#endif

Coverage Report

Created: 2025-11-16 07:46

Line	Count	Source
1		/*
2		* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
3		*
4		* SPDX-License-Identifier: BSD-2-Clause
5		*/
6
7		#pragma once
8
9		#include <AK/BitCast.h>
10		#include <AK/StdLibExtras.h>
11		#include <AK/Types.h>
12
13		namespace AK {
14
15		template<typename T>
16		struct FloatExtractor;
17
18		#ifdef AK_HAS_FLOAT_128
19		template<>
20		struct FloatExtractor<f128> {
21		static constexpr FloatExtractor<f128> from_float(f128 f) { return bit_cast<FloatExtractor<f128>>(f); }
22		constexpr f128 to_float() const { return bit_cast<f128>(*this); }
23
24		using ComponentType = unsigned __int128;
25		static constexpr int mantissa_bits = 112;
26		static constexpr ComponentType mantissa_max = (((ComponentType)1) << 112) - 1;
27		static constexpr int exponent_bias = 16383;
28		static constexpr int exponent_bits = 15;
29		static constexpr unsigned exponent_max = 32767;
30
31		ComponentType mantissa : 112;
32		ComponentType exponent : 15;
33		ComponentType sign : 1;
34		};
35		// Validate that f128 and the FloatExtractor struct are 128 bits.
36		static_assert(AssertSize<f128, 16>());
37		static_assert(AssertSize<FloatExtractor<f128>, sizeof(f128)>());
38		#endif
39
40		#ifdef AK_HAS_FLOAT_80
41		template<>
42		struct FloatExtractor<f80> {
43	0	static constexpr FloatExtractor<f80> from_float(f80 f) { return bit_cast<FloatExtractor<f80>>(f); }
44	0	constexpr f80 to_float() const { return bit_cast<f80>(*this); }
45
46		using ComponentType = unsigned long long;
47		static constexpr int mantissa_bits = 64;
48		static constexpr ComponentType mantissa_max = ~0ull;
49		static constexpr int exponent_bias = 16383;
50		static constexpr int exponent_bits = 15;
51		static constexpr unsigned exponent_max = 32767;
52
53		// This is technically wrong: Extended floating point values really only have 63 bits of mantissa
54		// and an "integer bit" that behaves in various strange, unintuitive and non-IEEE-754 ways.
55		// However, since all bit-fiddling float code assumes IEEE floats, it cannot handle this properly.
56		// If we pretend that 80-bit floats are IEEE floats with 64-bit mantissas, almost everything works correctly
57		// and we just need a few special cases.
58		ComponentType mantissa : 64;
59		ComponentType exponent : 15;
60		ComponentType sign : 1;
61		};
62		static_assert(AssertSize<FloatExtractor<f80>, sizeof(f80)>());
63		#endif
64
65		template<>
66		struct FloatExtractor<f64> {
67	0	static constexpr FloatExtractor<f64> from_float(f64 f) { return bit_cast<FloatExtractor<f64>>(f); }
68	0	constexpr f64 to_float() const { return bit_cast<f64>(*this); }
69
70		using ComponentType = unsigned long long;
71		static constexpr int mantissa_bits = 52;
72		static constexpr ComponentType mantissa_max = (1ull << 52) - 1;
73		static constexpr int exponent_bias = 1023;
74		static constexpr int exponent_bits = 11;
75		static constexpr unsigned exponent_max = 2047;
76
77		// FIXME: These types have to all be the same, otherwise this struct
78		// goes from being a bitfield describing the layout of an f64
79		// into being a multibyte mess on windows.
80		// Technically, '-mno-ms-bitfields' is supposed to disable this
81		// very intuitive and portable behaviour on windows, but it doesn't
82		// work with the msvc ABI.
83		// See <https://github.com/llvm/llvm-project/issues/24757>
84		ComponentType mantissa : 52;
85		ComponentType exponent : 11;
86		ComponentType sign : 1;
87		};
88		static_assert(AssertSize<FloatExtractor<f64>, sizeof(f64)>());
89
90		template<>
91		struct FloatExtractor<f32> {
92	4.17M	static constexpr FloatExtractor<f32> from_float(f32 f) { return bit_cast<FloatExtractor<f32>>(f); }
93	570k	constexpr f32 to_float() const { return bit_cast<f32>(*this); }
94
95		using ComponentType = unsigned;
96		static constexpr int mantissa_bits = 23;
97		static constexpr ComponentType mantissa_max = (1 << 23) - 1;
98		static constexpr int exponent_bias = 127;
99		static constexpr int exponent_bits = 8;
100		static constexpr ComponentType exponent_max = 255;
101
102		ComponentType mantissa : 23;
103		ComponentType exponent : 8;
104		ComponentType sign : 1;
105		};
106		static_assert(AssertSize<FloatExtractor<f32>, sizeof(f32)>());
107
108		template<size_t S, size_t E, size_t M>
109		requires(S <= 1 && E >= 1 && M >= 1 && (S + E + M) <= 64) class FloatingPointBits final {
110		public:
111		static size_t const signbit = S;
112		static size_t const exponentbits = E;
113		static size_t const mantissabits = M;
114
115		template<typename T>
116		requires(IsIntegral<T> && IsUnsigned<T> && sizeof(T) <= 8) constexpr FloatingPointBits(T bits)
117		: m_bits(bits)
118		{
119		}
120
121		constexpr FloatingPointBits(double value)
122		: m_bits(bit_cast<u64>(value))
123		{
124		}
125
126		constexpr FloatingPointBits(float value)
127		: m_bits(bit_cast<u32>(value))
128		{
129		}
130
131		double as_double() const
132		requires(S == 1 && E == 11 && M == 52)
133		{
134		return bit_cast<double>(m_bits);
135		}
136		float as_float() const
137		requires(S == 1 && E == 8 && M == 23)
138		{
139		return bit_cast<float>(static_cast<u32>(m_bits));
140		}
141		u64 bits() const { return m_bits; }
142
143		private:
144		u64 m_bits;
145		};
146
147		typedef FloatingPointBits<1, 8, 23> SingleFloatingPointBits;
148		typedef FloatingPointBits<1, 11, 52> DoubleFloatingPointBits;
149
150		/**
151		* Convert between two IEEE 754 floating point types in any arrangement of sign, exponent and mantissa bits.
152		*/
153		template<typename To, typename From>
154		constexpr To float_to_float(From const input)
155		{
156		constexpr u64 from_exponent_nonnumber = (1ull << From::exponentbits) - 1;
157		constexpr u64 from_exponent_bias = (1ull << (From::exponentbits - 1)) - 1;
158		constexpr u64 to_exponent_nonnumber = (1ull << To::exponentbits) - 1;
159		constexpr u64 to_exponent_bias = (1ull << (To::exponentbits - 1)) - 1;
160		constexpr u64 to_exponent_max = (1ull << To::exponentbits) - 2;
161
162		// Deconstruct input bits to float components
163		u64 from_sign = (input.bits() >> (From::exponentbits + From::mantissabits)) & From::signbit;
164		u64 from_exponent = (input.bits() >> From::mantissabits) & ((1ull << From::exponentbits) - 1);
165		u64 from_mantissa = input.bits() & ((1ull << From::mantissabits) - 1);
166
167		u64 to_sign = from_sign & To::signbit;
168		u64 to_exponent;
169		u64 to_mantissa;
170		auto target_value = [&to_sign, &to_exponent, &to_mantissa]() {
171		return To((to_sign << (To::exponentbits + To::mantissabits)) \| (to_exponent << To::mantissabits) \| to_mantissa);
172		};
173
174		auto shift_mantissa = [](u64 mantissa) -> u64 {
175		if constexpr (From::mantissabits < To::mantissabits)
176		return mantissa << (To::mantissabits - From::mantissabits);
177		else
178		return mantissa >> (From::mantissabits - To::mantissabits);
179		};
180
181		// If target is unsigned and source is negative, clamp to 0 or keep NaN
182		if constexpr (To::signbit == 0) {
183		if (from_sign == 1) {
184		if (from_exponent == from_exponent_nonnumber && from_mantissa > 0) {
185		to_exponent = to_exponent_nonnumber;
186		to_mantissa = 1;
187		} else {
188		to_exponent = 0;
189		to_mantissa = 0;
190		}
191		return target_value();
192		}
193		}
194
195		// If the source floating point is denormalized;
196		if (from_exponent == 0) {
197		// If the source mantissa is 0, the value is +/-0
198		if (from_mantissa == 0) {
199		to_exponent = 0;
200		to_mantissa = 0;
201		return target_value();
202		}
203
204		// If the source has more exponent bits than the target, then the largest possible
205		// source mantissa still cannot be represented in the target denormalized value.
206		if constexpr (From::exponentbits > To::exponentbits) {
207		to_exponent = 0;
208		to_mantissa = 0;
209		return target_value();
210		}
211
212		// If the source and target have the same number of exponent bits, we only need to
213		// shift the mantissa.
214		if constexpr (From::exponentbits == To::exponentbits) {
215		to_exponent = 0;
216		to_mantissa = shift_mantissa(from_mantissa);
217		return target_value();
218		}
219
220		// The target has more exponent bits, so our denormalized value can be represented
221		// as a normalized value in the target floating point. Normalized values have an
222		// implicit leading 1, so we shift the mantissa left until we find our explicit
223		// leading 1 which is then dropped.
224		int adjust_exponent = -1;
225		to_mantissa = from_mantissa;
226		do {
227		++adjust_exponent;
228		to_mantissa <<= 1;
229		} while ((to_mantissa & (1ull << From::mantissabits)) == 0);
230		to_exponent = to_exponent_bias - from_exponent_bias - adjust_exponent;
231
232		// Drop the most significant bit from the mantissa
233		to_mantissa &= (1ull << From::mantissabits) - 1;
234		to_mantissa = shift_mantissa(to_mantissa);
235		return target_value();
236		}
237
238		// If the source is NaN or +/-Inf, keep it that way
239		if (from_exponent == from_exponent_nonnumber) {
240		to_exponent = to_exponent_nonnumber;
241		to_mantissa = (from_mantissa == 0) ? 0 : 1;
242		return target_value();
243		}
244
245		// Determine the target exponent
246		to_exponent = to_exponent_bias - from_exponent_bias + from_exponent;
247
248		// If the calculated exponent exceeds the target's capacity, clamp both the exponent and the
249		// mantissa to their maximum values.
250		if (to_exponent > to_exponent_max) {
251		to_exponent = to_exponent_max;
252		to_mantissa = (1ull << To::mantissabits) - 1;
253		return target_value();
254		}
255
256		// If the new exponent is less than 1, we can only represent this value as a denormalized number
257		if (to_exponent < 1) {
258		to_exponent = 0;
259
260		// Add a leading 1 and shift the mantissa right
261		int adjust_exponent = 1 - to_exponent_bias - from_exponent + from_exponent_bias;
262		to_mantissa = ((1ull << From::mantissabits) \| from_mantissa) >> adjust_exponent;
263		to_mantissa = shift_mantissa(to_mantissa);
264		return target_value();
265		}
266
267		// New exponent fits; shift the mantissa to fit as well
268		to_mantissa = shift_mantissa(from_mantissa);
269		return target_value();
270		}
271
272		template<typename O>
273		constexpr O convert_from_native_double(double input) { return float_to_float<O>(DoubleFloatingPointBits(input)); }
274
275		template<typename O>
276		constexpr O convert_from_native_float(float input) { return float_to_float<O>(SingleFloatingPointBits(input)); }
277
278		template<typename I>
279		constexpr double convert_to_native_double(I input) { return float_to_float<DoubleFloatingPointBits>(input).as_double(); }
280
281		template<typename I>
282		constexpr float convert_to_native_float(I input) { return float_to_float<SingleFloatingPointBits>(input).as_float(); }
283
284		}
285
286		#if USING_AK_GLOBALLY
287		using AK::DoubleFloatingPointBits;
288		using AK::FloatExtractor;
289		using AK::FloatingPointBits;
290		using AK::SingleFloatingPointBits;
291
292		using AK::convert_from_native_double;
293		using AK::convert_from_native_float;
294		using AK::convert_to_native_double;
295		using AK::convert_to_native_float;
296		using AK::float_to_float;
297		#endif