/src/serenity/AK/FloatingPoint.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #pragma once |
8 | | |
9 | | #include <AK/BitCast.h> |
10 | | #include <AK/StdLibExtras.h> |
11 | | #include <AK/Types.h> |
12 | | |
13 | | namespace AK { |
14 | | |
// Primary template; it is only declared here and defined through the explicit
// specializations for the individual floating point types that follow.
template<typename>
struct FloatExtractor;
17 | | |
#ifdef AK_HAS_FLOAT_128
// Splits an f128 into mantissa, exponent and sign bit-fields by bit_cast-ing
// the value to and from this struct.
template<>
struct FloatExtractor<f128> {
    using ComponentType = unsigned __int128;

    // Field widths; the remaining constants are derived from them.
    static constexpr int mantissa_bits = 112;
    static constexpr int exponent_bits = 15;

    static constexpr int exponent_bias = (1 << (exponent_bits - 1)) - 1;
    static constexpr unsigned exponent_max = (1u << exponent_bits) - 1;
    static constexpr ComponentType mantissa_max = (static_cast<ComponentType>(1) << mantissa_bits) - 1;

    // Reinterpret the bits of a float as its components, and back again.
    static constexpr FloatExtractor from_float(f128 f) { return bit_cast<FloatExtractor>(f); }
    constexpr f128 to_float() const { return bit_cast<f128>(*this); }

    ComponentType mantissa : mantissa_bits;
    ComponentType exponent : exponent_bits;
    ComponentType sign : 1;
};
// f128 must be 16 bytes, and the extractor must be exactly as large as f128.
static_assert(AssertSize<f128, 16>());
static_assert(AssertSize<FloatExtractor<f128>, sizeof(f128)>());
#endif
39 | | |
#ifdef AK_HAS_FLOAT_80
// Splits an f80 into mantissa, exponent and sign bit-fields by bit_cast-ing
// the value to and from this struct.
template<>
struct FloatExtractor<f80> {
    using ComponentType = unsigned long long;

    // Field widths; the remaining constants are derived from them.
    static constexpr int mantissa_bits = 64;
    static constexpr int exponent_bits = 15;

    static constexpr int exponent_bias = (1 << (exponent_bits - 1)) - 1;
    static constexpr unsigned exponent_max = (1u << exponent_bits) - 1;
    // All 64 mantissa bits set.
    static constexpr ComponentType mantissa_max = ~static_cast<ComponentType>(0);

    // Reinterpret the bits of a float as its components, and back again.
    static constexpr FloatExtractor from_float(f80 f) { return bit_cast<FloatExtractor>(f); }
    constexpr f80 to_float() const { return bit_cast<f80>(*this); }

    // Strictly speaking this layout is inaccurate: an extended precision value has 63 mantissa bits
    // plus an "integer bit" whose behaviour is odd, unintuitive and not IEEE 754 conformant.
    // Bit-twiddling float code assumes IEEE floats throughout and cannot model that bit, so we
    // pretend that 80-bit floats are IEEE floats with a 64-bit mantissa instead. Nearly everything
    // then works as expected, and only a handful of special cases are required.
    ComponentType mantissa : mantissa_bits;
    ComponentType exponent : exponent_bits;
    ComponentType sign : 1;
};
// The extractor must be exactly as large as f80.
static_assert(AssertSize<FloatExtractor<f80>, sizeof(f80)>());
#endif
64 | | |
65 | | template<> |
66 | | struct FloatExtractor<f64> { |
67 | 0 | static constexpr FloatExtractor<f64> from_float(f64 f) { return bit_cast<FloatExtractor<f64>>(f); } |
68 | 0 | constexpr f64 to_float() const { return bit_cast<f64>(*this); } |
69 | | |
70 | | using ComponentType = unsigned long long; |
71 | | static constexpr int mantissa_bits = 52; |
72 | | static constexpr ComponentType mantissa_max = (1ull << 52) - 1; |
73 | | static constexpr int exponent_bias = 1023; |
74 | | static constexpr int exponent_bits = 11; |
75 | | static constexpr unsigned exponent_max = 2047; |
76 | | |
77 | | // FIXME: These types have to all be the same, otherwise this struct |
78 | | // goes from being a bitfield describing the layout of an f64 |
79 | | // into being a multibyte mess on windows. |
80 | | // Technically, '-mno-ms-bitfields' is supposed to disable this |
81 | | // very intuitive and portable behaviour on windows, but it doesn't |
82 | | // work with the msvc ABI. |
83 | | // See <https://github.com/llvm/llvm-project/issues/24757> |
84 | | ComponentType mantissa : 52; |
85 | | ComponentType exponent : 11; |
86 | | ComponentType sign : 1; |
87 | | }; |
88 | | static_assert(AssertSize<FloatExtractor<f64>, sizeof(f64)>()); |
89 | | |
90 | | template<> |
91 | | struct FloatExtractor<f32> { |
92 | 4.17M | static constexpr FloatExtractor<f32> from_float(f32 f) { return bit_cast<FloatExtractor<f32>>(f); } |
93 | 570k | constexpr f32 to_float() const { return bit_cast<f32>(*this); } |
94 | | |
95 | | using ComponentType = unsigned; |
96 | | static constexpr int mantissa_bits = 23; |
97 | | static constexpr ComponentType mantissa_max = (1 << 23) - 1; |
98 | | static constexpr int exponent_bias = 127; |
99 | | static constexpr int exponent_bits = 8; |
100 | | static constexpr ComponentType exponent_max = 255; |
101 | | |
102 | | ComponentType mantissa : 23; |
103 | | ComponentType exponent : 8; |
104 | | ComponentType sign : 1; |
105 | | }; |
106 | | static_assert(AssertSize<FloatExtractor<f32>, sizeof(f32)>()); |
107 | | |
108 | | template<size_t S, size_t E, size_t M> |
109 | | requires(S <= 1 && E >= 1 && M >= 1 && (S + E + M) <= 64) class FloatingPointBits final { |
110 | | public: |
111 | | static size_t const signbit = S; |
112 | | static size_t const exponentbits = E; |
113 | | static size_t const mantissabits = M; |
114 | | |
115 | | template<typename T> |
116 | | requires(IsIntegral<T> && IsUnsigned<T> && sizeof(T) <= 8) constexpr FloatingPointBits(T bits) |
117 | | : m_bits(bits) |
118 | | { |
119 | | } |
120 | | |
121 | | constexpr FloatingPointBits(double value) |
122 | | : m_bits(bit_cast<u64>(value)) |
123 | | { |
124 | | } |
125 | | |
126 | | constexpr FloatingPointBits(float value) |
127 | | : m_bits(bit_cast<u32>(value)) |
128 | | { |
129 | | } |
130 | | |
131 | | double as_double() const |
132 | | requires(S == 1 && E == 11 && M == 52) |
133 | | { |
134 | | return bit_cast<double>(m_bits); |
135 | | } |
136 | | float as_float() const |
137 | | requires(S == 1 && E == 8 && M == 23) |
138 | | { |
139 | | return bit_cast<float>(static_cast<u32>(m_bits)); |
140 | | } |
141 | | u64 bits() const { return m_bits; } |
142 | | |
143 | | private: |
144 | | u64 m_bits; |
145 | | }; |
146 | | |
147 | | typedef FloatingPointBits<1, 8, 23> SingleFloatingPointBits; |
148 | | typedef FloatingPointBits<1, 11, 52> DoubleFloatingPointBits; |
149 | | |
150 | | /** |
151 | | * Convert between two IEEE 754 floating point types in any arrangement of sign, exponent and mantissa bits. |
152 | | */ |
153 | | template<typename To, typename From> |
154 | | constexpr To float_to_float(From const input) |
155 | | { |
156 | | constexpr u64 from_exponent_nonnumber = (1ull << From::exponentbits) - 1; |
157 | | constexpr u64 from_exponent_bias = (1ull << (From::exponentbits - 1)) - 1; |
158 | | constexpr u64 to_exponent_nonnumber = (1ull << To::exponentbits) - 1; |
159 | | constexpr u64 to_exponent_bias = (1ull << (To::exponentbits - 1)) - 1; |
160 | | constexpr u64 to_exponent_max = (1ull << To::exponentbits) - 2; |
161 | | |
162 | | // Deconstruct input bits to float components |
163 | | u64 from_sign = (input.bits() >> (From::exponentbits + From::mantissabits)) & From::signbit; |
164 | | u64 from_exponent = (input.bits() >> From::mantissabits) & ((1ull << From::exponentbits) - 1); |
165 | | u64 from_mantissa = input.bits() & ((1ull << From::mantissabits) - 1); |
166 | | |
167 | | u64 to_sign = from_sign & To::signbit; |
168 | | u64 to_exponent; |
169 | | u64 to_mantissa; |
170 | | auto target_value = [&to_sign, &to_exponent, &to_mantissa]() { |
171 | | return To((to_sign << (To::exponentbits + To::mantissabits)) | (to_exponent << To::mantissabits) | to_mantissa); |
172 | | }; |
173 | | |
174 | | auto shift_mantissa = [](u64 mantissa) -> u64 { |
175 | | if constexpr (From::mantissabits < To::mantissabits) |
176 | | return mantissa << (To::mantissabits - From::mantissabits); |
177 | | else |
178 | | return mantissa >> (From::mantissabits - To::mantissabits); |
179 | | }; |
180 | | |
181 | | // If target is unsigned and source is negative, clamp to 0 or keep NaN |
182 | | if constexpr (To::signbit == 0) { |
183 | | if (from_sign == 1) { |
184 | | if (from_exponent == from_exponent_nonnumber && from_mantissa > 0) { |
185 | | to_exponent = to_exponent_nonnumber; |
186 | | to_mantissa = 1; |
187 | | } else { |
188 | | to_exponent = 0; |
189 | | to_mantissa = 0; |
190 | | } |
191 | | return target_value(); |
192 | | } |
193 | | } |
194 | | |
195 | | // If the source floating point is denormalized; |
196 | | if (from_exponent == 0) { |
197 | | // If the source mantissa is 0, the value is +/-0 |
198 | | if (from_mantissa == 0) { |
199 | | to_exponent = 0; |
200 | | to_mantissa = 0; |
201 | | return target_value(); |
202 | | } |
203 | | |
204 | | // If the source has more exponent bits than the target, then the largest possible |
205 | | // source mantissa still cannot be represented in the target denormalized value. |
206 | | if constexpr (From::exponentbits > To::exponentbits) { |
207 | | to_exponent = 0; |
208 | | to_mantissa = 0; |
209 | | return target_value(); |
210 | | } |
211 | | |
212 | | // If the source and target have the same number of exponent bits, we only need to |
213 | | // shift the mantissa. |
214 | | if constexpr (From::exponentbits == To::exponentbits) { |
215 | | to_exponent = 0; |
216 | | to_mantissa = shift_mantissa(from_mantissa); |
217 | | return target_value(); |
218 | | } |
219 | | |
220 | | // The target has more exponent bits, so our denormalized value can be represented |
221 | | // as a normalized value in the target floating point. Normalized values have an |
222 | | // implicit leading 1, so we shift the mantissa left until we find our explicit |
223 | | // leading 1 which is then dropped. |
224 | | int adjust_exponent = -1; |
225 | | to_mantissa = from_mantissa; |
226 | | do { |
227 | | ++adjust_exponent; |
228 | | to_mantissa <<= 1; |
229 | | } while ((to_mantissa & (1ull << From::mantissabits)) == 0); |
230 | | to_exponent = to_exponent_bias - from_exponent_bias - adjust_exponent; |
231 | | |
232 | | // Drop the most significant bit from the mantissa |
233 | | to_mantissa &= (1ull << From::mantissabits) - 1; |
234 | | to_mantissa = shift_mantissa(to_mantissa); |
235 | | return target_value(); |
236 | | } |
237 | | |
238 | | // If the source is NaN or +/-Inf, keep it that way |
239 | | if (from_exponent == from_exponent_nonnumber) { |
240 | | to_exponent = to_exponent_nonnumber; |
241 | | to_mantissa = (from_mantissa == 0) ? 0 : 1; |
242 | | return target_value(); |
243 | | } |
244 | | |
245 | | // Determine the target exponent |
246 | | to_exponent = to_exponent_bias - from_exponent_bias + from_exponent; |
247 | | |
248 | | // If the calculated exponent exceeds the target's capacity, clamp both the exponent and the |
249 | | // mantissa to their maximum values. |
250 | | if (to_exponent > to_exponent_max) { |
251 | | to_exponent = to_exponent_max; |
252 | | to_mantissa = (1ull << To::mantissabits) - 1; |
253 | | return target_value(); |
254 | | } |
255 | | |
256 | | // If the new exponent is less than 1, we can only represent this value as a denormalized number |
257 | | if (to_exponent < 1) { |
258 | | to_exponent = 0; |
259 | | |
260 | | // Add a leading 1 and shift the mantissa right |
261 | | int adjust_exponent = 1 - to_exponent_bias - from_exponent + from_exponent_bias; |
262 | | to_mantissa = ((1ull << From::mantissabits) | from_mantissa) >> adjust_exponent; |
263 | | to_mantissa = shift_mantissa(to_mantissa); |
264 | | return target_value(); |
265 | | } |
266 | | |
267 | | // New exponent fits; shift the mantissa to fit as well |
268 | | to_mantissa = shift_mantissa(from_mantissa); |
269 | | return target_value(); |
270 | | } |
271 | | |
272 | | template<typename O> |
273 | | constexpr O convert_from_native_double(double input) { return float_to_float<O>(DoubleFloatingPointBits(input)); } |
274 | | |
275 | | template<typename O> |
276 | | constexpr O convert_from_native_float(float input) { return float_to_float<O>(SingleFloatingPointBits(input)); } |
277 | | |
278 | | template<typename I> |
279 | | constexpr double convert_to_native_double(I input) { return float_to_float<DoubleFloatingPointBits>(input).as_double(); } |
280 | | |
281 | | template<typename I> |
282 | | constexpr float convert_to_native_float(I input) { return float_to_float<SingleFloatingPointBits>(input).as_float(); } |
283 | | |
284 | | } |
285 | | |
286 | | #if USING_AK_GLOBALLY |
287 | | using AK::DoubleFloatingPointBits; |
288 | | using AK::FloatExtractor; |
289 | | using AK::FloatingPointBits; |
290 | | using AK::SingleFloatingPointBits; |
291 | | |
292 | | using AK::convert_from_native_double; |
293 | | using AK::convert_from_native_float; |
294 | | using AK::convert_to_native_double; |
295 | | using AK::convert_to_native_float; |
296 | | using AK::float_to_float; |
297 | | #endif |