/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.27/src/dyadic_float.rs
Line | Count | Source |
1 | | /* |
2 | | * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved. |
3 | | * // |
4 | | * // Redistribution and use in source and binary forms, with or without modification, |
5 | | * // are permitted provided that the following conditions are met: |
6 | | * // |
7 | | * // 1. Redistributions of source code must retain the above copyright notice, this |
8 | | * // list of conditions and the following disclaimer. |
9 | | * // |
10 | | * // 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | * // this list of conditions and the following disclaimer in the documentation |
12 | | * // and/or other materials provided with the distribution. |
13 | | * // |
14 | | * // 3. Neither the name of the copyright holder nor the names of its |
15 | | * // contributors may be used to endorse or promote products derived from |
16 | | * // this software without specific prior written permission. |
17 | | * // |
18 | | * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
19 | | * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
22 | | * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | | * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
25 | | * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
26 | | * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | use crate::bits::EXP_MASK; |
30 | | use crate::common::f_fmla; |
31 | | use std::ops::{Add, Mul, Sub}; |
32 | | |
33 | | #[repr(u8)] |
34 | | #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)] |
35 | | pub(crate) enum DyadicSign { |
36 | | Pos = 0, |
37 | | Neg = 1, |
38 | | } |
39 | | |
40 | | impl DyadicSign { |
41 | | #[inline] |
42 | 0 | pub(crate) fn negate(self) -> Self { |
43 | 0 | match self { |
44 | 0 | DyadicSign::Pos => DyadicSign::Neg, |
45 | 0 | DyadicSign::Neg => DyadicSign::Pos, |
46 | | } |
47 | 0 | } |
48 | | |
49 | | #[inline] |
50 | 0 | pub(crate) const fn to_bit(self) -> u8 { |
51 | 0 | match self { |
52 | 0 | DyadicSign::Pos => 0, |
53 | 0 | DyadicSign::Neg => 1, |
54 | | } |
55 | 0 | } |
56 | | |
57 | | #[inline] |
58 | 0 | pub(crate) const fn mult(self, rhs: Self) -> Self { |
59 | 0 | if (self as u8) ^ (rhs as u8) != 0 { |
60 | 0 | DyadicSign::Neg |
61 | | } else { |
62 | 0 | DyadicSign::Pos |
63 | | } |
64 | 0 | } |
65 | | } |
66 | | |
67 | | const BITS: u32 = 128; |
68 | | |
69 | | #[derive(Copy, Clone, Debug)] |
70 | | pub(crate) struct DyadicFloat128 { |
71 | | pub(crate) sign: DyadicSign, |
72 | | pub(crate) exponent: i16, |
73 | | pub(crate) mantissa: u128, |
74 | | } |
75 | | |
76 | | #[inline] |
77 | 0 | pub(crate) const fn f64_from_parts(sign: DyadicSign, exp: u64, mantissa: u64) -> f64 { |
78 | 0 | let r_sign = (if sign.to_bit() == 0 { 0u64 } else { 1u64 }).wrapping_shl(63); |
79 | 0 | let r_exp = exp.wrapping_shl(52); |
80 | 0 | f64::from_bits(r_sign | r_exp | mantissa) |
81 | 0 | } |
82 | | |
83 | | #[inline] |
84 | 0 | pub(crate) fn mulhi_u128(a: u128, b: u128) -> u128 { |
85 | 0 | let a_lo = a as u64 as u128; |
86 | 0 | let a_hi = (a >> 64) as u64 as u128; |
87 | 0 | let b_lo = b as u64 as u128; |
88 | 0 | let b_hi = (b >> 64) as u64 as u128; |
89 | | |
90 | 0 | let lo_lo = a_lo * b_lo; |
91 | 0 | let lo_hi = a_lo * b_hi; |
92 | 0 | let hi_lo = a_hi * b_lo; |
93 | 0 | let hi_hi = a_hi * b_hi; |
94 | | |
95 | 0 | let carry = (lo_lo >> 64) |
96 | 0 | .wrapping_add(lo_hi & 0xffff_ffff_ffff_ffff) |
97 | 0 | .wrapping_add(hi_lo & 0xffff_ffff_ffff_ffff); |
98 | 0 | let mid = (lo_hi >> 64) |
99 | 0 | .wrapping_add(hi_lo >> 64) |
100 | 0 | .wrapping_add(carry >> 64); |
101 | | |
102 | 0 | hi_hi.wrapping_add(mid) |
103 | 0 | } |
104 | | |
105 | | #[inline] |
106 | 0 | const fn explicit_exponent(x: f64) -> i16 { |
107 | 0 | let exp = ((x.to_bits() >> 52) & ((1u64 << 11) - 1u64)) as i16 - 1023; |
108 | 0 | if x == 0. { |
109 | 0 | return 0; |
110 | 0 | } else if x.is_subnormal() { |
111 | | const EXP_BIAS: u64 = (1u64 << (11 - 1u64)) - 1u64; |
112 | 0 | return 1i16 - EXP_BIAS as i16; |
113 | 0 | } |
114 | 0 | exp |
115 | 0 | } |
116 | | |
117 | | #[inline] |
118 | 0 | const fn explicit_mantissa(x: f64) -> u64 { |
119 | | const MASK: u64 = (1u64 << 52) - 1; |
120 | 0 | let sig_bits = x.to_bits() & MASK; |
121 | 0 | if x.is_subnormal() || x == 0. { |
122 | 0 | return sig_bits; |
123 | 0 | } |
124 | 0 | (1u64 << 52) | sig_bits |
125 | 0 | } |
126 | | |
127 | | impl DyadicFloat128 { |
128 | | #[inline] |
129 | 0 | pub(crate) const fn zero() -> Self { |
130 | 0 | Self { |
131 | 0 | sign: DyadicSign::Pos, |
132 | 0 | exponent: 0, |
133 | 0 | mantissa: 0, |
134 | 0 | } |
135 | 0 | } |
136 | | |
137 | | #[inline] |
138 | 0 | pub(crate) const fn new_from_f64(x: f64) -> Self { |
139 | 0 | let sign = if x.is_sign_negative() { |
140 | 0 | DyadicSign::Neg |
141 | | } else { |
142 | 0 | DyadicSign::Pos |
143 | | }; |
144 | 0 | let exponent = explicit_exponent(x) - 52; |
145 | 0 | let mantissa = explicit_mantissa(x) as u128; |
146 | 0 | let mut new_val = Self { |
147 | 0 | sign, |
148 | 0 | exponent, |
149 | 0 | mantissa, |
150 | 0 | }; |
151 | 0 | new_val.normalize(); |
152 | 0 | new_val |
153 | 0 | } |
154 | | |
155 | | #[inline] |
156 | 0 | pub(crate) fn new(sign: DyadicSign, exponent: i16, mantissa: u128) -> Self { |
157 | 0 | let mut new_item = DyadicFloat128 { |
158 | 0 | sign, |
159 | 0 | exponent, |
160 | 0 | mantissa, |
161 | 0 | }; |
162 | 0 | new_item.normalize(); |
163 | 0 | new_item |
164 | 0 | } |
165 | | |
166 | | #[inline] |
167 | 0 | pub(crate) fn accurate_reciprocal(a: f64) -> Self { |
168 | 0 | let mut r = DyadicFloat128::new_from_f64(4.0 / a); /* accurate to about 53 bits */ |
169 | 0 | r.exponent -= 2; |
170 | | /* we use Newton's iteration: r -> r + r*(1-a*r) */ |
171 | 0 | let ba = DyadicFloat128::new_from_f64(-a); |
172 | 0 | let mut q = ba * r; |
173 | | const F128_ONE: DyadicFloat128 = DyadicFloat128 { |
174 | | sign: DyadicSign::Pos, |
175 | | exponent: -127, |
176 | | mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128, |
177 | | }; |
178 | 0 | q = F128_ONE + q; |
179 | 0 | q = r * q; |
180 | 0 | r + q |
181 | 0 | } |
182 | | |
183 | | #[inline] |
184 | 0 | pub(crate) fn from_div_f64(a: f64, b: f64) -> Self { |
185 | 0 | let reciprocal = DyadicFloat128::accurate_reciprocal(b); |
186 | 0 | let da = DyadicFloat128::new_from_f64(a); |
187 | 0 | reciprocal * da |
188 | 0 | } |
189 | | |
190 | | /// Multiply self by integer scalar `b`. |
191 | | /// Returns a new normalized DyadicFloat128. |
192 | | #[inline] |
193 | 0 | pub(crate) fn mul_int64(&self, b: i64) -> DyadicFloat128 { |
194 | 0 | if b == 0 { |
195 | 0 | return DyadicFloat128::zero(); |
196 | 0 | } |
197 | | |
198 | 0 | let abs_b = b.unsigned_abs(); |
199 | 0 | let sign = if (b < 0) ^ (self.sign == DyadicSign::Neg) { |
200 | 0 | DyadicSign::Neg |
201 | | } else { |
202 | 0 | DyadicSign::Pos |
203 | | }; |
204 | | |
205 | 0 | let mut hi_prod = (self.mantissa >> 64).wrapping_mul(abs_b as u128); |
206 | 0 | let m = hi_prod.leading_zeros(); |
207 | 0 | hi_prod <<= m; |
208 | | |
209 | 0 | let mut lo_prod = (self.mantissa & 0xffff_ffff_ffff_ffff).wrapping_mul(abs_b as u128); |
210 | 0 | lo_prod = (lo_prod << (m - 1)) >> 63; |
211 | | |
212 | 0 | let (mut product, overflow) = hi_prod.overflowing_add(lo_prod); |
213 | | |
214 | 0 | let mut result = DyadicFloat128 { |
215 | 0 | sign, |
216 | 0 | exponent: self.exponent + 64 - m as i16, |
217 | 0 | mantissa: product, |
218 | 0 | }; |
219 | | |
220 | 0 | if overflow { |
221 | 0 | // Overflow means an implicit bit in the 129th place, which we shift down. |
222 | 0 | product += product & 0x1; |
223 | 0 | result.mantissa = (product >> 1) | (1u128 << 127); |
224 | 0 | result.shift_right(1); |
225 | 0 | } |
226 | | |
227 | 0 | result.normalize(); |
228 | 0 | result |
229 | 0 | } |
230 | | |
231 | | #[inline] |
232 | 0 | fn shift_right(&mut self, amount: u32) { |
233 | 0 | if amount < BITS { |
234 | 0 | self.exponent += amount as i16; |
235 | 0 | self.mantissa = self.mantissa.wrapping_shr(amount); |
236 | 0 | } else { |
237 | 0 | self.exponent = 0; |
238 | 0 | self.mantissa = 0; |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | | #[inline] |
243 | 0 | fn shift_left(&mut self, amount: u32) { |
244 | 0 | if amount < BITS { |
245 | 0 | self.exponent -= amount as i16; |
246 | 0 | self.mantissa = self.mantissa.wrapping_shl(amount); |
247 | 0 | } else { |
248 | 0 | self.exponent = 0; |
249 | 0 | self.mantissa = 0; |
250 | 0 | } |
251 | 0 | } |
252 | | |
253 | | // Don't forget to call if manually created |
254 | | #[inline] |
255 | 0 | pub(crate) const fn normalize(&mut self) { |
256 | 0 | if self.mantissa != 0 { |
257 | 0 | let shift_length = self.mantissa.leading_zeros(); |
258 | 0 | self.exponent -= shift_length as i16; |
259 | 0 | self.mantissa = self.mantissa.wrapping_shl(shift_length); |
260 | 0 | } |
261 | 0 | } |
262 | | |
263 | | #[inline] |
264 | 0 | pub(crate) fn negated(&self) -> Self { |
265 | 0 | Self { |
266 | 0 | sign: self.sign.negate(), |
267 | 0 | exponent: self.exponent, |
268 | 0 | mantissa: self.mantissa, |
269 | 0 | } |
270 | 0 | } |
271 | | |
272 | | #[inline] |
273 | 0 | pub(crate) fn quick_sub(&self, rhs: &Self) -> Self { |
274 | 0 | self.quick_add(&rhs.negated()) |
275 | 0 | } |
276 | | |
277 | | #[inline] |
278 | 0 | pub(crate) fn quick_add(&self, rhs: &Self) -> Self { |
279 | 0 | if self.mantissa == 0 { |
280 | 0 | return *rhs; |
281 | 0 | } |
282 | 0 | if rhs.mantissa == 0 { |
283 | 0 | return *self; |
284 | 0 | } |
285 | 0 | let mut a = *self; |
286 | 0 | let mut b = *rhs; |
287 | | |
288 | 0 | let exp_diff = a.exponent.wrapping_sub(b.exponent); |
289 | | |
290 | | // If exponent difference is too large, b is negligible |
291 | 0 | if exp_diff.abs() >= BITS as i16 { |
292 | 0 | return if a.sign == b.sign { |
293 | | // Adding very small number to large: return a |
294 | 0 | return if a.exponent > b.exponent { a } else { b }; |
295 | 0 | } else if a.exponent > b.exponent { |
296 | 0 | a |
297 | | } else { |
298 | 0 | b |
299 | | }; |
300 | 0 | } |
301 | | |
302 | | // Align exponents |
303 | 0 | if a.exponent > b.exponent { |
304 | 0 | b.shift_right((a.exponent - b.exponent) as u32); |
305 | 0 | } else if b.exponent > a.exponent { |
306 | 0 | a.shift_right((b.exponent - a.exponent) as u32); |
307 | 0 | } |
308 | | |
309 | 0 | let mut result = DyadicFloat128::zero(); |
310 | | |
311 | 0 | if a.sign == b.sign { |
312 | | // Addition |
313 | 0 | result.sign = a.sign; |
314 | 0 | result.exponent = a.exponent; |
315 | 0 | result.mantissa = a.mantissa; |
316 | 0 | let (sum, is_overflow) = result.mantissa.overflowing_add(b.mantissa); |
317 | 0 | result.mantissa = sum; |
318 | 0 | if is_overflow { |
319 | 0 | // Mantissa addition overflow. |
320 | 0 | result.shift_right(1); |
321 | 0 | result.mantissa |= 1u128 << 127; |
322 | 0 | } |
323 | | // Result is already normalized. |
324 | 0 | return result; |
325 | 0 | } |
326 | | |
327 | | // Subtraction |
328 | 0 | if a.mantissa >= b.mantissa { |
329 | 0 | result.sign = a.sign; |
330 | 0 | result.exponent = a.exponent; |
331 | 0 | result.mantissa = a.mantissa.wrapping_sub(b.mantissa); |
332 | 0 | } else { |
333 | 0 | result.sign = b.sign; |
334 | 0 | result.exponent = b.exponent; |
335 | 0 | result.mantissa = b.mantissa.wrapping_sub(a.mantissa); |
336 | 0 | } |
337 | | |
338 | 0 | result.normalize(); |
339 | 0 | result |
340 | 0 | } |
341 | | |
342 | | #[inline] |
343 | 0 | pub(crate) fn quick_mul(&self, rhs: &Self) -> Self { |
344 | 0 | let mut result = DyadicFloat128 { |
345 | 0 | sign: if self.sign != rhs.sign { |
346 | 0 | DyadicSign::Neg |
347 | | } else { |
348 | 0 | DyadicSign::Pos |
349 | | }, |
350 | 0 | exponent: self.exponent + rhs.exponent + BITS as i16, |
351 | | mantissa: 0, |
352 | | }; |
353 | | |
354 | 0 | if !(self.mantissa == 0 || rhs.mantissa == 0) { |
355 | 0 | result.mantissa = mulhi_u128(self.mantissa, rhs.mantissa); |
356 | | // Check the leading bit directly, should be faster than using clz in |
357 | | // normalize(). |
358 | 0 | if result.mantissa >> 127 == 0 { |
359 | 0 | result.shift_left(1); |
360 | 0 | } |
361 | 0 | } else { |
362 | 0 | result.mantissa = 0; |
363 | 0 | } |
364 | 0 | result |
365 | 0 | } |
366 | | |
367 | | #[inline] |
368 | 0 | pub(crate) fn fast_as_f64(&self) -> f64 { |
369 | 0 | if self.mantissa == 0 { |
370 | 0 | return if self.sign == DyadicSign::Pos { |
371 | 0 | 0. |
372 | | } else { |
373 | 0 | -0.0 |
374 | | }; |
375 | 0 | } |
376 | | |
377 | | // Assume that it is normalized, and output is also normal. |
378 | | const PRECISION: u32 = 52 + 1; |
379 | | |
380 | | // SIG_MASK - FRACTION_MASK |
381 | | const SIG_MASK: u64 = (1u64 << 52) - 1; |
382 | | const FRACTION_MASK: u64 = (1u64 << 52) - 1; |
383 | | const IMPLICIT_MASK: u64 = SIG_MASK - FRACTION_MASK; |
384 | | const EXP_BIAS: u64 = (1u64 << (11 - 1u64)) - 1u64; |
385 | | |
386 | 0 | let mut exp_hi = self.exponent as i32 + ((BITS - 1) as i32 + EXP_BIAS as i32); |
387 | | |
388 | 0 | if exp_hi > 2 * EXP_BIAS as i32 { |
389 | | // Results overflow. |
390 | 0 | let d_hi = f64_from_parts(self.sign, 2 * EXP_BIAS, IMPLICIT_MASK); |
391 | | // volatile prevents constant propagation that would result in infinity |
392 | | // always being returned no matter the current rounding mode. |
393 | 0 | let two = 2.0f64; |
394 | 0 | let r = two * d_hi; |
395 | 0 | return r; |
396 | 0 | } |
397 | | |
398 | 0 | let mut denorm = false; |
399 | 0 | let mut shift = BITS - PRECISION; |
400 | 0 | if exp_hi <= 0 { |
401 | 0 | // Output is denormal. |
402 | 0 | denorm = true; |
403 | 0 | shift = (BITS - PRECISION) + (1 - exp_hi) as u32; |
404 | 0 |
|
405 | 0 | exp_hi = EXP_BIAS as i32; |
406 | 0 | } |
407 | | |
408 | 0 | let exp_lo = exp_hi.wrapping_sub(PRECISION as i32).wrapping_sub(1); |
409 | | |
410 | 0 | let m_hi = if shift >= BITS { |
411 | 0 | 0 |
412 | | } else { |
413 | 0 | self.mantissa >> shift |
414 | | }; |
415 | | |
416 | 0 | let d_hi = f64_from_parts( |
417 | 0 | self.sign, |
418 | 0 | exp_hi as u64, |
419 | 0 | (m_hi as u64 & SIG_MASK) | IMPLICIT_MASK, |
420 | | ); |
421 | | |
422 | 0 | let round_mask = if shift > BITS { |
423 | 0 | 0 |
424 | | } else { |
425 | 0 | 1u128.wrapping_shl(shift.wrapping_sub(1)) |
426 | | }; |
427 | 0 | let sticky_mask = round_mask.wrapping_sub(1u128); |
428 | | |
429 | 0 | let round_bit = (self.mantissa & round_mask) != 0; |
430 | 0 | let sticky_bit = (self.mantissa & sticky_mask) != 0; |
431 | 0 | let round_and_sticky = round_bit as i32 * 2 + sticky_bit as i32; |
432 | | |
433 | | let d_lo: f64; |
434 | | |
435 | 0 | if exp_lo <= 0 { |
436 | | // d_lo is denormal, but the output is normal. |
437 | 0 | let scale_up_exponent = 1 - exp_lo; |
438 | 0 | let scale_up_factor = f64_from_parts( |
439 | 0 | DyadicSign::Pos, |
440 | 0 | EXP_BIAS + scale_up_exponent as u64, |
441 | | IMPLICIT_MASK, |
442 | | ); |
443 | 0 | let scale_down_factor = f64_from_parts( |
444 | 0 | DyadicSign::Pos, |
445 | 0 | EXP_BIAS - scale_up_exponent as u64, |
446 | | IMPLICIT_MASK, |
447 | | ); |
448 | | |
449 | 0 | d_lo = f64_from_parts( |
450 | 0 | self.sign, |
451 | 0 | (exp_lo + scale_up_exponent) as u64, |
452 | 0 | IMPLICIT_MASK, |
453 | 0 | ); |
454 | | |
455 | 0 | return f_fmla(d_lo, round_and_sticky as f64, d_hi * scale_up_factor) |
456 | 0 | * scale_down_factor; |
457 | 0 | } |
458 | | |
459 | 0 | d_lo = f64_from_parts(self.sign, exp_lo as u64, IMPLICIT_MASK); |
460 | | |
461 | | // Still correct without FMA instructions if `d_lo` is not underflow. |
462 | 0 | let r = f_fmla(d_lo, round_and_sticky as f64, d_hi); |
463 | | |
464 | 0 | if denorm { |
465 | | const SIG_LEN: u64 = 52; |
466 | | // Exponent before rounding is in denormal range, simply clear the |
467 | | // exponent field. |
468 | 0 | let clear_exp: u64 = (exp_hi as u64) << SIG_LEN; |
469 | 0 | let mut r_bits: u64 = r.to_bits() - clear_exp; |
470 | | |
471 | 0 | if r_bits & EXP_MASK == 0 { |
472 | 0 | // Output is denormal after rounding, clear the implicit bit for 80-bit |
473 | 0 | // long double. |
474 | 0 | r_bits -= IMPLICIT_MASK; |
475 | 0 | } |
476 | | |
477 | 0 | return f64::from_bits(r_bits); |
478 | 0 | } |
479 | | |
480 | 0 | r |
481 | 0 | } |
482 | | |
483 | | // Approximate reciprocal - given a nonzero `a`, make a good approximation to 1/a. |
484 | | // The method is Newton-Raphson iteration, based on quick_mul. |
485 | | #[inline] |
486 | 0 | pub(crate) fn reciprocal(self) -> DyadicFloat128 { |
487 | | // Computes the reciprocal using Newton-Raphson iteration: |
488 | | // Given an approximation x ≈ 1/a, we refine via: |
489 | | // x' = x * (2 - a * x) |
490 | | // This squares the error term: if ax ≈ 1 - e, then ax' ≈ 1 - e². |
491 | | |
492 | 0 | let guess = 1. / self.fast_as_f64(); |
493 | 0 | let mut x = DyadicFloat128::new_from_f64(guess); |
494 | | |
495 | | // The constant 2, which we'll need in every iteration |
496 | 0 | let twos = DyadicFloat128 { |
497 | 0 | sign: DyadicSign::Pos, |
498 | 0 | exponent: -126, |
499 | 0 | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
500 | 0 | }; |
501 | | |
502 | 0 | x = x * (twos - (self * x)); |
503 | 0 | x = x * (twos - (self * x)); |
504 | 0 | x |
505 | 0 | } |
506 | | |
507 | | // // Approximate reciprocal - given a nonzero `a`, make a good approximation to 1/a. |
508 | | // // The method is Newton-Raphson iteration, based on quick_mul. |
509 | | // // *This is very crude guess* |
510 | | // #[inline] |
511 | | // fn approximate_reciprocal(&self) -> DyadicFloat128 { |
512 | | // // Given an approximation x to 1/a, a better one is x' = x(2-ax). |
513 | | // // |
514 | | // // You can derive this by using the Newton-Raphson formula with the function |
515 | | // // f(x) = 1/x - a. But another way to see that it works is to say: suppose |
516 | | // // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) = |
517 | | // // 1-e^2. So the error in x' is the square of the error in x, i.e. the number |
518 | | // // of correct bits in x' is double the number in x. |
519 | | // |
520 | | // // An initial approximation to the reciprocal |
521 | | // let mut x = DyadicFloat128 { |
522 | | // sign: DyadicSign::Pos, |
523 | | // exponent: -32 - self.exponent - BITS as i16, |
524 | | // mantissa: self.mantissa >> (BITS - 32), |
525 | | // }; |
526 | | // x.normalize(); |
527 | | // |
528 | | // // The constant 2, which we'll need in every iteration |
529 | | // let two = DyadicFloat128::new(DyadicSign::Pos, 1, 1); |
530 | | // |
531 | | // // We expect at least 31 correct bits from our 32-bit starting approximation |
532 | | // let mut ok_bits = 31usize; |
533 | | // |
534 | | // // The number of good bits doubles in each iteration, except that rounding |
535 | | // // errors introduce a little extra each time. Subtract a bit from our |
536 | | // // accuracy assessment to account for that. |
537 | | // while ok_bits < BITS as usize { |
538 | | // x = x * (two - (*self * x)); |
539 | | // ok_bits = 2 * ok_bits - 1; |
540 | | // } |
541 | | // |
542 | | // x |
543 | | // } |
544 | | } |
545 | | |
546 | | impl Add<DyadicFloat128> for DyadicFloat128 { |
547 | | type Output = DyadicFloat128; |
548 | | #[inline] |
549 | 0 | fn add(self, rhs: DyadicFloat128) -> Self::Output { |
550 | 0 | self.quick_add(&rhs) |
551 | 0 | } |
552 | | } |
553 | | |
554 | | impl DyadicFloat128 { |
555 | | #[inline] |
556 | 0 | pub(crate) fn biased_exponent(&self) -> i16 { |
557 | 0 | self.exponent + (BITS as i16 - 1) |
558 | 0 | } |
559 | | |
560 | | #[inline] |
561 | 0 | pub(crate) fn trunc_to_i64(&self) -> i64 { |
562 | 0 | if self.exponent <= -(BITS as i16) { |
563 | | // Absolute value of x is greater than equal to 0.5 but less than 1. |
564 | 0 | return 0; |
565 | 0 | } |
566 | 0 | let hi = self.mantissa >> 64; |
567 | 0 | let norm_exp = self.biased_exponent(); |
568 | 0 | if norm_exp > 63 { |
569 | 0 | return if self.sign == DyadicSign::Neg { |
570 | 0 | i64::MIN |
571 | | } else { |
572 | 0 | i64::MAX |
573 | | }; |
574 | 0 | } |
575 | 0 | let r: i64 = (hi >> (63 - norm_exp)) as i64; |
576 | | |
577 | 0 | if self.sign == DyadicSign::Neg { -r } else { r } |
578 | 0 | } |
579 | | |
580 | | #[inline] |
581 | 0 | pub(crate) fn round_to_nearest(&self) -> DyadicFloat128 { |
582 | 0 | if self.exponent == -(BITS as i16) { |
583 | | // Absolute value of x is greater than equal to 0.5 but less than 1. |
584 | 0 | return DyadicFloat128 { |
585 | 0 | sign: self.sign, |
586 | 0 | exponent: -(BITS as i16 - 1), |
587 | 0 | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
588 | 0 | }; |
589 | 0 | } |
590 | 0 | if self.exponent <= -((BITS + 1) as i16) { |
591 | | // Absolute value of x is greater than equal to 0.5 but less than 1. |
592 | 0 | return DyadicFloat128 { |
593 | 0 | sign: self.sign, |
594 | 0 | exponent: 0, |
595 | 0 | mantissa: 0u128, |
596 | 0 | }; |
597 | 0 | } |
598 | | const FRACTION_LENGTH: u32 = BITS - 1; |
599 | 0 | let trim_size = |
600 | 0 | (FRACTION_LENGTH as i64).wrapping_sub(self.exponent as i64 + (BITS - 1) as i64) as u128; |
601 | 0 | let half_bit_set = |
602 | 0 | self.mantissa & (1u128.wrapping_shl(trim_size.wrapping_sub(1) as u32)) != 0; |
603 | 0 | let trunc_u: u128 = self |
604 | 0 | .mantissa |
605 | 0 | .wrapping_shr(trim_size as u32) |
606 | 0 | .wrapping_shl(trim_size as u32); |
607 | 0 | if trunc_u == self.mantissa { |
608 | 0 | return *self; |
609 | 0 | } |
610 | | |
611 | 0 | let truncated = DyadicFloat128::new(self.sign, self.exponent, trunc_u); |
612 | | |
613 | 0 | if !half_bit_set { |
614 | | // Franctional part is less than 0.5 so round value is the |
615 | | // same as the trunc value. |
616 | 0 | truncated |
617 | 0 | } else if self.sign == DyadicSign::Neg { |
618 | 0 | let ones = DyadicFloat128 { |
619 | 0 | sign: DyadicSign::Pos, |
620 | 0 | exponent: -(BITS as i16 - 1), |
621 | 0 | mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128, |
622 | 0 | }; |
623 | 0 | truncated - ones |
624 | | } else { |
625 | 0 | let ones = DyadicFloat128 { |
626 | 0 | sign: DyadicSign::Pos, |
627 | 0 | exponent: -(BITS as i16 - 1), |
628 | 0 | mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128, |
629 | 0 | }; |
630 | 0 | truncated + ones |
631 | | } |
632 | 0 | } |
633 | | |
634 | | #[inline] |
635 | 0 | pub(crate) fn round_to_nearest_f64(&self) -> f64 { |
636 | 0 | self.round_to_nearest().fast_as_f64() |
637 | 0 | } |
638 | | } |
639 | | |
640 | | impl Sub<DyadicFloat128> for DyadicFloat128 { |
641 | | type Output = DyadicFloat128; |
642 | | #[inline] |
643 | 0 | fn sub(self, rhs: DyadicFloat128) -> Self::Output { |
644 | 0 | self.quick_sub(&rhs) |
645 | 0 | } |
646 | | } |
647 | | |
648 | | impl Mul<DyadicFloat128> for DyadicFloat128 { |
649 | | type Output = DyadicFloat128; |
650 | | #[inline] |
651 | 0 | fn mul(self, rhs: DyadicFloat128) -> Self::Output { |
652 | 0 | self.quick_mul(&rhs) |
653 | 0 | } |
654 | | } |
655 | | |
656 | | #[cfg(test)] |
657 | | mod tests { |
658 | | use super::*; |
659 | | |
660 | | #[test] |
661 | | fn test_dyadic_float() { |
662 | | let ones = DyadicFloat128 { |
663 | | sign: DyadicSign::Pos, |
664 | | exponent: -127, |
665 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
666 | | }; |
667 | | let cvt = ones.fast_as_f64(); |
668 | | assert_eq!(cvt, 1.0); |
669 | | |
670 | | let minus_0_5 = DyadicFloat128 { |
671 | | sign: DyadicSign::Neg, |
672 | | exponent: -128, |
673 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
674 | | }; |
675 | | let cvt0 = minus_0_5.fast_as_f64(); |
676 | | assert_eq!(cvt0, -1.0 / 2.0); |
677 | | |
678 | | let minus_1_f4 = DyadicFloat128 { |
679 | | sign: DyadicSign::Neg, |
680 | | exponent: -132, |
681 | | mantissa: 0xaaaaaaaa_aaaaaaaa_aaaaaaaa_aaaaaaab_u128, |
682 | | }; |
683 | | let cvt0 = minus_1_f4.fast_as_f64(); |
684 | | assert_eq!(cvt0, -1.0 / 24.0); |
685 | | |
686 | | let minus_1_f8 = DyadicFloat128 { |
687 | | sign: DyadicSign::Pos, |
688 | | exponent: -143, |
689 | | mantissa: 0xd00d00d0_0d00d00d_00d00d00_d00d00d0_u128, |
690 | | }; |
691 | | let cvt0 = minus_1_f8.fast_as_f64(); |
692 | | assert_eq!(cvt0, 1.0 / 40320.0); |
693 | | } |
694 | | |
695 | | #[test] |
696 | | fn dyadic_float_add() { |
697 | | let ones = DyadicFloat128 { |
698 | | sign: DyadicSign::Pos, |
699 | | exponent: -127, |
700 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
701 | | }; |
702 | | |
703 | | let cvt = ones.fast_as_f64(); |
704 | | assert_eq!(cvt, 1.0); |
705 | | |
706 | | let minus_0_5 = DyadicFloat128 { |
707 | | sign: DyadicSign::Neg, |
708 | | exponent: -128, |
709 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
710 | | }; |
711 | | let cvt0 = ones.quick_add(&minus_0_5).fast_as_f64(); |
712 | | assert_eq!(cvt0, 0.5); |
713 | | } |
714 | | |
715 | | #[test] |
716 | | fn dyadic_float_mul() { |
717 | | let ones = DyadicFloat128 { |
718 | | sign: DyadicSign::Pos, |
719 | | exponent: -127, |
720 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
721 | | }; |
722 | | |
723 | | let cvt = ones.fast_as_f64(); |
724 | | assert_eq!(cvt, 1.0); |
725 | | |
726 | | let minus_0_5 = DyadicFloat128 { |
727 | | sign: DyadicSign::Neg, |
728 | | exponent: -128, |
729 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
730 | | }; |
731 | | let product = ones.quick_mul(&minus_0_5); |
732 | | let cvt0 = product.fast_as_f64(); |
733 | | assert_eq!(cvt0, -0.5); |
734 | | |
735 | | let twos = DyadicFloat128 { |
736 | | sign: DyadicSign::Pos, |
737 | | exponent: -126, |
738 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
739 | | }; |
740 | | |
741 | | let cvt = twos.fast_as_f64(); |
742 | | assert_eq!(cvt, 2.0); |
743 | | } |
744 | | |
745 | | #[test] |
746 | | fn dyadic_round_trip() { |
747 | | let z00 = 0.0; |
748 | | let zvt00 = DyadicFloat128::new_from_f64(z00); |
749 | | let b00 = zvt00.fast_as_f64(); |
750 | | assert_eq!(b00, z00); |
751 | | |
752 | | let zvt000 = DyadicFloat128 { |
753 | | sign: DyadicSign::Pos, |
754 | | exponent: 0, |
755 | | mantissa: 0, |
756 | | }; |
757 | | let b000 = zvt000.fast_as_f64(); |
758 | | assert_eq!(b000, z00); |
759 | | |
760 | | let z0 = 1.0; |
761 | | let zvt0 = DyadicFloat128::new_from_f64(z0); |
762 | | let b0 = zvt0.fast_as_f64(); |
763 | | assert_eq!(b0, z0); |
764 | | |
765 | | let z1 = 0.5; |
766 | | let zvt1 = DyadicFloat128::new_from_f64(z1); |
767 | | let b1 = zvt1.fast_as_f64(); |
768 | | assert_eq!(b1, z1); |
769 | | |
770 | | let z2 = -0.5; |
771 | | let zvt2 = DyadicFloat128::new_from_f64(z2); |
772 | | let b2 = zvt2.fast_as_f64(); |
773 | | assert_eq!(b2, z2); |
774 | | |
775 | | let z3 = -532322.54324324232; |
776 | | let zvt3 = DyadicFloat128::new_from_f64(z3); |
777 | | let b3 = zvt3.fast_as_f64(); |
778 | | assert_eq!(b3, z3); |
779 | | } |
780 | | |
781 | | #[test] |
782 | | fn dyadic_float_reciprocal() { |
783 | | let ones = DyadicFloat128 { |
784 | | sign: DyadicSign::Pos, |
785 | | exponent: -127, |
786 | | mantissa: 0x80000000_00000000_00000000_00000000_u128, |
787 | | } |
788 | | .reciprocal(); |
789 | | |
790 | | let cvt = ones.fast_as_f64(); |
791 | | assert_eq!(cvt, 1.0); |
792 | | |
793 | | let minus_0_5 = DyadicFloat128::new_from_f64(4.).reciprocal(); |
794 | | let cvt0 = minus_0_5.fast_as_f64(); |
795 | | assert_eq!(cvt0, 0.25); |
796 | | } |
797 | | |
798 | | #[test] |
799 | | fn dyadic_float_from_div() { |
800 | | let from_div = DyadicFloat128::from_div_f64(1.0, 4.0); |
801 | | let cvt = from_div.fast_as_f64(); |
802 | | assert_eq!(cvt, 0.25); |
803 | | } |
804 | | |
805 | | #[test] |
806 | | fn dyadic_float_accurate_reciprocal() { |
807 | | let from_div = DyadicFloat128::accurate_reciprocal(4.0); |
808 | | let cvt = from_div.fast_as_f64(); |
809 | | assert_eq!(cvt, 0.25); |
810 | | } |
811 | | |
812 | | #[test] |
813 | | fn dyadic_float_mul_int() { |
814 | | let from_div = DyadicFloat128::new_from_f64(4.0); |
815 | | let m1 = from_div.mul_int64(-2); |
816 | | assert_eq!(m1.fast_as_f64(), -8.0); |
817 | | |
818 | | let from_div = DyadicFloat128::new_from_f64(-4.0); |
819 | | let m1 = from_div.mul_int64(-2); |
820 | | assert_eq!(m1.fast_as_f64(), 8.0); |
821 | | |
822 | | let from_div = DyadicFloat128::new_from_f64(2.5); |
823 | | let m1 = from_div.mul_int64(2); |
824 | | assert_eq!(m1.fast_as_f64(), 5.0); |
825 | | } |
826 | | |
827 | | #[test] |
828 | | fn dyadic_float_round() { |
829 | | let from_div = DyadicFloat128::new_from_f64(2.5); |
830 | | let m1 = from_div.round_to_nearest_f64(); |
831 | | assert_eq!(m1, 3.0); |
832 | | |
833 | | let from_div = DyadicFloat128::new_from_f64(0.5); |
834 | | let m1 = from_div.round_to_nearest_f64(); |
835 | | assert_eq!(m1, 1.0); |
836 | | |
837 | | let from_div = DyadicFloat128::new_from_f64(-0.5); |
838 | | let m1 = from_div.round_to_nearest_f64(); |
839 | | assert_eq!(m1, -1.0); |
840 | | |
841 | | let from_div = DyadicFloat128::new_from_f64(-0.351); |
842 | | let m1 = from_div.round_to_nearest_f64(); |
843 | | assert_eq!(m1, (-0.351f64).round()); |
844 | | |
845 | | let from_div = DyadicFloat128::new_from_f64(0.351); |
846 | | let m1 = from_div.round_to_nearest_f64(); |
847 | | assert_eq!(m1, 0.351f64.round()); |
848 | | |
849 | | let z00 = 25.6; |
850 | | let zvt00 = DyadicFloat128::new_from_f64(z00); |
851 | | let b00 = zvt00.round_to_nearest_f64(); |
852 | | assert_eq!(b00, 26.); |
853 | | } |
854 | | |
855 | | #[test] |
856 | | fn dyadic_int_trunc() { |
857 | | let from_div = DyadicFloat128::new_from_f64(-2.5); |
858 | | let m1 = from_div.trunc_to_i64(); |
859 | | assert_eq!(m1, -2); |
860 | | |
861 | | let from_div = DyadicFloat128::new_from_f64(2.5); |
862 | | let m1 = from_div.trunc_to_i64(); |
863 | | assert_eq!(m1, 2); |
864 | | |
865 | | let from_div = DyadicFloat128::new_from_f64(0.5); |
866 | | let m1 = from_div.trunc_to_i64(); |
867 | | assert_eq!(m1, 0); |
868 | | |
869 | | let from_div = DyadicFloat128::new_from_f64(-0.5); |
870 | | let m1 = from_div.trunc_to_i64(); |
871 | | assert_eq!(m1, 0); |
872 | | |
873 | | let from_div = DyadicFloat128::new_from_f64(-0.351); |
874 | | let m1 = from_div.trunc_to_i64(); |
875 | | assert_eq!(m1, 0); |
876 | | |
877 | | let from_div = DyadicFloat128::new_from_f64(0.351); |
878 | | let m1 = from_div.trunc_to_i64(); |
879 | | assert_eq!(m1, 0); |
880 | | } |
881 | | } |