Coverage Report

Created: 2026-01-17 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.27/src/dyadic_float.rs
Line
Count
Source
1
/*
2
 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
3
 * //
4
 * // Redistribution and use in source and binary forms, with or without modification,
5
 * // are permitted provided that the following conditions are met:
6
 * //
7
 * // 1.  Redistributions of source code must retain the above copyright notice, this
8
 * // list of conditions and the following disclaimer.
9
 * //
10
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11
 * // this list of conditions and the following disclaimer in the documentation
12
 * // and/or other materials provided with the distribution.
13
 * //
14
 * // 3.  Neither the name of the copyright holder nor the names of its
15
 * // contributors may be used to endorse or promote products derived from
16
 * // this software without specific prior written permission.
17
 * //
18
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
 */
29
use crate::bits::EXP_MASK;
30
use crate::common::f_fmla;
31
use std::ops::{Add, Mul, Sub};
32
33
#[repr(u8)]
34
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
35
pub(crate) enum DyadicSign {
36
    Pos = 0,
37
    Neg = 1,
38
}
39
40
impl DyadicSign {
41
    #[inline]
42
0
    pub(crate) fn negate(self) -> Self {
43
0
        match self {
44
0
            DyadicSign::Pos => DyadicSign::Neg,
45
0
            DyadicSign::Neg => DyadicSign::Pos,
46
        }
47
0
    }
48
49
    #[inline]
50
0
    pub(crate) const fn to_bit(self) -> u8 {
51
0
        match self {
52
0
            DyadicSign::Pos => 0,
53
0
            DyadicSign::Neg => 1,
54
        }
55
0
    }
56
57
    #[inline]
58
0
    pub(crate) const fn mult(self, rhs: Self) -> Self {
59
0
        if (self as u8) ^ (rhs as u8) != 0 {
60
0
            DyadicSign::Neg
61
        } else {
62
0
            DyadicSign::Pos
63
        }
64
0
    }
65
}
66
67
const BITS: u32 = 128;
68
69
#[derive(Copy, Clone, Debug)]
70
pub(crate) struct DyadicFloat128 {
71
    pub(crate) sign: DyadicSign,
72
    pub(crate) exponent: i16,
73
    pub(crate) mantissa: u128,
74
}
75
76
#[inline]
77
0
pub(crate) const fn f64_from_parts(sign: DyadicSign, exp: u64, mantissa: u64) -> f64 {
78
0
    let r_sign = (if sign.to_bit() == 0 { 0u64 } else { 1u64 }).wrapping_shl(63);
79
0
    let r_exp = exp.wrapping_shl(52);
80
0
    f64::from_bits(r_sign | r_exp | mantissa)
81
0
}
82
83
#[inline]
84
0
pub(crate) fn mulhi_u128(a: u128, b: u128) -> u128 {
85
0
    let a_lo = a as u64 as u128;
86
0
    let a_hi = (a >> 64) as u64 as u128;
87
0
    let b_lo = b as u64 as u128;
88
0
    let b_hi = (b >> 64) as u64 as u128;
89
90
0
    let lo_lo = a_lo * b_lo;
91
0
    let lo_hi = a_lo * b_hi;
92
0
    let hi_lo = a_hi * b_lo;
93
0
    let hi_hi = a_hi * b_hi;
94
95
0
    let carry = (lo_lo >> 64)
96
0
        .wrapping_add(lo_hi & 0xffff_ffff_ffff_ffff)
97
0
        .wrapping_add(hi_lo & 0xffff_ffff_ffff_ffff);
98
0
    let mid = (lo_hi >> 64)
99
0
        .wrapping_add(hi_lo >> 64)
100
0
        .wrapping_add(carry >> 64);
101
102
0
    hi_hi.wrapping_add(mid)
103
0
}
104
105
#[inline]
106
0
const fn explicit_exponent(x: f64) -> i16 {
107
0
    let exp = ((x.to_bits() >> 52) & ((1u64 << 11) - 1u64)) as i16 - 1023;
108
0
    if x == 0. {
109
0
        return 0;
110
0
    } else if x.is_subnormal() {
111
        const EXP_BIAS: u64 = (1u64 << (11 - 1u64)) - 1u64;
112
0
        return 1i16 - EXP_BIAS as i16;
113
0
    }
114
0
    exp
115
0
}
116
117
#[inline]
118
0
const fn explicit_mantissa(x: f64) -> u64 {
119
    const MASK: u64 = (1u64 << 52) - 1;
120
0
    let sig_bits = x.to_bits() & MASK;
121
0
    if x.is_subnormal() || x == 0. {
122
0
        return sig_bits;
123
0
    }
124
0
    (1u64 << 52) | sig_bits
125
0
}
126
127
impl DyadicFloat128 {
128
    #[inline]
129
0
    pub(crate) const fn zero() -> Self {
130
0
        Self {
131
0
            sign: DyadicSign::Pos,
132
0
            exponent: 0,
133
0
            mantissa: 0,
134
0
        }
135
0
    }
136
137
    #[inline]
138
0
    pub(crate) const fn new_from_f64(x: f64) -> Self {
139
0
        let sign = if x.is_sign_negative() {
140
0
            DyadicSign::Neg
141
        } else {
142
0
            DyadicSign::Pos
143
        };
144
0
        let exponent = explicit_exponent(x) - 52;
145
0
        let mantissa = explicit_mantissa(x) as u128;
146
0
        let mut new_val = Self {
147
0
            sign,
148
0
            exponent,
149
0
            mantissa,
150
0
        };
151
0
        new_val.normalize();
152
0
        new_val
153
0
    }
154
155
    #[inline]
156
0
    pub(crate) fn new(sign: DyadicSign, exponent: i16, mantissa: u128) -> Self {
157
0
        let mut new_item = DyadicFloat128 {
158
0
            sign,
159
0
            exponent,
160
0
            mantissa,
161
0
        };
162
0
        new_item.normalize();
163
0
        new_item
164
0
    }
165
166
    #[inline]
167
0
    pub(crate) fn accurate_reciprocal(a: f64) -> Self {
168
0
        let mut r = DyadicFloat128::new_from_f64(4.0 / a); /* accurate to about 53 bits */
169
0
        r.exponent -= 2;
170
        /* we use Newton's iteration: r -> r + r*(1-a*r) */
171
0
        let ba = DyadicFloat128::new_from_f64(-a);
172
0
        let mut q = ba * r;
173
        const F128_ONE: DyadicFloat128 = DyadicFloat128 {
174
            sign: DyadicSign::Pos,
175
            exponent: -127,
176
            mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128,
177
        };
178
0
        q = F128_ONE + q;
179
0
        q = r * q;
180
0
        r + q
181
0
    }
182
183
    #[inline]
184
0
    pub(crate) fn from_div_f64(a: f64, b: f64) -> Self {
185
0
        let reciprocal = DyadicFloat128::accurate_reciprocal(b);
186
0
        let da = DyadicFloat128::new_from_f64(a);
187
0
        reciprocal * da
188
0
    }
189
190
    /// Multiply self by integer scalar `b`.
191
    /// Returns a new normalized DyadicFloat128.
192
    #[inline]
193
0
    pub(crate) fn mul_int64(&self, b: i64) -> DyadicFloat128 {
194
0
        if b == 0 {
195
0
            return DyadicFloat128::zero();
196
0
        }
197
198
0
        let abs_b = b.unsigned_abs();
199
0
        let sign = if (b < 0) ^ (self.sign == DyadicSign::Neg) {
200
0
            DyadicSign::Neg
201
        } else {
202
0
            DyadicSign::Pos
203
        };
204
205
0
        let mut hi_prod = (self.mantissa >> 64).wrapping_mul(abs_b as u128);
206
0
        let m = hi_prod.leading_zeros();
207
0
        hi_prod <<= m;
208
209
0
        let mut lo_prod = (self.mantissa & 0xffff_ffff_ffff_ffff).wrapping_mul(abs_b as u128);
210
0
        lo_prod = (lo_prod << (m - 1)) >> 63;
211
212
0
        let (mut product, overflow) = hi_prod.overflowing_add(lo_prod);
213
214
0
        let mut result = DyadicFloat128 {
215
0
            sign,
216
0
            exponent: self.exponent + 64 - m as i16,
217
0
            mantissa: product,
218
0
        };
219
220
0
        if overflow {
221
0
            // Overflow means an implicit bit in the 129th place, which we shift down.
222
0
            product += product & 0x1;
223
0
            result.mantissa = (product >> 1) | (1u128 << 127);
224
0
            result.shift_right(1);
225
0
        }
226
227
0
        result.normalize();
228
0
        result
229
0
    }
230
231
    #[inline]
232
0
    fn shift_right(&mut self, amount: u32) {
233
0
        if amount < BITS {
234
0
            self.exponent += amount as i16;
235
0
            self.mantissa = self.mantissa.wrapping_shr(amount);
236
0
        } else {
237
0
            self.exponent = 0;
238
0
            self.mantissa = 0;
239
0
        }
240
0
    }
241
242
    #[inline]
243
0
    fn shift_left(&mut self, amount: u32) {
244
0
        if amount < BITS {
245
0
            self.exponent -= amount as i16;
246
0
            self.mantissa = self.mantissa.wrapping_shl(amount);
247
0
        } else {
248
0
            self.exponent = 0;
249
0
            self.mantissa = 0;
250
0
        }
251
0
    }
252
253
    // Don't forget to call if manually created
254
    #[inline]
255
0
    pub(crate) const fn normalize(&mut self) {
256
0
        if self.mantissa != 0 {
257
0
            let shift_length = self.mantissa.leading_zeros();
258
0
            self.exponent -= shift_length as i16;
259
0
            self.mantissa = self.mantissa.wrapping_shl(shift_length);
260
0
        }
261
0
    }
262
263
    #[inline]
264
0
    pub(crate) fn negated(&self) -> Self {
265
0
        Self {
266
0
            sign: self.sign.negate(),
267
0
            exponent: self.exponent,
268
0
            mantissa: self.mantissa,
269
0
        }
270
0
    }
271
272
    #[inline]
273
0
    pub(crate) fn quick_sub(&self, rhs: &Self) -> Self {
274
0
        self.quick_add(&rhs.negated())
275
0
    }
276
277
    #[inline]
278
0
    pub(crate) fn quick_add(&self, rhs: &Self) -> Self {
279
0
        if self.mantissa == 0 {
280
0
            return *rhs;
281
0
        }
282
0
        if rhs.mantissa == 0 {
283
0
            return *self;
284
0
        }
285
0
        let mut a = *self;
286
0
        let mut b = *rhs;
287
288
0
        let exp_diff = a.exponent.wrapping_sub(b.exponent);
289
290
        // If exponent difference is too large, b is negligible
291
0
        if exp_diff.abs() >= BITS as i16 {
292
0
            return if a.sign == b.sign {
293
                // Adding very small number to large: return a
294
0
                return if a.exponent > b.exponent { a } else { b };
295
0
            } else if a.exponent > b.exponent {
296
0
                a
297
            } else {
298
0
                b
299
            };
300
0
        }
301
302
        // Align exponents
303
0
        if a.exponent > b.exponent {
304
0
            b.shift_right((a.exponent - b.exponent) as u32);
305
0
        } else if b.exponent > a.exponent {
306
0
            a.shift_right((b.exponent - a.exponent) as u32);
307
0
        }
308
309
0
        let mut result = DyadicFloat128::zero();
310
311
0
        if a.sign == b.sign {
312
            // Addition
313
0
            result.sign = a.sign;
314
0
            result.exponent = a.exponent;
315
0
            result.mantissa = a.mantissa;
316
0
            let (sum, is_overflow) = result.mantissa.overflowing_add(b.mantissa);
317
0
            result.mantissa = sum;
318
0
            if is_overflow {
319
0
                // Mantissa addition overflow.
320
0
                result.shift_right(1);
321
0
                result.mantissa |= 1u128 << 127;
322
0
            }
323
            // Result is already normalized.
324
0
            return result;
325
0
        }
326
327
        // Subtraction
328
0
        if a.mantissa >= b.mantissa {
329
0
            result.sign = a.sign;
330
0
            result.exponent = a.exponent;
331
0
            result.mantissa = a.mantissa.wrapping_sub(b.mantissa);
332
0
        } else {
333
0
            result.sign = b.sign;
334
0
            result.exponent = b.exponent;
335
0
            result.mantissa = b.mantissa.wrapping_sub(a.mantissa);
336
0
        }
337
338
0
        result.normalize();
339
0
        result
340
0
    }
341
342
    #[inline]
343
0
    pub(crate) fn quick_mul(&self, rhs: &Self) -> Self {
344
0
        let mut result = DyadicFloat128 {
345
0
            sign: if self.sign != rhs.sign {
346
0
                DyadicSign::Neg
347
            } else {
348
0
                DyadicSign::Pos
349
            },
350
0
            exponent: self.exponent + rhs.exponent + BITS as i16,
351
            mantissa: 0,
352
        };
353
354
0
        if !(self.mantissa == 0 || rhs.mantissa == 0) {
355
0
            result.mantissa = mulhi_u128(self.mantissa, rhs.mantissa);
356
            // Check the leading bit directly, should be faster than using clz in
357
            // normalize().
358
0
            if result.mantissa >> 127 == 0 {
359
0
                result.shift_left(1);
360
0
            }
361
0
        } else {
362
0
            result.mantissa = 0;
363
0
        }
364
0
        result
365
0
    }
366
367
    #[inline]
368
0
    pub(crate) fn fast_as_f64(&self) -> f64 {
369
0
        if self.mantissa == 0 {
370
0
            return if self.sign == DyadicSign::Pos {
371
0
                0.
372
            } else {
373
0
                -0.0
374
            };
375
0
        }
376
377
        // Assume that it is normalized, and output is also normal.
378
        const PRECISION: u32 = 52 + 1;
379
380
        // SIG_MASK - FRACTION_MASK
381
        const SIG_MASK: u64 = (1u64 << 52) - 1;
382
        const FRACTION_MASK: u64 = (1u64 << 52) - 1;
383
        const IMPLICIT_MASK: u64 = SIG_MASK - FRACTION_MASK;
384
        const EXP_BIAS: u64 = (1u64 << (11 - 1u64)) - 1u64;
385
386
0
        let mut exp_hi = self.exponent as i32 + ((BITS - 1) as i32 + EXP_BIAS as i32);
387
388
0
        if exp_hi > 2 * EXP_BIAS as i32 {
389
            // Results overflow.
390
0
            let d_hi = f64_from_parts(self.sign, 2 * EXP_BIAS, IMPLICIT_MASK);
391
            // volatile prevents constant propagation that would result in infinity
392
            // always being returned no matter the current rounding mode.
393
0
            let two = 2.0f64;
394
0
            let r = two * d_hi;
395
0
            return r;
396
0
        }
397
398
0
        let mut denorm = false;
399
0
        let mut shift = BITS - PRECISION;
400
0
        if exp_hi <= 0 {
401
0
            // Output is denormal.
402
0
            denorm = true;
403
0
            shift = (BITS - PRECISION) + (1 - exp_hi) as u32;
404
0
405
0
            exp_hi = EXP_BIAS as i32;
406
0
        }
407
408
0
        let exp_lo = exp_hi.wrapping_sub(PRECISION as i32).wrapping_sub(1);
409
410
0
        let m_hi = if shift >= BITS {
411
0
            0
412
        } else {
413
0
            self.mantissa >> shift
414
        };
415
416
0
        let d_hi = f64_from_parts(
417
0
            self.sign,
418
0
            exp_hi as u64,
419
0
            (m_hi as u64 & SIG_MASK) | IMPLICIT_MASK,
420
        );
421
422
0
        let round_mask = if shift > BITS {
423
0
            0
424
        } else {
425
0
            1u128.wrapping_shl(shift.wrapping_sub(1))
426
        };
427
0
        let sticky_mask = round_mask.wrapping_sub(1u128);
428
429
0
        let round_bit = (self.mantissa & round_mask) != 0;
430
0
        let sticky_bit = (self.mantissa & sticky_mask) != 0;
431
0
        let round_and_sticky = round_bit as i32 * 2 + sticky_bit as i32;
432
433
        let d_lo: f64;
434
435
0
        if exp_lo <= 0 {
436
            // d_lo is denormal, but the output is normal.
437
0
            let scale_up_exponent = 1 - exp_lo;
438
0
            let scale_up_factor = f64_from_parts(
439
0
                DyadicSign::Pos,
440
0
                EXP_BIAS + scale_up_exponent as u64,
441
                IMPLICIT_MASK,
442
            );
443
0
            let scale_down_factor = f64_from_parts(
444
0
                DyadicSign::Pos,
445
0
                EXP_BIAS - scale_up_exponent as u64,
446
                IMPLICIT_MASK,
447
            );
448
449
0
            d_lo = f64_from_parts(
450
0
                self.sign,
451
0
                (exp_lo + scale_up_exponent) as u64,
452
0
                IMPLICIT_MASK,
453
0
            );
454
455
0
            return f_fmla(d_lo, round_and_sticky as f64, d_hi * scale_up_factor)
456
0
                * scale_down_factor;
457
0
        }
458
459
0
        d_lo = f64_from_parts(self.sign, exp_lo as u64, IMPLICIT_MASK);
460
461
        // Still correct without FMA instructions if `d_lo` is not underflow.
462
0
        let r = f_fmla(d_lo, round_and_sticky as f64, d_hi);
463
464
0
        if denorm {
465
            const SIG_LEN: u64 = 52;
466
            // Exponent before rounding is in denormal range, simply clear the
467
            // exponent field.
468
0
            let clear_exp: u64 = (exp_hi as u64) << SIG_LEN;
469
0
            let mut r_bits: u64 = r.to_bits() - clear_exp;
470
471
0
            if r_bits & EXP_MASK == 0 {
472
0
                // Output is denormal after rounding, clear the implicit bit for 80-bit
473
0
                // long double.
474
0
                r_bits -= IMPLICIT_MASK;
475
0
            }
476
477
0
            return f64::from_bits(r_bits);
478
0
        }
479
480
0
        r
481
0
    }
482
483
    // Approximate reciprocal - given a nonzero `a`, make a good approximation to 1/a.
484
    // The method is Newton-Raphson iteration, based on quick_mul.
485
    #[inline]
486
0
    pub(crate) fn reciprocal(self) -> DyadicFloat128 {
487
        // Computes the reciprocal using Newton-Raphson iteration:
488
        // Given an approximation x ≈ 1/a, we refine via:
489
        //     x' = x * (2 - a * x)
490
        // This squares the error term: if ax ≈ 1 - e, then ax' ≈ 1 - e².
491
492
0
        let guess = 1. / self.fast_as_f64();
493
0
        let mut x = DyadicFloat128::new_from_f64(guess);
494
495
        // The constant 2, which we'll need in every iteration
496
0
        let twos = DyadicFloat128 {
497
0
            sign: DyadicSign::Pos,
498
0
            exponent: -126,
499
0
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
500
0
        };
501
502
0
        x = x * (twos - (self * x));
503
0
        x = x * (twos - (self * x));
504
0
        x
505
0
    }
506
507
    // // Approximate reciprocal - given a nonzero `a`, make a good approximation to 1/a.
508
    // // The method is Newton-Raphson iteration, based on quick_mul.
509
    // // *This is very crude guess*
510
    // #[inline]
511
    // fn approximate_reciprocal(&self) -> DyadicFloat128 {
512
    //     // Given an approximation x to 1/a, a better one is x' = x(2-ax).
513
    //     //
514
    //     // You can derive this by using the Newton-Raphson formula with the function
515
    //     // f(x) = 1/x - a. But another way to see that it works is to say: suppose
516
    //     // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) =
517
    //     // 1-e^2. So the error in x' is the square of the error in x, i.e. the number
518
    //     // of correct bits in x' is double the number in x.
519
    //
520
    //     // An initial approximation to the reciprocal
521
    //     let mut x = DyadicFloat128 {
522
    //         sign: DyadicSign::Pos,
523
    //         exponent: -32 - self.exponent - BITS as i16,
524
    //         mantissa: self.mantissa >> (BITS - 32),
525
    //     };
526
    //     x.normalize();
527
    //
528
    //     // The constant 2, which we'll need in every iteration
529
    //     let two = DyadicFloat128::new(DyadicSign::Pos, 1, 1);
530
    //
531
    //     // We expect at least 31 correct bits from our 32-bit starting approximation
532
    //     let mut ok_bits = 31usize;
533
    //
534
    //     // The number of good bits doubles in each iteration, except that rounding
535
    //     // errors introduce a little extra each time. Subtract a bit from our
536
    //     // accuracy assessment to account for that.
537
    //     while ok_bits < BITS as usize {
538
    //         x = x * (two - (*self * x));
539
    //         ok_bits = 2 * ok_bits - 1;
540
    //     }
541
    //
542
    //     x
543
    // }
544
}
545
546
impl Add<DyadicFloat128> for DyadicFloat128 {
547
    type Output = DyadicFloat128;
548
    #[inline]
549
0
    fn add(self, rhs: DyadicFloat128) -> Self::Output {
550
0
        self.quick_add(&rhs)
551
0
    }
552
}
553
554
impl DyadicFloat128 {
555
    #[inline]
556
0
    pub(crate) fn biased_exponent(&self) -> i16 {
557
0
        self.exponent + (BITS as i16 - 1)
558
0
    }
559
560
    #[inline]
561
0
    pub(crate) fn trunc_to_i64(&self) -> i64 {
562
0
        if self.exponent <= -(BITS as i16) {
563
            // Absolute value of x is greater than equal to 0.5 but less than 1.
564
0
            return 0;
565
0
        }
566
0
        let hi = self.mantissa >> 64;
567
0
        let norm_exp = self.biased_exponent();
568
0
        if norm_exp > 63 {
569
0
            return if self.sign == DyadicSign::Neg {
570
0
                i64::MIN
571
            } else {
572
0
                i64::MAX
573
            };
574
0
        }
575
0
        let r: i64 = (hi >> (63 - norm_exp)) as i64;
576
577
0
        if self.sign == DyadicSign::Neg { -r } else { r }
578
0
    }
579
580
    #[inline]
581
0
    pub(crate) fn round_to_nearest(&self) -> DyadicFloat128 {
582
0
        if self.exponent == -(BITS as i16) {
583
            // Absolute value of x is greater than equal to 0.5 but less than 1.
584
0
            return DyadicFloat128 {
585
0
                sign: self.sign,
586
0
                exponent: -(BITS as i16 - 1),
587
0
                mantissa: 0x80000000_00000000_00000000_00000000_u128,
588
0
            };
589
0
        }
590
0
        if self.exponent <= -((BITS + 1) as i16) {
591
            // Absolute value of x is greater than equal to 0.5 but less than 1.
592
0
            return DyadicFloat128 {
593
0
                sign: self.sign,
594
0
                exponent: 0,
595
0
                mantissa: 0u128,
596
0
            };
597
0
        }
598
        const FRACTION_LENGTH: u32 = BITS - 1;
599
0
        let trim_size =
600
0
            (FRACTION_LENGTH as i64).wrapping_sub(self.exponent as i64 + (BITS - 1) as i64) as u128;
601
0
        let half_bit_set =
602
0
            self.mantissa & (1u128.wrapping_shl(trim_size.wrapping_sub(1) as u32)) != 0;
603
0
        let trunc_u: u128 = self
604
0
            .mantissa
605
0
            .wrapping_shr(trim_size as u32)
606
0
            .wrapping_shl(trim_size as u32);
607
0
        if trunc_u == self.mantissa {
608
0
            return *self;
609
0
        }
610
611
0
        let truncated = DyadicFloat128::new(self.sign, self.exponent, trunc_u);
612
613
0
        if !half_bit_set {
614
            // Franctional part is less than 0.5 so round value is the
615
            // same as the trunc value.
616
0
            truncated
617
0
        } else if self.sign == DyadicSign::Neg {
618
0
            let ones = DyadicFloat128 {
619
0
                sign: DyadicSign::Pos,
620
0
                exponent: -(BITS as i16 - 1),
621
0
                mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128,
622
0
            };
623
0
            truncated - ones
624
        } else {
625
0
            let ones = DyadicFloat128 {
626
0
                sign: DyadicSign::Pos,
627
0
                exponent: -(BITS as i16 - 1),
628
0
                mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128,
629
0
            };
630
0
            truncated + ones
631
        }
632
0
    }
633
634
    #[inline]
635
0
    pub(crate) fn round_to_nearest_f64(&self) -> f64 {
636
0
        self.round_to_nearest().fast_as_f64()
637
0
    }
638
}
639
640
impl Sub<DyadicFloat128> for DyadicFloat128 {
641
    type Output = DyadicFloat128;
642
    #[inline]
643
0
    fn sub(self, rhs: DyadicFloat128) -> Self::Output {
644
0
        self.quick_sub(&rhs)
645
0
    }
646
}
647
648
impl Mul<DyadicFloat128> for DyadicFloat128 {
649
    type Output = DyadicFloat128;
650
    #[inline]
651
0
    fn mul(self, rhs: DyadicFloat128) -> Self::Output {
652
0
        self.quick_mul(&rhs)
653
0
    }
654
}
655
656
#[cfg(test)]
657
mod tests {
658
    use super::*;
659
660
    #[test]
661
    fn test_dyadic_float() {
662
        let ones = DyadicFloat128 {
663
            sign: DyadicSign::Pos,
664
            exponent: -127,
665
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
666
        };
667
        let cvt = ones.fast_as_f64();
668
        assert_eq!(cvt, 1.0);
669
670
        let minus_0_5 = DyadicFloat128 {
671
            sign: DyadicSign::Neg,
672
            exponent: -128,
673
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
674
        };
675
        let cvt0 = minus_0_5.fast_as_f64();
676
        assert_eq!(cvt0, -1.0 / 2.0);
677
678
        let minus_1_f4 = DyadicFloat128 {
679
            sign: DyadicSign::Neg,
680
            exponent: -132,
681
            mantissa: 0xaaaaaaaa_aaaaaaaa_aaaaaaaa_aaaaaaab_u128,
682
        };
683
        let cvt0 = minus_1_f4.fast_as_f64();
684
        assert_eq!(cvt0, -1.0 / 24.0);
685
686
        let minus_1_f8 = DyadicFloat128 {
687
            sign: DyadicSign::Pos,
688
            exponent: -143,
689
            mantissa: 0xd00d00d0_0d00d00d_00d00d00_d00d00d0_u128,
690
        };
691
        let cvt0 = minus_1_f8.fast_as_f64();
692
        assert_eq!(cvt0, 1.0 / 40320.0);
693
    }
694
695
    #[test]
696
    fn dyadic_float_add() {
697
        let ones = DyadicFloat128 {
698
            sign: DyadicSign::Pos,
699
            exponent: -127,
700
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
701
        };
702
703
        let cvt = ones.fast_as_f64();
704
        assert_eq!(cvt, 1.0);
705
706
        let minus_0_5 = DyadicFloat128 {
707
            sign: DyadicSign::Neg,
708
            exponent: -128,
709
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
710
        };
711
        let cvt0 = ones.quick_add(&minus_0_5).fast_as_f64();
712
        assert_eq!(cvt0, 0.5);
713
    }
714
715
    #[test]
716
    fn dyadic_float_mul() {
717
        let ones = DyadicFloat128 {
718
            sign: DyadicSign::Pos,
719
            exponent: -127,
720
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
721
        };
722
723
        let cvt = ones.fast_as_f64();
724
        assert_eq!(cvt, 1.0);
725
726
        let minus_0_5 = DyadicFloat128 {
727
            sign: DyadicSign::Neg,
728
            exponent: -128,
729
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
730
        };
731
        let product = ones.quick_mul(&minus_0_5);
732
        let cvt0 = product.fast_as_f64();
733
        assert_eq!(cvt0, -0.5);
734
735
        let twos = DyadicFloat128 {
736
            sign: DyadicSign::Pos,
737
            exponent: -126,
738
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
739
        };
740
741
        let cvt = twos.fast_as_f64();
742
        assert_eq!(cvt, 2.0);
743
    }
744
745
    #[test]
746
    fn dyadic_round_trip() {
747
        let z00 = 0.0;
748
        let zvt00 = DyadicFloat128::new_from_f64(z00);
749
        let b00 = zvt00.fast_as_f64();
750
        assert_eq!(b00, z00);
751
752
        let zvt000 = DyadicFloat128 {
753
            sign: DyadicSign::Pos,
754
            exponent: 0,
755
            mantissa: 0,
756
        };
757
        let b000 = zvt000.fast_as_f64();
758
        assert_eq!(b000, z00);
759
760
        let z0 = 1.0;
761
        let zvt0 = DyadicFloat128::new_from_f64(z0);
762
        let b0 = zvt0.fast_as_f64();
763
        assert_eq!(b0, z0);
764
765
        let z1 = 0.5;
766
        let zvt1 = DyadicFloat128::new_from_f64(z1);
767
        let b1 = zvt1.fast_as_f64();
768
        assert_eq!(b1, z1);
769
770
        let z2 = -0.5;
771
        let zvt2 = DyadicFloat128::new_from_f64(z2);
772
        let b2 = zvt2.fast_as_f64();
773
        assert_eq!(b2, z2);
774
775
        let z3 = -532322.54324324232;
776
        let zvt3 = DyadicFloat128::new_from_f64(z3);
777
        let b3 = zvt3.fast_as_f64();
778
        assert_eq!(b3, z3);
779
    }
780
781
    #[test]
782
    fn dyadic_float_reciprocal() {
783
        let ones = DyadicFloat128 {
784
            sign: DyadicSign::Pos,
785
            exponent: -127,
786
            mantissa: 0x80000000_00000000_00000000_00000000_u128,
787
        }
788
        .reciprocal();
789
790
        let cvt = ones.fast_as_f64();
791
        assert_eq!(cvt, 1.0);
792
793
        let minus_0_5 = DyadicFloat128::new_from_f64(4.).reciprocal();
794
        let cvt0 = minus_0_5.fast_as_f64();
795
        assert_eq!(cvt0, 0.25);
796
    }
797
798
    #[test]
799
    fn dyadic_float_from_div() {
800
        let from_div = DyadicFloat128::from_div_f64(1.0, 4.0);
801
        let cvt = from_div.fast_as_f64();
802
        assert_eq!(cvt, 0.25);
803
    }
804
805
    #[test]
806
    fn dyadic_float_accurate_reciprocal() {
807
        let from_div = DyadicFloat128::accurate_reciprocal(4.0);
808
        let cvt = from_div.fast_as_f64();
809
        assert_eq!(cvt, 0.25);
810
    }
811
812
    #[test]
813
    fn dyadic_float_mul_int() {
814
        let from_div = DyadicFloat128::new_from_f64(4.0);
815
        let m1 = from_div.mul_int64(-2);
816
        assert_eq!(m1.fast_as_f64(), -8.0);
817
818
        let from_div = DyadicFloat128::new_from_f64(-4.0);
819
        let m1 = from_div.mul_int64(-2);
820
        assert_eq!(m1.fast_as_f64(), 8.0);
821
822
        let from_div = DyadicFloat128::new_from_f64(2.5);
823
        let m1 = from_div.mul_int64(2);
824
        assert_eq!(m1.fast_as_f64(), 5.0);
825
    }
826
827
    #[test]
828
    fn dyadic_float_round() {
829
        let from_div = DyadicFloat128::new_from_f64(2.5);
830
        let m1 = from_div.round_to_nearest_f64();
831
        assert_eq!(m1, 3.0);
832
833
        let from_div = DyadicFloat128::new_from_f64(0.5);
834
        let m1 = from_div.round_to_nearest_f64();
835
        assert_eq!(m1, 1.0);
836
837
        let from_div = DyadicFloat128::new_from_f64(-0.5);
838
        let m1 = from_div.round_to_nearest_f64();
839
        assert_eq!(m1, -1.0);
840
841
        let from_div = DyadicFloat128::new_from_f64(-0.351);
842
        let m1 = from_div.round_to_nearest_f64();
843
        assert_eq!(m1, (-0.351f64).round());
844
845
        let from_div = DyadicFloat128::new_from_f64(0.351);
846
        let m1 = from_div.round_to_nearest_f64();
847
        assert_eq!(m1, 0.351f64.round());
848
849
        let z00 = 25.6;
850
        let zvt00 = DyadicFloat128::new_from_f64(z00);
851
        let b00 = zvt00.round_to_nearest_f64();
852
        assert_eq!(b00, 26.);
853
    }
854
855
    #[test]
856
    fn dyadic_int_trunc() {
857
        let from_div = DyadicFloat128::new_from_f64(-2.5);
858
        let m1 = from_div.trunc_to_i64();
859
        assert_eq!(m1, -2);
860
861
        let from_div = DyadicFloat128::new_from_f64(2.5);
862
        let m1 = from_div.trunc_to_i64();
863
        assert_eq!(m1, 2);
864
865
        let from_div = DyadicFloat128::new_from_f64(0.5);
866
        let m1 = from_div.trunc_to_i64();
867
        assert_eq!(m1, 0);
868
869
        let from_div = DyadicFloat128::new_from_f64(-0.5);
870
        let m1 = from_div.trunc_to_i64();
871
        assert_eq!(m1, 0);
872
873
        let from_div = DyadicFloat128::new_from_f64(-0.351);
874
        let m1 = from_div.trunc_to_i64();
875
        assert_eq!(m1, 0);
876
877
        let from_div = DyadicFloat128::new_from_f64(0.351);
878
        let m1 = from_div.trunc_to_i64();
879
        assert_eq!(m1, 0);
880
    }
881
}