/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.25/src/pow_exec.rs
Line | Count | Source |
1 | | /* |
2 | | * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved. |
3 | | * // |
4 | | * // Redistribution and use in source and binary forms, with or without modification, |
5 | | * // are permitted provided that the following conditions are met: |
6 | | * // |
7 | | * // 1. Redistributions of source code must retain the above copyright notice, this |
8 | | * // list of conditions and the following disclaimer. |
9 | | * // |
10 | | * // 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | * // this list of conditions and the following disclaimer in the documentation |
12 | | * // and/or other materials provided with the distribution. |
13 | | * // |
14 | | * // 3. Neither the name of the copyright holder nor the names of its |
15 | | * // contributors may be used to endorse or promote products derived from |
16 | | * // this software without specific prior written permission. |
17 | | * // |
18 | | * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
19 | | * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
22 | | * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | | * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
25 | | * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
26 | | * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | use crate::common::dd_fmla; |
30 | | use crate::double_double::DoubleDouble; |
31 | | use crate::dyadic_float::{DyadicFloat128, DyadicSign}; |
32 | | use crate::exponents::{EXP_REDUCE_T0, EXP_REDUCE_T1, ldexp}; |
33 | | use crate::exponents::{EXPM1_T0, EXPM1_T1}; |
34 | | use crate::polyeval::f_polyeval6; |
35 | | use crate::pow_tables::{EXP_T1_2_DYADIC, EXP_T2_2_DYADIC, POW_INVERSE, POW_LOG_INV}; |
36 | | use crate::rounding::CpuRound; |
37 | | use crate::rounding::CpuRoundTiesEven; |
38 | | |
#[inline(always)]
/// Returns a double-double approximation of `log(1+z) - z` for
/// |z| <= 0.0040283203125 (the reduced argument produced by `pow_log_1`).
/// The dominant `-z^2/2` term is carried in `hi`; the degree-3..8
/// polynomial tail (times z^3) is folded into `lo`.
pub(crate) fn log_poly_1(z: f64) -> DoubleDouble {
    /* The following is a degree-8 polynomial generated by Sollya for
    log(1+x)-x+x^2/2 over [-0.0040283203125,0.0040283203125]
    with absolute error < 2^-81.63
    and relative error < 2^-72.423 (see sollya/P_1.sollya).
    The relative error is for x - x^2/2 + P(x) with respect to log(1+x). */
    const P_1: [u64; 6] = [
        0x3fd5555555555558, // ~ 1/3  (degree-3 coefficient)
        0xbfd0000000000003, // ~ -1/4 (degree-4)
        0x3fc999999981f535, // ~ 1/5  (degree-5)
        0xbfc55555553d1eb4, // ~ -1/6 (degree-6)
        0x3fc2494526fd4a06, // ~ 1/7  (degree-7)
        0xbfc0001f0c80e8ce, // ~ -1/8 (degree-8)
    ];
    // w = z^2 split exactly into hi + lo.
    let w = DoubleDouble::from_exact_mult(z, z);
    // Estrin-style pairing on z followed by Horner steps on z^2; the exact
    // FMA sequencing below is what the Sollya error bound was derived for.
    let t = dd_fmla(f64::from_bits(P_1[5]), z, f64::from_bits(P_1[4]));
    let mut u = dd_fmla(f64::from_bits(P_1[3]), z, f64::from_bits(P_1[2]));
    let mut v = dd_fmla(f64::from_bits(P_1[1]), z, f64::from_bits(P_1[0]));
    u = dd_fmla(t, w.hi, u);
    v = dd_fmla(u, w.hi, v);
    // u = z^2 * (P0 + P1*z + ... + P5*z^5), i.e. the tail divided by z.
    u = v * w.hi;
    // new(lo, hi): hi = -z^2/2 (main term), lo = z^3*poly - z^2-rounding fixup.
    DoubleDouble::new(dd_fmla(u, z, -0.5 * w.lo), -0.5 * w.hi)
}
63 | | |
/* Given 2^-1074 <= x <= 0x1.fffffffffffffp+1023, this routine puts in h+l
   an approximation of log(x) such that |l| < 2^-23.89*|h| and

   | h + l - log(x) | <= elog * |log x|

   with elog = 2^-73.527 if x < 1/sqrt(2) or sqrt(2) < x,
   and elog = 2^-67.0544 if 1/sqrt(2) < x < sqrt(2)
   (note that x cannot equal 1/sqrt(2) nor sqrt(2)).

   The returned bool is true only when the unbiased exponent ends up 0
   (i.e. 1/sqrt(2) < x < sqrt(2)) AND the low word is large relative to the
   high word; in that case the pair has been renormalized with an exact add.
   Callers presumably use this flag to detect the harder near-1 case —
   inferred from the e == 0 test below, confirm against call sites. */
#[inline]
pub(crate) fn pow_log_1(x: f64) -> (DoubleDouble, bool) {
    /* for 181 <= i <= 362, r[i] = _INVERSE[i-181] is a 9-bit approximation of
    1/x[i], where i*2^-8 <= x[i] < (i+1)*2^-8.
    More precisely r[i] is a 9-bit value such that r[i]*y-1 is representable
    exactly on 53 bits for for any y, i*2^-8 <= y < (i+1)*2^-8.
    Moreover |r[i]*y-1| < 0.0040283203125.
    Table generated with the accompanying pow.sage file,
    with l=inverse_centered(k=8,prec=9,maxbits=53,verbose=false) */
    let x_u = x.to_bits();
    // m = 52-bit mantissa field, e = 11-bit biased exponent field.
    let mut m = x_u & 0xfffffffffffff;
    let mut e: i64 = ((x_u >> 52) & 0x7ff) as i64;

    let t;
    if e != 0 {
        // Normal number: t is the significand with exponent forced to 0x3ff
        // (so 1 <= t < 2), m gets the implicit leading bit, e is unbiased.
        t = m | (0x3ffu64 << 52);
        m = m.wrapping_add(1u64 << 52);
        e -= 0x3ff;
    } else {
        /* x is a subnormal double */
        // Normalize: shift the leading 1 of the mantissa up to bit 52.
        let k = m.leading_zeros() - 11;

        e = -0x3fei64 - k as i64;
        m = m.wrapping_shl(k);
        t = m | (0x3ffu64 << 52);
    }

    /* now |x| = 2^_e*_t = 2^(_e-52)*m with 1 <= _t < 2,
    and 2^52 <= _m < 2^53 */

    // log(x) = log(t) + E · log(2)
    let mut t = f64::from_bits(t);

    // If m > sqrt(2) we divide it by 2 so ensure 1/sqrt(2) < t < sqrt(2)
    // (0x16a09e667f3bcd is sqrt(2) scaled by 2^52).
    let c: usize = (m >= 0x16a09e667f3bcd) as usize;
    static CY: [f64; 2] = [1.0, 0.5];
    static CM: [u64; 2] = [44, 45];

    e = e.wrapping_add(c as i64);
    let be = e;
    let i = m >> CM[c]; /* i/2^8 <= t < (i+1)/2^8 */
    /* when c=1, we have 0x16a09e667f3bcd <= m < 2^53, thus 90 <= i <= 127;
    when c=0, we have 2^52 <= m < 0x16a09e667f3bcd, thus 128 <= i <= 181 */
    t *= CY[c];
    /* now 0x1.6a09e667f3bcdp-1 <= t < 0x1.6a09e667f3bcdp+0,
    and log(x) = E * log(2) + log(t) */

    // r ~ 1/t from the 9-bit inverse table; (l2, l1) is -log(r) in
    // double-double form from the companion log table.
    let r = f64::from_bits(POW_INVERSE[(i - 181) as usize]);
    let l1 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].1);
    let l2 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].0);

    // z = r*t - 1, exact by construction of the table (|z| < 0.0040283203125).
    let z = dd_fmla(r, t, -1.0);

    const LOG2_DD: DoubleDouble = DoubleDouble::new(
        f64::from_bits(0x3d2ef35793c76730),
        f64::from_bits(0x3fe62e42fefa3800),
    );

    // th + tl ~ be*log(2) + log(1/r), accumulated with FMAs.
    let th = dd_fmla(be as f64, LOG2_DD.hi, l1);
    let tl = dd_fmla(be as f64, LOG2_DD.lo, l2);

    // log(x) ~ th + tl + z + (log(1+z) - z); the last term comes from the
    // polynomial correction below.
    let mut v = DoubleDouble::f64_add(th, DoubleDouble::new(tl, z));
    let p = log_poly_1(z);
    v = DoubleDouble::f64_add(v.hi, DoubleDouble::new(v.lo + p.lo, p.hi));

    // 0x3e70000000000000 is 2^-24: when x is in the [1/sqrt(2), sqrt(2)]
    // binade and the low word is unusually large, renormalize and flag it.
    if e == 0 && v.lo.abs() > (v.hi.abs()) * f64::from_bits(0x3e70000000000000) {
        v = DoubleDouble::from_exact_add(v.hi, v.lo);
        return (v, true);
    }

    (v, false)
}
145 | | |
/* Given z such that |z| < 2^-12.905,
   this routine puts in qh+ql an approximation of exp(z) such that

   | (qh+ql) / exp(z) - 1 | < 2^-64.902632

   and |ql| <= 2^-51.999.
*/
#[inline(always)]
fn exp_poly_1(z: f64) -> DoubleDouble {
    /* The following is a degree-4 polynomial generated by Sollya for exp(x)
    over [-2^-12.905,2^-12.905]
    with absolute error < 2^-74.34 (see sollya/Q_1.sollya). */
    const Q_1: [u64; 5] = [
        0x3ff0000000000000, // 1.0
        0x3ff0000000000000, // 1.0
        0x3fe0000000000000, // 0.5
        0x3fc5555555997996, // ~ 1/6
        0x3fa5555555849d8d, // ~ 1/24
    ];
    // Horner evaluation of (exp(z) - 1)/z in plain f64; the rounding errors
    // here are attenuated because the result is multiplied by tiny z below.
    let mut q = dd_fmla(f64::from_bits(Q_1[4]), z, f64::from_bits(Q_1[3]));
    q = dd_fmla(q, z, f64::from_bits(Q_1[2]));
    let h0 = dd_fmla(q, z, f64::from_bits(Q_1[1]));

    // z * h0 computed exactly as a double-double, then 1 + (z*h0) with an
    // accurate double-double add — this yields the stated relative error.
    let v1 = DoubleDouble::from_exact_mult(z, h0);
    DoubleDouble::f64_add(f64::from_bits(Q_1[0]), v1)
}
172 | | |
173 | | /* Given z such that |z| < 2^-12.905, |
174 | | this routine puts in qh+ql an approximation of exp(z) such that |
175 | | |
176 | | | (qh+ql) / exp(z) - 1 | < 2^-64.902632 |
177 | | |
178 | | and |ql| <= 2^-51.999. |
179 | | */ |
180 | | |
181 | | // #[inline(always)] |
182 | | // fn exp_poly_dd(z: DoubleDouble) -> DoubleDouble { |
183 | | // /* The following is a degree-4 polynomial generated by Sollya for exp(x) |
184 | | // over [-2^-12.905,2^-12.905] */ |
185 | | // const Q_1: [(u64, u64); 7] = [ |
186 | | // (0x0000000000000000, 0x3ff0000000000000), |
187 | | // (0x3a20e40000000000, 0x3ff0000000000000), |
188 | | // (0x3a04820000000000, 0x3fe0000000000000), |
189 | | // (0xbc756423c5338a66, 0x3fc5555555555556), |
190 | | // (0xbc5560f74db5556c, 0x3fa5555555555556), |
191 | | // (0x3c3648eca89bc6ac, 0x3f8111111144fbee), |
192 | | // (0xbbd53d924ae90c8c, 0x3f56c16c16ffeecc), |
193 | | // ]; |
194 | | // let mut p = DoubleDouble::mult(z, DoubleDouble::from_bit_pair(Q_1[6])); |
195 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[5])); |
196 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[4])); |
197 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[3])); |
198 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[2])); |
199 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[1])); |
200 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[0])); |
201 | | // p |
202 | | // } |
203 | | |
#[inline]
/// Fast-path approximation of `s * exp(r.hi + r.lo)` as a double-double,
/// where `s` is a sign factor (the callers pass +/-1). Out-of-range inputs
/// either saturate (certain overflow/underflow) or return NaN to signal
/// "defer to the slower, more accurate second phase".
pub(crate) fn pow_exp_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    // Thresholds on r.hi: below RHO0 certain underflow, RHO0..RHO1 deferred,
    // above RHO3 certain overflow, RHO2..RHO3 deferred.
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
    the result value el. However, it is not possible to obtain a lower
    bound on |el| from the input value rh, thus this modified value of RHO1
    is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
            For s=1 we return eh = el = DBL_MAX, which yields
            res_min = res_max = +Inf for rounding up or to nearest,
            and res_min = res_max = DBL_MAX for rounding down or toward zero,
            which will yield the correct rounding.
            For s=-1 we return eh = el = -DBL_MAX, which similarly gives
            res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
            which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
            where there might be overflow or not, thus we set eh = el = NaN,
            which will set res_min = res_max = NaN, the comparison
            res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
            thus res_min=+0 or -0, res_max=+0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=1 and rounding up, we have eh=+0, el=2^-1074,
            thus res_min = res_max = 2^-1074, which is the expected result too.
            For s=-1, we have eh=el=-0 except for rounding down,
            thus res_min=-0 or +0, res_max=-0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
            thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 = 2^12/log(2): argument reduction uses a 12-bit index split.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round();

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // z = zh + zl = r - k*log(2)/2^12, |z| small enough for exp_poly_1.
    let zh = dd_fmla(LOG2H, -k, r.hi);
    let zl = dd_fmla(LOG2L, -k, r.lo);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 4096*(mk - bias) + 64*i2 + i1:
    // exp(r) = 2^(bk>>12) * T0[i2] * T1[i1] * exp(z).
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(zh + zl);
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
    potential underflow/overflow cases at the beginning of this function */

    // Build s * 2^(mk - 1023) directly from bits and scale both words.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
291 | | |
#[inline]
/// Fast double-double approximation of `exp(r.hi + r.lo)`.
/// NOTE(review): unlike `pow_exp_1`/`pow_exp_dd` there is no range filtering
/// here — the caller presumably guarantees no overflow/underflow (the stale
/// "filtered out at the beginning of this function" comment below was copied
/// from those functions); confirm against call sites.
pub(crate) fn exp_dd_fast(r: DoubleDouble) -> DoubleDouble {
    // INVLOG2 = 2^12/log(2): 12-bit-index argument reduction.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round();

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // z = r - k*log(2)/2^12, renormalized so |z.lo| < ulp(z.hi).
    let mut z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);
    z = DoubleDouble::from_exact_add(z.hi, z.lo);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 4096*(mk - bias) + 64*i2 + i1:
    // exp(r) = 2^(bk>>12) * T0[i2] * T1[i1] * exp(z.hi) * exp(z.lo).
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    // exp(hi + lo) = exp(hi) * exp(lo)
    let q_hi = exp_poly_1(z.hi);
    // Taylor series exp(x) ~ 1 + x since z.lo < ulp(z.h)
    let q_lo = DoubleDouble::from_exact_add(1., z.lo);
    let q = DoubleDouble::quick_mult(q_hi, q_lo);
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
    potential underflow/overflow cases at the beginning of this function */

    // Scale by 2^(mk - 1023), built directly from the exponent bits.
    let du = (mk as u64).wrapping_shl(52);
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
326 | | |
#[inline]
/// Variant of `pow_exp_1` taking the reduced argument through a full
/// double-double reduction (`mul_f64_add`) instead of two separate FMAs.
/// Approximates `s * exp(r.hi + r.lo)`; same range thresholds and saturate/
/// defer-to-2nd-phase behavior as `pow_exp_1`.
pub(crate) fn pow_exp_dd(r: DoubleDouble, s: f64) -> DoubleDouble {
    // Thresholds on r.hi: below RHO0 certain underflow, RHO0..RHO1 deferred,
    // above RHO3 certain overflow, RHO2..RHO3 deferred.
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
    the result value el. However, it is not possible to obtain a lower
    bound on |el| from the input value rh, thus this modified value of RHO1
    is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
            For s=1 we return eh = el = DBL_MAX, which yields
            res_min = res_max = +Inf for rounding up or to nearest,
            and res_min = res_max = DBL_MAX for rounding down or toward zero,
            which will yield the correct rounding.
            For s=-1 we return eh = el = -DBL_MAX, which similarly gives
            res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
            which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
            where there might be overflow or not, thus we set eh = el = NaN,
            which will set res_min = res_max = NaN, the comparison
            res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
            thus res_min=+0 or -0, res_max=+0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=1 and rounding up, we have eh=+0, el=2^-1074,
            thus res_min = res_max = 2^-1074, which is the expected result too.
            For s=-1, we have eh=el=-0 except for rounding down,
            thus res_min=-0 or +0, res_max=-0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
            thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 = 2^12/log(2): 12-bit-index argument reduction.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round();

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // z = r - k*log(2)/2^12 as a double-double.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 4096*(mk - bias) + 64*i2 + i1:
    // exp(r) = 2^(bk>>12) * T0[i2] * T1[i1] * exp(z).
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(z.to_f64());
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
    potential underflow/overflow cases at the beginning of this function */

    // Build s * 2^(mk - 1023) directly from bits and scale both words.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
413 | | /* |
414 | | #[inline(always)] |
415 | | pub(crate) fn expm1_poly_dd(z: DoubleDouble) -> DoubleDouble { |
416 | | /* |
417 | | Sollya: |
418 | | pretty = proc(u) { |
419 | | return ~(floor(u*1000)/1000); |
420 | | }; |
421 | | |
422 | | d = [-2^-12.905,2^-12.905]; |
423 | | f = expm1(x); |
424 | | w = 1; |
425 | | pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, 107...|], d, absolute, floating); |
426 | | err_p = -log2(dirtyinfnorm(pf*w-f, d)); |
427 | | display = decimal; |
428 | | |
429 | | for i from 1 to degree(pf) do print(coeff(pf, i)); |
430 | | |
431 | | print (pf); |
432 | | display = decimal; |
433 | | print ("absolute error:",pretty(err_p)); |
434 | | f = 1; |
435 | | w = 1/expm1(x); |
436 | | err_p = -log2(dirtyinfnorm(pf*w-f, d)); |
437 | | print ("relative error:",pretty(err_p)); |
438 | | */ |
439 | | const Q: [(u64, u64); 7] = [ |
440 | | (0x0000000000000000, 0x3ff0000000000000), |
441 | | (0x0000000000000000, 0x3fe0000000000000), |
442 | | (0xbc75555554d7c48c, 0x3fc5555555555556), |
443 | | (0xbc555a40ffb472d9, 0x3fa5555555555556), |
444 | | (0x3c24866314c38093, 0x3f8111111111110e), |
445 | | (0x3be34665978dddb8, 0x3f56c16c16efac90), |
446 | | (0x3baeab43b813ef24, 0x3f2a01a1e12d253c), |
447 | | ]; |
448 | | let z2 = z * z; |
449 | | let z4 = z2 * z2; |
450 | | |
451 | | let b0 = DoubleDouble::quick_mul_add( |
452 | | z, |
453 | | DoubleDouble::from_bit_pair(Q[1]), |
454 | | DoubleDouble::from_bit_pair(Q[0]), |
455 | | ); |
456 | | let b1 = DoubleDouble::quick_mul_add( |
457 | | z, |
458 | | DoubleDouble::from_bit_pair(Q[3]), |
459 | | DoubleDouble::from_bit_pair(Q[2]), |
460 | | ); |
461 | | let b2 = DoubleDouble::quick_mul_add( |
462 | | z, |
463 | | DoubleDouble::from_bit_pair(Q[5]), |
464 | | DoubleDouble::from_bit_pair(Q[4]), |
465 | | ); |
466 | | |
467 | | let c0 = DoubleDouble::quick_mul_add(z2, b1, b0); |
468 | | let c1 = DoubleDouble::quick_mul_add(z2, DoubleDouble::from_bit_pair(Q[6]), b2); |
469 | | |
470 | | let p = DoubleDouble::quick_mul_add(z4, c1, c0); |
471 | | DoubleDouble::quick_mult(p, z) |
472 | | } |
473 | | */ |
#[inline(always)]
/// Double-double approximation of `expm1(z.hi + z.lo)` for the reduced
/// argument range |z| < 2^-12.905 (see the Sollya script referenced below).
pub(crate) fn expm1_poly_fast(z: DoubleDouble) -> DoubleDouble {
    // Polynomial generated by Sollya:
    // d = [-2^-12.905,2^-12.905];
    // f = expm1(x);
    // w = 1;
    // pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, D...|], d, absolute, floating);
    // See ./notes/compound_m1_expm1_fast.sollya
    // p = (expm1(z.hi) - z.hi)/z.hi^2, i.e. coefficients of degrees 2..7.
    // (The large negative last coefficient is a minimax artifact over the
    // tiny interval; its x^7 contribution is far below the target accuracy.)
    let p = f_polyeval6(
        z.hi,
        f64::from_bits(0x3fe0000000000000), // 1/2
        f64::from_bits(0x3fc5555555555555), // ~ 1/6
        f64::from_bits(0x3fa55555555553de), // ~ 1/24
        f64::from_bits(0x3f81111144995a9a), // ~ 1/120
        f64::from_bits(0x3f56c241f9a791c5), // ~ 1/720
        f64::from_bits(0xbfad9209c6d8b9e1),
    );
    // px = z.hi^2 * p, with the first product split exactly.
    let px = DoubleDouble::quick_mult_f64(DoubleDouble::from_exact_mult(z.hi, p), z.hi);
    // expm1(hi + lo) = expm1(hi) + expm1(lo)(1 + expm1(hi)) = expm1(hi) + expm1(lo)expm1(hi) + expm1(lo)
    // expm1(lo) ~ lo
    let expm1_hi = DoubleDouble::f64_add(z.hi, px);
    let mut lowest_part = DoubleDouble::quick_mult_f64(expm1_hi, z.lo);
    lowest_part = DoubleDouble::full_add_f64(lowest_part, z.lo);
    DoubleDouble::quick_dd_add(expm1_hi, lowest_part)
}
499 | | |
/// Double-double approximation of `expm1(z.hi + z.lo)` for |z.hi| < 2^-7,
/// evaluated entirely in double-double Horner form (degree 9).
/// Used by `pow_expm1_1` when no argument reduction is needed.
#[inline(always)]
pub(crate) fn expm1_poly_dd_tiny(z: DoubleDouble) -> DoubleDouble {
    // Polynomial generated in Sollya
    // d = [-2^-7,2^-7];
    // f = expm1(x);
    // w = 1;
    // pf = fpminimax(f, [|1,2,3,4,5,6,7,8,9|], [|1, 1, 107...|], d, absolute, floating);
    // See ./notes/compound_expm1_tiny.sollya
    // (lo, hi) bit pairs for the coefficients 1, 1/2, 1/6, ..., ~1/362880.
    const Q: [(u64, u64); 9] = [
        (0x0000000000000000, 0x3ff0000000000000),
        (0x0000000000000000, 0x3fe0000000000000),
        (0x3c6555564150ff16, 0x3fc5555555555555),
        (0x3c4586275c26f8a5, 0x3fa5555555555555),
        (0xbc19e6193ac658a6, 0x3f81111111111111),
        (0xbbf025e72dc21051, 0x3f56c16c16c1500a),
        (0x3bc2d641a7b7b9b8, 0x3f2a01a01a07dc46),
        (0xbb42cc8aaeeb3d00, 0x3efa01a29fef3e6f),
        (0x3b52b1589125ce82, 0x3ec71db6af553255),
    ];
    // Renormalize the input so |z.lo| < ulp(z.hi) before the Horner chain.
    let z = DoubleDouble::from_exact_add(z.hi, z.lo);
    // Horner from degree 9 down; the two lowest coefficients are exact f64
    // (1/2 and 1), so cheaper f64-add variants are used for those steps.
    let mut d = DoubleDouble::quick_mul_add(
        z,
        DoubleDouble::from_bit_pair(Q[8]),
        DoubleDouble::from_bit_pair(Q[7]),
    );
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[6]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[5]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[4]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[3]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[2]));
    d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3fe0000000000000));
    d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3ff0000000000000));
    // Final multiply by z gives expm1(z) (the polynomial has no constant term).
    DoubleDouble::quick_mult(d, z)
}
535 | | |
#[inline]
/// Approximation of `expm1` over a double-double argument `r`, with `s` a
/// sign factor as in `pow_exp_1`. Saturates on certain overflow, returns -1
/// (plus a directed-rounding nudge) deep in the underflow region, NaN in the
/// deferred regions, and otherwise computes
/// exp(r) - 1 = 2^ie * (tb*exp(z) - 2^-ie) via tables and `ldexp`.
pub(crate) fn pow_expm1_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    // Same thresholds as pow_exp_1 (comments below were carried over).
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
    the result value el. However, it is not possible to obtain a lower
    bound on |el| from the input value rh, thus this modified value of RHO1
    is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
            For s=1 we return eh = el = DBL_MAX, which yields
            res_min = res_max = +Inf for rounding up or to nearest,
            and res_min = res_max = DBL_MAX for rounding down or toward zero,
            which will yield the correct rounding.
            For s=-1 we return eh = el = -DBL_MAX, which similarly gives
            res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
            which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
            where there might be overflow or not, thus we set eh = el = NaN,
            which will set res_min = res_max = NaN, the comparison
            res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
            thus res_min=+0 or -0, res_max=+0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=1 and rounding up, we have eh=+0, el=2^-1074,
            thus res_min = res_max = 2^-1074, which is the expected result too.
            For s=-1, we have eh=el=-0 except for rounding down,
            thus res_min=-0 or +0, res_max=-0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
            thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            // expm1 variant: shift the exp() underflow result by -1.
            return DoubleDouble::full_add_f64(
                DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s),
                -1.0,
            );
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we return -1 */
            return DoubleDouble::new(0., -1.);
        };
    }

    let ax = r.hi.to_bits() & 0x7fffffffffffffffu64;

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    if ax <= 0x3f80000000000000 {
        // |x| < 2^-7
        if ax < 0x3970000000000000 {
            // |x| < 2^-104
            // expm1(r) ~ r to full double-double precision.
            return r;
        }
        // Small argument: direct double-double polynomial, no reduction.
        let d = expm1_poly_dd_tiny(r);
        return d;
    }

    // INVLOG2 = 2^12/log(2): 12-bit-index argument reduction.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round_ties_even();

    // z = r - k*log(2)/2^12.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    // tbh = T0[i2]*T1[i1], so exp(r) = 2^ie * tbh * exp(z) with ie below.
    let t0 = DoubleDouble::from_bit_pair(EXPM1_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXPM1_T1[i1 as usize]);
    let tbh = DoubleDouble::quick_mult(t1, t0);
    let mut de = tbh;
    // exp(k)=2^k*exp(r) + (2^k - 1)
    let q = expm1_poly_fast(z);
    de = DoubleDouble::quick_mult(de, q);
    // tbh*expm1(z) + tbh = tbh*exp(z): exp(r)/2^ie before subtracting 1.
    de = DoubleDouble::add(tbh, de);

    let ie = mk - 0x3ff;

    // Bit trick: (3071 - ie) << 52 encodes -2^-ie (the top bit of the
    // exponent pattern lands in the sign bit, leaving exponent 1023 - ie).
    // This is the "-1" of expm1 prescaled by 2^-ie, to be undone by ldexp.
    let off: f64 = f64::from_bits((2048i64 + 1023i64).wrapping_sub(ie).wrapping_shl(52) as u64);

    let e: f64;
    if ie < 53 {
        // |off| >= comparable to de.hi: add with off as the dominant term.
        let fhz = DoubleDouble::from_exact_add(off, de.hi);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else if ie < 104 {
        // de.hi dominates; off only perturbs the low bits.
        let fhz = DoubleDouble::from_exact_add(de.hi, off);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else {
        // 2^-ie is below the ~106-bit double-double precision: drop it.
        e = 0.;
    }
    de.lo += e;
    // Collapse to f64 and apply the 2^ie scaling in one rounding.
    de.hi = ldexp(de.to_f64(), ie as i32);
    de.lo = 0.;
    de
}
654 | | |
655 | 0 | fn exp_dyadic_poly(x: DyadicFloat128) -> DyadicFloat128 { |
656 | | const Q_2: [DyadicFloat128; 8] = [ |
657 | | DyadicFloat128 { |
658 | | sign: DyadicSign::Pos, |
659 | | exponent: -128, |
660 | | mantissa: 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffd0_u128, |
661 | | }, |
662 | | DyadicFloat128 { |
663 | | sign: DyadicSign::Pos, |
664 | | exponent: -127, |
665 | | mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0088_u128, |
666 | | }, |
667 | | DyadicFloat128 { |
668 | | sign: DyadicSign::Pos, |
669 | | exponent: -128, |
670 | | mantissa: 0x8000_0000_0000_0000_0000_000c_06f3_cd29_u128, |
671 | | }, |
672 | | DyadicFloat128 { |
673 | | sign: DyadicSign::Pos, |
674 | | exponent: -130, |
675 | | mantissa: 0xaaaa_aaaa_aaaa_aaaa_aaaa_aa6a_1e07_76ae_u128, |
676 | | }, |
677 | | DyadicFloat128 { |
678 | | sign: DyadicSign::Pos, |
679 | | exponent: -132, |
680 | | mantissa: 0xaaaa_aaaa_aaaa_aaa3_0000_0000_0000_0000_u128, |
681 | | }, |
682 | | DyadicFloat128 { |
683 | | sign: DyadicSign::Pos, |
684 | | exponent: -134, |
685 | | mantissa: 0x8888_8888_8888_8897_0000_0000_0000_0000_u128, |
686 | | }, |
687 | | DyadicFloat128 { |
688 | | sign: DyadicSign::Pos, |
689 | | exponent: -137, |
690 | | mantissa: 0xb60b_60b9_3214_6a54_0000_0000_0000_0000_u128, |
691 | | }, |
692 | | DyadicFloat128 { |
693 | | sign: DyadicSign::Pos, |
694 | | exponent: -140, |
695 | | mantissa: 0xd00d_00cd_9841_6862_0000_0000_0000_0000_u128, |
696 | | }, |
697 | | ]; |
698 | 0 | let mut p = Q_2[7]; |
699 | 0 | for i in (0..7).rev() { |
700 | 0 | p = x * p + Q_2[i]; |
701 | 0 | } |
702 | 0 | p |
703 | 0 | } |
704 | | |
/* put in r an approximation of exp(x), for |x| < 744.45,
   with relative error < 2^-121.70 */
#[inline]
pub(crate) fn exp_dyadic(x: DyadicFloat128) -> DyadicFloat128 {
    // ex is the exponent of x relative to the -127 normalization used by
    // DyadicFloat128; ex >= 10 means |x| is large enough that exp(x)
    // certainly overflows (x > 0) or underflows (x < 0).
    let ex = x.exponent + 127;
    if ex >= 10
    // underflow or overflow
    {
        // Return a saturated dyadic value: exponent pushed past the f64
        // range so conversion yields 0/Inf as appropriate. NOTE(review):
        // the mantissa is carried over from x — presumably only the
        // exponent matters to the caller's conversion; confirm.
        return DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: if x.sign == DyadicSign::Neg {
                -1076
            } else {
                1025
            },
            mantissa: x.mantissa,
        };
    }

    // LOG2_INV ~ 2^12/log(2): same 12-bit-index reduction as the f64 paths.
    const LOG2_INV: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -115,
        mantissa: 0xb8aa_3b29_5c17_f0bc_0000_0000_0000_0000_u128,
    };

    // LOG2 ~ log(2) to 128 bits.
    const LOG2: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -128,
        mantissa: 0xb172_17f7_d1cf_79ab_c9e3_b398_03f2_f6af_u128,
    };

    let mut bk = x * LOG2_INV;
    let k = bk.trunc_to_i64(); /* k = trunc(K) [rounded towards zero, exact] */
    /* The rounding error of mul_dint_int64() is bounded by 6 ulps, thus since
    |K| <= 4399162*log(2) < 3049267, the error on K is bounded by 2^-103.41.
    This error is divided by 2^12 below, thus yields < 2^-115.41. */
    // y = x - k*log(2)/2^12 (negate k*log2, scale by 2^-12, add to x).
    bk = LOG2.mul_int64(k);
    bk.exponent -= 12;
    bk.sign = bk.sign.negate();
    let y = x + bk;

    // Split k = 4096*bm + 64*i2 + i1; exp(x) = 2^bm * T1[i2] * T2[i1] * exp(y).
    let bm = k >> 12;
    let i2 = (k >> 6) & 0x3f;
    let i1 = k & 0x3f;
    let mut r = exp_dyadic_poly(y);
    r = EXP_T1_2_DYADIC[i2 as usize] * r;
    r = EXP_T2_2_DYADIC[i1 as usize] * r;
    r.exponent += bm as i16; /* exact */
    r
}
755 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::f_expm1;

    // exp(2.5) through the 128-bit dyadic path should round to the
    // correctly-rounded double value.
    #[test]
    fn test_log() {
        let input = DyadicFloat128::new_from_f64(2.5);
        let result = exp_dyadic(input).fast_as_f64();
        assert_eq!(result, 12.182493960703473);
    }

    // Smoke test: print pow_expm1_1 next to f_expm1 on a sample point
    // for manual comparison.
    #[test]
    fn test_exp() {
        let sample = 2.543543543543;
        let dd = pow_expm1_1(DoubleDouble::new(0., sample), 1.);
        println!("{}", dd.to_f64());
        println!("{}", f_expm1(sample));
    }
}