/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.28/src/tangent/atanpi.rs
Line | Count | Source |
1 | | /* |
2 | | * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved. |
3 | | * // |
4 | | * // Redistribution and use in source and binary forms, with or without modification, |
5 | | * // are permitted provided that the following conditions are met: |
6 | | * // |
7 | | * // 1. Redistributions of source code must retain the above copyright notice, this |
8 | | * // list of conditions and the following disclaimer. |
9 | | * // |
10 | | * // 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | * // this list of conditions and the following disclaimer in the documentation |
12 | | * // and/or other materials provided with the distribution. |
13 | | * // |
14 | | * // 3. Neither the name of the copyright holder nor the names of its |
15 | | * // contributors may be used to endorse or promote products derived from |
16 | | * // this software without specific prior written permission. |
17 | | * // |
18 | | * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
19 | | * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
22 | | * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | | * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
25 | | * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
26 | | * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | use crate::common::{dd_fmla, dyad_fmla, f_fmla}; |
30 | | use crate::double_double::DoubleDouble; |
31 | | use crate::shared_eval::poly_dd_3; |
32 | | use crate::tangent::atan::{ATAN_CIRCLE, ATAN_REDUCE}; |
33 | | |
/// 1/π in double-double precision.
/// NOTE: `DoubleDouble::new(lo, hi)` takes the low (correction) limb first —
/// 0x3fd45f306dc9c883 is the high part (≈ 0.318309886...), the other word is
/// the tail extending the value to ~106 bits.
const ONE_OVER_PI_DD: DoubleDouble = DoubleDouble::new(
    f64::from_bits(0xbc76b01ec5417056),
    f64::from_bits(0x3fd45f306dc9c883),
);

const ONE_OVER_3PI: f64 = f64::from_bits(0x3fbb2995e7b7b604); // approximates 1/(3pi)
40 | | |
/// atan(x)/pi for tiny inputs, |x| < 2^-54, where atanpi(x) ≈ x/pi.
///
/// Two families of hard-to-round inputs are answered explicitly before the
/// generic path. The `fma(±2^-600, 2^-600, c)` calls add a directed ±2^-1200
/// perturbation to the stored answer `c`, steering the final rounding (and
/// raising the inexact/underflow flags) in the required direction.
/// The generic path computes h = x * (1/pi).hi plus a correction term that is
/// scaled by 2^106 while being accumulated (to avoid intermediate underflow)
/// and scaled back by 2^-106 at the end.
fn atanpi_small(x: f64) -> f64 {
    if x == 0. {
        // Return x itself so the sign of zero is preserved.
        return x;
    }
    // Hard-to-round case #1: a specific subnormal |x|.
    if x.abs() == f64::from_bits(0x0015cba89af1f855) {
        return if x > 0. {
            f_fmla(
                // -2^-600 * 2^-600: nudge the constant result downward by 2^-1200.
                f64::from_bits(0x9a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x0006f00f7cd3a40b),
            )
        } else {
            f_fmla(
                // +2^-600 * 2^-600: nudge the (negative) result upward by 2^-1200.
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x8006f00f7cd3a40b),
            )
        };
    }
    // generic worst case
    let mut v = x.to_bits();
    if (v & 0xfffffffffffff) == 0x59af9a1194efe
    // +/-0x1.59af9a1194efe*2^e
    {
        let e = v >> 52;
        if (e & 0x7ff) > 2 {
            // Build the answer's bit pattern directly: same sign, exponent
            // lowered by 2 (÷4 ≈ ×1/pi in magnitude class), fixed mantissa.
            v = ((e - 2) << 52) | 0xb824198b94a89;
            return if x > 0. {
                dd_fmla(
                    f64::from_bits(0x9a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            } else {
                dd_fmla(
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            };
        }
    }
    let h = x * ONE_OVER_PI_DD.hi;
    /* Assuming h = x*ONE_OVER_PI_DD.hi - e, the correction term is
    e + x * ONE_OVER_PIL, but we need to scale values to avoid underflow. */
    // 0x4690000000000000 = 2^106: scale both products up so `corr` stays normal.
    let mut corr = dd_fmla(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.hi,
        -h * f64::from_bits(0x4690000000000000),
    );
    corr = dd_fmla(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.lo,
        corr,
    );
    // now return h + corr * 2^-106

    dyad_fmla(corr, f64::from_bits(0x3950000000000000), h)
}
100 | | |
/// Hardware-FMA twin of [`atanpi_small`]: identical algorithm and constants,
/// but every fused multiply-add goes through `f64::mul_add` so it compiles to
/// a native FMA instruction under the enabled target features.
///
/// # Safety
/// Caller must ensure the CPU supports AVX and FMA (checked at dispatch time
/// in `f_atanpi`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn atanpi_small_fma(x: f64) -> f64 {
    if x == 0. {
        // Preserve the sign of zero.
        return x;
    }
    // Hard-to-round case: see atanpi_small for the role of the ±2^-600
    // factors (a directed 2^-1200 perturbation of the stored constant).
    if x.abs() == f64::from_bits(0x0015cba89af1f855) {
        return if x > 0. {
            f64::mul_add(
                f64::from_bits(0x9a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x0006f00f7cd3a40b),
            )
        } else {
            f64::mul_add(
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x8006f00f7cd3a40b),
            )
        };
    }
    // generic worst case
    let mut v = x.to_bits();
    if (v & 0xfffffffffffff) == 0x59af9a1194efe
    // +/-0x1.59af9a1194efe*2^e
    {
        let e = v >> 52;
        if (e & 0x7ff) > 2 {
            // Construct the correctly-rounded answer's bit pattern directly.
            v = ((e - 2) << 52) | 0xb824198b94a89;
            return if x > 0. {
                f64::mul_add(
                    f64::from_bits(0x9a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            } else {
                f64::mul_add(
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            };
        }
    }
    let h = x * ONE_OVER_PI_DD.hi;
    /* Assuming h = x*ONE_OVER_PI_DD.hi - e, the correction term is
    e + x * ONE_OVER_PIL, but we need to scale values to avoid underflow. */
    // 2^106 scaling keeps the correction term away from the subnormal range.
    let mut corr = f64::mul_add(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.hi,
        -h * f64::from_bits(0x4690000000000000),
    );
    corr = f64::mul_add(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.lo,
        corr,
    );
    // now return h + corr * 2^-106

    f64::mul_add(corr, f64::from_bits(0x3950000000000000), h)
}
162 | | |
/* Deal with the case where |x| is large:
for x > 0, atanpi(x) = 1/2 - 1/pi * 1/x + 1/(3pi) * 1/x^3 + O(1/x^5)
for x < 0, atanpi(x) = -1/2 - 1/pi * 1/x + 1/(3pi) * 1/x^3 + O(1/x^5).
The next term 1/5*x^5/pi is smaller than 2^-107 * atanpi(x)
when |x| > 0x1.bep20. */
fn atanpi_asympt(x: f64) -> f64 {
    // ±1/2, the limit value; the series below subtracts the small tail.
    let h = f64::copysign(0.5, x);
    // approximate 1/x as yh + yl
    let rcy = DoubleDouble::from_quick_recip(x);
    let mut m = DoubleDouble::quick_mult(rcy, ONE_OVER_PI_DD);
    // m + l ~ 1/pi * 1/x
    m.hi = -m.hi;
    // Fold in the 1/(3pi) * 1/x^3 term (note -m.lo: the whole tail is negated).
    m.lo = dd_fmla(ONE_OVER_3PI * rcy.hi, rcy.hi * rcy.hi, -m.lo);
    // m + l ~ - 1/pi * 1/x + 1/(3pi) * 1/x^3
    let vh = DoubleDouble::from_exact_add(h, m.hi);
    m.hi = vh.hi;
    m = DoubleDouble::from_exact_add(vh.lo, m.lo);
    if m.hi.abs() == f64::from_bits(0x3c80000000000000) {
        // this is 1/2 ulp(atan(x))
        // Exact rounding-tie case: nudge m.hi by one bit in the direction
        // indicated by the residual m.lo so the final addition rounds correctly.
        m.hi = if m.hi * m.lo > 0. {
            f64::copysign(f64::from_bits(0x3c80000000000001), m.hi)
        } else {
            f64::copysign(f64::from_bits(0x3c7fffffffffffff), m.hi)
        };
    }
    vh.hi + m.hi
}
190 | | |
191 | | #[inline] |
192 | 0 | fn atanpi_tiny(x: f64) -> f64 { |
193 | 0 | let mut dx = DoubleDouble::quick_mult_f64(ONE_OVER_PI_DD, x); |
194 | 0 | dx.lo = dd_fmla(-ONE_OVER_3PI * x, x * x, dx.lo); |
195 | 0 | dx.to_f64() |
196 | 0 | } |
197 | | |
/// Slow, higher-precision fallback used when the fast path's rounding test is
/// inconclusive.
///
/// `x` is the original argument, `a` the fast-path approximation of atanpi(x)
/// (used only to pick the argument-reduction table index). The routine redoes
/// the reduction atan(x) = atan(ta) + atan((x-ta)/(1+x*ta)) in double-double
/// arithmetic, evaluates a degree-7 odd polynomial with double-double leading
/// coefficients, then multiplies by 1/pi.
#[cold]
fn as_atanpi_refine2(x: f64, a: f64) -> f64 {
    // Huge arguments: asymptotic series around ±1/2.
    if x.abs() > f64::from_bits(0x413be00000000000) {
        return atanpi_asympt(x);
    }
    // Very small arguments: two-term Taylor series.
    if x.abs() < f64::from_bits(0x3e4c700000000000) {
        return atanpi_tiny(x);
    }
    // Odd-polynomial coefficients for atan: (hi, lo) double-double pairs
    // (-1/3, 1/5, -1/7) ...
    const CH: [(u64, u64); 3] = [
        (0xbfd5555555555555, 0xbc75555555555555),
        (0x3fc999999999999a, 0xbc6999999999bcb8),
        (0xbfc2492492492492, 0xbc6249242093c016),
    ];
    // ... followed by plain-double tail coefficients (~1/9, -1/11, 1/13, -1/15).
    const CL: [u64; 4] = [
        0x3fbc71c71c71c71c,
        0xbfb745d1745d1265,
        0x3fb3b13b115bcbc4,
        0xbfb1107c41ad3253,
    ];
    // Map |a| (≈ atanpi(x)) to a reduction-table index i in [0, 128].
    let phi = ((a.abs()) * f64::from_bits(0x40545f306dc9c883) + 256.5).to_bits();
    let i: i64 = ((phi >> (52 - 8)) & 0xff) as i64;
    // dh = reduced argument (x - ta) / (1 + x*ta) in double-double.
    let dh: DoubleDouble = if i == 128 {
        // Near ±pi/2: the reduced argument degenerates to -1/x.
        DoubleDouble::from_quick_recip(-x)
    } else {
        let ta = f64::copysign(f64::from_bits(ATAN_REDUCE[i as usize].0), x);
        let dzta = DoubleDouble::from_exact_mult(x, ta);
        let zmta = x - ta;
        // Compensated computation of v = 1 + x*ta with error term ev.
        let v = 1. + dzta.hi;
        let d = 1. - v;
        let ev = (d + dzta.hi) - ((d + v) - 1.) + dzta.lo;
        // r ≈ 1/v; rl is the Newton-style correction making (rl, r) a
        // double-double reciprocal of (v + ev).
        let r = 1.0 / v;
        let rl = f_fmla(-ev, r, dd_fmla(r, -v, 1.0)) * r;
        DoubleDouble::quick_mult_f64(DoubleDouble::new(rl, r), zmta)
    };
    let d2 = DoubleDouble::quick_mult(dh, dh);
    let h4 = d2.hi * d2.hi;
    let h3 = DoubleDouble::quick_mult(dh, d2);

    // Tail polynomial in d2 (plain doubles suffice at this magnitude).
    let fl0 = f_fmla(d2.hi, f64::from_bits(CL[1]), f64::from_bits(CL[0]));
    let fl1 = f_fmla(d2.hi, f64::from_bits(CL[3]), f64::from_bits(CL[2]));

    let fl = d2.hi * f_fmla(h4, fl1, fl0);
    // Head polynomial in double-double, seeded with the tail value.
    let mut f = poly_dd_3(d2, CH, fl);
    f = DoubleDouble::quick_mult(h3, f);
    // ah + al + at will hold atan(x) as a triple-word sum.
    let (ah, mut al, mut at);
    if i == 0 {
        // No table term: atan(x) ≈ dh + f.
        ah = dh.hi;
        al = f.hi;
        at = f.lo;
    } else {
        let mut df = 0.;
        if i < 128 {
            // Low word of the tabulated atan value (i == 128 has none).
            df = f64::copysign(1.0, x) * f64::from_bits(ATAN_REDUCE[i as usize].1);
        }
        // i * pi/256 split across three words (hi/mid/lo of the constant).
        let id = f64::copysign(i as f64, x);
        ah = f64::from_bits(0x3f8921fb54442d00) * id;
        al = f64::from_bits(0x3c88469898cc5180) * id;
        at = f64::from_bits(0xb97fc8f8cbb5bf80) * id;
        // Accumulate table low word, reduced argument, and polynomial value.
        let v0 = DoubleDouble::add(DoubleDouble::new(at, al), DoubleDouble::new(0., df));
        let v1 = DoubleDouble::add(v0, dh);
        let v2 = DoubleDouble::add(v1, f);
        al = v2.hi;
        at = v2.lo;
    }

    // Renormalize the triple word to a double-double, then divide by pi.
    let v2 = DoubleDouble::from_exact_add(ah, al);
    let v1 = DoubleDouble::from_exact_add(v2.lo, at);

    let z0 = DoubleDouble::quick_mult(DoubleDouble::new(v1.hi, v2.hi), ONE_OVER_PI_DD);
    // atanpi_end
    z0.to_f64()
}
270 | | |
#[inline(always)]
/// Generic atan(x)/pi kernel, parameterized over the FMA strategy so the same
/// code serves both the portable and the AVX/FMA builds:
/// - `fma`: fast fused multiply-add (may be an emulation on the portable path)
/// - `dd_fma`: fused multiply-add that must behave as a true FMA (no split)
/// - `dd_mul`: double-double × double-double multiplication
/// - `atanpi_small`: handler for |x| < 2^-54
///
/// Strategy: compute atan(x) with a rounding-error bound, multiply by 1/pi,
/// and compare upper/lower bounds (`ub`/`lb`); if they round differently,
/// fall back to the slow path `as_atanpi_refine2`.
fn atanpi_gen_impl<
    Q: Fn(f64, f64, f64) -> f64,
    F: Fn(f64, f64, f64) -> f64,
    DdMul: Fn(DoubleDouble, DoubleDouble) -> DoubleDouble,
    AtanpiSmall: Fn(f64) -> f64,
>(
    x: f64,
    fma: Q,
    dd_fma: F,
    dd_mul: DdMul,
    atanpi_small: AtanpiSmall,
) -> f64 {
    // Minimax coefficients (even powers) for atan on the reduced interval.
    const CH: [u64; 4] = [
        0x3ff0000000000000,
        0xbfd555555555552b,
        0x3fc9999999069c20,
        0xbfc248d2c8444ac6,
    ];
    let t = x.to_bits();
    let at: u64 = t & 0x7fff_ffff_ffff_ffff; // |x| bits
    // Coarse exponent-based index into ATAN_CIRCLE (only used in the
    // mid-range branch below, where it is guaranteed in range).
    let mut i = (at >> 51).wrapping_sub(2030u64);
    if at < 0x3f7b21c475e6362au64 {
        // |x| < 0.006624
        if at < 0x3c90000000000000u64 {
            // |x| < 2^-54
            return atanpi_small(x);
        }
        if x == 0. {
            return x;
        }
        // Taylor-like coefficients for atan(x) - x on small |x|.
        const CH2: [u64; 4] = [
            0xbfd5555555555555,
            0x3fc99999999998c1,
            0xbfc249249176aec0,
            0x3fbc711fd121ae80,
        ];
        let x2 = x * x;
        let x3 = x * x2;
        let x4 = x2 * x2;

        let f0 = fma(x2, f64::from_bits(CH2[3]), f64::from_bits(CH2[2]));
        let f1 = fma(x2, f64::from_bits(CH2[1]), f64::from_bits(CH2[0]));

        let f = x3 * dd_fma(x4, f0, f1);
        // begin_atanpi
        /* Here x+f approximates atan(x), with absolute error bounded by
        0x4.8p-52*f (see atan.c). After multiplying by 1/pi this error
        will be bounded by 0x1.6fp-52*f. For |x| < 0x1.b21c475e6362ap-8
        we have |f| < 2^-16*|x|, thus the error is bounded by
        0x1.6fp-52*2^-16*|x| < 0x1.6fp-68. */
        // multiply x + f by 1/pi
        let hy = dd_mul(DoubleDouble::new(f, x), ONE_OVER_PI_DD);
        /* The rounding error in muldd and the approximation error between
        1/pi and ONE_OVER_PIH + ONE_OVER_PIL are covered by the difference
        between 0x4.8p-52*pi and 0x1.6fp-52, which is > 2^-61.8. */
        // Rounding test: if adding ±error*|x| rounds to the same double,
        // that double is the correctly rounded result.
        let mut ub = hy.hi + dd_fma(f64::from_bits(0x3bb6f00000000000), x, hy.lo);
        let lb = hy.hi + dd_fma(f64::from_bits(0xbbb6f00000000000), x, hy.lo);
        if ub == lb {
            return ub;
        }
        // end_atanpi
        ub = (f + f * f64::from_bits(0x3cd2000000000000)) + x; // atanpi_specific, original value in atan
        return as_atanpi_refine2(x, ub);
    }
    // now |x| >= 0x1.b21c475e6362ap-8
    let h; // reduced argument
    let mut a: DoubleDouble; // accumulated atan of the reduction point
    if at > 0x4062ded8e34a9035u64 {
        // |x| > 0x1.2ded8e34a9035p+7, atanpi|x| > 0.49789
        if at >= 0x43445f306dc9c883u64 {
            // |x| >= 0x1.45f306dc9c883p+53, atanpi|x| > 0.5 - 0x1p-55
            if at >= (0x7ffu64 << 52) {
                // case Inf or NaN
                if at == 0x7ffu64 << 52 {
                    // Inf
                    return f64::copysign(0.5, x);
                } // atanpi_specific
                return x + x; // NaN (propagates, raises invalid on sNaN)
            }
            // Result is ±(0.5 - one ulp step): ±0.5 rounded toward zero.
            return f64::copysign(0.5, x) - f64::copysign(f64::from_bits(0x3c70000000000000), x);
        }
        // Reduce via atan(x) = ±pi/2 - atan(1/x): h = -1/x, a = ±pi/2 (dd).
        h = -1.0 / x;
        a = DoubleDouble::new(
            f64::copysign(f64::from_bits(0x3c91a62633145c07), x),
            f64::copysign(f64::from_bits(0x3ff921fb54442d18), x),
        );
    } else {
        // 0x1.b21c475e6362ap-8 <= |x| <= 0x1.2ded8e34a9035p+7
        /* we need to deal with |x| = 1 separately since in this case
        h=0 below, and the error is measured in terms of multiple of h */
        if at == 0x3ff0000000000000 {
            // |x| = 1: atanpi(±1) = ±1/4 exactly.
            return f64::copysign(f64::from_bits(0x3fd0000000000000), x);
        }
        // Quadratic interpolation in ATAN_CIRCLE refines the coarse index i
        // into the final ATAN_REDUCE table index.
        let u: u64 = t & 0x0007ffffffffffff;
        let ut = u >> (51 - 16);
        let ut2 = (ut * ut) >> 16;
        let vc = ATAN_CIRCLE[i as usize];
        i = (((vc[0] as u64).wrapping_shl(16)) + ut * (vc[1] as u64) - ut2 * (vc[2] as u64))
            >> (16 + 9);
        let va = ATAN_REDUCE[i as usize];
        let ta = f64::copysign(1.0, x) * f64::from_bits(va.0);
        let id = f64::copysign(1.0, x) * i as f64;
        // Reduced argument: tan(atan(x) - atan(ta)) = (x - ta)/(1 + x*ta).
        h = (x - ta) / fma(x, ta, 1.);
        // a = i*pi/256 (split constant) + tabulated low word of atan(ta).
        a = DoubleDouble::new(
            fma(
                f64::copysign(1.0, x),
                f64::from_bits(va.1),
                f64::from_bits(0x3c88469898cc5170) * id,
            ),
            f64::from_bits(0x3f8921fb54442d00) * id,
        );
    }
    let h2 = h * h;
    let h4 = h2 * h2;

    // Estrin-style evaluation of the even polynomial in h.
    let f0 = fma(h2, f64::from_bits(CH[3]), f64::from_bits(CH[2]));
    let f1 = fma(h2, f64::from_bits(CH[1]), f64::from_bits(CH[0]));

    let f = fma(h4, f0, f1);
    a.lo = dd_fma(h, f, a.lo);
    // begin_atanpi
    /* Now ah + al approximates atan(x) with error bounded by 0x3.fp-52*h
    (see atan.c), thus by 0x1.41p-52*h after multiplication by 1/pi.
    We normalize ah+al so that the rounding error in muldd is negligible
    below. */
    let e0 = h * f64::from_bits(0x3ccf800000000000); // original value in atan.c
    let ub0 = (a.lo + e0) + a.hi; // original value in atan.c
    a = DoubleDouble::from_exact_add(a.hi, a.lo);
    a = dd_mul(a, ONE_OVER_PI_DD);
    /* The rounding error in muldd() and the approximation error between 1/pi
    and ONE_OVER_PIH+ONE_OVER_PIL are absorbed when rounding up 0x3.fp-52*pi
    to 0x1.41p-52. */
    let e = h * f64::from_bits(0x3cb4100000000000); // atanpi_specific
    // end_atanpi
    // Rounding test; disagreement sends us to the slow refinement path.
    let ub = (a.lo + e) + a.hi;
    let lb = (a.lo - e) + a.hi;
    if ub == lb {
        return ub;
    }
    as_atanpi_refine2(x, ub0)
}
416 | | |
417 | | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
418 | | #[target_feature(enable = "avx", enable = "fma")] |
419 | 0 | unsafe fn atanpi_fma_impl(x: f64) -> f64 { |
420 | 0 | atanpi_gen_impl( |
421 | 0 | x, |
422 | | f64::mul_add, |
423 | | f64::mul_add, |
424 | | DoubleDouble::quick_mult_fma, |
425 | 0 | |x| unsafe { atanpi_small_fma(x) }, |
426 | | ) |
427 | 0 | } |
428 | | |
/// Computes atan(x)/pi
///
/// Max ULP 0.5
///
/// On x86/x86_64 the implementation is chosen once at first call (cached in a
/// `OnceLock`): the AVX+FMA specialization when the CPU supports it, otherwise
/// the portable software-FMA path. Other architectures always use the
/// portable path.
pub fn f_atanpi(x: f64) -> f64 {
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        atanpi_gen_impl(x, f_fmla, dd_fmla, DoubleDouble::quick_mult, atanpi_small)
    }
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;
        // Cached function pointer; feature detection runs only once.
        static EXECUTOR: OnceLock<unsafe fn(f64) -> f64> = OnceLock::new();
        let q = EXECUTOR.get_or_init(|| {
            if std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma")
            {
                atanpi_fma_impl
            } else {
                // Safe fn coerces to the `unsafe fn` pointer type stored above.
                fn def_atanpi(x: f64) -> f64 {
                    atanpi_gen_impl(x, f_fmla, dd_fmla, DoubleDouble::quick_mult, atanpi_small)
                }
                def_atanpi
            }
        });
        // SAFETY: `atanpi_fma_impl` is only stored after a successful runtime
        // check for the AVX and FMA features it requires.
        unsafe { q(x) }
    }
}
456 | | |
#[cfg(test)]
mod tests {

    use super::*;

    // Spot checks covering the distinct code paths: a huge argument (near
    // ±1/2 asymptote), a tiny/subnormal-range argument, small |x| of both
    // signs, and mid-range values.
    #[test]
    fn atanpi_test() {
        assert_eq!(f_atanpi(119895888825197.97), 0.49999999999999734);
        assert_eq!(f_atanpi(1.5610674235815122E-307), 4.969031939254545e-308);
        assert_eq!(f_atanpi(0.00004577636903266291), 0.000014571070806516354);
        assert_eq!(f_atanpi(-0.00004577636903266291), -0.000014571070806516354);
        assert_eq!(f_atanpi(-0.4577636903266291), -0.13664770469904508);
        assert_eq!(f_atanpi(0.9876636903266291), 0.24802445507819193);
    }
}