Coverage Report

Created: 2026-02-26 07:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.27/src/powf.rs
Line
Count
Source
1
/*
2
 * // Copyright (c) Radzivon Bartoshyk 4/2025. All rights reserved.
3
 * //
4
 * // Redistribution and use in source and binary forms, with or without modification,
5
 * // are permitted provided that the following conditions are met:
6
 * //
7
 * // 1.  Redistributions of source code must retain the above copyright notice, this
8
 * // list of conditions and the following disclaimer.
9
 * //
10
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11
 * // this list of conditions and the following disclaimer in the documentation
12
 * // and/or other materials provided with the distribution.
13
 * //
14
 * // 3.  Neither the name of the copyright holder nor the names of its
15
 * // contributors may be used to endorse or promote products derived from
16
 * // this software without specific prior written permission.
17
 * //
18
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
 */
29
#![allow(clippy::too_many_arguments)]
30
use crate::bits::biased_exponent_f64;
31
use crate::common::*;
32
use crate::double_double::DoubleDouble;
33
use crate::exponents::expf;
34
use crate::logf;
35
use crate::logs::LOG2_R;
36
use crate::pow_tables::EXP2_MID1;
37
use crate::powf_tables::{LOG2_R_TD, LOG2_R2_DD, POWF_R2};
38
use crate::rounding::CpuRound;
39
40
/// Power function for given value for const context.
41
/// This is a simplified version intended only to give a good approximation in a const context.
42
0
pub const fn powf(d: f32, n: f32) -> f32 {
43
0
    let value = d.abs();
44
0
    let c = expf(n * logf(value));
45
0
    if n == 1. {
46
0
        return d;
47
0
    }
48
0
    if d < 0.0 {
49
0
        let y = n as i32;
50
0
        if y % 2 == 0 { c } else { -c }
51
    } else {
52
0
        c
53
    }
54
0
}
55
56
pub(crate) trait PowfBackend {
57
    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32;
58
    fn fma(&self, x: f64, y: f64, z: f64) -> f64;
59
    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64;
60
    fn integerf(&self, x: f32) -> bool;
61
    fn odd_integerf(&self, x: f32) -> bool;
62
    fn round(&self, x: f64) -> f64;
63
    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble;
64
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble;
65
    fn dd_polyeval6(
66
        &self,
67
        x: DoubleDouble,
68
        a0: DoubleDouble,
69
        a1: DoubleDouble,
70
        a2: DoubleDouble,
71
        a3: DoubleDouble,
72
        a4: DoubleDouble,
73
        a5: DoubleDouble,
74
    ) -> DoubleDouble;
75
    fn dd_polyeval10(
76
        &self,
77
        x: DoubleDouble,
78
        a0: DoubleDouble,
79
        a1: DoubleDouble,
80
        a2: DoubleDouble,
81
        a3: DoubleDouble,
82
        a4: DoubleDouble,
83
        a5: DoubleDouble,
84
        a6: DoubleDouble,
85
        a7: DoubleDouble,
86
        a8: DoubleDouble,
87
        a9: DoubleDouble,
88
    ) -> DoubleDouble;
89
    const HAS_FMA: bool;
90
    const ERR: u64;
91
}
92
93
pub(crate) struct GenPowfBackend {}
94
95
impl PowfBackend for GenPowfBackend {
96
    #[inline(always)]
97
0
    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
98
0
        f_fmlaf(x, y, z)
99
0
    }
100
101
    #[inline(always)]
102
0
    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
103
0
        f_fmla(x, y, z)
104
0
    }
105
106
    #[inline(always)]
107
0
    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
108
        use crate::polyeval::f_polyeval3;
109
0
        f_polyeval3(x, a0, a1, a2)
110
0
    }
111
112
    #[inline(always)]
113
0
    fn integerf(&self, x: f32) -> bool {
114
0
        is_integerf(x)
115
0
    }
116
117
    #[inline(always)]
118
0
    fn odd_integerf(&self, x: f32) -> bool {
119
0
        is_odd_integerf(x)
120
0
    }
121
122
    #[inline(always)]
123
0
    fn round(&self, x: f64) -> f64 {
124
0
        x.cpu_round()
125
0
    }
126
127
    #[inline(always)]
128
0
    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble {
129
0
        DoubleDouble::quick_mult(x, y)
130
0
    }
131
132
    #[inline(always)]
133
0
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
134
0
        DoubleDouble::quick_mult_f64(x, y)
135
0
    }
136
137
    #[inline(always)]
138
0
    fn dd_polyeval6(
139
0
        &self,
140
0
        x: DoubleDouble,
141
0
        a0: DoubleDouble,
142
0
        a1: DoubleDouble,
143
0
        a2: DoubleDouble,
144
0
        a3: DoubleDouble,
145
0
        a4: DoubleDouble,
146
0
        a5: DoubleDouble,
147
0
    ) -> DoubleDouble {
148
        use crate::polyeval::dd_quick_polyeval6;
149
0
        dd_quick_polyeval6(x, a0, a1, a2, a3, a4, a5)
150
0
    }
151
152
    #[inline(always)]
153
0
    fn dd_polyeval10(
154
0
        &self,
155
0
        x: DoubleDouble,
156
0
        a0: DoubleDouble,
157
0
        a1: DoubleDouble,
158
0
        a2: DoubleDouble,
159
0
        a3: DoubleDouble,
160
0
        a4: DoubleDouble,
161
0
        a5: DoubleDouble,
162
0
        a6: DoubleDouble,
163
0
        a7: DoubleDouble,
164
0
        a8: DoubleDouble,
165
0
        a9: DoubleDouble,
166
0
    ) -> DoubleDouble {
167
        use crate::polyeval::dd_quick_polyeval10;
168
0
        dd_quick_polyeval10(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)
169
0
    }
170
171
    #[cfg(any(
172
        all(
173
            any(target_arch = "x86", target_arch = "x86_64"),
174
            target_feature = "fma"
175
        ),
176
        target_arch = "aarch64"
177
    ))]
178
    const HAS_FMA: bool = true;
179
    #[cfg(not(any(
180
        all(
181
            any(target_arch = "x86", target_arch = "x86_64"),
182
            target_feature = "fma"
183
        ),
184
        target_arch = "aarch64"
185
    )))]
186
    const HAS_FMA: bool = false;
187
    #[cfg(any(
188
        all(
189
            any(target_arch = "x86", target_arch = "x86_64"),
190
            target_feature = "fma"
191
        ),
192
        target_arch = "aarch64"
193
    ))]
194
    const ERR: u64 = 64;
195
    #[cfg(not(any(
196
        all(
197
            any(target_arch = "x86", target_arch = "x86_64"),
198
            target_feature = "fma"
199
        ),
200
        target_arch = "aarch64"
201
    )))]
202
    const ERR: u64 = 128;
203
}
204
205
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
206
pub(crate) struct FmaPowfBackend {}
207
208
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
209
impl PowfBackend for FmaPowfBackend {
210
    #[inline(always)]
211
0
    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
212
0
        f32::mul_add(x, y, z)
213
0
    }
214
215
    #[inline(always)]
216
0
    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
217
0
        f64::mul_add(x, y, z)
218
0
    }
219
220
    #[inline(always)]
221
0
    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
222
        use crate::polyeval::d_polyeval3;
223
0
        d_polyeval3(x, a0, a1, a2)
224
0
    }
225
226
    #[inline(always)]
227
0
    fn integerf(&self, x: f32) -> bool {
228
0
        x.round_ties_even() == x
229
0
    }
230
231
    #[inline(always)]
232
0
    fn odd_integerf(&self, x: f32) -> bool {
233
        use crate::common::is_odd_integerf_fast;
234
0
        is_odd_integerf_fast(x)
235
0
    }
236
237
    #[inline(always)]
238
0
    fn round(&self, x: f64) -> f64 {
239
0
        x.round()
240
0
    }
241
242
    #[inline(always)]
243
0
    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble {
244
0
        DoubleDouble::quick_mult_fma(x, y)
245
0
    }
246
247
    #[inline(always)]
248
0
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
249
0
        DoubleDouble::quick_mult_f64_fma(x, y)
250
0
    }
251
252
    #[inline(always)]
253
0
    fn dd_polyeval6(
254
0
        &self,
255
0
        x: DoubleDouble,
256
0
        a0: DoubleDouble,
257
0
        a1: DoubleDouble,
258
0
        a2: DoubleDouble,
259
0
        a3: DoubleDouble,
260
0
        a4: DoubleDouble,
261
0
        a5: DoubleDouble,
262
0
    ) -> DoubleDouble {
263
        use crate::polyeval::dd_quick_polyeval6_fma;
264
0
        dd_quick_polyeval6_fma(x, a0, a1, a2, a3, a4, a5)
265
0
    }
266
267
    #[inline(always)]
268
0
    fn dd_polyeval10(
269
0
        &self,
270
0
        x: DoubleDouble,
271
0
        a0: DoubleDouble,
272
0
        a1: DoubleDouble,
273
0
        a2: DoubleDouble,
274
0
        a3: DoubleDouble,
275
0
        a4: DoubleDouble,
276
0
        a5: DoubleDouble,
277
0
        a6: DoubleDouble,
278
0
        a7: DoubleDouble,
279
0
        a8: DoubleDouble,
280
0
        a9: DoubleDouble,
281
0
    ) -> DoubleDouble {
282
        use crate::polyeval::dd_quick_polyeval10_fma;
283
0
        dd_quick_polyeval10_fma(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)
284
0
    }
285
286
    const HAS_FMA: bool = true;
287
288
    const ERR: u64 = 64;
289
}
290
291
#[inline]
292
0
const fn larger_exponent(a: f64, b: f64) -> bool {
293
0
    biased_exponent_f64(a) >= biased_exponent_f64(b)
294
0
}
295
296
// Calculate 2^(y * log2(x)) in double-double precision.
297
// At this point we can reuse the following values:
298
//   idx_x: index for extra precision of log2 for the middle part of log2(x).
299
//   dx: the reduced argument for log2(x)
300
//   y6: 2^6 * y.
301
//   lo6_hi: the high part of 2^6 * (y * log2(x) - (hi + mid))
302
//   exp2_hi_mid: high part of 2^(hi + mid)
303
#[cold]
304
#[inline(always)]
305
0
fn powf_dd<B: PowfBackend>(
306
0
    idx_x: i32,
307
0
    dx: f64,
308
0
    y6: f64,
309
0
    lo6_hi: f64,
310
0
    exp2_hi_mid: DoubleDouble,
311
0
    backend: &B,
312
0
) -> f64 {
313
    // Perform a second range reduction step:
314
    //   idx2 = round(2^14 * (dx  + 2^-8)) = round ( dx * 2^14 + 2^6)
315
    //   dx2 = (1 + dx) * r2 - 1
316
    // Output range:
317
    //   -0x1.3ffcp-15 <= dx2 <= 0x1.3e3dp-15
318
0
    let idx2 = backend.round(backend.fma(
319
0
        dx,
320
0
        f64::from_bits(0x40d0000000000000),
321
0
        f64::from_bits(0x4050000000000000),
322
0
    )) as usize;
323
0
    let dx2 = backend.fma(1.0 + dx, f64::from_bits(POWF_R2[idx2]), -1.0); // Exact
324
325
    const COEFFS: [(u64, u64); 6] = [
326
        (0x3c7777d0ffda25e0, 0x3ff71547652b82fe),
327
        (0xbc6777d101cf0a84, 0xbfe71547652b82fe),
328
        (0x3c7ce04b5140d867, 0x3fdec709dc3a03fd),
329
        (0x3c7137b47e635be5, 0xbfd71547652b82fb),
330
        (0xbc5b5a30b3bdb318, 0x3fd2776c516a92a2),
331
        (0x3c62d2fbd081e657, 0xbfcec70af1929ca6),
332
    ];
333
0
    let dx_dd = DoubleDouble::new(0.0, dx2);
334
0
    let p = backend.dd_polyeval6(
335
0
        dx_dd,
336
0
        DoubleDouble::from_bit_pair(COEFFS[0]),
337
0
        DoubleDouble::from_bit_pair(COEFFS[1]),
338
0
        DoubleDouble::from_bit_pair(COEFFS[2]),
339
0
        DoubleDouble::from_bit_pair(COEFFS[3]),
340
0
        DoubleDouble::from_bit_pair(COEFFS[4]),
341
0
        DoubleDouble::from_bit_pair(COEFFS[5]),
342
    );
343
    // log2(1 + dx2) ~ dx2 * P(dx2)
344
0
    let log2_x_lo = backend.quick_mult_f64(p, dx2);
345
    // Lower parts of (e_x - log2(r1)) of the first range reduction constant
346
0
    let log2_r_td = LOG2_R_TD[idx_x as usize];
347
0
    let log2_x_mid = DoubleDouble::new(f64::from_bits(log2_r_td.0), f64::from_bits(log2_r_td.1));
348
    // -log2(r2) + lower part of (e_x - log2(r1))
349
0
    let log2_x_m = DoubleDouble::add(DoubleDouble::from_bit_pair(LOG2_R2_DD[idx2]), log2_x_mid);
350
    // log2(1 + dx2) - log2(r2) + lower part of (e_x - log2(r1))
351
    // Since we don't know which one has larger exponent to apply Fast2Sum
352
    // algorithm, we need to check them before calling double-double addition.
353
0
    let log2_x = if larger_exponent(log2_x_m.hi, log2_x_lo.hi) {
354
0
        DoubleDouble::add(log2_x_m, log2_x_lo)
355
    } else {
356
0
        DoubleDouble::add(log2_x_lo, log2_x_m)
357
    };
358
0
    let lo6_hi_dd = DoubleDouble::new(0.0, lo6_hi);
359
    // 2^6 * y * (log2(1 + dx2) - log2(r2) + lower part of (e_x - log2(r1)))
360
0
    let prod = backend.quick_mult_f64(log2_x, y6);
361
    // 2^6 * (y * log2(x) - (hi + mid)) = 2^6 * lo
362
0
    let lo6 = if larger_exponent(prod.hi, lo6_hi) {
363
0
        DoubleDouble::add(prod, lo6_hi_dd)
364
    } else {
365
0
        DoubleDouble::add(lo6_hi_dd, prod)
366
    };
367
368
    const EXP2_COEFFS: [(u64, u64); 10] = [
369
        (0x0000000000000000, 0x3ff0000000000000),
370
        (0x3c1abc9e3b398024, 0x3f862e42fefa39ef),
371
        (0xbba5e43a5429bddb, 0x3f0ebfbdff82c58f),
372
        (0xbb2d33162491268f, 0x3e8c6b08d704a0c0),
373
        (0x3a94fb32d240a14e, 0x3e03b2ab6fba4e77),
374
        (0x39ee84e916be83e0, 0x3d75d87fe78a6731),
375
        (0xb989a447bfddc5e6, 0x3ce430912f86bfb8),
376
        (0xb8e31a55719de47f, 0x3c4ffcbfc588ded9),
377
        (0xb850ba57164eb36b, 0x3bb62c034beb8339),
378
        (0xb7b8483eabd9642d, 0x3b1b5251ff97bee1),
379
    ];
380
381
0
    let pp = backend.dd_polyeval10(
382
0
        lo6,
383
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[0]),
384
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[1]),
385
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[2]),
386
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[3]),
387
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[4]),
388
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[5]),
389
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[6]),
390
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[7]),
391
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[8]),
392
0
        DoubleDouble::from_bit_pair(EXP2_COEFFS[9]),
393
    );
394
0
    let rr = backend.quick_mult(exp2_hi_mid, pp);
395
396
    // Make sure the sum is normalized:
397
0
    let r = DoubleDouble::from_exact_add(rr.hi, rr.lo);
398
399
    const FRACTION_MASK: u64 = (1u64 << 52) - 1;
400
401
0
    let mut r_bits = r.hi.to_bits();
402
0
    if ((r_bits & 0xfff_ffff) == 0) && (r.lo != 0.0) {
403
0
        let hi_sign = r.hi.to_bits() >> 63;
404
0
        let lo_sign = r.lo.to_bits() >> 63;
405
0
        if hi_sign == lo_sign {
406
0
            r_bits = r_bits.wrapping_add(1);
407
0
        } else if (r_bits & FRACTION_MASK) > 0 {
408
0
            r_bits = r_bits.wrapping_sub(1);
409
0
        }
410
0
    }
411
412
0
    f64::from_bits(r_bits)
413
0
}
Unexecuted instantiation: pxfm::powf::powf_dd::<pxfm::powf::FmaPowfBackend>
Unexecuted instantiation: pxfm::powf::powf_dd::<pxfm::powf::GenPowfBackend>
414
415
#[inline(always)]
416
0
fn powf_gen<B: PowfBackend>(x: f32, y: f32, backend: B) -> f32 {
417
0
    let mut x_u = x.to_bits();
418
0
    let x_abs = x_u & 0x7fff_ffff;
419
0
    let mut y = y;
420
0
    let y_u = y.to_bits();
421
0
    let y_abs = y_u & 0x7fff_ffff;
422
0
    let mut x = x;
423
424
0
    if ((y_abs & 0x0007_ffff) == 0) || (y_abs > 0x4f170000) {
425
        // y is signaling NaN
426
0
        if x.is_nan() || y.is_nan() {
427
0
            if y.abs() == 0. {
428
0
                return 1.;
429
0
            }
430
0
            if x == 1. {
431
0
                return 1.;
432
0
            }
433
0
            return f32::NAN;
434
0
        }
435
436
        // Exceptional exponents.
437
0
        if y == 0.0 {
438
0
            return 1.0;
439
0
        }
440
441
0
        match y_abs {
442
            0x7f80_0000 => {
443
0
                if x_abs > 0x7f80_0000 {
444
                    // pow(NaN, +-Inf) = NaN
445
0
                    return x;
446
0
                }
447
0
                if x_abs == 0x3f80_0000 {
448
                    // pow(+-1, +-Inf) = 1.0f
449
0
                    return 1.0;
450
0
                }
451
0
                if x == 0.0 && y_u == 0xff80_0000 {
452
                    // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
453
0
                    return f32::INFINITY;
454
0
                }
455
                // pow (|x| < 1, -inf) = +inf
456
                // pow (|x| < 1, +inf) = 0.0f
457
                // pow (|x| > 1, -inf) = 0.0f
458
                // pow (|x| > 1, +inf) = +inf
459
0
                return if (x_abs < 0x3f80_0000) == (y_u == 0xff80_0000) {
460
0
                    f32::INFINITY
461
                } else {
462
0
                    0.
463
                };
464
            }
465
            _ => {
466
0
                match y_u {
467
                    0x3f00_0000 => {
468
                        // pow(x, 1/2) = sqrt(x)
469
0
                        if x == 0.0 || x_u == 0xff80_0000 {
470
                            // pow(-0, 1/2) = +0
471
                            // pow(-inf, 1/2) = +inf
472
                            // Make sure it is correct for FTZ/DAZ.
473
0
                            return x * x;
474
0
                        }
475
0
                        let r = x.sqrt();
476
0
                        return if r.to_bits() != 0x8000_0000 { r } else { 0.0 };
477
                    }
478
                    0x3f80_0000 => {
479
0
                        return x;
480
                    } // y = 1.0f
481
0
                    0x4000_0000 => return x * x, // y = 2.0f
482
                    _ => {
483
0
                        let is_int = backend.integerf(y);
484
0
                        if is_int && (y_u > 0x4000_0000) && (y_u <= 0x41c0_0000) {
485
                            // Check for exact cases when 2 < y < 25 and y is an integer.
486
0
                            let mut msb: i32 = if x_abs == 0 {
487
0
                                32 - 2
488
                            } else {
489
0
                                x_abs.leading_zeros() as i32
490
                            };
491
0
                            msb = if msb > 8 { msb } else { 8 };
492
0
                            let mut lsb: i32 = if x_abs == 0 {
493
0
                                0
494
                            } else {
495
0
                                x_abs.trailing_zeros() as i32
496
                            };
497
0
                            lsb = if lsb > 23 { 23 } else { lsb };
498
0
                            let extra_bits: i32 = 32 - 2 - lsb - msb;
499
0
                            let iter = y as i32;
500
501
0
                            if extra_bits * iter <= 23 + 2 {
502
                                // The result is either exact or exactly half-way.
503
                                // But it is exactly representable in double precision.
504
0
                                let x_d = x as f64;
505
0
                                let mut result = x_d;
506
0
                                for _ in 1..iter {
507
0
                                    result *= x_d;
508
0
                                }
509
0
                                return result as f32;
510
0
                            }
511
0
                        }
512
513
0
                        if y_abs > 0x4f17_0000 {
514
                            // if y is NaN
515
0
                            if y_abs > 0x7f80_0000 {
516
0
                                if x_u == 0x3f80_0000 {
517
                                    // x = 1.0f
518
                                    // pow(1, NaN) = 1
519
0
                                    return 1.0;
520
0
                                }
521
                                // pow(x, NaN) = NaN
522
0
                                return y;
523
0
                            }
524
                            // x^y will be overflow / underflow in single precision.  Set y to a
525
                            // large enough exponent but not too large, so that the computations
526
                            // won't be overflow in double precision.
527
0
                            y = f32::from_bits((y_u & 0x8000_0000).wrapping_add(0x4f800000u32));
528
0
                        }
529
                    }
530
                }
531
            }
532
        }
533
0
    }
534
535
    const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
536
0
    let mut ex = -(E_BIAS as i32);
537
0
    let mut sign: u64 = 0;
538
539
0
    if ((x_u & 0x801f_ffffu32) == 0) || x_u >= 0x7f80_0000u32 || x_u < 0x0080_0000u32 {
540
0
        if x.is_nan() {
541
0
            return f32::NAN;
542
0
        }
543
544
0
        if x_u == 0x3f80_0000 {
545
0
            return 1.;
546
0
        }
547
548
0
        let x_is_neg = x.to_bits() > 0x8000_0000;
549
550
0
        if x == 0.0 {
551
0
            let out_is_neg = x_is_neg && backend.odd_integerf(f32::from_bits(y_u));
552
0
            if y_u > 0x8000_0000u32 {
553
                // pow(0, negative number) = inf
554
0
                return if x_is_neg {
555
0
                    f32::NEG_INFINITY
556
                } else {
557
0
                    f32::INFINITY
558
                };
559
0
            }
560
            // pow(0, positive number) = 0
561
0
            return if out_is_neg { -0.0 } else { 0.0 };
562
0
        }
563
564
0
        if x_abs == 0x7f80_0000u32 {
565
            // x = +-Inf
566
0
            let out_is_neg = x_is_neg && backend.odd_integerf(f32::from_bits(y_u));
567
0
            if y_u >= 0x7fff_ffff {
568
0
                return if out_is_neg { -0.0 } else { 0.0 };
569
0
            }
570
0
            return if out_is_neg {
571
0
                f32::NEG_INFINITY
572
            } else {
573
0
                f32::INFINITY
574
            };
575
0
        }
576
577
0
        if x_abs > 0x7f80_0000 {
578
            // x is NaN.
579
            // pow(a NaN, 0) is already taken care of above.
580
0
            return x;
581
0
        }
582
583
        // Normalize denormal inputs.
584
0
        if x_abs < 0x0080_0000u32 {
585
0
            ex = ex.wrapping_sub(64);
586
0
            x *= f32::from_bits(0x5f800000);
587
0
        }
588
589
        // x is finite and negative, and y is a finite integer.
590
0
        if x.is_sign_negative() {
591
0
            if backend.integerf(y) {
592
0
                x = -x;
593
0
                if backend.odd_integerf(y) {
594
0
                    sign = 0x8000_0000_0000_0000u64;
595
0
                }
596
            } else {
597
                // pow( negative, non-integer ) = NaN
598
0
                return f32::NAN;
599
            }
600
0
        }
601
0
    }
602
603
    // x^y = 2^( y * log2(x) )
604
    //     = 2^( y * ( e_x + log2(m_x) ) )
605
    // First we compute log2(x) = e_x + log2(m_x)
606
0
    x_u = x.to_bits();
607
608
    // Extract exponent field of x.
609
0
    ex = ex.wrapping_add((x_u >> 23) as i32);
610
0
    let e_x = ex as f64;
611
    // Use the highest 7 fractional bits of m_x as the index for look up tables.
612
0
    let x_mant = x_u & ((1u32 << 23) - 1);
613
0
    let idx_x = (x_mant >> (23 - 7)) as i32;
614
    // Add the hidden bit to the mantissa.
615
    // 1 <= m_x < 2
616
0
    let m_x = f32::from_bits(x_mant | 0x3f800000);
617
618
    // Reduced argument for log2(m_x):
619
    //   dx = r * m_x - 1.
620
    // The computation is exact, and -2^-8 <= dx < 2^-7.
621
    // Then m_x = (1 + dx) / r, and
622
    //   log2(m_x) = log2( (1 + dx) / r )
623
    //             = log2(1 + dx) - log2(r).
624
625
0
    let dx = if B::HAS_FMA {
626
        use crate::logs::LOG_REDUCTION_F32;
627
0
        backend.fmaf(
628
0
            m_x,
629
0
            f32::from_bits(LOG_REDUCTION_F32.0[idx_x as usize]),
630
0
            -1.0,
631
0
        ) as f64 // Exact.
632
    } else {
633
        use crate::logs::LOG_RANGE_REDUCTION;
634
0
        backend.fma(
635
0
            m_x as f64,
636
0
            f64::from_bits(LOG_RANGE_REDUCTION[idx_x as usize]),
637
            -1.0,
638
        ) // Exact
639
    };
640
641
    // Degree-5 polynomial approximation:
642
    //   dx * P(dx) ~ log2(1 + dx)
643
    // Generated by Sollya with:
644
    // > P = fpminimax(log2(1 + x)/x, 5, [|D...|], [-2^-8, 2^-7]);
645
    // > dirtyinfnorm(log2(1 + x)/x - P, [-2^-8, 2^-7]);
646
    //   0x1.653...p-52
647
    const COEFFS: [u64; 6] = [
648
        0x3ff71547652b82fe,
649
        0xbfe71547652b7a07,
650
        0x3fdec709dc458db1,
651
        0xbfd715479c2266c9,
652
        0x3fd2776ae1ddf8f0,
653
        0xbfce7b2178870157,
654
    ];
655
656
0
    let dx2 = dx * dx; // Exact
657
0
    let c0 = backend.fma(dx, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
658
0
    let c1 = backend.fma(dx, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
659
0
    let c2 = backend.fma(dx, f64::from_bits(COEFFS[5]), f64::from_bits(COEFFS[4]));
660
661
0
    let p = backend.polyeval3(dx2, c0, c1, c2);
662
663
    // s = e_x - log2(r) + dx * P(dx)
664
    // Approximation errors:
665
    //   |log2(x) - s| < ulp(e_x) + (bounds on dx) * (error bounds of P(dx))
666
    //                 = ulp(e_x) + 2^-7 * 2^-51
667
    //                 < 2^8 * 2^-52 + 2^-7 * 2^-43
668
    //                 ~ 2^-44 + 2^-50
669
0
    let s = backend.fma(dx, p, f64::from_bits(LOG2_R[idx_x as usize]) + e_x);
670
671
    // To compute 2^(y * log2(x)), we break the exponent into 3 parts:
672
    //   y * log(2) = hi + mid + lo, where
673
    //   hi is an integer
674
    //   mid * 2^6 is an integer
675
    //   |lo| <= 2^-7
676
    // Then:
677
    //   x^y = 2^(y * log2(x)) = 2^hi * 2^mid * 2^lo,
678
    // In which 2^mid is obtained from a look-up table of size 2^6 = 64 elements,
679
    // and 2^lo ~ 1 + lo * P(lo).
680
    // Thus, we have:
681
    //   hi + mid = 2^-6 * round( 2^6 * y * log2(x) )
682
    // If we restrict the output such that |hi| < 150, (hi + mid) uses (8 + 6)
683
    // bits, hence, if we use double precision to perform
684
    //   round( 2^6 * y * log2(x))
685
    // the lo part is bounded by 2^-7 + 2^(-(52 - 14)) = 2^-7 + 2^-38
686
687
    // In the following computations:
688
    //   y6  = 2^6 * y
689
    //   hm  = 2^6 * (hi + mid) = round(2^6 * y * log2(x)) ~ round(y6 * s)
690
    //   lo6 = 2^6 * lo = 2^6 * (y * log2(x) - (hi + mid)) = y6 * log2(x) - hm.
691
0
    let y6 = (y * f32::from_bits(0x42800000)) as f64; // Exact.
692
0
    let hm = backend.round(s * y6);
693
694
    // let log2_rr = LOG2_R2_DD[idx_x as usize];
695
696
    // // lo6 = 2^6 * lo.
697
    // let lo6_hi = f_fmla(y6, e_x + f64::from_bits(log2_rr.1), -hm); // Exact
698
    // // Error bounds:
699
    // //   | (y*log2(x) - hm * 2^-6 - lo) / y| < err(dx * p) + err(LOG2_R_DD.lo)
700
    // //                                       < 2^-51 + 2^-75
701
    // let lo6 = f_fmla(y6, f_fmla(dx, p, f64::from_bits(log2_rr.0)), lo6_hi);
702
703
    // lo6 = 2^6 * lo.
704
0
    let lo6_hi = backend.fma(y6, e_x + f64::from_bits(LOG2_R_TD[idx_x as usize].2), -hm); // Exact
705
    // Error bounds:
706
    //   | (y*log2(x) - hm * 2^-6 - lo) / y| < err(dx * p) + err(LOG2_R_DD.lo)
707
    //                                       < 2^-51 + 2^-75
708
0
    let lo6 = backend.fma(
709
0
        y6,
710
0
        backend.fma(dx, p, f64::from_bits(LOG2_R_TD[idx_x as usize].1)),
711
0
        lo6_hi,
712
    );
713
714
    // |2^(hi + mid) - exp2_hi_mid| <= ulp(exp2_hi_mid) / 2
715
    // Clamp the exponent part into smaller range that fits double precision.
716
    // For those exponents that are out of range, the final conversion will round
717
    // them correctly to inf/max float or 0/min float accordingly.
718
0
    let mut hm_i = unsafe { hm.to_int_unchecked::<i64>() };
719
0
    hm_i = if hm_i > (1i64 << 15) {
720
0
        1 << 15
721
0
    } else if hm_i < (-(1i64 << 15)) {
722
0
        -(1 << 15)
723
    } else {
724
0
        hm_i
725
    };
726
727
0
    let idx_y = hm_i & 0x3f;
728
729
    // 2^hi
730
0
    let exp_hi_i = (hm_i >> 6).wrapping_shl(52);
731
    // 2^mid
732
0
    let exp_mid_i = EXP2_MID1[idx_y as usize].1;
733
    // (-1)^sign * 2^hi * 2^mid
734
    // Error <= 2^hi * 2^-53
735
0
    let exp2_hi_mid_i = (exp_hi_i.wrapping_add(exp_mid_i as i64) as u64).wrapping_add(sign);
736
0
    let exp2_hi_mid = f64::from_bits(exp2_hi_mid_i);
737
738
    // Degree-5 polynomial approximation P(lo6) ~ 2^(lo6 / 2^6) = 2^(lo).
739
    // Generated by Sollya with:
740
    // > P = fpminimax(2^(x/64), 5, [|1, D...|], [-2^-1, 2^-1]);
741
    // > dirtyinfnorm(2^(x/64) - P, [-0.5, 0.5]);
742
    // 0x1.a2b77e618f5c4c176fd11b7659016cde5de83cb72p-60
743
    const EXP2_COEFFS: [u64; 6] = [
744
        0x3ff0000000000000,
745
        0x3f862e42fefa39ef,
746
        0x3f0ebfbdff82a23a,
747
        0x3e8c6b08d7076268,
748
        0x3e03b2ad33f8b48b,
749
        0x3d75d870c4d84445,
750
    ];
751
752
0
    let lo6_sqr = lo6 * lo6;
753
0
    let d0 = backend.fma(
754
0
        lo6,
755
0
        f64::from_bits(EXP2_COEFFS[1]),
756
0
        f64::from_bits(EXP2_COEFFS[0]),
757
    );
758
0
    let d1 = backend.fma(
759
0
        lo6,
760
0
        f64::from_bits(EXP2_COEFFS[3]),
761
0
        f64::from_bits(EXP2_COEFFS[2]),
762
    );
763
0
    let d2 = backend.fma(
764
0
        lo6,
765
0
        f64::from_bits(EXP2_COEFFS[5]),
766
0
        f64::from_bits(EXP2_COEFFS[4]),
767
    );
768
0
    let pp = backend.polyeval3(lo6_sqr, d0, d1, d2);
769
770
0
    let r = pp * exp2_hi_mid;
771
0
    let r_u = r.to_bits();
772
773
0
    let r_upper = f64::from_bits(r_u + B::ERR) as f32;
774
0
    let r_lower = f64::from_bits(r_u - B::ERR) as f32;
775
0
    if r_upper == r_lower {
776
0
        return r_upper;
777
0
    }
778
779
    // Scale lower part of 2^(hi + mid)
780
0
    let exp2_hi_mid_dd = DoubleDouble {
781
0
        lo: if idx_y != 0 {
782
0
            f64::from_bits((exp_hi_i as u64).wrapping_add(EXP2_MID1[idx_y as usize].0))
783
        } else {
784
0
            0.
785
        },
786
0
        hi: exp2_hi_mid,
787
    };
788
789
0
    let r_dd = powf_dd(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd, &backend);
790
0
    r_dd as f32
791
0
}
Unexecuted instantiation: pxfm::powf::powf_gen::<pxfm::powf::FmaPowfBackend>
Unexecuted instantiation: pxfm::powf::powf_gen::<pxfm::powf::GenPowfBackend>
792
793
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
794
#[target_feature(enable = "avx", enable = "fma")]
795
0
unsafe fn powf_fma_impl(x: f32, y: f32) -> f32 {
796
0
    powf_gen(x, y, FmaPowfBackend {})
797
0
}
798
799
/// Power function
800
///
801
/// Max found ULP 0.5
802
0
pub fn f_powf(x: f32, y: f32) -> f32 {
803
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
804
    {
805
        powf_gen(x, y, GenPowfBackend {})
806
    }
807
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
808
    {
809
        use std::sync::OnceLock;
810
        static EXECUTOR: OnceLock<unsafe fn(f32, f32) -> f32> = OnceLock::new();
811
0
        let q = EXECUTOR.get_or_init(|| {
812
0
            if std::arch::is_x86_feature_detected!("avx")
813
0
                && std::arch::is_x86_feature_detected!("fma")
814
            {
815
0
                powf_fma_impl
816
            } else {
817
0
                fn def_powf(x: f32, y: f32) -> f32 {
818
0
                    powf_gen(x, y, GenPowfBackend {})
819
0
                }
820
0
                def_powf
821
            }
822
0
        });
823
0
        unsafe { q(x, y) }
824
    }
825
0
}
826
827
/// Dirty fast pow
828
#[inline]
829
0
pub fn dirty_powf(d: f32, n: f32) -> f32 {
830
    use crate::exponents::dirty_exp2f;
831
    use crate::logs::dirty_log2f;
832
0
    let value = d.abs();
833
0
    let lg = dirty_log2f(value);
834
0
    let c = dirty_exp2f(n * lg);
835
0
    if d < 0.0 {
836
0
        let y = n as i32;
837
0
        if y % 2 == 0 { c } else { -c }
838
    } else {
839
0
        c
840
    }
841
0
}
Unexecuted instantiation: pxfm::powf::dirty_powf
Unexecuted instantiation: pxfm::powf::dirty_powf
842
843
#[cfg(test)]
844
mod tests {
845
    use super::*;
846
847
    #[test]
848
    fn powf_test() {
849
        assert!(
850
            (powf(2f32, 3f32) - 8f32).abs() < 1e-6,
851
            "Invalid result {}",
852
            powf(2f32, 3f32)
853
        );
854
        assert!(
855
            (powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
856
            "Invalid result {}",
857
            powf(0.5f32, 2f32)
858
        );
859
    }
860
861
    #[test]
862
    fn f_powf_test() {
863
        assert!(
864
            (f_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
865
            "Invalid result {}",
866
            f_powf(2f32, 3f32)
867
        );
868
        assert!(
869
            (f_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
870
            "Invalid result {}",
871
            f_powf(0.5f32, 2f32)
872
        );
873
        assert_eq!(f_powf(0.5f32, 1.5432f32), 0.34312353);
874
        assert_eq!(
875
            f_powf(f32::INFINITY, 0.00000000000000000000000000000000038518824),
876
            f32::INFINITY
877
        );
878
        assert_eq!(f_powf(f32::NAN, 0.0), 1.);
879
        assert_eq!(f_powf(1., f32::NAN), 1.);
880
    }
881
882
    #[test]
883
    fn dirty_powf_test() {
884
        assert!(
885
            (dirty_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
886
            "Invalid result {}",
887
            dirty_powf(2f32, 3f32)
888
        );
889
        assert!(
890
            (dirty_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
891
            "Invalid result {}",
892
            dirty_powf(0.5f32, 2f32)
893
        );
894
    }
895
}