/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.27/src/powf.rs
Line | Count | Source |
1 | | /* |
2 | | * // Copyright (c) Radzivon Bartoshyk 4/2025. All rights reserved. |
3 | | * // |
4 | | * // Redistribution and use in source and binary forms, with or without modification, |
5 | | * // are permitted provided that the following conditions are met: |
6 | | * // |
7 | | * // 1. Redistributions of source code must retain the above copyright notice, this |
8 | | * // list of conditions and the following disclaimer. |
9 | | * // |
10 | | * // 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | * // this list of conditions and the following disclaimer in the documentation |
12 | | * // and/or other materials provided with the distribution. |
13 | | * // |
14 | | * // 3. Neither the name of the copyright holder nor the names of its |
15 | | * // contributors may be used to endorse or promote products derived from |
16 | | * // this software without specific prior written permission. |
17 | | * // |
18 | | * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
19 | | * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
22 | | * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | | * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
25 | | * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
26 | | * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | #![allow(clippy::too_many_arguments)] |
30 | | use crate::bits::biased_exponent_f64; |
31 | | use crate::common::*; |
32 | | use crate::double_double::DoubleDouble; |
33 | | use crate::exponents::expf; |
34 | | use crate::logf; |
35 | | use crate::logs::LOG2_R; |
36 | | use crate::pow_tables::EXP2_MID1; |
37 | | use crate::powf_tables::{LOG2_R_TD, LOG2_R2_DD, POWF_R2}; |
38 | | use crate::rounding::CpuRound; |
39 | | |
40 | | /// Power function for given value for const context. |
41 | | /// This is simplified version just to make a good approximation on const context. |
42 | 0 | pub const fn powf(d: f32, n: f32) -> f32 { |
43 | 0 | let value = d.abs(); |
44 | 0 | let c = expf(n * logf(value)); |
45 | 0 | if n == 1. { |
46 | 0 | return d; |
47 | 0 | } |
48 | 0 | if d < 0.0 { |
49 | 0 | let y = n as i32; |
50 | 0 | if y % 2 == 0 { c } else { -c } |
51 | | } else { |
52 | 0 | c |
53 | | } |
54 | 0 | } |
55 | | |
/// Arithmetic primitives used by the `powf` algorithm.
///
/// Two implementations exist in this file: `GenPowfBackend` (portable
/// software fallbacks) and, on x86/x86_64, `FmaPowfBackend` (hardware FMA).
/// Keeping the primitives behind a trait lets the same algorithm body be
/// monomorphised once per backend.
pub(crate) trait PowfBackend {
    /// Single-precision multiply-add `x * y + z` (fused or emulated).
    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32;
    /// Double-precision multiply-add `x * y + z` (fused or emulated).
    fn fma(&self, x: f64, y: f64, z: f64) -> f64;
    /// Degree-2 polynomial in `x` with coefficients `a0..a2`.
    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64;
    /// `true` when `x` is a whole number.
    fn integerf(&self, x: f32) -> bool;
    /// `true` when `x` is an odd whole number.
    fn odd_integerf(&self, x: f32) -> bool;
    /// Rounds `x` to a nearby integer (exact tie behavior is
    /// backend-specific; both paths feed values where ties do not matter).
    fn round(&self, x: f64) -> f64;
    /// Double-double * double-double product.
    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble;
    /// Double-double * double product.
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble;
    /// Degree-5 polynomial in double-double arithmetic.
    fn dd_polyeval6(
        &self,
        x: DoubleDouble,
        a0: DoubleDouble,
        a1: DoubleDouble,
        a2: DoubleDouble,
        a3: DoubleDouble,
        a4: DoubleDouble,
        a5: DoubleDouble,
    ) -> DoubleDouble;
    /// Degree-9 polynomial in double-double arithmetic.
    fn dd_polyeval10(
        &self,
        x: DoubleDouble,
        a0: DoubleDouble,
        a1: DoubleDouble,
        a2: DoubleDouble,
        a3: DoubleDouble,
        a4: DoubleDouble,
        a5: DoubleDouble,
        a6: DoubleDouble,
        a7: DoubleDouble,
        a8: DoubleDouble,
        a9: DoubleDouble,
    ) -> DoubleDouble;
    /// Whether a hardware FMA is available; selects the f32 vs f64
    /// range-reduction path in `powf_gen`.
    const HAS_FMA: bool;
    /// Fast-path error bound (in units of the f64 result's last bits) used
    /// by the Ziv rounding test before falling back to `powf_dd`.
    const ERR: u64;
}
92 | | |
/// Portable backend: software FMA emulation and generic polynomial
/// evaluators; works on any target.
pub(crate) struct GenPowfBackend {}

impl PowfBackend for GenPowfBackend {
    // Software (or compiler-selected) f32 multiply-add.
    #[inline(always)]
    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
        f_fmlaf(x, y, z)
    }

    // Software (or compiler-selected) f64 multiply-add.
    #[inline(always)]
    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
        f_fmla(x, y, z)
    }

    #[inline(always)]
    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
        use crate::polyeval::f_polyeval3;
        f_polyeval3(x, a0, a1, a2)
    }

    #[inline(always)]
    fn integerf(&self, x: f32) -> bool {
        is_integerf(x)
    }

    #[inline(always)]
    fn odd_integerf(&self, x: f32) -> bool {
        is_odd_integerf(x)
    }

    // Rounding via the crate's CpuRound helper.
    #[inline(always)]
    fn round(&self, x: f64) -> f64 {
        x.cpu_round()
    }

    #[inline(always)]
    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble {
        DoubleDouble::quick_mult(x, y)
    }

    #[inline(always)]
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
        DoubleDouble::quick_mult_f64(x, y)
    }

    #[inline(always)]
    fn dd_polyeval6(
        &self,
        x: DoubleDouble,
        a0: DoubleDouble,
        a1: DoubleDouble,
        a2: DoubleDouble,
        a3: DoubleDouble,
        a4: DoubleDouble,
        a5: DoubleDouble,
    ) -> DoubleDouble {
        use crate::polyeval::dd_quick_polyeval6;
        dd_quick_polyeval6(x, a0, a1, a2, a3, a4, a5)
    }

    #[inline(always)]
    fn dd_polyeval10(
        &self,
        x: DoubleDouble,
        a0: DoubleDouble,
        a1: DoubleDouble,
        a2: DoubleDouble,
        a3: DoubleDouble,
        a4: DoubleDouble,
        a5: DoubleDouble,
        a6: DoubleDouble,
        a7: DoubleDouble,
        a8: DoubleDouble,
        a9: DoubleDouble,
    ) -> DoubleDouble {
        use crate::polyeval::dd_quick_polyeval10;
        dd_quick_polyeval10(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)
    }

    // HAS_FMA / ERR are resolved at compile time: targets with a guaranteed
    // hardware FMA (x86 built with the `fma` feature, or aarch64) get the
    // tighter fast-path error bound (64); everything else doubles it (128)
    // to account for the emulated multiply-add's extra rounding error.
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    ))]
    const HAS_FMA: bool = true;
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    )))]
    const HAS_FMA: bool = false;
    #[cfg(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    ))]
    const ERR: u64 = 64;
    #[cfg(not(any(
        all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "fma"
        ),
        target_arch = "aarch64"
    )))]
    const ERR: u64 = 128;
}
204 | | |
/// x86 backend selected at runtime (see `f_powf`) when AVX + FMA are
/// detected; every primitive maps to a hardware-FMA variant.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub(crate) struct FmaPowfBackend {}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl PowfBackend for FmaPowfBackend {
    // Hardware fused multiply-add (single rounding).
    #[inline(always)]
    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
        f32::mul_add(x, y, z)
    }

    #[inline(always)]
    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
        f64::mul_add(x, y, z)
    }

    #[inline(always)]
    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
        use crate::polyeval::d_polyeval3;
        d_polyeval3(x, a0, a1, a2)
    }

    // Integer test via round-trip through round_ties_even.
    #[inline(always)]
    fn integerf(&self, x: f32) -> bool {
        x.round_ties_even() == x
    }

    #[inline(always)]
    fn odd_integerf(&self, x: f32) -> bool {
        use crate::common::is_odd_integerf_fast;
        is_odd_integerf_fast(x)
    }

    // std's round (ties away from zero).
    #[inline(always)]
    fn round(&self, x: f64) -> f64 {
        x.round()
    }

    #[inline(always)]
    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble {
        DoubleDouble::quick_mult_fma(x, y)
    }

    #[inline(always)]
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
        DoubleDouble::quick_mult_f64_fma(x, y)
    }

    #[inline(always)]
    fn dd_polyeval6(
        &self,
        x: DoubleDouble,
        a0: DoubleDouble,
        a1: DoubleDouble,
        a2: DoubleDouble,
        a3: DoubleDouble,
        a4: DoubleDouble,
        a5: DoubleDouble,
    ) -> DoubleDouble {
        use crate::polyeval::dd_quick_polyeval6_fma;
        dd_quick_polyeval6_fma(x, a0, a1, a2, a3, a4, a5)
    }

    #[inline(always)]
    fn dd_polyeval10(
        &self,
        x: DoubleDouble,
        a0: DoubleDouble,
        a1: DoubleDouble,
        a2: DoubleDouble,
        a3: DoubleDouble,
        a4: DoubleDouble,
        a5: DoubleDouble,
        a6: DoubleDouble,
        a7: DoubleDouble,
        a8: DoubleDouble,
        a9: DoubleDouble,
    ) -> DoubleDouble {
        use crate::polyeval::dd_quick_polyeval10_fma;
        dd_quick_polyeval10_fma(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)
    }

    // Hardware FMA is guaranteed here, so the tight error bound applies.
    const HAS_FMA: bool = true;

    const ERR: u64 = 64;
}
290 | | |
291 | | #[inline] |
292 | 0 | const fn larger_exponent(a: f64, b: f64) -> bool { |
293 | 0 | biased_exponent_f64(a) >= biased_exponent_f64(b) |
294 | 0 | } |
295 | | |
// Slow path of `powf_gen`, taken only when the fast path's rounding test
// was inconclusive.
//
// Calculate 2^(y * log2(x)) in double-double precision.
// At this point we can reuse the following values:
// idx_x: index for extra precision of log2 for the middle part of log2(x).
// dx: the reduced argument for log2(x)
// y6: 2^6 * y.
// lo6_hi: the high part of 2^6 * (y - (hi + mid))
// exp2_hi_mid: high part of 2^(hi + mid)
#[cold]
#[inline(always)]
fn powf_dd<B: PowfBackend>(
    idx_x: i32,
    dx: f64,
    y6: f64,
    lo6_hi: f64,
    exp2_hi_mid: DoubleDouble,
    backend: &B,
) -> f64 {
    // Perform a second range reduction step:
    // idx2 = round(2^14 * (dx + 2^-8)) = round ( dx * 2^14 + 2^6)
    // dx2 = (1 + dx) * r2 - 1
    // Output range:
    // -0x1.3ffcp-15 <= dx2 <= 0x1.3e3dp-15
    let idx2 = backend.round(backend.fma(
        dx,
        f64::from_bits(0x40d0000000000000), // 2^14
        f64::from_bits(0x4050000000000000), // 2^6
    )) as usize;
    let dx2 = backend.fma(1.0 + dx, f64::from_bits(POWF_R2[idx2]), -1.0); // Exact

    // Degree-5 polynomial coefficients for log2(1 + t)/t on the reduced
    // range, stored as (lo, hi) double-double bit pairs.
    const COEFFS: [(u64, u64); 6] = [
        (0x3c7777d0ffda25e0, 0x3ff71547652b82fe),
        (0xbc6777d101cf0a84, 0xbfe71547652b82fe),
        (0x3c7ce04b5140d867, 0x3fdec709dc3a03fd),
        (0x3c7137b47e635be5, 0xbfd71547652b82fb),
        (0xbc5b5a30b3bdb318, 0x3fd2776c516a92a2),
        (0x3c62d2fbd081e657, 0xbfcec70af1929ca6),
    ];
    let dx_dd = DoubleDouble::new(0.0, dx2);
    let p = backend.dd_polyeval6(
        dx_dd,
        DoubleDouble::from_bit_pair(COEFFS[0]),
        DoubleDouble::from_bit_pair(COEFFS[1]),
        DoubleDouble::from_bit_pair(COEFFS[2]),
        DoubleDouble::from_bit_pair(COEFFS[3]),
        DoubleDouble::from_bit_pair(COEFFS[4]),
        DoubleDouble::from_bit_pair(COEFFS[5]),
    );
    // log2(1 + dx2) ~ dx2 * P(dx2)
    let log2_x_lo = backend.quick_mult_f64(p, dx2);
    // Lower parts of (e_x - log2(r1)) of the first range reduction constant
    let log2_r_td = LOG2_R_TD[idx_x as usize];
    let log2_x_mid = DoubleDouble::new(f64::from_bits(log2_r_td.0), f64::from_bits(log2_r_td.1));
    // -log2(r2) + lower part of (e_x - log2(r1))
    let log2_x_m = DoubleDouble::add(DoubleDouble::from_bit_pair(LOG2_R2_DD[idx2]), log2_x_mid);
    // log2(1 + dx2) - log2(r2) + lower part of (e_x - log2(r1))
    // Since we don't know which one has larger exponent to apply Fast2Sum
    // algorithm, we need to check them before calling double-double addition.
    let log2_x = if larger_exponent(log2_x_m.hi, log2_x_lo.hi) {
        DoubleDouble::add(log2_x_m, log2_x_lo)
    } else {
        DoubleDouble::add(log2_x_lo, log2_x_m)
    };
    let lo6_hi_dd = DoubleDouble::new(0.0, lo6_hi);
    // 2^6 * y * (log2(1 + dx2) - log2(r2) + lower part of (e_x - log2(r1)))
    let prod = backend.quick_mult_f64(log2_x, y6);
    // 2^6 * (y * log2(x) - (hi + mid)) = 2^6 * lo
    let lo6 = if larger_exponent(prod.hi, lo6_hi) {
        DoubleDouble::add(prod, lo6_hi_dd)
    } else {
        DoubleDouble::add(lo6_hi_dd, prod)
    };

    // Degree-9 polynomial coefficients for 2^(t/64) as (lo, hi) bit pairs.
    const EXP2_COEFFS: [(u64, u64); 10] = [
        (0x0000000000000000, 0x3ff0000000000000),
        (0x3c1abc9e3b398024, 0x3f862e42fefa39ef),
        (0xbba5e43a5429bddb, 0x3f0ebfbdff82c58f),
        (0xbb2d33162491268f, 0x3e8c6b08d704a0c0),
        (0x3a94fb32d240a14e, 0x3e03b2ab6fba4e77),
        (0x39ee84e916be83e0, 0x3d75d87fe78a6731),
        (0xb989a447bfddc5e6, 0x3ce430912f86bfb8),
        (0xb8e31a55719de47f, 0x3c4ffcbfc588ded9),
        (0xb850ba57164eb36b, 0x3bb62c034beb8339),
        (0xb7b8483eabd9642d, 0x3b1b5251ff97bee1),
    ];

    // pp ~ 2^(lo6 / 2^6) = 2^lo in double-double precision.
    let pp = backend.dd_polyeval10(
        lo6,
        DoubleDouble::from_bit_pair(EXP2_COEFFS[0]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[1]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[2]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[3]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[4]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[5]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[6]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[7]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[8]),
        DoubleDouble::from_bit_pair(EXP2_COEFFS[9]),
    );
    // 2^(hi + mid) * 2^lo.
    let rr = backend.quick_mult(exp2_hi_mid, pp);

    // Make sure the sum is normalized:
    let r = DoubleDouble::from_exact_add(rr.hi, rr.lo);

    const FRACTION_MASK: u64 = (1u64 << 52) - 1;

    // If the 28 low mantissa bits of the high word are all zero, the result
    // sits near a rounding boundary for f32; nudge it by one ulp toward the
    // low word so the final f64 -> f32 conversion rounds correctly.
    let mut r_bits = r.hi.to_bits();
    if ((r_bits & 0xfff_ffff) == 0) && (r.lo != 0.0) {
        let hi_sign = r.hi.to_bits() >> 63;
        let lo_sign = r.lo.to_bits() >> 63;
        if hi_sign == lo_sign {
            // Same sign: the true value is slightly larger in magnitude.
            r_bits = r_bits.wrapping_add(1);
        } else if (r_bits & FRACTION_MASK) > 0 {
            // Opposite sign: slightly smaller (skip when decrementing would
            // cross a binade boundary).
            r_bits = r_bits.wrapping_sub(1);
        }
    }

    f64::from_bits(r_bits)
}
414 | | |
415 | | #[inline(always)] |
416 | 0 | fn powf_gen<B: PowfBackend>(x: f32, y: f32, backend: B) -> f32 { |
417 | 0 | let mut x_u = x.to_bits(); |
418 | 0 | let x_abs = x_u & 0x7fff_ffff; |
419 | 0 | let mut y = y; |
420 | 0 | let y_u = y.to_bits(); |
421 | 0 | let y_abs = y_u & 0x7fff_ffff; |
422 | 0 | let mut x = x; |
423 | | |
424 | 0 | if ((y_abs & 0x0007_ffff) == 0) || (y_abs > 0x4f170000) { |
425 | | // y is signaling NaN |
426 | 0 | if x.is_nan() || y.is_nan() { |
427 | 0 | if y.abs() == 0. { |
428 | 0 | return 1.; |
429 | 0 | } |
430 | 0 | if x == 1. { |
431 | 0 | return 1.; |
432 | 0 | } |
433 | 0 | return f32::NAN; |
434 | 0 | } |
435 | | |
436 | | // Exceptional exponents. |
437 | 0 | if y == 0.0 { |
438 | 0 | return 1.0; |
439 | 0 | } |
440 | | |
441 | 0 | match y_abs { |
442 | | 0x7f80_0000 => { |
443 | 0 | if x_abs > 0x7f80_0000 { |
444 | | // pow(NaN, +-Inf) = NaN |
445 | 0 | return x; |
446 | 0 | } |
447 | 0 | if x_abs == 0x3f80_0000 { |
448 | | // pow(+-1, +-Inf) = 1.0f |
449 | 0 | return 1.0; |
450 | 0 | } |
451 | 0 | if x == 0.0 && y_u == 0xff80_0000 { |
452 | | // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO |
453 | 0 | return f32::INFINITY; |
454 | 0 | } |
455 | | // pow (|x| < 1, -inf) = +inf |
456 | | // pow (|x| < 1, +inf) = 0.0f |
457 | | // pow (|x| > 1, -inf) = 0.0f |
458 | | // pow (|x| > 1, +inf) = +inf |
459 | 0 | return if (x_abs < 0x3f80_0000) == (y_u == 0xff80_0000) { |
460 | 0 | f32::INFINITY |
461 | | } else { |
462 | 0 | 0. |
463 | | }; |
464 | | } |
465 | | _ => { |
466 | 0 | match y_u { |
467 | | 0x3f00_0000 => { |
468 | | // pow(x, 1/2) = sqrt(x) |
469 | 0 | if x == 0.0 || x_u == 0xff80_0000 { |
470 | | // pow(-0, 1/2) = +0 |
471 | | // pow(-inf, 1/2) = +inf |
472 | | // Make sure it is correct for FTZ/DAZ. |
473 | 0 | return x * x; |
474 | 0 | } |
475 | 0 | let r = x.sqrt(); |
476 | 0 | return if r.to_bits() != 0x8000_0000 { r } else { 0.0 }; |
477 | | } |
478 | | 0x3f80_0000 => { |
479 | 0 | return x; |
480 | | } // y = 1.0f |
481 | 0 | 0x4000_0000 => return x * x, // y = 2.0f |
482 | | _ => { |
483 | 0 | let is_int = backend.integerf(y); |
484 | 0 | if is_int && (y_u > 0x4000_0000) && (y_u <= 0x41c0_0000) { |
485 | | // Check for exact cases when 2 < y < 25 and y is an integer. |
486 | 0 | let mut msb: i32 = if x_abs == 0 { |
487 | 0 | 32 - 2 |
488 | | } else { |
489 | 0 | x_abs.leading_zeros() as i32 |
490 | | }; |
491 | 0 | msb = if msb > 8 { msb } else { 8 }; |
492 | 0 | let mut lsb: i32 = if x_abs == 0 { |
493 | 0 | 0 |
494 | | } else { |
495 | 0 | x_abs.trailing_zeros() as i32 |
496 | | }; |
497 | 0 | lsb = if lsb > 23 { 23 } else { lsb }; |
498 | 0 | let extra_bits: i32 = 32 - 2 - lsb - msb; |
499 | 0 | let iter = y as i32; |
500 | | |
501 | 0 | if extra_bits * iter <= 23 + 2 { |
502 | | // The result is either exact or exactly half-way. |
503 | | // But it is exactly representable in double precision. |
504 | 0 | let x_d = x as f64; |
505 | 0 | let mut result = x_d; |
506 | 0 | for _ in 1..iter { |
507 | 0 | result *= x_d; |
508 | 0 | } |
509 | 0 | return result as f32; |
510 | 0 | } |
511 | 0 | } |
512 | | |
513 | 0 | if y_abs > 0x4f17_0000 { |
514 | | // if y is NaN |
515 | 0 | if y_abs > 0x7f80_0000 { |
516 | 0 | if x_u == 0x3f80_0000 { |
517 | | // x = 1.0f |
518 | | // pow(1, NaN) = 1 |
519 | 0 | return 1.0; |
520 | 0 | } |
521 | | // pow(x, NaN) = NaN |
522 | 0 | return y; |
523 | 0 | } |
524 | | // x^y will be overflow / underflow in single precision. Set y to a |
525 | | // large enough exponent but not too large, so that the computations |
526 | | // won't be overflow in double precision. |
527 | 0 | y = f32::from_bits((y_u & 0x8000_0000).wrapping_add(0x4f800000u32)); |
528 | 0 | } |
529 | | } |
530 | | } |
531 | | } |
532 | | } |
533 | 0 | } |
534 | | |
535 | | const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32; |
536 | 0 | let mut ex = -(E_BIAS as i32); |
537 | 0 | let mut sign: u64 = 0; |
538 | | |
539 | 0 | if ((x_u & 0x801f_ffffu32) == 0) || x_u >= 0x7f80_0000u32 || x_u < 0x0080_0000u32 { |
540 | 0 | if x.is_nan() { |
541 | 0 | return f32::NAN; |
542 | 0 | } |
543 | | |
544 | 0 | if x_u == 0x3f80_0000 { |
545 | 0 | return 1.; |
546 | 0 | } |
547 | | |
548 | 0 | let x_is_neg = x.to_bits() > 0x8000_0000; |
549 | | |
550 | 0 | if x == 0.0 { |
551 | 0 | let out_is_neg = x_is_neg && backend.odd_integerf(f32::from_bits(y_u)); |
552 | 0 | if y_u > 0x8000_0000u32 { |
553 | | // pow(0, negative number) = inf |
554 | 0 | return if x_is_neg { |
555 | 0 | f32::NEG_INFINITY |
556 | | } else { |
557 | 0 | f32::INFINITY |
558 | | }; |
559 | 0 | } |
560 | | // pow(0, positive number) = 0 |
561 | 0 | return if out_is_neg { -0.0 } else { 0.0 }; |
562 | 0 | } |
563 | | |
564 | 0 | if x_abs == 0x7f80_0000u32 { |
565 | | // x = +-Inf |
566 | 0 | let out_is_neg = x_is_neg && backend.odd_integerf(f32::from_bits(y_u)); |
567 | 0 | if y_u >= 0x7fff_ffff { |
568 | 0 | return if out_is_neg { -0.0 } else { 0.0 }; |
569 | 0 | } |
570 | 0 | return if out_is_neg { |
571 | 0 | f32::NEG_INFINITY |
572 | | } else { |
573 | 0 | f32::INFINITY |
574 | | }; |
575 | 0 | } |
576 | | |
577 | 0 | if x_abs > 0x7f80_0000 { |
578 | | // x is NaN. |
579 | | // pow (aNaN, 0) is already taken care above. |
580 | 0 | return x; |
581 | 0 | } |
582 | | |
583 | | // Normalize denormal inputs. |
584 | 0 | if x_abs < 0x0080_0000u32 { |
585 | 0 | ex = ex.wrapping_sub(64); |
586 | 0 | x *= f32::from_bits(0x5f800000); |
587 | 0 | } |
588 | | |
589 | | // x is finite and negative, and y is a finite integer. |
590 | 0 | if x.is_sign_negative() { |
591 | 0 | if backend.integerf(y) { |
592 | 0 | x = -x; |
593 | 0 | if backend.odd_integerf(y) { |
594 | 0 | sign = 0x8000_0000_0000_0000u64; |
595 | 0 | } |
596 | | } else { |
597 | | // pow( negative, non-integer ) = NaN |
598 | 0 | return f32::NAN; |
599 | | } |
600 | 0 | } |
601 | 0 | } |
602 | | |
603 | | // x^y = 2^( y * log2(x) ) |
604 | | // = 2^( y * ( e_x + log2(m_x) ) ) |
605 | | // First we compute log2(x) = e_x + log2(m_x) |
606 | 0 | x_u = x.to_bits(); |
607 | | |
608 | | // Extract exponent field of x. |
609 | 0 | ex = ex.wrapping_add((x_u >> 23) as i32); |
610 | 0 | let e_x = ex as f64; |
611 | | // Use the highest 7 fractional bits of m_x as the index for look up tables. |
612 | 0 | let x_mant = x_u & ((1u32 << 23) - 1); |
613 | 0 | let idx_x = (x_mant >> (23 - 7)) as i32; |
614 | | // Add the hidden bit to the mantissa. |
615 | | // 1 <= m_x < 2 |
616 | 0 | let m_x = f32::from_bits(x_mant | 0x3f800000); |
617 | | |
618 | | // Reduced argument for log2(m_x): |
619 | | // dx = r * m_x - 1. |
620 | | // The computation is exact, and -2^-8 <= dx < 2^-7. |
621 | | // Then m_x = (1 + dx) / r, and |
622 | | // log2(m_x) = log2( (1 + dx) / r ) |
623 | | // = log2(1 + dx) - log2(r). |
624 | | |
625 | 0 | let dx = if B::HAS_FMA { |
626 | | use crate::logs::LOG_REDUCTION_F32; |
627 | 0 | backend.fmaf( |
628 | 0 | m_x, |
629 | 0 | f32::from_bits(LOG_REDUCTION_F32.0[idx_x as usize]), |
630 | 0 | -1.0, |
631 | 0 | ) as f64 // Exact. |
632 | | } else { |
633 | | use crate::logs::LOG_RANGE_REDUCTION; |
634 | 0 | backend.fma( |
635 | 0 | m_x as f64, |
636 | 0 | f64::from_bits(LOG_RANGE_REDUCTION[idx_x as usize]), |
637 | | -1.0, |
638 | | ) // Exact |
639 | | }; |
640 | | |
641 | | // Degree-5 polynomial approximation: |
642 | | // dx * P(dx) ~ log2(1 + dx) |
643 | | // Generated by Sollya with: |
644 | | // > P = fpminimax(log2(1 + x)/x, 5, [|D...|], [-2^-8, 2^-7]); |
645 | | // > dirtyinfnorm(log2(1 + x)/x - P, [-2^-8, 2^-7]); |
646 | | // 0x1.653...p-52 |
647 | | const COEFFS: [u64; 6] = [ |
648 | | 0x3ff71547652b82fe, |
649 | | 0xbfe71547652b7a07, |
650 | | 0x3fdec709dc458db1, |
651 | | 0xbfd715479c2266c9, |
652 | | 0x3fd2776ae1ddf8f0, |
653 | | 0xbfce7b2178870157, |
654 | | ]; |
655 | | |
656 | 0 | let dx2 = dx * dx; // Exact |
657 | 0 | let c0 = backend.fma(dx, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0])); |
658 | 0 | let c1 = backend.fma(dx, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2])); |
659 | 0 | let c2 = backend.fma(dx, f64::from_bits(COEFFS[5]), f64::from_bits(COEFFS[4])); |
660 | | |
661 | 0 | let p = backend.polyeval3(dx2, c0, c1, c2); |
662 | | |
663 | | // s = e_x - log2(r) + dx * P(dx) |
664 | | // Approximation errors: |
665 | | // |log2(x) - s| < ulp(e_x) + (bounds on dx) * (error bounds of P(dx)) |
666 | | // = ulp(e_x) + 2^-7 * 2^-51 |
667 | | // < 2^8 * 2^-52 + 2^-7 * 2^-43 |
668 | | // ~ 2^-44 + 2^-50 |
669 | 0 | let s = backend.fma(dx, p, f64::from_bits(LOG2_R[idx_x as usize]) + e_x); |
670 | | |
671 | | // To compute 2^(y * log2(x)), we break the exponent into 3 parts: |
672 | | // y * log(2) = hi + mid + lo, where |
673 | | // hi is an integer |
674 | | // mid * 2^6 is an integer |
675 | | // |lo| <= 2^-7 |
676 | | // Then: |
677 | | // x^y = 2^(y * log2(x)) = 2^hi * 2^mid * 2^lo, |
678 | | // In which 2^mid is obtained from a look-up table of size 2^6 = 64 elements, |
679 | | // and 2^lo ~ 1 + lo * P(lo). |
680 | | // Thus, we have: |
681 | | // hi + mid = 2^-6 * round( 2^6 * y * log2(x) ) |
682 | | // If we restrict the output such that |hi| < 150, (hi + mid) uses (8 + 6) |
683 | | // bits, hence, if we use double precision to perform |
684 | | // round( 2^6 * y * log2(x)) |
685 | | // the lo part is bounded by 2^-7 + 2^(-(52 - 14)) = 2^-7 + 2^-38 |
686 | | |
687 | | // In the following computations: |
688 | | // y6 = 2^6 * y |
689 | | // hm = 2^6 * (hi + mid) = round(2^6 * y * log2(x)) ~ round(y6 * s) |
690 | | // lo6 = 2^6 * lo = 2^6 * (y - (hi + mid)) = y6 * log2(x) - hm. |
691 | 0 | let y6 = (y * f32::from_bits(0x42800000)) as f64; // Exact. |
692 | 0 | let hm = backend.round(s * y6); |
693 | | |
694 | | // let log2_rr = LOG2_R2_DD[idx_x as usize]; |
695 | | |
696 | | // // lo6 = 2^6 * lo. |
697 | | // let lo6_hi = f_fmla(y6, e_x + f64::from_bits(log2_rr.1), -hm); // Exact |
698 | | // // Error bounds: |
699 | | // // | (y*log2(x) - hm * 2^-6 - lo) / y| < err(dx * p) + err(LOG2_R_DD.lo) |
700 | | // // < 2^-51 + 2^-75 |
701 | | // let lo6 = f_fmla(y6, f_fmla(dx, p, f64::from_bits(log2_rr.0)), lo6_hi); |
702 | | |
703 | | // lo6 = 2^6 * lo. |
704 | 0 | let lo6_hi = backend.fma(y6, e_x + f64::from_bits(LOG2_R_TD[idx_x as usize].2), -hm); // Exact |
705 | | // Error bounds: |
706 | | // | (y*log2(x) - hm * 2^-6 - lo) / y| < err(dx * p) + err(LOG2_R_DD.lo) |
707 | | // < 2^-51 + 2^-75 |
708 | 0 | let lo6 = backend.fma( |
709 | 0 | y6, |
710 | 0 | backend.fma(dx, p, f64::from_bits(LOG2_R_TD[idx_x as usize].1)), |
711 | 0 | lo6_hi, |
712 | | ); |
713 | | |
714 | | // |2^(hi + mid) - exp2_hi_mid| <= ulp(exp2_hi_mid) / 2 |
715 | | // Clamp the exponent part into smaller range that fits double precision. |
716 | | // For those exponents that are out of range, the final conversion will round |
717 | | // them correctly to inf/max float or 0/min float accordingly. |
718 | 0 | let mut hm_i = unsafe { hm.to_int_unchecked::<i64>() }; |
719 | 0 | hm_i = if hm_i > (1i64 << 15) { |
720 | 0 | 1 << 15 |
721 | 0 | } else if hm_i < (-(1i64 << 15)) { |
722 | 0 | -(1 << 15) |
723 | | } else { |
724 | 0 | hm_i |
725 | | }; |
726 | | |
727 | 0 | let idx_y = hm_i & 0x3f; |
728 | | |
729 | | // 2^hi |
730 | 0 | let exp_hi_i = (hm_i >> 6).wrapping_shl(52); |
731 | | // 2^mid |
732 | 0 | let exp_mid_i = EXP2_MID1[idx_y as usize].1; |
733 | | // (-1)^sign * 2^hi * 2^mid |
734 | | // Error <= 2^hi * 2^-53 |
735 | 0 | let exp2_hi_mid_i = (exp_hi_i.wrapping_add(exp_mid_i as i64) as u64).wrapping_add(sign); |
736 | 0 | let exp2_hi_mid = f64::from_bits(exp2_hi_mid_i); |
737 | | |
738 | | // Degree-5 polynomial approximation P(lo6) ~ 2^(lo6 / 2^6) = 2^(lo). |
739 | | // Generated by Sollya with: |
740 | | // > P = fpminimax(2^(x/64), 5, [|1, D...|], [-2^-1, 2^-1]); |
741 | | // > dirtyinfnorm(2^(x/64) - P, [-0.5, 0.5]); |
742 | | // 0x1.a2b77e618f5c4c176fd11b7659016cde5de83cb72p-60 |
743 | | const EXP2_COEFFS: [u64; 6] = [ |
744 | | 0x3ff0000000000000, |
745 | | 0x3f862e42fefa39ef, |
746 | | 0x3f0ebfbdff82a23a, |
747 | | 0x3e8c6b08d7076268, |
748 | | 0x3e03b2ad33f8b48b, |
749 | | 0x3d75d870c4d84445, |
750 | | ]; |
751 | | |
752 | 0 | let lo6_sqr = lo6 * lo6; |
753 | 0 | let d0 = backend.fma( |
754 | 0 | lo6, |
755 | 0 | f64::from_bits(EXP2_COEFFS[1]), |
756 | 0 | f64::from_bits(EXP2_COEFFS[0]), |
757 | | ); |
758 | 0 | let d1 = backend.fma( |
759 | 0 | lo6, |
760 | 0 | f64::from_bits(EXP2_COEFFS[3]), |
761 | 0 | f64::from_bits(EXP2_COEFFS[2]), |
762 | | ); |
763 | 0 | let d2 = backend.fma( |
764 | 0 | lo6, |
765 | 0 | f64::from_bits(EXP2_COEFFS[5]), |
766 | 0 | f64::from_bits(EXP2_COEFFS[4]), |
767 | | ); |
768 | 0 | let pp = backend.polyeval3(lo6_sqr, d0, d1, d2); |
769 | | |
770 | 0 | let r = pp * exp2_hi_mid; |
771 | 0 | let r_u = r.to_bits(); |
772 | | |
773 | 0 | let r_upper = f64::from_bits(r_u + B::ERR) as f32; |
774 | 0 | let r_lower = f64::from_bits(r_u - B::ERR) as f32; |
775 | 0 | if r_upper == r_lower { |
776 | 0 | return r_upper; |
777 | 0 | } |
778 | | |
779 | | // Scale lower part of 2^(hi + mid) |
780 | 0 | let exp2_hi_mid_dd = DoubleDouble { |
781 | 0 | lo: if idx_y != 0 { |
782 | 0 | f64::from_bits((exp_hi_i as u64).wrapping_add(EXP2_MID1[idx_y as usize].0)) |
783 | | } else { |
784 | 0 | 0. |
785 | | }, |
786 | 0 | hi: exp2_hi_mid, |
787 | | }; |
788 | | |
789 | 0 | let r_dd = powf_dd(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd, &backend); |
790 | 0 | r_dd as f32 |
791 | 0 | } Unexecuted instantiation: pxfm::powf::powf_gen::<pxfm::powf::FmaPowfBackend> Unexecuted instantiation: pxfm::powf::powf_gen::<pxfm::powf::GenPowfBackend> |
792 | | |
// Thin wrapper that instantiates `powf_gen` with the hardware-FMA backend
// under the avx+fma target features.
//
// # Safety
// The caller must ensure the CPU supports AVX and FMA (checked at runtime
// in `f_powf` via `is_x86_feature_detected!`) before calling this.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn powf_fma_impl(x: f32, y: f32) -> f32 {
    powf_gen(x, y, FmaPowfBackend {})
}
798 | | |
/// Power function
///
/// Computes `x^y` for `f32` arguments.
///
/// Max found ULP 0.5
pub fn f_powf(x: f32, y: f32) -> f32 {
    // Non-x86 targets always use the portable backend.
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    {
        powf_gen(x, y, GenPowfBackend {})
    }
    // On x86/x86_64, pick the implementation once at first call based on
    // runtime CPU feature detection and cache the function pointer.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        use std::sync::OnceLock;
        static EXECUTOR: OnceLock<unsafe fn(f32, f32) -> f32> = OnceLock::new();
        let q = EXECUTOR.get_or_init(|| {
            if std::arch::is_x86_feature_detected!("avx")
                && std::arch::is_x86_feature_detected!("fma")
            {
                powf_fma_impl
            } else {
                // Safe fn coerces to the `unsafe fn` pointer type.
                fn def_powf(x: f32, y: f32) -> f32 {
                    powf_gen(x, y, GenPowfBackend {})
                }
                def_powf
            }
        });
        // SAFETY: `powf_fma_impl` is only stored after the avx+fma runtime
        // checks above succeeded; `def_powf` has no requirements.
        unsafe { q(x, y) }
    }
}
826 | | |
827 | | /// Dirty fast pow |
828 | | #[inline] |
829 | 0 | pub fn dirty_powf(d: f32, n: f32) -> f32 { |
830 | | use crate::exponents::dirty_exp2f; |
831 | | use crate::logs::dirty_log2f; |
832 | 0 | let value = d.abs(); |
833 | 0 | let lg = dirty_log2f(value); |
834 | 0 | let c = dirty_exp2f(n * lg); |
835 | 0 | if d < 0.0 { |
836 | 0 | let y = n as i32; |
837 | 0 | if y % 2 == 0 { c } else { -c } |
838 | | } else { |
839 | 0 | c |
840 | | } |
841 | 0 | } Unexecuted instantiation: pxfm::powf::dirty_powf Unexecuted instantiation: pxfm::powf::dirty_powf |
842 | | |
#[cfg(test)]
mod tests {
    use super::*;

    // Sanity checks for the simplified const-context approximation.
    #[test]
    fn powf_test() {
        assert!(
            (powf(2f32, 3f32) - 8f32).abs() < 1e-6,
            "Invalid result {}",
            powf(2f32, 3f32)
        );
        assert!(
            (powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
            "Invalid result {}",
            powf(0.5f32, 2f32)
        );
    }

    // Checks the precise implementation: simple exact cases, one
    // correctly-rounded non-trivial value, and NaN/infinity special cases.
    #[test]
    fn f_powf_test() {
        assert!(
            (f_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
            "Invalid result {}",
            f_powf(2f32, 3f32)
        );
        assert!(
            (f_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
            "Invalid result {}",
            f_powf(0.5f32, 2f32)
        );
        assert_eq!(f_powf(0.5f32, 1.5432f32), 0.34312353);
        assert_eq!(
            f_powf(f32::INFINITY, 0.00000000000000000000000000000000038518824),
            f32::INFINITY
        );
        assert_eq!(f_powf(f32::NAN, 0.0), 1.);
        assert_eq!(f_powf(1., f32::NAN), 1.);
    }

    // Loose-tolerance checks for the fast low-accuracy variant.
    #[test]
    fn dirty_powf_test() {
        assert!(
            (dirty_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
            "Invalid result {}",
            dirty_powf(2f32, 3f32)
        );
        assert!(
            (dirty_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
            "Invalid result {}",
            dirty_powf(0.5f32, 2f32)
        );
    }
}