/rust/registry/src/index.crates.io-1949cf8c6b5b557f/pxfm-0.1.25/src/pow_exec.rs
Line | Count | Source |
1 | | /* |
2 | | * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved. |
3 | | * // |
4 | | * // Redistribution and use in source and binary forms, with or without modification, |
5 | | * // are permitted provided that the following conditions are met: |
6 | | * // |
7 | | * // 1. Redistributions of source code must retain the above copyright notice, this |
8 | | * // list of conditions and the following disclaimer. |
9 | | * // |
10 | | * // 2. Redistributions in binary form must reproduce the above copyright notice, |
11 | | * // this list of conditions and the following disclaimer in the documentation |
12 | | * // and/or other materials provided with the distribution. |
13 | | * // |
14 | | * // 3. Neither the name of the copyright holder nor the names of its |
15 | | * // contributors may be used to endorse or promote products derived from |
16 | | * // this software without specific prior written permission. |
17 | | * // |
18 | | * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
19 | | * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | | * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
21 | | * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
22 | | * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
23 | | * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
24 | | * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
25 | | * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
26 | | * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | | * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | | */ |
29 | | use crate::common::dd_fmla; |
30 | | use crate::double_double::DoubleDouble; |
31 | | use crate::dyadic_float::{DyadicFloat128, DyadicSign}; |
32 | | use crate::exponents::{EXP_REDUCE_T0, EXP_REDUCE_T1, ldexp}; |
33 | | use crate::exponents::{EXPM1_T0, EXPM1_T1}; |
34 | | use crate::polyeval::f_polyeval6; |
35 | | use crate::pow_tables::{EXP_T1_2_DYADIC, EXP_T2_2_DYADIC, POW_INVERSE, POW_LOG_INV}; |
36 | | use crate::rounding::CpuRound; |
37 | | use crate::rounding::CpuRoundTiesEven; |
38 | | |
#[inline(always)]
/// Returns a double-double approximation of `log(1+z) - z` for
/// |z| <= 0.0040283203125 (the reduced argument produced by `pow_log_1`).
/// The dominant `-z^2/2` term is carried in `hi`; the degree-3..8
/// polynomial tail (times z^3) is folded into `lo`.
pub(crate) fn log_poly_1(z: f64) -> DoubleDouble {
    /* The following is a degree-8 polynomial generated by Sollya for
    log(1+x)-x+x^2/2 over [-0.0040283203125,0.0040283203125]
    with absolute error < 2^-81.63
    and relative error < 2^-72.423 (see sollya/P_1.sollya).
    The relative error is for x - x^2/2 + P(x) with respect to log(1+x). */
    const P_1: [u64; 6] = [
        0x3fd5555555555558, // ~ 1/3  (degree-3 coefficient)
        0xbfd0000000000003, // ~ -1/4 (degree-4)
        0x3fc999999981f535, // ~ 1/5  (degree-5)
        0xbfc55555553d1eb4, // ~ -1/6 (degree-6)
        0x3fc2494526fd4a06, // ~ 1/7  (degree-7)
        0xbfc0001f0c80e8ce, // ~ -1/8 (degree-8)
    ];
    // w = z^2 split exactly into hi + lo.
    let w = DoubleDouble::from_exact_mult(z, z);
    // Estrin-style pairing on z followed by Horner steps on z^2; the exact
    // FMA sequencing below is what the Sollya error bound was derived for.
    let t = dd_fmla(f64::from_bits(P_1[5]), z, f64::from_bits(P_1[4]));
    let mut u = dd_fmla(f64::from_bits(P_1[3]), z, f64::from_bits(P_1[2]));
    let mut v = dd_fmla(f64::from_bits(P_1[1]), z, f64::from_bits(P_1[0]));
    u = dd_fmla(t, w.hi, u);
    v = dd_fmla(u, w.hi, v);
    // u = z^2 * (P0 + P1*z + ... + P5*z^5), i.e. the tail divided by z.
    u = v * w.hi;
    // new(lo, hi): hi = -z^2/2 (main term), lo = z^3*poly - z^2-rounding fixup.
    DoubleDouble::new(dd_fmla(u, z, -0.5 * w.lo), -0.5 * w.hi)
}
63 | | |
/* Given 2^-1074 <= x <= 0x1.fffffffffffffp+1023, this routine puts in h+l
   an approximation of log(x) such that |l| < 2^-23.89*|h| and

   | h + l - log(x) | <= elog * |log x|

   with elog = 2^-73.527 if x < 1/sqrt(2) or sqrt(2) < x,
   and elog = 2^-67.0544 if 1/sqrt(2) < x < sqrt(2)
   (note that x cannot equal 1/sqrt(2) nor sqrt(2)).

   The returned bool is true only when the unbiased exponent ends up 0
   (i.e. 1/sqrt(2) < x < sqrt(2)) AND the low word is large relative to the
   high word; in that case the pair has been renormalized with an exact add.
   Callers presumably use this flag to detect the harder near-1 case —
   inferred from the e == 0 test below, confirm against call sites. */
#[inline]
pub(crate) fn pow_log_1(x: f64) -> (DoubleDouble, bool) {
    /* for 181 <= i <= 362, r[i] = _INVERSE[i-181] is a 9-bit approximation of
    1/x[i], where i*2^-8 <= x[i] < (i+1)*2^-8.
    More precisely r[i] is a 9-bit value such that r[i]*y-1 is representable
    exactly on 53 bits for for any y, i*2^-8 <= y < (i+1)*2^-8.
    Moreover |r[i]*y-1| < 0.0040283203125.
    Table generated with the accompanying pow.sage file,
    with l=inverse_centered(k=8,prec=9,maxbits=53,verbose=false) */
    let x_u = x.to_bits();
    // m = 52-bit mantissa field, e = 11-bit biased exponent field.
    let mut m = x_u & 0xfffffffffffff;
    let mut e: i64 = ((x_u >> 52) & 0x7ff) as i64;

    let t;
    if e != 0 {
        // Normal number: t is the significand with exponent forced to 0x3ff
        // (so 1 <= t < 2), m gets the implicit leading bit, e is unbiased.
        t = m | (0x3ffu64 << 52);
        m = m.wrapping_add(1u64 << 52);
        e -= 0x3ff;
    } else {
        /* x is a subnormal double */
        // Normalize: shift the leading 1 of the mantissa up to bit 52.
        let k = m.leading_zeros() - 11;

        e = -0x3fei64 - k as i64;
        m = m.wrapping_shl(k);
        t = m | (0x3ffu64 << 52);
    }

    /* now |x| = 2^_e*_t = 2^(_e-52)*m with 1 <= _t < 2,
    and 2^52 <= _m < 2^53 */

    // log(x) = log(t) + E · log(2)
    let mut t = f64::from_bits(t);

    // If m > sqrt(2) we divide it by 2 so ensure 1/sqrt(2) < t < sqrt(2)
    // (0x16a09e667f3bcd is sqrt(2) scaled by 2^52).
    let c: usize = (m >= 0x16a09e667f3bcd) as usize;
    static CY: [f64; 2] = [1.0, 0.5];
    static CM: [u64; 2] = [44, 45];

    e = e.wrapping_add(c as i64);
    let be = e;
    let i = m >> CM[c]; /* i/2^8 <= t < (i+1)/2^8 */
    /* when c=1, we have 0x16a09e667f3bcd <= m < 2^53, thus 90 <= i <= 127;
    when c=0, we have 2^52 <= m < 0x16a09e667f3bcd, thus 128 <= i <= 181 */
    t *= CY[c];
    /* now 0x1.6a09e667f3bcdp-1 <= t < 0x1.6a09e667f3bcdp+0,
    and log(x) = E * log(2) + log(t) */

    // r ~ 1/t from the 9-bit inverse table; (l2, l1) is -log(r) in
    // double-double form from the companion log table.
    let r = f64::from_bits(POW_INVERSE[(i - 181) as usize]);
    let l1 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].1);
    let l2 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].0);

    // z = r*t - 1, exact by construction of the table (|z| < 0.0040283203125).
    let z = dd_fmla(r, t, -1.0);

    const LOG2_DD: DoubleDouble = DoubleDouble::new(
        f64::from_bits(0x3d2ef35793c76730),
        f64::from_bits(0x3fe62e42fefa3800),
    );

    // th + tl ~ be*log(2) + log(1/r), accumulated with FMAs.
    let th = dd_fmla(be as f64, LOG2_DD.hi, l1);
    let tl = dd_fmla(be as f64, LOG2_DD.lo, l2);

    // log(x) ~ th + tl + z + (log(1+z) - z); the last term comes from the
    // polynomial correction below.
    let mut v = DoubleDouble::f64_add(th, DoubleDouble::new(tl, z));
    let p = log_poly_1(z);
    v = DoubleDouble::f64_add(v.hi, DoubleDouble::new(v.lo + p.lo, p.hi));

    // 0x3e70000000000000 is 2^-24: when x is in the [1/sqrt(2), sqrt(2)]
    // binade and the low word is unusually large, renormalize and flag it.
    if e == 0 && v.lo.abs() > (v.hi.abs()) * f64::from_bits(0x3e70000000000000) {
        v = DoubleDouble::from_exact_add(v.hi, v.lo);
        return (v, true);
    }

    (v, false)
}
145 | | |
/* Given z such that |z| < 2^-12.905,
   this routine puts in qh+ql an approximation of exp(z) such that

   | (qh+ql) / exp(z) - 1 | < 2^-64.902632

   and |ql| <= 2^-51.999.
*/
#[inline(always)]
fn exp_poly_1(z: f64) -> DoubleDouble {
    /* The following is a degree-4 polynomial generated by Sollya for exp(x)
    over [-2^-12.905,2^-12.905]
    with absolute error < 2^-74.34 (see sollya/Q_1.sollya). */
    const Q_1: [u64; 5] = [
        0x3ff0000000000000, // 1.0
        0x3ff0000000000000, // 1.0
        0x3fe0000000000000, // 0.5
        0x3fc5555555997996, // ~ 1/6
        0x3fa5555555849d8d, // ~ 1/24
    ];
    // Horner evaluation of (exp(z) - 1)/z in plain f64; the rounding errors
    // here are attenuated because the result is multiplied by tiny z below.
    let mut q = dd_fmla(f64::from_bits(Q_1[4]), z, f64::from_bits(Q_1[3]));
    q = dd_fmla(q, z, f64::from_bits(Q_1[2]));
    let h0 = dd_fmla(q, z, f64::from_bits(Q_1[1]));

    // z * h0 computed exactly as a double-double, then 1 + (z*h0) with an
    // accurate double-double add — this yields the stated relative error.
    let v1 = DoubleDouble::from_exact_mult(z, h0);
    DoubleDouble::f64_add(f64::from_bits(Q_1[0]), v1)
}
172 | | |
173 | | /* Given z such that |z| < 2^-12.905, |
174 | | this routine puts in qh+ql an approximation of exp(z) such that |
175 | | |
176 | | | (qh+ql) / exp(z) - 1 | < 2^-64.902632 |
177 | | |
178 | | and |ql| <= 2^-51.999. |
179 | | */ |
180 | | |
181 | | // #[inline(always)] |
182 | | // fn exp_poly_dd(z: DoubleDouble) -> DoubleDouble { |
183 | | // /* The following is a degree-4 polynomial generated by Sollya for exp(x) |
184 | | // over [-2^-12.905,2^-12.905] */ |
185 | | // const Q_1: [(u64, u64); 7] = [ |
186 | | // (0x0000000000000000, 0x3ff0000000000000), |
187 | | // (0x3a20e40000000000, 0x3ff0000000000000), |
188 | | // (0x3a04820000000000, 0x3fe0000000000000), |
189 | | // (0xbc756423c5338a66, 0x3fc5555555555556), |
190 | | // (0xbc5560f74db5556c, 0x3fa5555555555556), |
191 | | // (0x3c3648eca89bc6ac, 0x3f8111111144fbee), |
192 | | // (0xbbd53d924ae90c8c, 0x3f56c16c16ffeecc), |
193 | | // ]; |
194 | | // let mut p = DoubleDouble::mult(z, DoubleDouble::from_bit_pair(Q_1[6])); |
195 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[5])); |
196 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[4])); |
197 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[3])); |
198 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[2])); |
199 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[1])); |
200 | | // p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[0])); |
201 | | // p |
202 | | // } |
203 | | |
#[inline]
/// Fast-path approximation of `s * exp(r.hi + r.lo)` as a double-double,
/// where `s` is a sign factor (the callers pass +/-1). Out-of-range inputs
/// either saturate (certain overflow/underflow) or return NaN to signal
/// "defer to the slower, more accurate second phase".
pub(crate) fn pow_exp_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    // Thresholds on r.hi: below RHO0 certain underflow, RHO0..RHO1 deferred,
    // above RHO3 certain overflow, RHO2..RHO3 deferred.
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
    the result value el. However, it is not possible to obtain a lower
    bound on |el| from the input value rh, thus this modified value of RHO1
    is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
            For s=1 we return eh = el = DBL_MAX, which yields
            res_min = res_max = +Inf for rounding up or to nearest,
            and res_min = res_max = DBL_MAX for rounding down or toward zero,
            which will yield the correct rounding.
            For s=-1 we return eh = el = -DBL_MAX, which similarly gives
            res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
            which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
            where there might be overflow or not, thus we set eh = el = NaN,
            which will set res_min = res_max = NaN, the comparison
            res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
            thus res_min=+0 or -0, res_max=+0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=1 and rounding up, we have eh=+0, el=2^-1074,
            thus res_min = res_max = 2^-1074, which is the expected result too.
            For s=-1, we have eh=el=-0 except for rounding down,
            thus res_min=-0 or +0, res_max=-0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
            thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 = 2^12/log(2): argument reduction uses a 12-bit index split.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round();

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // z = zh + zl = r - k*log(2)/2^12, |z| small enough for exp_poly_1.
    let zh = dd_fmla(LOG2H, -k, r.hi);
    let zl = dd_fmla(LOG2L, -k, r.lo);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 4096*(mk - bias) + 64*i2 + i1:
    // exp(r) = 2^(bk>>12) * T0[i2] * T1[i1] * exp(z).
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(zh + zl);
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
    potential underflow/overflow cases at the beginning of this function */

    // Build s * 2^(mk - 1023) directly from bits and scale both words.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
291 | | |
#[inline]
/// Fast double-double approximation of `exp(r.hi + r.lo)`.
/// NOTE(review): unlike `pow_exp_1`/`pow_exp_dd` there is no range filtering
/// here — the caller presumably guarantees no overflow/underflow (the stale
/// "filtered out at the beginning of this function" comment below was copied
/// from those functions); confirm against call sites.
pub(crate) fn exp_dd_fast(r: DoubleDouble) -> DoubleDouble {
    // INVLOG2 = 2^12/log(2): 12-bit-index argument reduction.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round();

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // z = r - k*log(2)/2^12, renormalized so |z.lo| < ulp(z.hi).
    let mut z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);
    z = DoubleDouble::from_exact_add(z.hi, z.lo);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 4096*(mk - bias) + 64*i2 + i1:
    // exp(r) = 2^(bk>>12) * T0[i2] * T1[i1] * exp(z.hi) * exp(z.lo).
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    // exp(hi + lo) = exp(hi) * exp(lo)
    let q_hi = exp_poly_1(z.hi);
    // Taylor series exp(x) ~ 1 + x since z.lo < ulp(z.h)
    let q_lo = DoubleDouble::from_exact_add(1., z.lo);
    let q = DoubleDouble::quick_mult(q_hi, q_lo);
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
    potential underflow/overflow cases at the beginning of this function */

    // Scale by 2^(mk - 1023), built directly from the exponent bits.
    let du = (mk as u64).wrapping_shl(52);
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
326 | | |
#[inline]
/// Variant of `pow_exp_1` taking the reduced argument through a full
/// double-double reduction (`mul_f64_add`) instead of two separate FMAs.
/// Approximates `s * exp(r.hi + r.lo)`; same range thresholds and saturate/
/// defer-to-2nd-phase behavior as `pow_exp_1`.
pub(crate) fn pow_exp_dd(r: DoubleDouble, s: f64) -> DoubleDouble {
    // Thresholds on r.hi: below RHO0 certain underflow, RHO0..RHO1 deferred,
    // above RHO3 certain overflow, RHO2..RHO3 deferred.
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
    the result value el. However, it is not possible to obtain a lower
    bound on |el| from the input value rh, thus this modified value of RHO1
    is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
            For s=1 we return eh = el = DBL_MAX, which yields
            res_min = res_max = +Inf for rounding up or to nearest,
            and res_min = res_max = DBL_MAX for rounding down or toward zero,
            which will yield the correct rounding.
            For s=-1 we return eh = el = -DBL_MAX, which similarly gives
            res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
            which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
            where there might be overflow or not, thus we set eh = el = NaN,
            which will set res_min = res_max = NaN, the comparison
            res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
            thus res_min=+0 or -0, res_max=+0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=1 and rounding up, we have eh=+0, el=2^-1074,
            thus res_min = res_max = 2^-1074, which is the expected result too.
            For s=-1, we have eh=el=-0 except for rounding down,
            thus res_min=-0 or +0, res_max=-0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
            thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 = 2^12/log(2): 12-bit-index argument reduction.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round();

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // z = r - k*log(2)/2^12 as a double-double.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 4096*(mk - bias) + 64*i2 + i1:
    // exp(r) = 2^(bk>>12) * T0[i2] * T1[i1] * exp(z).
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(z.to_f64());
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
    potential underflow/overflow cases at the beginning of this function */

    // Build s * 2^(mk - 1023) directly from bits and scale both words.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
413 | | /* |
414 | | #[inline(always)] |
415 | | pub(crate) fn expm1_poly_dd(z: DoubleDouble) -> DoubleDouble { |
416 | | /* |
417 | | Sollya: |
418 | | pretty = proc(u) { |
419 | | return ~(floor(u*1000)/1000); |
420 | | }; |
421 | | |
422 | | d = [-2^-12.905,2^-12.905]; |
423 | | f = expm1(x); |
424 | | w = 1; |
425 | | pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, 107...|], d, absolute, floating); |
426 | | err_p = -log2(dirtyinfnorm(pf*w-f, d)); |
427 | | display = decimal; |
428 | | |
429 | | for i from 1 to degree(pf) do print(coeff(pf, i)); |
430 | | |
431 | | print (pf); |
432 | | display = decimal; |
433 | | print ("absolute error:",pretty(err_p)); |
434 | | f = 1; |
435 | | w = 1/expm1(x); |
436 | | err_p = -log2(dirtyinfnorm(pf*w-f, d)); |
437 | | print ("relative error:",pretty(err_p)); |
438 | | */ |
439 | | const Q: [(u64, u64); 7] = [ |
440 | | (0x0000000000000000, 0x3ff0000000000000), |
441 | | (0x0000000000000000, 0x3fe0000000000000), |
442 | | (0xbc75555554d7c48c, 0x3fc5555555555556), |
443 | | (0xbc555a40ffb472d9, 0x3fa5555555555556), |
444 | | (0x3c24866314c38093, 0x3f8111111111110e), |
445 | | (0x3be34665978dddb8, 0x3f56c16c16efac90), |
446 | | (0x3baeab43b813ef24, 0x3f2a01a1e12d253c), |
447 | | ]; |
448 | | let z2 = z * z; |
449 | | let z4 = z2 * z2; |
450 | | |
451 | | let b0 = DoubleDouble::quick_mul_add( |
452 | | z, |
453 | | DoubleDouble::from_bit_pair(Q[1]), |
454 | | DoubleDouble::from_bit_pair(Q[0]), |
455 | | ); |
456 | | let b1 = DoubleDouble::quick_mul_add( |
457 | | z, |
458 | | DoubleDouble::from_bit_pair(Q[3]), |
459 | | DoubleDouble::from_bit_pair(Q[2]), |
460 | | ); |
461 | | let b2 = DoubleDouble::quick_mul_add( |
462 | | z, |
463 | | DoubleDouble::from_bit_pair(Q[5]), |
464 | | DoubleDouble::from_bit_pair(Q[4]), |
465 | | ); |
466 | | |
467 | | let c0 = DoubleDouble::quick_mul_add(z2, b1, b0); |
468 | | let c1 = DoubleDouble::quick_mul_add(z2, DoubleDouble::from_bit_pair(Q[6]), b2); |
469 | | |
470 | | let p = DoubleDouble::quick_mul_add(z4, c1, c0); |
471 | | DoubleDouble::quick_mult(p, z) |
472 | | } |
473 | | */ |
#[inline(always)]
/// Double-double approximation of `expm1(z.hi + z.lo)` for the reduced
/// argument range |z| < 2^-12.905 (see the Sollya script referenced below).
pub(crate) fn expm1_poly_fast(z: DoubleDouble) -> DoubleDouble {
    // Polynomial generated by Sollya:
    // d = [-2^-12.905,2^-12.905];
    // f = expm1(x);
    // w = 1;
    // pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, D...|], d, absolute, floating);
    // See ./notes/compound_m1_expm1_fast.sollya
    // p = (expm1(z.hi) - z.hi)/z.hi^2, i.e. coefficients of degrees 2..7.
    // (The large negative last coefficient is a minimax artifact over the
    // tiny interval; its x^7 contribution is far below the target accuracy.)
    let p = f_polyeval6(
        z.hi,
        f64::from_bits(0x3fe0000000000000), // 1/2
        f64::from_bits(0x3fc5555555555555), // ~ 1/6
        f64::from_bits(0x3fa55555555553de), // ~ 1/24
        f64::from_bits(0x3f81111144995a9a), // ~ 1/120
        f64::from_bits(0x3f56c241f9a791c5), // ~ 1/720
        f64::from_bits(0xbfad9209c6d8b9e1),
    );
    // px = z.hi^2 * p, with the first product split exactly.
    let px = DoubleDouble::quick_mult_f64(DoubleDouble::from_exact_mult(z.hi, p), z.hi);
    // expm1(hi + lo) = expm1(hi) + expm1(lo)(1 + expm1(hi)) = expm1(hi) + expm1(lo)expm1(hi) + expm1(lo)
    // expm1(lo) ~ lo
    let expm1_hi = DoubleDouble::f64_add(z.hi, px);
    let mut lowest_part = DoubleDouble::quick_mult_f64(expm1_hi, z.lo);
    lowest_part = DoubleDouble::full_add_f64(lowest_part, z.lo);
    DoubleDouble::quick_dd_add(expm1_hi, lowest_part)
}
499 | | |
/// Double-double approximation of `expm1(z.hi + z.lo)` for |z.hi| < 2^-7,
/// evaluated entirely in double-double Horner form (degree 9).
/// Used by `pow_expm1_1` when no argument reduction is needed.
#[inline(always)]
pub(crate) fn expm1_poly_dd_tiny(z: DoubleDouble) -> DoubleDouble {
    // Polynomial generated in Sollya
    // d = [-2^-7,2^-7];
    // f = expm1(x);
    // w = 1;
    // pf = fpminimax(f, [|1,2,3,4,5,6,7,8,9|], [|1, 1, 107...|], d, absolute, floating);
    // See ./notes/compound_expm1_tiny.sollya
    // (lo, hi) bit pairs for the coefficients 1, 1/2, 1/6, ..., ~1/362880.
    const Q: [(u64, u64); 9] = [
        (0x0000000000000000, 0x3ff0000000000000),
        (0x0000000000000000, 0x3fe0000000000000),
        (0x3c6555564150ff16, 0x3fc5555555555555),
        (0x3c4586275c26f8a5, 0x3fa5555555555555),
        (0xbc19e6193ac658a6, 0x3f81111111111111),
        (0xbbf025e72dc21051, 0x3f56c16c16c1500a),
        (0x3bc2d641a7b7b9b8, 0x3f2a01a01a07dc46),
        (0xbb42cc8aaeeb3d00, 0x3efa01a29fef3e6f),
        (0x3b52b1589125ce82, 0x3ec71db6af553255),
    ];
    // Renormalize the input so |z.lo| < ulp(z.hi) before the Horner chain.
    let z = DoubleDouble::from_exact_add(z.hi, z.lo);
    // Horner from degree 9 down; the two lowest coefficients are exact f64
    // (1/2 and 1), so cheaper f64-add variants are used for those steps.
    let mut d = DoubleDouble::quick_mul_add(
        z,
        DoubleDouble::from_bit_pair(Q[8]),
        DoubleDouble::from_bit_pair(Q[7]),
    );
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[6]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[5]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[4]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[3]));
    d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[2]));
    d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3fe0000000000000));
    d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3ff0000000000000));
    // Final multiply by z gives expm1(z) (the polynomial has no constant term).
    DoubleDouble::quick_mult(d, z)
}
535 | | |
#[inline]
/// Approximation of `expm1` over a double-double argument `r`, with `s` a
/// sign factor as in `pow_exp_1`. Saturates on certain overflow, returns -1
/// (plus a directed-rounding nudge) deep in the underflow region, NaN in the
/// deferred regions, and otherwise computes
/// exp(r) - 1 = 2^ie * (tb*exp(z) - 2^-ie) via tables and `ldexp`.
pub(crate) fn pow_expm1_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    // Same thresholds as pow_exp_1 (comments below were carried over).
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
    the result value el. However, it is not possible to obtain a lower
    bound on |el| from the input value rh, thus this modified value of RHO1
    is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
            For s=1 we return eh = el = DBL_MAX, which yields
            res_min = res_max = +Inf for rounding up or to nearest,
            and res_min = res_max = DBL_MAX for rounding down or toward zero,
            which will yield the correct rounding.
            For s=-1 we return eh = el = -DBL_MAX, which similarly gives
            res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
            which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
            where there might be overflow or not, thus we set eh = el = NaN,
            which will set res_min = res_max = NaN, the comparison
            res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
            thus res_min=+0 or -0, res_max=+0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=1 and rounding up, we have eh=+0, el=2^-1074,
            thus res_min = res_max = 2^-1074, which is the expected result too.
            For s=-1, we have eh=el=-0 except for rounding down,
            thus res_min=-0 or +0, res_max=-0 in the main code,
            the rounding test succeeds, and we return res_max which is the
            expected result in the underflow case.
            For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
            thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            // expm1 variant: shift the exp() underflow result by -1.
            return DoubleDouble::full_add_f64(
                DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s),
                -1.0,
            );
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we return -1 */
            return DoubleDouble::new(0., -1.);
        };
    }

    let ax = r.hi.to_bits() & 0x7fffffffffffffffu64;

    // LOG2H + LOG2L ~ log(2)/2^12 in double-double form.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    if ax <= 0x3f80000000000000 {
        // |x| < 2^-7
        if ax < 0x3970000000000000 {
            // |x| < 2^-104
            // expm1(r) ~ r to full double-double precision.
            return r;
        }
        // Small argument: direct double-double polynomial, no reduction.
        let d = expm1_poly_dd_tiny(r);
        return d;
    }

    // INVLOG2 = 2^12/log(2): 12-bit-index argument reduction.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).cpu_round_ties_even();

    // z = r - k*log(2)/2^12.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    // tbh = T0[i2]*T1[i1], so exp(r) = 2^ie * tbh * exp(z) with ie below.
    let t0 = DoubleDouble::from_bit_pair(EXPM1_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXPM1_T1[i1 as usize]);
    let tbh = DoubleDouble::quick_mult(t1, t0);
    let mut de = tbh;
    // exp(k)=2^k*exp(r) + (2^k - 1)
    let q = expm1_poly_fast(z);
    de = DoubleDouble::quick_mult(de, q);
    // tbh*expm1(z) + tbh = tbh*exp(z): exp(r)/2^ie before subtracting 1.
    de = DoubleDouble::add(tbh, de);

    let ie = mk - 0x3ff;

    // Bit trick: (3071 - ie) << 52 encodes -2^-ie (the top bit of the
    // exponent pattern lands in the sign bit, leaving exponent 1023 - ie).
    // This is the "-1" of expm1 prescaled by 2^-ie, to be undone by ldexp.
    let off: f64 = f64::from_bits((2048i64 + 1023i64).wrapping_sub(ie).wrapping_shl(52) as u64);

    let e: f64;
    if ie < 53 {
        // |off| >= comparable to de.hi: add with off as the dominant term.
        let fhz = DoubleDouble::from_exact_add(off, de.hi);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else if ie < 104 {
        // de.hi dominates; off only perturbs the low bits.
        let fhz = DoubleDouble::from_exact_add(de.hi, off);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else {
        // 2^-ie is below the ~106-bit double-double precision: drop it.
        e = 0.;
    }
    de.lo += e;
    // Collapse to f64 and apply the 2^ie scaling in one rounding.
    de.hi = ldexp(de.to_f64(), ie as i32);
    de.lo = 0.;
    de
}
654 | | |
655 | 0 | fn exp_dyadic_poly(x: DyadicFloat128) -> DyadicFloat128 { |
656 | | const Q_2: [DyadicFloat128; 8] = [ |
657 | | DyadicFloat128 { |
658 | | sign: DyadicSign::Pos, |
659 | | exponent: -128, |
660 | | mantissa: 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffd0_u128, |
661 | | }, |
662 | | DyadicFloat128 { |
663 | | sign: DyadicSign::Pos, |
664 | | exponent: -127, |
665 | | mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0088_u128, |
666 | | }, |
667 | | DyadicFloat128 { |
668 | | sign: DyadicSign::Pos, |
669 | | exponent: -128, |
670 | | mantissa: 0x8000_0000_0000_0000_0000_000c_06f3_cd29_u128, |
671 | | }, |
672 | | DyadicFloat128 { |
673 | | sign: DyadicSign::Pos, |
674 | | exponent: -130, |
675 | | mantissa: 0xaaaa_aaaa_aaaa_aaaa_aaaa_aa6a_1e07_76ae_u128, |
676 | | }, |
677 | | DyadicFloat128 { |
678 | | sign: DyadicSign::Pos, |
679 | | exponent: -132, |
680 | | mantissa: 0xaaaa_aaaa_aaaa_aaa3_0000_0000_0000_0000_u128, |
681 | | }, |
682 | | DyadicFloat128 { |
683 | | sign: DyadicSign::Pos, |
684 | | exponent: -134, |
685 | | mantissa: 0x8888_8888_8888_8897_0000_0000_0000_0000_u128, |
686 | | }, |
687 | | DyadicFloat128 { |
688 | | sign: DyadicSign::Pos, |
689 | | exponent: -137, |
690 | | mantissa: 0xb60b_60b9_3214_6a54_0000_0000_0000_0000_u128, |
691 | | }, |
692 | | DyadicFloat128 { |
693 | | sign: DyadicSign::Pos, |
694 | | exponent: -140, |
695 | | mantissa: 0xd00d_00cd_9841_6862_0000_0000_0000_0000_u128, |
696 | | }, |
697 | | ]; |
698 | 0 | let mut p = Q_2[7]; |
699 | 0 | for i in (0..7).rev() { |
700 | 0 | p = x * p + Q_2[i]; |
701 | 0 | } |
702 | 0 | p |
703 | 0 | } |
704 | | |
/* put in r an approximation of exp(x), for |x| < 744.45,
   with relative error < 2^-121.70 */
#[inline]
pub(crate) fn exp_dyadic(x: DyadicFloat128) -> DyadicFloat128 {
    // ex is the exponent of x relative to the -127 normalization used by
    // DyadicFloat128; ex >= 10 means |x| is large enough that exp(x)
    // certainly overflows (x > 0) or underflows (x < 0).
    let ex = x.exponent + 127;
    if ex >= 10
    // underflow or overflow
    {
        // Return a saturated dyadic value: exponent pushed past the f64
        // range so conversion yields 0/Inf as appropriate. NOTE(review):
        // the mantissa is carried over from x — presumably only the
        // exponent matters to the caller's conversion; confirm.
        return DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: if x.sign == DyadicSign::Neg {
                -1076
            } else {
                1025
            },
            mantissa: x.mantissa,
        };
    }

    // LOG2_INV ~ 2^12/log(2): same 12-bit-index reduction as the f64 paths.
    const LOG2_INV: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -115,
        mantissa: 0xb8aa_3b29_5c17_f0bc_0000_0000_0000_0000_u128,
    };

    // LOG2 ~ log(2) to 128 bits.
    const LOG2: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -128,
        mantissa: 0xb172_17f7_d1cf_79ab_c9e3_b398_03f2_f6af_u128,
    };

    let mut bk = x * LOG2_INV;
    let k = bk.trunc_to_i64(); /* k = trunc(K) [rounded towards zero, exact] */
    /* The rounding error of mul_dint_int64() is bounded by 6 ulps, thus since
    |K| <= 4399162*log(2) < 3049267, the error on K is bounded by 2^-103.41.
    This error is divided by 2^12 below, thus yields < 2^-115.41. */
    // y = x - k*log(2)/2^12 (negate k*log2, scale by 2^-12, add to x).
    bk = LOG2.mul_int64(k);
    bk.exponent -= 12;
    bk.sign = bk.sign.negate();
    let y = x + bk;

    // Split k = 4096*bm + 64*i2 + i1; exp(x) = 2^bm * T1[i2] * T2[i1] * exp(y).
    let bm = k >> 12;
    let i2 = (k >> 6) & 0x3f;
    let i1 = k & 0x3f;
    let mut r = exp_dyadic_poly(y);
    r = EXP_T1_2_DYADIC[i2 as usize] * r;
    r = EXP_T2_2_DYADIC[i1 as usize] * r;
    r.exponent += bm as i16; /* exact */
    r
}
755 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::f_expm1;

    // exp(2.5) through the 128-bit dyadic path should round to the
    // correctly-rounded double value.
    #[test]
    fn test_log() {
        let input = DyadicFloat128::new_from_f64(2.5);
        let result = exp_dyadic(input).fast_as_f64();
        assert_eq!(result, 12.182493960703473);
    }

    // Smoke test: print pow_expm1_1 next to f_expm1 on a sample point
    // for manual comparison.
    #[test]
    fn test_exp() {
        let sample = 2.543543543543;
        let dd = pow_expm1_1(DoubleDouble::new(0., sample), 1.);
        println!("{}", dd.to_f64());
        println!("{}", f_expm1(sample));
    }
}