/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
   FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */
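
/* For reference, the recurrence from that paper (notation adapted), which
   mpn_dc_sqrtrem below follows directly; b is a power of 2 chosen so that
   a3*b + a2 is the high half of the operand:

     n = a3*b^3 + a2*b^2 + a1*b + a0,  with a3 >= b/4
     (s1, r1) <- SqrtRem (a3*b + a2)       [recursive call]
     (q, u)   <- DivRem (r1*b + a1, 2*s1)  [one division]
     s <- s1*b + q
     r <- u*b + a0 - q^2
     if r < 0 then { r <- r + 2*s - 1; s <- s - 1 }
     return (s, r)
*/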


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
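
/* The entries appear to follow round(2^12.5 / sqrt(i + 1/2)) - 0x100 for
   i = 0x80 .. 0x1ff (spot-checked at both ends and at several interior
   points; the exact rounding rule is our inference, not documented here).
   A throwaway generator along those lines:

     #include <math.h>
     #include <stdio.h>
     int
     main (void)
     {
       int i;
       for (i = 0x80; i < 0x200; i++)
         printf ("0x%02x,%s",
                 (int) floor (pow (2.0, 12.5) / sqrt (i + 0.5) + 0.5) - 0x100,
                 (i % 8 == 7) ? "\n" : "");
       return 0;
     }
*/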

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
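/* For example, with 64-bit limbs and a0 = 2^63, this yields
   s = floor(sqrt(2^63)) = 3037000499 and
   *rp = 2^63 - 3037000499^2 = 5928526807.  */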

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
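/* MAGIC is a slack term in the final Newton step below; any value in the
   stated range appears to guarantee x0 <= floor(sqrt(a0)) <= x0 + 1, which
   is exactly the deficit the correction at the end of mpn_sqrtrem1 can
   absorb.  */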

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
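  /* The step in question is x' = x * (3 - a*x^2) / 2, which roughly doubles
     the number of correct bits of x ~ 1/sqrt(a) each time; the shifts and
     magic constants below implement it in fixed point, at successively
     higher precisions up to a full limb.  */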

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}


#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
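/* This is the n = 2 base case of the divide-and-conquer scheme: the root
   of the high limb comes from mpn_sqrtrem1, a single division by that root
   yields the low Prec bits of the result, and the branch on cc applies the
   final correction step of the recurrence at the top of the file.  */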
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate.  */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder; returns the high limb
   of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B = 2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.  */
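/* The structure follows the recurrence sketched at the top of the file:
   the recursive call (or mpn_sqrtrem2) yields the high half s1 of the
   root, mpn_tdiv_qr divides the middle of the operand by s1 (the required
   division by 2*s1 is completed by the one-bit right shift of the
   quotient, with the dropped bit folded back into the remainder), and the
   mpn_sqr/mpn_sub_n pair subtracts q^2.  The 'approx' mask covers low
   root bits the caller will shift away: if any of them is set, the
   remainder cannot be zero even after the pending correction by one, so
   we may return early with {np, n} left inexact.  */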
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

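/* Dispatch to one of GMP's approximate-division routines according to the
   divisor-size thresholds.  The returned quotient may exceed the true one
   (the caller compensates for this below), but never falls short of it.  */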
#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small.  */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd};
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B = 2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account;
   when nsh is maximal, two limbs are discarded from the result of the
   division.  Too much?  Is a single dummy limb enough?  */
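/* Strategy, as we read it: this root-only variant never forms the full
   remainder.  It runs the recurrence on roughly the top half of the
   operand, using an approximate division for the low part of the root;
   the remainder is reconstructed below only when the quotient's guard
   bits cannot rule out an error, or when deciding whether the operand is
   a perfect square.  */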
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to set the lowest bits of the dividend exactly; is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area.  */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once.  */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* The approximation is not good enough: the extra limb (+ nsh bits)
	     is smaller than needed to absorb the possible error.  */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1.  */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q.  */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact.  */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed?  We did
	     not find an example, but it depends on divappr internals, and
	     we cannot assume it is true in general...  */
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


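/* Public entry point: writes floor(sqrt({np, nn})) to {sp, ceil(nn/2)},
   the remainder to {rp, rn} when rp != NULL, and returns the remainder
   size rn (zero iff the operand is a perfect square).  For example, with
   nn = 1 and np[0] = 10, it sets sp[0] = 3, rp[0] = 1, and returns 1.  */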
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;	/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0 = S mod 2^k */
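      /* Hence sqrt(N) = (S - s0) / 2^k and the true remainder is
	 (R + 2*S*s0 - s0^2) / 2^(2k); the addmul_1/submul_1 calls and the
	 right shifts below compute exactly that.  */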
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}