/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
   FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)       /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
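/* Presumably, MAGIC biases the last fixed-point step downward so that the
   truncated root estimate never exceeds floor(sqrt(a0)), which the final
   increment-by-one fixup below relies on; the bracketed bounds would then
   be the admissible range for that bias.  */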

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
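  /* Newton's update for f(x) = 1/x^2 - a is
       x' = x + x*(1 - a*x^2)/2 = x*(3 - a*x^2)/2,
     which roughly doubles the number of correct bits per step and needs
     no division; the steps below evaluate it with fixed-point scalings.  */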

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];  /* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
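  /* The truncated estimate can be one too small; bump it when
     (x0 + 1)^2 = x2 + 2*x0 + 1 <= a0.  */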
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}


#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate.  */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
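  /* A negative carry means the root sp0 is one too large: since
     (s-1)^2 = s^2 - 2*s + 1, decrement the root and add back
     2*sp0 - 1 (as sp0 plus the decremented sp0, tracking carries).  */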
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder; returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
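/* The recursion follows the "Karatsuba square root" scheme referenced
   above: take the square root of the high half recursively, then derive
   the low half of the root from the remainder divided by twice the high
   root.  Here the division is by the high root itself and the quotient is
   halved afterwards, the dropped parity bit going back into the remainder;
   finally the square of the low part is subtracted and the result is
   corrected if the remainder went negative.  */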
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;      /* carry out of {sp, n} */
  int c, b;         /* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

#if USE_DIVAPPR_Q
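/* Thin wrapper around the divappr_q internals: compute in {qp, nn - dn + 1}
   an approximation of the quotient {np, nn} / {dp, dn}, choosing between
   the schoolbook, divide-and-conquer and block-wise variants by the usual
   size thresholds.  {np, nn} is left untouched; scratch must have room for
   nn limbs (the mu variant allocates its own workspace instead).  */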
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp[nn - dn] = qh;
}
#endif

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd};
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account;
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough? */
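/* Here nsh is half of the even normalization shift applied to the operand
   and odd is the parity of the original operand size, so the root computed
   on the scaled operand is shifted back down at the end.  This function
   computes the root only (mpn_sqrtrem calls it when rp == NULL), which is
   presumably why an approximate division is good enough here.  */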
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;      /* carry out of {sp, n} */
  int c;            /* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp[-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp[l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
           (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
        {
          mp_limb_t cy;
          /* Approximation is not good enough, the extra limb(+ nsh bits)
             is smaller than needed to absorb the possible error. */
          /* {qp + 1, l + 1} equals 2*{sp, l} */
          /* FIXME: use mullo or wrap-around, or directly evaluate
             remainder with a single sqrmod_bnm1. */
          TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
          ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
          /* Compute the remainder of the previous mpn_div(appr)_q. */
          cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
          MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
          ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
          if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
            {
              /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
              cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
              cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
              ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
              MPN_DECR_U (sp, l, 1);
            }
          /* Can the root be exact when a correction was needed? We
             did not find an example, but it depends on divappr
             internals, and we can not assume it true in general... */
          /* else */
#else /* WANT_ASSERT */
          ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
          if (mpn_zero_p (tp + l + 1, h - l))
            {
              TRACE(printf("sqr(,,%u)\n", (unsigned) l));
              mpn_sqr (scratch, sp, l);
              c = mpn_cmp (tp + 1, scratch + l, l);
              if (c == 0)
                {
                  if (nsh != 0)
                    {
                      mpn_lshift (tp, np, l, 2 * nsh);
                      np = tp;
                    }
                  c = mpn_cmp (np, scratch + odd, l - odd);
                }
              if (c < 0)
                {
                  MPN_DECR_U (sp, l, 1);
                  c = 1;
                }
            }
        }
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}

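/* Writes in {sp, ceil(nn/2)} the square root, rounded towards zero, of
   {np, nn}.  If rp is not NULL, the remainder {np, nn} - sp^2 is stored in
   {rp, rn} and its size rn is returned (rn is zero iff the operand is a
   perfect square); rp may equal np.  If rp is NULL, only the root is
   computed and the return value is non-zero iff the remainder is non-zero.
   (Summary of the documented interface; see the GMP manual.)  */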
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
        sp[0] = mpn_sqrtrem1 (&rl, high);
        if (rp != NULL)
          rp[0] = rl;
      }
    else
      {
        cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
        sp[0] = cc;
        if (rp != NULL)
          rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  if (nn == 2) {
    mp_limb_t tp[2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
        rp[1] = high;
        rp[0] = np[0];
        cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
        cc = mpn_sqrtrem2 (sp, rp, np);
#endif
        rp[1] = cc;
        return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
        rl = np[0];
        rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
        rp[0] = rl << (2*c);
        CALL_SQRTREM2_INPLACE (sp, rp);
        cc = sp[0] >>= c;     /* c != 0, the highest bit of the root cc is 0. */
        rp[0] = rl -= cc*cc;  /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
        return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;             /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
        mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
        MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;     /* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
         thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;   /* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
        rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
        tn++;
      else
        {
          tp++;
          c -= GMP_NUMB_BITS;
        }
      if (c != 0)
        mpn_rshift (rp, tp, tn, c);
      else
        MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
        {
          if (rp == NULL) /* nn <= 8 */
            rp = TMP_SALLOC_LIMBS (nn);
          MPN_COPY (rp, np, nn);
        }
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS (tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}