Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpn/sqrtrem.c

Format below: execution count (blank for non-executable lines, 0 for lines never executed), source line number, source text.
          1: /* mpn_sqrtrem -- square root and remainder
          2:
          3:    Contributed to the GNU project by Paul Zimmermann (most code),
          4:    Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
          5:
          6:    THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
          7:    INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
          8:    IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
          9:    FUTURE GMP RELEASE.
         10:
         11: Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
         12: Foundation, Inc.
         13:
         14: This file is part of the GNU MP Library.
         15:
         16: The GNU MP Library is free software; you can redistribute it and/or modify
         17: it under the terms of either:
         18:
         19:   * the GNU Lesser General Public License as published by the Free
         20:     Software Foundation; either version 3 of the License, or (at your
         21:     option) any later version.
         22:
         23: or
         24:
         25:   * the GNU General Public License as published by the Free Software
         26:     Foundation; either version 2 of the License, or (at your option) any
         27:     later version.
         28:
         29: or both in parallel, as here.
         30:
         31: The GNU MP Library is distributed in the hope that it will be useful, but
         32: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
         33: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
         34: for more details.
         35:
         36: You should have received copies of the GNU General Public License and the
         37: GNU Lesser General Public License along with the GNU MP Library.  If not,
         38: see https://www.gnu.org/licenses/.  */
         39:
         40:
         41: /* See "Karatsuba Square Root", reference in gmp.texi.  */
         42:
         43:
         44: #include <stdio.h>
         45: #include <stdlib.h>
         46:
         47: #include "gmp-impl.h"
         48: #include "longlong.h"
 1.22k   49: #define USE_DIVAPPR_Q 1
         50: #define TRACE(x)
         51:
         52: static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
         53: {
         54:   0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
         55:   0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
         56:   0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
         57:   0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
         58:   0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
         59:   0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
         60:   0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
         61:   0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
         62:   0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
         63:   0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
         64:   0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
         65:   0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
         66:   0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
         67:   0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
         68:   0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
         69:   0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
         70:   0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
         71:   0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
         72:   0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
         73:   0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
         74:   0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
         75:   0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
         76:   0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
         77:   0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
         78:   0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
         79:   0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
         80:   0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
         81:   0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
         82:   0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
         83:   0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
         84:   0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
         85:   0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
         86:   0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
         87:   0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
         88:   0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
         89:   0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
         90:   0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
         91:   0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
         92:   0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
         93:   0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
         94:   0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
         95:   0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
         96:   0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
         97:   0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
         98:   0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
         99:   0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
        100:   0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
        101:   0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
        102: };
        103:
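
The 384 entries above tabulate a scaled reciprocal square root: for a 9-bit prefix i of the normalized operand (0x80 <= i <= 0x1ff), the value 0x100 | invsqrttab[i - 0x80] approximates 0x100 / sqrt(i / 0x200), the leading 0x100 being the "common 0x100" the comment says was removed. The throwaway generator below reproduces the table under the assumption that the entry is the reciprocal square root rounded to nearest at the midpoint of each input interval; spot checks (entries for 0x80, 0x90, 0x100, 0x1ff) match, but the exact rounding rule is our guess, not documented fact.

    /* Hypothetical regenerator for invsqrttab: entry i-0x80 is taken as
       round(0x100 * sqrt(0x200 / (i + 0.5))) - 0x100, i.e. the reciprocal
       square root at the midpoint of the i-th 9-bit input interval, with
       the implicit leading 0x100 stripped.  */
    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
      for (int i = 0x80; i < 0x200; i++)
        printf ("0x%02x,%s",
                (unsigned) (lround (0x100 * sqrt (0x200 / (i + 0.5))) - 0x100),
                i % 8 == 7 ? "\n" : "");
      return 0;
    }
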
        104: /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
        105:
        106: #if GMP_NUMB_BITS > 32
 1.29k  107: #define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
        108: #else
        109: #define MAGIC CNST_LIMB(0x100000)   /* 0xfee6f < MAGIC < 0x29cbc8 */
        110: #endif
        111:
        112: static mp_limb_t
        113: mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
 1.29k  114: {
 1.29k  115: #if GMP_NUMB_BITS > 32
 1.29k  116:   mp_limb_t a1;
 1.29k  117: #endif
 1.29k  118:   mp_limb_t x0, t2, t, x2;
 1.29k  119:   unsigned abits;
        120:
 1.29k  121:   ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
 1.29k  122:   ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
 1.29k  123:   ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
        124:
        125:   /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
        126:      since we can do the former without division.  As part of the last
        127:      iteration convert from 1/sqrt(a) to sqrt(a).  */
        128:
 1.29k  129:   abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
 1.29k  130:   x0 = 0x100 | invsqrttab[abits - 0x80];  /* initial 1/sqrt(a) */
        131:
        132:   /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
        133:
 1.29k  134: #if GMP_NUMB_BITS > 32
 1.29k  135:   a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
 1.29k  136:   t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
 1.29k  137:   x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
        138:
        139:   /* x0 is now a 16 bits approximation of 1/sqrt(a0) */
        140:
 1.29k  141:   t2 = x0 * (a0 >> (32-8));
 1.29k  142:   t = t2 >> 25;
 1.29k  143:   t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
 1.29k  144:   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
 1.29k  145:   x0 >>= 32;
        146: #else
        147:   t2 = x0 * (a0 >> (16-8));
        148:   t = t2 >> 13;
        149:   t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
        150:   x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
        151:   x0 >>= 16;
        152: #endif
        153:
        154:   /* x0 is now a full limb approximation of sqrt(a0) */
        155:
 1.29k  156:   x2 = x0 * x0;
 1.29k  157:   if (x2 + 2*x0 <= a0 - 1)
    59  158:     {
    59  159:       x2 += 2*x0 + 1;
    59  160:       x0++;
    59  161:     }
        162:
 1.29k  163:   *rp = a0 - x2;
 1.29k  164:   return x0;
 1.29k  165: }
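
Both #if branches above are fixed-point Newton steps for f(x) = x^-2 - a, whose update x' = x(3 - a x^2)/2 needs only multiplications and shifts and roughly doubles the number of correct bits per step; the last step also multiplies through by (a segment of) a0, turning the 1/sqrt(a0) approximation into the sqrt(a0) estimate that the final exactness test adjusts. The same iteration in plain floating point, as an illustrative sketch whose helper name is ours, not GMP's:

    #include <assert.h>
    #include <math.h>

    /* Newton iteration for 1/sqrt(a): x' = x*(3 - a*x*x)/2.
       Division-free apart from the constant halving.  */
    static double inv_sqrt_newton (double a, double x, int steps)
    {
      while (steps-- > 0)
        x = x * (3.0 - a * x * x) * 0.5;  /* error roughly squares each step */
      return x;
    }

    int main (void)
    {
      double a = 0.7;                           /* normalized: 1/4 <= a < 1 */
      double x = inv_sqrt_newton (a, 1.2, 4);   /* 1.2 plays the table's role */
      assert (fabs (a * x - sqrt (a)) < 1e-12); /* a * (1/sqrt a) = sqrt a */
      return 0;
    }
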
        166:
        167:
 8.61k  168: #define Prec (GMP_NUMB_BITS >> 1)
        169: #if ! defined(SQRTREM2_INPLACE)
        170: #define SQRTREM2_INPLACE 0
        171: #endif
        172:
        173: /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
        174:    return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
        175: #if SQRTREM2_INPLACE
        176: #define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
        177: static mp_limb_t
        178: mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
        179: {
        180:   mp_srcptr np = rp;
        181: #else
 1.22k  182: #define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
        183: static mp_limb_t
        184: mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
 1.23k  185: {
 1.23k  186: #endif
 1.23k  187:   mp_limb_t q, u, np0, sp0, rp0, q2;
 1.23k  188:   int cc;
        189:
 1.23k  190:   ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
        191:
 1.23k  192:   np0 = np[0];
 1.23k  193:   sp0 = mpn_sqrtrem1 (rp, np[1]);
 1.23k  194:   rp0 = rp[0];
        195:   /* rp0 <= 2*sp0 < 2^(Prec + 1) */
 1.23k  196:   rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
 1.23k  197:   q = rp0 / sp0;
        198:   /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
 1.23k  199:   q -= q >> Prec;
        200:   /* now we have q < 2^Prec */
 1.23k  201:   u = rp0 - q * sp0;
        202:   /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
 1.23k  203:   sp0 = (sp0 << Prec) | q;
 1.23k  204:   cc = u >> (Prec - 1);
 1.23k  205:   rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
        206:   /* subtract q * q from rp */
 1.23k  207:   q2 = q * q;
 1.23k  208:   cc -= rp0 < q2;
 1.23k  209:   rp0 -= q2;
 1.23k  210:   if (cc < 0)
   236  211:     {
   236  212:       rp0 += sp0;
   236  213:       cc += rp0 < sp0;
   236  214:       --sp0;
   236  215:       rp0 += sp0;
   236  216:       cc += rp0 < sp0;
   236  217:     }
        218:
 1.23k  219:   rp[0] = rp0;
 1.23k  220:   sp[0] = sp0;
 1.23k  221:   return cc;
 1.23k  222: }
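
mpn_sqrtrem2 extends a one-limb root to a two-limb root with a single hardware division: if np[1] = sp0^2 + r with r <= 2*sp0, the next Prec-bit root "digit" is q ~ (r*2^Prec + the next operand bits) / (2*sp0), after which the candidate remainder is repaired if it went negative (the cc < 0 branch above). The same structure as a standalone, testable function with 32-bit "limbs" (so Prec = 16); isqrt16 is an illustrative stand-in for mpn_sqrtrem1, and the while loop is a deliberately lazy version of the one-shot fixup above:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t isqrt16 (uint32_t *r, uint32_t a)  /* needs a >= 2^30 */
    {
      uint32_t s = 0;
      for (uint32_t b = 1u << 15; b != 0; b >>= 1)     /* bitwise sqrt */
        if ((uint64_t) (s + b) * (s + b) <= a)
          s += b;
      *r = a - s * s;
      return s;                                        /* 16-bit root */
    }

    /* floor(sqrt(n)) and remainder for normalized n (n >= 2^62),
       mirroring mpn_sqrtrem2 with GMP_NUMB_BITS = 32, Prec = 16.  */
    static uint32_t sqrtrem2_sketch (uint64_t n, uint64_t *rem)
    {
      uint32_t n1 = (uint32_t) (n >> 32), n0 = (uint32_t) n, r1;
      uint32_t s = isqrt16 (&r1, n1);          /* root of the high limb */
      uint32_t num = (r1 << 15) + (n0 >> 17);  /* (r1*2^32 + n0) >> 17, exact */
      uint32_t q = num / s;                    /* next 16-bit root digit */
      q -= q >> 16;                            /* clamp the q = 2^16 case */
      uint32_t u = num - q * s;
      uint64_t s2 = ((uint64_t) s << 16) | q;
      int64_t r = (int64_t) (((uint64_t) u << 17) | (n0 & 0x1ffff))
                  - (int64_t) ((uint64_t) q * q);
      while (r < 0)                            /* a single step in GMP's code */
        {
          r += 2 * (int64_t) s2 - 1;           /* (s2-1)^2 = s2^2 - (2*s2 - 1) */
          s2--;
        }
      *rem = (uint64_t) r;
      return (uint32_t) s2;
    }

    int main (void)
    {
      uint64_t n = 0xffffffffffffffffull, r;   /* normalized test operand */
      uint32_t s = sqrtrem2_sketch (n, &r);
      assert ((uint64_t) s * s + r == n && r <= 2 * (uint64_t) s);
      return 0;
    }
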
        223:
        224: /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
        225:    and in {np, n} the low n limbs of the remainder, returns the high
        226:    limb of the remainder (which is 0 or 1).
        227:    Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
        228:    where B=2^GMP_NUMB_BITS.
        229:    Needs a scratch of n/2+1 limbs. */
        230: static mp_limb_t
        231: mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
 4.67k  232: {
 4.67k  233:   mp_limb_t q;      /* carry out of {sp, n} */
 4.67k  234:   int c, b;         /* carry out of remainder */
 4.67k  235:   mp_size_t l, h;
        236:
 4.67k  237:   ASSERT (n > 1);
 4.67k  238:   ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
        239:
 4.67k  240:   l = n / 2;
 4.67k  241:   h = n - l;
 4.67k  242:   if (h == 1)
 1.17k  243:     q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
 3.49k  244:   else
 3.49k  245:     q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
 4.67k  246:   if (q != 0)
 4.67k  247:     ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
 4.67k  248:   TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
 4.67k  249:   mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
 4.67k  250:   q += scratch[l];
 4.67k  251:   c = scratch[0] & 1;
 4.67k  252:   mpn_rshift (sp, scratch, l, 1);
 4.67k  253:   sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
 4.67k  254:   if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    33  255:     return 1; /* Remainder is non-zero */
 4.63k  256:   q >>= 1;
 4.63k  257:   if (c != 0)
 2.05k  258:     c = mpn_add_n (np + l, np + l, sp + l, h);
 4.63k  259:   TRACE(printf("sqr(,,%u)\n", (unsigned) l));
 4.63k  260:   mpn_sqr (np + n, sp, l);
 4.63k  261:   b = q + mpn_sub_n (np, np, np + n, 2 * l);
 4.63k  262:   c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
        263:
 4.63k  264:   if (c < 0)
 1.14k  265:     {
 1.14k  266:       q = mpn_add_1 (sp + l, sp + l, h, q);
 1.14k  267: #if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
 1.14k  268:       c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
        269: #else
        270:       c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
        271: #endif
 1.14k  272:       c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
 1.14k  273:       q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
 1.14k  274:     }
        275:
 4.63k  276:   return c;
 4.67k  277: }
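
This is the recursion of the "Karatsuba Square Root" paper referenced at the top of the file: write the 2n-limb operand as a3*b^3 + a2*b^2 + a1*b + a0 with b = B^(n/2), recurse on the high half to get (s', r') = sqrtrem(a3*b + a2), extract the low half of the root as the quotient (q, u) = divmod(r'*b + a1, 2*s'), and take s = s'*b + q with remainder u*b + a0 - q^2, stepping s down when that remainder goes negative; the normalization np[2n-1] >= B/4 is what keeps the correction small. One level of the recurrence, specialized to 128-bit operands over a 64-bit base case, as a hedged sketch (sqrtrem128 and isqrt64 are our names, and unsigned __int128 is a compiler extension, not ISO C):

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    static uint64_t isqrt64 (uint64_t *r, uint64_t a)  /* base case */
    {
      uint64_t s = 0;
      for (uint64_t b = 1ull << 31; b != 0; b >>= 1)   /* bitwise sqrt */
        if ((u128) (s + b) * (s + b) <= a)
          s += b;
      *r = a - s * s;
      return s;
    }

    /* floor(sqrt(n)) and remainder for normalized n (n >= 2^126),
       via one Karatsuba square-root step with beta = 2^32.  */
    static u128 sqrtrem128 (u128 *rem, u128 n)
    {
      const uint64_t beta = 1ull << 32;
      uint64_t hi = (uint64_t) (n >> 64);          /* a3*beta + a2 */
      uint64_t a1 = (uint64_t) n >> 32;
      uint64_t a0 = (uint64_t) n & (beta - 1);
      uint64_t r1, s1 = isqrt64 (&r1, hi);         /* recursive call */
      u128 num = (u128) r1 * beta + a1;            /* r'*beta + a1 */
      uint64_t q = (uint64_t) (num / (2 * (u128) s1));
      u128 u = num - (u128) q * (2 * (u128) s1);
      u128 s = (u128) s1 * beta + q;
      __int128 r = (__int128) (u * beta + a0) - (__int128) ((u128) q * q);
      while (r < 0)        /* the paper shows one step suffices when normalized */
        {
          r += 2 * (__int128) s - 1;
          s--;
        }
      *rem = (u128) r;
      return s;
    }

    int main (void)
    {
      u128 n = ((u128) 1 << 126) + 12345, r;
      u128 s = sqrtrem128 (&r, n);
      assert (s * s + r == n && r <= 2 * s);
      return 0;
    }
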
        278:
        279: #if USE_DIVAPPR_Q
        280: static void
        281: mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
   611  282: {
   611  283:   gmp_pi1_t inv;
   611  284:   mp_limb_t qh;
   611  285:   ASSERT (dn > 2);
   611  286:   ASSERT (nn >= dn);
   611  287:   ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
        288:
   611  289:   MPN_COPY (scratch, np, nn);
   611  290:   invert_pi1 (inv, dp[dn-1], dp[dn-2]);
   611  291:   if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
   611  292:     qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
     0  293:   else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
     0  294:     qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
     0  295:   else
     0  296:     {
     0  297:       mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
     0  298:       TMP_DECL;
     0  299:       TMP_MARK;
        300:       /* Sadly, scratch is too small. */
     0  301:       qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
     0  302:       TMP_FREE;
     0  303:     }
   611  304:   qp [nn - dn] = qh;
   611  305: }
        306: #endif
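
The closing store qp [nn - dn] = qh records the quotient's most significant limb separately: with B = 2^GMP_NUMB_BITS, the normalization asserted above bounds the quotient by

\[
N < B^{nn}, \qquad D \ge \tfrac{1}{2}\,B^{dn}
\;\Longrightarrow\;
Q = \lfloor N/D \rfloor < 2\,B^{\,nn-dn},
\]

so the quotient occupies nn - dn limbs plus a high limb qh that can only be 0 or 1, which is why the divappr functions return it instead of writing it through qp themselves.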
        307:
        308: /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
        309:    returns zero if the operand was a perfect square, one otherwise.
        310:    Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
        311:    where B=2^GMP_NUMB_BITS.
        312:    THINK: In the odd case, three more (dummy) limbs are taken into account,
        313:    when nsh is maximal, two limbs are discarded from the result of the
        314:    division. Too much? Is a single dummy limb enough? */
        315: static int
        316: mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
   611  317: {
   611  318:   mp_limb_t q;      /* carry out of {sp, n} */
   611  319:   int c;            /* carry out of remainder */
   611  320:   mp_size_t l, h;
   611  321:   mp_ptr qp, tp, scratch;
   611  322:   TMP_DECL;
   611  323:   TMP_MARK;
        324:
   611  325:   ASSERT (np[2 * n - 1 - odd] != 0);
   611  326:   ASSERT (n > 4);
   611  327:   ASSERT (nsh < GMP_NUMB_BITS / 2);
        328:
   611  329:   l = (n - 1) / 2;
   611  330:   h = n - l;
   611  331:   ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
   611  332:   scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
   611  333:   tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
   611  334:   if (nsh != 0)
   537  335:     {
        336:       /* o is used to exactly set the lowest bits of the dividend, is it needed? */
   537  337:       int o = l > (1 + odd);
   537  338:       ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
   537  339:     }
    74  340:   else
    74  341:     MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
   611  342:   q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
   611  343:   if (q != 0)
   611  344:     ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
   611  345:   qp = tp + n + 1; /* l + 2 */
   611  346:   TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
   611  347: #if USE_DIVAPPR_Q
   611  348:   mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
        349: #else
        350:   mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
        351: #endif
   611  352:   q += qp [l + 1];
   611  353:   c = 1;
   611  354:   if (q > 1)
     1  355:     {
        356:       /* FIXME: if s!=0 we will shift later, a noop on this area. */
     1  357:       MPN_FILL (sp, l, GMP_NUMB_MAX);
     1  358:     }
   610  359:   else
   610  360:     {
        361:       /* FIXME: if s!=0 we will shift again later, shift just once. */
   610  362:       mpn_rshift (sp, qp + 1, l, 1);
   610  363:       sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
   610  364:       if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
   610  365:            (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
   219  366:         {
   219  367:           mp_limb_t cy;
        368:           /* Approximation is not good enough, the extra limb(+ nsh bits)
        369:              is smaller than needed to absorb the possible error. */
        370:           /* {qp + 1, l + 1} equals 2*{sp, l} */
        371:           /* FIXME: use mullo or wrap-around, or directly evaluate
        372:              remainder with a single sqrmod_bnm1. */
   219  373:           TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
   219  374:           ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
        375:           /* Compute the remainder of the previous mpn_div(appr)_q. */
   219  376:           cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
   219  377: #if USE_DIVAPPR_Q || WANT_ASSERT
   219  378:           MPN_DECR_U (tp + 1 + h, l, cy);
   219  379: #if USE_DIVAPPR_Q
   219  380:           ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
   219  381:           if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
    24  382:             {
        383:               /* May happen only if div result was not exact. */
    24  384: #if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
    24  385:               cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
        386: #else
        387:               cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
        388: #endif
    24  389:               ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
    24  390:               MPN_DECR_U (sp, l, 1);
    24  391:             }
        392:           /* Can the root be exact when a correction was needed? We
        393:              did not find an example, but it depends on divappr
        394:              internals, and we can not assume it true in general...*/
        395:           /* else */
        396: #else /* WANT_ASSERT */
        397:           ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
        398: #endif
   219  399: #endif
   219  400:           if (mpn_zero_p (tp + l + 1, h - l))
   191  401:             {
   191  402:               TRACE(printf("sqr(,,%u)\n", (unsigned) l));
   191  403:               mpn_sqr (scratch, sp, l);
   191  404:               c = mpn_cmp (tp + 1, scratch + l, l);
   191  405:               if (c == 0)
    38  406:                 {
    38  407:                   if (nsh != 0)
    27  408:                     {
    27  409:                       mpn_lshift (tp, np, l, 2 * nsh);
    27  410:                       np = tp;
    27  411:                     }
    38  412:                   c = mpn_cmp (np, scratch + odd, l - odd);
    38  413:                 }
   191  414:               if (c < 0)
    56  415:                 {
    56  416:                   MPN_DECR_U (sp, l, 1);
    56  417:                   c = 1;
    56  418:                 }
   191  419:             }
   219  420:         }
   610  421:     }
   611  422:   TMP_FREE;
        423:
   611  424:   if ((odd | nsh) != 0)
   551  425:     mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
   611  426:   return c;
   611  427: }
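
The final mpn_rshift undoes the normalization: the operand was implicitly scaled by 4^nsh (plus half a limb in the odd case) so the divide-and-conquer core always sees a top limb >= B/4, which leaves the computed root exactly 2^k times too large, k = nsh + odd*GMP_NUMB_BITS/2. Shifting the root right by k is legitimate because

\[
\left\lfloor \frac{\lfloor \sqrt{4^{k} N} \rfloor}{2^{k}} \right\rfloor
= \lfloor \sqrt{N} \rfloor,
\]

an instance of \(\lfloor \lfloor x \rfloor / m \rfloor = \lfloor x/m \rfloor\) for integer m, applied to x = 2^k sqrt(N). Note that this path only recovers the root and the remainder's zeroness (the returned c), not the remainder's limbs, which is why mpn_sqrtrem takes it only when rp == NULL.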
        428:
        429:
        430: mp_size_t
        431: mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
 1.29k  432: {
 1.29k  433:   mp_limb_t cc, high, rl;
 1.29k  434:   int c;
 1.29k  435:   mp_size_t rn, tn;
 1.29k  436:   TMP_DECL;
        437:
 1.29k  438:   ASSERT (nn > 0);
 1.29k  439:   ASSERT_MPN (np, nn);
        440:
 1.29k  441:   ASSERT (np[nn - 1] != 0);
 1.29k  442:   ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
 1.29k  443:   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
 1.29k  444:   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
        445:
 1.29k  446:   high = np[nn - 1];
 1.29k  447:   if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
   224  448:     c = 0;
 1.07k  449:   else
 1.07k  450:     {
 1.07k  451:       count_leading_zeros (c, high);
 1.07k  452:       c -= GMP_NAIL_BITS;
        453:
 1.07k  454:       c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
 1.07k  455:     }
 1.29k  456:   if (nn == 1) {
    68  457:     if (c == 0)
    28  458:       {
    28  459:         sp[0] = mpn_sqrtrem1 (&rl, high);
    28  460:         if (rp != NULL)
    14  461:           rp[0] = rl;
    28  462:       }
    40  463:     else
    40  464:       {
    40  465:         cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
    40  466:         sp[0] = cc;
    40  467:         if (rp != NULL)
    24  468:           rp[0] = rl = high - cc*cc;
    40  469:       }
    68  470:     return rl != 0;
    68  471:   }
 1.23k  472:   if (nn == 2) {
    56  473:     mp_limb_t tp [2];
    56  474:     if (rp == NULL) rp = tp;
    56  475:     if (c == 0)
     6  476:       {
        477: #if SQRTREM2_INPLACE
        478:         rp[1] = high;
        479:         rp[0] = np[0];
        480:         cc = CALL_SQRTREM2_INPLACE (sp, rp);
        481: #else
     6  482:         cc = mpn_sqrtrem2 (sp, rp, np);
     6  483: #endif
     6  484:         rp[1] = cc;
     6  485:         return ((rp[0] | cc) != 0) + cc;
     6  486:       }
    50  487:     else
    50  488:       {
    50  489:         rl = np[0];
    50  490:         rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
    50  491:         rp[0] = rl << (2*c);
    50  492:         CALL_SQRTREM2_INPLACE (sp, rp);
    50  493:         cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */
    50  494:         rp[0] = rl -= cc*cc; /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
    50  495:         return rl != 0;
    50  496:       }
    56  497:   }
 1.17k  498:   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
        499:
 1.17k  500:   if ((rp == NULL) && (nn > 8))
   611  501:     return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
   564  502:   TMP_MARK;
   564  503:   if (((nn & 1) | c) != 0)
   480  504:     {
   480  505:       mp_limb_t s0[1], mask;
   480  506:       mp_ptr tp, scratch;
   480  507:       TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
   480  508:       tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
   480  509:       if (c != 0)
   448  510:         mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
    32  511:       else
    32  512:         MPN_COPY (tp + (nn & 1), np, nn);
   480  513:       c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0; /* c now represents k */
   480  514:       mask = (CNST_LIMB (1) << c) - 1;
   480  515:       rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
        516:       /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
        517:          thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
   480  518:       s0[0] = sp[0] & mask; /* S mod 2^k */
   480  519:       rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
   480  520:       cc = mpn_submul_1 (tp, s0, 1, s0[0]);
   480  521:       rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
   480  522:       mpn_rshift (sp, sp, tn, c);
   480  523:       tp[tn] = rl;
   480  524:       if (rp == NULL)
    39  525:         rp = tp;
   480  526:       c = c << 1;
   480  527:       if (c < GMP_NUMB_BITS)
   270  528:         tn++;
   210  529:       else
   210  530:         {
   210  531:           tp++;
   210  532:           c -= GMP_NUMB_BITS;
   210  533:         }
   480  534:       if (c != 0)
   448  535:         mpn_rshift (rp, tp, tn, c);
    32  536:       else
    32  537:         MPN_COPY_INCR (rp, tp, tn);
   480  538:       rn = tn;
   480  539:     }
    84  540:   else
    84  541:     {
    84  542:       if (rp != np)
    84  543:         {
    84  544:           if (rp == NULL) /* nn <= 8 */
    11  545:             rp = TMP_SALLOC_LIMBS (nn);
    84  546:           MPN_COPY (rp, np, nn);
    84  547:         }
    84  548:       rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    84  549:     }
        550:
   564  551:   MPN_NORMALIZE (rp, rn);
        552:
   564  553:   TMP_FREE;
   564  554:   return rn;
   564  555: }
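
mpn_sqrtrem itself is the one documented entry point in this file (GMP manual, "Low-level Functions"): sp receives the ceil(nn/2)-limb root, rp, which may be NULL when the remainder's value is not wanted, receives the remainder, and the return value is the remainder's limb count, zero exactly when the operand is a perfect square. A minimal caller:

    #include <stdio.h>
    #include <gmp.h>

    int main (void)
    {
      /* Operand N = 1*B + 42 with B = 2^GMP_NUMB_BITS; the most
         significant limb must be nonzero.  */
      mp_limb_t np[2] = { 42, 1 };
      mp_limb_t sp[1];              /* ceil(2/2) = 1 limb for the root */
      mp_limb_t rp[2];              /* remainder needs up to nn limbs */

      mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);
      gmp_printf ("root = %Mu, remainder limbs = %d\n", sp[0], (int) rn);
      if (rn == 0)
        puts ("perfect square");
      return 0;
    }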