Coverage Report

Created: 2025-03-18 06:55

/src/gmp/mpn/sqrtrem.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_sqrtrem -- square root and remainder
2
3
   Contributed to the GNU project by Paul Zimmermann (most code),
4
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
5
6
   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
7
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
8
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
9
   FUTURE GMP RELEASE.
10
11
Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
12
Foundation, Inc.
13
14
This file is part of the GNU MP Library.
15
16
The GNU MP Library is free software; you can redistribute it and/or modify
17
it under the terms of either:
18
19
  * the GNU Lesser General Public License as published by the Free
20
    Software Foundation; either version 3 of the License, or (at your
21
    option) any later version.
22
23
or
24
25
  * the GNU General Public License as published by the Free Software
26
    Foundation; either version 2 of the License, or (at your option) any
27
    later version.
28
29
or both in parallel, as here.
30
31
The GNU MP Library is distributed in the hope that it will be useful, but
32
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
33
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
34
for more details.
35
36
You should have received copies of the GNU General Public License and the
37
GNU Lesser General Public License along with the GNU MP Library.  If not,
38
see https://www.gnu.org/licenses/.  */
39
40
41
/* See "Karatsuba Square Root", reference in gmp.texi.  */
42
43
44
#include <stdio.h>
45
#include <stdlib.h>
46
47
#include "gmp-impl.h"
48
#include "longlong.h"
49
0
#define USE_DIVAPPR_Q 1
50
#define TRACE(x)
51
52
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
53
{
54
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
55
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
56
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
57
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
58
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
59
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
60
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
61
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
62
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
63
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
64
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
65
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
66
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
67
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
68
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
69
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
70
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
71
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
72
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
73
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
74
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
75
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
76
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
77
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
78
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
79
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
80
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
81
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
82
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
83
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
84
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
85
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
86
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
87
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
88
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
89
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
90
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
91
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
92
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
93
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
94
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
95
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
96
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
97
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
98
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
99
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
100
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
101
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
102
};
103
104
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
105
106
#if GMP_NUMB_BITS > 32
107
0
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
108
#else
109
#define MAGIC CNST_LIMB(0x100000)   /* 0xfee6f < MAGIC < 0x29cbc8 */
110
#endif
111
112
static mp_limb_t
113
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
114
0
{
115
0
#if GMP_NUMB_BITS > 32
116
0
  mp_limb_t a1;
117
0
#endif
118
0
  mp_limb_t x0, t2, t, x2;
119
0
  unsigned abits;
120
121
0
  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
122
0
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
123
0
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
124
125
  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
126
     since we can do the former without division.  As part of the last
127
     iteration convert from 1/sqrt(a) to sqrt(a).  */
128
129
0
  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
130
0
  x0 = 0x100 | invsqrttab[abits - 0x80];  /* initial 1/sqrt(a) */
131
132
  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
133
134
0
#if GMP_NUMB_BITS > 32
135
0
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
136
0
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
137
0
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
138
139
  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */
140
141
0
  t2 = x0 * (a0 >> (32-8));
142
0
  t = t2 >> 25;
143
0
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
144
0
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
145
0
  x0 >>= 32;
146
#else
147
  t2 = x0 * (a0 >> (16-8));
148
  t = t2 >> 13;
149
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
150
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
151
  x0 >>= 16;
152
#endif
153
154
  /* x0 is now a full limb approximation of sqrt(a0) */
155
156
0
  x2 = x0 * x0;
157
0
  if (x2 + 2*x0 <= a0 - 1)
158
0
    {
159
0
      x2 += 2*x0 + 1;
160
0
      x0++;
161
0
    }
162
163
0
  *rp = a0 - x2;
164
0
  return x0;
165
0
}
166
167
168
0
#define Prec (GMP_NUMB_BITS >> 1)
169
#if ! defined(SQRTREM2_INPLACE)
170
#define SQRTREM2_INPLACE 0
171
#endif
172
173
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
174
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
175
#if SQRTREM2_INPLACE
176
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
177
static mp_limb_t
178
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
179
{
180
  mp_srcptr np = rp;
181
#else
182
0
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
183
static mp_limb_t
184
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
185
0
{
186
0
#endif
187
0
  mp_limb_t q, u, np0, sp0, rp0, q2;
188
0
  int cc;
189
190
0
  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
191
192
0
  np0 = np[0];
193
0
  sp0 = mpn_sqrtrem1 (rp, np[1]);
194
0
  rp0 = rp[0];
195
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
196
0
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
197
0
  q = rp0 / sp0;
198
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
199
0
  q -= q >> Prec;
200
  /* now we have q < 2^Prec */
201
0
  u = rp0 - q * sp0;
202
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
203
0
  sp0 = (sp0 << Prec) | q;
204
0
  cc = u >> (Prec - 1);
205
0
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
206
  /* subtract q * q from rp */
207
0
  q2 = q * q;
208
0
  cc -= rp0 < q2;
209
0
  rp0 -= q2;
210
0
  if (cc < 0)
211
0
    {
212
0
      rp0 += sp0;
213
0
      cc += rp0 < sp0;
214
0
      --sp0;
215
0
      rp0 += sp0;
216
0
      cc += rp0 < sp0;
217
0
    }
218
219
0
  rp[0] = rp0;
220
0
  sp[0] = sp0;
221
0
  return cc;
222
0
}
223
224
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
225
   and in {np, n} the low n limbs of the remainder, returns the high
226
   limb of the remainder (which is 0 or 1).
227
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
228
   where B=2^GMP_NUMB_BITS.
229
   Needs a scratch of n/2+1 limbs. */
230
static mp_limb_t
231
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
232
0
{
233
0
  mp_limb_t q;      /* carry out of {sp, n} */
234
0
  int c, b;     /* carry out of remainder */
235
0
  mp_size_t l, h;
236
237
0
  ASSERT (n > 1);
238
0
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
239
240
0
  l = n / 2;
241
0
  h = n - l;
242
0
  if (h == 1)
243
0
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
244
0
  else
245
0
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
246
0
  if (q != 0)
247
0
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
248
0
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
249
0
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
250
0
  q += scratch[l];
251
0
  c = scratch[0] & 1;
252
0
  mpn_rshift (sp, scratch, l, 1);
253
0
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
254
0
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
255
0
    return 1; /* Remainder is non-zero */
256
0
  q >>= 1;
257
0
  if (c != 0)
258
0
    c = mpn_add_n (np + l, np + l, sp + l, h);
259
0
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
260
0
  mpn_sqr (np + n, sp, l);
261
0
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
262
0
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
263
264
0
  if (c < 0)
265
0
    {
266
0
      q = mpn_add_1 (sp + l, sp + l, h, q);
267
0
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
268
0
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
269
#else
270
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
271
#endif
272
0
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
273
0
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
274
0
    }
275
276
0
  return c;
277
0
}
278
279
#if USE_DIVAPPR_Q
280
static void
281
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
282
0
{
283
0
  gmp_pi1_t inv;
284
0
  mp_limb_t qh;
285
0
  ASSERT (dn > 2);
286
0
  ASSERT (nn >= dn);
287
0
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
288
289
0
  MPN_COPY (scratch, np, nn);
290
0
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
291
0
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
292
0
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
293
0
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
294
0
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
295
0
  else
296
0
    {
297
0
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
298
0
      TMP_DECL;
299
0
      TMP_MARK;
300
      /* Sadly, scratch is too small. */
301
0
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
302
0
      TMP_FREE;
303
0
    }
304
0
  qp [nn - dn] = qh;
305
0
}
306
#endif
307
308
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
309
   returns zero if the operand was a perfect square, one otherwise.
310
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
311
   where B=2^GMP_NUMB_BITS.
312
   THINK: In the odd case, three more (dummy) limbs are taken into account,
313
   when nsh is maximal, two limbs are discarded from the result of the
314
   division. Too much? Is a single dummy limb enough? */
315
static int
316
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
317
0
{
318
0
  mp_limb_t q;      /* carry out of {sp, n} */
319
0
  int c;      /* carry out of remainder */
320
0
  mp_size_t l, h;
321
0
  mp_ptr qp, tp, scratch;
322
0
  TMP_DECL;
323
0
  TMP_MARK;
324
325
0
  ASSERT (np[2 * n - 1 - odd] != 0);
326
0
  ASSERT (n > 4);
327
0
  ASSERT (nsh < GMP_NUMB_BITS / 2);
328
329
0
  l = (n - 1) / 2;
330
0
  h = n - l;
331
0
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
332
0
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
333
0
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
334
0
  if (nsh != 0)
335
0
    {
336
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
337
0
      int o = l > (1 + odd);
338
0
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
339
0
    }
340
0
  else
341
0
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
342
0
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
343
0
  if (q != 0)
344
0
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
345
0
  qp = tp + n + 1; /* l + 2 */
346
0
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
347
0
#if USE_DIVAPPR_Q
348
0
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
349
#else
350
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
351
#endif
352
0
  q += qp [l + 1];
353
0
  c = 1;
354
0
  if (q > 1)
355
0
    {
356
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
357
0
      MPN_FILL (sp, l, GMP_NUMB_MAX);
358
0
    }
359
0
  else
360
0
    {
361
      /* FIXME: if s!=0 we will shift again later, shift just once. */
362
0
      mpn_rshift (sp, qp + 1, l, 1);
363
0
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
364
0
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
365
0
     (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
366
0
  {
367
0
    mp_limb_t cy;
368
    /* Approximation is not good enough, the extra limb(+ nsh bits)
369
       is smaller than needed to absorb the possible error. */
370
    /* {qp + 1, l + 1} equals 2*{sp, l} */
371
    /* FIXME: use mullo or wrap-around, or directly evaluate
372
       remainder with a single sqrmod_bnm1. */
373
0
    TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
374
0
    ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
375
    /* Compute the remainder of the previous mpn_div(appr)_q. */
376
0
    cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
377
0
#if USE_DIVAPPR_Q || WANT_ASSERT
378
0
    MPN_DECR_U (tp + 1 + h, l, cy);
379
0
#if USE_DIVAPPR_Q
380
0
    ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
381
0
    if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
382
0
      {
383
        /* May happen only if div result was not exact. */
384
0
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
385
0
        cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
386
#else
387
        cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
388
#endif
389
0
        ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
390
0
        MPN_DECR_U (sp, l, 1);
391
0
      }
392
    /* Can the root be exact when a correction was needed? We
393
       did not find an example, but it depends on divappr
394
       internals, and we can not assume it true in general...*/
395
    /* else */
396
#else /* WANT_ASSERT */
397
    ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
398
#endif
399
0
#endif
400
0
    if (mpn_zero_p (tp + l + 1, h - l))
401
0
      {
402
0
        TRACE(printf("sqr(,,%u)\n", (unsigned) l));
403
0
        mpn_sqr (scratch, sp, l);
404
0
        c = mpn_cmp (tp + 1, scratch + l, l);
405
0
        if (c == 0)
406
0
    {
407
0
      if (nsh != 0)
408
0
        {
409
0
          mpn_lshift (tp, np, l, 2 * nsh);
410
0
          np = tp;
411
0
        }
412
0
      c = mpn_cmp (np, scratch + odd, l - odd);
413
0
    }
414
0
        if (c < 0)
415
0
    {
416
0
      MPN_DECR_U (sp, l, 1);
417
0
      c = 1;
418
0
    }
419
0
      }
420
0
  }
421
0
    }
422
0
  TMP_FREE;
423
424
0
  if ((odd | nsh) != 0)
425
0
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
426
0
  return c;
427
0
}
428
429
430
mp_size_t
431
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
432
0
{
433
0
  mp_limb_t cc, high, rl;
434
0
  int c;
435
0
  mp_size_t rn, tn;
436
0
  TMP_DECL;
437
438
0
  ASSERT (nn > 0);
439
0
  ASSERT_MPN (np, nn);
440
441
0
  ASSERT (np[nn - 1] != 0);
442
0
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
443
0
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
444
0
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
445
446
0
  high = np[nn - 1];
447
0
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
448
0
    c = 0;
449
0
  else
450
0
    {
451
0
      count_leading_zeros (c, high);
452
0
      c -= GMP_NAIL_BITS;
453
454
0
      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
455
0
    }
456
0
  if (nn == 1) {
457
0
    if (c == 0)
458
0
      {
459
0
  sp[0] = mpn_sqrtrem1 (&rl, high);
460
0
  if (rp != NULL)
461
0
    rp[0] = rl;
462
0
      }
463
0
    else
464
0
      {
465
0
  cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
466
0
  sp[0] = cc;
467
0
  if (rp != NULL)
468
0
    rp[0] = rl = high - cc*cc;
469
0
      }
470
0
    return rl != 0;
471
0
  }
472
0
  if (nn == 2) {
473
0
    mp_limb_t tp [2];
474
0
    if (rp == NULL) rp = tp;
475
0
    if (c == 0)
476
0
      {
477
#if SQRTREM2_INPLACE
478
  rp[1] = high;
479
  rp[0] = np[0];
480
  cc = CALL_SQRTREM2_INPLACE (sp, rp);
481
#else
482
0
  cc = mpn_sqrtrem2 (sp, rp, np);
483
0
#endif
484
0
  rp[1] = cc;
485
0
  return ((rp[0] | cc) != 0) + cc;
486
0
      }
487
0
    else
488
0
      {
489
0
  rl = np[0];
490
0
  rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
491
0
  rp[0] = rl << (2*c);
492
0
  CALL_SQRTREM2_INPLACE (sp, rp);
493
0
  cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */
494
0
  rp[0] = rl -= cc*cc;  /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
495
0
  return rl != 0;
496
0
      }
497
0
  }
498
0
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
499
500
0
  if ((rp == NULL) && (nn > 8))
501
0
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
502
0
  TMP_MARK;
503
0
  if (((nn & 1) | c) != 0)
504
0
    {
505
0
      mp_limb_t s0[1], mask;
506
0
      mp_ptr tp, scratch;
507
0
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
508
0
      tp[0] = 0;       /* needed only when 2*tn > nn, but saves a test */
509
0
      if (c != 0)
510
0
  mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
511
0
      else
512
0
  MPN_COPY (tp + (nn & 1), np, nn);
513
0
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;   /* c now represents k */
514
0
      mask = (CNST_LIMB (1) << c) - 1;
515
0
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
516
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
517
   thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
518
0
      s0[0] = sp[0] & mask; /* S mod 2^k */
519
0
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
520
0
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
521
0
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
522
0
      mpn_rshift (sp, sp, tn, c);
523
0
      tp[tn] = rl;
524
0
      if (rp == NULL)
525
0
  rp = tp;
526
0
      c = c << 1;
527
0
      if (c < GMP_NUMB_BITS)
528
0
  tn++;
529
0
      else
530
0
  {
531
0
    tp++;
532
0
    c -= GMP_NUMB_BITS;
533
0
  }
534
0
      if (c != 0)
535
0
  mpn_rshift (rp, tp, tn, c);
536
0
      else
537
0
  MPN_COPY_INCR (rp, tp, tn);
538
0
      rn = tn;
539
0
    }
540
0
  else
541
0
    {
542
0
      if (rp != np)
543
0
  {
544
0
    if (rp == NULL) /* nn <= 8 */
545
0
      rp = TMP_SALLOC_LIMBS (nn);
546
0
    MPN_COPY (rp, np, nn);
547
0
  }
548
0
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
549
0
    }
550
551
0
  MPN_NORMALIZE (rp, rn);
552
553
0
  TMP_FREE;
554
0
  return rn;
555
0
}