Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpn/sqrtrem.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_sqrtrem -- square root and remainder
2
3
   Contributed to the GNU project by Paul Zimmermann (most code),
4
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
5
6
   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
7
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
8
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
9
   FUTURE GMP RELEASE.
10
11
Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
12
Foundation, Inc.
13
14
This file is part of the GNU MP Library.
15
16
The GNU MP Library is free software; you can redistribute it and/or modify
17
it under the terms of either:
18
19
  * the GNU Lesser General Public License as published by the Free
20
    Software Foundation; either version 3 of the License, or (at your
21
    option) any later version.
22
23
or
24
25
  * the GNU General Public License as published by the Free Software
26
    Foundation; either version 2 of the License, or (at your option) any
27
    later version.
28
29
or both in parallel, as here.
30
31
The GNU MP Library is distributed in the hope that it will be useful, but
32
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
33
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
34
for more details.
35
36
You should have received copies of the GNU General Public License and the
37
GNU Lesser General Public License along with the GNU MP Library.  If not,
38
see https://www.gnu.org/licenses/.  */
39
40
41
/* See "Karatsuba Square Root", reference in gmp.texi.  */
42
43
44
#include <stdio.h>
45
#include <stdlib.h>
46
47
#include "gmp-impl.h"
48
#include "longlong.h"
49
316
#define USE_DIVAPPR_Q 1
50
#define TRACE(x)
51
52
/* 8-bit initial approximations of 1/sqrt(a), indexed by the high bits of
   the (normalized) operand; entry i approximates 1/sqrt((i+0x80)/0x100).
   The common leading 0x100 of every entry was removed and is re-added at
   lookup time in mpn_sqrtrem1.  */
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
103
104
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

/* MAGIC is the rounding fudge subtracted in mpn_sqrtrem1's last Newton
   step; any value inside the stated interval keeps the final estimate
   within one of the true root (fixed up by the trailing correction).  */
#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)	/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
111
112
/* Single-limb square root: return s = floor(sqrt(a0)) and store the
   remainder a0 - s^2 in *rp.  Requires a0 >= B/4 (normalized operand,
   see the ASSERT below) and a nail-free 32- or 64-bit limb.

   The 1/sqrt(a0) estimate from invsqrttab is refined by fixed-point
   Newton iterations; the last step converts to sqrt(a0) directly so no
   division is ever needed.  The exact shift counts and the MAGIC bias
   are what keep every intermediate within one limb -- do not reorder.  */
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  /* One extra Newton iteration is needed to get from 8 to 16 bits of
     accuracy before the final conversion step below.  */
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  /* The approximation may be one too small; the test below detects this
     without overflow (note the "<= a0 - 1" rather than "< a0").  */
  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
166
167
168
2.31k
/* Half-limb precision used by mpn_sqrtrem2's schoolbook step.  */
#define Prec (GMP_NUMB_BITS >> 1)
/* Build option: when set, mpn_sqrtrem2 reads its operand in place from
   the remainder area instead of taking a separate source pointer.  */
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif
172
173
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;		/* operand aliases the remainder area */
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  /* Take the root of the high limb, then compute the low half of the
     root with one half-limb division step (schoolbook square root).  */
  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      /* Root was one too large: add back 2*sp0 - 1 (in two carry-tracked
	 additions) while decrementing the root.  */
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
223
224
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.

   This is the divide-and-conquer ("Karatsuba") square root: recurse on
   the high half, divide the middle part by twice the high root, then
   correct with one squaring of the low root.  When `approx' is non-zero
   it is a bit mask letting the caller bail out early as soon as the
   remainder is known to be non-zero.  */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  /* Root of the high 2h limbs, remainder left in np[2l..].  */
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  /* Divide the remainder (extended by the next l limbs) by the high
     root; the quotient halved gives the low part of the root.  */
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;		/* bit shifted out of the quotient below */
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  /* Subtract the square of the low root from the remainder.  */
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      /* Remainder went negative: decrement the root by one and add back
	 2*root + 1 (done as 2*root then subtract 1).  */
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}
278
279
#if USE_DIVAPPR_Q
/* Approximate quotient helper for mpn_dc_sqrt: store in {qp, nn-dn+1}
   an approximation of {np, nn} / {dp, dn}, dispatching on dn to the
   schoolbook, divide-and-conquer or Mulders variant.  The divisor must
   be normalized (high bit set) and dn > 2, nn >= dn.  */
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t dinv;
  mp_limb_t qhigh;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  /* The pi1 routines clobber their dividend, so work on a copy.  */
  MPN_COPY (scratch, np, nn);
  invert_pi1 (dinv, dp[dn-1], dp[dn-2]);

  if (! BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qhigh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  else if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qhigh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, dinv.inv32);
  else
    qhigh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &dinv);

  qp [nn - dn] = qhigh;
}
#endif
307
308
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough?

   Root-only variant used when the caller does not want the remainder:
   it works on the high part of the operand only and uses an approximate
   division, re-checking against the low limbs only in the rare case the
   approximation is too close to call.  */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  /* Normalize the high part of the operand into tp (shifted by 2*nsh).  */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  /* High half of the root, exactly.  */
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
  /* Low half of the root via (approximate) division by the high half.  */
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* Approximation is not good enough, the extra limb(+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed? We
	     did not find an example, but it depends on divappr
	     internals, and we can not assume it true in general...*/
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  /* Exactness can only hold if the high part of the remainder
	     vanished; check the low part against the original operand.  */
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  /* Root was one too large.  */
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  /* Undo the normalization shift (plus half a limb in the odd case).  */
  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}
428
429
430
/* Public entry point: store in {sp, ceil(nn/2)} the truncated square
   root of {np, nn}; if rp != NULL also store the remainder there, and
   return its limb count (0 when {np, nn} is a perfect square).  When
   rp == NULL the return value is merely non-zero iff the remainder is
   non-zero.  Operands are normalized internally by an even left shift
   of 2*c bits, then the root is shifted back by c.  */
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  /* One-limb operand: handled directly by mpn_sqrtrem1.  */
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  /* Two-limb operand: handled by mpn_sqrtrem2.  */
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  /* Root-only fast path for large operands.  */
  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      /* Operand needs normalization (odd length and/or leading zeros):
	 shift into a temporary, take the root there, then unshift and
	 adjust the remainder for the low k bits of the shifted root.  */
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      /* Already normalized: compute root and remainder in place.  */
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}