Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpn/sqrtrem.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_sqrtrem -- square root and remainder
2
3
   Contributed to the GNU project by Paul Zimmermann (most code),
4
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
5
6
   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
7
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
8
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
9
   FUTURE GMP RELEASE.
10
11
Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
12
Foundation, Inc.
13
14
This file is part of the GNU MP Library.
15
16
The GNU MP Library is free software; you can redistribute it and/or modify
17
it under the terms of either:
18
19
  * the GNU Lesser General Public License as published by the Free
20
    Software Foundation; either version 3 of the License, or (at your
21
    option) any later version.
22
23
or
24
25
  * the GNU General Public License as published by the Free Software
26
    Foundation; either version 2 of the License, or (at your option) any
27
    later version.
28
29
or both in parallel, as here.
30
31
The GNU MP Library is distributed in the hope that it will be useful, but
32
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
33
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
34
for more details.
35
36
You should have received copies of the GNU General Public License and the
37
GNU Lesser General Public License along with the GNU MP Library.  If not,
38
see https://www.gnu.org/licenses/.  */
39
40
41
/* See "Karatsuba Square Root", reference in gmp.texi.  */
42
43
44
#include <stdio.h>
45
#include <stdlib.h>
46
47
#include "gmp-impl.h"
48
#include "longlong.h"
49
316
#define USE_DIVAPPR_Q 1
50
#define TRACE(x)
51
52
/* 8-bit initial approximations of 1/sqrt(a), indexed by the high bits of
   the (normalized) operand; entry i approximates 1/sqrt((i+0x80)/0x100).
   The common leading 0x100 of every entry was removed and is re-added at
   lookup time in mpn_sqrtrem1.  */
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
103
104
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

/* MAGIC is the rounding fudge subtracted in mpn_sqrtrem1's last Newton
   step; any value inside the stated interval keeps the final estimate
   within one of the true root (fixed up by the trailing correction).  */
#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)	/* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
111
112
/* Single-limb square root: return s = floor(sqrt(a0)) and store the
   remainder a0 - s^2 in *rp.  Requires a0 >= B/4 (normalized operand,
   see the ASSERT below) and a nail-free 32- or 64-bit limb.

   The 1/sqrt(a0) estimate from invsqrttab is refined by fixed-point
   Newton iterations; the last step converts to sqrt(a0) directly so no
   division is ever needed.  The exact shift counts and the MAGIC bias
   are what keep every intermediate within one limb -- do not reorder.  */
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  /* One extra Newton iteration is needed to get from 8 to 16 bits of
     accuracy before the final conversion step below.  */
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  /* The approximation may be one too small; the test below detects this
     without overflow (note the "<= a0 - 1" rather than "< a0").  */
  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
166
167
168
2.31k
/* Half-limb precision used by mpn_sqrtrem2's schoolbook step.  */
#define Prec (GMP_NUMB_BITS >> 1)
/* Build option: when set, mpn_sqrtrem2 reads its operand in place from
   the remainder area instead of taking a separate source pointer.  */
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif
172
173
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;		/* operand aliases the remainder area */
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  /* Take the root of the high limb, then compute the low half of the
     root with one half-limb division step (schoolbook square root).  */
  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      /* Root was one too large: add back 2*sp0 - 1 (in two carry-tracked
	 additions) while decrementing the root.  */
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
223
224
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.

   This is the divide-and-conquer ("Karatsuba") square root: recurse on
   the high half, divide the middle part by twice the high root, then
   correct with one squaring of the low root.  When `approx' is non-zero
   it is a bit mask letting the caller bail out early as soon as the
   remainder is known to be non-zero.  */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  /* Root of the high 2h limbs, remainder left in np[2l..].  */
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  /* Divide the remainder (extended by the next l limbs) by the high
     root; the quotient halved gives the low part of the root.  */
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;		/* bit shifted out of the quotient below */
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  /* Subtract the square of the low root from the remainder.  */
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      /* Remainder went negative: decrement the root by one and add back
	 2*root + 1 (done as 2*root then subtract 1).  */
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}
278
279
#if USE_DIVAPPR_Q
/* Approximate quotient helper for mpn_dc_sqrt: store in {qp, nn-dn+1}
   an approximation of {np, nn} / {dp, dn}, dispatching on dn to the
   schoolbook, divide-and-conquer or Mulders variant.  The divisor must
   be normalized (high bit set) and dn > 2, nn >= dn.  */
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t dinv;
  mp_limb_t qhigh;

  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  /* The pi1 routines clobber their dividend, so work on a copy.  */
  MPN_COPY (scratch, np, nn);
  invert_pi1 (dinv, dp[dn-1], dp[dn-2]);

  if (! BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qhigh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  else if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qhigh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, dinv.inv32);
  else
    qhigh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &dinv);

  qp [nn - dn] = qhigh;
}
#endif
307
308
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough?

   Root-only variant used when the caller does not want the remainder:
   it works on the high part of the operand only and uses an approximate
   division, re-checking against the low limbs only in the rare case the
   approximation is too close to call.  */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  /* Normalize the high part of the operand into tp (shifted by 2*nsh).  */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  /* High half of the root, exactly.  */
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
  /* Low half of the root via (approximate) division by the high half.  */
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* Approximation is not good enough, the extra limb(+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed? We
	     did not find an example, but it depends on divappr
	     internals, and we can not assume it true in general...*/
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  /* Exactness can only hold if the high part of the remainder
	     vanished; check the low part against the original operand.  */
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  /* Root was one too large.  */
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  /* Undo the normalization shift (plus half a limb in the odd case).  */
  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}
428
429
430
/* Public entry point: store in {sp, ceil(nn/2)} the truncated square
   root of {np, nn}; if rp != NULL also store the remainder there, and
   return its limb count (0 when {np, nn} is a perfect square).  When
   rp == NULL the return value is merely non-zero iff the remainder is
   non-zero.  Operands are normalized internally by an even left shift
   of 2*c bits, then the root is shifted back by c.  */
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  /* One-limb operand: handled directly by mpn_sqrtrem1.  */
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  /* Two-limb operand: handled by mpn_sqrtrem2.  */
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  /* Root-only fast path for large operands.  */
  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      /* Operand needs normalization (odd length and/or leading zeros):
	 shift into a temporary, take the root there, then unshift and
	 adjust the remainder for the low k bits of the shifted root.  */
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      /* Already normalized: compute root and remainder in place.  */
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}