/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
   FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* See "Karatsuba Square Root", reference in gmp.texi.  */
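
/* For reference, the recurrence from that paper (notation adapted), which
   mpn_dc_sqrtrem below follows directly; b is a power of 2 chosen so that
   a3*b + a2 is the high half of the operand:

     n = a3*b^3 + a2*b^2 + a1*b + a0,  with a3 >= b/4
     (s1, r1) <- SqrtRem (a3*b + a2)       [recursive call]
     (q, u)   <- DivRem (r1*b + a1, 2*s1)  [one division]
     s <- s1*b + q
     r <- u*b + a0 - q^2
     if r < 0 then { r <- r + 2*s - 1; s <- s - 1 }
     return (s, r)
*/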


#include <stdio.h>
#include <stdlib.h>

#include "gmp-impl.h"
#include "longlong.h"
#define USE_DIVAPPR_Q 1
#define TRACE(x)

static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
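
/* The entries appear to follow round(2^12.5 / sqrt(i + 1/2)) - 0x100 for
   i = 0x80 .. 0x1ff (spot-checked at both ends and at several interior
   points; the exact rounding rule is our inference, not documented here).
   A throwaway generator along those lines:

     #include <math.h>
     #include <stdio.h>
     int
     main (void)
     {
       int i;
       for (i = 0x80; i < 0x200; i++)
         printf ("0x%02x,%s",
                 (int) floor (pow (2.0, 12.5) / sqrt (i + 0.5) + 0.5) - 0x100,
                 (i % 8 == 7) ? "\n" : "");
       return 0;
     }
*/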

/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
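/* For example, with 64-bit limbs and a0 = 2^63, this yields
   s = floor(sqrt(2^63)) = 3037000499 and
   *rp = 2^63 - 3037000499^2 = 5928526807.  */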

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
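/* MAGIC is a slack term in the final Newton step below; any value in the
   stated range appears to guarantee x0 <= floor(sqrt(a0)) <= x0 + 1, which
   is exactly the deficit the correction at the end of mpn_sqrtrem1 can
   absorb.  */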

static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
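  /* The step in question is x' = x * (3 - a*x^2) / 2, which roughly doubles
     the number of correct bits of x ~ 1/sqrt(a) each time; the shifts and
     magic constants below implement it in fixed point, at successively
     higher precisions up to a full limb.  */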

  abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */

  /* x0 is now an 8-bit approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16-bit approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full-limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}


#define Prec (GMP_NUMB_BITS >> 1)
#if ! defined(SQRTREM2_INPLACE)
#define SQRTREM2_INPLACE 0
#endif

/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
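/* This is the n = 2 base case of the divide-and-conquer scheme: the root
   of the high limb comes from mpn_sqrtrem1, a single division by that root
   yields the low Prec bits of the result, and the branch on cc applies the
   final correction step of the recurrence at the top of the file.  */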
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate.  */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder; returns the high limb
   of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B = 2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.  */
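/* The structure follows the recurrence sketched at the top of the file:
   the recursive call (or mpn_sqrtrem2) yields the high half s1 of the
   root, mpn_tdiv_qr divides the middle of the operand by s1 (the required
   division by 2*s1 is completed by the one-bit right shift of the
   quotient, with the dropped bit folded back into the remainder), and the
   mpn_sqr/mpn_sub_n pair subtracts q^2.  The 'approx' mask covers low
   root bits the caller will shift away: if any of them is set, the
   remainder cannot be zero even after the pending correction by one, so
   we may return early with {np, n} left inexact.  */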
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}

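/* Dispatch to one of GMP's approximate-division routines according to the
   divisor-size thresholds.  The returned quotient may exceed the true one
   (the caller compensates for this below), but never falls short of it.  */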
#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small.  */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif

/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd};
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B = 2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account;
   when nsh is maximal, two limbs are discarded from the result of the
   division.  Too much?  Is a single dummy limb enough?  */
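/* Strategy, as we read it: this root-only variant never forms the full
   remainder.  It runs the recurrence on roughly the top half of the
   operand, using an approximate division for the low part of the root;
   the remainder is reconstructed below only when the quotient's guard
   bits cannot rule out an error, or when deciding whether the operand is
   a perfect square.  */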
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to set the lowest bits of the dividend exactly; is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area.  */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once.  */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* The approximation is not good enough: the extra limb (+ nsh bits)
	     is smaller than needed to absorb the possible error.  */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1.  */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q.  */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact.  */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed?  We did
	     not find an example, but it depends on divappr internals, and
	     we cannot assume it is true in general...  */
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}


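/* Public entry point: writes floor(sqrt({np, nn})) to {sp, ceil(nn/2)},
   the remainder to {rp, rn} when rp != NULL, and returns the remainder
   size rn (zero iff the operand is a perfect square).  For example, with
   nn = 1 and np[0] = 10, it sets sp[0] = 3, rp[0] = 1, and returns 1.  */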
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;	/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0 = S mod 2^k */
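      /* Hence sqrt(N) = (S - s0) / 2^k and the true remainder is
	 (R + 2*S*s0 - s0^2) / 2^(2k); the addmul_1/submul_1 calls and the
	 right shifts below compute exactly that.  */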
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}