Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpn/toom_interpolate_16pts.c

Per-line execution counts appear to the left of each source line.  Lines
without a count were not instrumented (comments, blank lines, and code
excluded by the preprocessor); a count of 0 marks instrumented lines that
never executed.

 Count | Source
-------+-----------------------------------------------------------------------
       | /* Interpolation for the algorithm Toom-Cook 8.5-way.
       |
       |    Contributed to the GNU project by Marco Bodrato.
       |
       |    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       |    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       |    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       |
       | Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
       |
       | This file is part of the GNU MP Library.
       |
       | The GNU MP Library is free software; you can redistribute it and/or modify
       | it under the terms of either:
       |
       |   * the GNU Lesser General Public License as published by the Free
       |     Software Foundation; either version 3 of the License, or (at your
       |     option) any later version.
       |
       | or
       |
       |   * the GNU General Public License as published by the Free Software
       |     Foundation; either version 2 of the License, or (at your option) any
       |     later version.
       |
       | or both in parallel, as here.
       |
       | The GNU MP Library is distributed in the hope that it will be useful, but
       | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
       | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       | for more details.
       |
       | You should have received copies of the GNU General Public License and the
       | GNU Lesser General Public License along with the GNU MP Library.  If not,
       | see https://www.gnu.org/licenses/.  */
       |
       |
       | #include "gmp-impl.h"
       |
       |
       | #if GMP_NUMB_BITS < 29
       | #error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
       | #endif
       |
       | #if GMP_NUMB_BITS < 28
       | #error Not implemented: divexact_by188513325 and _by182712915 will not work.
       | #endif
       |
       |
       | /* FIXME: tuneup should decide the best variant */
       | #ifndef AORSMUL_FASTER_AORS_AORSLSH
       | #define AORSMUL_FASTER_AORS_AORSLSH 1
       | #endif
       | #ifndef AORSMUL_FASTER_AORS_2AORSLSH
       | #define AORSMUL_FASTER_AORS_2AORSLSH 1
       | #endif
       | #ifndef AORSMUL_FASTER_2AORSLSH
       | #define AORSMUL_FASTER_2AORSLSH 1
       | #endif
       | #ifndef AORSMUL_FASTER_3AORSLSH
       | #define AORSMUL_FASTER_3AORSLSH 1
       | #endif
       |
       |
       | #if HAVE_NATIVE_mpn_sublsh_n
       | #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
       | #else
       | static mp_limb_t
       | DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
  244k | {
       | #if USE_MUL_1 && 0
       |   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
       | #else
  244k |   mp_limb_t __cy;
  244k |   __cy = mpn_lshift(ws,src,n,s);
  244k |   return    __cy + mpn_sub_n(dst,dst,ws,n);
  244k | #endif
  244k | }
       | #endif
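
The generic fallback above computes {dst,n} -= ({src,n} << s) and returns the
bits shifted out of the top limb plus the subtraction borrow.  The following
standalone model shows the same limb-level bookkeeping.  It is only a sketch:
it assumes 64-bit limbs, no nail bits, and 0 < s < 64, and the name
sublsh_n_model is invented here rather than taken from GMP.

    #include <stdint.h>

    /* Sketch: dst[0..n-1] -= (src[0..n-1] << s), for 0 < s < 64.
       Returns the bits shifted out of the top limb plus the final
       borrow, the same quantity the GMP fallback returns. */
    static uint64_t
    sublsh_n_model (uint64_t *dst, const uint64_t *src, long n, unsigned s)
    {
      uint64_t shift_in = 0, borrow = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t w = (src[i] << s) | shift_in;  /* limb i of src << s */
          shift_in = src[i] >> (64 - s);
          uint64_t d = dst[i] - w;
          uint64_t b = dst[i] < w;                /* borrow out of limb i */
          dst[i] = d - borrow;
          borrow = b + (d < borrow);
        }
      return shift_in + borrow;
    }
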
       |
       | #if HAVE_NATIVE_mpn_addlsh_n
       | #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
       | #else
       | #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
       | static mp_limb_t
       | DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
       | {
       | #if USE_MUL_1 && 0
       |   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
       | #else
       |   mp_limb_t __cy;
       |   __cy = mpn_lshift(ws,src,n,s);
       |   return    __cy + mpn_add_n(dst,dst,ws,n);
       | #endif
       | }
       | #endif
       | #endif
       |
       | #if HAVE_NATIVE_mpn_subrsh
       | #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
       | #else
       | /* FIXME: This is not a correct definition, it assumes no carry */
 81.6k | #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)       \
 81.6k | do {                 \
 81.6k |   mp_limb_t __cy;             \
 81.6k |   MPN_DECR_U (dst, nd, src[0] >> s);          \
 81.6k |   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);  \
 81.6k |   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);       \
 81.6k | } while (0)
       | #endif
       |
       |
       | #if GMP_NUMB_BITS < 43
       | #define BIT_CORRECTION 1
       | #define CORRECTION_BITS GMP_NUMB_BITS
       | #else
 27.2k | #define BIT_CORRECTION 0
 27.2k | #define CORRECTION_BITS 0
       | #endif
       |
       | #define BINVERT_9 \
 27.0k |   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
       |
       | #define BINVERT_255 \
 27.0k |   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
       |
       |   /* FIXME: find some more general expressions for inverses */
       | #if GMP_LIMB_BITS == 32
       | #define BINVERT_2835  (GMP_NUMB_MASK &    CNST_LIMB(0x53E3771B))
       | #define BINVERT_42525 (GMP_NUMB_MASK &    CNST_LIMB(0x9F314C35))
       | #define BINVERT_182712915 (GMP_NUMB_MASK &  CNST_LIMB(0x550659DB))
       | #define BINVERT_188513325 (GMP_NUMB_MASK &  CNST_LIMB(0xFBC333A5))
       | #define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25))
       | #define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B))
       | #if GMP_NAIL_BITS == 0
       | #define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
       | #define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
       | #else /* GMP_NAIL_BITS != 0 */
       | #define BINVERT_255x182712915H \
       |   (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
       | #define BINVERT_255x188513325H \
       |   (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
       | #endif
       | #else
       | #if GMP_LIMB_BITS == 64
 27.0k | #define BINVERT_2835  (GMP_NUMB_MASK &  CNST_LIMB(0x938CC70553E3771B))
 27.0k | #define BINVERT_42525 (GMP_NUMB_MASK &  CNST_LIMB(0xE7B40D449F314C35))
 27.0k | #define BINVERT_255x182712915  (GMP_NUMB_MASK &  CNST_LIMB(0x1B649A076FC4CB25))
 27.0k | #define BINVERT_255x188513325  (GMP_NUMB_MASK &  CNST_LIMB(0x06DB993A6864275B))
       | #endif
       | #endif
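
Each BINVERT_* constant is the inverse of the corresponding odd divisor modulo
2^GMP_NUMB_BITS; multiplying by it replaces an exact division.  Such constants
can be reproduced (or sanity-checked) with Hensel/Newton lifting, where every
iteration doubles the number of correct low bits.  A sketch for 64-bit limbs
with no nails follows; binvert64 is a name invented here, and the two checked
values are the 64-bit constants defined above.

    #include <assert.h>
    #include <stdint.h>

    /* Inverse of odd d modulo 2^64 by Newton/Hensel lifting. */
    static uint64_t
    binvert64 (uint64_t d)
    {
      uint64_t x = d;              /* correct to 3 bits: d*d == 1 mod 8 for odd d */
      for (int i = 0; i < 5; i++)  /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        x *= 2 - d * x;
      return x;
    }

    int
    main (void)
    {
      assert (binvert64 (2835)  == 0x938CC70553E3771Bull);  /* BINVERT_2835 */
      assert (binvert64 (42525) == 0xE7B40D449F314C35ull);  /* BINVERT_42525 */
      assert (binvert64 (2835) * 2835 == 1);  /* defining property, mod 2^64 */
      return 0;
    }
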
       |
       | #ifndef mpn_divexact_by255
       | #if GMP_NUMB_BITS % 8 == 0
       | #define mpn_divexact_by255(dst,src,size) \
       |   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
       | #else
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
       | #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
       | #else
       | #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
       | #endif
       | #endif
       | #endif
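
mpn_bdiv_dbm1 exploits 255 = 2^8 - 1 directly, while the mpn_pi1_bdiv_q_1 and
mpn_divexact_1 forms use the general trick behind all the helpers below: when
the division is known to be exact, the quotient can be generated low limb
first with one multiplication by the precomputed inverse per limb.  A
standalone model of that loop (a sketch assuming 64-bit limbs and a compiler
with unsigned __int128; divexact_1_model is a hypothetical name, not the GMP
routine):

    #include <stdint.h>

    /* Exact division of {src,n} by odd d, low limb first.
       dinv must be d^-1 mod 2^64 (e.g. from binvert64 above).
       Only valid when d divides the input exactly. */
    static void
    divexact_1_model (uint64_t *dst, const uint64_t *src, long n,
                      uint64_t d, uint64_t dinv)
    {
      uint64_t borrow = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t s = src[i] - borrow;
          uint64_t b = src[i] < borrow;       /* borrow out of this limb */
          uint64_t q = s * dinv;              /* quotient limb, mod 2^64 */
          dst[i] = q;
          /* The high half of q*d is what this quotient limb consumes
             from the next input limb. */
          borrow = (uint64_t) (((unsigned __int128) q * d) >> 64) + b;
        }
    }

The x4/x16/x64 variants below pass a final shift argument (2, 4, 6) to
mpn_pi1_bdiv_q_1 so that, as the macro arguments suggest, the power-of-two
factor of divisors like 255*4 or 2835*64 is removed by a right shift folded
into the same pass.
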
       |
       | #ifndef mpn_divexact_by255x4
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 27.0k | #define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
       | #else
       | #define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
       | #endif
       | #endif
       |
       | #ifndef mpn_divexact_by9x16
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 27.0k | #define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
       | #else
       | #define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
       | #endif
       | #endif
       |
       | #ifndef mpn_divexact_by42525x16
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
 27.0k | #define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
       | #else
       | #define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
       | #endif
       | #endif
       |
       | #ifndef mpn_divexact_by2835x64
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
 27.0k | #define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
       | #else
       | #define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
       | #endif
       | #endif
       |
       | #ifndef  mpn_divexact_by255x182712915
       | #if GMP_NUMB_BITS < 36
       | #if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
       | /* FIXME: use mpn_bdiv_q_2_pi2 */
       | #endif
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
       | #define mpn_divexact_by255x182712915(dst,src,size)        \
       |   do {                    \
       |     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);  \
       |     mpn_divexact_by255(dst,dst,size);           \
       |   } while(0)
       | #else
       | #define mpn_divexact_by255x182712915(dst,src,size)  \
       |   do {              \
       |     mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));  \
       |     mpn_divexact_by255(dst,dst,size);     \
       |   } while(0)
       | #endif
       | #else /* GMP_NUMB_BITS > 35 */
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
       | #define mpn_divexact_by255x182712915(dst,src,size) \
 27.0k |   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
       | #else
       | #define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
       | #endif
       | #endif /* GMP_NUMB_BITS >?< 36 */
       | #endif
       |
       | #ifndef  mpn_divexact_by255x188513325
       | #if GMP_NUMB_BITS < 36
       | #if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
       | /* FIXME: use mpn_bdiv_q_1_pi2 */
       | #endif
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
       | #define mpn_divexact_by255x188513325(dst,src,size)      \
       |   do {                  \
       |     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);  \
       |     mpn_divexact_by255(dst,dst,size);         \
       |   } while(0)
       | #else
       | #define mpn_divexact_by255x188513325(dst,src,size)  \
       |   do {              \
       |     mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));  \
       |     mpn_divexact_by255(dst,dst,size);     \
       |   } while(0)
       | #endif
       | #else /* GMP_NUMB_BITS > 35 */
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
       | #define mpn_divexact_by255x188513325(dst,src,size) \
 27.0k |   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
       | #else
       | #define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
       | #endif
       | #endif /* GMP_NUMB_BITS >?< 36 */
       | #endif
       |
       | /* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
       |    points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
       |    +-1/8, 0. More precisely, we want to compute
       |    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
       |    14), given the 16 (resp. 15) values:
       |
       |      r0 = limit at infinity of f(x) / x^15,
       |      r1 = f(8),f(-8),
       |      r2 = f(4),f(-4),
       |      r3 = f(2),f(-2),
       |      r4 = f(1),f(-1),
       |      r5 = f(1/4),f(-1/4),
       |      r6 = f(1/2),f(-1/2),
       |      r7 = f(1/8),f(-1/8),
       |      r8 = f(0).
       |
       |    All pairs of the form f(n),f(-n) must already have been mixed with
       |    toom_couple_handling(f(n),...,f(-n),...)
       |
       |    The result is stored in {pp, spt + 7*n (or 8*n)}.
       |    At entry, r8 is stored at {pp, 2n},
       |    r6 is stored at {pp + 3n, 3n + 1},
       |    r4 is stored at {pp + 7n, 3n + 1},
       |    r2 is stored at {pp +11n, 3n + 1},
       |    r0 is stored at {pp +15n, spt}.
       |
       |    The other values are 3n+1 limbs each (with most significant limbs small).
       |
       |    Negative intermediate results are stored in two's complement.
       |    Inputs are destroyed.
       | */
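
As a reading aid (not part of the source): writing f(x) = a_0 + a_1 x + ... +
a_15 x^15, each mixed pair of evaluations at x and -x gives access to the even
and odd coefficient halves separately, since

    \frac{f(x) + f(-x)}{2} = \sum_{i\ \mathrm{even}} a_i x^i,
    \qquad
    \frac{f(x) - f(-x)}{2} = \sum_{i\ \mathrm{odd}} a_i x^i,

and once all coefficients a_i are recovered, the recomposition phase evaluates
f at the radix point:

    f\bigl(2^{\mathrm{GMP\_NUMB\_BITS}\cdot n}\bigr)
      = \sum_{i=0}^{15} a_i \, 2^{\mathrm{GMP\_NUMB\_BITS}\cdot n\, i},

which is why each recovered coefficient is simply added into pp at an offset
of i*n limbs.
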
       |
       | void
       | mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
       |       mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
 27.0k | {
 27.0k |   mp_limb_t cy;
 27.0k |   mp_size_t n3;
 27.0k |   mp_size_t n3p1;
 27.0k |   n3 = 3 * n;
 27.0k |   n3p1 = n3 + 1;
       |
  325k | #define   r6    (pp + n3)      /* 3n+1 */
  108k | #define   r4    (pp + 7 * n)      /* 3n+1 */
 81.2k | #define   r2    (pp +11 * n)      /* 3n+1 */
 27.0k | #define   r0    (pp +15 * n)      /* s+t <= 2*n */
       |
 27.0k |   ASSERT( spt <= 2 * n );
       |   /******************************* interpolation *****************************/
 27.0k |   if( half != 0) {
   192 |     cy = mpn_sub_n (r4, r4, r0, spt);
   192 |     MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
       |
   192 |     cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
   192 |     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
   192 |     DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
       |
   192 |     cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
   192 |     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
   192 |     DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
       |
   192 |     cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
       | #if BIT_CORRECTION
       |     cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
       |         n3p1 - spt - BIT_CORRECTION, cy);
       |     ASSERT (BIT_CORRECTION > 0 || cy == 0);
       |     /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
       |     cy = r7[n3p1];
       |     r7[n3p1] = 0x80;
       | #else
   192 |     MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
   192 | #endif
   192 |     DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
       | #if BIT_CORRECTION
       |     /* FIXME: assumes r7[n3p1] is writable. */
       |     ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
       |     r7[n3p1] = cy;
       | #endif
 27.0k |   };
       |
 27.0k |   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
 27.0k |   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
       |
       | #if HAVE_NATIVE_mpn_add_n_sub_n
       |   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
       | #else
 27.0k |   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
 27.0k |   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
 27.0k |   MP_PTR_SWAP(r5, wsi);
 27.0k | #endif
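
Here (and twice more below) the mixed values are split into even and odd parts
by forming a sum and a difference of two (3n+1)-limb operands.  Where
mpn_add_n_sub_n is native it does both in one pass; the fallback needs the
scratch wsi plus a pointer swap because the sum and difference overwrite their
own inputs.  A standalone model of the combined operation (a sketch: 64-bit
limbs, invented name; the result is packed here as 2*carry + borrow, but check
gmp-impl.h for the native routine's exact return convention):

    #include <stdint.h>

    /* Sketch: sp[] = ap[] + bp[] and dp[] = ap[] - bp[] in one pass.
       Each input limb is read before either output limb is written,
       so full overlap (as in the calls here) is fine in this model. */
    static uint64_t
    add_n_sub_n_model (uint64_t *sp, uint64_t *dp,
                       const uint64_t *ap, const uint64_t *bp, long n)
    {
      uint64_t cy = 0, bw = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t ai = ap[i], bi = bp[i];
          uint64_t s = ai + bi;
          uint64_t c1 = s < ai;
          sp[i] = s + cy;
          cy = c1 + (sp[i] < s);       /* carry out of the addition */
          uint64_t d = ai - bi;
          uint64_t b1 = ai < bi;
          dp[i] = d - bw;
          bw = b1 + (d < bw);          /* borrow out of the subtraction */
        }
      return 2 * cy + bw;
    }
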
       |
 27.0k |   r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
 27.0k |   DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
       |
       | #if HAVE_NATIVE_mpn_add_n_sub_n
       |   mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
       | #else
 27.0k |   ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
 27.0k |   mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
 27.0k |   MP_PTR_SWAP(r3, wsi);
 27.0k | #endif
       |
 27.0k |   cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
       | #if BIT_CORRECTION
       |   MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
       |   cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
       |   cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
       |   ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
       | #else
 27.0k |   r7[n3] -= cy;
 27.0k |   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
 27.0k | #endif
       |
       | #if HAVE_NATIVE_mpn_add_n_sub_n
       |   mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
       | #else
 27.0k |   mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
 27.0k |   mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
 27.0k |   MP_PTR_SWAP(r7, wsi);
 27.0k | #endif
       |
 27.0k |   r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
       |
 27.0k | #if AORSMUL_FASTER_2AORSLSH
 27.0k |   mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
       | #else
       |   DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
       |   DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
       | #endif
       |
 27.0k |   mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
 27.0k | #if AORSMUL_FASTER_3AORSLSH
 27.0k |   mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
       | #else
       |   DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
       |   DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
       |   DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
       | #endif
 27.0k |   mpn_divexact_by255x188513325(r7, r7, n3p1);
       |
 27.0k |   mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
       |   /* A division by 2835x64 follows. Warning: the operand can be negative! */
 27.0k |   mpn_divexact_by2835x64(r5, r5, n3p1);
 27.0k |   if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
 26.1k |     r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
       |
 27.0k | #if AORSMUL_FASTER_AORS_AORSLSH
 27.0k |   mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
       | #else
       |   mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
       |   DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
       | #endif
 27.0k | #if AORSMUL_FASTER_2AORSLSH
 27.0k |   mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
       | #else
       |   DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
       |   DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
       | #endif
       |   /* A division by 255x4 follows. Warning: the operand can be negative! */
 27.0k |   mpn_divexact_by255x4(r6, r6, n3p1);
 27.0k |   if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
 1.21k |     r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
       |
 27.0k |   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
       |
 27.0k |   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
       |
       |   /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry! */
 27.0k |   DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
 27.0k |   mpn_submul_1 (r1, r2, n3p1, 1428);
 27.0k |   mpn_submul_1 (r1, r3, n3p1, 112896);
 27.0k |   mpn_divexact_by255x182712915(r1, r1, n3p1);
       |
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
 27.0k |   mpn_divexact_by42525x16(r2, r2, n3p1);
       |
 27.0k | #if AORSMUL_FASTER_AORS_2AORSLSH
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
       | #else
       |   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
       |   ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
       |   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
 27.0k |   mpn_divexact_by9x16(r3, r3, n3p1);
       |
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
       |
 27.0k | #ifdef HAVE_NATIVE_mpn_rsh1add_n
 27.0k |   mpn_rsh1add_n (r6, r2, r6, n3p1);
 27.0k |   r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
       | #else
       |   mpn_add_n (r6, r2, r6, n3p1);
       |   ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
       |
 27.0k | #ifdef HAVE_NATIVE_mpn_rsh1sub_n
 27.0k |   mpn_rsh1sub_n (r5, r3, r5, n3p1);
 27.0k |   r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
       | #else
       |   mpn_sub_n (r5, r3, r5, n3p1);
       |   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
       |
 27.0k | #ifdef HAVE_NATIVE_mpn_rsh1add_n
 27.0k |   mpn_rsh1add_n (r7, r1, r7, n3p1);
 27.0k |   r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
       | #else
       |   mpn_add_n (r7, r1, r7, n3p1);
       |   ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
       |
       |   /* last interpolation steps... */
       |   /* ... could be mixed with recomposition
       |   ||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
       |   */
       |
       |   /***************************** recomposition *******************************/
       |   /*
       |     pp[] prior to operations:
       |     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
       |
       |     summation scheme for remaining operations:
       |     |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
       |     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
       |   ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
       |   */
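
The recomposition that follows adds each of r7, r5, r3, r1 (3n+1 limbs each)
into the result at offsets n, 5n, 9n and 13n limbs respectively, letting each
carry ripple into the limbs above.  One step of the scheme can be modeled as
below (a sketch: 64-bit limbs, invented name, ignoring the half/spt
special-casing of the final block):

    #include <stdint.h>

    /* Sketch: pp[off .. off+rn-1] += r[0 .. rn-1], then let the final
       carry ripple upward through pp, as the mpn_add_n / mpn_add_1 /
       MPN_INCR_U sequences below do piecewise. */
    static void
    recompose_step_model (uint64_t *pp, long pp_len,
                          const uint64_t *r, long rn, long off)
    {
      uint64_t cy = 0;
      for (long i = 0; i < rn; i++)
        {
          uint64_t s = pp[off + i] + r[i];
          uint64_t c1 = s < r[i];
          pp[off + i] = s + cy;
          cy = c1 + (pp[off + i] < s);
        }
      for (long i = off + rn; cy != 0 && i < pp_len; i++)  /* MPN_INCR_U part */
        cy = (++pp[i] == 0);
    }
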
       |
 27.0k |   cy = mpn_add_n (pp + n, pp + n, r7, n);
 27.0k |   cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
 27.0k | #if HAVE_NATIVE_mpn_add_nc
 27.0k |   cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
       | #else
       |   MPN_INCR_U (r7 + 2 * n, n + 1, cy);
       |   cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
       | #endif
 27.0k |   MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
       |
 27.0k |   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
 27.0k |   cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
 27.0k | #if HAVE_NATIVE_mpn_add_nc
 27.0k |   cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
       | #else
       |   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
       |   cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
       | #endif
 27.0k |   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
       |
 27.0k |   pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
 27.0k |   cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
 27.0k | #if HAVE_NATIVE_mpn_add_nc
 27.0k |   cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
       | #else
       |   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
       |   cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
       | #endif
 27.0k |   MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
       |
 27.0k |   pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
 27.0k |   if ( half ) {
   192 |     cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
   192 | #if HAVE_NATIVE_mpn_add_nc
   192 |     if(LIKELY(spt > n)) {
   192 |       cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
   192 |       MPN_INCR_U (pp + 16 * n, spt - n, cy);
   192 |     } else {
     0 |       ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
     0 |     }
       | #else
       |     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
       |     if(LIKELY(spt > n)) {
       |       cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
       |       MPN_INCR_U (pp + 16 * n, spt - n, cy);
       |     } else {
       |       ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
       |     }
       | #endif
 26.8k |   } else {
 26.8k |     ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
 26.8k |   }
       |
 27.0k | #undef   r0
 27.0k | #undef   r2
 27.0k | #undef   r4
 27.0k | #undef   r6
 27.0k | }
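
For context: this interpolation routine is internal and is reached through
GMP's higher Toom multiplication code (the counts above come from whatever
workload produced this report).  From user code the path is exercised only
indirectly, e.g. through mpz_mul once operands are large enough that GMP's
tuned thresholds select the high-degree Toom algorithms.  The sizes in this
sketch are a guess, not a guarantee that this exact path is taken:

    #include <gmp.h>
    #include <stdio.h>

    int
    main (void)
    {
      mpz_t a, b, p;
      mpz_inits (a, b, p, NULL);
      /* Operands of a few hundred thousand bits; whether Toom-8.5 (and thus
         mpn_toom_interpolate_16pts) runs depends on the tuned thresholds. */
      mpz_ui_pow_ui (a, 7, 100000);
      mpz_ui_pow_ui (b, 11, 90000);
      mpz_mul (p, a, b);
      printf ("product has %lu bits\n",
              (unsigned long) mpz_sizeinbase (p, 2));
      mpz_clears (a, b, p, NULL);
      return 0;
    }
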