/src/gmp-6.2.1/mpn/toom_interpolate_12pts.c

Source (jump to first uncovered line)
/* Interpolation for the algorithm Toom-Cook 6.5-way.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"


#if GMP_NUMB_BITS < 21
#error Not implemented: Both sublsh_n(,,,20) should be corrected.
#endif

#if GMP_NUMB_BITS < 16
#error Not implemented: divexact_by42525 needs splitting.
#endif

#if GMP_NUMB_BITS < 12
#error Not implemented: Hard to adapt...
#endif


/* FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif


#if HAVE_NATIVE_mpn_sublsh_n
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
#else
static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return    __cy + mpn_sub_n(dst,dst,ws,n);
#endif
}
#endif

#if HAVE_NATIVE_mpn_addlsh_n
#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
#else
#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return    __cy + mpn_add_n(dst,dst,ws,n);
#endif
}
#endif
#endif

#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)       \
do {                 \
  mp_limb_t __cy;             \
  MPN_DECR_U (dst, nd, src[0] >> s);          \
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);  \
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);       \
} while (0)
#endif


#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

  /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &    CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &    CNST_LIMB(0x9F314C35))
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &  CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &  CNST_LIMB(0xE7B40D449F314C35))
#endif
#endif

#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by9x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
#else
#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
#endif
#endif

#ifndef mpn_divexact_by42525
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
#else
#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
#endif
#endif

#ifndef mpn_divexact_by2835x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
#else
#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
#endif
#endif

/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
   points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 11 (or 10), given the 12 (rsp. 11) values:

     r0 = limit at infinity of f(x) / x^11,
     r1 = f(4),f(-4),
     r2 = f(2),f(-2),
     r3 = f(1),f(-1),
     r4 = f(1/4),f(-1/4),
     r5 = f(1/2),f(-1/2),
     r6 = f(0).

   All couples of the form f(n),f(-n) must be already mixed with
   toom_couple_handling(f(n),...,f(-n),...)

   The result is stored in {pp, spt + 7*n (or 6*n)}.
   At entry, r6 is stored at {pp, 2n},
   r4 is stored at {pp + 3n, 3n + 1}.
   r2 is stored at {pp + 7n, 3n + 1}.
   r0 is stored at {pp +11n, spt}.

   The other values are 3n+1 limbs each (with most significant limbs small).

   Negative intermediate results are stored two-complemented.
   Inputs are destroyed.
*/

void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
      mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define   r4    (pp + n3)      /* 3n+1 */
#define   r2    (pp + 7 * n)      /* 3n+1 */
#define   r0    (pp +11 * n)      /* s+t <= 2*n */

  /******************************* interpolation *****************************/
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r4, r2, r4, n3p1);
  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r5, r5, r1, n3p1);
  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
#endif

  /* last interpolation steps... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
  ||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
  ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
}

Coverage Report

Created: 2025-03-09 06:52

Line	Count	Source (jump to first uncovered line)
1		/* Interpolation for the algorithm Toom-Cook 6.5-way.
2
3		Contributed to the GNU project by Marco Bodrato.
4
5		THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6		SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7		GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9		Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
10
11		This file is part of the GNU MP Library.
12
13		The GNU MP Library is free software; you can redistribute it and/or modify
14		it under the terms of either:
15
16		* the GNU Lesser General Public License as published by the Free
17		Software Foundation; either version 3 of the License, or (at your
18		option) any later version.
19
20		or
21
22		* the GNU General Public License as published by the Free Software
23		Foundation; either version 2 of the License, or (at your option) any
24		later version.
25
26		or both in parallel, as here.
27
28		The GNU MP Library is distributed in the hope that it will be useful, but
29		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31		for more details.
32
33		You should have received copies of the GNU General Public License and the
34		GNU Lesser General Public License along with the GNU MP Library. If not,
35		see https://www.gnu.org/licenses/. */
36
37
38		#include "gmp-impl.h"
39
40
41		#if GMP_NUMB_BITS < 21
42		#error Not implemented: Both sublsh_n(,,,20) should be corrected.
43		#endif
44
45		#if GMP_NUMB_BITS < 16
46		#error Not implemented: divexact_by42525 needs splitting.
47		#endif
48
49		#if GMP_NUMB_BITS < 12
50		#error Not implemented: Hard to adapt...
51		#endif
52
53
54		/* FIXME: tuneup should decide the best variant */
55		#ifndef AORSMUL_FASTER_AORS_AORSLSH
56		#define AORSMUL_FASTER_AORS_AORSLSH 1
57		#endif
58		#ifndef AORSMUL_FASTER_AORS_2AORSLSH
59		#define AORSMUL_FASTER_AORS_2AORSLSH 1
60		#endif
61		#ifndef AORSMUL_FASTER_2AORSLSH
62		#define AORSMUL_FASTER_2AORSLSH 1
63		#endif
64		#ifndef AORSMUL_FASTER_3AORSLSH
65		#define AORSMUL_FASTER_3AORSLSH 1
66		#endif
67
68
69		#if HAVE_NATIVE_mpn_sublsh_n
70		#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
71		#else
72		static mp_limb_t
73		DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
74	116k	{
75		#if USE_MUL_1 && 0
76		return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
77		#else
78	116k	mp_limb_t __cy;
79	116k	__cy = mpn_lshift(ws,src,n,s);
80	116k	return __cy + mpn_sub_n(dst,dst,ws,n);
81	116k	#endif
82	116k	}
83		#endif
84
85		#if HAVE_NATIVE_mpn_addlsh_n
86		#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
87		#else
88		#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
89		static mp_limb_t
90		DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
91		{
92		#if USE_MUL_1 && 0
93		return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
94		#else
95		mp_limb_t __cy;
96		__cy = mpn_lshift(ws,src,n,s);
97		return __cy + mpn_add_n(dst,dst,ws,n);
98		#endif
99		}
100		#endif
101		#endif
102
103		#if HAVE_NATIVE_mpn_subrsh
104		#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
105		#else
106		/* FIXME: This is not a correct definition, it assumes no carry */
107	38.9k	#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
108	38.9k	do { \
109	38.9k	mp_limb_t __cy; \
110	38.9k	MPN_DECR_U (dst, nd, src[0] >> s); \
111	38.9k	__cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
112	38.9k	MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
113	38.9k	} while (0)
114		#endif
115
116
117		#define BINVERT_9 \
118	19.1k	((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) \| 0x39)
119
120		#define BINVERT_255 \
121		(GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
122
123		/* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
124		#if GMP_LIMB_BITS == 32
125		#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
126		#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
127		#else
128		#if GMP_LIMB_BITS == 64
129	19.1k	#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
130	19.1k	#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
131		#endif
132		#endif
133
134		#ifndef mpn_divexact_by255
135		#if GMP_NUMB_BITS % 8 == 0
136		#define mpn_divexact_by255(dst,src,size) \
137	19.1k	(255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
138		#else
139		#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
140		#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
141		#else
142		#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
143		#endif
144		#endif
145		#endif
146
147		#ifndef mpn_divexact_by9x4
148		#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
149	19.1k	#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
150		#else
151		#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
152		#endif
153		#endif
154
155		#ifndef mpn_divexact_by42525
156		#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
157	19.1k	#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
158		#else
159		#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
160		#endif
161		#endif
162
163		#ifndef mpn_divexact_by2835x4
164		#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
165	19.1k	#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
166		#else
167		#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
168		#endif
169		#endif
170
171		/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
172		points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
173		we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
174		degree 11 (or 10), given the 12 (rsp. 11) values:
175
176		r0 = limit at infinity of f(x) / x^11,
177		r1 = f(4),f(-4),
178		r2 = f(2),f(-2),
179		r3 = f(1),f(-1),
180		r4 = f(1/4),f(-1/4),
181		r5 = f(1/2),f(-1/2),
182		r6 = f(0).
183
184		All couples of the form f(n),f(-n) must be already mixed with
185		toom_couple_handling(f(n),...,f(-n),...)
186
187		The result is stored in {pp, spt + 7n (or 6n)}.
188		At entry, r6 is stored at {pp, 2n},
189		r4 is stored at {pp + 3n, 3n + 1}.
190		r2 is stored at {pp + 7n, 3n + 1}.
191		r0 is stored at {pp +11n, spt}.
192
193		The other values are 3n+1 limbs each (with most significant limbs small).
194
195		Negative intermediate results are stored two-complemented.
196		Inputs are destroyed.
197		*/
198
199		void
200		mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
201		mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
202	19.1k	{
203	19.1k	mp_limb_t cy;
204	19.1k	mp_size_t n3;
205	19.1k	mp_size_t n3p1;
206	19.1k	n3 = 3 * n;
207	19.1k	n3p1 = n3 + 1;
208
209	209k	#define r4 (pp + n3) /* 3n+1 */
210	38.5k	#define r2 (pp + 7 * n) /* 3n+1 */
211	19.1k	#define r0 (pp +11 * n) /* s+t <= 2n /
212
213		/***************************** interpolation ***************************/
214	19.1k	if (half != 0) {
215	392	cy = mpn_sub_n (r3, r3, r0, spt);
216	392	MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
217
218	392	cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
219	392	MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
220	392	DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
221
222	392	cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
223	392	MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
224	392	DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
225	19.1k	};
226
227	19.1k	r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
228	19.1k	DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
229
230		#if HAVE_NATIVE_mpn_add_n_sub_n
231		mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
232		#else
233	19.1k	ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
234	19.1k	mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
235	19.1k	MP_PTR_SWAP(r1, wsi);
236	19.1k	#endif
237
238	19.1k	r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
239	19.1k	DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
240
241		#if HAVE_NATIVE_mpn_add_n_sub_n
242		mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
243		#else
244	19.1k	mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
245	19.1k	ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
246	19.1k	MP_PTR_SWAP(r5, wsi);
247	19.1k	#endif
248
249	19.1k	r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
250
251	19.1k	#if AORSMUL_FASTER_AORS_AORSLSH
252	19.1k	mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
253		#else
254		mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
255		DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
256		#endif
257		/* A division by 2835x4 follows. Warning: the operand can be negative! */
258	19.1k	mpn_divexact_by2835x4(r4, r4, n3p1);
259	19.1k	if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
260	18.8k	r4[n3] \|= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
261
262	19.1k	#if AORSMUL_FASTER_2AORSLSH
263	19.1k	mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
264		#else
265		DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
266		DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
267		#endif
268	19.1k	mpn_divexact_by255(r5, r5, n3p1);
269
270	19.1k	ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
271
272	19.1k	#if AORSMUL_FASTER_3AORSLSH
273	19.1k	ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
274		#else
275		ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
276		ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
277		ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
278		#endif
279	19.1k	ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
280	19.1k	mpn_divexact_by42525(r1, r1, n3p1);
281
282	19.1k	#if AORSMUL_FASTER_AORS_2AORSLSH
283	19.1k	ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
284		#else
285		ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
286		ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
287		ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
288		#endif
289	19.1k	mpn_divexact_by9x4(r2, r2, n3p1);
290
291	19.1k	ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
292
293	19.1k	#ifdef HAVE_NATIVE_mpn_rsh1sub_n
294	19.1k	mpn_rsh1sub_n (r4, r2, r4, n3p1);
295	19.1k	r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
296		#else
297		mpn_sub_n (r4, r2, r4, n3p1);
298		ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
299		#endif
300	19.1k	ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
301
302	19.1k	#ifdef HAVE_NATIVE_mpn_rsh1add_n
303	19.1k	mpn_rsh1add_n (r5, r5, r1, n3p1);
304	19.1k	r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
305		#else
306		mpn_add_n (r5, r5, r1, n3p1);
307		ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
308		#endif
309
310		/* last interpolation steps... */
311	19.1k	ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
312	19.1k	ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
313		/* ... could be mixed with recomposition
314		\|\|H-r5\|M-r5\|L-r5\| \|\|H-r1\|M-r1\|L-r1\|
315		*/
316
317		/*************************** recomposition *****************************/
318		/*
319		pp[] prior to operations:
320		\|M r0\|L r0\|___\|\|H r2\|M r2\|L r2\|___\|\|H r4\|M r4\|L r4\|____\|H_r6\|L r6\|pp
321
322		summation scheme for remaining operations:
323		\|__12\|n_11\|n_10\|n__9\|n__8\|n__7\|n__6\|n__5\|n__4\|n__3\|n__2\|n___\|n___\|pp
324		\|M r0\|L r0\|___\|\|H r2\|M r2\|L r2\|___\|\|H r4\|M r4\|L r4\|____\|H_r6\|L r6\|pp
325		\|\|H r1\|M r1\|L r1\| \|\|H r3\|M r3\|L r3\| \|\|H_r5\|M_r5\|L_r5\|
326		*/
327
328	19.1k	cy = mpn_add_n (pp + n, pp + n, r5, n);
329	19.1k	cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
330	19.1k	#if HAVE_NATIVE_mpn_add_nc
331	19.1k	cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
332		#else
333		MPN_INCR_U (r5 + 2 * n, n + 1, cy);
334		cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
335		#endif
336	19.1k	MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
337
338	19.1k	pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
339	19.1k	cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
340	19.1k	#if HAVE_NATIVE_mpn_add_nc
341	19.1k	cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
342		#else
343		MPN_INCR_U (r3 + 2 * n, n + 1, cy);
344		cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
345		#endif
346	19.1k	MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
347
348	19.1k	pp[10n]+=mpn_add_n (pp + 9 n, pp + 9 * n, r1, n);
349	19.1k	if (half) {
350	392	cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
351	392	#if HAVE_NATIVE_mpn_add_nc
352	392	if (LIKELY (spt > n)) {
353	392	cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
354	392	MPN_INCR_U (pp + 4 * n3, spt - n, cy);
355	392	} else {
356	0	ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
357	0	}
358		#else
359		MPN_INCR_U (r1 + 2 * n, n + 1, cy);
360		if (LIKELY (spt > n)) {
361		cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
362		MPN_INCR_U (pp + 4 * n3, spt - n, cy);
363		} else {
364		ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
365		}
366		#endif
367	18.7k	} else {
368	18.7k	ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
369	18.7k	}
370
371	19.1k	#undef r0
372	19.1k	#undef r2
373	19.1k	#undef r4
374	19.1k	}