/src/gmp-6.2.1/mpn/toom32_mul.c
/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
        |_b1_|___b0_|
        <-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/
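
/* Writing the full product as A*B = x0 + x1 B + x2 B^2 + x3 B^3, with
   B = 2^(GMP_NUMB_BITS * n), the four evaluations above pin down the four
   coefficient blocks:

     v0   = x0
     v1   = x0 + x1 + x2 + x3
     vm1  = x0 - x1 + x2 - x3
     vinf = x3

   so (v1 + vm1)/2 = x0 + x2 and (v1 - vm1)/2 = x1 + x3, which is the
   interpolation carried out after the recursive multiplications below. */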
58  |  |  | 
59  |  | #define TOOM32_MUL_N_REC(p, a, b, n, ws)        \  | 
60  | 13.7k  |   do {                 \ | 
61  | 13.7k  |     mpn_mul_n (p, a, b, n);            \  | 
62  | 13.7k  |   } while (0)  | 
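/* The wrapper above follows the TOOM*_MUL_N_REC pattern of GMP's other toom
   files; presumably it exists so the recursive n-by-n products below can be
   redirected to another routine without touching the call sites.  The ws
   (scratch) argument is accepted but currently unused. */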

void
mpn_toom32_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy;
  mp_limb_signed_t hi;
  mp_limb_t ap1_hi, bp1_hi;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)

  /* Required, to ensure that s + t >= n. */
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);

  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s + t >= n);
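  /* For instance, an = 10, bn = 7: 2*an = 20 < 3*bn = 21, so
     n = 1 + (bn - 1)/2 = 4, giving s = an - 2*n = 2 and t = bn - n = 3;
     s + t = 5 >= n = 4, as asserted. */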

  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
#define ap1 (pp)                /* n, most significant limb in ap1_hi */
#define bp1 (pp + n)            /* n, most significant bit in bp1_hi */
#define am1 (pp + 2*n)          /* n, most significant bit in hi */
#define bm1 (pp + 3*n)          /* n */
#define v1 (scratch)            /* 2n + 1 */
#define vm1 (pp)                /* 2n + 1 */
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */

  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */

  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */

  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
      hi = ap1_hi - (cy & 1);
      ap1_hi += (cy >> 1);
      vm1_neg = 0;
    }
#else
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
      vm1_neg = 0;
    }
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif

  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
        }
      bp1_hi = cy >> 1;
#else
      bp1_hi = mpn_add_n (bp1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
        }
#endif
    }
  else
    {
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
      bp1_hi = mpn_add (bp1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
          MPN_ZERO (bm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
        }
    }
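  /* At this point am1 = |a0 - a1 + a2| (high limb in hi) and
     bm1 = |b0 - b1|; vm1_neg is set when the signed product
     A(-1)*B(-1) is negative, so the recursion below computes |vm1|
     with its sign tracked separately. */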

  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
  if (ap1_hi == 1)
    {
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
    }
  else if (ap1_hi == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bp1_hi != 0)
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
  v1[2 * n] = cy;

  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
  if (hi)
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);

  vm1[2*n] = hi;

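  /* When vm1_neg is set, the stored vm1 is -(A(-1)*B(-1)), so
     v1 - vm1 = 2*(x0 + x2); otherwise v1 + vm1 = 2*(x0 + x2).  Either
     combination is even, so the shifted-out bit is always zero. */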
  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
      mpn_sub_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
      mpn_add_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }

  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence

     y = x1 + x3 + (x0 + x2) * B
       = (x0 + x2) * B + (x0 + x2) - vm1.

     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2.  We store them as
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
     (already in place, except for carry propagation).

     We thus add

   B^3  B^2   B    1
    |    |    |    |
    +-----+----+
  + |  x0 + x2 |
    +----+-----+----+
  +      |  x0 + x2 |
         +----------+
  -      |  vm1     |
  --+----++----+----+-
    | y2  | y1 | y0 |
    +-----+----+----+

     Since we store y0 at the same location as the low half of x0 + x2, we
     need to do the middle sum first. */

  hi = vm1[2*n];
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);

  /* FIXME: Can we get rid of this second vm1_neg conditional by
     swapping the location of +1 and -1 values? */
  if (vm1_neg)
    {
      cy = mpn_add_n (v1, v1, vm1, n);
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_INCR_U (v1 + n, n+1, hi);
    }
  else
    {
      cy = mpn_sub_n (v1, v1, vm1, n);
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_DECR_U (v1 + n, n+1, hi);
    }

  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
  else        mpn_mul (pp+3*n, b1, t, a2, s);

  /* Remaining interpolation.

     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
     = y0 B + y1 B^2 + y2 B^3 + L x0 + H x0 B
       + L x3 B^3 + H x3 B^4 - L x0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4

       B^4       B^3       B^2        B         1
   |         |         |         |         |         |
     +-------+                   +---------+---------+
     |  Hx3  |                   | Hx0-Lx3 |   Lx0   |
     +-------+---------+---------+---------+---------+
             |    y2   |    y1   |    y0   |
            ++---------+---------+---------+
            -| Hx0-Lx3 |  - Lx0  |
             +---------+---------+
                       |  - Hx3  |
                       +---------+

     We must take into account the carry from Hx0 - Lx3.
  */

  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
  hi = scratch[2*n] + cy;

  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);

  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);

  /* FIXME: Is support for s + t == n needed? */
  if (LIKELY (s + t > n))
    {
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);

      if (hi < 0)
        MPN_DECR_U (pp + 4*n, s+t-n, -hi);
      else
        MPN_INCR_U (pp + 4*n, s+t-n, hi);
    }
  else
    ASSERT (hi == 0);
}
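
/* Illustrative harness (not part of the original file), guarded out so the
   file compiles unchanged: it exercises mpn_toom32_mul with an = 10, bn = 7,
   which satisfy the entry assertion bn + 2 <= an <= 3*bn - 6.  The scratch
   bound 2*n + 1 follows from the comments above, since scratch_out is
   currently unused; gmp-impl.h also provides mpn_toom32_mul_itch for
   computing this. */
#if 0
static void
toom32_demo (void)
{
  mp_limb_t a[10], b[7];
  mp_limb_t p[10 + 7];          /* product needs an + bn limbs */
  mp_limb_t scratch[2*4 + 1];   /* n = 4 for these sizes, so 2*n + 1 = 9 */

  mpn_random (a, 10);
  mpn_random (b, 7);
  mpn_toom32_mul (p, a, 10, b, 7, scratch);
}
#endif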