Coverage Report

Created: 2024-02-25 06:16

/src/gmp-6.2.1/mpn/toom32_mul.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
2
   times as large as bn.  Or more accurately, bn < an < 3bn.
3
4
   Contributed to the GNU project by Torbjorn Granlund.
5
   Improvements by Marco Bodrato and Niels Möller.
6
7
   The idea of applying toom to unbalanced multiplication is due to Marco
8
   Bodrato and Alberto Zanoni.
9
10
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
11
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
12
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
13
14
Copyright 2006-2010 Free Software Foundation, Inc.
15
16
This file is part of the GNU MP Library.
17
18
The GNU MP Library is free software; you can redistribute it and/or modify
19
it under the terms of either:
20
21
  * the GNU Lesser General Public License as published by the Free
22
    Software Foundation; either version 3 of the License, or (at your
23
    option) any later version.
24
25
or
26
27
  * the GNU General Public License as published by the Free Software
28
    Foundation; either version 2 of the License, or (at your option) any
29
    later version.
30
31
or both in parallel, as here.
32
33
The GNU MP Library is distributed in the hope that it will be useful, but
34
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
35
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
36
for more details.
37
38
You should have received copies of the GNU General Public License and the
39
GNU Lesser General Public License along with the GNU MP Library.  If not,
40
see https://www.gnu.org/licenses/.  */
41
42
43
#include "gmp-impl.h"
44
45
/* Evaluate in: -1, 0, +1, +inf
46
47
  <-s-><--n--><--n-->
48
   ___ ______ ______
49
  |a2_|___a1_|___a0_|
50
  |_b1_|___b0_|
51
  <-t--><--n-->
52
53
  v0  =  a0         * b0      #   A(0)*B(0)
54
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
55
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
56
  vinf=          a2 *     b1  # A(inf)*B(inf)
57
*/
58
59
#define TOOM32_MUL_N_REC(p, a, b, n, ws)        \
60
10.4k
  do {                 \
61
10.4k
    mpn_mul_n (p, a, b, n);            \
62
10.4k
  } while (0)
63
64
void
65
mpn_toom32_mul (mp_ptr pp,
66
    mp_srcptr ap, mp_size_t an,
67
    mp_srcptr bp, mp_size_t bn,
68
    mp_ptr scratch)
69
3.46k
{
70
3.46k
  mp_size_t n, s, t;
71
3.46k
  int vm1_neg;
72
3.46k
  mp_limb_t cy;
73
3.46k
  mp_limb_signed_t hi;
74
3.46k
  mp_limb_t ap1_hi, bp1_hi;
75
76
3.46k
#define a0  ap
77
8.57k
#define a1  (ap + n)
78
6.93k
#define a2  (ap + 2 * n)
79
7.03k
#define b0  bp
80
8.40k
#define b1  (bp + n)
81
82
  /* Required, to ensure that s + t >= n. */
83
3.46k
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
84
85
3.46k
  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
86
87
3.46k
  s = an - 2 * n;
88
3.46k
  t = bn - n;
89
90
3.46k
  ASSERT (0 < s && s <= n);
91
3.46k
  ASSERT (0 < t && t <= n);
92
3.46k
  ASSERT (s + t >= n);
93
94
  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
95
15.5k
#define ap1 (pp)    /* n, most significant limb in ap1_hi */
96
5.15k
#define bp1 (pp + n)    /* n, most significant bit in bp1_hi */
97
3.46k
#define am1 (pp + 2*n)    /* n, most significant bit in hi */
98
3.46k
#define bm1 (pp + 3*n)    /* n */
99
27.7k
#define v1 (scratch)    /* 2n + 1 */
100
17.3k
#define vm1 (pp)    /* 2n + 1 */
101
3.46k
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */
102
103
  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
104
105
  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
106
107
  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
108
3.46k
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
109
#if HAVE_NATIVE_mpn_add_n_sub_n
110
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
111
    {
112
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
113
      hi = 0;
114
      vm1_neg = 1;
115
    }
116
  else
117
    {
118
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
119
      hi = ap1_hi - (cy & 1);
120
      ap1_hi += (cy >> 1);
121
      vm1_neg = 0;
122
    }
123
#else
124
3.46k
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
125
1.82k
    {
126
1.82k
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
127
1.82k
      hi = 0;
128
1.82k
      vm1_neg = 1;
129
1.82k
    }
130
1.64k
  else
131
1.64k
    {
132
1.64k
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
133
1.64k
      vm1_neg = 0;
134
1.64k
    }
135
3.46k
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
136
3.46k
#endif
137
138
  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
139
3.46k
  if (t == n)
140
1.37k
    {
141
#if HAVE_NATIVE_mpn_add_n_sub_n
142
      if (mpn_cmp (b0, b1, n) < 0)
143
  {
144
    cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
145
    vm1_neg ^= 1;
146
  }
147
      else
148
  {
149
    cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
150
  }
151
      bp1_hi = cy >> 1;
152
#else
153
1.37k
      bp1_hi = mpn_add_n (bp1, b0, b1, n);
154
155
1.37k
      if (mpn_cmp (b0, b1, n) < 0)
156
59
  {
157
59
    ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
158
59
    vm1_neg ^= 1;
159
59
  }
160
1.31k
      else
161
1.31k
  {
162
1.31k
    ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
163
1.31k
  }
164
1.37k
#endif
165
1.37k
    }
166
2.09k
  else
167
2.09k
    {
168
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
169
2.09k
      bp1_hi = mpn_add (bp1, b0, n, b1, t);
170
171
2.09k
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
172
0
  {
173
0
    ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
174
0
    MPN_ZERO (bm1 + t, n - t);
175
0
    vm1_neg ^= 1;
176
0
  }
177
2.09k
      else
178
2.09k
  {
179
2.09k
    ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
180
2.09k
  }
181
2.09k
    }
182
183
3.46k
  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
184
3.46k
  if (ap1_hi == 1)
185
1.67k
    {
186
1.67k
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
187
1.67k
    }
188
1.79k
  else if (ap1_hi == 2)
189
10
    {
190
10
#if HAVE_NATIVE_mpn_addlsh1_n
191
10
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
192
#else
193
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
194
#endif
195
10
    }
196
1.78k
  else
197
1.78k
    cy = 0;
198
3.46k
  if (bp1_hi != 0)
199
20
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
200
3.46k
  v1[2 * n] = cy;
201
202
3.46k
  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
203
3.46k
  if (hi)
204
0
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
205
206
3.46k
  vm1[2*n] = hi;
207
208
  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
209
3.46k
  if (vm1_neg)
210
1.79k
    {
211
1.79k
#if HAVE_NATIVE_mpn_rsh1sub_n
212
1.79k
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
213
#else
214
      mpn_sub_n (v1, v1, vm1, 2*n+1);
215
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
216
#endif
217
1.79k
    }
218
1.67k
  else
219
1.67k
    {
220
1.67k
#if HAVE_NATIVE_mpn_rsh1add_n
221
1.67k
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
222
#else
223
      mpn_add_n (v1, v1, vm1, 2*n+1);
224
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
225
#endif
226
1.67k
    }
227
228
  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
229
230
     y = x1 + x3 + (x0 + x2) * B
231
       = (x0 + x2) * B + (x0 + x2) - vm1.
232
233
     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
234
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
235
     (already in place, except for carry propagation).
236
237
     We thus add
238
239
   B^3  B^2   B    1
240
    |    |    |    |
241
   +-----+----+
242
 + |  x0 + x2 |
243
   +----+-----+----+
244
 +      |  x0 + x2 |
245
  +----------+
246
 -      |  vm1     |
247
 --+----++----+----+-
248
   | y2  | y1 | y0 |
249
   +-----+----+----+
250
251
  Since we store y0 at the same location as the low half of x0 + x2, we
252
  need to do the middle sum first. */
253
254
3.46k
  hi = vm1[2*n];
255
3.46k
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
256
3.46k
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);
257
258
  /* FIXME: Can we get rid of this second vm1_neg conditional by
259
     swapping the location of +1 and -1 values? */
260
3.46k
  if (vm1_neg)
261
1.79k
    {
262
1.79k
      cy = mpn_add_n (v1, v1, vm1, n);
263
1.79k
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
264
1.79k
      MPN_INCR_U (v1 + n, n+1, hi);
265
1.79k
    }
266
1.67k
  else
267
1.67k
    {
268
1.67k
      cy = mpn_sub_n (v1, v1, vm1, n);
269
1.67k
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
270
1.67k
      MPN_DECR_U (v1 + n, n+1, hi);
271
1.67k
    }
272
273
3.46k
  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
274
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
275
3.46k
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
276
3.32k
  else        mpn_mul (pp+3*n, b1, t, a2, s);
277
278
  /* Remaining interpolation.
279
280
     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
281
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
282
     = y0 B + y1 B^2 + y3 B^3 + Lx0 + H x0 B
283
       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
284
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
285
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4
286
287
    B^4       B^3       B^2        B         1
288
 |         |         |         |         |         |
289
   +-------+                   +---------+---------+
290
   |  Hx3  |                   | Hx0-Lx3 |    Lx0  |
291
   +------+----------+---------+---------+---------+
292
    |    y2    |  y1     |   y0    |
293
    ++---------+---------+---------+
294
    -| Hx0-Lx3 | - Lx0   |
295
     +---------+---------+
296
          | - Hx3  |
297
          +--------+
298
299
    We must take into account the carry from Hx0 - Lx3.
300
  */
301
302
3.46k
  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
303
3.46k
  hi = scratch[2*n] + cy;
304
305
3.46k
  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
306
3.46k
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);
307
308
3.46k
  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
309
310
  /* FIXME: Is support for s + t == n needed? */
311
3.46k
  if (LIKELY (s + t > n))
312
3.46k
    {
313
3.46k
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);
314
315
3.46k
      if (hi < 0)
316
3.46k
  MPN_DECR_U (pp + 4*n, s+t-n, -hi);
317
3.46k
      else
318
3.46k
  MPN_INCR_U (pp + 4*n, s+t-n, hi);
319
3.46k
    }
320
0
  else
321
0
    ASSERT (hi == 0);
322
3.46k
}