Coverage Report

Created: 2024-06-28 06:19

/src/gmp-6.2.1/mpn/toom33_mul.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
2
   size.  Or more accurately, bn <= an < (3/2)bn.
3
4
   Contributed to the GNU project by Torbjorn Granlund.
5
   Additional improvements by Marco Bodrato.
6
7
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
8
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
9
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10
11
Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.
12
13
This file is part of the GNU MP Library.
14
15
The GNU MP Library is free software; you can redistribute it and/or modify
16
it under the terms of either:
17
18
  * the GNU Lesser General Public License as published by the Free
19
    Software Foundation; either version 3 of the License, or (at your
20
    option) any later version.
21
22
or
23
24
  * the GNU General Public License as published by the Free Software
25
    Foundation; either version 2 of the License, or (at your option) any
26
    later version.
27
28
or both in parallel, as here.
29
30
The GNU MP Library is distributed in the hope that it will be useful, but
31
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
33
for more details.
34
35
You should have received copies of the GNU General Public License and the
36
GNU Lesser General Public License along with the GNU MP Library.  If not,
37
see https://www.gnu.org/licenses/.  */
38
39
40
#include "gmp-impl.h"
41
42
/* Evaluate in: -1, 0, +1, +2, +inf
43
44
  <-s--><--n--><--n-->
45
   ____ ______ ______
46
  |_a2_|___a1_|___a0_|
47
   |b2_|___b1_|___b0_|
48
   <-t-><--n--><--n-->
49
50
  v0  =  a0         * b0          #   A(0)*B(0)
51
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
52
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
53
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
54
  vinf=          a2 *         b2  # A(inf)*B(inf)
55
*/
56
57
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
58
#define MAYBE_mul_basecase 1
59
#define MAYBE_mul_toom33   1
60
#else
61
#define MAYBE_mul_basecase            \
62
27.9M
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
63
#define MAYBE_mul_toom33            \
64
27.7M
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
65
#endif
66
67
/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
68
   multiplication at the infinity point. We may have
69
   MAYBE_mul_basecase == 0, and still get s just below
70
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
71
   s == 1 and mpn_toom22_mul will crash.
72
*/
73
74
#define TOOM33_MUL_N_REC(p, a, b, n, ws)        \
75
13.9M
  do {                 \
76
13.9M
    if (MAYBE_mul_basecase            \
77
13.9M
  && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))     \
78
13.9M
      mpn_mul_basecase (p, a, n, b, n);         \
79
13.9M
    else if (! MAYBE_mul_toom33            \
80
13.8M
       || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))   \
81
13.8M
      mpn_toom22_mul (p, a, n, b, n, ws);       \
82
13.8M
    else                \
83
13.8M
      mpn_toom33_mul (p, a, n, b, n, ws);       \
84
13.9M
  } while (0)
85
86
void
87
mpn_toom33_mul (mp_ptr pp,
88
    mp_srcptr ap, mp_size_t an,
89
    mp_srcptr bp, mp_size_t bn,
90
    mp_ptr scratch)
91
2.79M
{
92
2.79M
  const int __gmpn_cpuvec_initialized = 1;
93
2.79M
  mp_size_t n, s, t;
94
2.79M
  int vm1_neg;
95
2.79M
  mp_limb_t cy, vinf0;
96
2.79M
  mp_ptr gp;
97
2.79M
  mp_ptr as1, asm1, as2;
98
2.79M
  mp_ptr bs1, bsm1, bs2;
99
100
5.59M
#define a0  ap
101
8.01M
#define a1  (ap + n)
102
5.59M
#define a2  (ap + 2*n)
103
5.59M
#define b0  bp
104
8.04M
#define b1  (bp + n)
105
5.59M
#define b2  (bp + 2*n)
106
107
2.79M
  n = (an + 2) / (size_t) 3;
108
109
2.79M
  s = an - 2 * n;
110
2.79M
  t = bn - 2 * n;
111
112
2.79M
  ASSERT (an >= bn);
113
114
2.79M
  ASSERT (0 < s && s <= n);
115
2.79M
  ASSERT (0 < t && t <= n);
116
117
2.79M
  as1  = scratch + 4 * n + 4;
118
2.79M
  asm1 = scratch + 2 * n + 2;
119
2.79M
  as2 = pp + n + 1;
120
121
2.79M
  bs1 = pp;
122
2.79M
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
123
2.79M
  bs2 = pp + 2 * n + 2;
124
125
2.79M
  gp = scratch;
126
127
2.79M
  vm1_neg = 0;
128
129
  /* Compute as1 and asm1.  */
130
2.79M
  cy = mpn_add (gp, a0, n, a2, s);
131
#if HAVE_NATIVE_mpn_add_n_sub_n
132
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
133
    {
134
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
135
      as1[n] = cy >> 1;
136
      asm1[n] = 0;
137
      vm1_neg = 1;
138
    }
139
  else
140
    {
141
      mp_limb_t cy2;
142
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
143
      as1[n] = cy + (cy2 >> 1);
144
      asm1[n] = cy - (cy2 & 1);
145
    }
146
#else
147
2.79M
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
148
2.79M
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
149
1.20M
    {
150
1.20M
      mpn_sub_n (asm1, a1, gp, n);
151
1.20M
      asm1[n] = 0;
152
1.20M
      vm1_neg = 1;
153
1.20M
    }
154
1.58M
  else
155
1.58M
    {
156
1.58M
      cy -= mpn_sub_n (asm1, gp, a1, n);
157
1.58M
      asm1[n] = cy;
158
1.58M
    }
159
2.79M
#endif
160
161
  /* Compute as2.  */
162
2.79M
#if HAVE_NATIVE_mpn_rsblsh1_n
163
2.79M
  cy = mpn_add_n (as2, a2, as1, s);
164
2.79M
  if (s != n)
165
1.69M
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
166
2.79M
  cy += as1[n];
167
2.79M
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
168
#else
169
#if HAVE_NATIVE_mpn_addlsh1_n
170
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
171
  if (s != n)
172
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
173
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
174
#else
175
  cy = mpn_add_n (as2, a2, as1, s);
176
  if (s != n)
177
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
178
  cy += as1[n];
179
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
180
  cy -= mpn_sub_n (as2, as2, a0, n);
181
#endif
182
#endif
183
2.79M
  as2[n] = cy;
184
185
  /* Compute bs1 and bsm1.  */
186
2.79M
  cy = mpn_add (gp, b0, n, b2, t);
187
#if HAVE_NATIVE_mpn_add_n_sub_n
188
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
189
    {
190
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
191
      bs1[n] = cy >> 1;
192
      bsm1[n] = 0;
193
      vm1_neg ^= 1;
194
    }
195
  else
196
    {
197
      mp_limb_t cy2;
198
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
199
      bs1[n] = cy + (cy2 >> 1);
200
      bsm1[n] = cy - (cy2 & 1);
201
    }
202
#else
203
2.79M
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
204
2.79M
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
205
1.12M
    {
206
1.12M
      mpn_sub_n (bsm1, b1, gp, n);
207
1.12M
      bsm1[n] = 0;
208
1.12M
      vm1_neg ^= 1;
209
1.12M
    }
210
1.67M
  else
211
1.67M
    {
212
1.67M
      cy -= mpn_sub_n (bsm1, gp, b1, n);
213
1.67M
      bsm1[n] = cy;
214
1.67M
    }
215
2.79M
#endif
216
217
  /* Compute bs2.  */
218
2.79M
#if HAVE_NATIVE_mpn_rsblsh1_n
219
2.79M
  cy = mpn_add_n (bs2, b2, bs1, t);
220
2.79M
  if (t != n)
221
1.69M
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
222
2.79M
  cy += bs1[n];
223
2.79M
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
224
#else
225
#if HAVE_NATIVE_mpn_addlsh1_n
226
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
227
  if (t != n)
228
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
229
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
230
#else
231
  cy  = mpn_add_n (bs2, bs1, b2, t);
232
  if (t != n)
233
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
234
  cy += bs1[n];
235
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
236
  cy -= mpn_sub_n (bs2, bs2, b0, n);
237
#endif
238
#endif
239
2.79M
  bs2[n] = cy;
240
241
2.79M
  ASSERT (as1[n] <= 2);
242
2.79M
  ASSERT (bs1[n] <= 2);
243
2.79M
  ASSERT (asm1[n] <= 1);
244
2.79M
  ASSERT (bsm1[n] <= 1);
245
2.79M
  ASSERT (as2[n] <= 6);
246
2.79M
  ASSERT (bs2[n] <= 6);
247
248
2.79M
#define v0    pp        /* 2n */
249
2.79M
#define v1    (pp + 2 * n)      /* 2n+1 */
250
8.38M
#define vinf  (pp + 4 * n)      /* s+t */
251
2.79M
#define vm1   scratch        /* 2n+1 */
252
2.79M
#define v2    (scratch + 2 * n + 1)    /* 2n+2 */
253
2.79M
#define scratch_out  (scratch + 5 * n + 5)
254
255
  /* vm1, 2n+1 limbs */
256
#ifdef SMALLER_RECURSION
257
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
258
  cy = 0;
259
  if (asm1[n] != 0)
260
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
261
  if (bsm1[n] != 0)
262
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
263
  vm1[2 * n] = cy;
264
#else
265
2.79M
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
266
2.79M
#endif
267
268
2.79M
  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);  /* v2, 2n+1 limbs */
269
270
  /* vinf, s+t limbs */
271
2.79M
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
272
2.79M
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
273
274
2.79M
  vinf0 = vinf[0];        /* v1 overlaps with this */
275
276
#ifdef SMALLER_RECURSION
277
  /* v1, 2n+1 limbs */
278
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
279
  if (as1[n] == 1)
280
    {
281
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
282
    }
283
  else if (as1[n] != 0)
284
    {
285
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
286
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
287
#else
288
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
289
#endif
290
    }
291
  else
292
    cy = 0;
293
  if (bs1[n] == 1)
294
    {
295
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
296
    }
297
  else if (bs1[n] != 0)
298
    {
299
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
300
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
301
#else
302
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
303
#endif
304
    }
305
  v1[2 * n] = cy;
306
#else
307
2.79M
  cy = vinf[1];
308
2.79M
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
309
2.79M
  vinf[1] = cy;
310
2.79M
#endif
311
312
2.79M
  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);  /* v0, 2n limbs */
313
314
2.79M
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
315
2.79M
}