Coverage Report

Created: 2024-11-25 06:29

/src/gmp/mpn/toom33_mul.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
2
   size.  Or more accurately, bn <= an < (3/2)bn.
3
4
   Contributed to the GNU project by Torbjorn Granlund.
5
   Additional improvements by Marco Bodrato.
6
7
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
8
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
9
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10
11
Copyright 2006-2008, 2010, 2012, 2015, 2021 Free Software Foundation, Inc.
12
13
This file is part of the GNU MP Library.
14
15
The GNU MP Library is free software; you can redistribute it and/or modify
16
it under the terms of either:
17
18
  * the GNU Lesser General Public License as published by the Free
19
    Software Foundation; either version 3 of the License, or (at your
20
    option) any later version.
21
22
or
23
24
  * the GNU General Public License as published by the Free Software
25
    Foundation; either version 2 of the License, or (at your option) any
26
    later version.
27
28
or both in parallel, as here.
29
30
The GNU MP Library is distributed in the hope that it will be useful, but
31
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
33
for more details.
34
35
You should have received copies of the GNU General Public License and the
36
GNU Lesser General Public License along with the GNU MP Library.  If not,
37
see https://www.gnu.org/licenses/.  */
38
39
40
#include "gmp-impl.h"
41
42
/* Evaluate in: -1, 0, +1, +2, +inf
43
44
  <-s--><--n--><--n-->
45
   ____ ______ ______
46
  |_a2_|___a1_|___a0_|
47
   |b2_|___b1_|___b0_|
48
   <-t-><--n--><--n-->
49
50
  v0  =  a0         * b0          #   A(0)*B(0)
51
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
52
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
53
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
54
  vinf=          a2 *         b2  # A(inf)*B(inf)
55
*/
56
57
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
58
#define MAYBE_mul_basecase 1
59
#define MAYBE_mul_toom33   1
60
#else
61
#define MAYBE_mul_basecase            \
62
519k
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
63
#define MAYBE_mul_toom33            \
64
519k
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
65
#endif
66
67
/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
68
   multiplication at the infinity point. We may have
69
   MAYBE_mul_basecase == 0, and still get s just below
70
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
71
   s == 1 and mpn_toom22_mul will crash.
72
*/
73
74
#define TOOM33_MUL_N_REC(p, a, b, n, ws)        \
75
259k
  do {                 \
76
259k
    if (MAYBE_mul_basecase            \
77
259k
  && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))     \
78
259k
      mpn_mul_basecase (p, a, n, b, n);         \
79
259k
    else if (! MAYBE_mul_toom33            \
80
259k
       || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))   \
81
259k
      mpn_toom22_mul (p, a, n, b, n, ws);       \
82
259k
    else                \
83
259k
      mpn_toom33_mul (p, a, n, b, n, ws);       \
84
259k
  } while (0)
85
86
void
87
mpn_toom33_mul (mp_ptr pp,
88
    mp_srcptr ap, mp_size_t an,
89
    mp_srcptr bp, mp_size_t bn,
90
    mp_ptr scratch)
91
51.9k
{
92
51.9k
  const int __gmpn_cpuvec_initialized = 1;
93
51.9k
  mp_size_t n, s, t;
94
51.9k
  int vm1_neg;
95
51.9k
  mp_limb_t cy, vinf0;
96
51.9k
  mp_ptr gp;
97
51.9k
  mp_ptr as1, asm1, as2;
98
51.9k
  mp_ptr bs1, bsm1, bs2;
99
100
103k
#define a0  ap
101
155k
#define a1  (ap + n)
102
103k
#define a2  (ap + 2*n)
103
103k
#define b0  bp
104
155k
#define b1  (bp + n)
105
103k
#define b2  (bp + 2*n)
106
107
51.9k
  n = (an + 2) / (size_t) 3;
108
109
51.9k
  s = an - 2 * n;
110
51.9k
  t = bn - 2 * n;
111
112
51.9k
  ASSERT (an >= bn);
113
114
51.9k
  ASSERT (0 < s && s <= n);
115
51.9k
  ASSERT (0 < t && t <= n);
116
117
51.9k
  as1  = scratch + 4 * n + 4;
118
51.9k
  asm1 = scratch + 2 * n + 2;
119
51.9k
  as2 = pp + n + 1;
120
121
51.9k
  bs1 = pp;
122
51.9k
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
123
51.9k
  bs2 = pp + 2 * n + 2;
124
125
51.9k
  gp = scratch;
126
127
51.9k
  vm1_neg = 0;
128
129
  /* Compute as1 and asm1.  */
130
51.9k
  cy = mpn_add (gp, a0, n, a2, s);
131
#if HAVE_NATIVE_mpn_add_n_sub_n
132
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
133
    {
134
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
135
      as1[n] = cy >> 1;
136
      asm1[n] = 0;
137
      vm1_neg = 1;
138
    }
139
  else
140
    {
141
      mp_limb_t cy2;
142
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
143
      as1[n] = cy + (cy2 >> 1);
144
      asm1[n] = cy - (cy2 & 1);
145
    }
146
#else
147
51.9k
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
148
51.9k
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
149
25.9k
    {
150
25.9k
      mpn_sub_n (asm1, a1, gp, n);
151
25.9k
      asm1[n] = 0;
152
25.9k
      vm1_neg = 1;
153
25.9k
    }
154
25.9k
  else
155
25.9k
    {
156
25.9k
      cy -= mpn_sub_n (asm1, gp, a1, n);
157
25.9k
      asm1[n] = cy;
158
25.9k
    }
159
51.9k
#endif
160
161
  /* Compute as2.  */
162
#if HAVE_NATIVE_mpn_rsblsh1_n
163
  cy = mpn_add_n (as2, a2, as1, s);
164
  if (s != n)
165
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
166
  cy += as1[n];
167
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
168
#else
169
#if HAVE_NATIVE_mpn_addlsh1_n
170
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
171
  if (s != n)
172
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
173
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
174
#else
175
51.9k
  cy = mpn_add_n (as2, a2, as1, s);
176
51.9k
  if (s != n)
177
51.9k
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
178
51.9k
  cy += as1[n];
179
51.9k
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
180
51.9k
  cy -= mpn_sub_n (as2, as2, a0, n);
181
51.9k
#endif
182
51.9k
#endif
183
51.9k
  as2[n] = cy;
184
185
  /* Compute bs1 and bsm1.  */
186
51.9k
  cy = mpn_add (gp, b0, n, b2, t);
187
#if HAVE_NATIVE_mpn_add_n_sub_n
188
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
189
    {
190
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
191
      bs1[n] = cy >> 1;
192
      bsm1[n] = 0;
193
      vm1_neg ^= 1;
194
    }
195
  else
196
    {
197
      mp_limb_t cy2;
198
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
199
      bs1[n] = cy + (cy2 >> 1);
200
      bsm1[n] = cy - (cy2 & 1);
201
    }
202
#else
203
51.9k
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
204
51.9k
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
205
25.0k
    {
206
25.0k
      mpn_sub_n (bsm1, b1, gp, n);
207
25.0k
      bsm1[n] = 0;
208
25.0k
      vm1_neg ^= 1;
209
25.0k
    }
210
26.8k
  else
211
26.8k
    {
212
26.8k
      cy -= mpn_sub_n (bsm1, gp, b1, n);
213
26.8k
      bsm1[n] = cy;
214
26.8k
    }
215
51.9k
#endif
216
217
  /* Compute bs2.  */
218
#if HAVE_NATIVE_mpn_rsblsh1_n
219
  cy = mpn_add_n (bs2, b2, bs1, t);
220
  if (t != n)
221
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
222
  cy += bs1[n];
223
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
224
#else
225
#if HAVE_NATIVE_mpn_addlsh1_n
226
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
227
  if (t != n)
228
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
229
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
230
#else
231
51.9k
  cy  = mpn_add_n (bs2, bs1, b2, t);
232
51.9k
  if (t != n)
233
51.9k
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
234
51.9k
  cy += bs1[n];
235
51.9k
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
236
51.9k
  cy -= mpn_sub_n (bs2, bs2, b0, n);
237
51.9k
#endif
238
51.9k
#endif
239
51.9k
  bs2[n] = cy;
240
241
51.9k
  ASSERT (as1[n] <= 2);
242
51.9k
  ASSERT (bs1[n] <= 2);
243
51.9k
  ASSERT (asm1[n] <= 1);
244
51.9k
  ASSERT (bsm1[n] <= 1);
245
51.9k
  ASSERT (as2[n] <= 6);
246
51.9k
  ASSERT (bs2[n] <= 6);
247
248
51.9k
#define v0    pp        /* 2n */
249
51.9k
#define v1    (pp + 2 * n)      /* 2n+1 */
250
155k
#define vinf  (pp + 4 * n)      /* s+t */
251
103k
#define vm1   scratch        /* 2n+1 */
252
51.9k
#define v2    (scratch + 2 * n + 1)    /* 2n+2 */
253
51.9k
#define scratch_out  (scratch + 5 * n + 5)
254
255
  /* vm1, 2n+1 limbs */
256
#ifdef SMALLER_RECURSION
257
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
258
  cy = 0;
259
  if (asm1[n] != 0)
260
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
261
  if (bsm1[n] != 0)
262
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
263
  vm1[2 * n] = cy;
264
#else
265
51.9k
  vm1[2 * n] = 0;
266
51.9k
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + (bsm1[n] | asm1[n]), scratch_out);
267
51.9k
#endif
268
269
51.9k
  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);  /* v2, 2n+1 limbs */
270
271
  /* vinf, s+t limbs */
272
51.9k
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
273
51.9k
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
274
275
51.9k
  vinf0 = vinf[0];        /* v1 overlaps with this */
276
277
#ifdef SMALLER_RECURSION
278
  /* v1, 2n+1 limbs */
279
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
280
  if (as1[n] == 1)
281
    {
282
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
283
    }
284
  else if (as1[n] != 0)
285
    {
286
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
287
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
288
#else
289
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
290
#endif
291
    }
292
  else
293
    cy = 0;
294
  if (bs1[n] == 1)
295
    {
296
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
297
    }
298
  else if (bs1[n] != 0)
299
    {
300
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
301
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
302
#else
303
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
304
#endif
305
    }
306
  v1[2 * n] = cy;
307
#else
308
51.9k
  cy = vinf[1];
309
51.9k
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
310
51.9k
  vinf[1] = cy;
311
51.9k
#endif
312
313
51.9k
  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);  /* v0, 2n limbs */
314
315
51.9k
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
316
51.9k
}