Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpn/toom53_mul.c
Line| Count|Source
   1|      |/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   2|      |   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.
   3|      |
   4|      |   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
   5|      |
   6|      |   The idea of applying toom to unbalanced multiplication is due to Marco
   7|      |   Bodrato and Alberto Zanoni.
   8|      |
   9|      |   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
  10|      |   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
  11|      |   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
  12|      |
  13|      |Copyright 2006-2008, 2012, 2014, 2015 Free Software Foundation, Inc.
  14|      |
  15|      |This file is part of the GNU MP Library.
  16|      |
  17|      |The GNU MP Library is free software; you can redistribute it and/or modify
  18|      |it under the terms of either:
  19|      |
  20|      |  * the GNU Lesser General Public License as published by the Free
  21|      |    Software Foundation; either version 3 of the License, or (at your
  22|      |    option) any later version.
  23|      |
  24|      |or
  25|      |
  26|      |  * the GNU General Public License as published by the Free Software
  27|      |    Foundation; either version 2 of the License, or (at your option) any
  28|      |    later version.
  29|      |
  30|      |or both in parallel, as here.
  31|      |
  32|      |The GNU MP Library is distributed in the hope that it will be useful, but
  33|      |WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  34|      |or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  35|      |for more details.
  36|      |
  37|      |You should have received copies of the GNU General Public License and the
  38|      |GNU Lesser General Public License along with the GNU MP Library.  If not,
  39|      |see https://www.gnu.org/licenses/.  */
  40|      |
  41|      |
  42|      |#include "gmp-impl.h"
  43|      |
  44|      |/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
  45|      |
  46|      |  <-s-><--n--><--n--><--n--><--n-->
  47|      |   ___ ______ ______ ______ ______
  48|      |  |a4_|___a3_|___a2_|___a1_|___a0_|
  49|      |         |__b2|___b1_|___b0_|
  50|      |         <-t--><--n--><--n-->
  51|      |
  52|      |  v0  =    a0                  *  b0          #    A(0)*B(0)
  53|      |  v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2) #    A(1)*B(1)      ah  <= 4   bh <= 2
  54|      |  vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2) #   A(-1)*B(-1)    |ah| <= 2   bh <= 1
  55|      |  v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) #    A(2)*B(2)      ah  <= 30  bh <= 6
  56|      |  vm2 = (  a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) #   A(-2)*B(-2)    -9<=ah<=20 -1<=bh<=4
  57|      |  vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2) #  A(1/2)*B(1/2)    ah  <= 30  bh <= 6
  58|      |  vinf=                     a4 *          b2  #  A(inf)*B(inf)
  59|      |*/
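The seven point values above pin down the seven coefficients of the degree-6
product polynomial A(x)*B(x).  As a toy illustration of the evaluation
identities (hypothetical single-word coefficients standing in for the n-limb
blocks; this is not GMP code), the following sketch checks that the v2
formula really computes A(2)*B(2):

#include <assert.h>
#include <stdio.h>

/* Toy check of the Toom-5.3 evaluation identity v2 = A(2)*B(2), with
   machine words playing the role of n-limb blocks.  Values are made up.  */
int
main (void)
{
  unsigned long a[5] = { 3, 1, 4, 1, 5 };  /* A(x) = 3 + x + 4x^2 + x^3 + 5x^4 */
  unsigned long b[3] = { 2, 7, 1 };        /* B(x) = 2 + 7x + x^2 */
  unsigned long a2 = a[0] + 2*a[1] + 4*a[2] + 8*a[3] + 16*a[4];   /* A(2) */
  unsigned long b2 = b[0] + 2*b[1] + 4*b[2];                      /* B(2) */
  unsigned long c[7] = { 0 };              /* c[k] = sum of a[i]*b[j], i+j = k */
  unsigned long v2 = 0;
  int i, j, k;

  for (i = 0; i < 5; i++)
    for (j = 0; j < 3; j++)
      c[i + j] += a[i] * b[j];
  for (k = 6; k >= 0; k--)
    v2 = 2 * v2 + c[k];                    /* Horner: C(2) */
  assert (v2 == a2 * b2);                  /* C(2) = A(2)*B(2) = 2180 */
  printf ("A(2)*B(2) = %lu\n", v2);
  return 0;
}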
  60|      |
  61|      |void
  62|      |mpn_toom53_mul (mp_ptr pp,
  63|      |    mp_srcptr ap, mp_size_t an,
  64|      |    mp_srcptr bp, mp_size_t bn,
  65|      |    mp_ptr scratch)
  66|   452|{
  67|   452|  mp_size_t n, s, t;
  68|   452|  mp_limb_t cy;
  69|   452|  mp_ptr gp;
  70|   452|  mp_ptr as1, asm1, as2, asm2, ash;
  71|   452|  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  72|   452|  mp_ptr tmp;
  73|   452|  enum toom7_flags flags;
  74|   452|  TMP_DECL;
  75|      |
  76|   904|#define a0  ap
  77|   452|#define a1  (ap + n)
  78|   452|#define a2  (ap + 2*n)
  79|   452|#define a3  (ap + 3*n)
  80|   904|#define a4  (ap + 4*n)
  81| 2.18k|#define b0  bp
  82| 2.23k|#define b1  (bp + n)
  83| 1.80k|#define b2  (bp + 2*n)
  84|      |
  85|   452|  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);
  86|      |
  87|   452|  s = an - 4 * n;
  88|   452|  t = bn - 2 * n;
  89|      |
  90|   452|  ASSERT (0 < s && s <= n);
  91|   452|  ASSERT (0 < t && t <= n);
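For example, an = 10 and bn = 6 give 3*an = 30 >= 5*bn = 30, so
n = 1 + (10 - 1)/5 = 2; then s = 10 - 4*2 = 2 and t = 6 - 2*2 = 2, and both
asserts hold.  n is chosen so that the five a-blocks and three b-blocks cover
the operands with nonempty top pieces of s and t limbs.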
  92|      |
  93|   452|  TMP_MARK;
  94|      |
  95|   452|  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  96|   452|  as1  = tmp; tmp += n + 1;
  97|   452|  asm1 = tmp; tmp += n + 1;
  98|   452|  as2  = tmp; tmp += n + 1;
  99|   452|  asm2 = tmp; tmp += n + 1;
 100|   452|  ash  = tmp; tmp += n + 1;
 101|   452|  bs1  = tmp; tmp += n + 1;
 102|   452|  bsm1 = tmp; tmp += n + 1;
 103|   452|  bs2  = tmp; tmp += n + 1;
 104|   452|  bsm2 = tmp; tmp += n + 1;
 105|   452|  bsh  = tmp; tmp += n + 1;
 106|      |
 107|   452|  gp = pp;
 108|      |
 109|      |  /* Compute as1 and asm1.  */
 110|   452|  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));
 111|      |
 112|      |  /* Compute as2 and asm2. */
 113|   452|  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));
 114|      |
 115|      |  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
 116|      |     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
 117|   452|#if HAVE_NATIVE_mpn_addlsh1_n
 118|   452|  cy = mpn_addlsh1_n (ash, a1, a0, n);
 119|   452|  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
 120|   452|  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
 121|   452|  if (s < n)
 122|   412|    {
 123|   412|      mp_limb_t cy2;
 124|   412|      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
 125|   412|      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
 126|   412|      MPN_INCR_U (ash + s, n+1-s, cy2);
 127|   412|    }
 128|    40|  else
 129|    40|    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
 130|      |#else
 131|      |  cy = mpn_lshift (ash, a0, n, 1);
 132|      |  cy += mpn_add_n (ash, ash, a1, n);
 133|      |  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
 134|      |  cy += mpn_add_n (ash, ash, a2, n);
 135|      |  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
 136|      |  cy += mpn_add_n (ash, ash, a3, n);
 137|      |  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
 138|      |  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
 139|      |#endif
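Both branches above implement the Horner form from the comment, so that only
shift-by-one and add primitives are needed; the same scheme reappears for bsh
below.  A toy single-word check of the algebra (hypothetical values, not GMP
code):

#include <assert.h>

int
main (void)
{
  /* Made-up single-word stand-ins for the n-limb blocks a0..a4.  */
  unsigned long a0 = 3, a1 = 1, a2 = 4, a3 = 1, a4 = 5;
  unsigned long horner = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4;
  unsigned long direct = 16*a0 + 8*a1 + 4*a2 + 2*a3 + a4;

  assert (horner == direct);  /* both evaluate to 79 */
  return 0;
}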
 140|      |
 141|      |  /* Compute bs1 and bsm1.  */
 142|   452|  bs1[n] = mpn_add (bs1, b0, n, b2, t);   /* b0 + b2 */
 143|      |#if HAVE_NATIVE_mpn_add_n_sub_n
 144|      |  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
 145|      |    {
 146|      |      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
 147|      |      bsm1[n] = 0;
 148|      |      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
 149|      |    }
 150|      |  else
 151|      |    {
 152|      |      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
 153|      |      bsm1[n] = bs1[n] - (cy & 1);
 154|      |      bs1[n] += (cy >> 1);
 155|      |    }
 156|      |#else
 157|   452|  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
 158|   188|    {
 159|   188|      mpn_sub_n (bsm1, b1, bs1, n);
 160|   188|      bsm1[n] = 0;
 161|   188|      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
 162|   188|    }
 163|   264|  else
 164|   264|    {
 165|   264|      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
 166|   264|    }
 167|   452|  bs1[n] += mpn_add_n (bs1, bs1, b1, n);  /* b0+b1+b2 */
 168|   452|#endif
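Since mpn values are unsigned, B(-1) = b0 - b1 + b2 is stored as the absolute
value |b0 + b2 - b1|, with the sign recorded in the toom7_w3_neg flag for the
interpolation stage; the bs2/bsm2 block below plays the same trick at the
points +2/-2.  A toy single-word version (hypothetical values, not GMP code):

#include <assert.h>

int
main (void)
{
  /* Made-up single-word stand-ins for the blocks b0, b1, b2.  */
  unsigned long b0 = 2, b1 = 7, b2 = 1;
  unsigned long sum = b0 + b2;        /* mirrors bs1 = b0 + b2 */
  unsigned long bsm1;
  int w3_neg;                         /* plays the role of toom7_w3_neg */

  if (sum < b1)                       /* B(-1) would be negative */
    {
      bsm1 = b1 - sum;
      w3_neg = 1;
    }
  else
    {
      bsm1 = sum - b1;
      w3_neg = 0;
    }
  assert (bsm1 == 4 && w3_neg == 1);  /* B(-1) = 2 - 7 + 1 = -4 */
  return 0;
}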
 169|      |
 170|      |  /* Compute bs2 and bsm2. */
 171|   452|#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
 172|   452|#if HAVE_NATIVE_mpn_addlsh2_n
 173|   452|  cy = mpn_addlsh2_n (bs2, b0, b2, t);
 174|      |#else /* HAVE_NATIVE_mpn_addlsh_n */
 175|      |  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
 176|      |#endif
 177|   452|  if (t < n)
 178|   377|    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
 179|   452|  bs2[n] = cy;
 180|      |#else
 181|      |  cy = mpn_lshift (gp, b2, t, 2);
 182|      |  bs2[n] = mpn_add (bs2, b0, n, gp, t);
 183|      |  MPN_INCR_U (bs2 + t, n+1-t, cy);
 184|      |#endif
 185|      |
 186|   452|  gp[n] = mpn_lshift (gp, b1, n, 1);
 187|      |
 188|      |#if HAVE_NATIVE_mpn_add_n_sub_n
 189|      |  if (mpn_cmp (bs2, gp, n+1) < 0)
 190|      |    {
 191|      |      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
 192|      |      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
 193|      |    }
 194|      |  else
 195|      |    {
 196|      |      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
 197|      |    }
 198|      |#else
 199|   452|  if (mpn_cmp (bs2, gp, n+1) < 0)
 200|   295|    {
 201|   295|      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
 202|   295|      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
 203|   295|    }
 204|   157|  else
 205|   157|    {
 206|   157|      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
 207|   157|    }
 208|   452|  mpn_add_n (bs2, bs2, gp, n+1);
 209|   452|#endif
 210|      |
 211|      |  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1)+b2.  */
 212|   452|#if HAVE_NATIVE_mpn_addlsh1_n
 213|   452|  cy = mpn_addlsh1_n (bsh, b1, b0, n);
 214|   452|  if (t < n)
 215|   377|    {
 216|   377|      mp_limb_t cy2;
 217|   377|      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
 218|   377|      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
 219|   377|      MPN_INCR_U (bsh + t, n+1-t, cy2);
 220|   377|    }
 221|    75|  else
 222|    75|    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
 223|      |#else
 224|      |  cy = mpn_lshift (bsh, b0, n, 1);
 225|      |  cy += mpn_add_n (bsh, bsh, b1, n);
 226|      |  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
 227|      |  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
 228|      |#endif
 229|      |
 230|   452|  ASSERT (as1[n] <= 4);
 231|   452|  ASSERT (bs1[n] <= 2);
 232|   452|  ASSERT (asm1[n] <= 2);
 233|   452|  ASSERT (bsm1[n] <= 1);
 234|   452|  ASSERT (as2[n] <= 30);
 235|   452|  ASSERT (bs2[n] <= 6);
 236|   452|  ASSERT (asm2[n] <= 20);
 237|   452|  ASSERT (bsm2[n] <= 4);
 238|   452|  ASSERT (ash[n] <= 30);
 239|   452|  ASSERT (bsh[n] <= 6);
 240|      |
 241|   452|#define v0    pp        /* 2n */
 242|   904|#define v1    (pp + 2 * n)      /* 2n+1 */
 243|   452|#define vinf  (pp + 6 * n)      /* s+t */
 244|   904|#define v2    scratch        /* 2n+1 */
 245|   904|#define vm2   (scratch + 2 * n + 1)    /* 2n+1 */
 246|   904|#define vh    (scratch + 4 * n + 2)    /* 2n+1 */
 247| 1.35k|#define vm1   (scratch + 6 * n + 3)    /* 2n+1 */
 248|   452|#define scratch_out (scratch + 8 * n + 4)    /* 2n+1 */
 249|      |  /* Total scratch need: 10*n+5 */
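The scratch accounting checks out: v2, vm2, vh, vm1 and scratch_out are five
consecutive regions of 2n+1 limbs each, hence 5*(2n+1) = 10n+5 limbs of
scratch, while v0, v1 and vinf live directly in the product area pp.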
 250|      |
 251|      |  /* Must be in allocation order, as they overwrite one limb beyond
 252|      |   * 2n+1. */
 253|   452|  mpn_mul_n (v2, as2, bs2, n + 1);   /* v2, 2n+1 limbs */
 254|   452|  mpn_mul_n (vm2, asm2, bsm2, n + 1);   /* vm2, 2n+1 limbs */
 255|   452|  mpn_mul_n (vh, ash, bsh, n + 1);   /* vh, 2n+1 limbs */
 256|      |
 257|      |  /* vm1, 2n+1 limbs */
 258|      |#ifdef SMALLER_RECURSION
 259|      |  mpn_mul_n (vm1, asm1, bsm1, n);
 260|      |  if (asm1[n] == 1)
 261|      |    {
 262|      |      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
 263|      |    }
 264|      |  else if (asm1[n] == 2)
 265|      |    {
 266|      |#if HAVE_NATIVE_mpn_addlsh1_n_ip1
 267|      |      cy = 2 * bsm1[n] + mpn_addlsh1_n_ip1 (vm1 + n, bsm1, n);
 268|      |#else
 269|      |      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
 270|      |#endif
 271|      |    }
 272|      |  else
 273|      |    cy = 0;
 274|      |  if (bsm1[n] != 0)
 275|      |    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
 276|      |  vm1[2 * n] = cy;
 277|      |#else /* SMALLER_RECURSION */
 278|   452|  vm1[2 * n] = 0;
 279|   452|  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
 280|   452|#endif /* SMALLER_RECURSION */
 281|      |
 282|      |  /* v1, 2n+1 limbs */
 283|      |#ifdef SMALLER_RECURSION
 284|      |  mpn_mul_n (v1, as1, bs1, n);
 285|      |  if (as1[n] == 1)
 286|      |    {
 287|      |      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
 288|      |    }
 289|      |  else if (as1[n] == 2)
 290|      |    {
 291|      |#if HAVE_NATIVE_mpn_addlsh1_n_ip1
 292|      |      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
 293|      |#else
 294|      |      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
 295|      |#endif
 296|      |    }
 297|      |  else if (as1[n] != 0)
 298|      |    {
 299|      |      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
 300|      |    }
 301|      |  else
 302|      |    cy = 0;
 303|      |  if (bs1[n] == 1)
 304|      |    {
 305|      |      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
 306|      |    }
 307|      |  else if (bs1[n] == 2)
 308|      |    {
 309|      |#if HAVE_NATIVE_mpn_addlsh1_n_ip1
 310|      |      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
 311|      |#else
 312|      |      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
 313|      |#endif
 314|      |    }
 315|      |  v1[2 * n] = cy;
 316|      |#else /* SMALLER_RECURSION */
 317|   452|  v1[2 * n] = 0;
 318|   452|  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
 319|   452|#endif /* SMALLER_RECURSION */
 320|      |
 321|   452|  mpn_mul_n (v0, a0, b0, n);     /* v0, 2n limbs */
 322|      |
 323|      |  /* vinf, s+t limbs */
 324|   452|  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
 325|   252|  else        mpn_mul (vinf, b2, t, a4, s);
 326|      |
 327|   452|  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
 328|   452|           scratch_out);
 329|      |
 330|   452|  TMP_FREE;
 331|   452|}
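The seven products determine the seven coefficients of the product polynomial.
mpn_toom_interpolate_7pts undoes the evaluation (the toom7_w1_neg and
toom7_w3_neg flags supply the signs of the vm2 and vm1 values) and recomposes
the final an + bn limb product in pp; v0 and vinf are computed directly into
pp because they already are the bottom and top coefficients of the result.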