Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpn/toom_interpolate_12pts.c
Line
Count
Source (jump to first uncovered line)
1
/* Interpolation for the algorithm Toom-Cook 6.5-way.
2
3
   Contributed to the GNU project by Marco Bodrato.
4
5
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9
Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
10
11
This file is part of the GNU MP Library.
12
13
The GNU MP Library is free software; you can redistribute it and/or modify
14
it under the terms of either:
15
16
  * the GNU Lesser General Public License as published by the Free
17
    Software Foundation; either version 3 of the License, or (at your
18
    option) any later version.
19
20
or
21
22
  * the GNU General Public License as published by the Free Software
23
    Foundation; either version 2 of the License, or (at your option) any
24
    later version.
25
26
or both in parallel, as here.
27
28
The GNU MP Library is distributed in the hope that it will be useful, but
29
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31
for more details.
32
33
You should have received copies of the GNU General Public License and the
34
GNU Lesser General Public License along with the GNU MP Library.  If not,
35
see https://www.gnu.org/licenses/.  */
36
37
38
#include "gmp-impl.h"
39
40
41
#if GMP_NUMB_BITS < 21
42
#error Not implemented: Both sublsh_n(,,,20) should be corrected.
43
#endif
44
45
#if GMP_NUMB_BITS < 16
46
#error Not implemented: divexact_by42525 needs splitting.
47
#endif
48
49
#if GMP_NUMB_BITS < 12
50
#error Not implemented: Hard to adapt...
51
#endif
52
53
54
/* FIXME: tuneup should decide the best variant */
55
#ifndef AORSMUL_FASTER_AORS_AORSLSH
56
#define AORSMUL_FASTER_AORS_AORSLSH 1
57
#endif
58
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
59
#define AORSMUL_FASTER_AORS_2AORSLSH 1
60
#endif
61
#ifndef AORSMUL_FASTER_2AORSLSH
62
#define AORSMUL_FASTER_2AORSLSH 1
63
#endif
64
#ifndef AORSMUL_FASTER_3AORSLSH
65
#define AORSMUL_FASTER_3AORSLSH 1
66
#endif
67
68
69
#if HAVE_NATIVE_mpn_sublsh_n
70
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
71
#else
72
static mp_limb_t
73
DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
74
116k
{
75
#if USE_MUL_1 && 0
76
  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
77
#else
78
116k
  mp_limb_t __cy;
79
116k
  __cy = mpn_lshift(ws,src,n,s);
80
116k
  return    __cy + mpn_sub_n(dst,dst,ws,n);
81
116k
#endif
82
116k
}
83
#endif
84
85
#if HAVE_NATIVE_mpn_addlsh_n
86
#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
87
#else
88
#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
89
static mp_limb_t
90
DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
91
{
92
#if USE_MUL_1 && 0
93
  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
94
#else
95
  mp_limb_t __cy;
96
  __cy = mpn_lshift(ws,src,n,s);
97
  return    __cy + mpn_add_n(dst,dst,ws,n);
98
#endif
99
}
100
#endif
101
#endif
102
103
#if HAVE_NATIVE_mpn_subrsh
104
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
105
#else
106
/* FIXME: This is not a correct definition, it assumes no carry */
107
38.9k
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)       \
108
38.9k
do {                 \
109
38.9k
  mp_limb_t __cy;             \
110
38.9k
  MPN_DECR_U (dst, nd, src[0] >> s);          \
111
38.9k
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);  \
112
38.9k
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);       \
113
38.9k
} while (0)
114
#endif
115
116
117
#define BINVERT_9 \
118
19.1k
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
119
120
#define BINVERT_255 \
121
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
122
123
  /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
124
#if GMP_LIMB_BITS == 32
125
#define BINVERT_2835  (GMP_NUMB_MASK &    CNST_LIMB(0x53E3771B))
126
#define BINVERT_42525 (GMP_NUMB_MASK &    CNST_LIMB(0x9F314C35))
127
#else
128
#if GMP_LIMB_BITS == 64
129
19.1k
#define BINVERT_2835  (GMP_NUMB_MASK &  CNST_LIMB(0x938CC70553E3771B))
130
19.1k
#define BINVERT_42525 (GMP_NUMB_MASK &  CNST_LIMB(0xE7B40D449F314C35))
131
#endif
132
#endif
133
134
#ifndef mpn_divexact_by255
135
#if GMP_NUMB_BITS % 8 == 0
136
#define mpn_divexact_by255(dst,src,size) \
137
19.1k
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
138
#else
139
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
140
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
141
#else
142
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
143
#endif
144
#endif
145
#endif
146
147
#ifndef mpn_divexact_by9x4
148
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
149
19.1k
#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
150
#else
151
#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
152
#endif
153
#endif
154
155
#ifndef mpn_divexact_by42525
156
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
157
19.1k
#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
158
#else
159
#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
160
#endif
161
#endif
162
163
#ifndef mpn_divexact_by2835x4
164
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
165
19.1k
#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
166
#else
167
#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
168
#endif
169
#endif
170
171
/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
172
   points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
173
   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
174
   degree 11 (or 10), given the 12 (rsp. 11) values:
175
176
     r0 = limit at infinity of f(x) / x^11,
177
     r1 = f(4),f(-4),
178
     r2 = f(2),f(-2),
179
     r3 = f(1),f(-1),
180
     r4 = f(1/4),f(-1/4),
181
     r5 = f(1/2),f(-1/2),
182
     r6 = f(0).
183
184
   All couples of the form f(n),f(-n) must be already mixed with
185
   toom_couple_handling(f(n),...,f(-n),...)
186
187
   The result is stored in {pp, spt + 7*n (or 6*n)}.
188
   At entry, r6 is stored at {pp, 2n},
189
   r4 is stored at {pp + 3n, 3n + 1}.
190
   r2 is stored at {pp + 7n, 3n + 1}.
191
   r0 is stored at {pp +11n, spt}.
192
193
   The other values are 3n+1 limbs each (with most significant limbs small).
194
195
   Negative intermediate results are stored two-complemented.
196
   Inputs are destroyed.
197
*/
198
199
void
200
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
201
      mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
202
19.1k
{
203
19.1k
  mp_limb_t cy;
204
19.1k
  mp_size_t n3;
205
19.1k
  mp_size_t n3p1;
206
19.1k
  n3 = 3 * n;
207
19.1k
  n3p1 = n3 + 1;
208
209
209k
#define   r4    (pp + n3)      /* 3n+1 */
210
38.5k
#define   r2    (pp + 7 * n)      /* 3n+1 */
211
19.1k
#define   r0    (pp +11 * n)      /* s+t <= 2*n */
212
213
  /******************************* interpolation *****************************/
214
19.1k
  if (half != 0) {
215
392
    cy = mpn_sub_n (r3, r3, r0, spt);
216
392
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
217
218
392
    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
219
392
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
220
392
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
221
222
392
    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
223
392
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
224
392
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
225
19.1k
  };
226
227
19.1k
  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
228
19.1k
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
229
230
#if HAVE_NATIVE_mpn_add_n_sub_n
231
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
232
#else
233
19.1k
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
234
19.1k
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
235
19.1k
  MP_PTR_SWAP(r1, wsi);
236
19.1k
#endif
237
238
19.1k
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
239
19.1k
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
240
241
#if HAVE_NATIVE_mpn_add_n_sub_n
242
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
243
#else
244
19.1k
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
245
19.1k
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
246
19.1k
  MP_PTR_SWAP(r5, wsi);
247
19.1k
#endif
248
249
19.1k
  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
250
251
19.1k
#if AORSMUL_FASTER_AORS_AORSLSH
252
19.1k
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
253
#else
254
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
255
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
256
#endif
257
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
258
19.1k
  mpn_divexact_by2835x4(r4, r4, n3p1);
259
19.1k
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
260
18.8k
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
261
262
19.1k
#if AORSMUL_FASTER_2AORSLSH
263
19.1k
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
264
#else
265
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
266
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
267
#endif
268
19.1k
  mpn_divexact_by255(r5, r5, n3p1);
269
270
19.1k
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
271
272
19.1k
#if AORSMUL_FASTER_3AORSLSH
273
19.1k
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
274
#else
275
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
276
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
277
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
278
#endif
279
19.1k
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
280
19.1k
  mpn_divexact_by42525(r1, r1, n3p1);
281
282
19.1k
#if AORSMUL_FASTER_AORS_2AORSLSH
283
19.1k
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
284
#else
285
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
286
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
287
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
288
#endif
289
19.1k
  mpn_divexact_by9x4(r2, r2, n3p1);
290
291
19.1k
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
292
293
19.1k
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
294
19.1k
  mpn_rsh1sub_n (r4, r2, r4, n3p1);
295
19.1k
  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
296
#else
297
  mpn_sub_n (r4, r2, r4, n3p1);
298
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
299
#endif
300
19.1k
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
301
302
19.1k
#ifdef HAVE_NATIVE_mpn_rsh1add_n
303
19.1k
  mpn_rsh1add_n (r5, r5, r1, n3p1);
304
19.1k
  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
305
#else
306
  mpn_add_n (r5, r5, r1, n3p1);
307
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
308
#endif
309
310
  /* last interpolation steps... */
311
19.1k
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
312
19.1k
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
313
  /* ... could be mixed with recomposition
314
  ||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
315
  */
316
317
  /***************************** recomposition *******************************/
318
  /*
319
    pp[] prior to operations:
320
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
321
322
    summation scheme for remaining operations:
323
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
324
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
325
  ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
326
  */
327
328
19.1k
  cy = mpn_add_n (pp + n, pp + n, r5, n);
329
19.1k
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
330
19.1k
#if HAVE_NATIVE_mpn_add_nc
331
19.1k
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
332
#else
333
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
334
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
335
#endif
336
19.1k
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
337
338
19.1k
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
339
19.1k
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
340
19.1k
#if HAVE_NATIVE_mpn_add_nc
341
19.1k
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
342
#else
343
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
344
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
345
#endif
346
19.1k
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
347
348
19.1k
  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
349
19.1k
  if (half) {
350
392
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
351
392
#if HAVE_NATIVE_mpn_add_nc
352
392
    if (LIKELY (spt > n)) {
353
392
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
354
392
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
355
392
    } else {
356
0
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
357
0
    }
358
#else
359
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
360
    if (LIKELY (spt > n)) {
361
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
362
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
363
    } else {
364
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
365
    }
366
#endif
367
18.7k
  } else {
368
18.7k
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
369
18.7k
  }
370
371
19.1k
#undef   r0
372
19.1k
#undef   r2
373
19.1k
#undef   r4
374
19.1k
}