Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpn/toom_interpolate_16pts.c

Per-line execution counts appear to the left of each source line.  Lines
without a count were not instrumented (comments, blank lines, and code
excluded by the preprocessor); a count of 0 marks instrumented lines that
never executed.

 Count | Source
-------+-----------------------------------------------------------------------
       | /* Interpolation for the algorithm Toom-Cook 8.5-way.
       |
       |    Contributed to the GNU project by Marco Bodrato.
       |
       |    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       |    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       |    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       |
       | Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
       |
       | This file is part of the GNU MP Library.
       |
       | The GNU MP Library is free software; you can redistribute it and/or modify
       | it under the terms of either:
       |
       |   * the GNU Lesser General Public License as published by the Free
       |     Software Foundation; either version 3 of the License, or (at your
       |     option) any later version.
       |
       | or
       |
       |   * the GNU General Public License as published by the Free Software
       |     Foundation; either version 2 of the License, or (at your option) any
       |     later version.
       |
       | or both in parallel, as here.
       |
       | The GNU MP Library is distributed in the hope that it will be useful, but
       | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
       | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       | for more details.
       |
       | You should have received copies of the GNU General Public License and the
       | GNU Lesser General Public License along with the GNU MP Library.  If not,
       | see https://www.gnu.org/licenses/.  */
       |
       |
       | #include "gmp-impl.h"
       |
       |
       | #if GMP_NUMB_BITS < 29
       | #error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
       | #endif
       |
       | #if GMP_NUMB_BITS < 28
       | #error Not implemented: divexact_by188513325 and _by182712915 will not work.
       | #endif
       |
       |
       | /* FIXME: tuneup should decide the best variant */
       | #ifndef AORSMUL_FASTER_AORS_AORSLSH
       | #define AORSMUL_FASTER_AORS_AORSLSH 1
       | #endif
       | #ifndef AORSMUL_FASTER_AORS_2AORSLSH
       | #define AORSMUL_FASTER_AORS_2AORSLSH 1
       | #endif
       | #ifndef AORSMUL_FASTER_2AORSLSH
       | #define AORSMUL_FASTER_2AORSLSH 1
       | #endif
       | #ifndef AORSMUL_FASTER_3AORSLSH
       | #define AORSMUL_FASTER_3AORSLSH 1
       | #endif
       |
       |
       | #if HAVE_NATIVE_mpn_sublsh_n
       | #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
       | #else
       | static mp_limb_t
       | DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
  244k | {
       | #if USE_MUL_1 && 0
       |   return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
       | #else
  244k |   mp_limb_t __cy;
  244k |   __cy = mpn_lshift(ws,src,n,s);
  244k |   return    __cy + mpn_sub_n(dst,dst,ws,n);
  244k | #endif
  244k | }
       | #endif
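
The generic fallback above computes {dst,n} -= ({src,n} << s) and returns the
bits shifted out of the top limb plus the subtraction borrow.  The following
standalone model shows the same limb-level bookkeeping.  It is only a sketch:
it assumes 64-bit limbs, no nail bits, and 0 < s < 64, and the name
sublsh_n_model is invented here rather than taken from GMP.

    #include <stdint.h>

    /* Sketch: dst[0..n-1] -= (src[0..n-1] << s), for 0 < s < 64.
       Returns the bits shifted out of the top limb plus the final
       borrow, the same quantity the GMP fallback returns. */
    static uint64_t
    sublsh_n_model (uint64_t *dst, const uint64_t *src, long n, unsigned s)
    {
      uint64_t shift_in = 0, borrow = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t w = (src[i] << s) | shift_in;  /* limb i of src << s */
          shift_in = src[i] >> (64 - s);
          uint64_t d = dst[i] - w;
          uint64_t b = dst[i] < w;                /* borrow out of limb i */
          dst[i] = d - borrow;
          borrow = b + (d < borrow);
        }
      return shift_in + borrow;
    }
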
       |
       | #if HAVE_NATIVE_mpn_addlsh_n
       | #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
       | #else
       | #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
       | static mp_limb_t
       | DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
       | {
       | #if USE_MUL_1 && 0
       |   return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
       | #else
       |   mp_limb_t __cy;
       |   __cy = mpn_lshift(ws,src,n,s);
       |   return    __cy + mpn_add_n(dst,dst,ws,n);
       | #endif
       | }
       | #endif
       | #endif
       |
       | #if HAVE_NATIVE_mpn_subrsh
       | #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
       | #else
       | /* FIXME: This is not a correct definition, it assumes no carry */
 81.6k | #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)       \
 81.6k | do {                 \
 81.6k |   mp_limb_t __cy;             \
 81.6k |   MPN_DECR_U (dst, nd, src[0] >> s);          \
 81.6k |   __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);  \
 81.6k |   MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);       \
 81.6k | } while (0)
       | #endif
       |
       |
       | #if GMP_NUMB_BITS < 43
       | #define BIT_CORRECTION 1
       | #define CORRECTION_BITS GMP_NUMB_BITS
       | #else
 27.2k | #define BIT_CORRECTION 0
 27.2k | #define CORRECTION_BITS 0
       | #endif
       |
       | #define BINVERT_9 \
 27.0k |   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
       |
       | #define BINVERT_255 \
 27.0k |   (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
       |
       |   /* FIXME: find some more general expressions for inverses */
       | #if GMP_LIMB_BITS == 32
       | #define BINVERT_2835  (GMP_NUMB_MASK &    CNST_LIMB(0x53E3771B))
       | #define BINVERT_42525 (GMP_NUMB_MASK &    CNST_LIMB(0x9F314C35))
       | #define BINVERT_182712915 (GMP_NUMB_MASK &  CNST_LIMB(0x550659DB))
       | #define BINVERT_188513325 (GMP_NUMB_MASK &  CNST_LIMB(0xFBC333A5))
       | #define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25))
       | #define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B))
       | #if GMP_NAIL_BITS == 0
       | #define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
       | #define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
       | #else /* GMP_NAIL_BITS != 0 */
       | #define BINVERT_255x182712915H \
       |   (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
       | #define BINVERT_255x188513325H \
       |   (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
       | #endif
       | #else
       | #if GMP_LIMB_BITS == 64
 27.0k | #define BINVERT_2835  (GMP_NUMB_MASK &  CNST_LIMB(0x938CC70553E3771B))
 27.0k | #define BINVERT_42525 (GMP_NUMB_MASK &  CNST_LIMB(0xE7B40D449F314C35))
 27.0k | #define BINVERT_255x182712915  (GMP_NUMB_MASK &  CNST_LIMB(0x1B649A076FC4CB25))
 27.0k | #define BINVERT_255x188513325  (GMP_NUMB_MASK &  CNST_LIMB(0x06DB993A6864275B))
       | #endif
       | #endif
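
Each BINVERT_* constant is the inverse of the corresponding odd divisor modulo
2^GMP_NUMB_BITS; multiplying by it replaces an exact division.  Such constants
can be reproduced (or sanity-checked) with Hensel/Newton lifting, where every
iteration doubles the number of correct low bits.  A sketch for 64-bit limbs
with no nails follows; binvert64 is a name invented here, and the two checked
values are the 64-bit constants defined above.

    #include <assert.h>
    #include <stdint.h>

    /* Inverse of odd d modulo 2^64 by Newton/Hensel lifting. */
    static uint64_t
    binvert64 (uint64_t d)
    {
      uint64_t x = d;              /* correct to 3 bits: d*d == 1 mod 8 for odd d */
      for (int i = 0; i < 5; i++)  /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        x *= 2 - d * x;
      return x;
    }

    int
    main (void)
    {
      assert (binvert64 (2835)  == 0x938CC70553E3771Bull);  /* BINVERT_2835 */
      assert (binvert64 (42525) == 0xE7B40D449F314C35ull);  /* BINVERT_42525 */
      assert (binvert64 (2835) * 2835 == 1);  /* defining property, mod 2^64 */
      return 0;
    }
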
       |
       | #ifndef mpn_divexact_by255
       | #if GMP_NUMB_BITS % 8 == 0
       | #define mpn_divexact_by255(dst,src,size) \
       |   (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
       | #else
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
       | #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
       | #else
       | #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
       | #endif
       | #endif
       | #endif
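
mpn_bdiv_dbm1 exploits 255 = 2^8 - 1 directly, while the mpn_pi1_bdiv_q_1 and
mpn_divexact_1 forms use the general trick behind all the helpers below: when
the division is known to be exact, the quotient can be generated low limb
first with one multiplication by the precomputed inverse per limb.  A
standalone model of that loop (a sketch assuming 64-bit limbs and a compiler
with unsigned __int128; divexact_1_model is a hypothetical name, not the GMP
routine):

    #include <stdint.h>

    /* Exact division of {src,n} by odd d, low limb first.
       dinv must be d^-1 mod 2^64 (e.g. from binvert64 above).
       Only valid when d divides the input exactly. */
    static void
    divexact_1_model (uint64_t *dst, const uint64_t *src, long n,
                      uint64_t d, uint64_t dinv)
    {
      uint64_t borrow = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t s = src[i] - borrow;
          uint64_t b = src[i] < borrow;       /* borrow out of this limb */
          uint64_t q = s * dinv;              /* quotient limb, mod 2^64 */
          dst[i] = q;
          /* The high half of q*d is what this quotient limb consumes
             from the next input limb. */
          borrow = (uint64_t) (((unsigned __int128) q * d) >> 64) + b;
        }
    }

The x4/x16/x64 variants below pass a final shift argument (2, 4, 6) to
mpn_pi1_bdiv_q_1 so that, as the macro arguments suggest, the power-of-two
factor of divisors like 255*4 or 2835*64 is removed by a right shift folded
into the same pass.
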
       |
       | #ifndef mpn_divexact_by255x4
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 27.0k | #define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
       | #else
       | #define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
       | #endif
       | #endif
       |
       | #ifndef mpn_divexact_by9x16
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 27.0k | #define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
       | #else
       | #define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
       | #endif
       | #endif
       |
       | #ifndef mpn_divexact_by42525x16
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
 27.0k | #define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
       | #else
       | #define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
       | #endif
       | #endif
       |
       | #ifndef mpn_divexact_by2835x64
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
 27.0k | #define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
       | #else
       | #define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
       | #endif
       | #endif
       |
       | #ifndef  mpn_divexact_by255x182712915
       | #if GMP_NUMB_BITS < 36
       | #if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
       | /* FIXME: use mpn_bdiv_q_2_pi2 */
       | #endif
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
       | #define mpn_divexact_by255x182712915(dst,src,size)        \
       |   do {                    \
       |     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);  \
       |     mpn_divexact_by255(dst,dst,size);           \
       |   } while(0)
       | #else
       | #define mpn_divexact_by255x182712915(dst,src,size)  \
       |   do {              \
       |     mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));  \
       |     mpn_divexact_by255(dst,dst,size);     \
       |   } while(0)
       | #endif
       | #else /* GMP_NUMB_BITS > 35 */
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
       | #define mpn_divexact_by255x182712915(dst,src,size) \
 27.0k |   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
       | #else
       | #define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
       | #endif
       | #endif /* GMP_NUMB_BITS >?< 36 */
       | #endif
       |
       | #ifndef  mpn_divexact_by255x188513325
       | #if GMP_NUMB_BITS < 36
       | #if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
       | /* FIXME: use mpn_bdiv_q_1_pi2 */
       | #endif
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
       | #define mpn_divexact_by255x188513325(dst,src,size)      \
       |   do {                  \
       |     mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);  \
       |     mpn_divexact_by255(dst,dst,size);         \
       |   } while(0)
       | #else
       | #define mpn_divexact_by255x188513325(dst,src,size)  \
       |   do {              \
       |     mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));  \
       |     mpn_divexact_by255(dst,dst,size);     \
       |   } while(0)
       | #endif
       | #else /* GMP_NUMB_BITS > 35 */
       | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
       | #define mpn_divexact_by255x188513325(dst,src,size) \
 27.0k |   mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
       | #else
       | #define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
       | #endif
       | #endif /* GMP_NUMB_BITS >?< 36 */
       | #endif
       |
       | /* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
       |    points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
       |    +-1/8, 0. More precisely, we want to compute
       |    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
       |    14), given the 16 (resp. 15) values:
       |
       |      r0 = limit at infinity of f(x) / x^15,
       |      r1 = f(8),f(-8),
       |      r2 = f(4),f(-4),
       |      r3 = f(2),f(-2),
       |      r4 = f(1),f(-1),
       |      r5 = f(1/4),f(-1/4),
       |      r6 = f(1/2),f(-1/2),
       |      r7 = f(1/8),f(-1/8),
       |      r8 = f(0).
       |
       |    All pairs of the form f(n),f(-n) must already have been mixed with
       |    toom_couple_handling(f(n),...,f(-n),...)
       |
       |    The result is stored in {pp, spt + 7*n (or 8*n)}.
       |    At entry, r8 is stored at {pp, 2n},
       |    r6 is stored at {pp + 3n, 3n + 1},
       |    r4 is stored at {pp + 7n, 3n + 1},
       |    r2 is stored at {pp +11n, 3n + 1},
       |    r0 is stored at {pp +15n, spt}.
       |
       |    The other values are 3n+1 limbs each (with most significant limbs small).
       |
       |    Negative intermediate results are stored in two's complement.
       |    Inputs are destroyed.
       | */
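
As a reading aid (not part of the source): writing f(x) = a_0 + a_1 x + ... +
a_15 x^15, each mixed pair of evaluations at x and -x gives access to the even
and odd coefficient halves separately, since

    \frac{f(x) + f(-x)}{2} = \sum_{i\ \mathrm{even}} a_i x^i,
    \qquad
    \frac{f(x) - f(-x)}{2} = \sum_{i\ \mathrm{odd}} a_i x^i,

and once all coefficients a_i are recovered, the recomposition phase evaluates
f at the radix point:

    f\bigl(2^{\mathrm{GMP\_NUMB\_BITS}\cdot n}\bigr)
      = \sum_{i=0}^{15} a_i \, 2^{\mathrm{GMP\_NUMB\_BITS}\cdot n\, i},

which is why each recovered coefficient is simply added into pp at an offset
of i*n limbs.
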
       |
       | void
       | mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
       |       mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
 27.0k | {
 27.0k |   mp_limb_t cy;
 27.0k |   mp_size_t n3;
 27.0k |   mp_size_t n3p1;
 27.0k |   n3 = 3 * n;
 27.0k |   n3p1 = n3 + 1;
       |
  325k | #define   r6    (pp + n3)      /* 3n+1 */
  108k | #define   r4    (pp + 7 * n)      /* 3n+1 */
 81.2k | #define   r2    (pp +11 * n)      /* 3n+1 */
 27.0k | #define   r0    (pp +15 * n)      /* s+t <= 2*n */
       |
 27.0k |   ASSERT( spt <= 2 * n );
       |   /******************************* interpolation *****************************/
 27.0k |   if( half != 0) {
   192 |     cy = mpn_sub_n (r4, r4, r0, spt);
   192 |     MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
       |
   192 |     cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
   192 |     MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
   192 |     DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
       |
   192 |     cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
   192 |     MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
   192 |     DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
       |
   192 |     cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
       | #if BIT_CORRECTION
       |     cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
       |         n3p1 - spt - BIT_CORRECTION, cy);
       |     ASSERT (BIT_CORRECTION > 0 || cy == 0);
       |     /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
       |     cy = r7[n3p1];
       |     r7[n3p1] = 0x80;
       | #else
   192 |     MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
   192 | #endif
   192 |     DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
       | #if BIT_CORRECTION
       |     /* FIXME: assumes r7[n3p1] is writable. */
       |     ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
       |     r7[n3p1] = cy;
       | #endif
 27.0k |   };
       |
 27.0k |   r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
 27.0k |   DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
       |
       | #if HAVE_NATIVE_mpn_add_n_sub_n
       |   mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
       | #else
 27.0k |   mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
 27.0k |   ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
 27.0k |   MP_PTR_SWAP(r5, wsi);
 27.0k | #endif
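
Here (and twice more below) the mixed values are split into even and odd parts
by forming a sum and a difference of two (3n+1)-limb operands.  Where
mpn_add_n_sub_n is native it does both in one pass; the fallback needs the
scratch wsi plus a pointer swap because the sum and difference overwrite their
own inputs.  A standalone model of the combined operation (a sketch: 64-bit
limbs, invented name; the result is packed here as 2*carry + borrow, but check
gmp-impl.h for the native routine's exact return convention):

    #include <stdint.h>

    /* Sketch: sp[] = ap[] + bp[] and dp[] = ap[] - bp[] in one pass.
       Each input limb is read before either output limb is written,
       so full overlap (as in the calls here) is fine in this model. */
    static uint64_t
    add_n_sub_n_model (uint64_t *sp, uint64_t *dp,
                       const uint64_t *ap, const uint64_t *bp, long n)
    {
      uint64_t cy = 0, bw = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t ai = ap[i], bi = bp[i];
          uint64_t s = ai + bi;
          uint64_t c1 = s < ai;
          sp[i] = s + cy;
          cy = c1 + (sp[i] < s);       /* carry out of the addition */
          uint64_t d = ai - bi;
          uint64_t b1 = ai < bi;
          dp[i] = d - bw;
          bw = b1 + (d < bw);          /* borrow out of the subtraction */
        }
      return 2 * cy + bw;
    }
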
       |
 27.0k |   r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
 27.0k |   DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
       |
       | #if HAVE_NATIVE_mpn_add_n_sub_n
       |   mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
       | #else
 27.0k |   ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
 27.0k |   mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
 27.0k |   MP_PTR_SWAP(r3, wsi);
 27.0k | #endif
       |
 27.0k |   cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
       | #if BIT_CORRECTION
       |   MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
       |   cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
       |   cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
       |   ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
       | #else
 27.0k |   r7[n3] -= cy;
 27.0k |   DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
 27.0k | #endif
       |
       | #if HAVE_NATIVE_mpn_add_n_sub_n
       |   mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
       | #else
 27.0k |   mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
 27.0k |   mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
 27.0k |   MP_PTR_SWAP(r7, wsi);
 27.0k | #endif
       |
 27.0k |   r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
       |
 27.0k | #if AORSMUL_FASTER_2AORSLSH
 27.0k |   mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
       | #else
       |   DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
       |   DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
       | #endif
       |
 27.0k |   mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
 27.0k | #if AORSMUL_FASTER_3AORSLSH
 27.0k |   mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
       | #else
       |   DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
       |   DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
       |   DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
       | #endif
 27.0k |   mpn_divexact_by255x188513325(r7, r7, n3p1);
       |
 27.0k |   mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
       |   /* A division by 2835x64 follows. Warning: the operand can be negative! */
 27.0k |   mpn_divexact_by2835x64(r5, r5, n3p1);
 27.0k |   if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
 26.1k |     r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
       |
 27.0k | #if AORSMUL_FASTER_AORS_AORSLSH
 27.0k |   mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
       | #else
       |   mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
       |   DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
       | #endif
 27.0k | #if AORSMUL_FASTER_2AORSLSH
 27.0k |   mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
       | #else
       |   DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
       |   DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
       | #endif
       |   /* A division by 255x4 follows. Warning: the operand can be negative! */
 27.0k |   mpn_divexact_by255x4(r6, r6, n3p1);
 27.0k |   if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
 1.21k |     r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
       |
 27.0k |   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
       |
 27.0k |   ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
       |
       |   /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry! */
 27.0k |   DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
 27.0k |   mpn_submul_1 (r1, r2, n3p1, 1428);
 27.0k |   mpn_submul_1 (r1, r3, n3p1, 112896);
 27.0k |   mpn_divexact_by255x182712915(r1, r1, n3p1);
       |
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
 27.0k |   mpn_divexact_by42525x16(r2, r2, n3p1);
       |
 27.0k | #if AORSMUL_FASTER_AORS_2AORSLSH
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
       | #else
       |   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
       |   ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
       |   ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
 27.0k |   mpn_divexact_by9x16(r3, r3, n3p1);
       |
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
       |
 27.0k | #ifdef HAVE_NATIVE_mpn_rsh1add_n
 27.0k |   mpn_rsh1add_n (r6, r2, r6, n3p1);
 27.0k |   r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
       | #else
       |   mpn_add_n (r6, r2, r6, n3p1);
       |   ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
       |
 27.0k | #ifdef HAVE_NATIVE_mpn_rsh1sub_n
 27.0k |   mpn_rsh1sub_n (r5, r3, r5, n3p1);
 27.0k |   r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
       | #else
       |   mpn_sub_n (r5, r3, r5, n3p1);
       |   ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
       |
 27.0k | #ifdef HAVE_NATIVE_mpn_rsh1add_n
 27.0k |   mpn_rsh1add_n (r7, r1, r7, n3p1);
 27.0k |   r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
       | #else
       |   mpn_add_n (r7, r1, r7, n3p1);
       |   ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
       | #endif
 27.0k |   ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
       |
       |   /* last interpolation steps... */
       |   /* ... could be mixed with recomposition
       |   ||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
       |   */
       |
       |   /***************************** recomposition *******************************/
       |   /*
       |     pp[] prior to operations:
       |     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
       |
       |     summation scheme for remaining operations:
       |     |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
       |     |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
       |   ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
       |   */
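
The recomposition that follows adds each of r7, r5, r3, r1 (3n+1 limbs each)
into the result at offsets n, 5n, 9n and 13n limbs respectively, letting each
carry ripple into the limbs above.  One step of the scheme can be modeled as
below (a sketch: 64-bit limbs, invented name, ignoring the half/spt
special-casing of the final block):

    #include <stdint.h>

    /* Sketch: pp[off .. off+rn-1] += r[0 .. rn-1], then let the final
       carry ripple upward through pp, as the mpn_add_n / mpn_add_1 /
       MPN_INCR_U sequences below do piecewise. */
    static void
    recompose_step_model (uint64_t *pp, long pp_len,
                          const uint64_t *r, long rn, long off)
    {
      uint64_t cy = 0;
      for (long i = 0; i < rn; i++)
        {
          uint64_t s = pp[off + i] + r[i];
          uint64_t c1 = s < r[i];
          pp[off + i] = s + cy;
          cy = c1 + (pp[off + i] < s);
        }
      for (long i = off + rn; cy != 0 && i < pp_len; i++)  /* MPN_INCR_U part */
        cy = (++pp[i] == 0);
    }
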
       |
 27.0k |   cy = mpn_add_n (pp + n, pp + n, r7, n);
 27.0k |   cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
 27.0k | #if HAVE_NATIVE_mpn_add_nc
 27.0k |   cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
       | #else
       |   MPN_INCR_U (r7 + 2 * n, n + 1, cy);
       |   cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
       | #endif
 27.0k |   MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
       |
 27.0k |   pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
 27.0k |   cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
 27.0k | #if HAVE_NATIVE_mpn_add_nc
 27.0k |   cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
       | #else
       |   MPN_INCR_U (r5 + 2 * n, n + 1, cy);
       |   cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
       | #endif
 27.0k |   MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
       |
 27.0k |   pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
 27.0k |   cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
 27.0k | #if HAVE_NATIVE_mpn_add_nc
 27.0k |   cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
       | #else
       |   MPN_INCR_U (r3 + 2 * n, n + 1, cy);
       |   cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
       | #endif
 27.0k |   MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
       |
 27.0k |   pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
 27.0k |   if ( half ) {
   192 |     cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
   192 | #if HAVE_NATIVE_mpn_add_nc
   192 |     if(LIKELY(spt > n)) {
   192 |       cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
   192 |       MPN_INCR_U (pp + 16 * n, spt - n, cy);
   192 |     } else {
     0 |       ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
     0 |     }
       | #else
       |     MPN_INCR_U (r1 + 2 * n, n + 1, cy);
       |     if(LIKELY(spt > n)) {
       |       cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
       |       MPN_INCR_U (pp + 16 * n, spt - n, cy);
       |     } else {
       |       ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
       |     }
       | #endif
 26.8k |   } else {
 26.8k |     ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
 26.8k |   }
       |
 27.0k | #undef   r0
 27.0k | #undef   r2
 27.0k | #undef   r4
 27.0k | #undef   r6
 27.0k | }
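
For context: this interpolation routine is internal and is reached through
GMP's higher Toom multiplication code (the counts above come from whatever
workload produced this report).  From user code the path is exercised only
indirectly, e.g. through mpz_mul once operands are large enough that GMP's
tuned thresholds select the high-degree Toom algorithms.  The sizes in this
sketch are a guess, not a guarantee that this exact path is taken:

    #include <gmp.h>
    #include <stdio.h>

    int
    main (void)
    {
      mpz_t a, b, p;
      mpz_inits (a, b, p, NULL);
      /* Operands of a few hundred thousand bits; whether Toom-8.5 (and thus
         mpn_toom_interpolate_16pts) runs depends on the tuned thresholds. */
      mpz_ui_pow_ui (a, 7, 100000);
      mpz_ui_pow_ui (b, 11, 90000);
      mpz_mul (p, a, b);
      printf ("product has %lu bits\n",
              (unsigned long) mpz_sizeinbase (p, 2));
      mpz_clears (a, b, p, NULL);
      return 0;
    }
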