Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpn/toom4_sqr.c
Line
Count
Source (jump to first uncovered line)
1
/* mpn_toom4_sqr -- Square {ap,an}.
2
3
   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
4
5
   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9
Copyright 2006-2010, 2013, 2021 Free Software Foundation, Inc.
10
11
This file is part of the GNU MP Library.
12
13
The GNU MP Library is free software; you can redistribute it and/or modify
14
it under the terms of either:
15
16
  * the GNU Lesser General Public License as published by the Free
17
    Software Foundation; either version 3 of the License, or (at your
18
    option) any later version.
19
20
or
21
22
  * the GNU General Public License as published by the Free Software
23
    Foundation; either version 2 of the License, or (at your option) any
24
    later version.
25
26
or both in parallel, as here.
27
28
The GNU MP Library is distributed in the hope that it will be useful, but
29
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31
for more details.
32
33
You should have received copies of the GNU General Public License and the
34
GNU Lesser General Public License along with the GNU MP Library.  If not,
35
see https://www.gnu.org/licenses/.  */
36
37
38
#include "gmp-impl.h"
39
40
/* Evaluate in: -2, -1, 0, +1/2, +1, +2, +inf
41
42
  <-s--><--n--><--n--><--n-->
43
   ____ ______ ______ ______
44
  |_a3_|___a2_|___a1_|___a0_|
45
46
  v0  =   a0             ^2 #    A(0)^2
47
  v1  = ( a0+ a1+ a2+ a3)^2 #    A(1)^2   ah  <= 3
48
  vm1 = ( a0- a1+ a2- a3)^2 #   A(-1)^2  |ah| <= 1
49
  v2  = ( a0+2a1+4a2+8a3)^2 #    A(2)^2   ah  <= 14
50
  vm2 = ( a0-2a1+4a2-8a3)^2 #   A(-2)^2  -9<=ah<=4
51
  vh  = (8a0+4a1+2a2+ a3)^2 #  A(1/2)^2   ah  <= 14
52
  vinf=               a3 ^2 #  A(inf)^2
53
*/
54
55
#if TUNE_PROGRAM_BUILD
56
#define MAYBE_sqr_basecase 1
57
#define MAYBE_sqr_toom2   1
58
#define MAYBE_sqr_toom4   1
59
#else
60
#define MAYBE_sqr_basecase            \
61
2.73k
  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD)
62
#define MAYBE_sqr_toom2             \
63
2.73k
  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
64
#define MAYBE_sqr_toom4             \
65
0
  (SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
66
#endif
67
68
#define TOOM4_SQR_REC(p, a, n, ws)          \
69
1.36k
  do {                 \
70
1.36k
    if (MAYBE_sqr_basecase            \
71
1.36k
  && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))      \
72
1.36k
      mpn_sqr_basecase (p, a, n);         \
73
1.36k
    else if (MAYBE_sqr_toom2            \
74
1.36k
       && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))    \
75
1.36k
      mpn_toom2_sqr (p, a, n, ws);         \
76
1.36k
    else if (! MAYBE_sqr_toom4            \
77
0
       || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))    \
78
0
      mpn_toom3_sqr (p, a, n, ws);         \
79
0
    else                \
80
0
      mpn_toom4_sqr (p, a, n, ws);         \
81
1.36k
  } while (0)
82
83
void
84
mpn_toom4_sqr (mp_ptr pp,
85
         mp_srcptr ap, mp_size_t an,
86
         mp_ptr scratch)
87
195
{
88
195
  mp_size_t n, s;
89
195
  mp_limb_t cy;
90
91
195
#define a0  ap
92
195
#define a1  (ap + n)
93
195
#define a2  (ap + 2*n)
94
195
#define a3  (ap + 3*n)
95
96
195
  n = (an + 3) >> 2;
97
98
195
  s = an - 3 * n;
99
100
195
  ASSERT (0 < s && s <= n);
101
102
  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
103
   * following limb, so these must be computed in order, and we need a
104
   * one limb gap to tp. */
105
195
#define v0    pp        /* 2n */
106
195
#define v1    (pp + 2 * n)      /* 2n+1 */
107
195
#define vinf  (pp + 6 * n)      /* s+t */
108
195
#define v2    scratch        /* 2n+1 */
109
195
#define vm2   (scratch + 2 * n + 1)    /* 2n+1 */
110
195
#define vh    (scratch + 4 * n + 2)    /* 2n+1 */
111
390
#define vm1   (scratch + 6 * n + 3)    /* 2n+1 */
112
585
#define tp (scratch + 8*n + 5)
113
114
  /* No overlap with v1 */
115
1.74k
#define apx   pp        /* n+1 */
116
390
#define amx   (pp + 4*n + 2)      /* n+1 */
117
118
  /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
119
     gives roughly 32 n/3 + log term. */
120
121
  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
122
195
  mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
123
124
195
  TOOM4_SQR_REC (v2, apx, n + 1, tp);  /* v2,  2n+1 limbs */
125
195
  TOOM4_SQR_REC (vm2, amx, n + 1, tp); /* vm2,  2n+1 limbs */
126
127
  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
128
195
#if HAVE_NATIVE_mpn_addlsh1_n
129
195
  cy = mpn_addlsh1_n (apx, a1, a0, n);
130
195
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
131
195
  if (s < n)
132
91
    {
133
91
      mp_limb_t cy2;
134
91
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
135
91
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
136
91
      MPN_INCR_U (apx + s, n+1-s, cy2);
137
91
    }
138
104
  else
139
104
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
140
#else
141
  cy = mpn_lshift (apx, a0, n, 1);
142
  cy += mpn_add_n (apx, apx, a1, n);
143
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
144
  cy += mpn_add_n (apx, apx, a2, n);
145
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
146
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
147
#endif
148
149
195
  ASSERT (apx[n] < 15);
150
151
195
  TOOM4_SQR_REC (vh, apx, n + 1, tp);  /* vh,  2n+1 limbs */
152
153
  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
154
195
  mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
155
156
195
  TOOM4_SQR_REC (v1, apx, n + 1, tp);  /* v1,  2n+1 limbs */
157
195
  vm1 [2 * n] = 0;
158
195
  TOOM4_SQR_REC (vm1, amx, n + amx[n], tp);  /* vm1,  2n+1 limbs */
159
160
195
  TOOM4_SQR_REC (v0, a0, n, tp);
161
195
  TOOM4_SQR_REC (vinf, a3, s, tp); /* vinf, 2s limbs */
162
163
195
  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) 0, vm2, vm1, v2, vh, 2*s, tp);
164
195
}