Coverage Report

Created: 2025-11-16 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gmp/mpn/sqrlo.c
Line
Count
Source
1
/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
2
   of the result.
3
4
   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5
6
   THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
7
   FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
8
   THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10
Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
11
12
This file is part of the GNU MP Library.
13
14
The GNU MP Library is free software; you can redistribute it and/or modify
15
it under the terms of either:
16
17
  * the GNU Lesser General Public License as published by the Free
18
    Software Foundation; either version 3 of the License, or (at your
19
    option) any later version.
20
21
or
22
23
  * the GNU General Public License as published by the Free Software
24
    Foundation; either version 2 of the License, or (at your option) any
25
    later version.
26
27
or both in parallel, as here.
28
29
The GNU MP Library is distributed in the hope that it will be useful, but
30
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32
for more details.
33
34
You should have received copies of the GNU General Public License and the
35
GNU Lesser General Public License along with the GNU MP Library.  If not,
36
see https://www.gnu.org/licenses/.  */
37
38
#include "gmp-impl.h"
39
40
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
41
#define MAYBE_range_basecase 1
42
#define MAYBE_range_toom22   1
43
#else
44
#define MAYBE_range_basecase                                           \
45
375k
  ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
46
#define MAYBE_range_toom22                                             \
47
350k
  ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
48
#endif
49
50
/*  THINK: The DC strategy uses different constants in different Toom's
51
   ranges. Something smoother?
52
*/
53
54
/*
55
  Compute the least significant half of the product {xy,n}*{yp,n}, or
56
  formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
57
58
  Above the given threshold, the Divide and Conquer strategy is used.
59
  The operand is split in two, and a full square plus a mullo
60
  is used to obtain the final result. The more natural strategy is to
61
  split in two halves, but this is far from optimal when a
62
  sub-quadratic multiplication is used.
63
64
  Mulders suggests an unbalanced split in favour of the full product,
65
  split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
66
67
  To compute the value of a, we assume that the cost of mullo for a
68
  given size ML(n) is a fraction of the cost of a full product with
69
  same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
70
  then we can write:
71
72
  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
73
74
  Given a value for e, want to minimise the value of k, i.e. the
75
  function k=(1-a)^e/(1-2*a^e).
76
77
  With e=2, the exponent for schoolbook multiplication, the minimum is
78
  given by the values a=1-a=1/2.
79
80
  With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
81
  Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
82
83
  Other possible approximations follow:
84
  e=log(5)/log(3) [Toom-3] -> a ~= 9/40
85
  e=log(7)/log(4) [Toom-4] -> a ~= 7/39
86
  e=log(11)/log(6) [Toom-6] -> a ~= 1/8
87
  e=log(15)/log(8) [Toom-8] -> a ~= 1/10
88
89
  The values above where obtained with the following trivial commands
90
  in the gp-pari shell:
91
92
fun(e,a)=(1-a)^e/(1-2*a^e)
93
mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
94
contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
95
contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
96
contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
97
contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
98
contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
99
100
  ,
101
  |\
102
  | \
103
  +----,
104
  |    |
105
  |    |
106
  |    |\
107
  |    | \
108
  +----+--`
109
  ^ n2 ^n1^
110
111
  For an actual implementation, the assumption that M(n)=n^e is
112
  incorrect, as a consequence also the assumption that ML(n)=k*M(n)
113
  with a constant k is wrong.
114
115
  But theory suggest us two things:
116
  - the best the multiplication product is (lower e), the more k
117
    approaches 1, and a approaches 0.
118
119
  - A value for a smaller than optimal is probably less bad than a
120
    bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
121
    value, and k(a)=0.808_ the mul/mullo speed ratio. We get
122
    k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
123
*/
124
125
static mp_size_t
126
mpn_sqrlo_itch (mp_size_t n)
127
375k
{
128
375k
  return 2*n;
129
375k
}
130
131
/*
132
    mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
133
    It accepts tp == rp.
134
*/
135
static void
136
mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
137
187k
{
138
187k
  mp_size_t n2, n1;
139
187k
  ASSERT (n >= 2);
140
187k
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
141
187k
  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
142
143
  /* Divide-and-conquer */
144
145
  /* We need fractional approximation of the value 0 < a <= 1/2
146
     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
147
  */
148
187k
  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
149
12.3k
    n1 = n >> 1;
150
175k
  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
151
136k
    n1 = n * 11 / (size_t) 36;  /* n1 ~= n*(1-.694...) */
152
38.3k
  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
153
38.3k
    n1 = n * 9 / (size_t) 40;  /* n1 ~= n*(1-.775...) */
154
0
  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
155
0
    n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
156
  /* n1 = n * 4 / (size_t) 31;  // n1 ~= n*(1-.871...) [TOOM66] */
157
0
  else
158
0
    n1 = n / (size_t) 10;   /* n1 ~= n*(1-.899...) [TOOM88] */
159
160
187k
  n2 = n - n1;
161
162
  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
163
164
  /* x0 ^ 2 */
165
187k
  mpn_sqr (tp, xp, n2);
166
187k
  MPN_COPY (rp, tp, n2);
167
168
  /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
169
187k
  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
170
0
    mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
171
187k
  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
172
187k
    mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
173
0
  else
174
0
    mpn_mullo_n (tp + n, xp + n2, xp, n1);
175
  /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
176
#if HAVE_NATIVE_mpn_addlsh1_n
177
  mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
178
#else
179
187k
  mpn_lshift (rp + n2, tp + n, n1, 1);
180
187k
  mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
181
187k
#endif
182
187k
}
183
184
/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
185
#define SQR_BASECASE_ALLOC \
186
 (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
187
188
/* FIXME: This function should accept a temporary area; dc_sqrlo
189
   accepts a pointer tp, and handle the case tp == rp, do the same here.
190
*/
191
192
void
193
mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
194
1.09M
{
195
1.09M
  ASSERT (n >= 1);
196
1.09M
  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
197
198
1.09M
  if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
199
0
    {
200
      /* FIXME: smarter criteria? */
201
0
#if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
202
      /* mullo computes as many products as sqr, but directly writes
203
   on the result area. */
204
0
      mpn_mullo_basecase (rp, xp, xp, n);
205
#else
206
      /* Allocate workspace of fixed size on stack: fast! */
207
      mp_limb_t tp[SQR_BASECASE_ALLOC];
208
      mpn_sqr_basecase (tp, xp, n);
209
      MPN_COPY (rp, tp, n);
210
#endif
211
0
    }
212
1.09M
  else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
213
909k
    {
214
909k
      mpn_sqrlo_basecase (rp, xp, n);
215
909k
    }
216
187k
  else
217
187k
    {
218
187k
      mp_ptr tp;
219
187k
      TMP_DECL;
220
187k
      TMP_MARK;
221
187k
      tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
222
187k
      if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
223
187k
  {
224
187k
    mpn_dc_sqrlo (rp, xp, n, tp);
225
187k
  }
226
0
      else
227
0
  {
228
    /* For really large operands, use plain mpn_mul_n but throw away upper n
229
       limbs of result.  */
230
0
#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
231
0
    mpn_fft_mul (tp, xp, n, xp, n);
232
#else
233
    mpn_sqr (tp, xp, n);
234
#endif
235
0
    MPN_COPY (rp, tp, n);
236
0
  }
237
187k
      TMP_FREE;
238
187k
    }
239
1.09M
}