Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs |
2 | | of the result. |
3 | | |
4 | | Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. |
5 | | |
6 | | THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS |
7 | | FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED |
8 | | THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
9 | | |
10 | | Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc. |
11 | | |
12 | | This file is part of the GNU MP Library. |
13 | | |
14 | | The GNU MP Library is free software; you can redistribute it and/or modify |
15 | | it under the terms of either: |
16 | | |
17 | | * the GNU Lesser General Public License as published by the Free |
18 | | Software Foundation; either version 3 of the License, or (at your |
19 | | option) any later version. |
20 | | |
21 | | or |
22 | | |
23 | | * the GNU General Public License as published by the Free Software |
24 | | Foundation; either version 2 of the License, or (at your option) any |
25 | | later version. |
26 | | |
27 | | or both in parallel, as here. |
28 | | |
29 | | The GNU MP Library is distributed in the hope that it will be useful, but |
30 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
31 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
32 | | for more details. |
33 | | |
34 | | You should have received copies of the GNU General Public License and the |
35 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
36 | | see https://www.gnu.org/licenses/. */ |
37 | | |
38 | | #include "gmp-impl.h" |
39 | | |
40 | | #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY |
41 | | #define MAYBE_range_basecase 1 |
42 | | #define MAYBE_range_toom22 1 |
43 | | #else |
44 | | #define MAYBE_range_basecase \ |
45 | 0 | ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11)) |
46 | | #define MAYBE_range_toom22 \ |
47 | 0 | ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) ) |
48 | | #endif |
49 | | |
50 | | /* THINK: The DC strategy uses different constants in different Toom's |
51 | | ranges. Something smoother? |
52 | | */ |
53 | | |
54 | | /* |
55 | | Compute the least significant half of the product {xy,n}*{yp,n}, or |
56 | | formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). |
57 | | |
58 | | Above the given threshold, the Divide and Conquer strategy is used. |
59 | | The operand is split in two, and a full square plus a mullo |
60 | | is used to obtain the final result. The more natural strategy is to |
61 | | split in two halves, but this is far from optimal when a |
62 | | sub-quadratic multiplication is used. |
63 | | |
64 | | Mulders suggests an unbalanced split in favour of the full product, |
65 | | split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. |
66 | | |
67 | | To compute the value of a, we assume that the cost of mullo for a |
68 | | given size ML(n) is a fraction of the cost of a full product with |
69 | | same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; |
70 | | then we can write: |
71 | | |
72 | | ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e |
73 | | |
74 | | Given a value for e, want to minimise the value of k, i.e. the |
75 | | function k=(1-a)^e/(1-2*a^e). |
76 | | |
77 | | With e=2, the exponent for schoolbook multiplication, the minimum is |
78 | | given by the values a=1-a=1/2. |
79 | | |
80 | | With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), |
81 | | Mulders compute (1-a) = 0.694... and we approximate a with 11/36. |
82 | | |
83 | | Other possible approximations follow: |
84 | | e=log(5)/log(3) [Toom-3] -> a ~= 9/40 |
85 | | e=log(7)/log(4) [Toom-4] -> a ~= 7/39 |
86 | | e=log(11)/log(6) [Toom-6] -> a ~= 1/8 |
87 | | e=log(15)/log(8) [Toom-8] -> a ~= 1/10 |
88 | | |
89 | | The values above where obtained with the following trivial commands |
90 | | in the gp-pari shell: |
91 | | |
92 | | fun(e,a)=(1-a)^e/(1-2*a^e) |
93 | | mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))} |
94 | | contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5)) |
95 | | contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5)) |
96 | | contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5)) |
97 | | contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3)) |
98 | | contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3)) |
99 | | |
100 | | , |
101 | | |\ |
102 | | | \ |
103 | | +----, |
104 | | | | |
105 | | | | |
106 | | | |\ |
107 | | | | \ |
108 | | +----+--` |
109 | | ^ n2 ^n1^ |
110 | | |
111 | | For an actual implementation, the assumption that M(n)=n^e is |
112 | | incorrect, as a consequence also the assumption that ML(n)=k*M(n) |
113 | | with a constant k is wrong. |
114 | | |
115 | | But theory suggest us two things: |
116 | | - the best the multiplication product is (lower e), the more k |
117 | | approaches 1, and a approaches 0. |
118 | | |
119 | | - A value for a smaller than optimal is probably less bad than a |
120 | | bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal |
121 | | value, and k(a)=0.808_ the mul/mullo speed ratio. We get |
122 | | k(a+1/6)=0.929_ but k(a-1/6)=0.865_. |
123 | | */ |
124 | | |
125 | | static mp_size_t |
126 | | mpn_sqrlo_itch (mp_size_t n) |
127 | 0 | { |
128 | 0 | return 2*n; |
129 | 0 | } |
130 | | |
131 | | /* |
132 | | mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp. |
133 | | It accepts tp == rp. |
134 | | */ |
135 | | static void |
136 | | mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp) |
137 | 0 | { |
138 | 0 | mp_size_t n2, n1; |
139 | 0 | ASSERT (n >= 2); |
140 | 0 | ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); |
141 | 0 | ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); |
142 | | |
143 | | /* Divide-and-conquer */ |
144 | | |
145 | | /* We need fractional approximation of the value 0 < a <= 1/2 |
146 | | giving the minimum in the function k=(1-a)^e/(1-2*a^e). |
147 | | */ |
148 | 0 | if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11))) |
149 | 0 | n1 = n >> 1; |
150 | 0 | else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11))) |
151 | 0 | n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ |
152 | 0 | else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9))) |
153 | 0 | n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ |
154 | 0 | else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9)) |
155 | 0 | n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ |
156 | | /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ |
157 | 0 | else |
158 | 0 | n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ |
159 | |
|
160 | 0 | n2 = n - n1; |
161 | | |
162 | | /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */ |
163 | | |
164 | | /* x0 ^ 2 */ |
165 | 0 | mpn_sqr (tp, xp, n2); |
166 | 0 | MPN_COPY (rp, tp, n2); |
167 | | |
168 | | /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */ |
169 | 0 | if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) |
170 | 0 | mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1); |
171 | 0 | else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) |
172 | 0 | mpn_mullo_basecase (tp + n, xp + n2, xp, n1); |
173 | 0 | else |
174 | 0 | mpn_mullo_n (tp + n, xp + n2, xp, n1); |
175 | | /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */ |
176 | 0 | #if HAVE_NATIVE_mpn_addlsh1_n |
177 | 0 | mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1); |
178 | | #else |
179 | | mpn_lshift (rp + n2, tp + n, n1, 1); |
180 | | mpn_add_n (rp + n2, rp + n2, tp + n2, n1); |
181 | | #endif |
182 | 0 | } |
183 | | |
184 | | /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ |
185 | | #define SQR_BASECASE_ALLOC \ |
186 | | (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT) |
187 | | |
188 | | /* FIXME: This function should accept a temporary area; dc_sqrlo |
189 | | accepts a pointer tp, and handle the case tp == rp, do the same here. |
190 | | */ |
191 | | |
192 | | void |
193 | | mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n) |
194 | 0 | { |
195 | 0 | ASSERT (n >= 1); |
196 | 0 | ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); |
197 | |
|
198 | 0 | if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD)) |
199 | 0 | { |
200 | | /* FIXME: smarter criteria? */ |
201 | 0 | #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase |
202 | | /* mullo computes as many products as sqr, but directly writes |
203 | | on the result area. */ |
204 | 0 | mpn_mullo_basecase (rp, xp, xp, n); |
205 | | #else |
206 | | /* Allocate workspace of fixed size on stack: fast! */ |
207 | | mp_limb_t tp[SQR_BASECASE_ALLOC]; |
208 | | mpn_sqr_basecase (tp, xp, n); |
209 | | MPN_COPY (rp, tp, n); |
210 | | #endif |
211 | 0 | } |
212 | 0 | else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD)) |
213 | 0 | { |
214 | 0 | mpn_sqrlo_basecase (rp, xp, n); |
215 | 0 | } |
216 | 0 | else |
217 | 0 | { |
218 | 0 | mp_ptr tp; |
219 | 0 | TMP_DECL; |
220 | 0 | TMP_MARK; |
221 | 0 | tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n)); |
222 | 0 | if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD)) |
223 | 0 | { |
224 | 0 | mpn_dc_sqrlo (rp, xp, n, tp); |
225 | 0 | } |
226 | 0 | else |
227 | 0 | { |
228 | | /* For really large operands, use plain mpn_mul_n but throw away upper n |
229 | | limbs of result. */ |
230 | 0 | #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD) |
231 | 0 | mpn_fft_mul (tp, xp, n, xp, n); |
232 | | #else |
233 | | mpn_sqr (tp, xp, n); |
234 | | #endif |
235 | 0 | MPN_COPY (rp, tp, n); |
236 | 0 | } |
237 | 0 | TMP_FREE; |
238 | 0 | } |
239 | 0 | } |