/src/libgmp/mpn/toom22_mul.c
/* mpn_toom22_mul -- Multiply {ap,an} and {bp,bn} where an >= bn.  Or more
   accurately, bn <= an < 2bn.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010, 2012, 2014, 2018, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: -1, 0, +inf

  <-s--><--n-->
   ____ ______
  |_a1_|___a0_|
   |b1_|___b0_|
   <-t-><--n-->

  v0  =  a0     * b0       #   A(0)*B(0)
  vm1 = (a0- a1)*(b0- b1)  #  A(-1)*B(-1)
  vinf=      a1 *     b1   # A(inf)*B(inf)
*/
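
/* Writing x = 2^(GMP_NUMB_BITS * n), so that A = a1*x + a0 and
   B = b1*x + b0, the three products recombine as

     A*B = vinf*x^2 + (v0 + vinf - vm1)*x + v0,

   since a1*b0 + a0*b1 = a0*b0 + a1*b1 - (a0-a1)*(b0-b1).  The code below
   stores |a0-a1| * |b0-b1| and tracks the true sign of vm1 in vm1_neg.  */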

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_toom22 1
#else
#define MAYBE_mul_toom22 \
  (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
#endif
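
/* When MUL_TOOM33_THRESHOLD < 2 * MUL_TOOM22_THRESHOLD, operands reaching
   this file (below MUL_TOOM33_THRESHOLD) split into halves that are below
   MUL_TOOM22_THRESHOLD, so every recursive call resolves to
   mpn_mul_basecase and the compiler can fold the threshold tests away.  */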

#define TOOM22_MUL_N_REC(p, a, b, n, ws)                        \
  do {                                                          \
    if (! MAYBE_mul_toom22                                      \
        || BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))           \
      mpn_mul_basecase (p, a, n, b, n);                         \
    else                                                        \
      mpn_toom22_mul (p, a, n, b, n, ws);                       \
  } while (0)

/* Normally, this calls mul_basecase or toom22_mul.  But when the fraction
   MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD is large, an initially small
   relative imbalance becomes larger and larger with each recursion (the
   difference s-t is invariant over recursive calls).  Therefore, we need to
   call toom32_mul.  FIXME: Suppress depending on
   MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD and on MUL_TOOM22_THRESHOLD.  */
#define TOOM22_MUL_REC(p, a, an, b, bn, ws)                     \
  do {                                                          \
    if (! MAYBE_mul_toom22                                      \
        || BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD))          \
      mpn_mul_basecase (p, a, an, b, bn);                       \
    else if (4 * an < 5 * bn)  /* an/bn < 5/4: balanced enough */ \
      mpn_toom22_mul (p, a, an, b, bn, ws);                     \
    else                                                        \
      mpn_toom32_mul (p, a, an, b, bn, ws);                     \
  } while (0)

void
mpn_toom22_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
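  /* In fat-binary builds, calls such as mpn_mul_basecase may expand to code
     that tests the global __gmpn_cpuvec_initialized; shadowing it with a
     local constant lets those tests fold to a constant here.  */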
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, cy2;
  mp_ptr asm1;
  mp_ptr bsm1;

#define a0  ap
#define a1  (ap + n)
#define b0  bp
#define b1  (bp + n)

  s = an >> 1;
  n = an - s;
  t = bn - n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n && (n - s) == (an & 1));
  ASSERT (0 < t && t <= s);

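  /* asm1 and bsm1 are staged in the low 2n limbs of the product area; both
     are consumed by the vm1 product before v0 overwrites that space.  */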
  asm1 = pp;
  bsm1 = pp + n;

  vm1_neg = 0;

  /* Compute asm1 = |a0 - a1| (a1 zero-extended to n limbs when s < n).  */
  if ((an & 1) == 0) /* s == n */
    {
      if (mpn_cmp (a0, a1, n) < 0)
        {
          mpn_sub_n (asm1, a1, a0, n);
          vm1_neg = 1;
        }
      else
        {
          mpn_sub_n (asm1, a0, a1, n);
        }
    }
  else /* n - s == 1 */
    {
      if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0)
        {
          mpn_sub_n (asm1, a1, a0, s);
          asm1[s] = 0;
          vm1_neg = 1;
        }
      else
        {
          asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s);
        }
    }

  /* Compute bsm1 = |b0 - b1| (b1 zero-extended to n limbs when t < n);
     fold its sign into vm1_neg.  */
  if (t == n)
    {
      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
        }
    }
  else
    {
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
        }
    }

#define v0    pp                    /* 2n */
#define vinf  (pp + 2 * n)          /* s+t */
#define vm1   scratch               /* 2n */
#define scratch_out  scratch + 2 * n
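
  /* The recursive products need scratch beyond the 2n limbs holding vm1;
     the total requirement is what mpn_toom22_mul_itch (an, bn) in
     gmp-impl.h reports, so callers are expected to pass at least that.  */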

  /* vm1, 2n limbs */
  TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);

  if (s > t)  TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out);
  else        TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out);

  /* v0, 2n limbs */
  TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out);

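  /* Recombine: A*B = vinf*x^2 + (v0 + vinf - vm1)*x + v0.  The middle term
     is assembled in pp[n..3n-1]; cy2 is the carry to apply at pp[2n] and
     cy the one to apply at pp[3n].  */
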
  /* H(v0) + L(vinf) */
  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);

  /* L(v0) + (H(v0) + L(vinf)) */
  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);

  /* (H(v0) + L(vinf)) + H(vinf) */
  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n);

  if (vm1_neg)
    cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n);
  else {
    cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);
    if (UNLIKELY (cy + 1 == 0)) { /* cy is negative */
      /* The total contribution of v0+vinf-vm1 cannot be negative.  */
#if WANT_ASSERT
      /* The borrow in cy stops the propagation of the carry cy2.  */
      ASSERT (cy2 == 1);
      cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2);
      ASSERT (cy == 0);
#else
      /* We simply fill the area with zeros.  */
      MPN_FILL (pp + 2 * n, n, 0);
      /* ASSERT (s + t == n || mpn_zero_p (pp + 3 * n, s + t - n)); */
#endif
      return;
    }
  }

  ASSERT (cy <= 2);
  ASSERT (cy2 <= 2);

  MPN_INCR_U (pp + 2 * n, s + t, cy2);
  /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */
  MPN_INCR_U (pp + 3 * n, s + t - n, cy);
}
222 | 22.1M | } |