Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, |
2 | | zero otherwise. |
3 | | |
4 | | Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software |
5 | | Foundation, Inc. |
6 | | |
7 | | This file is part of the GNU MP Library. |
8 | | |
9 | | The GNU MP Library is free software; you can redistribute it and/or modify |
10 | | it under the terms of either: |
11 | | |
12 | | * the GNU Lesser General Public License as published by the Free |
13 | | Software Foundation; either version 3 of the License, or (at your |
14 | | option) any later version. |
15 | | |
16 | | or |
17 | | |
18 | | * the GNU General Public License as published by the Free Software |
19 | | Foundation; either version 2 of the License, or (at your option) any |
20 | | later version. |
21 | | |
22 | | or both in parallel, as here. |
23 | | |
24 | | The GNU MP Library is distributed in the hope that it will be useful, but |
25 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
26 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
27 | | for more details. |
28 | | |
29 | | You should have received copies of the GNU General Public License and the |
30 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
31 | | see https://www.gnu.org/licenses/. */ |
32 | | |
33 | | #include <stdio.h> /* for NULL */ |
34 | | #include "gmp-impl.h" |
35 | | #include "longlong.h" |
36 | | |
37 | | #include "perfsqr.h" |
38 | | |
39 | | |
40 | | /* change this to "#define TRACE(x) x" for diagnostics */ |
41 | | #define TRACE(x) |
42 | | |
43 | | |
44 | | |
45 | | /* PERFSQR_MOD_* detects non-squares using residue tests. |
46 | | |
47 | | A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes |
48 | | {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or |
49 | | 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34, |
50 | | or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP |
51 | | used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this |
52 | | case too. |
53 | | |
54 | | PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or |
55 | | PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table |
56 | | data indicating residues and non-residues modulo those divisors. The |
57 | | table data is in 1 or 2 limbs worth of bits respectively, per the size of |
58 | | each d. |
59 | | |
60 | | A "modexact" style remainder is taken to reduce r modulo d. |
61 | | PERFSQR_MOD_IDX implements this, producing an index "idx" for use with |
62 | | the table data. Notice there's just one multiplication by a constant |
63 | | "inv", for each d. |
64 | | |
65 | | The modexact doesn't produce a true r%d remainder, instead idx satisfies |
66 | | "-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor |
67 | | -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is |
68 | | accounted for by having the table data suitably permuted. |
69 | | |
70 | | The remainder r fits within PERFSQR_MOD_BITS which is less than a limb. |
71 | | In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit |
72 | | each divisor d meaning the modexact multiply can take place entirely |
73 | | within one limb, giving the compiler the chance to optimize it, in a way |
74 | | that say umul_ppmm would not give. |
75 | | |
76 | | There's no need for the divisors d to be prime, in fact gen-psqr.c makes |
77 | | a deliberate effort to combine factors so as to reduce the number of |
78 | | separate tests done on r. But such combining is limited to d <= |
79 | | 2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs. |
80 | | |
81 | | Alternatives: |
82 | | |
83 | | It'd be possible to use bigger divisors d, and more than 2 limbs of table |
84 | | data, but this doesn't look like it would be of much help to the prime |
85 | | factors in the usual moduli 2^24-1 or 2^48-1. |
86 | | |
87 | | The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're |
88 | | just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime |
89 | | factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have |
90 | | fewer prime factors. |
91 | | |
92 | | The nails case usually ends up using mpn_mod_1, which is a lot slower |
93 | | than mpn_mod_34lsub1. Perhaps other such special moduli could be found |
94 | | for the nails case. Two-term things like 2^30-2^15-1 might be |
95 | | candidates. Or at worst some on-the-fly de-nailing would allow the plain |
96 | | 2^24-1 to be used. Currently nails are too preliminary to be worried |
97 | | about. |
98 | | |
99 | | */ |
100 | | |
101 | 0 | #define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1) |
102 | | |
103 | 0 | #define MOD34_BITS (GMP_NUMB_BITS / 4 * 3) |
104 | 0 | #define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1) |
105 | | |
106 | | #define PERFSQR_MOD_34(r, up, usize) \ |
107 | 0 | do { \ |
108 | 0 | (r) = mpn_mod_34lsub1 (up, usize); \ |
109 | 0 | (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \ |
110 | 0 | } while (0) |
111 | | |
112 | | /* FIXME: The %= here isn't good, and might destroy any savings from keeping |
113 | | the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm). |
114 | | Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor |
115 | | and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our |
116 | | normal case, so lets not worry too much about mod_1. */ |
117 | | #define PERFSQR_MOD_PP(r, up, usize) \ |
118 | | do { \ |
119 | | if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \ |
120 | | { \ |
121 | | (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \ |
122 | | PERFSQR_PP_INVERTED); \ |
123 | | (r) %= PERFSQR_PP; \ |
124 | | } \ |
125 | | else \ |
126 | | { \ |
127 | | (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \ |
128 | | } \ |
129 | | } while (0) |
130 | | |
131 | | #define PERFSQR_MOD_IDX(idx, r, d, inv) \ |
132 | 0 | do { \ |
133 | 0 | mp_limb_t q; \ |
134 | 0 | ASSERT ((r) <= PERFSQR_MOD_MASK); \ |
135 | 0 | ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \ |
136 | 0 | ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \ |
137 | 0 | \ |
138 | 0 | q = ((r) * (inv)) & PERFSQR_MOD_MASK; \ |
139 | 0 | ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \ |
140 | 0 | (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \ |
141 | 0 | } while (0) |
142 | | |
143 | | #define PERFSQR_MOD_1(r, d, inv, mask) \ |
144 | 0 | do { \ |
145 | 0 | unsigned idx; \ |
146 | 0 | ASSERT ((d) <= GMP_LIMB_BITS); \ |
147 | 0 | PERFSQR_MOD_IDX(idx, r, d, inv); \ |
148 | 0 | TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \ |
149 | 0 | d, r%d, idx)); \ |
150 | 0 | if ((((mask) >> idx) & 1) == 0) \ |
151 | 0 | { \ |
152 | 0 | TRACE (printf (" non-square\n")); \ |
153 | 0 | return 0; \ |
154 | 0 | } \ |
155 | 0 | } while (0) |
156 | | |
157 | | /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the |
158 | | sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */ |
159 | | #define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \ |
160 | 0 | do { \ |
161 | 0 | mp_limb_t m; \ |
162 | 0 | unsigned idx; \ |
163 | 0 | ASSERT ((d) <= 2*GMP_LIMB_BITS); \ |
164 | 0 | \ |
165 | 0 | PERFSQR_MOD_IDX (idx, r, d, inv); \ |
166 | 0 | TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \ |
167 | 0 | d, r%d, idx)); \ |
168 | 0 | m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \ |
169 | 0 | idx %= GMP_LIMB_BITS; \ |
170 | 0 | if (((m >> idx) & 1) == 0) \ |
171 | 0 | { \ |
172 | 0 | TRACE (printf (" non-square\n")); \ |
173 | 0 | return 0; \ |
174 | 0 | } \ |
175 | 0 | } while (0) |
176 | | |
177 | | |
178 | | int |
179 | | mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) |
180 | 0 | { |
181 | 0 | ASSERT (usize >= 1); |
182 | |
|
183 | 0 | TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize)); |
184 | | |
185 | | /* The first test excludes 212/256 (82.8%) of the perfect square candidates |
186 | | in O(1) time. */ |
187 | 0 | { |
188 | 0 | unsigned idx = up[0] % 0x100; |
189 | 0 | if (((sq_res_0x100[idx / GMP_LIMB_BITS] |
190 | 0 | >> (idx % GMP_LIMB_BITS)) & 1) == 0) |
191 | 0 | return 0; |
192 | 0 | } |
193 | | |
194 | | #if 0 |
195 | | /* Check that we have even multiplicity of 2, and then check that the rest is |
196 | | a possible perfect square. Leave disabled until we can determine this |
197 | | really is an improvement. If it is, it could completely replace the |
198 | | simple probe above, since this should throw out more non-squares, but at |
199 | | the expense of somewhat more cycles. */ |
200 | | { |
201 | | mp_limb_t lo; |
202 | | int cnt; |
203 | | lo = up[0]; |
204 | | while (lo == 0) |
205 | | up++, lo = up[0], usize--; |
206 | | count_trailing_zeros (cnt, lo); |
207 | | if ((cnt & 1) != 0) |
208 | | return 0; /* return of not even multiplicity of 2 */ |
209 | | lo >>= cnt; /* shift down to align lowest non-zero bit */ |
210 | | if ((lo & 6) != 0) |
211 | | return 0; |
212 | | } |
213 | | #endif |
214 | | |
215 | | |
216 | | /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares |
217 | | according to their residues modulo small primes (or powers of |
218 | | primes). See perfsqr.h. */ |
219 | 0 | PERFSQR_MOD_TEST (up, usize); |
220 | | |
221 | | |
222 | | /* For the third and last test, we finally compute the square root, |
223 | | to make sure we've really got a perfect square. */ |
224 | 0 | { |
225 | 0 | mp_ptr root_ptr; |
226 | 0 | int res; |
227 | 0 | TMP_DECL; |
228 | |
|
229 | 0 | TMP_MARK; |
230 | 0 | root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2); |
231 | | |
232 | | /* Iff mpn_sqrtrem returns zero, the square is perfect. */ |
233 | 0 | res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); |
234 | 0 | TMP_FREE; |
235 | |
|
236 | 0 | return res; |
237 | 0 | } |
238 | 0 | } |