/src/gmp-6.2.1/mpn/perfsqr.c

Source
/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
   zero otherwise.

Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <stdio.h> /* for NULL */
#include "gmp-impl.h"
#include "longlong.h"

#include "perfsqr.h"


/* change this to "#define TRACE(x) x" for diagnostics */
#define TRACE(x)



/* PERFSQR_MOD_* detects non-squares using residue tests.

   A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
   {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
   64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
   or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
   used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
   case too.

   PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
   PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
   data indicating residues and non-residues modulo those divisors.  The
   table data is in 1 or 2 limbs worth of bits respectively, per the size of
   each d.

   A "modexact" style remainder is taken to reduce r modulo d.
   PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
   the table data.  Notice there's just one multiplication by a constant
   "inv", for each d.

   The modexact doesn't produce a true r%d remainder, instead idx satisfies
   "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
   -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
   accounted for by having the table data suitably permuted.

   The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
   In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
   each divisor d meaning the modexact multiply can take place entirely
   within one limb, giving the compiler the chance to optimize it, in a way
   that say umul_ppmm would not give.

   There's no need for the divisors d to be prime, in fact gen-psqr.c makes
   a deliberate effort to combine factors so as to reduce the number of
   separate tests done on r.  But such combining is limited to d <=
   2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.

   Alternatives:

   It'd be possible to use bigger divisors d, and more than 2 limbs of table
   data, but this doesn't look like it would be of much help to the prime
   factors in the usual moduli 2^24-1 or 2^48-1.

   The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
   just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
   factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
   fewer prime factors.

   The nails case usually ends up using mpn_mod_1, which is a lot slower
   than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
   for the nails case.  Two-term things like 2^30-2^15-1 might be
   candidates.  Or at worst some on-the-fly de-nailing would allow the plain
   2^24-1 to be used.  Currently nails are too preliminary to be worried
   about.

*/

#define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)

#define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
#define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)

#define PERFSQR_MOD_34(r, up, usize)        \
  do {               \
    (r) = mpn_mod_34lsub1 (up, usize);       \
    (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);    \
  } while (0)

/* FIXME: The %= here isn't good, and might destroy any savings from keeping
   the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
   Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
   and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
   normal case, so lets not worry too much about mod_1.  */
#define PERFSQR_MOD_PP(r, up, usize)          \
  do {                  \
    if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \
      {                 \
  (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,   \
        PERFSQR_PP_INVERTED);     \
  (r) %= PERFSQR_PP;            \
      }                 \
    else                \
      {                 \
  (r) = mpn_mod_1 (up, usize, PERFSQR_PP);      \
      }                 \
  } while (0)

#define PERFSQR_MOD_IDX(idx, r, d, inv)       \
  do {               \
    mp_limb_t  q;           \
    ASSERT ((r) <= PERFSQR_MOD_MASK);       \
    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);   \
    ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);   \
                \
    q = ((r) * (inv)) & PERFSQR_MOD_MASK;     \
    ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));   \
    (idx) = (q * (d)) >> PERFSQR_MOD_BITS;     \
  } while (0)

#define PERFSQR_MOD_1(r, d, inv, mask)        \
  do {               \
    unsigned   idx;           \
    ASSERT ((d) <= GMP_LIMB_BITS);        \
    PERFSQR_MOD_IDX(idx, r, d, inv);       \
    TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \
       d, r%d, idx));       \
    if ((((mask) >> idx) & 1) == 0)       \
      {               \
  TRACE (printf ("  non-square\n"));      \
  return 0;           \
      }                \
  } while (0)

/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
   sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
#define PERFSQR_MOD_2(r, d, inv, mhi, mlo)      \
  do {               \
    mp_limb_t  m;           \
    unsigned   idx;           \
    ASSERT ((d) <= 2*GMP_LIMB_BITS);        \
                \
    PERFSQR_MOD_IDX (idx, r, d, inv);        \
    TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \
       d, r%d, idx));       \
    m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));  \
    idx %= GMP_LIMB_BITS;         \
    if (((m >> idx) & 1) == 0)         \
      {               \
  TRACE (printf ("  non-square\n"));      \
  return 0;           \
      }                \
  } while (0)


int
mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
{
  ASSERT (usize >= 1);

  TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));

  /* The first test excludes 212/256 (82.8%) of the perfect square candidates
     in O(1) time.  */
  {
    unsigned  idx = up[0] % 0x100;
    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
    >> (idx % GMP_LIMB_BITS)) & 1) == 0)
      return 0;
  }

#if 0
  /* Check that we have even multiplicity of 2, and then check that the rest is
     a possible perfect square.  Leave disabled until we can determine this
     really is an improvement.  It it is, it could completely replace the
     simple probe above, since this should throw out more non-squares, but at
     the expense of somewhat more cycles.  */
  {
    mp_limb_t lo;
    int cnt;
    lo = up[0];
    while (lo == 0)
      up++, lo = up[0], usize--;
    count_trailing_zeros (cnt, lo);
    if ((cnt & 1) != 0)
      return 0;     /* return of not even multiplicity of 2 */
    lo >>= cnt;     /* shift down to align lowest non-zero bit */
    lo >>= 1;     /* shift away lowest non-zero bit */
    if ((lo & 3) != 0)
      return 0;
  }
#endif


  /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
     according to their residues modulo small primes (or powers of
     primes).  See perfsqr.h.  */
  PERFSQR_MOD_TEST (up, usize);


  /* For the third and last test, we finally compute the square root,
     to make sure we've really got a perfect square.  */
  {
    mp_ptr root_ptr;
    int res;
    TMP_DECL;

    TMP_MARK;
    root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);

    /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
    res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
    TMP_FREE;

    return res;
  }
}

Coverage Report

Created: 2025-03-09 06:52

Line	Count	Source
1		/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
2		zero otherwise.
3
4		Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
5		Foundation, Inc.
6
7		This file is part of the GNU MP Library.
8
9		The GNU MP Library is free software; you can redistribute it and/or modify
10		it under the terms of either:
11
12		* the GNU Lesser General Public License as published by the Free
13		Software Foundation; either version 3 of the License, or (at your
14		option) any later version.
15
16		or
17
18		* the GNU General Public License as published by the Free Software
19		Foundation; either version 2 of the License, or (at your option) any
20		later version.
21
22		or both in parallel, as here.
23
24		The GNU MP Library is distributed in the hope that it will be useful, but
25		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27		for more details.
28
29		You should have received copies of the GNU General Public License and the
30		GNU Lesser General Public License along with the GNU MP Library. If not,
31		see https://www.gnu.org/licenses/. */
32
33		#include <stdio.h> /* for NULL */
34		#include "gmp-impl.h"
35		#include "longlong.h"
36
37		#include "perfsqr.h"
38
39
40		/* change this to "#define TRACE(x) x" for diagnostics */
41		#define TRACE(x)
42
43
44
45		/* PERFSQR_MOD_* detects non-squares using residue tests.
46
47		A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes
48		{up,usize} modulo a selected modulus to get a remainder r. For 32-bit or
49		64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
50		or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
51		used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
52		case too.
53
54		PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
55		PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
56		data indicating residues and non-residues modulo those divisors. The
57		table data is in 1 or 2 limbs worth of bits respectively, per the size of
58		each d.
59
60		A "modexact" style remainder is taken to reduce r modulo d.
61		PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
62		the table data. Notice there's just one multiplication by a constant
63		"inv", for each d.
64
65		The modexact doesn't produce a true r%d remainder, instead idx satisfies
66		"-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor
67		-2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
68		accounted for by having the table data suitably permuted.
69
70		The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
71		In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
72		each divisor d meaning the modexact multiply can take place entirely
73		within one limb, giving the compiler the chance to optimize it, in a way
74		that say umul_ppmm would not give.
75
76		There's no need for the divisors d to be prime, in fact gen-psqr.c makes
77		a deliberate effort to combine factors so as to reduce the number of
78		separate tests done on r. But such combining is limited to d <=
79		2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
80
81		Alternatives:
82
83		It'd be possible to use bigger divisors d, and more than 2 limbs of table
84		data, but this doesn't look like it would be of much help to the prime
85		factors in the usual moduli 2^24-1 or 2^48-1.
86
87		The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
88		just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
89		factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have
90		fewer prime factors.
91
92		The nails case usually ends up using mpn_mod_1, which is a lot slower
93		than mpn_mod_34lsub1. Perhaps other such special moduli could be found
94		for the nails case. Two-term things like 2^30-2^15-1 might be
95		candidates. Or at worst some on-the-fly de-nailing would allow the plain
96		2^24-1 to be used. Currently nails are too preliminary to be worried
97		about.
98
99		*/
100
101	355	#define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
102
103	210	#define MOD34_BITS (GMP_NUMB_BITS / 4 * 3)
104	105	#define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1)
105
106		#define PERFSQR_MOD_34(r, up, usize) \
107	105	do { \
108	105	(r) = mpn_mod_34lsub1 (up, usize); \
109	105	(r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \
110	105	} while (0)
111
112		/* FIXME: The %= here isn't good, and might destroy any savings from keeping
113		the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
114		Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
115		and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our
116		normal case, so lets not worry too much about mod_1. */
117		#define PERFSQR_MOD_PP(r, up, usize) \
118		do { \
119		if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \
120		{ \
121		(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \
122		PERFSQR_PP_INVERTED); \
123		(r) %= PERFSQR_PP; \
124		} \
125		else \
126		{ \
127		(r) = mpn_mod_1 (up, usize, PERFSQR_PP); \
128		} \
129		} while (0)
130
131		#define PERFSQR_MOD_IDX(idx, r, d, inv) \
132	355	do { \
133	355	mp_limb_t q; \
134	355	ASSERT ((r) <= PERFSQR_MOD_MASK); \
135	355	ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \
136	355	ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \
137	355	\
138	355	q = ((r) * (inv)) & PERFSQR_MOD_MASK; \
139	355	ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \
140	355	(idx) = (q * (d)) >> PERFSQR_MOD_BITS; \
141	355	} while (0)
142
143		#define PERFSQR_MOD_1(r, d, inv, mask) \
144	83	do { \
145	83	unsigned idx; \
146	83	ASSERT ((d) <= GMP_LIMB_BITS); \
147	83	PERFSQR_MOD_IDX(idx, r, d, inv); \
148	83	TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \
149	83	d, r%d, idx)); \
150	83	if ((((mask) >> idx) & 1) == 0) \
151	83	{ \
152	10	TRACE (printf (" non-square\n")); \
153	10	return 0; \
154	10	} \
155	83	} while (0)
156
157		/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
158		sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */
159		#define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \
160	272	do { \
161	272	mp_limb_t m; \
162	272	unsigned idx; \
163	272	ASSERT ((d) <= 2*GMP_LIMB_BITS); \
164	272	\
165	272	PERFSQR_MOD_IDX (idx, r, d, inv); \
166	272	TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \
167	272	d, r%d, idx)); \
168	272	m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \
169	272	idx %= GMP_LIMB_BITS; \
170	272	if (((m >> idx) & 1) == 0) \
171	272	{ \
172	40	TRACE (printf (" non-square\n")); \
173	40	return 0; \
174	40	} \
175	272	} while (0)
176
177
178		int
179		mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
180	131	{
181	131	ASSERT (usize >= 1);
182
183	131	TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
184
185		/* The first test excludes 212/256 (82.8%) of the perfect square candidates
186		in O(1) time. */
187	131	{
188	131	unsigned idx = up[0] % 0x100;
189	131	if (((sq_res_0x100[idx / GMP_LIMB_BITS]
190	131	>> (idx % GMP_LIMB_BITS)) & 1) == 0)
191	26	return 0;
192	131	}
193
194		#if 0
195		/* Check that we have even multiplicity of 2, and then check that the rest is
196		a possible perfect square. Leave disabled until we can determine this
197		really is an improvement. It it is, it could completely replace the
198		simple probe above, since this should throw out more non-squares, but at
199		the expense of somewhat more cycles. */
200		{
201		mp_limb_t lo;
202		int cnt;
203		lo = up[0];
204		while (lo == 0)
205		up++, lo = up[0], usize--;
206		count_trailing_zeros (cnt, lo);
207		if ((cnt & 1) != 0)
208		return 0; /* return of not even multiplicity of 2 */
209		lo >>= cnt; /* shift down to align lowest non-zero bit */
210		lo >>= 1; /* shift away lowest non-zero bit */
211		if ((lo & 3) != 0)
212		return 0;
213		}
214		#endif
215
216
217		/* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
218		according to their residues modulo small primes (or powers of
219		primes). See perfsqr.h. */
220	105	PERFSQR_MOD_TEST (up, usize);
221
222
223		/* For the third and last test, we finally compute the square root,
224		to make sure we've really got a perfect square. */
225	55	{
226	55	mp_ptr root_ptr;
227	55	int res;
228	55	TMP_DECL;
229
230	55	TMP_MARK;
231	55	root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
232
233		/* Iff mpn_sqrtrem returns zero, the square is perfect. */
234	55	res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
235	55	TMP_FREE;
236
237	55	return res;
238	105	}
239	105	}