/src/gmp/mpn/mode1o.c

Source (jump to first uncovered line)
/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000-2004 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"


/* Calculate an r satisfying

           r*B^k + a - c == q*d

   where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
   (the caller won't know which), and q is the quotient (discarded).  d must
   be odd, c can be any limb value.

   If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.

   This slightly strange function suits the initial Nx1 reduction for GCDs
   or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
   -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
   ignored, or for the Jacobi symbol it can be accounted for.  The function
   also suits divisibility and congruence testing since if r=0 (or r=d) is
   obtained then a==c mod d.


   r is a bit like the remainder returned by mpn_divexact_by3c, and is the
   sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
   represents a borrow, since effectively quotient limbs are chosen so that
   subtracting that multiple of d from src at each step will produce a zero
   limb.

   A long calculation can be done piece by piece from low to high by passing
   the return value from one part as the carry parameter to the next part.
   The effective final k becomes anything between size and size-n, if n
   pieces are used.


   A similar sort of routine could be constructed based on adding multiples
   of d at each limb, much like redc in mpz_powm does.  Subtracting however
   has a small advantage that when subtracting to cancel out l there's never
   a borrow into h, whereas using an addition would put a carry into h
   depending whether l==0 or l!=0.


   In terms of efficiency, this function is similar to a mul-by-inverse
   mpn_mod_1.  Both are essentially two multiplies and are best suited to
   CPUs with low latency multipliers (in comparison to a divide instruction
   at least.)  But modexact has a few less supplementary operations, only
   needs low part and high part multiplies, and has fewer working quantities
   (helping CPUs with few registers).


   In the main loop it will be noted that the new carry (call it r) is the
   sum of the high product h and any borrow from l=s-c.  If c<d then we will
   have r<d too, for the following reasons.  Let q=l*inverse be the quotient
   limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then

       l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d

   But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
   never have h=d-1 and so r=h+borrow <= d-1.

   When c>=d, on the other hand, h=d-1 can certainly occur together with a
   borrow, thereby giving only r<=d, as per the function definition above.

   As a design decision it's left to the caller to check for r=d if it might
   be passing c>=d.  Several applications have c<d initially so the extra
   test is often unnecessary, for example the GCDs or a plain divisibility
   d|a test will pass c=0.


   The special case for size==1 is so that it can be assumed c<=d in the
   high<=divisor test at the end.  c<=d is only guaranteed after at least
   one iteration of the main loop.  There's also a decent chance one % is
   faster than a binvert_limb, though that will depend on the processor.

   A CPU specific implementation might want to omit the size==1 code or the
   high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
   useful.  */


mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
                     mp_limb_t orig_c)
{
  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
  mp_limb_t  c = orig_c;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  if (size == 1)
    {
      s = src[0];
      if (s > c)
  {
    l = s-c;
    h = l % d;
    if (h != 0)
      h = d - h;
  }
      else
  {
    l = c-s;
    h = l % d;
  }
      return h;
    }


  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

  i = 0;
  do
    {
      s = src[i];
      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
    }
  while (++i < size-1);


  s = src[i];
  if (s <= d)
    {
      /* With high<=d the final step can be a subtract and addback.  If c==0
   then the addback will restore to l>=0.  If c==d then will get l==d
   if s==0, but that's ok per the function definition.  */

      l = c - s;
      if (c < s)
  l += d;

      ret = l;
    }
  else
    {
      /* Can't skip a divide, just do the loop code once more. */

      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
      ret = c;
    }

  ASSERT (orig_c < d ? ret < d : ret <= d);
  return ret;
}



#if 0

/* The following is an alternate form that might shave one cycle on a
   superscalar processor since it takes c+=h off the dependent chain,
   leaving just a low product, high product, and a subtract.

   This is for CPU specific implementations to consider.  A special case for
   high<divisor and/or size==1 can be added if desired.

   Notice that c is only ever 0 or 1, since if s-c produces a borrow then
   x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
   c=(x==0xFF..FF) too, if that helped.  */

mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
{
  mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
  mp_limb_t  c = 0;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);

  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

  for (i = 0; i < size; i++)
    {
      ASSERT (c==0 || c==1);

      s = src[i];
      SUBC_LIMB (c1, x, s, c);

      SUBC_LIMB (c2, y, x, h);
      c = c1 + c2;

      y = (y * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, y, dmul);
    }

  h += c;
  return h;
}

#endif

Coverage Report

Created: 2024-11-25 06:27

Line	Count	Source (jump to first uncovered line)
1		/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
2
3		THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4		CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5		FUTURE GNU MP RELEASES.
6
7		Copyright 2000-2004 Free Software Foundation, Inc.
8
9		This file is part of the GNU MP Library.
10
11		The GNU MP Library is free software; you can redistribute it and/or modify
12		it under the terms of either:
13
14		* the GNU Lesser General Public License as published by the Free
15		Software Foundation; either version 3 of the License, or (at your
16		option) any later version.
17
18		or
19
20		* the GNU General Public License as published by the Free Software
21		Foundation; either version 2 of the License, or (at your option) any
22		later version.
23
24		or both in parallel, as here.
25
26		The GNU MP Library is distributed in the hope that it will be useful, but
27		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29		for more details.
30
31		You should have received copies of the GNU General Public License and the
32		GNU Lesser General Public License along with the GNU MP Library. If not,
33		see https://www.gnu.org/licenses/. */
34
35		#include "gmp-impl.h"
36		#include "longlong.h"
37
38
39		/* Calculate an r satisfying
40
41		rB^k + a - c == qd
42
43		where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
44		(the caller won't know which), and q is the quotient (discarded). d must
45		be odd, c can be any limb value.
46
47		If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
48
49		This slightly strange function suits the initial Nx1 reduction for GCDs
50		or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
51		-r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
52		ignored, or for the Jacobi symbol it can be accounted for. The function
53		also suits divisibility and congruence testing since if r=0 (or r=d) is
54		obtained then a==c mod d.
55
56
57		r is a bit like the remainder returned by mpn_divexact_by3c, and is the
58		sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r
59		represents a borrow, since effectively quotient limbs are chosen so that
60		subtracting that multiple of d from src at each step will produce a zero
61		limb.
62
63		A long calculation can be done piece by piece from low to high by passing
64		the return value from one part as the carry parameter to the next part.
65		The effective final k becomes anything between size and size-n, if n
66		pieces are used.
67
68
69		A similar sort of routine could be constructed based on adding multiples
70		of d at each limb, much like redc in mpz_powm does. Subtracting however
71		has a small advantage that when subtracting to cancel out l there's never
72		a borrow into h, whereas using an addition would put a carry into h
73		depending whether l==0 or l!=0.
74
75
76		In terms of efficiency, this function is similar to a mul-by-inverse
77		mpn_mod_1. Both are essentially two multiplies and are best suited to
78		CPUs with low latency multipliers (in comparison to a divide instruction
79		at least.) But modexact has a few less supplementary operations, only
80		needs low part and high part multiplies, and has fewer working quantities
81		(helping CPUs with few registers).
82
83
84		In the main loop it will be noted that the new carry (call it r) is the
85		sum of the high product h and any borrow from l=s-c. If c<d then we will
86		have r<d too, for the following reasons. Let q=l*inverse be the quotient
87		limb, so that qd = Bh + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then
88
89		l = qd - B(d-1) <= (B-1)d - B(d-1) = B-d
90
91		But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
92		never have h=d-1 and so r=h+borrow <= d-1.
93
94		When c>=d, on the other hand, h=d-1 can certainly occur together with a
95		borrow, thereby giving only r<=d, as per the function definition above.
96
97		As a design decision it's left to the caller to check for r=d if it might
98		be passing c>=d. Several applications have c<d initially so the extra
99		test is often unnecessary, for example the GCDs or a plain divisibility
100		d\|a test will pass c=0.
101
102
103		The special case for size==1 is so that it can be assumed c<=d in the
104		high<=divisor test at the end. c<=d is only guaranteed after at least
105		one iteration of the main loop. There's also a decent chance one % is
106		faster than a binvert_limb, though that will depend on the processor.
107
108		A CPU specific implementation might want to omit the size==1 code or the
109		high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
110		useful. */
111
112
113		mp_limb_t
114		mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
115		mp_limb_t orig_c)
116	0	{
117	0	mp_limb_t s, h, l, inverse, dummy, dmul, ret;
118	0	mp_limb_t c = orig_c;
119	0	mp_size_t i;
120
121	0	ASSERT (size >= 1);
122	0	ASSERT (d & 1);
123	0	ASSERT_MPN (src, size);
124	0	ASSERT_LIMB (d);
125	0	ASSERT_LIMB (c);
126
127	0	if (size == 1)
128	0	{
129	0	s = src[0];
130	0	if (s > c)
131	0	{
132	0	l = s-c;
133	0	h = l % d;
134	0	if (h != 0)
135	0	h = d - h;
136	0	}
137	0	else
138	0	{
139	0	l = c-s;
140	0	h = l % d;
141	0	}
142	0	return h;
143	0	}
144
145
146	0	binvert_limb (inverse, d);
147	0	dmul = d << GMP_NAIL_BITS;
148
149	0	i = 0;
150	0	do
151	0	{
152	0	s = src[i];
153	0	SUBC_LIMB (c, l, s, c);
154	0	l = (l * inverse) & GMP_NUMB_MASK;
155	0	umul_ppmm (h, dummy, l, dmul);
156	0	c += h;
157	0	}
158	0	while (++i < size-1);
159
160
161	0	s = src[i];
162	0	if (s <= d)
163	0	{
164		/* With high<=d the final step can be a subtract and addback. If c==0
165		then the addback will restore to l>=0. If c==d then will get l==d
166		if s==0, but that's ok per the function definition. */
167
168	0	l = c - s;
169	0	if (c < s)
170	0	l += d;
171
172	0	ret = l;
173	0	}
174	0	else
175	0	{
176		/* Can't skip a divide, just do the loop code once more. */
177
178	0	SUBC_LIMB (c, l, s, c);
179	0	l = (l * inverse) & GMP_NUMB_MASK;
180	0	umul_ppmm (h, dummy, l, dmul);
181	0	c += h;
182	0	ret = c;
183	0	}
184
185	0	ASSERT (orig_c < d ? ret < d : ret <= d);
186	0	return ret;
187	0	}
188
189
190
191		#if 0
192
193		/* The following is an alternate form that might shave one cycle on a
194		superscalar processor since it takes c+=h off the dependent chain,
195		leaving just a low product, high product, and a subtract.
196
197		This is for CPU specific implementations to consider. A special case for
198		high<divisor and/or size==1 can be added if desired.
199
200		Notice that c is only ever 0 or 1, since if s-c produces a borrow then
201		x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
202		c=(x==0xFF..FF) too, if that helped. */
203
204		mp_limb_t
205		mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
206		{
207		mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
208		mp_limb_t c = 0;
209		mp_size_t i;
210
211		ASSERT (size >= 1);
212		ASSERT (d & 1);
213
214		binvert_limb (inverse, d);
215		dmul = d << GMP_NAIL_BITS;
216
217		for (i = 0; i < size; i++)
218		{
219		ASSERT (c==0 \|\| c==1);
220
221		s = src[i];
222		SUBC_LIMB (c1, x, s, c);
223
224		SUBC_LIMB (c2, y, x, h);
225		c = c1 + c2;
226
227		y = (y * inverse) & GMP_NUMB_MASK;
228		umul_ppmm (h, dummy, y, dmul);
229		}
230
231		h += c;
232		return h;
233		}
234
235		#endif