/* mpn_divexact_1 -- mpn by limb exact division.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000-2003, 2005, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"


/* Divide a={src,size} by d=divisor and store the quotient in q={dst,size}.
   q will only be correct if d divides a exactly.

   A separate loop is used for shift==0 because n<<GMP_LIMB_BITS doesn't
   give zero on all CPUs (for instance it doesn't on the x86s).  This
   separate loop might run faster too, helping odd divisors.

   Possibilities:

   mpn_divexact_1c could be created, accepting and returning c.  This would
   let a long calculation be done piece by piece.  Currently there's no
   particular need for that, and not returning c means that a final umul can
   be skipped.

   Another use for returning c would be letting the caller know whether the
   division was in fact exact.  It would work just to return the carry bit
   "c=(l>s)" and let the caller do a final umul if interested.

   When the divisor is even, the factors of two could be handled with a
   separate mpn_rshift, instead of shifting on the fly.  That might be
   faster on some CPUs and would mean just the shift==0 style loop would be
   needed.

   If n<<GMP_LIMB_BITS gives zero on a particular CPU then the separate
   shift==0 loop is unnecessary, and could be eliminated if there's no great
   speed difference.

   It's not clear whether "/" is the best way to handle size==1.  Alpha gcc
   2.95 for instance has a poor "/" and might prefer the modular method.
   Perhaps a tuned parameter should control this.

   If src[size-1] < divisor then dst[size-1] will be zero, and one divide
   step could be skipped.  A test at the last step for s<divisor (or ls in
   the even case) might be a good way to do that.  But if this code is often
   used with small divisors then it might not be worth bothering.  */

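/* Illustration (not part of GMP): a minimal sketch of the modular-inverse
   technique on plain 64-bit words, assuming an odd divisor d that divides
   a exactly.  binvert_limb below plays the role of demo_binvert; each
   Newton step x *= 2 - d*x doubles the number of correct low bits, and a
   single wrapping multiply by the inverse then yields the exact quotient
   with no division instruction.  The demo_* names and the guard macro are
   hypothetical, used here only for exposition.  */
#ifdef DIVEXACT_1_DEMO
#include <assert.h>
#include <stdint.h>

static uint64_t
demo_binvert (uint64_t d)       /* d must be odd */
{
  uint64_t x = d;               /* d*d == 1 mod 8, so 3 bits correct */
  x *= 2 - d * x;               /* 6 bits  */
  x *= 2 - d * x;               /* 12 bits */
  x *= 2 - d * x;               /* 24 bits */
  x *= 2 - d * x;               /* 48 bits */
  x *= 2 - d * x;               /* 96 >= 64 bits */
  return x;
}

static uint64_t
demo_divexact (uint64_t a, uint64_t d)  /* requires d odd and d | a */
{
  return a * demo_binvert (d);  /* a * d^-1 == quotient mod 2^64 */
}

int
main (void)
{
  assert (demo_divexact (35, 7) == 5);
  assert (demo_divexact (UINT64_C (123456789) * 1000000007, 123456789)
          == 1000000007);
  return 0;
}
#endif
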
void
mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
{
  mp_size_t i;
  mp_limb_t c, h, l, ls, s, s_next, inverse, dummy;
  unsigned shift;

  ASSERT (size >= 1);
  ASSERT (divisor != 0);
  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
  ASSERT_MPN (src, size);
  ASSERT_LIMB (divisor);

  if ((divisor & 1) == 0)
    {
      count_trailing_zeros (shift, divisor);
      divisor >>= shift;
    }
  else
    shift = 0;

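  /* inverse = 1/divisor mod 2^GMP_NUMB_BITS, well defined now that any
     factors of 2 have been shifted out of divisor.  */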
  binvert_limb (inverse, divisor);
  divisor <<= GMP_NAIL_BITS;

  if (shift != 0)
    {
      c = 0;

      s = src[0];

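      /* Each pass peels the next GMP_NUMB_BITS of the right-shifted
         dividend into ls, subtracts the running borrow c, multiplies by
         the inverse to get a quotient limb, and takes the new borrow from
         the high half of quotient*divisor.  */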
      for (i = 1; i < size; i++)
        {
          s_next = src[i];
          ls = ((s >> shift) | (s_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
          s = s_next;

          SUBC_LIMB (c, l, ls, c);

          l = (l * inverse) & GMP_NUMB_MASK;
          dst[i - 1] = l;

          umul_ppmm (h, dummy, l, divisor);
          c += h;
        }

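      /* Last limb: only the bits above the shift remain, and no borrow
         needs to be carried out, so the final umul_ppmm is skipped.  */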
      ls = s >> shift;
      l = ls - c;
      l = (l * inverse) & GMP_NUMB_MASK;
      dst[size - 1] = l;
    }
  else
    {
      s = src[0];

      l = (s * inverse) & GMP_NUMB_MASK;
      dst[0] = l;
      c = 0;

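      /* Odd divisor: the same recurrence without the shifting.  The high
         half of the previous quotient limb times divisor feeds the borrow
         into the next subtraction.  */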
      for (i = 1; i < size; i++)
        {
          umul_ppmm (h, dummy, l, divisor);
          c += h;

          s = src[i];
          SUBC_LIMB (c, l, s, c);

          l = (l * inverse) & GMP_NUMB_MASK;
          dst[i] = l;
        }
    }
}