/src/gmp-6.2.1/mpn/hgcd2.c
Line  | Count  | Source  | 
1  |  | /* hgcd2.c  | 
2  |  |  | 
3  |  |    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY  | 
4  |  |    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST  | 
5  |  |    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.  | 
6  |  |  | 
7  |  | Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,  | 
8  |  | Inc.  | 
9  |  |  | 
10  |  | This file is part of the GNU MP Library.  | 
11  |  |  | 
12  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
13  |  | it under the terms of either:  | 
14  |  |  | 
15  |  |   * the GNU Lesser General Public License as published by the Free  | 
16  |  |     Software Foundation; either version 3 of the License, or (at your  | 
17  |  |     option) any later version.  | 
18  |  |  | 
19  |  | or  | 
20  |  |  | 
21  |  |   * the GNU General Public License as published by the Free Software  | 
22  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
23  |  |     later version.  | 
24  |  |  | 
25  |  | or both in parallel, as here.  | 
26  |  |  | 
27  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
28  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
29  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
30  |  | for more details.  | 
31  |  |  | 
32  |  | You should have received copies of the GNU General Public License and the  | 
33  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
34  |  | see https://www.gnu.org/licenses/.  */  | 
35  |  |  | 
36  |  | #include "gmp-impl.h"  | 
37  |  | #include "longlong.h"  | 
38  |  |  | 
39  |  | #ifndef HGCD2_DIV1_METHOD  | 
40  |  | #define HGCD2_DIV1_METHOD 3  | 
41  |  | #endif  | 
42  |  |  | 
43  |  | #ifndef HGCD2_DIV2_METHOD  | 
44  |  | #define HGCD2_DIV2_METHOD 2  | 
45  |  | #endif  | 
46  |  |  | 
47  |  | #if GMP_NAIL_BITS != 0  | 
48  |  | #error Nails not implemented  | 
49  |  | #endif  | 
50  |  |  | 
51  |  | #if HAVE_NATIVE_mpn_div_11  | 
52  |  |  | 
53  |  | #define div1 mpn_div_11  | 
54  |  | /* Single-limb division optimized for small quotients.  | 
55  |  |    Returned value holds d0 = r, d1 = q. */  | 
56  |  | mp_double_limb_t div1 (mp_limb_t, mp_limb_t);  | 
57  |  |  | 
58  |  | #elif HGCD2_DIV1_METHOD == 1  | 
59  |  |  | 
60  |  | static inline mp_double_limb_t  | 
61  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
62  |  | { | 
63  |  |   mp_double_limb_t res;  | 
64  |  |   res.d1 = n0 / d0;  | 
65  |  |   res.d0 = n0 - res.d1 * d0;  | 
66  |  |  | 
67  |  |   return res;  | 
68  |  | }  | 
69  |  |  | 
70  |  | #elif HGCD2_DIV1_METHOD == 2  | 
71  |  |  | 
72  |  | static mp_double_limb_t  | 
73  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
74  |  | { | 
75  |  |   mp_double_limb_t res;  | 
76  |  |   int ncnt, dcnt, cnt;  | 
77  |  |   mp_limb_t q;  | 
78  |  |   mp_limb_t mask;  | 
79  |  |  | 
80  |  |   ASSERT (n0 >= d0);  | 
81  |  |  | 
82  |  |   count_leading_zeros (ncnt, n0);  | 
83  |  |   count_leading_zeros (dcnt, d0);  | 
84  |  |   cnt = dcnt - ncnt;  | 
85  |  |  | 
86  |  |   d0 <<= cnt;  | 
87  |  |  | 
88  |  |   q = -(mp_limb_t) (n0 >= d0);  | 
89  |  |   n0 -= d0 & q;  | 
90  |  |   d0 >>= 1;  | 
91  |  |   q = -q;  | 
92  |  |  | 
93  |  |   while (--cnt >= 0)  | 
94  |  |     { | 
95  |  |       mask = -(mp_limb_t) (n0 >= d0);  | 
96  |  |       n0 -= d0 & mask;  | 
97  |  |       d0 >>= 1;  | 
98  |  |       q = (q << 1) - mask;  | 
99  |  |     }  | 
100  |  |  | 
101  |  |   res.d0 = n0;  | 
102  |  |   res.d1 = q;  | 
103  |  |   return res;  | 
104  |  | }  | 
105  |  |  | 
106  |  | #elif HGCD2_DIV1_METHOD == 3  | 
107  |  |  | 
108  |  | static inline mp_double_limb_t  | 
109  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
110  | 695k  | { | 
111  | 695k  |   mp_double_limb_t res;  | 
112  | 695k  |   if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)  | 
113  | 695k  |       || UNLIKELY (n0 >= (d0 << 3)))  | 
114  | 188k  |     { | 
115  | 188k  |       res.d1 = n0 / d0;  | 
116  | 188k  |       res.d0 = n0 - res.d1 * d0;  | 
117  | 188k  |     }  | 
118  | 507k  |   else  | 
119  | 507k  |     { | 
120  | 507k  |       mp_limb_t q, mask;  | 
121  |  |  | 
122  | 507k  |       d0 <<= 2;  | 
123  |  |  | 
124  | 507k  |       mask = -(mp_limb_t) (n0 >= d0);  | 
125  | 507k  |       n0 -= d0 & mask;  | 
126  | 507k  |       q = 4 & mask;  | 
127  |  |  | 
128  | 507k  |       d0 >>= 1;  | 
129  | 507k  |       mask = -(mp_limb_t) (n0 >= d0);  | 
130  | 507k  |       n0 -= d0 & mask;  | 
131  | 507k  |       q += 2 & mask;  | 
132  |  |  | 
133  | 507k  |       d0 >>= 1;  | 
134  | 507k  |       mask = -(mp_limb_t) (n0 >= d0);  | 
135  | 507k  |       n0 -= d0 & mask;  | 
136  | 507k  |       q -= mask;  | 
137  |  |  | 
138  | 507k  |       res.d0 = n0;  | 
139  | 507k  |       res.d1 = q;  | 
140  | 507k  |     }  | 
141  | 695k  |   return res;  | 
142  | 695k  | }  | 
143  |  |  | 
144  |  | #elif HGCD2_DIV1_METHOD == 4  | 
145  |  |  | 
146  |  | /* Table quotients.  We extract the NBITS most significant bits of the  | 
147  |  |    numerator limb, and the corresponding bits from the divisor limb, and use  | 
148  |  |    these to form an index into the table.  This method is probably only useful  | 
149  |  |    for short pipelines with slow multiplication.  | 
150  |  |  | 
151  |  |    Possible improvements:  | 
152  |  |  | 
153  |  |    * Perhaps extract the highest NBITS of the divisor instead of the same bits  | 
154  |  |      as from the numerator.  That would require another count_leading_zeros,  | 
155  |  |      and a post-multiply shift of the quotient.  | 
156  |  |  | 
157  |  |    * Compress tables?  Their values are tiny, and there are lots of zero  | 
158  |  |      entries (which are never used).  | 
159  |  |  | 
160  |  |    * Round the table entries more cleverly?  | 
161  |  | */  | 
162  |  |  | 
163  |  | #ifndef NBITS  | 
164  |  | #define NBITS 5  | 
165  |  | #endif  | 
166  |  |  | 
167  |  | #if NBITS == 5  | 
168  |  | /* This needs full division about 13.2% of the time. */  | 
169  |  | static const unsigned char tab[512] = { | 
170  |  | 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
171  |  | 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
172  |  | 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
173  |  | 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,  | 
174  |  | 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,  | 
175  |  | 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,  | 
176  |  | 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,  | 
177  |  | 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,  | 
178  |  | 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,  | 
179  |  | 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,  | 
180  |  | 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,  | 
181  |  | 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,  | 
182  |  | 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,  | 
183  |  | 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,  | 
184  |  | 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,  | 
185  |  | 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  | 
186  |  | };  | 
187  |  | #elif NBITS == 6  | 
188  |  | /* This needs full division about 9.8% of the time. */  | 
189  |  | static const unsigned char tab[2048] = { | 
190  |  | 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
191  |  |  1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
192  |  | 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
193  |  |  1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
194  |  | 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
195  |  |  1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
196  |  | 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
197  |  |  1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
198  |  | 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
199  |  |  1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
200  |  | 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
201  |  |  1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
202  |  | 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
203  |  |  1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
204  |  | 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
205  |  |  1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
206  |  | 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,  | 
207  |  |  1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
208  |  | 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,  | 
209  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
210  |  | 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,  | 
211  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
212  |  | 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,  | 
213  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
214  |  | 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,  | 
215  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
216  |  | 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,  | 
217  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
218  |  | 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,  | 
219  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
220  |  | 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,  | 
221  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
222  |  | 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,  | 
223  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
224  |  | 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,  | 
225  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
226  |  | 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,  | 
227  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
228  |  | 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,  | 
229  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,  | 
230  |  | 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,  | 
231  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,  | 
232  |  | 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,  | 
233  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,  | 
234  |  | 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,  | 
235  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,  | 
236  |  | 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,  | 
237  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,  | 
238  |  | 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,  | 
239  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,  | 
240  |  | 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,  | 
241  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,  | 
242  |  | 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,  | 
243  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,  | 
244  |  | 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,  | 
245  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,  | 
246  |  | 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,  | 
247  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,  | 
248  |  | 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,  | 
249  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,  | 
250  |  | 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,  | 
251  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,  | 
252  |  | 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,  | 
253  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  | 
254  |  | };  | 
255  |  | #else  | 
256  |  | #error No table for provided NBITS  | 
257  |  | #endif  | 
258  |  |  | 
259  |  | /* Doing tabp with a #define makes compiler warnings about pointing outside an  | 
260  |  |    object go away.  We used to define this as a variable.  It is not clear if  | 
261  |  |    e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;  | 
262  |  |    (vector[100] + 10) - 10 surely is and there is no sequence point so the  | 
263  |  |    expressions should be equivalent.  To make this safe, we might want to  | 
264  |  |    define tabp as a macro with the index as an argument.  Depending on the  | 
265  |  |    platform, relocs might allow for assembly-time or linker-time resolution to  | 
266  |  |    take place. */  | 
267  |  | #define tabp (tab - (1 << (NBITS - 1) << NBITS))  | 
268  |  |  | 
269  |  | static inline mp_double_limb_t  | 
270  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
271  |  | { | 
272  |  |   int ncnt;  | 
273  |  |   size_t nbi, dbi;  | 
274  |  |   mp_limb_t q0;  | 
275  |  |   mp_limb_t r0;  | 
276  |  |   mp_limb_t mask;  | 
277  |  |   mp_double_limb_t res;  | 
278  |  |  | 
279  |  |   ASSERT (n0 >= d0);    /* Actually only msb position is critical. */  | 
280  |  |  | 
281  |  |   count_leading_zeros (ncnt, n0);  | 
282  |  |   nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);  | 
283  |  |   dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);  | 
284  |  |  | 
285  |  |   q0 = tabp[(nbi << NBITS) + dbi];  | 
286  |  |   r0 = n0 - q0 * d0;  | 
287  |  |   mask = -(mp_limb_t) (r0 >= d0);  | 
288  |  |   q0 -= mask;  | 
289  |  |   r0 -= d0 & mask;  | 
290  |  |  | 
291  |  |   if (UNLIKELY (r0 >= d0))  | 
292  |  |     { | 
293  |  |       q0 = n0 / d0;  | 
294  |  |       r0 = n0 - q0 * d0;  | 
295  |  |     }  | 
296  |  |  | 
297  |  |   res.d1 = q0;  | 
298  |  |   res.d0 = r0;  | 
299  |  |   return res;  | 
300  |  | }  | 
301  |  |  | 
302  |  | #elif HGCD2_DIV1_METHOD == 5  | 
303  |  |  | 
304  |  | /* Table inverses of divisors.  We don't bother with suppressing the msb from  | 
305  |  |    the tables.  We index with the NBITS most significant divisor bits,  | 
306  |  |    including the always-set highest bit, but use addressing trickery via tabp  | 
307  |  |    to suppress it.  | 
308  |  |  | 
309  |  |    Possible improvements:  | 
310  |  |  | 
311  |  |    * Do first multiply using 32-bit operations on 64-bit computers.  At least  | 
312  |  |      on most Arm64 cores, that uses 3 times less resources.  It also saves on  | 
313  |  |      many x86-64 processors.  | 
314  |  | */  | 
315  |  |  | 
316  |  | #ifndef NBITS  | 
317  |  | #define NBITS 7  | 
318  |  | #endif  | 
319  |  |  | 
320  |  | #if NBITS == 5  | 
321  |  | /* This needs full division about 1.63% of the time. */  | 
322  |  | static const unsigned char tab[16] = { | 
323  |  |  63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32  | 
324  |  | };  | 
325  |  | #elif NBITS == 6  | 
326  |  | /* This needs full division about 0.93% of the time. */  | 
327  |  | static const unsigned char tab[32] = { | 
328  |  | 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,  | 
329  |  |  84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64  | 
330  |  | };  | 
331  |  | #elif NBITS == 7  | 
332  |  | /* This needs full division about 0.49% of the time. */  | 
333  |  | static const unsigned char tab[64] = { | 
334  |  | 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,  | 
335  |  | 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,  | 
336  |  | 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,  | 
337  |  | 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128  | 
338  |  | };  | 
339  |  | #elif NBITS == 8  | 
340  |  | /* This needs full division about 0.26% of the time. */  | 
341  |  | static const unsigned short tab[128] = { | 
342  |  | 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,  | 
343  |  | 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,  | 
344  |  | 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,  | 
345  |  | 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,  | 
346  |  | 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,  | 
347  |  | 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,  | 
348  |  | 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,  | 
349  |  | 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256  | 
350  |  | };  | 
351  |  | #else  | 
352  |  | #error No table for provided NBITS  | 
353  |  | #endif  | 
354  |  |  | 
355  |  | /* Doing tabp with a #define makes compiler warnings about pointing outside an  | 
356  |  |    object go away.  We used to define this as a variable.  It is not clear if  | 
357  |  |    e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;  | 
358  |  |    (vector[100] + 10) - 10 surely is and there is no sequence point so the  | 
359  |  |    expressions should be equivalent.  To make this safe, we might want to  | 
360  |  |    define tabp as a macro with the index as an argument.  Depending on the  | 
361  |  |    platform, relocs might allow for assembly-time or linker-time resolution to  | 
362  |  |    take place. */  | 
363  |  | #define tabp (tab - (1 << (NBITS - 1)))  | 
364  |  |  | 
365  |  | static inline mp_double_limb_t  | 
366  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
367  |  | { | 
368  |  |   int ncnt, dcnt;  | 
369  |  |   size_t dbi;  | 
370  |  |   mp_limb_t inv;  | 
371  |  |   mp_limb_t q0;  | 
372  |  |   mp_limb_t r0;  | 
373  |  |   mp_limb_t mask;  | 
374  |  |   mp_double_limb_t res;  | 
375  |  |  | 
376  |  |   count_leading_zeros (ncnt, n0);  | 
377  |  |   count_leading_zeros (dcnt, d0);  | 
378  |  |  | 
379  |  |   dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);  | 
380  |  |   inv = tabp[dbi];  | 
381  |  |   q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);  | 
382  |  |   r0 = n0 - q0 * d0;  | 
383  |  |   mask = -(mp_limb_t) (r0 >= d0);  | 
384  |  |   q0 -= mask;  | 
385  |  |   r0 -= d0 & mask;  | 
386  |  |  | 
387  |  |   if (UNLIKELY (r0 >= d0))  | 
388  |  |     { | 
389  |  |       q0 = n0 / d0;  | 
390  |  |       r0 = n0 - q0 * d0;  | 
391  |  |     }  | 
392  |  |  | 
393  |  |   res.d1 = q0;  | 
394  |  |   res.d0 = r0;  | 
395  |  |   return res;  | 
396  |  | }  | 
397  |  |  | 
398  |  | #else  | 
399  |  | #error Unknown HGCD2_DIV1_METHOD  | 
400  |  | #endif  | 
401  |  |  | 
402  |  | #if HAVE_NATIVE_mpn_div_22  | 
403  |  |  | 
404  |  | #define div2 mpn_div_22  | 
405  |  | /* Two-limb division optimized for small quotients.  */  | 
406  |  | mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);  | 
407  |  |  | 
408  |  | #elif HGCD2_DIV2_METHOD == 1  | 
409  |  |  | 
410  |  | static mp_limb_t  | 
411  |  | div2 (mp_ptr rp,  | 
412  |  |       mp_limb_t n1, mp_limb_t n0,  | 
413  |  |       mp_limb_t d1, mp_limb_t d0)  | 
414  |  | { | 
415  |  |   mp_double_limb_t rq = div1 (n1, d1);  | 
416  |  |   if (UNLIKELY (rq.d1 > d1))  | 
417  |  |     { | 
418  |  |       mp_limb_t n2, q, t1, t0;  | 
419  |  |       int c;  | 
420  |  |  | 
421  |  |       /* Normalize */  | 
422  |  |       count_leading_zeros (c, d1);  | 
423  |  |       ASSERT (c > 0);  | 
424  |  |  | 
425  |  |       n2 = n1 >> (GMP_LIMB_BITS - c);  | 
426  |  |       n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));  | 
427  |  |       n0 <<= c;  | 
428  |  |       d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));  | 
429  |  |       d0 <<= c;  | 
430  |  |  | 
431  |  |       udiv_qrnnd (q, n1, n2, n1, d1);  | 
432  |  |       umul_ppmm (t1, t0, q, d0);  | 
433  |  |       if (t1 > n1 || (t1 == n1 && t0 > n0))  | 
434  |  |   { | 
435  |  |     ASSERT (q > 0);  | 
436  |  |     q--;  | 
437  |  |     sub_ddmmss (t1, t0, t1, t0, d1, d0);  | 
438  |  |   }  | 
439  |  |       sub_ddmmss (n1, n0, n1, n0, t1, t0);  | 
440  |  |  | 
441  |  |       /* Undo normalization */  | 
442  |  |       rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));  | 
443  |  |       rp[1] = n1 >> c;  | 
444  |  |  | 
445  |  |       return q;  | 
446  |  |     }  | 
447  |  |   else  | 
448  |  |     { | 
449  |  |       mp_limb_t q, t1, t0;  | 
450  |  |       n1 = rq.d0;  | 
451  |  |       q = rq.d1;  | 
452  |  |       umul_ppmm (t1, t0, q, d0);  | 
453  |  |       if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))  | 
454  |  |   { | 
455  |  |     ASSERT (q > 0);  | 
456  |  |     q--;  | 
457  |  |     sub_ddmmss (t1, t0, t1, t0, d1, d0);  | 
458  |  |   }  | 
459  |  |       sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);  | 
460  |  |       return q;  | 
461  |  |     }  | 
462  |  | }  | 
463  |  |  | 
464  |  | #elif HGCD2_DIV2_METHOD == 2  | 
465  |  |  | 
466  |  | /* Bit-wise div2. Relies on fast count_leading_zeros. */  | 
467  |  | static mp_limb_t  | 
468  |  | div2 (mp_ptr rp,  | 
469  |  |       mp_limb_t n1, mp_limb_t n0,  | 
470  |  |       mp_limb_t d1, mp_limb_t d0)  | 
471  | 798k  | { | 
472  | 798k  |   mp_limb_t q = 0;  | 
473  | 798k  |   int ncnt;  | 
474  | 798k  |   int dcnt;  | 
475  |  |  | 
476  | 798k  |   count_leading_zeros (ncnt, n1);  | 
477  | 798k  |   count_leading_zeros (dcnt, d1);  | 
478  | 798k  |   dcnt -= ncnt;  | 
479  |  |  | 
480  | 798k  |   d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));  | 
481  | 798k  |   d0 <<= dcnt;  | 
482  |  |  | 
483  | 798k  |   do  | 
484  | 2.65M  |     { | 
485  | 2.65M  |       mp_limb_t mask;  | 
486  | 2.65M  |       q <<= 1;  | 
487  | 2.65M  |       if (UNLIKELY (n1 == d1))  | 
488  | 1.32k  |   mask = -(n0 >= d0);  | 
489  | 2.65M  |       else  | 
490  | 2.65M  |   mask = -(n1 > d1);  | 
491  |  |  | 
492  | 2.65M  |       q -= mask;  | 
493  |  |  | 
494  | 2.65M  |       sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);  | 
495  |  |  | 
496  | 2.65M  |       d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);  | 
497  | 2.65M  |       d1 = d1 >> 1;  | 
498  | 2.65M  |     }  | 
499  | 2.65M  |   while (dcnt--);  | 
500  |  |  | 
501  | 798k  |   rp[0] = n0;  | 
502  | 798k  |   rp[1] = n1;  | 
503  |  |  | 
504  | 798k  |   return q;  | 
505  | 798k  | }  | 
506  |  | #else  | 
507  |  | #error Unknown HGCD2_DIV2_METHOD  | 
508  |  | #endif  | 
509  |  |  | 
510  |  | /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs  | 
511  |  |    matrix M. Returns 1 if we make progress, i.e. can perform at least  | 
512  |  |    one subtraction. Otherwise returns zero. */  | 
513  |  |  | 
514  |  | /* FIXME: Possible optimizations:  | 
515  |  |  | 
516  |  |    The div2 function starts with checking the most significant bit of  | 
517  |  |    the numerator. We can maintained normalized operands here, call  | 
518  |  |    hgcd with normalized operands only, which should make the code  | 
519  |  |    simpler and possibly faster.  | 
520  |  |  | 
521  |  |    Experiment with table lookups on the most significant bits.  | 
522  |  |  | 
523  |  |    This function is also a candidate for assembler implementation.  | 
524  |  | */  | 
525  |  | int  | 
526  |  | mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,  | 
527  |  |      struct hgcd_matrix1 *M)  | 
528  | 77.3k  | { | 
529  | 77.3k  |   mp_limb_t u00, u01, u10, u11;  | 
530  |  |  | 
531  | 77.3k  |   if (ah < 2 || bh < 2)  | 
532  | 344  |     return 0;  | 
533  |  |  | 
534  | 77.0k  |   if (ah > bh || (ah == bh && al > bl))  | 
535  | 37.5k  |     { | 
536  | 37.5k  |       sub_ddmmss (ah, al, ah, al, bh, bl);  | 
537  | 37.5k  |       if (ah < 2)  | 
538  | 723  |   return 0;  | 
539  |  |  | 
540  | 36.8k  |       u00 = u01 = u11 = 1;  | 
541  | 36.8k  |       u10 = 0;  | 
542  | 36.8k  |     }  | 
543  | 39.4k  |   else  | 
544  | 39.4k  |     { | 
545  | 39.4k  |       sub_ddmmss (bh, bl, bh, bl, ah, al);  | 
546  | 39.4k  |       if (bh < 2)  | 
547  | 1.71k  |   return 0;  | 
548  |  |  | 
549  | 37.7k  |       u00 = u10 = u11 = 1;  | 
550  | 37.7k  |       u01 = 0;  | 
551  | 37.7k  |     }  | 
552  |  |  | 
553  | 74.6k  |   if (ah < bh)  | 
554  | 37.2k  |     goto subtract_a;  | 
555  |  |  | 
556  | 37.3k  |   for (;;)  | 
557  | 700k  |     { | 
558  | 700k  |       ASSERT (ah >= bh);  | 
559  | 700k  |       if (ah == bh)  | 
560  | 617  |   goto done;  | 
561  |  |  | 
562  | 699k  |       if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))  | 
563  | 36.2k  |   { | 
564  | 36.2k  |     ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));  | 
565  | 36.2k  |     bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));  | 
566  |  |  | 
567  | 36.2k  |     break;  | 
568  | 36.2k  |   }  | 
569  |  |  | 
570  |  |       /* Subtract a -= q b, and multiply M from the right by (1 q ; 0  | 
571  |  |    1), affecting the second column of M. */  | 
572  | 663k  |       ASSERT (ah > bh);  | 
573  | 663k  |       sub_ddmmss (ah, al, ah, al, bh, bl);  | 
574  |  |  | 
575  | 663k  |       if (ah < 2)  | 
576  | 31  |   goto done;  | 
577  |  |  | 
578  | 663k  |       if (ah <= bh)  | 
579  | 264k  |   { | 
580  |  |     /* Use q = 1 */  | 
581  | 264k  |     u01 += u00;  | 
582  | 264k  |     u11 += u10;  | 
583  | 264k  |   }  | 
584  | 399k  |       else  | 
585  | 399k  |   { | 
586  | 399k  |     mp_limb_t r[2];  | 
587  | 399k  |     mp_limb_t q = div2 (r, ah, al, bh, bl);  | 
588  | 399k  |     al = r[0]; ah = r[1];  | 
589  | 399k  |     if (ah < 2)  | 
590  | 461  |       { | 
591  |  |         /* A is too small, but q is correct. */  | 
592  | 461  |         u01 += q * u00;  | 
593  | 461  |         u11 += q * u10;  | 
594  | 461  |         goto done;  | 
595  | 461  |       }  | 
596  | 398k  |     q++;  | 
597  | 398k  |     u01 += q * u00;  | 
598  | 398k  |     u11 += q * u10;  | 
599  | 398k  |   }  | 
600  | 700k  |     subtract_a:  | 
601  | 700k  |       ASSERT (bh >= ah);  | 
602  | 700k  |       if (ah == bh)  | 
603  | 560  |   goto done;  | 
604  |  |  | 
605  | 699k  |       if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))  | 
606  | 36.2k  |   { | 
607  | 36.2k  |     ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));  | 
608  | 36.2k  |     bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));  | 
609  |  |  | 
610  | 36.2k  |     goto subtract_a1;  | 
611  | 36.2k  |   }  | 
612  |  |  | 
613  |  |       /* Subtract b -= q a, and multiply M from the right by (1 0 ; q  | 
614  |  |    1), affecting the first column of M. */  | 
615  | 663k  |       sub_ddmmss (bh, bl, bh, bl, ah, al);  | 
616  |  |  | 
617  | 663k  |       if (bh < 2)  | 
618  | 9  |   goto done;  | 
619  |  |  | 
620  | 663k  |       if (bh <= ah)  | 
621  | 264k  |   { | 
622  |  |     /* Use q = 1 */  | 
623  | 264k  |     u00 += u01;  | 
624  | 264k  |     u10 += u11;  | 
625  | 264k  |   }  | 
626  | 399k  |       else  | 
627  | 399k  |   { | 
628  | 399k  |     mp_limb_t r[2];  | 
629  | 399k  |     mp_limb_t q = div2 (r, bh, bl, ah, al);  | 
630  | 399k  |     bl = r[0]; bh = r[1];  | 
631  | 399k  |     if (bh < 2)  | 
632  | 391  |       { | 
633  |  |         /* B is too small, but q is correct. */  | 
634  | 391  |         u00 += q * u01;  | 
635  | 391  |         u10 += q * u11;  | 
636  | 391  |         goto done;  | 
637  | 391  |       }  | 
638  | 398k  |     q++;  | 
639  | 398k  |     u00 += q * u01;  | 
640  | 398k  |     u10 += q * u11;  | 
641  | 398k  |   }  | 
642  | 663k  |     }  | 
643  |  |  | 
644  |  |   /* NOTE: Since we discard the least significant half limb, we don't get a  | 
645  |  |      truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */ | 
646  |  |   /* Single precision loop */  | 
647  | 36.2k  |   for (;;)  | 
648  | 613k  |     { | 
649  | 613k  |       ASSERT (ah >= bh);  | 
650  |  |  | 
651  | 613k  |       ah -= bh;  | 
652  | 613k  |       if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))  | 
653  | 16.6k  |   break;  | 
654  |  |  | 
655  | 596k  |       if (ah <= bh)  | 
656  | 248k  |   { | 
657  |  |     /* Use q = 1 */  | 
658  | 248k  |     u01 += u00;  | 
659  | 248k  |     u11 += u10;  | 
660  | 248k  |   }  | 
661  | 348k  |       else  | 
662  | 348k  |   { | 
663  | 348k  |     mp_double_limb_t rq = div1 (ah, bh);  | 
664  | 348k  |     mp_limb_t q = rq.d1;  | 
665  | 348k  |     ah = rq.d0;  | 
666  |  |  | 
667  | 348k  |     if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))  | 
668  | 19.6k  |       { | 
669  |  |         /* A is too small, but q is correct. */  | 
670  | 19.6k  |         u01 += q * u00;  | 
671  | 19.6k  |         u11 += q * u10;  | 
672  | 19.6k  |         break;  | 
673  | 19.6k  |       }  | 
674  | 328k  |     q++;  | 
675  | 328k  |     u01 += q * u00;  | 
676  | 328k  |     u11 += q * u10;  | 
677  | 328k  |   }  | 
678  | 613k  |     subtract_a1:  | 
679  | 613k  |       ASSERT (bh >= ah);  | 
680  |  |  | 
681  | 613k  |       bh -= ah;  | 
682  | 613k  |       if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))  | 
683  | 16.5k  |   break;  | 
684  |  |  | 
685  | 596k  |       if (bh <= ah)  | 
686  | 249k  |   { | 
687  |  |     /* Use q = 1 */  | 
688  | 249k  |     u00 += u01;  | 
689  | 249k  |     u10 += u11;  | 
690  | 249k  |   }  | 
691  | 347k  |       else  | 
692  | 347k  |   { | 
693  | 347k  |     mp_double_limb_t rq = div1 (bh, ah);  | 
694  | 347k  |     mp_limb_t q = rq.d1;  | 
695  | 347k  |     bh = rq.d0;  | 
696  |  |  | 
697  | 347k  |     if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))  | 
698  | 19.7k  |       { | 
699  |  |         /* B is too small, but q is correct. */  | 
700  | 19.7k  |         u00 += q * u01;  | 
701  | 19.7k  |         u10 += q * u11;  | 
702  | 19.7k  |         break;  | 
703  | 19.7k  |       }  | 
704  | 327k  |     q++;  | 
705  | 327k  |     u00 += q * u01;  | 
706  | 327k  |     u10 += q * u11;  | 
707  | 327k  |   }  | 
708  | 596k  |     }  | 
709  |  |  | 
710  | 74.6k  |  done:  | 
711  | 74.6k  |   M->u[0][0] = u00; M->u[0][1] = u01;  | 
712  | 74.6k  |   M->u[1][0] = u10; M->u[1][1] = u11;  | 
713  |  |  | 
714  | 74.6k  |   return 1;  | 
715  | 36.2k  | }  | 
716  |  |  | 
717  |  | /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must  | 
718  |  |  * have space for n + 1 limbs. Uses three buffers to avoid a copy*/  | 
719  |  | mp_size_t  | 
720  |  | mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,  | 
721  |  |            mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)  | 
722  | 72.9k  | { | 
723  | 72.9k  |   mp_limb_t ah, bh;  | 
724  |  |  | 
725  |  |   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as  | 
726  |  |  | 
727  |  |      r  = u00 * a  | 
728  |  |      r += u10 * b  | 
729  |  |      b *= u11  | 
730  |  |      b += u01 * a  | 
731  |  |   */  | 
732  |  |  | 
733  | 72.9k  | #if HAVE_NATIVE_mpn_addaddmul_1msb0  | 
734  | 72.9k  |   ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);  | 
735  | 72.9k  |   bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);  | 
736  |  | #else  | 
737  |  |   ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);  | 
738  |  |   ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);  | 
739  |  |  | 
740  |  |   bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);  | 
741  |  |   bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);  | 
742  |  | #endif  | 
743  | 72.9k  |   rp[n] = ah;  | 
744  | 72.9k  |   bp[n] = bh;  | 
745  |  |  | 
746  | 72.9k  |   n += (ah | bh) > 0;  | 
747  | 72.9k  |   return n;  | 
748  | 72.9k  | }  |