/src/openssl/crypto/bn/asm/x86_64-gcc.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.  | 
3  |  |  *  | 
4  |  |  * Licensed under the Apache License 2.0 (the "License").  You may not use  | 
5  |  |  * this file except in compliance with the License.  You can obtain a copy  | 
6  |  |  * in the file LICENSE in the source distribution or at  | 
7  |  |  * https://www.openssl.org/source/license.html  | 
8  |  |  */  | 
9  |  |  | 
10  |  | #include "../bn_local.h"  | 
11  |  | #if !(defined(__GNUC__) && __GNUC__>=2)  | 
12  |  | # include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */  | 
13  |  | #else  | 
14  |  | /*-  | 
15  |  |  * x86_64 BIGNUM accelerator version 0.1, December 2002.  | 
16  |  |  *  | 
17  |  |  * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL  | 
18  |  |  * project.  | 
19  |  |  *  | 
20  |  |  * Rights for redistribution and usage in source and binary forms are  | 
21  |  |  * granted according to the License. Warranty of any kind is disclaimed.  | 
22  |  |  *  | 
23  |  |  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real  | 
24  |  |  *    versions, like 1.0...  | 
25  |  |  * A. Well, that's because this code is basically a quick-n-dirty  | 
26  |  |  *    proof-of-concept hack. As you can see it's implemented with  | 
27  |  |  *    inline assembler, which means that you're bound to GCC and that  | 
28  |  |  *    there might be enough room for further improvement.  | 
29  |  |  *  | 
30  |  |  * Q. Why inline assembler?  | 
31  |  |  * A. x86_64 features its own ABI, which I'm not familiar with. This is  | 
32  |  |  *    why I decided to let the compiler take care of subroutine  | 
33  |  |  *    prologue/epilogue as well as register allocation. For reference,  | 
34  |  |  *    Win64 implements a different ABI for AMD64 than Linux does.  | 
35  |  |  *  | 
36  |  |  * Q. How much faster does it get?  | 
37  |  |  * A. 'apps/openssl speed rsa dsa' output with no-asm:  | 
38  |  |  *  | 
39  |  |  *                        sign    verify    sign/s verify/s  | 
40  |  |  *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2  | 
41  |  |  *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0  | 
42  |  |  *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8  | 
43  |  |  *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6  | 
44  |  |  *                        sign    verify    sign/s verify/s  | 
45  |  |  *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3  | 
46  |  |  *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2  | 
47  |  |  *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0  | 
48  |  |  *  | 
49  |  |  *    'apps/openssl speed rsa dsa' output with this module:  | 
50  |  |  *  | 
51  |  |  *                        sign    verify    sign/s verify/s  | 
52  |  |  *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9  | 
53  |  |  *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7  | 
54  |  |  *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0  | 
55  |  |  *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8  | 
56  |  |  *                        sign    verify    sign/s verify/s  | 
57  |  |  *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3  | 
58  |  |  *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4  | 
59  |  |  *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6  | 
60  |  |  *  | 
61  |  |  *    For the reference. IA-32 assembler implementation performs  | 
62  |  |  *    very much like 64-bit code compiled with no-asm on the same  | 
63  |  |  *    machine.  | 
64  |  |  */  | 
65  |  |  | 
66  |  | # undef mul  | 
67  |  | # undef mul_add  | 
68  |  |  | 
69  |  | /*-  | 
70  |  |  * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;  | 
71  |  |  * "g"(0)               lets the compiler decide where it wants  | 
72  |  |  *                      to keep the value of zero;  | 
73  |  |  */  | 
/*
 * mul_add(r,a,word,carry): r += a * word + carry (low word), and carry
 * receives the high word of that sum.
 *
 *   r      lvalue in memory, read and updated in place
 *   a      memory operand multiplied by word
 *   word   multiplier, placed in the "a" register for mulq
 *   carry  lvalue: incoming carry on entry, outgoing carry on exit
 *
 * Declares locals named high and low, so arguments must not use those
 * names.  Spelled __asm__ rather than asm so the macro still compiles when
 * the compiler runs in a strict ISO mode (-ansi, -std=c99, -std=c11),
 * where the plain asm keyword is not recognised.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        __asm__ ("mulq %3"              \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)
90  |  |  | 
/*
 * mul(r,a,word,carry): r = low word of (a * word + carry), and carry
 * receives the high word.
 *
 *   r      lvalue that is overwritten (not accumulated into)
 *   a      operand multiplied by word
 *   word   multiplier, placed in the "a" register for mulq
 *   carry  lvalue: incoming carry on entry, outgoing carry on exit
 *
 * Declares locals named high and low, so arguments must not use those
 * names.  Spelled __asm__ rather than asm so the macro still compiles when
 * the compiler runs in a strict ISO mode (-ansi, -std=c99, -std=c11).
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        __asm__ ("mulq %3"              \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        __asm__ ("addq %2,%0; adcq %3,%1" \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
103  |  | # undef sqr  | 
/*
 * sqr(r0,r1,a): (r1,r0) = a * a, the full double-word square; r0 gets the
 * low word and r1 the high word.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon of its own, so "sqr(...);" is exactly one statement and is
 * safe inside an unbraced if/else.  Spelled __asm__ rather than asm so it
 * still compiles in strict ISO modes (-ansi, -std=c99, -std=c11).
 */
# define sqr(r0,r1,a) do {              \
        __asm__ ("mulq %2"              \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");                \
        } while (0)
109  |  |  | 
110  |  | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,  | 
111  |  |                           BN_ULONG w)  | 
112  | 0  | { | 
113  | 0  |     BN_ULONG c1 = 0;  | 
114  |  | 
  | 
115  | 0  |     if (num <= 0)  | 
116  | 0  |         return c1;  | 
117  |  |  | 
118  | 0  |     while (num & ~3) { | 
119  | 0  |         mul_add(rp[0], ap[0], w, c1);  | 
120  | 0  |         mul_add(rp[1], ap[1], w, c1);  | 
121  | 0  |         mul_add(rp[2], ap[2], w, c1);  | 
122  | 0  |         mul_add(rp[3], ap[3], w, c1);  | 
123  | 0  |         ap += 4;  | 
124  | 0  |         rp += 4;  | 
125  | 0  |         num -= 4;  | 
126  | 0  |     }  | 
127  | 0  |     if (num) { | 
128  | 0  |         mul_add(rp[0], ap[0], w, c1);  | 
129  | 0  |         if (--num == 0)  | 
130  | 0  |             return c1;  | 
131  | 0  |         mul_add(rp[1], ap[1], w, c1);  | 
132  | 0  |         if (--num == 0)  | 
133  | 0  |             return c1;  | 
134  | 0  |         mul_add(rp[2], ap[2], w, c1);  | 
135  | 0  |         return c1;  | 
136  | 0  |     }  | 
137  |  |  | 
138  | 0  |     return c1;  | 
139  | 0  | }  | 
140  |  |  | 
141  |  | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)  | 
142  | 0  | { | 
143  | 0  |     BN_ULONG c1 = 0;  | 
144  |  | 
  | 
145  | 0  |     if (num <= 0)  | 
146  | 0  |         return c1;  | 
147  |  |  | 
148  | 0  |     while (num & ~3) { | 
149  | 0  |         mul(rp[0], ap[0], w, c1);  | 
150  | 0  |         mul(rp[1], ap[1], w, c1);  | 
151  | 0  |         mul(rp[2], ap[2], w, c1);  | 
152  | 0  |         mul(rp[3], ap[3], w, c1);  | 
153  | 0  |         ap += 4;  | 
154  | 0  |         rp += 4;  | 
155  | 0  |         num -= 4;  | 
156  | 0  |     }  | 
157  | 0  |     if (num) { | 
158  | 0  |         mul(rp[0], ap[0], w, c1);  | 
159  | 0  |         if (--num == 0)  | 
160  | 0  |             return c1;  | 
161  | 0  |         mul(rp[1], ap[1], w, c1);  | 
162  | 0  |         if (--num == 0)  | 
163  | 0  |             return c1;  | 
164  | 0  |         mul(rp[2], ap[2], w, c1);  | 
165  | 0  |     }  | 
166  | 0  |     return c1;  | 
167  | 0  | }  | 
168  |  |  | 
169  |  | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)  | 
170  | 0  | { | 
171  | 0  |     if (n <= 0)  | 
172  | 0  |         return;  | 
173  |  |  | 
174  | 0  |     while (n & ~3) { | 
175  | 0  |         sqr(r[0], r[1], a[0]);  | 
176  | 0  |         sqr(r[2], r[3], a[1]);  | 
177  | 0  |         sqr(r[4], r[5], a[2]);  | 
178  | 0  |         sqr(r[6], r[7], a[3]);  | 
179  | 0  |         a += 4;  | 
180  | 0  |         r += 8;  | 
181  | 0  |         n -= 4;  | 
182  | 0  |     }  | 
183  | 0  |     if (n) { | 
184  | 0  |         sqr(r[0], r[1], a[0]);  | 
185  | 0  |         if (--n == 0)  | 
186  | 0  |             return;  | 
187  | 0  |         sqr(r[2], r[3], a[1]);  | 
188  | 0  |         if (--n == 0)  | 
189  | 0  |             return;  | 
190  | 0  |         sqr(r[4], r[5], a[2]);  | 
191  | 0  |     }  | 
192  | 0  | }  | 
193  |  |  | 
194  |  | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)  | 
195  | 0  | { | 
196  | 0  |     BN_ULONG ret, waste;  | 
197  |  | 
  | 
198  | 0  |  asm("divq      %4":"=a"(ret), "=d"(waste) | 
199  | 0  |  :     "a"(l), "d"(h), "r"(d)  | 
200  | 0  |  :     "cc");  | 
201  |  | 
  | 
202  | 0  |     return ret;  | 
203  | 0  | }  | 
204  |  |  | 
205  |  | BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,  | 
206  |  |                       int n)  | 
207  | 0  | { | 
208  | 0  |     BN_ULONG ret;  | 
209  | 0  |     size_t i = 0;  | 
210  |  | 
  | 
211  | 0  |     if (n <= 0)  | 
212  | 0  |         return 0;  | 
213  |  |  | 
214  | 0  |     asm volatile ("       subq    %0,%0           \n" /* clear carry */ | 
215  | 0  |                   "       jmp     1f              \n"  | 
216  | 0  |                   ".p2align 4                     \n"  | 
217  | 0  |                   "1:     movq    (%4,%2,8),%0    \n"  | 
218  | 0  |                   "       adcq    (%5,%2,8),%0    \n"  | 
219  | 0  |                   "       movq    %0,(%3,%2,8)    \n"  | 
220  | 0  |                   "       lea     1(%2),%2        \n"  | 
221  | 0  |                   "       dec     %1              \n"  | 
222  | 0  |                   "       jnz     1b              \n"  | 
223  | 0  |                   "       sbbq    %0,%0           \n"  | 
224  | 0  |                   :"=&r" (ret), "+c"(n), "+r"(i)  | 
225  | 0  |                   :"r"(rp), "r"(ap), "r"(bp)  | 
226  | 0  |                   :"cc", "memory");  | 
227  |  | 
  | 
228  | 0  |     return ret & 1;  | 
229  | 0  | }  | 
230  |  |  | 
231  |  | # ifndef SIMICS  | 
232  |  | BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,  | 
233  |  |                       int n)  | 
234  | 0  | { | 
235  | 0  |     BN_ULONG ret;  | 
236  | 0  |     size_t i = 0;  | 
237  |  | 
  | 
238  | 0  |     if (n <= 0)  | 
239  | 0  |         return 0;  | 
240  |  |  | 
241  | 0  |     asm volatile ("       subq    %0,%0           \n" /* clear borrow */ | 
242  | 0  |                   "       jmp     1f              \n"  | 
243  | 0  |                   ".p2align 4                     \n"  | 
244  | 0  |                   "1:     movq    (%4,%2,8),%0    \n"  | 
245  | 0  |                   "       sbbq    (%5,%2,8),%0    \n"  | 
246  | 0  |                   "       movq    %0,(%3,%2,8)    \n"  | 
247  | 0  |                   "       lea     1(%2),%2        \n"  | 
248  | 0  |                   "       dec     %1              \n"  | 
249  | 0  |                   "       jnz     1b              \n"  | 
250  | 0  |                   "       sbbq    %0,%0           \n"  | 
251  | 0  |                   :"=&r" (ret), "+c"(n), "+r"(i)  | 
252  | 0  |                   :"r"(rp), "r"(ap), "r"(bp)  | 
253  | 0  |                   :"cc", "memory");  | 
254  |  | 
  | 
255  | 0  |     return ret & 1;  | 
256  | 0  | }  | 
257  |  | # else  | 
258  |  | /* Simics 1.4<7 has buggy sbbq:-( */  | 
259  |  | #  define BN_MASK2 0xffffffffffffffffL  | 
260  |  | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)  | 
261  |  | { | 
262  |  |     BN_ULONG t1, t2;  | 
263  |  |     int c = 0;  | 
264  |  |  | 
265  |  |     if (n <= 0)  | 
266  |  |         return (BN_ULONG)0;  | 
267  |  |  | 
268  |  |     for (;;) { | 
269  |  |         t1 = a[0];  | 
270  |  |         t2 = b[0];  | 
271  |  |         r[0] = (t1 - t2 - c) & BN_MASK2;  | 
272  |  |         if (t1 != t2)  | 
273  |  |             c = (t1 < t2);  | 
274  |  |         if (--n <= 0)  | 
275  |  |             break;  | 
276  |  |  | 
277  |  |         t1 = a[1];  | 
278  |  |         t2 = b[1];  | 
279  |  |         r[1] = (t1 - t2 - c) & BN_MASK2;  | 
280  |  |         if (t1 != t2)  | 
281  |  |             c = (t1 < t2);  | 
282  |  |         if (--n <= 0)  | 
283  |  |             break;  | 
284  |  |  | 
285  |  |         t1 = a[2];  | 
286  |  |         t2 = b[2];  | 
287  |  |         r[2] = (t1 - t2 - c) & BN_MASK2;  | 
288  |  |         if (t1 != t2)  | 
289  |  |             c = (t1 < t2);  | 
290  |  |         if (--n <= 0)  | 
291  |  |             break;  | 
292  |  |  | 
293  |  |         t1 = a[3];  | 
294  |  |         t2 = b[3];  | 
295  |  |         r[3] = (t1 - t2 - c) & BN_MASK2;  | 
296  |  |         if (t1 != t2)  | 
297  |  |             c = (t1 < t2);  | 
298  |  |         if (--n <= 0)  | 
299  |  |             break;  | 
300  |  |  | 
301  |  |         a += 4;  | 
302  |  |         b += 4;  | 
303  |  |         r += 4;  | 
304  |  |     }  | 
305  |  |     return c;  | 
306  |  | }  | 
307  |  | # endif  | 
308  |  |  | 
309  |  | /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */  | 
310  |  | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */  | 
311  |  | /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */  | 
312  |  | /*  | 
313  |  |  * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number  | 
314  |  |  * c=(c2,c1,c0)  | 
315  |  |  */  | 
316  |  |  | 
317  |  | /*  | 
318  |  |  * Keep in mind that carrying into high part of multiplication result  | 
319  |  |  * can not overflow, because it cannot be all-ones.  | 
320  |  |  */  | 
321  |  | # if 0  | 
322  |  | /* original macros are kept for reference purposes */  | 
/* Reference form: (c2,c1,c0) += a * b. */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo, hi, ta, tb);          \
        c0 += lo;                               \
        hi += (c0 < lo) ? 1 : 0;                \
        c1 += hi;                               \
        c2 += (c1 < hi) ? 1 : 0;                \
        } while(0)

/* Reference form: (c2,c1,c0) += 2 * a * b, done as two separate additions. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo, hi, ta, tb);          \
        c0 += lo;                               \
        tt = hi + ((c0 < lo) ? 1 : 0);          \
        c1 += tt;                               \
        c2 += (c1 < tt) ? 1 : 0;                \
        c0 += lo;                               \
        hi += (c0 < lo) ? 1 : 0;                \
        c1 += hi;                               \
        c2 += (c1 < hi) ? 1 : 0;                \
        } while(0)

/* Reference form: (c2,c1,c0) += a[i] * a[i]. */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo, hi, ta, ta);          \
        c0 += lo;                               \
        hi += (c0 < lo) ? 1 : 0;                \
        c1 += hi;                               \
        c2 += (c1 < hi) ? 1 : 0;                \
        } while(0)
348  |  | # else  | 
/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a * b, where (c2,c1,c0) is a
 * three-word accumulator with c0 the least significant word.
 *
 * a is placed in the "a" register for mulq; b must be a memory operand.
 * Declares locals named t1 and t2, so arguments must not use those names.
 * Spelled __asm__ rather than asm so the macro still compiles in strict
 * ISO modes (-ansi, -std=c99, -std=c11).
 */
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %3"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
360  |  |  | 
/*
 * sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i] * a[i], where (c2,c1,c0) is
 * a three-word accumulator with c0 the least significant word.
 *
 * The array argument is parenthesised before indexing, as sqr_add_c2
 * does, so an expression argument indexes correctly.  Declares locals
 * named t1 and t2, so arguments must not use those names.  Spelled
 * __asm__ rather than asm so the macro still compiles in strict ISO modes
 * (-ansi, -std=c99, -std=c11).
 */
#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %2"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"((a)[i])           \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
372  |  |  | 
/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2 * a * b, where (c2,c1,c0) is a
 * three-word accumulator with c0 the least significant word.
 *
 * The product is computed once and then added to the accumulator twice,
 * rather than being doubled first.  a is placed in the "a" register for
 * mulq; b must be a memory operand.  Declares locals named t1 and t2, so
 * arguments must not use those names.  Spelled __asm__ rather than asm so
 * the macro still compiles in strict ISO modes (-ansi, -std=c99,
 * -std=c11).
 */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        __asm__ ("mulq %3"              \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
388  |  | # endif  | 
389  |  |  | 
/*
 * sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2 * a[i] * a[j]; a thin
 * wrapper that forwards the two array elements to mul_add_c2.
 */
# define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[i],(a)[j],c0,c1,c2)
392  |  |  | 
393  |  | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)  | 
394  | 0  | { | 
395  | 0  |     BN_ULONG c1, c2, c3;  | 
396  |  | 
  | 
397  | 0  |     c1 = 0;  | 
398  | 0  |     c2 = 0;  | 
399  | 0  |     c3 = 0;  | 
400  | 0  |     mul_add_c(a[0], b[0], c1, c2, c3);  | 
401  | 0  |     r[0] = c1;  | 
402  | 0  |     c1 = 0;  | 
403  | 0  |     mul_add_c(a[0], b[1], c2, c3, c1);  | 
404  | 0  |     mul_add_c(a[1], b[0], c2, c3, c1);  | 
405  | 0  |     r[1] = c2;  | 
406  | 0  |     c2 = 0;  | 
407  | 0  |     mul_add_c(a[2], b[0], c3, c1, c2);  | 
408  | 0  |     mul_add_c(a[1], b[1], c3, c1, c2);  | 
409  | 0  |     mul_add_c(a[0], b[2], c3, c1, c2);  | 
410  | 0  |     r[2] = c3;  | 
411  | 0  |     c3 = 0;  | 
412  | 0  |     mul_add_c(a[0], b[3], c1, c2, c3);  | 
413  | 0  |     mul_add_c(a[1], b[2], c1, c2, c3);  | 
414  | 0  |     mul_add_c(a[2], b[1], c1, c2, c3);  | 
415  | 0  |     mul_add_c(a[3], b[0], c1, c2, c3);  | 
416  | 0  |     r[3] = c1;  | 
417  | 0  |     c1 = 0;  | 
418  | 0  |     mul_add_c(a[4], b[0], c2, c3, c1);  | 
419  | 0  |     mul_add_c(a[3], b[1], c2, c3, c1);  | 
420  | 0  |     mul_add_c(a[2], b[2], c2, c3, c1);  | 
421  | 0  |     mul_add_c(a[1], b[3], c2, c3, c1);  | 
422  | 0  |     mul_add_c(a[0], b[4], c2, c3, c1);  | 
423  | 0  |     r[4] = c2;  | 
424  | 0  |     c2 = 0;  | 
425  | 0  |     mul_add_c(a[0], b[5], c3, c1, c2);  | 
426  | 0  |     mul_add_c(a[1], b[4], c3, c1, c2);  | 
427  | 0  |     mul_add_c(a[2], b[3], c3, c1, c2);  | 
428  | 0  |     mul_add_c(a[3], b[2], c3, c1, c2);  | 
429  | 0  |     mul_add_c(a[4], b[1], c3, c1, c2);  | 
430  | 0  |     mul_add_c(a[5], b[0], c3, c1, c2);  | 
431  | 0  |     r[5] = c3;  | 
432  | 0  |     c3 = 0;  | 
433  | 0  |     mul_add_c(a[6], b[0], c1, c2, c3);  | 
434  | 0  |     mul_add_c(a[5], b[1], c1, c2, c3);  | 
435  | 0  |     mul_add_c(a[4], b[2], c1, c2, c3);  | 
436  | 0  |     mul_add_c(a[3], b[3], c1, c2, c3);  | 
437  | 0  |     mul_add_c(a[2], b[4], c1, c2, c3);  | 
438  | 0  |     mul_add_c(a[1], b[5], c1, c2, c3);  | 
439  | 0  |     mul_add_c(a[0], b[6], c1, c2, c3);  | 
440  | 0  |     r[6] = c1;  | 
441  | 0  |     c1 = 0;  | 
442  | 0  |     mul_add_c(a[0], b[7], c2, c3, c1);  | 
443  | 0  |     mul_add_c(a[1], b[6], c2, c3, c1);  | 
444  | 0  |     mul_add_c(a[2], b[5], c2, c3, c1);  | 
445  | 0  |     mul_add_c(a[3], b[4], c2, c3, c1);  | 
446  | 0  |     mul_add_c(a[4], b[3], c2, c3, c1);  | 
447  | 0  |     mul_add_c(a[5], b[2], c2, c3, c1);  | 
448  | 0  |     mul_add_c(a[6], b[1], c2, c3, c1);  | 
449  | 0  |     mul_add_c(a[7], b[0], c2, c3, c1);  | 
450  | 0  |     r[7] = c2;  | 
451  | 0  |     c2 = 0;  | 
452  | 0  |     mul_add_c(a[7], b[1], c3, c1, c2);  | 
453  | 0  |     mul_add_c(a[6], b[2], c3, c1, c2);  | 
454  | 0  |     mul_add_c(a[5], b[3], c3, c1, c2);  | 
455  | 0  |     mul_add_c(a[4], b[4], c3, c1, c2);  | 
456  | 0  |     mul_add_c(a[3], b[5], c3, c1, c2);  | 
457  | 0  |     mul_add_c(a[2], b[6], c3, c1, c2);  | 
458  | 0  |     mul_add_c(a[1], b[7], c3, c1, c2);  | 
459  | 0  |     r[8] = c3;  | 
460  | 0  |     c3 = 0;  | 
461  | 0  |     mul_add_c(a[2], b[7], c1, c2, c3);  | 
462  | 0  |     mul_add_c(a[3], b[6], c1, c2, c3);  | 
463  | 0  |     mul_add_c(a[4], b[5], c1, c2, c3);  | 
464  | 0  |     mul_add_c(a[5], b[4], c1, c2, c3);  | 
465  | 0  |     mul_add_c(a[6], b[3], c1, c2, c3);  | 
466  | 0  |     mul_add_c(a[7], b[2], c1, c2, c3);  | 
467  | 0  |     r[9] = c1;  | 
468  | 0  |     c1 = 0;  | 
469  | 0  |     mul_add_c(a[7], b[3], c2, c3, c1);  | 
470  | 0  |     mul_add_c(a[6], b[4], c2, c3, c1);  | 
471  | 0  |     mul_add_c(a[5], b[5], c2, c3, c1);  | 
472  | 0  |     mul_add_c(a[4], b[6], c2, c3, c1);  | 
473  | 0  |     mul_add_c(a[3], b[7], c2, c3, c1);  | 
474  | 0  |     r[10] = c2;  | 
475  | 0  |     c2 = 0;  | 
476  | 0  |     mul_add_c(a[4], b[7], c3, c1, c2);  | 
477  | 0  |     mul_add_c(a[5], b[6], c3, c1, c2);  | 
478  | 0  |     mul_add_c(a[6], b[5], c3, c1, c2);  | 
479  | 0  |     mul_add_c(a[7], b[4], c3, c1, c2);  | 
480  | 0  |     r[11] = c3;  | 
481  | 0  |     c3 = 0;  | 
482  | 0  |     mul_add_c(a[7], b[5], c1, c2, c3);  | 
483  | 0  |     mul_add_c(a[6], b[6], c1, c2, c3);  | 
484  | 0  |     mul_add_c(a[5], b[7], c1, c2, c3);  | 
485  | 0  |     r[12] = c1;  | 
486  | 0  |     c1 = 0;  | 
487  | 0  |     mul_add_c(a[6], b[7], c2, c3, c1);  | 
488  | 0  |     mul_add_c(a[7], b[6], c2, c3, c1);  | 
489  | 0  |     r[13] = c2;  | 
490  | 0  |     c2 = 0;  | 
491  | 0  |     mul_add_c(a[7], b[7], c3, c1, c2);  | 
492  | 0  |     r[14] = c3;  | 
493  | 0  |     r[15] = c1;  | 
494  | 0  | }  | 
495  |  |  | 
496  |  | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)  | 
497  | 0  | { | 
498  | 0  |     BN_ULONG c1, c2, c3;  | 
499  |  | 
  | 
500  | 0  |     c1 = 0;  | 
501  | 0  |     c2 = 0;  | 
502  | 0  |     c3 = 0;  | 
503  | 0  |     mul_add_c(a[0], b[0], c1, c2, c3);  | 
504  | 0  |     r[0] = c1;  | 
505  | 0  |     c1 = 0;  | 
506  | 0  |     mul_add_c(a[0], b[1], c2, c3, c1);  | 
507  | 0  |     mul_add_c(a[1], b[0], c2, c3, c1);  | 
508  | 0  |     r[1] = c2;  | 
509  | 0  |     c2 = 0;  | 
510  | 0  |     mul_add_c(a[2], b[0], c3, c1, c2);  | 
511  | 0  |     mul_add_c(a[1], b[1], c3, c1, c2);  | 
512  | 0  |     mul_add_c(a[0], b[2], c3, c1, c2);  | 
513  | 0  |     r[2] = c3;  | 
514  | 0  |     c3 = 0;  | 
515  | 0  |     mul_add_c(a[0], b[3], c1, c2, c3);  | 
516  | 0  |     mul_add_c(a[1], b[2], c1, c2, c3);  | 
517  | 0  |     mul_add_c(a[2], b[1], c1, c2, c3);  | 
518  | 0  |     mul_add_c(a[3], b[0], c1, c2, c3);  | 
519  | 0  |     r[3] = c1;  | 
520  | 0  |     c1 = 0;  | 
521  | 0  |     mul_add_c(a[3], b[1], c2, c3, c1);  | 
522  | 0  |     mul_add_c(a[2], b[2], c2, c3, c1);  | 
523  | 0  |     mul_add_c(a[1], b[3], c2, c3, c1);  | 
524  | 0  |     r[4] = c2;  | 
525  | 0  |     c2 = 0;  | 
526  | 0  |     mul_add_c(a[2], b[3], c3, c1, c2);  | 
527  | 0  |     mul_add_c(a[3], b[2], c3, c1, c2);  | 
528  | 0  |     r[5] = c3;  | 
529  | 0  |     c3 = 0;  | 
530  | 0  |     mul_add_c(a[3], b[3], c1, c2, c3);  | 
531  | 0  |     r[6] = c1;  | 
532  | 0  |     r[7] = c2;  | 
533  | 0  | }  | 
534  |  |  | 
535  |  | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)  | 
536  | 0  | { | 
537  | 0  |     BN_ULONG c1, c2, c3;  | 
538  |  | 
  | 
539  | 0  |     c1 = 0;  | 
540  | 0  |     c2 = 0;  | 
541  | 0  |     c3 = 0;  | 
542  | 0  |     sqr_add_c(a, 0, c1, c2, c3);  | 
543  | 0  |     r[0] = c1;  | 
544  | 0  |     c1 = 0;  | 
545  | 0  |     sqr_add_c2(a, 1, 0, c2, c3, c1);  | 
546  | 0  |     r[1] = c2;  | 
547  | 0  |     c2 = 0;  | 
548  | 0  |     sqr_add_c(a, 1, c3, c1, c2);  | 
549  | 0  |     sqr_add_c2(a, 2, 0, c3, c1, c2);  | 
550  | 0  |     r[2] = c3;  | 
551  | 0  |     c3 = 0;  | 
552  | 0  |     sqr_add_c2(a, 3, 0, c1, c2, c3);  | 
553  | 0  |     sqr_add_c2(a, 2, 1, c1, c2, c3);  | 
554  | 0  |     r[3] = c1;  | 
555  | 0  |     c1 = 0;  | 
556  | 0  |     sqr_add_c(a, 2, c2, c3, c1);  | 
557  | 0  |     sqr_add_c2(a, 3, 1, c2, c3, c1);  | 
558  | 0  |     sqr_add_c2(a, 4, 0, c2, c3, c1);  | 
559  | 0  |     r[4] = c2;  | 
560  | 0  |     c2 = 0;  | 
561  | 0  |     sqr_add_c2(a, 5, 0, c3, c1, c2);  | 
562  | 0  |     sqr_add_c2(a, 4, 1, c3, c1, c2);  | 
563  | 0  |     sqr_add_c2(a, 3, 2, c3, c1, c2);  | 
564  | 0  |     r[5] = c3;  | 
565  | 0  |     c3 = 0;  | 
566  | 0  |     sqr_add_c(a, 3, c1, c2, c3);  | 
567  | 0  |     sqr_add_c2(a, 4, 2, c1, c2, c3);  | 
568  | 0  |     sqr_add_c2(a, 5, 1, c1, c2, c3);  | 
569  | 0  |     sqr_add_c2(a, 6, 0, c1, c2, c3);  | 
570  | 0  |     r[6] = c1;  | 
571  | 0  |     c1 = 0;  | 
572  | 0  |     sqr_add_c2(a, 7, 0, c2, c3, c1);  | 
573  | 0  |     sqr_add_c2(a, 6, 1, c2, c3, c1);  | 
574  | 0  |     sqr_add_c2(a, 5, 2, c2, c3, c1);  | 
575  | 0  |     sqr_add_c2(a, 4, 3, c2, c3, c1);  | 
576  | 0  |     r[7] = c2;  | 
577  | 0  |     c2 = 0;  | 
578  | 0  |     sqr_add_c(a, 4, c3, c1, c2);  | 
579  | 0  |     sqr_add_c2(a, 5, 3, c3, c1, c2);  | 
580  | 0  |     sqr_add_c2(a, 6, 2, c3, c1, c2);  | 
581  | 0  |     sqr_add_c2(a, 7, 1, c3, c1, c2);  | 
582  | 0  |     r[8] = c3;  | 
583  | 0  |     c3 = 0;  | 
584  | 0  |     sqr_add_c2(a, 7, 2, c1, c2, c3);  | 
585  | 0  |     sqr_add_c2(a, 6, 3, c1, c2, c3);  | 
586  | 0  |     sqr_add_c2(a, 5, 4, c1, c2, c3);  | 
587  | 0  |     r[9] = c1;  | 
588  | 0  |     c1 = 0;  | 
589  | 0  |     sqr_add_c(a, 5, c2, c3, c1);  | 
590  | 0  |     sqr_add_c2(a, 6, 4, c2, c3, c1);  | 
591  | 0  |     sqr_add_c2(a, 7, 3, c2, c3, c1);  | 
592  | 0  |     r[10] = c2;  | 
593  | 0  |     c2 = 0;  | 
594  | 0  |     sqr_add_c2(a, 7, 4, c3, c1, c2);  | 
595  | 0  |     sqr_add_c2(a, 6, 5, c3, c1, c2);  | 
596  | 0  |     r[11] = c3;  | 
597  | 0  |     c3 = 0;  | 
598  | 0  |     sqr_add_c(a, 6, c1, c2, c3);  | 
599  | 0  |     sqr_add_c2(a, 7, 5, c1, c2, c3);  | 
600  | 0  |     r[12] = c1;  | 
601  | 0  |     c1 = 0;  | 
602  | 0  |     sqr_add_c2(a, 7, 6, c2, c3, c1);  | 
603  | 0  |     r[13] = c2;  | 
604  | 0  |     c2 = 0;  | 
605  | 0  |     sqr_add_c(a, 7, c3, c1, c2);  | 
606  | 0  |     r[14] = c3;  | 
607  | 0  |     r[15] = c1;  | 
608  | 0  | }  | 
609  |  |  | 
610  |  | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)  | 
611  | 0  | { | 
612  | 0  |     BN_ULONG c1, c2, c3;  | 
613  |  | 
  | 
614  | 0  |     c1 = 0;  | 
615  | 0  |     c2 = 0;  | 
616  | 0  |     c3 = 0;  | 
617  | 0  |     sqr_add_c(a, 0, c1, c2, c3);  | 
618  | 0  |     r[0] = c1;  | 
619  | 0  |     c1 = 0;  | 
620  | 0  |     sqr_add_c2(a, 1, 0, c2, c3, c1);  | 
621  | 0  |     r[1] = c2;  | 
622  | 0  |     c2 = 0;  | 
623  | 0  |     sqr_add_c(a, 1, c3, c1, c2);  | 
624  | 0  |     sqr_add_c2(a, 2, 0, c3, c1, c2);  | 
625  | 0  |     r[2] = c3;  | 
626  | 0  |     c3 = 0;  | 
627  | 0  |     sqr_add_c2(a, 3, 0, c1, c2, c3);  | 
628  | 0  |     sqr_add_c2(a, 2, 1, c1, c2, c3);  | 
629  | 0  |     r[3] = c1;  | 
630  | 0  |     c1 = 0;  | 
631  | 0  |     sqr_add_c(a, 2, c2, c3, c1);  | 
632  | 0  |     sqr_add_c2(a, 3, 1, c2, c3, c1);  | 
633  | 0  |     r[4] = c2;  | 
634  | 0  |     c2 = 0;  | 
635  | 0  |     sqr_add_c2(a, 3, 2, c3, c1, c2);  | 
636  | 0  |     r[5] = c3;  | 
637  | 0  |     c3 = 0;  | 
638  | 0  |     sqr_add_c(a, 3, c1, c2, c3);  | 
639  | 0  |     r[6] = c1;  | 
640  | 0  |     r[7] = c2;  | 
641  | 0  | }  | 
642  |  | #endif  |