/src/openssl/crypto/bn/asm/x86_64-gcc.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the OpenSSL license (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include "../bn_lcl.h" |
11 | | #if !(defined(__GNUC__) && __GNUC__>=2) |
12 | | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
13 | | #else |
14 | | /*- |
15 | | * x86_64 BIGNUM accelerator version 0.1, December 2002. |
16 | | * |
17 | | * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL |
18 | | * project. |
19 | | * |
20 | | * Rights for redistribution and usage in source and binary forms are |
21 | | * granted according to the OpenSSL license. Warranty of any kind is |
22 | | * disclaimed. |
23 | | * |
24 | | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
25 | | * versions, like 1.0... |
26 | | * A. Well, that's because this code is basically a quick-n-dirty |
27 | | * proof-of-concept hack. As you can see it's implemented with |
28 | | * inline assembler, which means that you're bound to GCC and that |
29 | | * there might be enough room for further improvement. |
30 | | * |
31 | | * Q. Why inline assembler? |
32 | | * A. x86_64 features its own ABI, which I'm not familiar with. This is |
33 | | * why I decided to let the compiler take care of subroutine |
34 | | * prologue/epilogue as well as register allocation. For reference, |
35 | | * Win64 implements a different ABI for AMD64 than Linux does. |
36 | | * |
37 | | * Q. How much faster does it get? |
38 | | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
39 | | * |
40 | | * sign verify sign/s verify/s |
41 | | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
42 | | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
43 | | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
44 | | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
45 | | * sign verify sign/s verify/s |
46 | | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
47 | | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
48 | | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
49 | | * |
50 | | * 'apps/openssl speed rsa dsa' output with this module: |
51 | | * |
52 | | * sign verify sign/s verify/s |
53 | | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
54 | | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
55 | | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
56 | | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
57 | | * sign verify sign/s verify/s |
58 | | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
59 | | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
60 | | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
61 | | * |
62 | | * For reference, the IA-32 assembler implementation performs |
63 | | * very much like 64-bit code compiled with no-asm on the same |
64 | | * machine. |
65 | | */ |
66 | | |
/*
 * Pick the C type used for one bignum word.  Plain "unsigned long" is
 * used only on non-Windows LP64 targets; every other configuration
 * (Win64, or __LP64__ not defined) gets "unsigned long long".
 */
# if !defined(_WIN64) && defined(__LP64__)
#  define BN_ULONG unsigned long
# else
#  define BN_ULONG unsigned long long
# endif

/* Drop any earlier mul/mul_add definitions before redefining them. */
# undef mul
# undef mul_add
75 | | |
76 | | /*- |
77 | | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; |
78 | | * "g"(0) lets the compiler decide where it |
79 | | * wants to keep the value of zero; |
80 | | */ |
/*
 * mul_add(r, a, word, carry)
 *
 * Forms the 128-bit value a * word + r + carry, stores its low 64 bits
 * back into r and leaves its high 64 bits in carry.  r is updated in
 * place as a memory operand, a is read as a memory operand, and carry
 * must be a modifiable 64-bit lvalue.
 *
 * The __asm__ spelling is used instead of asm so the macro is also
 * accepted when the compiler runs in a strict ISO mode (-ansi,
 * -std=c99, -std=c11), where the plain asm keyword is not available.
 */
# define mul_add(r,a,word,carry) do {                   \
        register BN_ULONG prod_hi,prod_lo;              \
        /* prod_hi:prod_lo = word * a */                \
        __asm__ ("mulq %3"                              \
                : "=a"(prod_lo),"=d"(prod_hi)           \
                : "a"(word),"m"(a)                      \
                : "cc");                                \
        /* carry += prod_lo; carry-out goes to prod_hi */ \
        __asm__ ("addq %2,%0; adcq %3,%1"               \
                : "+r"(carry),"+d"(prod_hi)             \
                : "a"(prod_lo),"g"(0)                   \
                : "cc");                                \
        /* r += carry; carry-out goes to prod_hi */     \
        __asm__ ("addq %2,%0; adcq %3,%1"               \
                : "+m"(r),"+d"(prod_hi)                 \
                : "r"(carry),"g"(0)                     \
                : "cc");                                \
        carry=prod_hi;                                  \
        } while (0)
97 | | |
/*
 * mul(r, a, word, carry)
 *
 * Forms the 128-bit value a * word + carry, assigns its low 64 bits to
 * r and leaves its high 64 bits in carry.
 *
 * The multiplicand uses the "rm" constraint (register or memory).  The
 * former "g" constraint also admitted an immediate, which the one-operand
 * mulq instruction cannot encode, so a constant argument produced
 * assembly that failed to assemble.
 *
 * __asm__ is used instead of asm so the macro also compiles in strict
 * ISO modes (-ansi, -std=c99, -std=c11).
 */
# define mul(r,a,word,carry) do {                       \
        register BN_ULONG prod_hi,prod_lo;              \
        /* prod_hi:prod_lo = word * a */                \
        __asm__ ("mulq %3"                              \
                : "=a"(prod_lo),"=d"(prod_hi)           \
                : "a"(word),"rm"(a)                     \
                : "cc");                                \
        /* carry += prod_lo; carry-out goes to prod_hi */ \
        __asm__ ("addq %2,%0; adcq %3,%1"               \
                : "+r"(carry),"+d"(prod_hi)             \
                : "a"(prod_lo),"g"(0)                   \
                : "cc");                                \
        (r)=carry, carry=prod_hi;                       \
        } while (0)
# undef sqr
/*
 * sqr(r0, r1, a)
 *
 * Squares the 64-bit value a; the low 64 bits of the result are written
 * to r0 and the high 64 bits to r1.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon of its own, so "sqr(...);" is exactly one statement and can
 * be used in an unbraced if/else, like the other macros in this file.
 * __asm__ is used instead of asm so the macro also compiles in strict
 * ISO modes (-ansi, -std=c99, -std=c11).
 */
# define sqr(r0,r1,a) do {              \
        __asm__ ("mulq %2"              \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");                \
        } while (0)
116 | | |
117 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
118 | | BN_ULONG w) |
119 | 241k | { |
120 | 241k | BN_ULONG c1 = 0; |
121 | 241k | |
122 | 241k | if (num <= 0) |
123 | 0 | return c1; |
124 | 241k | |
125 | 27.9M | while (num & ~3) { |
126 | 27.7M | mul_add(rp[0], ap[0], w, c1); |
127 | 27.7M | mul_add(rp[1], ap[1], w, c1); |
128 | 27.7M | mul_add(rp[2], ap[2], w, c1); |
129 | 27.7M | mul_add(rp[3], ap[3], w, c1); |
130 | 27.7M | ap += 4; |
131 | 27.7M | rp += 4; |
132 | 27.7M | num -= 4; |
133 | 27.7M | } |
134 | 241k | if (num) { |
135 | 148k | mul_add(rp[0], ap[0], w, c1); |
136 | 148k | if (--num == 0) |
137 | 50.2k | return c1; |
138 | 98.6k | mul_add(rp[1], ap[1], w, c1); |
139 | 98.6k | if (--num == 0) |
140 | 66.9k | return c1; |
141 | 31.7k | mul_add(rp[2], ap[2], w, c1); |
142 | 31.7k | return c1; |
143 | 31.7k | } |
144 | 93.0k | |
145 | 93.0k | return c1; |
146 | 93.0k | } |
147 | | |
148 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
149 | 95.3k | { |
150 | 95.3k | BN_ULONG c1 = 0; |
151 | 95.3k | |
152 | 95.3k | if (num <= 0) |
153 | 0 | return c1; |
154 | 95.3k | |
155 | 2.54M | while (num & ~3) { |
156 | 2.44M | mul(rp[0], ap[0], w, c1); |
157 | 2.44M | mul(rp[1], ap[1], w, c1); |
158 | 2.44M | mul(rp[2], ap[2], w, c1); |
159 | 2.44M | mul(rp[3], ap[3], w, c1); |
160 | 2.44M | ap += 4; |
161 | 2.44M | rp += 4; |
162 | 2.44M | num -= 4; |
163 | 2.44M | } |
164 | 95.3k | if (num) { |
165 | 73.8k | mul(rp[0], ap[0], w, c1); |
166 | 73.8k | if (--num == 0) |
167 | 40.1k | return c1; |
168 | 33.6k | mul(rp[1], ap[1], w, c1); |
169 | 33.6k | if (--num == 0) |
170 | 21.4k | return c1; |
171 | 12.2k | mul(rp[2], ap[2], w, c1); |
172 | 12.2k | } |
173 | 95.3k | return c1; |
174 | 95.3k | } |
175 | | |
176 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
177 | 0 | { |
178 | 0 | if (n <= 0) |
179 | 0 | return; |
180 | 0 | |
181 | 0 | while (n & ~3) { |
182 | 0 | sqr(r[0], r[1], a[0]); |
183 | 0 | sqr(r[2], r[3], a[1]); |
184 | 0 | sqr(r[4], r[5], a[2]); |
185 | 0 | sqr(r[6], r[7], a[3]); |
186 | 0 | a += 4; |
187 | 0 | r += 8; |
188 | 0 | n -= 4; |
189 | 0 | } |
190 | 0 | if (n) { |
191 | 0 | sqr(r[0], r[1], a[0]); |
192 | 0 | if (--n == 0) |
193 | 0 | return; |
194 | 0 | sqr(r[2], r[3], a[1]); |
195 | 0 | if (--n == 0) |
196 | 0 | return; |
197 | 0 | sqr(r[4], r[5], a[2]); |
198 | 0 | } |
199 | 0 | } |
200 | | |
201 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
202 | 19.6M | { |
203 | 19.6M | BN_ULONG ret, waste; |
204 | 19.6M | |
205 | 19.6M | asm("divq %4":"=a"(ret), "=d"(waste) |
206 | 19.6M | : "a"(l), "d"(h), "r"(d) |
207 | 19.6M | : "cc"); |
208 | 19.6M | |
209 | 19.6M | return ret; |
210 | 19.6M | } |
211 | | |
212 | | BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
213 | | int n) |
214 | 572k | { |
215 | 572k | BN_ULONG ret; |
216 | 572k | size_t i = 0; |
217 | 572k | |
218 | 572k | if (n <= 0) |
219 | 0 | return 0; |
220 | 572k | |
221 | 572k | asm volatile (" subq %0,%0 \n" /* clear carry */ |
222 | 572k | " jmp 1f \n" |
223 | 572k | ".p2align 4 \n" |
224 | 572k | "1: movq (%4,%2,8),%0 \n" |
225 | 572k | " adcq (%5,%2,8),%0 \n" |
226 | 572k | " movq %0,(%3,%2,8) \n" |
227 | 572k | " lea 1(%2),%2 \n" |
228 | 572k | " dec %1 \n" |
229 | 572k | " jnz 1b \n" |
230 | 572k | " sbbq %0,%0 \n" |
231 | 572k | :"=&r" (ret), "+c"(n), "+r"(i) |
232 | 572k | :"r"(rp), "r"(ap), "r"(bp) |
233 | 572k | :"cc", "memory"); |
234 | 572k | |
235 | 572k | return ret & 1; |
236 | 572k | } |
237 | | |
238 | | # ifndef SIMICS |
239 | | BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
240 | | int n) |
241 | 533k | { |
242 | 533k | BN_ULONG ret; |
243 | 533k | size_t i = 0; |
244 | 533k | |
245 | 533k | if (n <= 0) |
246 | 8.27k | return 0; |
247 | 525k | |
248 | 525k | asm volatile (" subq %0,%0 \n" /* clear borrow */ |
249 | 525k | " jmp 1f \n" |
250 | 525k | ".p2align 4 \n" |
251 | 525k | "1: movq (%4,%2,8),%0 \n" |
252 | 525k | " sbbq (%5,%2,8),%0 \n" |
253 | 525k | " movq %0,(%3,%2,8) \n" |
254 | 525k | " lea 1(%2),%2 \n" |
255 | 525k | " dec %1 \n" |
256 | 525k | " jnz 1b \n" |
257 | 525k | " sbbq %0,%0 \n" |
258 | 525k | :"=&r" (ret), "+c"(n), "+r"(i) |
259 | 525k | :"r"(rp), "r"(ap), "r"(bp) |
260 | 525k | :"cc", "memory"); |
261 | 525k | |
262 | 525k | return ret & 1; |
263 | 525k | } |
264 | | # else |
265 | | /* Simics 1.4<7 has buggy sbbq:-( */ |
266 | | # define BN_MASK2 0xffffffffffffffffL |
267 | | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
268 | | { |
269 | | BN_ULONG t1, t2; |
270 | | int c = 0; |
271 | | |
272 | | if (n <= 0) |
273 | | return (BN_ULONG)0; |
274 | | |
275 | | for (;;) { |
276 | | t1 = a[0]; |
277 | | t2 = b[0]; |
278 | | r[0] = (t1 - t2 - c) & BN_MASK2; |
279 | | if (t1 != t2) |
280 | | c = (t1 < t2); |
281 | | if (--n <= 0) |
282 | | break; |
283 | | |
284 | | t1 = a[1]; |
285 | | t2 = b[1]; |
286 | | r[1] = (t1 - t2 - c) & BN_MASK2; |
287 | | if (t1 != t2) |
288 | | c = (t1 < t2); |
289 | | if (--n <= 0) |
290 | | break; |
291 | | |
292 | | t1 = a[2]; |
293 | | t2 = b[2]; |
294 | | r[2] = (t1 - t2 - c) & BN_MASK2; |
295 | | if (t1 != t2) |
296 | | c = (t1 < t2); |
297 | | if (--n <= 0) |
298 | | break; |
299 | | |
300 | | t1 = a[3]; |
301 | | t2 = b[3]; |
302 | | r[3] = (t1 - t2 - c) & BN_MASK2; |
303 | | if (t1 != t2) |
304 | | c = (t1 < t2); |
305 | | if (--n <= 0) |
306 | | break; |
307 | | |
308 | | a += 4; |
309 | | b += 4; |
310 | | r += 4; |
311 | | } |
312 | | return c; |
313 | | } |
314 | | # endif |
315 | | |
316 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
317 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
318 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
319 | | /* |
320 | | * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
321 | | * c=(c2,c1,c0) |
322 | | */ |
323 | | |
324 | | /* |
325 | | * Keep in mind that carrying into high part of multiplication result |
326 | | * can not overflow, because it cannot be all-ones. |
327 | | */ |
328 | | # if 0 |
/* original macros are kept for reference purposes */

/* (c2,c1,c0) += a * b, written with BN_UMULT_LOHI and explicit carries. */
# define mul_add_c(a,b,c0,c1,c2)        do {    \
        BN_ULONG lhs = (a), rhs = (b);          \
        BN_ULONG pl, ph;                        \
        BN_UMULT_LOHI(pl,ph,lhs,rhs);           \
        c0 += pl;                               \
        ph += (c0<pl)?1:0;                      \
        c1 += ph;                               \
        c2 += (c1<ph)?1:0;                      \
        } while(0)

/* (c2,c1,c0) += 2 * a * b: the product is accumulated twice. */
# define mul_add_c2(a,b,c0,c1,c2)       do {    \
        BN_ULONG lhs = (a), rhs = (b);          \
        BN_ULONG pl, ph, ph_first;              \
        BN_UMULT_LOHI(pl,ph,lhs,rhs);           \
        /* first accumulation, on a copy of the high word */ \
        c0 += pl;                               \
        ph_first = ph+((c0<pl)?1:0);            \
        c1 += ph_first;                         \
        c2 += (c1<ph_first)?1:0;                \
        /* second accumulation */               \
        c0 += pl;                               \
        ph += (c0<pl)?1:0;                      \
        c1 += ph;                               \
        c2 += (c1<ph)?1:0;                      \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i]. */
# define sqr_add_c(a,i,c0,c1,c2)        do {    \
        BN_ULONG word = (a)[i];                 \
        BN_ULONG pl, ph;                        \
        BN_UMULT_LOHI(pl,ph,word,word);         \
        c0 += pl;                               \
        ph += (c0<pl)?1:0;                      \
        c1 += ph;                               \
        c2 += (c1<ph)?1:0;                      \
        } while(0)
355 | | # else |
/*
 * mul_add_c(a, b, c0, c1, c2)
 *
 * Adds the double-word product a * b to the three-word accumulator
 * (c2,c1,c0), c0 being the least significant word.  b is used as a
 * memory operand, so it must be an lvalue; c0, c1 and c2 must be
 * modifiable lvalues.
 *
 * __asm__ is used instead of asm so the macro also compiles in strict
 * ISO modes (-ansi, -std=c99, -std=c11).
 */
# define mul_add_c(a,b,c0,c1,c2) do {                   \
        BN_ULONG prod_lo,prod_hi;                       \
        /* prod_hi:prod_lo = a * b */                   \
        __asm__ ("mulq %3"                              \
                : "=a"(prod_lo),"=d"(prod_hi)           \
                : "a"(a),"m"(b)                         \
                : "cc");                                \
        /* three-word add with carry rippling up to c2 */ \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(prod_lo),"r"(prod_hi),"g"(0)      \
                : "cc");                                \
        } while (0)
367 | | |
/*
 * sqr_add_c(a, i, c0, c1, c2)
 *
 * Adds the double-word square of a[i] to the three-word accumulator
 * (c2,c1,c0), c0 being the least significant word.
 *
 * The array argument is parenthesized before indexing, so an expression
 * such as "p + 1" can be passed for a, matching sqr_add_c2.
 * __asm__ is used instead of asm so the macro also compiles in strict
 * ISO modes (-ansi, -std=c99, -std=c11).
 */
# define sqr_add_c(a,i,c0,c1,c2) do {                   \
        BN_ULONG prod_lo,prod_hi;                       \
        /* prod_hi:prod_lo = a[i] * a[i] */             \
        __asm__ ("mulq %2"                              \
                : "=a"(prod_lo),"=d"(prod_hi)           \
                : "a"((a)[i])                           \
                : "cc");                                \
        /* three-word add with carry rippling up to c2 */ \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(prod_lo),"r"(prod_hi),"g"(0)      \
                : "cc");                                \
        } while (0)
379 | | |
/*
 * mul_add_c2(a, b, c0, c1, c2)
 *
 * Adds 2 * a * b to the three-word accumulator (c2,c1,c0), c0 being the
 * least significant word.  The product is computed once and then added
 * to the accumulator twice, rather than being doubled first.  b is used
 * as a memory operand, so it must be an lvalue.
 *
 * __asm__ is used instead of asm so the macro also compiles in strict
 * ISO modes (-ansi, -std=c99, -std=c11).
 */
# define mul_add_c2(a,b,c0,c1,c2) do {                  \
        BN_ULONG prod_lo,prod_hi;                       \
        /* prod_hi:prod_lo = a * b */                   \
        __asm__ ("mulq %3"                              \
                : "=a"(prod_lo),"=d"(prod_hi)           \
                : "a"(a),"m"(b)                         \
                : "cc");                                \
        /* first accumulation */                        \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(prod_lo),"r"(prod_hi),"g"(0)      \
                : "cc");                                \
        /* second accumulation of the same product */   \
        __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(prod_lo),"r"(prod_hi),"g"(0)      \
                : "cc");                                \
        } while (0)
395 | | # endif |
396 | | |
/*
 * sqr_add_c2(a, i, j, c0, c1, c2)
 *
 * Adds 2 * a[i] * a[j] to the three-word accumulator (c2,c1,c0) by
 * forwarding the two array elements to mul_add_c2.
 */
# define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[i],(a)[j],c0,c1,c2)
399 | | |
400 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
401 | 463k | { |
402 | 463k | BN_ULONG c1, c2, c3; |
403 | 463k | |
404 | 463k | c1 = 0; |
405 | 463k | c2 = 0; |
406 | 463k | c3 = 0; |
407 | 463k | mul_add_c(a[0], b[0], c1, c2, c3); |
408 | 463k | r[0] = c1; |
409 | 463k | c1 = 0; |
410 | 463k | mul_add_c(a[0], b[1], c2, c3, c1); |
411 | 463k | mul_add_c(a[1], b[0], c2, c3, c1); |
412 | 463k | r[1] = c2; |
413 | 463k | c2 = 0; |
414 | 463k | mul_add_c(a[2], b[0], c3, c1, c2); |
415 | 463k | mul_add_c(a[1], b[1], c3, c1, c2); |
416 | 463k | mul_add_c(a[0], b[2], c3, c1, c2); |
417 | 463k | r[2] = c3; |
418 | 463k | c3 = 0; |
419 | 463k | mul_add_c(a[0], b[3], c1, c2, c3); |
420 | 463k | mul_add_c(a[1], b[2], c1, c2, c3); |
421 | 463k | mul_add_c(a[2], b[1], c1, c2, c3); |
422 | 463k | mul_add_c(a[3], b[0], c1, c2, c3); |
423 | 463k | r[3] = c1; |
424 | 463k | c1 = 0; |
425 | 463k | mul_add_c(a[4], b[0], c2, c3, c1); |
426 | 463k | mul_add_c(a[3], b[1], c2, c3, c1); |
427 | 463k | mul_add_c(a[2], b[2], c2, c3, c1); |
428 | 463k | mul_add_c(a[1], b[3], c2, c3, c1); |
429 | 463k | mul_add_c(a[0], b[4], c2, c3, c1); |
430 | 463k | r[4] = c2; |
431 | 463k | c2 = 0; |
432 | 463k | mul_add_c(a[0], b[5], c3, c1, c2); |
433 | 463k | mul_add_c(a[1], b[4], c3, c1, c2); |
434 | 463k | mul_add_c(a[2], b[3], c3, c1, c2); |
435 | 463k | mul_add_c(a[3], b[2], c3, c1, c2); |
436 | 463k | mul_add_c(a[4], b[1], c3, c1, c2); |
437 | 463k | mul_add_c(a[5], b[0], c3, c1, c2); |
438 | 463k | r[5] = c3; |
439 | 463k | c3 = 0; |
440 | 463k | mul_add_c(a[6], b[0], c1, c2, c3); |
441 | 463k | mul_add_c(a[5], b[1], c1, c2, c3); |
442 | 463k | mul_add_c(a[4], b[2], c1, c2, c3); |
443 | 463k | mul_add_c(a[3], b[3], c1, c2, c3); |
444 | 463k | mul_add_c(a[2], b[4], c1, c2, c3); |
445 | 463k | mul_add_c(a[1], b[5], c1, c2, c3); |
446 | 463k | mul_add_c(a[0], b[6], c1, c2, c3); |
447 | 463k | r[6] = c1; |
448 | 463k | c1 = 0; |
449 | 463k | mul_add_c(a[0], b[7], c2, c3, c1); |
450 | 463k | mul_add_c(a[1], b[6], c2, c3, c1); |
451 | 463k | mul_add_c(a[2], b[5], c2, c3, c1); |
452 | 463k | mul_add_c(a[3], b[4], c2, c3, c1); |
453 | 463k | mul_add_c(a[4], b[3], c2, c3, c1); |
454 | 463k | mul_add_c(a[5], b[2], c2, c3, c1); |
455 | 463k | mul_add_c(a[6], b[1], c2, c3, c1); |
456 | 463k | mul_add_c(a[7], b[0], c2, c3, c1); |
457 | 463k | r[7] = c2; |
458 | 463k | c2 = 0; |
459 | 463k | mul_add_c(a[7], b[1], c3, c1, c2); |
460 | 463k | mul_add_c(a[6], b[2], c3, c1, c2); |
461 | 463k | mul_add_c(a[5], b[3], c3, c1, c2); |
462 | 463k | mul_add_c(a[4], b[4], c3, c1, c2); |
463 | 463k | mul_add_c(a[3], b[5], c3, c1, c2); |
464 | 463k | mul_add_c(a[2], b[6], c3, c1, c2); |
465 | 463k | mul_add_c(a[1], b[7], c3, c1, c2); |
466 | 463k | r[8] = c3; |
467 | 463k | c3 = 0; |
468 | 463k | mul_add_c(a[2], b[7], c1, c2, c3); |
469 | 463k | mul_add_c(a[3], b[6], c1, c2, c3); |
470 | 463k | mul_add_c(a[4], b[5], c1, c2, c3); |
471 | 463k | mul_add_c(a[5], b[4], c1, c2, c3); |
472 | 463k | mul_add_c(a[6], b[3], c1, c2, c3); |
473 | 463k | mul_add_c(a[7], b[2], c1, c2, c3); |
474 | 463k | r[9] = c1; |
475 | 463k | c1 = 0; |
476 | 463k | mul_add_c(a[7], b[3], c2, c3, c1); |
477 | 463k | mul_add_c(a[6], b[4], c2, c3, c1); |
478 | 463k | mul_add_c(a[5], b[5], c2, c3, c1); |
479 | 463k | mul_add_c(a[4], b[6], c2, c3, c1); |
480 | 463k | mul_add_c(a[3], b[7], c2, c3, c1); |
481 | 463k | r[10] = c2; |
482 | 463k | c2 = 0; |
483 | 463k | mul_add_c(a[4], b[7], c3, c1, c2); |
484 | 463k | mul_add_c(a[5], b[6], c3, c1, c2); |
485 | 463k | mul_add_c(a[6], b[5], c3, c1, c2); |
486 | 463k | mul_add_c(a[7], b[4], c3, c1, c2); |
487 | 463k | r[11] = c3; |
488 | 463k | c3 = 0; |
489 | 463k | mul_add_c(a[7], b[5], c1, c2, c3); |
490 | 463k | mul_add_c(a[6], b[6], c1, c2, c3); |
491 | 463k | mul_add_c(a[5], b[7], c1, c2, c3); |
492 | 463k | r[12] = c1; |
493 | 463k | c1 = 0; |
494 | 463k | mul_add_c(a[6], b[7], c2, c3, c1); |
495 | 463k | mul_add_c(a[7], b[6], c2, c3, c1); |
496 | 463k | r[13] = c2; |
497 | 463k | c2 = 0; |
498 | 463k | mul_add_c(a[7], b[7], c3, c1, c2); |
499 | 463k | r[14] = c3; |
500 | 463k | r[15] = c1; |
501 | 463k | } |
502 | | |
503 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
504 | 0 | { |
505 | 0 | BN_ULONG c1, c2, c3; |
506 | 0 |
|
507 | 0 | c1 = 0; |
508 | 0 | c2 = 0; |
509 | 0 | c3 = 0; |
510 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
511 | 0 | r[0] = c1; |
512 | 0 | c1 = 0; |
513 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
514 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
515 | 0 | r[1] = c2; |
516 | 0 | c2 = 0; |
517 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
518 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
519 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
520 | 0 | r[2] = c3; |
521 | 0 | c3 = 0; |
522 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
523 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
524 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
525 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
526 | 0 | r[3] = c1; |
527 | 0 | c1 = 0; |
528 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
529 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
530 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
531 | 0 | r[4] = c2; |
532 | 0 | c2 = 0; |
533 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
534 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
535 | 0 | r[5] = c3; |
536 | 0 | c3 = 0; |
537 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
538 | 0 | r[6] = c1; |
539 | 0 | r[7] = c2; |
540 | 0 | } |
541 | | |
542 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
543 | 0 | { |
544 | 0 | BN_ULONG c1, c2, c3; |
545 | 0 |
|
546 | 0 | c1 = 0; |
547 | 0 | c2 = 0; |
548 | 0 | c3 = 0; |
549 | 0 | sqr_add_c(a, 0, c1, c2, c3); |
550 | 0 | r[0] = c1; |
551 | 0 | c1 = 0; |
552 | 0 | sqr_add_c2(a, 1, 0, c2, c3, c1); |
553 | 0 | r[1] = c2; |
554 | 0 | c2 = 0; |
555 | 0 | sqr_add_c(a, 1, c3, c1, c2); |
556 | 0 | sqr_add_c2(a, 2, 0, c3, c1, c2); |
557 | 0 | r[2] = c3; |
558 | 0 | c3 = 0; |
559 | 0 | sqr_add_c2(a, 3, 0, c1, c2, c3); |
560 | 0 | sqr_add_c2(a, 2, 1, c1, c2, c3); |
561 | 0 | r[3] = c1; |
562 | 0 | c1 = 0; |
563 | 0 | sqr_add_c(a, 2, c2, c3, c1); |
564 | 0 | sqr_add_c2(a, 3, 1, c2, c3, c1); |
565 | 0 | sqr_add_c2(a, 4, 0, c2, c3, c1); |
566 | 0 | r[4] = c2; |
567 | 0 | c2 = 0; |
568 | 0 | sqr_add_c2(a, 5, 0, c3, c1, c2); |
569 | 0 | sqr_add_c2(a, 4, 1, c3, c1, c2); |
570 | 0 | sqr_add_c2(a, 3, 2, c3, c1, c2); |
571 | 0 | r[5] = c3; |
572 | 0 | c3 = 0; |
573 | 0 | sqr_add_c(a, 3, c1, c2, c3); |
574 | 0 | sqr_add_c2(a, 4, 2, c1, c2, c3); |
575 | 0 | sqr_add_c2(a, 5, 1, c1, c2, c3); |
576 | 0 | sqr_add_c2(a, 6, 0, c1, c2, c3); |
577 | 0 | r[6] = c1; |
578 | 0 | c1 = 0; |
579 | 0 | sqr_add_c2(a, 7, 0, c2, c3, c1); |
580 | 0 | sqr_add_c2(a, 6, 1, c2, c3, c1); |
581 | 0 | sqr_add_c2(a, 5, 2, c2, c3, c1); |
582 | 0 | sqr_add_c2(a, 4, 3, c2, c3, c1); |
583 | 0 | r[7] = c2; |
584 | 0 | c2 = 0; |
585 | 0 | sqr_add_c(a, 4, c3, c1, c2); |
586 | 0 | sqr_add_c2(a, 5, 3, c3, c1, c2); |
587 | 0 | sqr_add_c2(a, 6, 2, c3, c1, c2); |
588 | 0 | sqr_add_c2(a, 7, 1, c3, c1, c2); |
589 | 0 | r[8] = c3; |
590 | 0 | c3 = 0; |
591 | 0 | sqr_add_c2(a, 7, 2, c1, c2, c3); |
592 | 0 | sqr_add_c2(a, 6, 3, c1, c2, c3); |
593 | 0 | sqr_add_c2(a, 5, 4, c1, c2, c3); |
594 | 0 | r[9] = c1; |
595 | 0 | c1 = 0; |
596 | 0 | sqr_add_c(a, 5, c2, c3, c1); |
597 | 0 | sqr_add_c2(a, 6, 4, c2, c3, c1); |
598 | 0 | sqr_add_c2(a, 7, 3, c2, c3, c1); |
599 | 0 | r[10] = c2; |
600 | 0 | c2 = 0; |
601 | 0 | sqr_add_c2(a, 7, 4, c3, c1, c2); |
602 | 0 | sqr_add_c2(a, 6, 5, c3, c1, c2); |
603 | 0 | r[11] = c3; |
604 | 0 | c3 = 0; |
605 | 0 | sqr_add_c(a, 6, c1, c2, c3); |
606 | 0 | sqr_add_c2(a, 7, 5, c1, c2, c3); |
607 | 0 | r[12] = c1; |
608 | 0 | c1 = 0; |
609 | 0 | sqr_add_c2(a, 7, 6, c2, c3, c1); |
610 | 0 | r[13] = c2; |
611 | 0 | c2 = 0; |
612 | 0 | sqr_add_c(a, 7, c3, c1, c2); |
613 | 0 | r[14] = c3; |
614 | 0 | r[15] = c1; |
615 | 0 | } |
616 | | |
617 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
618 | 0 | { |
619 | 0 | BN_ULONG c1, c2, c3; |
620 | 0 |
|
621 | 0 | c1 = 0; |
622 | 0 | c2 = 0; |
623 | 0 | c3 = 0; |
624 | 0 | sqr_add_c(a, 0, c1, c2, c3); |
625 | 0 | r[0] = c1; |
626 | 0 | c1 = 0; |
627 | 0 | sqr_add_c2(a, 1, 0, c2, c3, c1); |
628 | 0 | r[1] = c2; |
629 | 0 | c2 = 0; |
630 | 0 | sqr_add_c(a, 1, c3, c1, c2); |
631 | 0 | sqr_add_c2(a, 2, 0, c3, c1, c2); |
632 | 0 | r[2] = c3; |
633 | 0 | c3 = 0; |
634 | 0 | sqr_add_c2(a, 3, 0, c1, c2, c3); |
635 | 0 | sqr_add_c2(a, 2, 1, c1, c2, c3); |
636 | 0 | r[3] = c1; |
637 | 0 | c1 = 0; |
638 | 0 | sqr_add_c(a, 2, c2, c3, c1); |
639 | 0 | sqr_add_c2(a, 3, 1, c2, c3, c1); |
640 | 0 | r[4] = c2; |
641 | 0 | c2 = 0; |
642 | 0 | sqr_add_c2(a, 3, 2, c3, c1, c2); |
643 | 0 | r[5] = c3; |
644 | 0 | c3 = 0; |
645 | 0 | sqr_add_c(a, 3, c1, c2, c3); |
646 | 0 | r[6] = c1; |
647 | 0 | r[7] = c2; |
648 | 0 | } |
649 | | #endif |