/src/openssl34/crypto/bn/asm/x86_64-gcc.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include "../bn_local.h" |
11 | | #if !(defined(__GNUC__) && __GNUC__ >= 2) |
12 | | /* clang-format off */ |
13 | | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
14 | | /* clang-format on */ |
15 | | #else |
16 | | /*- |
17 | | * x86_64 BIGNUM accelerator version 0.1, December 2002. |
18 | | * |
19 | | * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL |
20 | | * project. |
21 | | * |
22 | | * Rights for redistribution and usage in source and binary forms are |
23 | | * granted according to the License. Warranty of any kind is disclaimed. |
24 | | * |
25 | | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
26 | | * versions, like 1.0... |
27 | | * A. Well, that's because this code is basically a quick-n-dirty |
28 | | * proof-of-concept hack. As you can see it's implemented with |
29 | | * inline assembler, which means that you're bound to GCC and that |
30 | | * there might be enough room for further improvement. |
31 | | * |
32 | | * Q. Why inline assembler? |
33 | | * A. x86_64 features its own ABI, which I'm not familiar with. This is |
34 | | * why I decided to let the compiler take care of subroutine |
35 | | * prologue/epilogue as well as register allocation. For reference, |
36 | | * Win64 implements a different ABI for AMD64 than Linux does. |
37 | | * |
38 | | * Q. How much faster does it get? |
39 | | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
40 | | * |
41 | | * sign verify sign/s verify/s |
42 | | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
43 | | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
44 | | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
45 | | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
46 | | * sign verify sign/s verify/s |
47 | | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
48 | | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
49 | | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
50 | | * |
51 | | * 'apps/openssl speed rsa dsa' output with this module: |
52 | | * |
53 | | * sign verify sign/s verify/s |
54 | | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
55 | | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
56 | | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
57 | | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
58 | | * sign verify sign/s verify/s |
59 | | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
60 | | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
61 | | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
62 | | * |
63 | | * For reference, the IA-32 assembler implementation performs |
64 | | * very much like 64-bit code compiled with no-asm on the same |
65 | | * machine. |
66 | | */ |
67 | | |
68 | | #undef mul |
69 | | #undef mul_add |
70 | | |
71 | | /*- |
72 | | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; |
73 | | * "g"(0) lets the compiler decide where it wants to |
74 | | * keep the value of zero; |
75 | | */ |
76 | | #define mul_add(r, a, word, carry) \ |
77 | 163G | do { /* r += a * word + carry; carry is replaced by the high word of that sum */ \ |
78 | 163G | register BN_ULONG high, low; \ |
79 | 163G | asm("mulq %3" /* high:low = word * a (rax times the memory operand) */ \ |
80 | 163G | : "=a"(low), "=d"(high) \ |
81 | 163G | : "a"(word), "m"(a) \ |
82 | 163G | : "cc"); \ |
83 | 163G | asm("addq %2,%0; adcq %3,%1" /* carry += low; high += carry-out of that add */ \ |
84 | 163G | : "+r"(carry), "+d"(high) \ |
85 | 163G | : "a"(low), "g"(0) \ |
86 | 163G | : "cc"); \ |
87 | 163G | asm("addq %2,%0; adcq %3,%1" /* r += carry; high += carry-out of that add */ \ |
88 | 163G | : "+m"(r), "+d"(high) \ |
89 | 163G | : "r"(carry), "g"(0) \ |
90 | 163G | : "cc"); \ |
91 | 163G | carry = high; /* the high word becomes the carry for the next word */ \ |
92 | 163G | } while (0) |
93 | | |
94 | | #define mul(r, a, word, carry) \ |
95 | 27.0G | do { /* r = low word of a * word + carry; carry is replaced by the high word */ \ |
96 | 27.0G | register BN_ULONG high, low; \ |
97 | 27.0G | asm("mulq %3" /* high:low = word * a */ \ |
98 | 27.0G | : "=a"(low), "=d"(high) \ |
99 | 27.0G | : "a"(word), "g"(a) \ |
100 | 27.0G | : "cc"); \ |
101 | 27.0G | asm("addq %2,%0; adcq %3,%1" /* carry += low; high += carry-out of that add */ \ |
102 | 27.0G | : "+r"(carry), "+d"(high) \ |
103 | 27.0G | : "a"(low), "g"(0) \ |
104 | 27.0G | : "cc"); \ |
105 | 27.0G | (r) = carry, carry = high; /* store the low word, keep the high word as the next carry */ \ |
106 | 27.0G | } while (0) |
107 | | #undef sqr |
108 | | #define sqr(r0, r1, a) /* r1:r0 = a * a; note the expansion already ends in ';' */ \ |
109 | 89.3M | asm("mulq %2" /* operand 2 is the rax input itself, so rax is multiplied by rax */ \ |
110 | 89.3M | : "=a"(r0), "=d"(r1) \ |
111 | 89.3M | : "a"(a) \ |
112 | 89.3M | : "cc"); |
113 | | |
114 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
115 | | BN_ULONG w) |
116 | 213M | { /* rp[i] += ap[i] * w for i in [0, num), carrying between words; returns the final carry word */ |
117 | 213M | BN_ULONG c1 = 0; /* carry threaded through every mul_add step */ |
118 | | /* a non-positive count touches nothing and yields a zero carry */ |
119 | 213M | if (num <= 0) |
120 | 0 | return c1; |
121 | | /* num > 0 here, so (num & ~3) is non-zero exactly while at least four words remain */ |
122 | 41.0G | while (num & ~3) { |
123 | 40.8G | mul_add(rp[0], ap[0], w, c1); |
124 | 40.8G | mul_add(rp[1], ap[1], w, c1); |
125 | 40.8G | mul_add(rp[2], ap[2], w, c1); |
126 | 40.8G | mul_add(rp[3], ap[3], w, c1); |
127 | 40.8G | ap += 4; |
128 | 40.8G | rp += 4; |
129 | 40.8G | num -= 4; |
130 | 40.8G | } |
131 | 213M | if (num) { /* tail of one to three remaining words */ |
132 | 140M | mul_add(rp[0], ap[0], w, c1); |
133 | 140M | if (--num == 0) |
134 | 53.7M | return c1; |
135 | 87.1M | mul_add(rp[1], ap[1], w, c1); |
136 | 87.1M | if (--num == 0) |
137 | 39.0M | return c1; |
138 | 48.0M | mul_add(rp[2], ap[2], w, c1); |
139 | 48.0M | return c1; |
140 | 87.1M | } |
141 | | /* reached only when num was a multiple of four */ |
142 | 72.4M | return c1; |
143 | 213M | } |
144 | | |
145 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
146 | 483M | { /* rp[i] = low word of ap[i] * w + carry for i in [0, num); returns the final carry word */ |
147 | 483M | BN_ULONG c1 = 0; /* carry threaded through every mul step */ |
148 | | /* a non-positive count touches nothing and yields a zero carry */ |
149 | 483M | if (num <= 0) |
150 | 0 | return c1; |
151 | | /* num > 0 here, so (num & ~3) is non-zero exactly while at least four words remain */ |
152 | 7.20G | while (num & ~3) { |
153 | 6.71G | mul(rp[0], ap[0], w, c1); |
154 | 6.71G | mul(rp[1], ap[1], w, c1); |
155 | 6.71G | mul(rp[2], ap[2], w, c1); |
156 | 6.71G | mul(rp[3], ap[3], w, c1); |
157 | 6.71G | ap += 4; |
158 | 6.71G | rp += 4; |
159 | 6.71G | num -= 4; |
160 | 6.71G | } |
161 | 483M | if (num) { /* tail of one to three remaining words */ |
162 | 68.5M | mul(rp[0], ap[0], w, c1); |
163 | 68.5M | if (--num == 0) |
164 | 31.3M | return c1; |
165 | 37.2M | mul(rp[1], ap[1], w, c1); |
166 | 37.2M | if (--num == 0) |
167 | 19.9M | return c1; |
168 | 17.3M | mul(rp[2], ap[2], w, c1); |
169 | 17.3M | } |
170 | 432M | return c1; |
171 | 483M | } |
172 | | |
173 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
174 | 5.14M | { /* for each i in [0, n): r[2*i] = low word of a[i]^2, r[2*i+1] = high word; r receives 2*n words */ |
175 | 5.14M | if (n <= 0) |
176 | 0 | return; |
177 | | /* n > 0 here, so (n & ~3) is non-zero exactly while at least four inputs remain */ |
178 | 25.9M | while (n & ~3) { |
179 | 20.8M | sqr(r[0], r[1], a[0]); |
180 | 20.8M | sqr(r[2], r[3], a[1]); |
181 | 20.8M | sqr(r[4], r[5], a[2]); |
182 | 20.8M | sqr(r[6], r[7], a[3]); |
183 | 20.8M | a += 4; |
184 | 20.8M | r += 8; /* two output words per input word */ |
185 | 20.8M | n -= 4; |
186 | 20.8M | } |
187 | 5.14M | if (n) { /* tail of one to three remaining inputs */ |
188 | 4.96M | sqr(r[0], r[1], a[0]); |
189 | 4.96M | if (--n == 0) |
190 | 4.27M | return; |
191 | 690k | sqr(r[2], r[3], a[1]); |
192 | 690k | if (--n == 0) |
193 | 334k | return; |
194 | 356k | sqr(r[4], r[5], a[2]); |
195 | 356k | } |
196 | 5.14M | } |
197 | | |
198 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
199 | 447M | { /* returns the quotient of the double-word value h:l divided by d */ |
200 | 447M | BN_ULONG ret, waste; /* waste receives the rdx output (the remainder), which is never read */ |
201 | | /* NOTE(review): divq faults when d == 0 or the quotient needs more than one word (h >= d); nothing here checks that - confirm callers guarantee it */ |
202 | 447M | asm("divq %4" : "=a"(ret), "=d"(waste) |
203 | 447M | : "a"(l), "d"(h), "r"(d) |
204 | 447M | : "cc"); |
205 | | |
206 | 447M | return ret; |
207 | 447M | } |
208 | | |
209 | | BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
210 | | int n) |
211 | 758M | { /* rp[i] = ap[i] + bp[i] + carry for i in [0, n); returns the carry out of the last word (0 or 1) */ |
212 | 758M | BN_ULONG ret; |
213 | 758M | size_t i = 0; /* word index; the addressing modes below scale it by 8 */ |
214 | | |
215 | 758M | if (n <= 0) |
216 | 8.77M | return 0; |
217 | | /* lea and dec advance the index and counter without modifying CF, so the adcq chain carries across iterations */ |
218 | 749M | asm volatile(" subq %0,%0 \n" /* clear carry */ |
219 | 749M | " jmp 1f \n" |
220 | 749M | ".p2align 4 \n" /* align the loop head on a 16-byte boundary */ |
221 | 749M | "1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */ |
222 | 749M | " adcq (%5,%2,8),%0 \n" /* ret += bp[i] + CF */ |
223 | 749M | " movq %0,(%3,%2,8) \n" /* rp[i] = ret */ |
224 | 749M | " lea 1(%2),%2 \n" /* i++ */ |
225 | 749M | " dec %1 \n" /* n-- */ |
226 | 749M | " jnz 1b \n" /* repeat until n reaches zero */ |
227 | 749M | " sbbq %0,%0 \n" /* ret = 0 - CF: all-ones if the last add carried, otherwise 0 */ |
228 | 749M | : "=&r"(ret), "+c"(n), "+r"(i) |
229 | 749M | : "r"(rp), "r"(ap), "r"(bp) |
230 | 749M | : "cc", "memory"); |
231 | | |
232 | 749M | return ret & 1; /* reduce the all-ones/zero mask to 1/0 */ |
233 | 758M | } |
234 | | |
235 | | #ifndef SIMICS |
236 | | BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
237 | | int n) |
238 | 754M | { /* rp[i] = ap[i] - bp[i] - borrow for i in [0, n); returns the borrow out of the last word (0 or 1) */ |
239 | 754M | BN_ULONG ret; |
240 | 754M | size_t i = 0; /* word index; the addressing modes below scale it by 8 */ |
241 | | |
242 | 754M | if (n <= 0) |
243 | 8.29M | return 0; |
244 | | /* lea and dec advance the index and counter without modifying CF, so the sbbq chain borrows across iterations */ |
245 | 745M | asm volatile(" subq %0,%0 \n" /* clear borrow */ |
246 | 745M | " jmp 1f \n" |
247 | 745M | ".p2align 4 \n" /* align the loop head on a 16-byte boundary */ |
248 | 745M | "1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */ |
249 | 745M | " sbbq (%5,%2,8),%0 \n" /* ret -= bp[i] + CF */ |
250 | 745M | " movq %0,(%3,%2,8) \n" /* rp[i] = ret */ |
251 | 745M | " lea 1(%2),%2 \n" /* i++ */ |
252 | 745M | " dec %1 \n" /* n-- */ |
253 | 745M | " jnz 1b \n" /* repeat until n reaches zero */ |
254 | 745M | " sbbq %0,%0 \n" /* ret = 0 - CF: all-ones if the last subtract borrowed, otherwise 0 */ |
255 | 745M | : "=&r"(ret), "+c"(n), "+r"(i) |
256 | 745M | : "r"(rp), "r"(ap), "r"(bp) |
257 | 745M | : "cc", "memory"); |
258 | | |
259 | 745M | return ret & 1; /* reduce the all-ones/zero mask to 1/0 */ |
260 | 754M | } |
261 | | #else |
262 | | /* Simics 1.4<7 has buggy sbbq:-( */ |
263 | | #define BN_MASK2 0xffffffffffffffffL |
264 | | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
265 | | { /* plain-C variant built when SIMICS is defined: r[i] = a[i] - b[i] - borrow over n words; returns the final borrow */ |
266 | | BN_ULONG t1, t2; |
267 | | int c = 0; /* running borrow, 0 or 1 */ |
268 | | |
269 | | if (n <= 0) |
270 | | return (BN_ULONG)0; |
271 | | /* the loop body handles four words per pass; every step can leave once n is used up */ |
272 | | for (;;) { |
273 | | t1 = a[0]; |
274 | | t2 = b[0]; |
275 | | r[0] = (t1 - t2 - c) & BN_MASK2; |
276 | | if (t1 != t2) /* equal words keep the incoming borrow unchanged */ |
277 | | c = (t1 < t2); |
278 | | if (--n <= 0) |
279 | | break; |
280 | | |
281 | | t1 = a[1]; |
282 | | t2 = b[1]; |
283 | | r[1] = (t1 - t2 - c) & BN_MASK2; |
284 | | if (t1 != t2) /* same borrow rule as for word 0 */ |
285 | | c = (t1 < t2); |
286 | | if (--n <= 0) |
287 | | break; |
288 | | |
289 | | t1 = a[2]; |
290 | | t2 = b[2]; |
291 | | r[2] = (t1 - t2 - c) & BN_MASK2; |
292 | | if (t1 != t2) /* same borrow rule as for word 0 */ |
293 | | c = (t1 < t2); |
294 | | if (--n <= 0) |
295 | | break; |
296 | | |
297 | | t1 = a[3]; |
298 | | t2 = b[3]; |
299 | | r[3] = (t1 - t2 - c) & BN_MASK2; |
300 | | if (t1 != t2) /* same borrow rule as for word 0 */ |
301 | | c = (t1 < t2); |
302 | | if (--n <= 0) |
303 | | break; |
304 | | /* step all three pointers to the next group of four words */ |
305 | | a += 4; |
306 | | b += 4; |
307 | | r += 4; |
308 | | } |
309 | | return c; |
310 | | } |
311 | | #endif |
312 | | |
313 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
314 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
315 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
316 | | /* |
317 | | * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
318 | | * c=(c2,c1,c0) |
319 | | */ |
320 | | |
321 | | /* |
322 | | * Keep in mind that carrying into high part of multiplication result |
323 | | * can not overflow, because it cannot be all-ones. |
324 | | */ |
325 | | #if 0 |
326 | | /* original macros are kept for reference purposes */ |
327 | | #define mul_add_c(a, b, c0, c1, c2) \ |
328 | | do { /* reference form (inside #if 0): (c2,c1,c0) += a * b */ \ |
329 | | BN_ULONG ta = (a), tb = (b); \ |
330 | | BN_ULONG lo, hi; \ |
331 | | BN_UMULT_LOHI(lo, hi, ta, tb); /* presumably sets hi:lo = ta * tb; defined outside this file - confirm */ \ |
332 | | c0 += lo; \ |
333 | | hi += (c0 < lo) ? 1 : 0; /* c0 wrapped below lo, so carry into the high word */ \ |
334 | | c1 += hi; \ |
335 | | c2 += (c1 < hi) ? 1 : 0; /* c1 wrapped below hi, so carry into c2 */ \ |
336 | | } while (0) |
337 | | |
338 | | #define mul_add_c2(a, b, c0, c1, c2) \ |
339 | | do { /* reference form (inside #if 0): (c2,c1,c0) += 2 * a * b, done as two additions of the product */ \ |
340 | | BN_ULONG ta = (a), tb = (b); \ |
341 | | BN_ULONG lo, hi, tt; \ |
342 | | BN_UMULT_LOHI(lo, hi, ta, tb); /* presumably sets hi:lo = ta * tb; defined outside this file - confirm */ \ |
343 | | c0 += lo; \ |
344 | | tt = hi + ((c0 < lo) ? 1 : 0); /* first pass uses a copy so hi stays intact for the second pass */ \ |
345 | | c1 += tt; \ |
346 | | c2 += (c1 < tt) ? 1 : 0; \ |
347 | | c0 += lo; /* second addition of the same product */ \ |
348 | | hi += (c0 < lo) ? 1 : 0; \ |
349 | | c1 += hi; \ |
350 | | c2 += (c1 < hi) ? 1 : 0; \ |
351 | | } while (0) |
352 | | |
353 | | #define sqr_add_c(a, i, c0, c1, c2) \ |
354 | | do { /* reference form (inside #if 0): (c2,c1,c0) += a[i] * a[i] */ \ |
355 | | BN_ULONG ta = (a)[i]; \ |
356 | | BN_ULONG lo, hi; \ |
357 | | BN_UMULT_LOHI(lo, hi, ta, ta); /* presumably sets hi:lo = ta * ta; defined outside this file - confirm */ \ |
358 | | c0 += lo; \ |
359 | | hi += (c0 < lo) ? 1 : 0; /* c0 wrapped below lo, so carry into the high word */ \ |
360 | | c1 += hi; \ |
361 | | c2 += (c1 < hi) ? 1 : 0; /* c1 wrapped below hi, so carry into c2 */ \ |
362 | | } while (0) |
363 | | #else |
364 | | #define mul_add_c(a, b, c0, c1, c2) \ |
365 | 7.51G | do { /* (c2,c1,c0) += a * b, with c0 the least significant word */ \ |
366 | 7.51G | BN_ULONG t1, t2; \ |
367 | 7.51G | asm("mulq %3" /* t2:t1 = a * b (rax times the memory operand) */ \ |
368 | 7.51G | : "=a"(t1), "=d"(t2) \ |
369 | 7.51G | : "a"(a), "m"(b) \ |
370 | 7.51G | : "cc"); \ |
371 | 7.51G | asm("addq %3,%0; adcq %4,%1; adcq %5,%2" /* c0 += t1; c1 += t2 + CF; c2 += 0 + CF */ \ |
372 | 7.51G | : "+r"(c0), "+r"(c1), "+r"(c2) \ |
373 | 7.51G | : "r"(t1), "r"(t2), "g"(0) \ |
374 | 7.51G | : "cc"); \ |
375 | 7.51G | } while (0) |
376 | | |
377 | | #define sqr_add_c(a, i, c0, c1, c2) \ |
378 | 467M | do { /* (c2,c1,c0) += a[i] * a[i], with c0 the least significant word */ \ |
379 | 467M | BN_ULONG t1, t2; \ |
380 | 467M | asm("mulq %2" /* operand 2 is the rax input itself: t2:t1 = a[i] * a[i] */ \ |
381 | 467M | : "=a"(t1), "=d"(t2) \ |
382 | 467M | : "a"(a[i]) \ |
383 | 467M | : "cc"); \ |
384 | 467M | asm("addq %3,%0; adcq %4,%1; adcq %5,%2" /* c0 += t1; c1 += t2 + CF; c2 += 0 + CF */ \ |
385 | 467M | : "+r"(c0), "+r"(c1), "+r"(c2) \ |
386 | 467M | : "r"(t1), "r"(t2), "g"(0) \ |
387 | 467M | : "cc"); \ |
388 | 467M | } while (0) |
389 | | |
390 | | #define mul_add_c2(a, b, c0, c1, c2) \ |
391 | 915M | do { /* (c2,c1,c0) += 2 * a * b: one multiply, then the product is added twice */ \ |
392 | 915M | BN_ULONG t1, t2; \ |
393 | 915M | asm("mulq %3" /* t2:t1 = a * b (rax times the memory operand) */ \ |
394 | 915M | : "=a"(t1), "=d"(t2) \ |
395 | 915M | : "a"(a), "m"(b) \ |
396 | 915M | : "cc"); \ |
397 | 915M | asm("addq %3,%0; adcq %4,%1; adcq %5,%2" /* first addition of t2:t1 into the accumulator */ \ |
398 | 915M | : "+r"(c0), "+r"(c1), "+r"(c2) \ |
399 | 915M | : "r"(t1), "r"(t2), "g"(0) \ |
400 | 915M | : "cc"); \ |
401 | 915M | asm("addq %3,%0; adcq %4,%1; adcq %5,%2" /* second addition of the same t2:t1 */ \ |
402 | 915M | : "+r"(c0), "+r"(c1), "+r"(c2) \ |
403 | 915M | : "r"(t1), "r"(t2), "g"(0) \ |
404 | 915M | : "cc"); \ |
405 | 915M | } while (0) |
406 | | #endif |
407 | | |
408 | | #define sqr_add_c2(a, i, j, c0, c1, c2) /* (c2,c1,c0) += 2 * a[i] * a[j] */ \ |
409 | 915M | mul_add_c2((a)[i], (a)[j], c0, c1, c2) |
410 | | |
411 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
412 | 117M | { /* r[0..15] = a[0..7] * b[0..7], produced one result word at a time */ |
413 | 117M | BN_ULONG c1, c2, c3; |
414 | | /* c1, c2, c3 act as a rotating three-word accumulator: result word k sums every a[i] * b[j] with i + j == k */ |
415 | 117M | c1 = 0; |
416 | 117M | c2 = 0; |
417 | 117M | c3 = 0; |
418 | 117M | mul_add_c(a[0], b[0], c1, c2, c3); |
419 | 117M | r[0] = c1; |
420 | 117M | c1 = 0; /* the word just stored is cleared and reused as the top accumulator word */ |
421 | 117M | mul_add_c(a[0], b[1], c2, c3, c1); |
422 | 117M | mul_add_c(a[1], b[0], c2, c3, c1); |
423 | 117M | r[1] = c2; |
424 | 117M | c2 = 0; |
425 | 117M | mul_add_c(a[2], b[0], c3, c1, c2); |
426 | 117M | mul_add_c(a[1], b[1], c3, c1, c2); |
427 | 117M | mul_add_c(a[0], b[2], c3, c1, c2); |
428 | 117M | r[2] = c3; |
429 | 117M | c3 = 0; |
430 | 117M | mul_add_c(a[0], b[3], c1, c2, c3); |
431 | 117M | mul_add_c(a[1], b[2], c1, c2, c3); |
432 | 117M | mul_add_c(a[2], b[1], c1, c2, c3); |
433 | 117M | mul_add_c(a[3], b[0], c1, c2, c3); |
434 | 117M | r[3] = c1; |
435 | 117M | c1 = 0; |
436 | 117M | mul_add_c(a[4], b[0], c2, c3, c1); |
437 | 117M | mul_add_c(a[3], b[1], c2, c3, c1); |
438 | 117M | mul_add_c(a[2], b[2], c2, c3, c1); |
439 | 117M | mul_add_c(a[1], b[3], c2, c3, c1); |
440 | 117M | mul_add_c(a[0], b[4], c2, c3, c1); |
441 | 117M | r[4] = c2; |
442 | 117M | c2 = 0; |
443 | 117M | mul_add_c(a[0], b[5], c3, c1, c2); |
444 | 117M | mul_add_c(a[1], b[4], c3, c1, c2); |
445 | 117M | mul_add_c(a[2], b[3], c3, c1, c2); |
446 | 117M | mul_add_c(a[3], b[2], c3, c1, c2); |
447 | 117M | mul_add_c(a[4], b[1], c3, c1, c2); |
448 | 117M | mul_add_c(a[5], b[0], c3, c1, c2); |
449 | 117M | r[5] = c3; |
450 | 117M | c3 = 0; |
451 | 117M | mul_add_c(a[6], b[0], c1, c2, c3); |
452 | 117M | mul_add_c(a[5], b[1], c1, c2, c3); |
453 | 117M | mul_add_c(a[4], b[2], c1, c2, c3); |
454 | 117M | mul_add_c(a[3], b[3], c1, c2, c3); |
455 | 117M | mul_add_c(a[2], b[4], c1, c2, c3); |
456 | 117M | mul_add_c(a[1], b[5], c1, c2, c3); |
457 | 117M | mul_add_c(a[0], b[6], c1, c2, c3); |
458 | 117M | r[6] = c1; |
459 | 117M | c1 = 0; |
460 | 117M | mul_add_c(a[0], b[7], c2, c3, c1); /* result word 7 is the widest: eight products */ |
461 | 117M | mul_add_c(a[1], b[6], c2, c3, c1); |
462 | 117M | mul_add_c(a[2], b[5], c2, c3, c1); |
463 | 117M | mul_add_c(a[3], b[4], c2, c3, c1); |
464 | 117M | mul_add_c(a[4], b[3], c2, c3, c1); |
465 | 117M | mul_add_c(a[5], b[2], c2, c3, c1); |
466 | 117M | mul_add_c(a[6], b[1], c2, c3, c1); |
467 | 117M | mul_add_c(a[7], b[0], c2, c3, c1); |
468 | 117M | r[7] = c2; |
469 | 117M | c2 = 0; |
470 | 117M | mul_add_c(a[7], b[1], c3, c1, c2); |
471 | 117M | mul_add_c(a[6], b[2], c3, c1, c2); |
472 | 117M | mul_add_c(a[5], b[3], c3, c1, c2); |
473 | 117M | mul_add_c(a[4], b[4], c3, c1, c2); |
474 | 117M | mul_add_c(a[3], b[5], c3, c1, c2); |
475 | 117M | mul_add_c(a[2], b[6], c3, c1, c2); |
476 | 117M | mul_add_c(a[1], b[7], c3, c1, c2); |
477 | 117M | r[8] = c3; |
478 | 117M | c3 = 0; |
479 | 117M | mul_add_c(a[2], b[7], c1, c2, c3); |
480 | 117M | mul_add_c(a[3], b[6], c1, c2, c3); |
481 | 117M | mul_add_c(a[4], b[5], c1, c2, c3); |
482 | 117M | mul_add_c(a[5], b[4], c1, c2, c3); |
483 | 117M | mul_add_c(a[6], b[3], c1, c2, c3); |
484 | 117M | mul_add_c(a[7], b[2], c1, c2, c3); |
485 | 117M | r[9] = c1; |
486 | 117M | c1 = 0; |
487 | 117M | mul_add_c(a[7], b[3], c2, c3, c1); |
488 | 117M | mul_add_c(a[6], b[4], c2, c3, c1); |
489 | 117M | mul_add_c(a[5], b[5], c2, c3, c1); |
490 | 117M | mul_add_c(a[4], b[6], c2, c3, c1); |
491 | 117M | mul_add_c(a[3], b[7], c2, c3, c1); |
492 | 117M | r[10] = c2; |
493 | 117M | c2 = 0; |
494 | 117M | mul_add_c(a[4], b[7], c3, c1, c2); |
495 | 117M | mul_add_c(a[5], b[6], c3, c1, c2); |
496 | 117M | mul_add_c(a[6], b[5], c3, c1, c2); |
497 | 117M | mul_add_c(a[7], b[4], c3, c1, c2); |
498 | 117M | r[11] = c3; |
499 | 117M | c3 = 0; |
500 | 117M | mul_add_c(a[7], b[5], c1, c2, c3); |
501 | 117M | mul_add_c(a[6], b[6], c1, c2, c3); |
502 | 117M | mul_add_c(a[5], b[7], c1, c2, c3); |
503 | 117M | r[12] = c1; |
504 | 117M | c1 = 0; |
505 | 117M | mul_add_c(a[6], b[7], c2, c3, c1); |
506 | 117M | mul_add_c(a[7], b[6], c2, c3, c1); |
507 | 117M | r[13] = c2; |
508 | 117M | c2 = 0; |
509 | 117M | mul_add_c(a[7], b[7], c3, c1, c2); |
510 | 117M | r[14] = c3; |
511 | 117M | r[15] = c1; /* the middle accumulator word left after the last product is the top result word */ |
512 | 117M | } |
513 | | |
514 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
515 | 0 | { /* r[0..7] = a[0..3] * b[0..3], produced one result word at a time */ |
516 | 0 | BN_ULONG c1, c2, c3; /* rotating three-word accumulator: result word k sums every a[i] * b[j] with i + j == k */ |
517 | |
|
518 | 0 | c1 = 0; |
519 | 0 | c2 = 0; |
520 | 0 | c3 = 0; |
521 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
522 | 0 | r[0] = c1; |
523 | 0 | c1 = 0; /* the word just stored is cleared and reused as the top accumulator word */ |
524 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
525 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
526 | 0 | r[1] = c2; |
527 | 0 | c2 = 0; |
528 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
529 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
530 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
531 | 0 | r[2] = c3; |
532 | 0 | c3 = 0; |
533 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); /* result word 3 is the widest: four products */ |
534 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
535 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
536 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
537 | 0 | r[3] = c1; |
538 | 0 | c1 = 0; |
539 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
540 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
541 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
542 | 0 | r[4] = c2; |
543 | 0 | c2 = 0; |
544 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
545 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
546 | 0 | r[5] = c3; |
547 | 0 | c3 = 0; |
548 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
549 | 0 | r[6] = c1; |
550 | 0 | r[7] = c2; /* the middle accumulator word left after the last product is the top result word */ |
551 | 0 | } |
552 | | |
553 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
554 | 13.3M | { /* r[0..15] = a[0..7] squared, produced one result word at a time */ |
555 | 13.3M | BN_ULONG c1, c2, c3; |
556 | | /* sqr_add_c adds a[i]^2; sqr_add_c2 adds a[i] * a[j] twice, which covers both the (i,j) and (j,i) terms */ |
557 | 13.3M | c1 = 0; |
558 | 13.3M | c2 = 0; |
559 | 13.3M | c3 = 0; |
560 | 13.3M | sqr_add_c(a, 0, c1, c2, c3); |
561 | 13.3M | r[0] = c1; |
562 | 13.3M | c1 = 0; /* the word just stored is cleared and reused as the top accumulator word */ |
563 | 13.3M | sqr_add_c2(a, 1, 0, c2, c3, c1); |
564 | 13.3M | r[1] = c2; |
565 | 13.3M | c2 = 0; |
566 | 13.3M | sqr_add_c(a, 1, c3, c1, c2); |
567 | 13.3M | sqr_add_c2(a, 2, 0, c3, c1, c2); |
568 | 13.3M | r[2] = c3; |
569 | 13.3M | c3 = 0; |
570 | 13.3M | sqr_add_c2(a, 3, 0, c1, c2, c3); |
571 | 13.3M | sqr_add_c2(a, 2, 1, c1, c2, c3); |
572 | 13.3M | r[3] = c1; |
573 | 13.3M | c1 = 0; |
574 | 13.3M | sqr_add_c(a, 2, c2, c3, c1); |
575 | 13.3M | sqr_add_c2(a, 3, 1, c2, c3, c1); |
576 | 13.3M | sqr_add_c2(a, 4, 0, c2, c3, c1); |
577 | 13.3M | r[4] = c2; |
578 | 13.3M | c2 = 0; |
579 | 13.3M | sqr_add_c2(a, 5, 0, c3, c1, c2); |
580 | 13.3M | sqr_add_c2(a, 4, 1, c3, c1, c2); |
581 | 13.3M | sqr_add_c2(a, 3, 2, c3, c1, c2); |
582 | 13.3M | r[5] = c3; |
583 | 13.3M | c3 = 0; |
584 | 13.3M | sqr_add_c(a, 3, c1, c2, c3); |
585 | 13.3M | sqr_add_c2(a, 4, 2, c1, c2, c3); |
586 | 13.3M | sqr_add_c2(a, 5, 1, c1, c2, c3); |
587 | 13.3M | sqr_add_c2(a, 6, 0, c1, c2, c3); |
588 | 13.3M | r[6] = c1; |
589 | 13.3M | c1 = 0; |
590 | 13.3M | sqr_add_c2(a, 7, 0, c2, c3, c1); |
591 | 13.3M | sqr_add_c2(a, 6, 1, c2, c3, c1); |
592 | 13.3M | sqr_add_c2(a, 5, 2, c2, c3, c1); |
593 | 13.3M | sqr_add_c2(a, 4, 3, c2, c3, c1); |
594 | 13.3M | r[7] = c2; |
595 | 13.3M | c2 = 0; |
596 | 13.3M | sqr_add_c(a, 4, c3, c1, c2); |
597 | 13.3M | sqr_add_c2(a, 5, 3, c3, c1, c2); |
598 | 13.3M | sqr_add_c2(a, 6, 2, c3, c1, c2); |
599 | 13.3M | sqr_add_c2(a, 7, 1, c3, c1, c2); |
600 | 13.3M | r[8] = c3; |
601 | 13.3M | c3 = 0; |
602 | 13.3M | sqr_add_c2(a, 7, 2, c1, c2, c3); |
603 | 13.3M | sqr_add_c2(a, 6, 3, c1, c2, c3); |
604 | 13.3M | sqr_add_c2(a, 5, 4, c1, c2, c3); |
605 | 13.3M | r[9] = c1; |
606 | 13.3M | c1 = 0; |
607 | 13.3M | sqr_add_c(a, 5, c2, c3, c1); |
608 | 13.3M | sqr_add_c2(a, 6, 4, c2, c3, c1); |
609 | 13.3M | sqr_add_c2(a, 7, 3, c2, c3, c1); |
610 | 13.3M | r[10] = c2; |
611 | 13.3M | c2 = 0; |
612 | 13.3M | sqr_add_c2(a, 7, 4, c3, c1, c2); |
613 | 13.3M | sqr_add_c2(a, 6, 5, c3, c1, c2); |
614 | 13.3M | r[11] = c3; |
615 | 13.3M | c3 = 0; |
616 | 13.3M | sqr_add_c(a, 6, c1, c2, c3); |
617 | 13.3M | sqr_add_c2(a, 7, 5, c1, c2, c3); |
618 | 13.3M | r[12] = c1; |
619 | 13.3M | c1 = 0; |
620 | 13.3M | sqr_add_c2(a, 7, 6, c2, c3, c1); |
621 | 13.3M | r[13] = c2; |
622 | 13.3M | c2 = 0; |
623 | 13.3M | sqr_add_c(a, 7, c3, c1, c2); |
624 | 13.3M | r[14] = c3; |
625 | 13.3M | r[15] = c1; /* the middle accumulator word left after the last square is the top result word */ |
626 | 13.3M | } |
627 | | |
628 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
629 | 90.0M | { /* r[0..7] = a[0..3] squared, produced one result word at a time */ |
630 | 90.0M | BN_ULONG c1, c2, c3; |
631 | | /* sqr_add_c adds a[i]^2; sqr_add_c2 adds a[i] * a[j] twice, which covers both the (i,j) and (j,i) terms */ |
632 | 90.0M | c1 = 0; |
633 | 90.0M | c2 = 0; |
634 | 90.0M | c3 = 0; |
635 | 90.0M | sqr_add_c(a, 0, c1, c2, c3); |
636 | 90.0M | r[0] = c1; |
637 | 90.0M | c1 = 0; /* the word just stored is cleared and reused as the top accumulator word */ |
638 | 90.0M | sqr_add_c2(a, 1, 0, c2, c3, c1); |
639 | 90.0M | r[1] = c2; |
640 | 90.0M | c2 = 0; |
641 | 90.0M | sqr_add_c(a, 1, c3, c1, c2); |
642 | 90.0M | sqr_add_c2(a, 2, 0, c3, c1, c2); |
643 | 90.0M | r[2] = c3; |
644 | 90.0M | c3 = 0; |
645 | 90.0M | sqr_add_c2(a, 3, 0, c1, c2, c3); |
646 | 90.0M | sqr_add_c2(a, 2, 1, c1, c2, c3); |
647 | 90.0M | r[3] = c1; |
648 | 90.0M | c1 = 0; |
649 | 90.0M | sqr_add_c(a, 2, c2, c3, c1); |
650 | 90.0M | sqr_add_c2(a, 3, 1, c2, c3, c1); |
651 | 90.0M | r[4] = c2; |
652 | 90.0M | c2 = 0; |
653 | 90.0M | sqr_add_c2(a, 3, 2, c3, c1, c2); |
654 | 90.0M | r[5] = c3; |
655 | 90.0M | c3 = 0; |
656 | 90.0M | sqr_add_c(a, 3, c1, c2, c3); |
657 | 90.0M | r[6] = c1; |
658 | 90.0M | r[7] = c2; /* the middle accumulator word left after the last square is the top result word */ |
659 | 90.0M | } |
660 | | #endif |