/src/openssl111/crypto/bn/asm/x86_64-gcc.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the OpenSSL license (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include "../bn_local.h" |
11 | | #if !(defined(__GNUC__) && __GNUC__>=2) |
12 | | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
13 | | #else |
14 | | /*- |
15 | | * x86_64 BIGNUM accelerator version 0.1, December 2002. |
16 | | * |
17 | | * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL |
18 | | * project. |
19 | | * |
20 | | * Rights for redistribution and usage in source and binary forms are |
21 | | * granted according to the OpenSSL license. Warranty of any kind is |
22 | | * disclaimed. |
23 | | * |
24 | | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
25 | | * versions, like 1.0... |
26 | | * A. Well, that's because this code is basically a quick-n-dirty |
27 | | * proof-of-concept hack. As you can see it's implemented with |
28 | | * inline assembler, which means that you're bound to GCC and that |
29 | | * there might be enough room for further improvement. |
30 | | * |
31 | | * Q. Why inline assembler? |
32 | | * A. x86_64 features its own ABI, which I'm not familiar with. This |
33 | | * is why I decided to let the compiler take care of subroutine |
34 | | * prologue/epilogue as well as register allocation. For reference, |
35 | | * Win64 implements an ABI for AMD64 that differs from the Linux one. |
36 | | * |
37 | | * Q. How much faster does it get? |
38 | | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
39 | | * |
40 | | * sign verify sign/s verify/s |
41 | | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
42 | | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
43 | | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
44 | | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
45 | | * sign verify sign/s verify/s |
46 | | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
47 | | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
48 | | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
49 | | * |
50 | | * 'apps/openssl speed rsa dsa' output with this module: |
51 | | * |
52 | | * sign verify sign/s verify/s |
53 | | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
54 | | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
55 | | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
56 | | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
57 | | * sign verify sign/s verify/s |
58 | | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
59 | | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
60 | | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
61 | | * |
62 | | * For reference, the IA-32 assembler implementation performs |
63 | | * very much like 64-bit code compiled with no-asm on the same |
64 | | * machine. |
65 | | */ |
66 | | |
67 | | # undef mul |
68 | | # undef mul_add |
69 | | |
70 | | /*- |
71 | | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; |
72 | | * "g"(0) lets the compiler decide where it wants to |
73 | | * keep the value of zero; |
74 | | */ |
75 | 3.02G | # define mul_add(r,a,word,carry) do { \ |
76 | 3.02G | register BN_ULONG high,low; \ |
77 | 3.02G | asm ("mulq %3" \ |
78 | 3.02G | : "=a"(low),"=d"(high) \ |
79 | 3.02G | : "a"(word),"m"(a) \ |
80 | 3.02G | : "cc"); \ |
81 | 3.02G | asm ("addq %2,%0; adcq %3,%1" \ |
82 | 3.02G | : "+r"(carry),"+d"(high)\ |
83 | 3.02G | : "a"(low),"g"(0) \ |
84 | 3.02G | : "cc"); \ |
85 | 3.02G | asm ("addq %2,%0; adcq %3,%1" \ |
86 | 3.02G | : "+m"(r),"+d"(high) \ |
87 | 3.02G | : "r"(carry),"g"(0) \ |
88 | 3.02G | : "cc"); \ |
89 | 3.02G | carry=high; \ |
90 | 3.02G | } while (0) |
91 | | |
92 | 9.78G | # define mul(r,a,word,carry) do { \ |
93 | 9.78G | register BN_ULONG high,low; \ |
94 | 9.78G | asm ("mulq %3" \ |
95 | 9.78G | : "=a"(low),"=d"(high) \ |
96 | 9.78G | : "a"(word),"g"(a) \ |
97 | 9.78G | : "cc"); \ |
98 | 9.78G | asm ("addq %2,%0; adcq %3,%1" \ |
99 | 9.78G | : "+r"(carry),"+d"(high)\ |
100 | 9.78G | : "a"(low),"g"(0) \ |
101 | 9.78G | : "cc"); \ |
102 | 9.78G | (r)=carry, carry=high; \ |
103 | 9.78G | } while (0) |
104 | | # undef sqr |
105 | | # define sqr(r0,r1,a) \ |
106 | 5.12M | asm ("mulq %2" \ |
107 | 5.12M | : "=a"(r0),"=d"(r1) \ |
108 | 5.12M | : "a"(a) \ |
109 | 5.12M | : "cc"); |
110 | | |
111 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
112 | | BN_ULONG w) |
113 | 24.0M | { |
114 | 24.0M | BN_ULONG c1 = 0; |
115 | | |
116 | 24.0M | if (num <= 0) |
117 | 0 | return c1; |
118 | | |
119 | 775M | while (num & ~3) { |
120 | 751M | mul_add(rp[0], ap[0], w, c1); |
121 | 751M | mul_add(rp[1], ap[1], w, c1); |
122 | 751M | mul_add(rp[2], ap[2], w, c1); |
123 | 751M | mul_add(rp[3], ap[3], w, c1); |
124 | 751M | ap += 4; |
125 | 751M | rp += 4; |
126 | 751M | num -= 4; |
127 | 751M | } |
128 | 24.0M | if (num) { |
129 | 8.36M | mul_add(rp[0], ap[0], w, c1); |
130 | 8.36M | if (--num == 0) |
131 | 4.39M | return c1; |
132 | 3.97M | mul_add(rp[1], ap[1], w, c1); |
133 | 3.97M | if (--num == 0) |
134 | 1.95M | return c1; |
135 | 2.02M | mul_add(rp[2], ap[2], w, c1); |
136 | 2.02M | return c1; |
137 | 3.97M | } |
138 | | |
139 | 15.6M | return c1; |
140 | 24.0M | } |
141 | | |
142 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
143 | 225M | { |
144 | 225M | BN_ULONG c1 = 0; |
145 | | |
146 | 225M | if (num <= 0) |
147 | 0 | return c1; |
148 | | |
149 | 2.66G | while (num & ~3) { |
150 | 2.43G | mul(rp[0], ap[0], w, c1); |
151 | 2.43G | mul(rp[1], ap[1], w, c1); |
152 | 2.43G | mul(rp[2], ap[2], w, c1); |
153 | 2.43G | mul(rp[3], ap[3], w, c1); |
154 | 2.43G | ap += 4; |
155 | 2.43G | rp += 4; |
156 | 2.43G | num -= 4; |
157 | 2.43G | } |
158 | 225M | if (num) { |
159 | 23.0M | mul(rp[0], ap[0], w, c1); |
160 | 23.0M | if (--num == 0) |
161 | 11.4M | return c1; |
162 | 11.5M | mul(rp[1], ap[1], w, c1); |
163 | 11.5M | if (--num == 0) |
164 | 6.11M | return c1; |
165 | 5.48M | mul(rp[2], ap[2], w, c1); |
166 | 5.48M | } |
167 | 207M | return c1; |
168 | 225M | } |
169 | | |
170 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
171 | 1.17M | { |
172 | 1.17M | if (n <= 0) |
173 | 0 | return; |
174 | | |
175 | 2.11M | while (n & ~3) { |
176 | 932k | sqr(r[0], r[1], a[0]); |
177 | 932k | sqr(r[2], r[3], a[1]); |
178 | 932k | sqr(r[4], r[5], a[2]); |
179 | 932k | sqr(r[6], r[7], a[3]); |
180 | 932k | a += 4; |
181 | 932k | r += 8; |
182 | 932k | n -= 4; |
183 | 932k | } |
184 | 1.17M | if (n) { |
185 | 1.16M | sqr(r[0], r[1], a[0]); |
186 | 1.16M | if (--n == 0) |
187 | 1.01M | return; |
188 | 146k | sqr(r[2], r[3], a[1]); |
189 | 146k | if (--n == 0) |
190 | 66.8k | return; |
191 | 79.9k | sqr(r[4], r[5], a[2]); |
192 | 79.9k | } |
193 | 1.17M | } |
194 | | |
195 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
196 | 213M | { |
197 | 213M | BN_ULONG ret, waste; |
198 | | |
199 | 213M | asm("divq %4":"=a"(ret), "=d"(waste) |
200 | 213M | : "a"(l), "d"(h), "r"(d) |
201 | 213M | : "cc"); |
202 | | |
203 | 213M | return ret; |
204 | 213M | } |
205 | | |
206 | | BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
207 | | int n) |
208 | 281M | { |
209 | 281M | BN_ULONG ret; |
210 | 281M | size_t i = 0; |
211 | | |
212 | 281M | if (n <= 0) |
213 | 480k | return 0; |
214 | | |
215 | 281M | asm volatile (" subq %0,%0 \n" /* clear carry */ |
216 | 281M | " jmp 1f \n" |
217 | 281M | ".p2align 4 \n" |
218 | 281M | "1: movq (%4,%2,8),%0 \n" |
219 | 281M | " adcq (%5,%2,8),%0 \n" |
220 | 281M | " movq %0,(%3,%2,8) \n" |
221 | 281M | " lea 1(%2),%2 \n" |
222 | 281M | " dec %1 \n" |
223 | 281M | " jnz 1b \n" |
224 | 281M | " sbbq %0,%0 \n" |
225 | 281M | :"=&r" (ret), "+c"(n), "+r"(i) |
226 | 281M | :"r"(rp), "r"(ap), "r"(bp) |
227 | 281M | :"cc", "memory"); |
228 | | |
229 | 281M | return ret & 1; |
230 | 281M | } |
231 | | |
232 | | # ifndef SIMICS |
233 | | BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
234 | | int n) |
235 | 283M | { |
236 | 283M | BN_ULONG ret; |
237 | 283M | size_t i = 0; |
238 | | |
239 | 283M | if (n <= 0) |
240 | 436k | return 0; |
241 | | |
242 | 282M | asm volatile (" subq %0,%0 \n" /* clear borrow */ |
243 | 282M | " jmp 1f \n" |
244 | 282M | ".p2align 4 \n" |
245 | 282M | "1: movq (%4,%2,8),%0 \n" |
246 | 282M | " sbbq (%5,%2,8),%0 \n" |
247 | 282M | " movq %0,(%3,%2,8) \n" |
248 | 282M | " lea 1(%2),%2 \n" |
249 | 282M | " dec %1 \n" |
250 | 282M | " jnz 1b \n" |
251 | 282M | " sbbq %0,%0 \n" |
252 | 282M | :"=&r" (ret), "+c"(n), "+r"(i) |
253 | 282M | :"r"(rp), "r"(ap), "r"(bp) |
254 | 282M | :"cc", "memory"); |
255 | | |
256 | 282M | return ret & 1; |
257 | 283M | } |
258 | | # else |
259 | | /* Simics 1.4<7 has buggy sbbq:-( */ |
260 | | # define BN_MASK2 0xffffffffffffffffL |
261 | | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
262 | | { |
263 | | BN_ULONG t1, t2; |
264 | | int c = 0; |
265 | | |
266 | | if (n <= 0) |
267 | | return (BN_ULONG)0; |
268 | | |
269 | | for (;;) { |
270 | | t1 = a[0]; |
271 | | t2 = b[0]; |
272 | | r[0] = (t1 - t2 - c) & BN_MASK2; |
273 | | if (t1 != t2) |
274 | | c = (t1 < t2); |
275 | | if (--n <= 0) |
276 | | break; |
277 | | |
278 | | t1 = a[1]; |
279 | | t2 = b[1]; |
280 | | r[1] = (t1 - t2 - c) & BN_MASK2; |
281 | | if (t1 != t2) |
282 | | c = (t1 < t2); |
283 | | if (--n <= 0) |
284 | | break; |
285 | | |
286 | | t1 = a[2]; |
287 | | t2 = b[2]; |
288 | | r[2] = (t1 - t2 - c) & BN_MASK2; |
289 | | if (t1 != t2) |
290 | | c = (t1 < t2); |
291 | | if (--n <= 0) |
292 | | break; |
293 | | |
294 | | t1 = a[3]; |
295 | | t2 = b[3]; |
296 | | r[3] = (t1 - t2 - c) & BN_MASK2; |
297 | | if (t1 != t2) |
298 | | c = (t1 < t2); |
299 | | if (--n <= 0) |
300 | | break; |
301 | | |
302 | | a += 4; |
303 | | b += 4; |
304 | | r += 4; |
305 | | } |
306 | | return c; |
307 | | } |
308 | | # endif |
309 | | |
310 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
311 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
312 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
313 | | /* |
314 | | * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
315 | | * c=(c2,c1,c0) |
316 | | */ |
317 | | |
318 | | /* |
319 | | * Keep in mind that carrying into the high part of a multiplication |
320 | | * result cannot overflow, because the high part cannot be all-ones. |
321 | | */ |
322 | | # if 0 |
323 | | /* original macros are kept for reference purposes */ |
324 | | # define mul_add_c(a,b,c0,c1,c2) do { \ |
325 | | BN_ULONG ta = (a), tb = (b); \ |
326 | | BN_ULONG lo, hi; \ |
327 | | BN_UMULT_LOHI(lo,hi,ta,tb); \ |
328 | | c0 += lo; hi += (c0<lo)?1:0; \ |
329 | | c1 += hi; c2 += (c1<hi)?1:0; \ |
330 | | } while(0) |
331 | | |
332 | | # define mul_add_c2(a,b,c0,c1,c2) do { \ |
333 | | BN_ULONG ta = (a), tb = (b); \ |
334 | | BN_ULONG lo, hi, tt; \ |
335 | | BN_UMULT_LOHI(lo,hi,ta,tb); \ |
336 | | c0 += lo; tt = hi+((c0<lo)?1:0); \ |
337 | | c1 += tt; c2 += (c1<tt)?1:0; \ |
338 | | c0 += lo; hi += (c0<lo)?1:0; \ |
339 | | c1 += hi; c2 += (c1<hi)?1:0; \ |
340 | | } while(0) |
341 | | |
342 | | # define sqr_add_c(a,i,c0,c1,c2) do { \ |
343 | | BN_ULONG ta = (a)[i]; \ |
344 | | BN_ULONG lo, hi; \ |
345 | | BN_UMULT_LOHI(lo,hi,ta,ta); \ |
346 | | c0 += lo; hi += (c0<lo)?1:0; \ |
347 | | c1 += hi; c2 += (c1<hi)?1:0; \ |
348 | | } while(0) |
349 | | # else |
350 | 707M | # define mul_add_c(a,b,c0,c1,c2) do { \ |
351 | 707M | BN_ULONG t1,t2; \ |
352 | 707M | asm ("mulq %3" \ |
353 | 707M | : "=a"(t1),"=d"(t2) \ |
354 | 707M | : "a"(a),"m"(b) \ |
355 | 707M | : "cc"); \ |
356 | 707M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
357 | 707M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
358 | 707M | : "r"(t1),"r"(t2),"g"(0) \ |
359 | 707M | : "cc"); \ |
360 | 707M | } while (0) |
361 | | |
362 | 182M | # define sqr_add_c(a,i,c0,c1,c2) do { \ |
363 | 182M | BN_ULONG t1,t2; \ |
364 | 182M | asm ("mulq %2" \ |
365 | 182M | : "=a"(t1),"=d"(t2) \ |
366 | 182M | : "a"(a[i]) \ |
367 | 182M | : "cc"); \ |
368 | 182M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
369 | 182M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
370 | 182M | : "r"(t1),"r"(t2),"g"(0) \ |
371 | 182M | : "cc"); \ |
372 | 182M | } while (0) |
373 | | |
374 | 280M | # define mul_add_c2(a,b,c0,c1,c2) do { \ |
375 | 280M | BN_ULONG t1,t2; \ |
376 | 280M | asm ("mulq %3" \ |
377 | 280M | : "=a"(t1),"=d"(t2) \ |
378 | 280M | : "a"(a),"m"(b) \ |
379 | 280M | : "cc"); \ |
380 | 280M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
381 | 280M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
382 | 280M | : "r"(t1),"r"(t2),"g"(0) \ |
383 | 280M | : "cc"); \ |
384 | 280M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
385 | 280M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
386 | 280M | : "r"(t1),"r"(t2),"g"(0) \ |
387 | 280M | : "cc"); \ |
388 | 280M | } while (0) |
389 | | # endif |
390 | | |
391 | | # define sqr_add_c2(a,i,j,c0,c1,c2) \ |
392 | 280M | mul_add_c2((a)[i],(a)[j],c0,c1,c2) |
393 | | |
394 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
395 | 11.0M | { |
396 | 11.0M | BN_ULONG c1, c2, c3; |
397 | | |
398 | 11.0M | c1 = 0; |
399 | 11.0M | c2 = 0; |
400 | 11.0M | c3 = 0; |
401 | 11.0M | mul_add_c(a[0], b[0], c1, c2, c3); |
402 | 11.0M | r[0] = c1; |
403 | 11.0M | c1 = 0; |
404 | 11.0M | mul_add_c(a[0], b[1], c2, c3, c1); |
405 | 11.0M | mul_add_c(a[1], b[0], c2, c3, c1); |
406 | 11.0M | r[1] = c2; |
407 | 11.0M | c2 = 0; |
408 | 11.0M | mul_add_c(a[2], b[0], c3, c1, c2); |
409 | 11.0M | mul_add_c(a[1], b[1], c3, c1, c2); |
410 | 11.0M | mul_add_c(a[0], b[2], c3, c1, c2); |
411 | 11.0M | r[2] = c3; |
412 | 11.0M | c3 = 0; |
413 | 11.0M | mul_add_c(a[0], b[3], c1, c2, c3); |
414 | 11.0M | mul_add_c(a[1], b[2], c1, c2, c3); |
415 | 11.0M | mul_add_c(a[2], b[1], c1, c2, c3); |
416 | 11.0M | mul_add_c(a[3], b[0], c1, c2, c3); |
417 | 11.0M | r[3] = c1; |
418 | 11.0M | c1 = 0; |
419 | 11.0M | mul_add_c(a[4], b[0], c2, c3, c1); |
420 | 11.0M | mul_add_c(a[3], b[1], c2, c3, c1); |
421 | 11.0M | mul_add_c(a[2], b[2], c2, c3, c1); |
422 | 11.0M | mul_add_c(a[1], b[3], c2, c3, c1); |
423 | 11.0M | mul_add_c(a[0], b[4], c2, c3, c1); |
424 | 11.0M | r[4] = c2; |
425 | 11.0M | c2 = 0; |
426 | 11.0M | mul_add_c(a[0], b[5], c3, c1, c2); |
427 | 11.0M | mul_add_c(a[1], b[4], c3, c1, c2); |
428 | 11.0M | mul_add_c(a[2], b[3], c3, c1, c2); |
429 | 11.0M | mul_add_c(a[3], b[2], c3, c1, c2); |
430 | 11.0M | mul_add_c(a[4], b[1], c3, c1, c2); |
431 | 11.0M | mul_add_c(a[5], b[0], c3, c1, c2); |
432 | 11.0M | r[5] = c3; |
433 | 11.0M | c3 = 0; |
434 | 11.0M | mul_add_c(a[6], b[0], c1, c2, c3); |
435 | 11.0M | mul_add_c(a[5], b[1], c1, c2, c3); |
436 | 11.0M | mul_add_c(a[4], b[2], c1, c2, c3); |
437 | 11.0M | mul_add_c(a[3], b[3], c1, c2, c3); |
438 | 11.0M | mul_add_c(a[2], b[4], c1, c2, c3); |
439 | 11.0M | mul_add_c(a[1], b[5], c1, c2, c3); |
440 | 11.0M | mul_add_c(a[0], b[6], c1, c2, c3); |
441 | 11.0M | r[6] = c1; |
442 | 11.0M | c1 = 0; |
443 | 11.0M | mul_add_c(a[0], b[7], c2, c3, c1); |
444 | 11.0M | mul_add_c(a[1], b[6], c2, c3, c1); |
445 | 11.0M | mul_add_c(a[2], b[5], c2, c3, c1); |
446 | 11.0M | mul_add_c(a[3], b[4], c2, c3, c1); |
447 | 11.0M | mul_add_c(a[4], b[3], c2, c3, c1); |
448 | 11.0M | mul_add_c(a[5], b[2], c2, c3, c1); |
449 | 11.0M | mul_add_c(a[6], b[1], c2, c3, c1); |
450 | 11.0M | mul_add_c(a[7], b[0], c2, c3, c1); |
451 | 11.0M | r[7] = c2; |
452 | 11.0M | c2 = 0; |
453 | 11.0M | mul_add_c(a[7], b[1], c3, c1, c2); |
454 | 11.0M | mul_add_c(a[6], b[2], c3, c1, c2); |
455 | 11.0M | mul_add_c(a[5], b[3], c3, c1, c2); |
456 | 11.0M | mul_add_c(a[4], b[4], c3, c1, c2); |
457 | 11.0M | mul_add_c(a[3], b[5], c3, c1, c2); |
458 | 11.0M | mul_add_c(a[2], b[6], c3, c1, c2); |
459 | 11.0M | mul_add_c(a[1], b[7], c3, c1, c2); |
460 | 11.0M | r[8] = c3; |
461 | 11.0M | c3 = 0; |
462 | 11.0M | mul_add_c(a[2], b[7], c1, c2, c3); |
463 | 11.0M | mul_add_c(a[3], b[6], c1, c2, c3); |
464 | 11.0M | mul_add_c(a[4], b[5], c1, c2, c3); |
465 | 11.0M | mul_add_c(a[5], b[4], c1, c2, c3); |
466 | 11.0M | mul_add_c(a[6], b[3], c1, c2, c3); |
467 | 11.0M | mul_add_c(a[7], b[2], c1, c2, c3); |
468 | 11.0M | r[9] = c1; |
469 | 11.0M | c1 = 0; |
470 | 11.0M | mul_add_c(a[7], b[3], c2, c3, c1); |
471 | 11.0M | mul_add_c(a[6], b[4], c2, c3, c1); |
472 | 11.0M | mul_add_c(a[5], b[5], c2, c3, c1); |
473 | 11.0M | mul_add_c(a[4], b[6], c2, c3, c1); |
474 | 11.0M | mul_add_c(a[3], b[7], c2, c3, c1); |
475 | 11.0M | r[10] = c2; |
476 | 11.0M | c2 = 0; |
477 | 11.0M | mul_add_c(a[4], b[7], c3, c1, c2); |
478 | 11.0M | mul_add_c(a[5], b[6], c3, c1, c2); |
479 | 11.0M | mul_add_c(a[6], b[5], c3, c1, c2); |
480 | 11.0M | mul_add_c(a[7], b[4], c3, c1, c2); |
481 | 11.0M | r[11] = c3; |
482 | 11.0M | c3 = 0; |
483 | 11.0M | mul_add_c(a[7], b[5], c1, c2, c3); |
484 | 11.0M | mul_add_c(a[6], b[6], c1, c2, c3); |
485 | 11.0M | mul_add_c(a[5], b[7], c1, c2, c3); |
486 | 11.0M | r[12] = c1; |
487 | 11.0M | c1 = 0; |
488 | 11.0M | mul_add_c(a[6], b[7], c2, c3, c1); |
489 | 11.0M | mul_add_c(a[7], b[6], c2, c3, c1); |
490 | 11.0M | r[13] = c2; |
491 | 11.0M | c2 = 0; |
492 | 11.0M | mul_add_c(a[7], b[7], c3, c1, c2); |
493 | 11.0M | r[14] = c3; |
494 | 11.0M | r[15] = c1; |
495 | 11.0M | } |
496 | | |
497 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
498 | 0 | { |
499 | 0 | BN_ULONG c1, c2, c3; |
500 | |
501 | 0 | c1 = 0; |
502 | 0 | c2 = 0; |
503 | 0 | c3 = 0; |
504 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
505 | 0 | r[0] = c1; |
506 | 0 | c1 = 0; |
507 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
508 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
509 | 0 | r[1] = c2; |
510 | 0 | c2 = 0; |
511 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
512 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
513 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
514 | 0 | r[2] = c3; |
515 | 0 | c3 = 0; |
516 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
517 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
518 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
519 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
520 | 0 | r[3] = c1; |
521 | 0 | c1 = 0; |
522 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
523 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
524 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
525 | 0 | r[4] = c2; |
526 | 0 | c2 = 0; |
527 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
528 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
529 | 0 | r[5] = c3; |
530 | 0 | c3 = 0; |
531 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
532 | 0 | r[6] = c1; |
533 | 0 | r[7] = c2; |
534 | 0 | } |
535 | | |
536 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
537 | 452k | { |
538 | 452k | BN_ULONG c1, c2, c3; |
539 | | |
540 | 452k | c1 = 0; |
541 | 452k | c2 = 0; |
542 | 452k | c3 = 0; |
543 | 452k | sqr_add_c(a, 0, c1, c2, c3); |
544 | 452k | r[0] = c1; |
545 | 452k | c1 = 0; |
546 | 452k | sqr_add_c2(a, 1, 0, c2, c3, c1); |
547 | 452k | r[1] = c2; |
548 | 452k | c2 = 0; |
549 | 452k | sqr_add_c(a, 1, c3, c1, c2); |
550 | 452k | sqr_add_c2(a, 2, 0, c3, c1, c2); |
551 | 452k | r[2] = c3; |
552 | 452k | c3 = 0; |
553 | 452k | sqr_add_c2(a, 3, 0, c1, c2, c3); |
554 | 452k | sqr_add_c2(a, 2, 1, c1, c2, c3); |
555 | 452k | r[3] = c1; |
556 | 452k | c1 = 0; |
557 | 452k | sqr_add_c(a, 2, c2, c3, c1); |
558 | 452k | sqr_add_c2(a, 3, 1, c2, c3, c1); |
559 | 452k | sqr_add_c2(a, 4, 0, c2, c3, c1); |
560 | 452k | r[4] = c2; |
561 | 452k | c2 = 0; |
562 | 452k | sqr_add_c2(a, 5, 0, c3, c1, c2); |
563 | 452k | sqr_add_c2(a, 4, 1, c3, c1, c2); |
564 | 452k | sqr_add_c2(a, 3, 2, c3, c1, c2); |
565 | 452k | r[5] = c3; |
566 | 452k | c3 = 0; |
567 | 452k | sqr_add_c(a, 3, c1, c2, c3); |
568 | 452k | sqr_add_c2(a, 4, 2, c1, c2, c3); |
569 | 452k | sqr_add_c2(a, 5, 1, c1, c2, c3); |
570 | 452k | sqr_add_c2(a, 6, 0, c1, c2, c3); |
571 | 452k | r[6] = c1; |
572 | 452k | c1 = 0; |
573 | 452k | sqr_add_c2(a, 7, 0, c2, c3, c1); |
574 | 452k | sqr_add_c2(a, 6, 1, c2, c3, c1); |
575 | 452k | sqr_add_c2(a, 5, 2, c2, c3, c1); |
576 | 452k | sqr_add_c2(a, 4, 3, c2, c3, c1); |
577 | 452k | r[7] = c2; |
578 | 452k | c2 = 0; |
579 | 452k | sqr_add_c(a, 4, c3, c1, c2); |
580 | 452k | sqr_add_c2(a, 5, 3, c3, c1, c2); |
581 | 452k | sqr_add_c2(a, 6, 2, c3, c1, c2); |
582 | 452k | sqr_add_c2(a, 7, 1, c3, c1, c2); |
583 | 452k | r[8] = c3; |
584 | 452k | c3 = 0; |
585 | 452k | sqr_add_c2(a, 7, 2, c1, c2, c3); |
586 | 452k | sqr_add_c2(a, 6, 3, c1, c2, c3); |
587 | 452k | sqr_add_c2(a, 5, 4, c1, c2, c3); |
588 | 452k | r[9] = c1; |
589 | 452k | c1 = 0; |
590 | 452k | sqr_add_c(a, 5, c2, c3, c1); |
591 | 452k | sqr_add_c2(a, 6, 4, c2, c3, c1); |
592 | 452k | sqr_add_c2(a, 7, 3, c2, c3, c1); |
593 | 452k | r[10] = c2; |
594 | 452k | c2 = 0; |
595 | 452k | sqr_add_c2(a, 7, 4, c3, c1, c2); |
596 | 452k | sqr_add_c2(a, 6, 5, c3, c1, c2); |
597 | 452k | r[11] = c3; |
598 | 452k | c3 = 0; |
599 | 452k | sqr_add_c(a, 6, c1, c2, c3); |
600 | 452k | sqr_add_c2(a, 7, 5, c1, c2, c3); |
601 | 452k | r[12] = c1; |
602 | 452k | c1 = 0; |
603 | 452k | sqr_add_c2(a, 7, 6, c2, c3, c1); |
604 | 452k | r[13] = c2; |
605 | 452k | c2 = 0; |
606 | 452k | sqr_add_c(a, 7, c3, c1, c2); |
607 | 452k | r[14] = c3; |
608 | 452k | r[15] = c1; |
609 | 452k | } |
610 | | |
611 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
612 | 44.6M | { |
613 | 44.6M | BN_ULONG c1, c2, c3; |
614 | | |
615 | 44.6M | c1 = 0; |
616 | 44.6M | c2 = 0; |
617 | 44.6M | c3 = 0; |
618 | 44.6M | sqr_add_c(a, 0, c1, c2, c3); |
619 | 44.6M | r[0] = c1; |
620 | 44.6M | c1 = 0; |
621 | 44.6M | sqr_add_c2(a, 1, 0, c2, c3, c1); |
622 | 44.6M | r[1] = c2; |
623 | 44.6M | c2 = 0; |
624 | 44.6M | sqr_add_c(a, 1, c3, c1, c2); |
625 | 44.6M | sqr_add_c2(a, 2, 0, c3, c1, c2); |
626 | 44.6M | r[2] = c3; |
627 | 44.6M | c3 = 0; |
628 | 44.6M | sqr_add_c2(a, 3, 0, c1, c2, c3); |
629 | 44.6M | sqr_add_c2(a, 2, 1, c1, c2, c3); |
630 | 44.6M | r[3] = c1; |
631 | 44.6M | c1 = 0; |
632 | 44.6M | sqr_add_c(a, 2, c2, c3, c1); |
633 | 44.6M | sqr_add_c2(a, 3, 1, c2, c3, c1); |
634 | 44.6M | r[4] = c2; |
635 | 44.6M | c2 = 0; |
636 | 44.6M | sqr_add_c2(a, 3, 2, c3, c1, c2); |
637 | 44.6M | r[5] = c3; |
638 | 44.6M | c3 = 0; |
639 | 44.6M | sqr_add_c(a, 3, c1, c2, c3); |
640 | 44.6M | r[6] = c1; |
641 | 44.6M | r[7] = c2; |
642 | 44.6M | } |
643 | | #endif |