/src/openssl31/crypto/bn/asm/x86_64-gcc.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include "../bn_local.h" |
11 | | #if !(defined(__GNUC__) && __GNUC__>=2) |
12 | | # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ |
13 | | #else |
14 | | /*- |
15 | | * x86_64 BIGNUM accelerator version 0.1, December 2002. |
16 | | * |
17 | | * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL |
18 | | * project. |
19 | | * |
20 | | * Rights for redistribution and usage in source and binary forms are |
21 | | * granted according to the License. Warranty of any kind is disclaimed. |
22 | | * |
23 | | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real |
24 | | * versions, like 1.0... |
25 | | * A. Well, that's because this code is basically a quick-n-dirty |
26 | | * proof-of-concept hack. As you can see it's implemented with |
27 | | * inline assembler, which means that you're bound to GCC and that |
28 | | * there might be enough room for further improvement. |
29 | | * |
30 | | * Q. Why inline assembler? |
31 | | * A. x86_64 features own ABI which I'm not familiar with. This is |
32 | | * why I decided to let the compiler take care of subroutine |
33 | | * prologue/epilogue as well as register allocation. For reference. |
34 | | * Win64 implements different ABI for AMD64, different from Linux. |
35 | | * |
36 | | * Q. How much faster does it get? |
37 | | * A. 'apps/openssl speed rsa dsa' output with no-asm: |
38 | | * |
39 | | * sign verify sign/s verify/s |
40 | | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 |
41 | | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 |
42 | | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 |
43 | | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 |
44 | | * sign verify sign/s verify/s |
45 | | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 |
46 | | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 |
47 | | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 |
48 | | * |
49 | | * 'apps/openssl speed rsa dsa' output with this module: |
50 | | * |
51 | | * sign verify sign/s verify/s |
52 | | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 |
53 | | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 |
54 | | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 |
55 | | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 |
56 | | * sign verify sign/s verify/s |
57 | | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 |
58 | | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 |
59 | | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 |
60 | | * |
61 | | * For the reference. IA-32 assembler implementation performs |
62 | | * very much like 64-bit code compiled with no-asm on the same |
63 | | * machine. |
64 | | */ |
65 | | |
66 | | # undef mul |
67 | | # undef mul_add |
68 | | |
69 | | /*- |
70 | | * "m"(a), "+m"(r) is the way to favor DirectPath µ-code; |
71 | | * "g"(0) let the compiler to decide where does it |
72 | | * want to keep the value of zero; |
73 | | */ |
74 | 89.1G | # define mul_add(r,a,word,carry) do { \ |
75 | 89.1G | register BN_ULONG high,low; \ |
76 | 89.1G | asm ("mulq %3" \ |
77 | 89.1G | : "=a"(low),"=d"(high) \ |
78 | 89.1G | : "a"(word),"m"(a) \ |
79 | 89.1G | : "cc"); \ |
80 | 89.1G | asm ("addq %2,%0; adcq %3,%1" \ |
81 | 89.1G | : "+r"(carry),"+d"(high)\ |
82 | 89.1G | : "a"(low),"g"(0) \ |
83 | 89.1G | : "cc"); \ |
84 | 89.1G | asm ("addq %2,%0; adcq %3,%1" \ |
85 | 89.1G | : "+m"(r),"+d"(high) \ |
86 | 89.1G | : "r"(carry),"g"(0) \ |
87 | 89.1G | : "cc"); \ |
88 | 89.1G | carry=high; \ |
89 | 89.1G | } while (0) |
90 | | |
91 | 17.2G | # define mul(r,a,word,carry) do { \ |
92 | 17.2G | register BN_ULONG high,low; \ |
93 | 17.2G | asm ("mulq %3" \ |
94 | 17.2G | : "=a"(low),"=d"(high) \ |
95 | 17.2G | : "a"(word),"g"(a) \ |
96 | 17.2G | : "cc"); \ |
97 | 17.2G | asm ("addq %2,%0; adcq %3,%1" \ |
98 | 17.2G | : "+r"(carry),"+d"(high)\ |
99 | 17.2G | : "a"(low),"g"(0) \ |
100 | 17.2G | : "cc"); \ |
101 | 17.2G | (r)=carry, carry=high; \ |
102 | 17.2G | } while (0) |
103 | | # undef sqr |
104 | | # define sqr(r0,r1,a) \ |
105 | 33.0M | asm ("mulq %2" \ |
106 | 33.0M | : "=a"(r0),"=d"(r1) \ |
107 | 33.0M | : "a"(a) \ |
108 | 33.0M | : "cc"); |
109 | | |
110 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
111 | | BN_ULONG w) |
112 | 92.5M | { |
113 | 92.5M | BN_ULONG c1 = 0; |
114 | | |
115 | 92.5M | if (num <= 0) |
116 | 0 | return c1; |
117 | | |
118 | 22.3G | while (num & ~3) { |
119 | 22.2G | mul_add(rp[0], ap[0], w, c1); |
120 | 22.2G | mul_add(rp[1], ap[1], w, c1); |
121 | 22.2G | mul_add(rp[2], ap[2], w, c1); |
122 | 22.2G | mul_add(rp[3], ap[3], w, c1); |
123 | 22.2G | ap += 4; |
124 | 22.2G | rp += 4; |
125 | 22.2G | num -= 4; |
126 | 22.2G | } |
127 | 92.5M | if (num) { |
128 | 57.2M | mul_add(rp[0], ap[0], w, c1); |
129 | 57.2M | if (--num == 0) |
130 | 19.9M | return c1; |
131 | 37.3M | mul_add(rp[1], ap[1], w, c1); |
132 | 37.3M | if (--num == 0) |
133 | 15.2M | return c1; |
134 | 22.0M | mul_add(rp[2], ap[2], w, c1); |
135 | 22.0M | return c1; |
136 | 37.3M | } |
137 | | |
138 | 35.2M | return c1; |
139 | 92.5M | } |
140 | | |
141 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
142 | 327M | { |
143 | 327M | BN_ULONG c1 = 0; |
144 | | |
145 | 327M | if (num <= 0) |
146 | 0 | return c1; |
147 | | |
148 | 4.63G | while (num & ~3) { |
149 | 4.30G | mul(rp[0], ap[0], w, c1); |
150 | 4.30G | mul(rp[1], ap[1], w, c1); |
151 | 4.30G | mul(rp[2], ap[2], w, c1); |
152 | 4.30G | mul(rp[3], ap[3], w, c1); |
153 | 4.30G | ap += 4; |
154 | 4.30G | rp += 4; |
155 | 4.30G | num -= 4; |
156 | 4.30G | } |
157 | 327M | if (num) { |
158 | 38.8M | mul(rp[0], ap[0], w, c1); |
159 | 38.8M | if (--num == 0) |
160 | 17.9M | return c1; |
161 | 20.9M | mul(rp[1], ap[1], w, c1); |
162 | 20.9M | if (--num == 0) |
163 | 11.3M | return c1; |
164 | 9.54M | mul(rp[2], ap[2], w, c1); |
165 | 9.54M | } |
166 | 298M | return c1; |
167 | 327M | } |
168 | | |
169 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
170 | 2.67M | { |
171 | 2.67M | if (n <= 0) |
172 | 0 | return; |
173 | | |
174 | 10.1M | while (n & ~3) { |
175 | 7.45M | sqr(r[0], r[1], a[0]); |
176 | 7.45M | sqr(r[2], r[3], a[1]); |
177 | 7.45M | sqr(r[4], r[5], a[2]); |
178 | 7.45M | sqr(r[6], r[7], a[3]); |
179 | 7.45M | a += 4; |
180 | 7.45M | r += 8; |
181 | 7.45M | n -= 4; |
182 | 7.45M | } |
183 | 2.67M | if (n) { |
184 | 2.62M | sqr(r[0], r[1], a[0]); |
185 | 2.62M | if (--n == 0) |
186 | 2.21M | return; |
187 | 404k | sqr(r[2], r[3], a[1]); |
188 | 404k | if (--n == 0) |
189 | 212k | return; |
190 | 191k | sqr(r[4], r[5], a[2]); |
191 | 191k | } |
192 | 2.67M | } |
193 | | |
194 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
195 | 306M | { |
196 | 306M | BN_ULONG ret, waste; |
197 | | |
198 | 306M | asm("divq %4":"=a"(ret), "=d"(waste) |
199 | 306M | : "a"(l), "d"(h), "r"(d) |
200 | 306M | : "cc"); |
201 | | |
202 | 306M | return ret; |
203 | 306M | } |
204 | | |
205 | | BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
206 | | int n) |
207 | 474M | { |
208 | 474M | BN_ULONG ret; |
209 | 474M | size_t i = 0; |
210 | | |
211 | 474M | if (n <= 0) |
212 | 3.86M | return 0; |
213 | | |
214 | 471M | asm volatile (" subq %0,%0 \n" /* clear carry */ |
215 | 471M | " jmp 1f \n" |
216 | 471M | ".p2align 4 \n" |
217 | 471M | "1: movq (%4,%2,8),%0 \n" |
218 | 471M | " adcq (%5,%2,8),%0 \n" |
219 | 471M | " movq %0,(%3,%2,8) \n" |
220 | 471M | " lea 1(%2),%2 \n" |
221 | 471M | " dec %1 \n" |
222 | 471M | " jnz 1b \n" |
223 | 471M | " sbbq %0,%0 \n" |
224 | 471M | :"=&r" (ret), "+c"(n), "+r"(i) |
225 | 471M | :"r"(rp), "r"(ap), "r"(bp) |
226 | 471M | :"cc", "memory"); |
227 | | |
228 | 471M | return ret & 1; |
229 | 474M | } |
230 | | |
231 | | # ifndef SIMICS |
232 | | BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
233 | | int n) |
234 | 476M | { |
235 | 476M | BN_ULONG ret; |
236 | 476M | size_t i = 0; |
237 | | |
238 | 476M | if (n <= 0) |
239 | 3.26M | return 0; |
240 | | |
241 | 473M | asm volatile (" subq %0,%0 \n" /* clear borrow */ |
242 | 473M | " jmp 1f \n" |
243 | 473M | ".p2align 4 \n" |
244 | 473M | "1: movq (%4,%2,8),%0 \n" |
245 | 473M | " sbbq (%5,%2,8),%0 \n" |
246 | 473M | " movq %0,(%3,%2,8) \n" |
247 | 473M | " lea 1(%2),%2 \n" |
248 | 473M | " dec %1 \n" |
249 | 473M | " jnz 1b \n" |
250 | 473M | " sbbq %0,%0 \n" |
251 | 473M | :"=&r" (ret), "+c"(n), "+r"(i) |
252 | 473M | :"r"(rp), "r"(ap), "r"(bp) |
253 | 473M | :"cc", "memory"); |
254 | | |
255 | 473M | return ret & 1; |
256 | 476M | } |
257 | | # else |
258 | | /* Simics 1.4<7 has buggy sbbq:-( */ |
259 | | # define BN_MASK2 0xffffffffffffffffL |
260 | | BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) |
261 | | { |
262 | | BN_ULONG t1, t2; |
263 | | int c = 0; |
264 | | |
265 | | if (n <= 0) |
266 | | return (BN_ULONG)0; |
267 | | |
268 | | for (;;) { |
269 | | t1 = a[0]; |
270 | | t2 = b[0]; |
271 | | r[0] = (t1 - t2 - c) & BN_MASK2; |
272 | | if (t1 != t2) |
273 | | c = (t1 < t2); |
274 | | if (--n <= 0) |
275 | | break; |
276 | | |
277 | | t1 = a[1]; |
278 | | t2 = b[1]; |
279 | | r[1] = (t1 - t2 - c) & BN_MASK2; |
280 | | if (t1 != t2) |
281 | | c = (t1 < t2); |
282 | | if (--n <= 0) |
283 | | break; |
284 | | |
285 | | t1 = a[2]; |
286 | | t2 = b[2]; |
287 | | r[2] = (t1 - t2 - c) & BN_MASK2; |
288 | | if (t1 != t2) |
289 | | c = (t1 < t2); |
290 | | if (--n <= 0) |
291 | | break; |
292 | | |
293 | | t1 = a[3]; |
294 | | t2 = b[3]; |
295 | | r[3] = (t1 - t2 - c) & BN_MASK2; |
296 | | if (t1 != t2) |
297 | | c = (t1 < t2); |
298 | | if (--n <= 0) |
299 | | break; |
300 | | |
301 | | a += 4; |
302 | | b += 4; |
303 | | r += 4; |
304 | | } |
305 | | return c; |
306 | | } |
307 | | # endif |
308 | | |
309 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
310 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
311 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
312 | | /* |
313 | | * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
314 | | * c=(c2,c1,c0) |
315 | | */ |
316 | | |
317 | | /* |
318 | | * Keep in mind that carrying into high part of multiplication result |
319 | | * can not overflow, because it cannot be all-ones. |
320 | | */ |
321 | | # if 0 |
322 | | /* original macros are kept for reference purposes */ |
323 | | # define mul_add_c(a,b,c0,c1,c2) do { \ |
324 | | BN_ULONG ta = (a), tb = (b); \ |
325 | | BN_ULONG lo, hi; \ |
326 | | BN_UMULT_LOHI(lo,hi,ta,tb); \ |
327 | | c0 += lo; hi += (c0<lo)?1:0; \ |
328 | | c1 += hi; c2 += (c1<hi)?1:0; \ |
329 | | } while(0) |
330 | | |
331 | | # define mul_add_c2(a,b,c0,c1,c2) do { \ |
332 | | BN_ULONG ta = (a), tb = (b); \ |
333 | | BN_ULONG lo, hi, tt; \ |
334 | | BN_UMULT_LOHI(lo,hi,ta,tb); \ |
335 | | c0 += lo; tt = hi+((c0<lo)?1:0); \ |
336 | | c1 += tt; c2 += (c1<tt)?1:0; \ |
337 | | c0 += lo; hi += (c0<lo)?1:0; \ |
338 | | c1 += hi; c2 += (c1<hi)?1:0; \ |
339 | | } while(0) |
340 | | |
341 | | # define sqr_add_c(a,i,c0,c1,c2) do { \ |
342 | | BN_ULONG ta = (a)[i]; \ |
343 | | BN_ULONG lo, hi; \ |
344 | | BN_UMULT_LOHI(lo,hi,ta,ta); \ |
345 | | c0 += lo; hi += (c0<lo)?1:0; \ |
346 | | c1 += hi; c2 += (c1<hi)?1:0; \ |
347 | | } while(0) |
348 | | # else |
349 | 3.24G | # define mul_add_c(a,b,c0,c1,c2) do { \ |
350 | 3.24G | BN_ULONG t1,t2; \ |
351 | 3.24G | asm ("mulq %3" \ |
352 | 3.24G | : "=a"(t1),"=d"(t2) \ |
353 | 3.24G | : "a"(a),"m"(b) \ |
354 | 3.24G | : "cc"); \ |
355 | 3.24G | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
356 | 3.24G | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
357 | 3.24G | : "r"(t1),"r"(t2),"g"(0) \ |
358 | 3.24G | : "cc"); \ |
359 | 3.24G | } while (0) |
360 | | |
361 | 308M | # define sqr_add_c(a,i,c0,c1,c2) do { \ |
362 | 308M | BN_ULONG t1,t2; \ |
363 | 308M | asm ("mulq %2" \ |
364 | 308M | : "=a"(t1),"=d"(t2) \ |
365 | 308M | : "a"(a[i]) \ |
366 | 308M | : "cc"); \ |
367 | 308M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
368 | 308M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
369 | 308M | : "r"(t1),"r"(t2),"g"(0) \ |
370 | 308M | : "cc"); \ |
371 | 308M | } while (0) |
372 | | |
373 | 576M | # define mul_add_c2(a,b,c0,c1,c2) do { \ |
374 | 576M | BN_ULONG t1,t2; \ |
375 | 576M | asm ("mulq %3" \ |
376 | 576M | : "=a"(t1),"=d"(t2) \ |
377 | 576M | : "a"(a),"m"(b) \ |
378 | 576M | : "cc"); \ |
379 | 576M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
380 | 576M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
381 | 576M | : "r"(t1),"r"(t2),"g"(0) \ |
382 | 576M | : "cc"); \ |
383 | 576M | asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ |
384 | 576M | : "+r"(c0),"+r"(c1),"+r"(c2) \ |
385 | 576M | : "r"(t1),"r"(t2),"g"(0) \ |
386 | 576M | : "cc"); \ |
387 | 576M | } while (0) |
388 | | # endif |
389 | | |
390 | | # define sqr_add_c2(a,i,j,c0,c1,c2) \ |
391 | 576M | mul_add_c2((a)[i],(a)[j],c0,c1,c2) |
392 | | |
393 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
394 | 50.7M | { |
395 | 50.7M | BN_ULONG c1, c2, c3; |
396 | | |
397 | 50.7M | c1 = 0; |
398 | 50.7M | c2 = 0; |
399 | 50.7M | c3 = 0; |
400 | 50.7M | mul_add_c(a[0], b[0], c1, c2, c3); |
401 | 50.7M | r[0] = c1; |
402 | 50.7M | c1 = 0; |
403 | 50.7M | mul_add_c(a[0], b[1], c2, c3, c1); |
404 | 50.7M | mul_add_c(a[1], b[0], c2, c3, c1); |
405 | 50.7M | r[1] = c2; |
406 | 50.7M | c2 = 0; |
407 | 50.7M | mul_add_c(a[2], b[0], c3, c1, c2); |
408 | 50.7M | mul_add_c(a[1], b[1], c3, c1, c2); |
409 | 50.7M | mul_add_c(a[0], b[2], c3, c1, c2); |
410 | 50.7M | r[2] = c3; |
411 | 50.7M | c3 = 0; |
412 | 50.7M | mul_add_c(a[0], b[3], c1, c2, c3); |
413 | 50.7M | mul_add_c(a[1], b[2], c1, c2, c3); |
414 | 50.7M | mul_add_c(a[2], b[1], c1, c2, c3); |
415 | 50.7M | mul_add_c(a[3], b[0], c1, c2, c3); |
416 | 50.7M | r[3] = c1; |
417 | 50.7M | c1 = 0; |
418 | 50.7M | mul_add_c(a[4], b[0], c2, c3, c1); |
419 | 50.7M | mul_add_c(a[3], b[1], c2, c3, c1); |
420 | 50.7M | mul_add_c(a[2], b[2], c2, c3, c1); |
421 | 50.7M | mul_add_c(a[1], b[3], c2, c3, c1); |
422 | 50.7M | mul_add_c(a[0], b[4], c2, c3, c1); |
423 | 50.7M | r[4] = c2; |
424 | 50.7M | c2 = 0; |
425 | 50.7M | mul_add_c(a[0], b[5], c3, c1, c2); |
426 | 50.7M | mul_add_c(a[1], b[4], c3, c1, c2); |
427 | 50.7M | mul_add_c(a[2], b[3], c3, c1, c2); |
428 | 50.7M | mul_add_c(a[3], b[2], c3, c1, c2); |
429 | 50.7M | mul_add_c(a[4], b[1], c3, c1, c2); |
430 | 50.7M | mul_add_c(a[5], b[0], c3, c1, c2); |
431 | 50.7M | r[5] = c3; |
432 | 50.7M | c3 = 0; |
433 | 50.7M | mul_add_c(a[6], b[0], c1, c2, c3); |
434 | 50.7M | mul_add_c(a[5], b[1], c1, c2, c3); |
435 | 50.7M | mul_add_c(a[4], b[2], c1, c2, c3); |
436 | 50.7M | mul_add_c(a[3], b[3], c1, c2, c3); |
437 | 50.7M | mul_add_c(a[2], b[4], c1, c2, c3); |
438 | 50.7M | mul_add_c(a[1], b[5], c1, c2, c3); |
439 | 50.7M | mul_add_c(a[0], b[6], c1, c2, c3); |
440 | 50.7M | r[6] = c1; |
441 | 50.7M | c1 = 0; |
442 | 50.7M | mul_add_c(a[0], b[7], c2, c3, c1); |
443 | 50.7M | mul_add_c(a[1], b[6], c2, c3, c1); |
444 | 50.7M | mul_add_c(a[2], b[5], c2, c3, c1); |
445 | 50.7M | mul_add_c(a[3], b[4], c2, c3, c1); |
446 | 50.7M | mul_add_c(a[4], b[3], c2, c3, c1); |
447 | 50.7M | mul_add_c(a[5], b[2], c2, c3, c1); |
448 | 50.7M | mul_add_c(a[6], b[1], c2, c3, c1); |
449 | 50.7M | mul_add_c(a[7], b[0], c2, c3, c1); |
450 | 50.7M | r[7] = c2; |
451 | 50.7M | c2 = 0; |
452 | 50.7M | mul_add_c(a[7], b[1], c3, c1, c2); |
453 | 50.7M | mul_add_c(a[6], b[2], c3, c1, c2); |
454 | 50.7M | mul_add_c(a[5], b[3], c3, c1, c2); |
455 | 50.7M | mul_add_c(a[4], b[4], c3, c1, c2); |
456 | 50.7M | mul_add_c(a[3], b[5], c3, c1, c2); |
457 | 50.7M | mul_add_c(a[2], b[6], c3, c1, c2); |
458 | 50.7M | mul_add_c(a[1], b[7], c3, c1, c2); |
459 | 50.7M | r[8] = c3; |
460 | 50.7M | c3 = 0; |
461 | 50.7M | mul_add_c(a[2], b[7], c1, c2, c3); |
462 | 50.7M | mul_add_c(a[3], b[6], c1, c2, c3); |
463 | 50.7M | mul_add_c(a[4], b[5], c1, c2, c3); |
464 | 50.7M | mul_add_c(a[5], b[4], c1, c2, c3); |
465 | 50.7M | mul_add_c(a[6], b[3], c1, c2, c3); |
466 | 50.7M | mul_add_c(a[7], b[2], c1, c2, c3); |
467 | 50.7M | r[9] = c1; |
468 | 50.7M | c1 = 0; |
469 | 50.7M | mul_add_c(a[7], b[3], c2, c3, c1); |
470 | 50.7M | mul_add_c(a[6], b[4], c2, c3, c1); |
471 | 50.7M | mul_add_c(a[5], b[5], c2, c3, c1); |
472 | 50.7M | mul_add_c(a[4], b[6], c2, c3, c1); |
473 | 50.7M | mul_add_c(a[3], b[7], c2, c3, c1); |
474 | 50.7M | r[10] = c2; |
475 | 50.7M | c2 = 0; |
476 | 50.7M | mul_add_c(a[4], b[7], c3, c1, c2); |
477 | 50.7M | mul_add_c(a[5], b[6], c3, c1, c2); |
478 | 50.7M | mul_add_c(a[6], b[5], c3, c1, c2); |
479 | 50.7M | mul_add_c(a[7], b[4], c3, c1, c2); |
480 | 50.7M | r[11] = c3; |
481 | 50.7M | c3 = 0; |
482 | 50.7M | mul_add_c(a[7], b[5], c1, c2, c3); |
483 | 50.7M | mul_add_c(a[6], b[6], c1, c2, c3); |
484 | 50.7M | mul_add_c(a[5], b[7], c1, c2, c3); |
485 | 50.7M | r[12] = c1; |
486 | 50.7M | c1 = 0; |
487 | 50.7M | mul_add_c(a[6], b[7], c2, c3, c1); |
488 | 50.7M | mul_add_c(a[7], b[6], c2, c3, c1); |
489 | 50.7M | r[13] = c2; |
490 | 50.7M | c2 = 0; |
491 | 50.7M | mul_add_c(a[7], b[7], c3, c1, c2); |
492 | 50.7M | r[14] = c3; |
493 | 50.7M | r[15] = c1; |
494 | 50.7M | } |
495 | | |
496 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
497 | 0 | { |
498 | 0 | BN_ULONG c1, c2, c3; |
499 | | |
500 | 0 | c1 = 0; |
501 | 0 | c2 = 0; |
502 | 0 | c3 = 0; |
503 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
504 | 0 | r[0] = c1; |
505 | 0 | c1 = 0; |
506 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
507 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
508 | 0 | r[1] = c2; |
509 | 0 | c2 = 0; |
510 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
511 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
512 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
513 | 0 | r[2] = c3; |
514 | 0 | c3 = 0; |
515 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
516 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
517 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
518 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
519 | 0 | r[3] = c1; |
520 | 0 | c1 = 0; |
521 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
522 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
523 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
524 | 0 | r[4] = c2; |
525 | 0 | c2 = 0; |
526 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
527 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
528 | 0 | r[5] = c3; |
529 | 0 | c3 = 0; |
530 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
531 | 0 | r[6] = c1; |
532 | 0 | r[7] = c2; |
533 | 0 | } |
534 | | |
535 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
536 | 7.14M | { |
537 | 7.14M | BN_ULONG c1, c2, c3; |
538 | | |
539 | 7.14M | c1 = 0; |
540 | 7.14M | c2 = 0; |
541 | 7.14M | c3 = 0; |
542 | 7.14M | sqr_add_c(a, 0, c1, c2, c3); |
543 | 7.14M | r[0] = c1; |
544 | 7.14M | c1 = 0; |
545 | 7.14M | sqr_add_c2(a, 1, 0, c2, c3, c1); |
546 | 7.14M | r[1] = c2; |
547 | 7.14M | c2 = 0; |
548 | 7.14M | sqr_add_c(a, 1, c3, c1, c2); |
549 | 7.14M | sqr_add_c2(a, 2, 0, c3, c1, c2); |
550 | 7.14M | r[2] = c3; |
551 | 7.14M | c3 = 0; |
552 | 7.14M | sqr_add_c2(a, 3, 0, c1, c2, c3); |
553 | 7.14M | sqr_add_c2(a, 2, 1, c1, c2, c3); |
554 | 7.14M | r[3] = c1; |
555 | 7.14M | c1 = 0; |
556 | 7.14M | sqr_add_c(a, 2, c2, c3, c1); |
557 | 7.14M | sqr_add_c2(a, 3, 1, c2, c3, c1); |
558 | 7.14M | sqr_add_c2(a, 4, 0, c2, c3, c1); |
559 | 7.14M | r[4] = c2; |
560 | 7.14M | c2 = 0; |
561 | 7.14M | sqr_add_c2(a, 5, 0, c3, c1, c2); |
562 | 7.14M | sqr_add_c2(a, 4, 1, c3, c1, c2); |
563 | 7.14M | sqr_add_c2(a, 3, 2, c3, c1, c2); |
564 | 7.14M | r[5] = c3; |
565 | 7.14M | c3 = 0; |
566 | 7.14M | sqr_add_c(a, 3, c1, c2, c3); |
567 | 7.14M | sqr_add_c2(a, 4, 2, c1, c2, c3); |
568 | 7.14M | sqr_add_c2(a, 5, 1, c1, c2, c3); |
569 | 7.14M | sqr_add_c2(a, 6, 0, c1, c2, c3); |
570 | 7.14M | r[6] = c1; |
571 | 7.14M | c1 = 0; |
572 | 7.14M | sqr_add_c2(a, 7, 0, c2, c3, c1); |
573 | 7.14M | sqr_add_c2(a, 6, 1, c2, c3, c1); |
574 | 7.14M | sqr_add_c2(a, 5, 2, c2, c3, c1); |
575 | 7.14M | sqr_add_c2(a, 4, 3, c2, c3, c1); |
576 | 7.14M | r[7] = c2; |
577 | 7.14M | c2 = 0; |
578 | 7.14M | sqr_add_c(a, 4, c3, c1, c2); |
579 | 7.14M | sqr_add_c2(a, 5, 3, c3, c1, c2); |
580 | 7.14M | sqr_add_c2(a, 6, 2, c3, c1, c2); |
581 | 7.14M | sqr_add_c2(a, 7, 1, c3, c1, c2); |
582 | 7.14M | r[8] = c3; |
583 | 7.14M | c3 = 0; |
584 | 7.14M | sqr_add_c2(a, 7, 2, c1, c2, c3); |
585 | 7.14M | sqr_add_c2(a, 6, 3, c1, c2, c3); |
586 | 7.14M | sqr_add_c2(a, 5, 4, c1, c2, c3); |
587 | 7.14M | r[9] = c1; |
588 | 7.14M | c1 = 0; |
589 | 7.14M | sqr_add_c(a, 5, c2, c3, c1); |
590 | 7.14M | sqr_add_c2(a, 6, 4, c2, c3, c1); |
591 | 7.14M | sqr_add_c2(a, 7, 3, c2, c3, c1); |
592 | 7.14M | r[10] = c2; |
593 | 7.14M | c2 = 0; |
594 | 7.14M | sqr_add_c2(a, 7, 4, c3, c1, c2); |
595 | 7.14M | sqr_add_c2(a, 6, 5, c3, c1, c2); |
596 | 7.14M | r[11] = c3; |
597 | 7.14M | c3 = 0; |
598 | 7.14M | sqr_add_c(a, 6, c1, c2, c3); |
599 | 7.14M | sqr_add_c2(a, 7, 5, c1, c2, c3); |
600 | 7.14M | r[12] = c1; |
601 | 7.14M | c1 = 0; |
602 | 7.14M | sqr_add_c2(a, 7, 6, c2, c3, c1); |
603 | 7.14M | r[13] = c2; |
604 | 7.14M | c2 = 0; |
605 | 7.14M | sqr_add_c(a, 7, c3, c1, c2); |
606 | 7.14M | r[14] = c3; |
607 | 7.14M | r[15] = c1; |
608 | 7.14M | } |
609 | | |
610 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
611 | 62.8M | { |
612 | 62.8M | BN_ULONG c1, c2, c3; |
613 | | |
614 | 62.8M | c1 = 0; |
615 | 62.8M | c2 = 0; |
616 | 62.8M | c3 = 0; |
617 | 62.8M | sqr_add_c(a, 0, c1, c2, c3); |
618 | 62.8M | r[0] = c1; |
619 | 62.8M | c1 = 0; |
620 | 62.8M | sqr_add_c2(a, 1, 0, c2, c3, c1); |
621 | 62.8M | r[1] = c2; |
622 | 62.8M | c2 = 0; |
623 | 62.8M | sqr_add_c(a, 1, c3, c1, c2); |
624 | 62.8M | sqr_add_c2(a, 2, 0, c3, c1, c2); |
625 | 62.8M | r[2] = c3; |
626 | 62.8M | c3 = 0; |
627 | 62.8M | sqr_add_c2(a, 3, 0, c1, c2, c3); |
628 | 62.8M | sqr_add_c2(a, 2, 1, c1, c2, c3); |
629 | 62.8M | r[3] = c1; |
630 | 62.8M | c1 = 0; |
631 | 62.8M | sqr_add_c(a, 2, c2, c3, c1); |
632 | 62.8M | sqr_add_c2(a, 3, 1, c2, c3, c1); |
633 | 62.8M | r[4] = c2; |
634 | 62.8M | c2 = 0; |
635 | 62.8M | sqr_add_c2(a, 3, 2, c3, c1, c2); |
636 | 62.8M | r[5] = c3; |
637 | 62.8M | c3 = 0; |
638 | 62.8M | sqr_add_c(a, 3, c1, c2, c3); |
639 | 62.8M | r[6] = c1; |
640 | 62.8M | r[7] = c2; |
641 | 62.8M | } |
642 | | #endif |