/src/openssl/crypto/bn/bn_asm.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include <assert.h> |
11 | | #include <openssl/crypto.h> |
12 | | #include "internal/cryptlib.h" |
13 | | #include "bn_local.h" |
14 | | |
15 | | #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) |
16 | | |
17 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
18 | | BN_ULONG w) |
19 | | { |
20 | | BN_ULONG c1 = 0; |
21 | | |
22 | | assert(num >= 0); |
23 | | if (num <= 0) |
24 | | return c1; |
25 | | |
26 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
27 | | while (num & ~3) { |
28 | | mul_add(rp[0], ap[0], w, c1); |
29 | | mul_add(rp[1], ap[1], w, c1); |
30 | | mul_add(rp[2], ap[2], w, c1); |
31 | | mul_add(rp[3], ap[3], w, c1); |
32 | | ap += 4; |
33 | | rp += 4; |
34 | | num -= 4; |
35 | | } |
36 | | # endif |
37 | | while (num) { |
38 | | mul_add(rp[0], ap[0], w, c1); |
39 | | ap++; |
40 | | rp++; |
41 | | num--; |
42 | | } |
43 | | |
44 | | return c1; |
45 | | } |
46 | | |
47 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
48 | | { |
49 | | BN_ULONG c1 = 0; |
50 | | |
51 | | assert(num >= 0); |
52 | | if (num <= 0) |
53 | | return c1; |
54 | | |
55 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
56 | | while (num & ~3) { |
57 | | mul(rp[0], ap[0], w, c1); |
58 | | mul(rp[1], ap[1], w, c1); |
59 | | mul(rp[2], ap[2], w, c1); |
60 | | mul(rp[3], ap[3], w, c1); |
61 | | ap += 4; |
62 | | rp += 4; |
63 | | num -= 4; |
64 | | } |
65 | | # endif |
66 | | while (num) { |
67 | | mul(rp[0], ap[0], w, c1); |
68 | | ap++; |
69 | | rp++; |
70 | | num--; |
71 | | } |
72 | | return c1; |
73 | | } |
74 | | |
75 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
76 | | { |
77 | | assert(n >= 0); |
78 | | if (n <= 0) |
79 | | return; |
80 | | |
81 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
82 | | while (n & ~3) { |
83 | | sqr(r[0], r[1], a[0]); |
84 | | sqr(r[2], r[3], a[1]); |
85 | | sqr(r[4], r[5], a[2]); |
86 | | sqr(r[6], r[7], a[3]); |
87 | | a += 4; |
88 | | r += 8; |
89 | | n -= 4; |
90 | | } |
91 | | # endif |
92 | | while (n) { |
93 | | sqr(r[0], r[1], a[0]); |
94 | | a++; |
95 | | r += 2; |
96 | | n--; |
97 | | } |
98 | | } |
99 | | |
100 | | #else /* !(defined(BN_LLONG) || |
101 | | * defined(BN_UMULT_HIGH)) */ |
102 | | |
103 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
104 | | BN_ULONG w) |
105 | 818M | { |
106 | 818M | BN_ULONG c = 0; |
107 | 818M | BN_ULONG bl, bh; |
108 | | |
109 | 818M | assert(num >= 0); |
110 | 818M | if (num <= 0) |
111 | 0 | return (BN_ULONG)0; |
112 | | |
113 | 818M | bl = LBITS(w); |
114 | 818M | bh = HBITS(w); |
115 | | |
116 | 818M | # ifndef OPENSSL_SMALL_FOOTPRINT |
117 | 7.04G | while (num & ~3) { |
118 | 6.22G | mul_add(rp[0], ap[0], bl, bh, c); |
119 | 6.22G | mul_add(rp[1], ap[1], bl, bh, c); |
120 | 6.22G | mul_add(rp[2], ap[2], bl, bh, c); |
121 | 6.22G | mul_add(rp[3], ap[3], bl, bh, c); |
122 | 6.22G | ap += 4; |
123 | 6.22G | rp += 4; |
124 | 6.22G | num -= 4; |
125 | 6.22G | } |
126 | 818M | # endif |
127 | 943M | while (num) { |
128 | 125M | mul_add(rp[0], ap[0], bl, bh, c); |
129 | 125M | ap++; |
130 | 125M | rp++; |
131 | 125M | num--; |
132 | 125M | } |
133 | 818M | return c; |
134 | 818M | } |
135 | | |
136 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
137 | 13.1M | { |
138 | 13.1M | BN_ULONG carry = 0; |
139 | 13.1M | BN_ULONG bl, bh; |
140 | | |
141 | 13.1M | assert(num >= 0); |
142 | 13.1M | if (num <= 0) |
143 | 0 | return (BN_ULONG)0; |
144 | | |
145 | 13.1M | bl = LBITS(w); |
146 | 13.1M | bh = HBITS(w); |
147 | | |
148 | 13.1M | # ifndef OPENSSL_SMALL_FOOTPRINT |
149 | 61.6M | while (num & ~3) { |
150 | 48.4M | mul(rp[0], ap[0], bl, bh, carry); |
151 | 48.4M | mul(rp[1], ap[1], bl, bh, carry); |
152 | 48.4M | mul(rp[2], ap[2], bl, bh, carry); |
153 | 48.4M | mul(rp[3], ap[3], bl, bh, carry); |
154 | 48.4M | ap += 4; |
155 | 48.4M | rp += 4; |
156 | 48.4M | num -= 4; |
157 | 48.4M | } |
158 | 13.1M | # endif |
159 | 33.1M | while (num) { |
160 | 20.0M | mul(rp[0], ap[0], bl, bh, carry); |
161 | 20.0M | ap++; |
162 | 20.0M | rp++; |
163 | 20.0M | num--; |
164 | 20.0M | } |
165 | 13.1M | return carry; |
166 | 13.1M | } |
167 | | |
168 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
169 | 4.37M | { |
170 | 4.37M | assert(n >= 0); |
171 | 4.37M | if (n <= 0) |
172 | 0 | return; |
173 | | |
174 | 4.37M | # ifndef OPENSSL_SMALL_FOOTPRINT |
175 | 10.8M | while (n & ~3) { |
176 | 6.45M | sqr64(r[0], r[1], a[0]); |
177 | 6.45M | sqr64(r[2], r[3], a[1]); |
178 | 6.45M | sqr64(r[4], r[5], a[2]); |
179 | 6.45M | sqr64(r[6], r[7], a[3]); |
180 | 6.45M | a += 4; |
181 | 6.45M | r += 8; |
182 | 6.45M | n -= 4; |
183 | 6.45M | } |
184 | 4.37M | # endif |
185 | 15.9M | while (n) { |
186 | 11.5M | sqr64(r[0], r[1], a[0]); |
187 | 11.5M | a++; |
188 | 11.5M | r += 2; |
189 | 11.5M | n--; |
190 | 11.5M | } |
191 | 4.37M | } |
192 | | |
193 | | #endif /* !(defined(BN_LLONG) || |
194 | | * defined(BN_UMULT_HIGH)) */ |
195 | | |
196 | | #if defined(BN_LLONG) && defined(BN_DIV2W) |
197 | | |
198 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
199 | | { |
200 | | return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); |
201 | | } |
202 | | |
203 | | #else |
204 | | |
205 | | /* Divide h,l by d and return the result. */ |
206 | | /* I need to test this some more :-( */ |
207 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
208 | 7.37M | { |
209 | 7.37M | BN_ULONG dh, dl, q, ret = 0, th, tl, t; |
210 | 7.37M | int i, count = 2; |
211 | | |
212 | 7.37M | if (d == 0) |
213 | 0 | return BN_MASK2; |
214 | | |
215 | 7.37M | i = BN_num_bits_word(d); |
216 | 7.37M | assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); |
217 | | |
218 | 7.37M | i = BN_BITS2 - i; |
219 | 7.37M | if (h >= d) |
220 | 0 | h -= d; |
221 | | |
222 | 7.37M | if (i) { |
223 | 0 | d <<= i; |
224 | 0 | h = (h << i) | (l >> (BN_BITS2 - i)); |
225 | 0 | l <<= i; |
226 | 0 | } |
227 | 7.37M | dh = (d & BN_MASK2h) >> BN_BITS4; |
228 | 7.37M | dl = (d & BN_MASK2l); |
229 | 14.7M | for (;;) { |
230 | 14.7M | if ((h >> BN_BITS4) == dh) |
231 | 1.21k | q = BN_MASK2l; |
232 | 14.7M | else |
233 | 14.7M | q = h / dh; |
234 | | |
235 | 14.7M | th = q * dh; |
236 | 14.7M | tl = dl * q; |
237 | 21.4M | for (;;) { |
238 | 21.4M | t = h - th; |
239 | 21.4M | if ((t & BN_MASK2h) || |
240 | 21.4M | ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) |
241 | 14.7M | break; |
242 | 6.75M | q--; |
243 | 6.75M | th -= dh; |
244 | 6.75M | tl -= dl; |
245 | 6.75M | } |
246 | 14.7M | t = (tl >> BN_BITS4); |
247 | 14.7M | tl = (tl << BN_BITS4) & BN_MASK2h; |
248 | 14.7M | th += t; |
249 | | |
250 | 14.7M | if (l < tl) |
251 | 6.96M | th++; |
252 | 14.7M | l -= tl; |
253 | 14.7M | if (h < th) { |
254 | 0 | h += d; |
255 | 0 | q--; |
256 | 0 | } |
257 | 14.7M | h -= th; |
258 | | |
259 | 14.7M | if (--count == 0) |
260 | 7.37M | break; |
261 | | |
262 | 7.37M | ret = q << BN_BITS4; |
263 | 7.37M | h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; |
264 | 7.37M | l = (l & BN_MASK2l) << BN_BITS4; |
265 | 7.37M | } |
266 | 7.37M | ret |= q; |
267 | 7.37M | return ret; |
268 | 7.37M | } |
269 | | #endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */ |
270 | | |
271 | | #ifdef BN_LLONG |
272 | | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
273 | | int n) |
274 | | { |
275 | | BN_ULLONG ll = 0; |
276 | | |
277 | | assert(n >= 0); |
278 | | if (n <= 0) |
279 | | return (BN_ULONG)0; |
280 | | |
281 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
282 | | while (n & ~3) { |
283 | | ll += (BN_ULLONG) a[0] + b[0]; |
284 | | r[0] = (BN_ULONG)ll & BN_MASK2; |
285 | | ll >>= BN_BITS2; |
286 | | ll += (BN_ULLONG) a[1] + b[1]; |
287 | | r[1] = (BN_ULONG)ll & BN_MASK2; |
288 | | ll >>= BN_BITS2; |
289 | | ll += (BN_ULLONG) a[2] + b[2]; |
290 | | r[2] = (BN_ULONG)ll & BN_MASK2; |
291 | | ll >>= BN_BITS2; |
292 | | ll += (BN_ULLONG) a[3] + b[3]; |
293 | | r[3] = (BN_ULONG)ll & BN_MASK2; |
294 | | ll >>= BN_BITS2; |
295 | | a += 4; |
296 | | b += 4; |
297 | | r += 4; |
298 | | n -= 4; |
299 | | } |
300 | | # endif |
301 | | while (n) { |
302 | | ll += (BN_ULLONG) a[0] + b[0]; |
303 | | r[0] = (BN_ULONG)ll & BN_MASK2; |
304 | | ll >>= BN_BITS2; |
305 | | a++; |
306 | | b++; |
307 | | r++; |
308 | | n--; |
309 | | } |
310 | | return (BN_ULONG)ll; |
311 | | } |
312 | | #else /* !BN_LLONG */ |
313 | | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
314 | | int n) |
315 | 164M | { |
316 | 164M | BN_ULONG c, l, t; |
317 | | |
318 | 164M | assert(n >= 0); |
319 | 164M | if (n <= 0) |
320 | 150k | return (BN_ULONG)0; |
321 | | |
322 | 164M | c = 0; |
323 | 164M | # ifndef OPENSSL_SMALL_FOOTPRINT |
324 | 979M | while (n & ~3) { |
325 | 815M | t = a[0]; |
326 | 815M | t = (t + c) & BN_MASK2; |
327 | 815M | c = (t < c); |
328 | 815M | l = (t + b[0]) & BN_MASK2; |
329 | 815M | c += (l < t); |
330 | 815M | r[0] = l; |
331 | 815M | t = a[1]; |
332 | 815M | t = (t + c) & BN_MASK2; |
333 | 815M | c = (t < c); |
334 | 815M | l = (t + b[1]) & BN_MASK2; |
335 | 815M | c += (l < t); |
336 | 815M | r[1] = l; |
337 | 815M | t = a[2]; |
338 | 815M | t = (t + c) & BN_MASK2; |
339 | 815M | c = (t < c); |
340 | 815M | l = (t + b[2]) & BN_MASK2; |
341 | 815M | c += (l < t); |
342 | 815M | r[2] = l; |
343 | 815M | t = a[3]; |
344 | 815M | t = (t + c) & BN_MASK2; |
345 | 815M | c = (t < c); |
346 | 815M | l = (t + b[3]) & BN_MASK2; |
347 | 815M | c += (l < t); |
348 | 815M | r[3] = l; |
349 | 815M | a += 4; |
350 | 815M | b += 4; |
351 | 815M | r += 4; |
352 | 815M | n -= 4; |
353 | 815M | } |
354 | 164M | # endif |
355 | 192M | while (n) { |
356 | 27.7M | t = a[0]; |
357 | 27.7M | t = (t + c) & BN_MASK2; |
358 | 27.7M | c = (t < c); |
359 | 27.7M | l = (t + b[0]) & BN_MASK2; |
360 | 27.7M | c += (l < t); |
361 | 27.7M | r[0] = l; |
362 | 27.7M | a++; |
363 | 27.7M | b++; |
364 | 27.7M | r++; |
365 | 27.7M | n--; |
366 | 27.7M | } |
367 | 164M | return (BN_ULONG)c; |
368 | 164M | } |
369 | | #endif /* !BN_LLONG */ |
370 | | |
371 | | BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
372 | | int n) |
373 | 206M | { |
374 | 206M | BN_ULONG t1, t2; |
375 | 206M | int c = 0; |
376 | | |
377 | 206M | assert(n >= 0); |
378 | 206M | if (n <= 0) |
379 | 78.2k | return (BN_ULONG)0; |
380 | | |
381 | 206M | #ifndef OPENSSL_SMALL_FOOTPRINT |
382 | 978M | while (n & ~3) { |
383 | 771M | t1 = a[0]; |
384 | 771M | t2 = (t1 - c) & BN_MASK2; |
385 | 771M | c = (t2 > t1); |
386 | 771M | t1 = b[0]; |
387 | 771M | t1 = (t2 - t1) & BN_MASK2; |
388 | 771M | r[0] = t1; |
389 | 771M | c += (t1 > t2); |
390 | 771M | t1 = a[1]; |
391 | 771M | t2 = (t1 - c) & BN_MASK2; |
392 | 771M | c = (t2 > t1); |
393 | 771M | t1 = b[1]; |
394 | 771M | t1 = (t2 - t1) & BN_MASK2; |
395 | 771M | r[1] = t1; |
396 | 771M | c += (t1 > t2); |
397 | 771M | t1 = a[2]; |
398 | 771M | t2 = (t1 - c) & BN_MASK2; |
399 | 771M | c = (t2 > t1); |
400 | 771M | t1 = b[2]; |
401 | 771M | t1 = (t2 - t1) & BN_MASK2; |
402 | 771M | r[2] = t1; |
403 | 771M | c += (t1 > t2); |
404 | 771M | t1 = a[3]; |
405 | 771M | t2 = (t1 - c) & BN_MASK2; |
406 | 771M | c = (t2 > t1); |
407 | 771M | t1 = b[3]; |
408 | 771M | t1 = (t2 - t1) & BN_MASK2; |
409 | 771M | r[3] = t1; |
410 | 771M | c += (t1 > t2); |
411 | 771M | a += 4; |
412 | 771M | b += 4; |
413 | 771M | r += 4; |
414 | 771M | n -= 4; |
415 | 771M | } |
416 | 206M | #endif |
417 | 236M | while (n) { |
418 | 29.4M | t1 = a[0]; |
419 | 29.4M | t2 = (t1 - c) & BN_MASK2; |
420 | 29.4M | c = (t2 > t1); |
421 | 29.4M | t1 = b[0]; |
422 | 29.4M | t1 = (t2 - t1) & BN_MASK2; |
423 | 29.4M | r[0] = t1; |
424 | 29.4M | c += (t1 > t2); |
425 | 29.4M | a++; |
426 | 29.4M | b++; |
427 | 29.4M | r++; |
428 | 29.4M | n--; |
429 | 29.4M | } |
430 | 206M | return c; |
431 | 206M | } |
432 | | |
433 | | #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
434 | | |
435 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
436 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
437 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
438 | | /* |
439 | | * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
440 | | * c=(c2,c1,c0) |
441 | | */ |
442 | | |
443 | | # ifdef BN_LLONG |
/*
 * BN_LLONG flavour of the comba accumulation macros: the product is formed
 * in a double-width BN_ULLONG and split with Lw()/Hw().
 *
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of a * b */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/*
 * (c2,c1,c0) += a[i] * a[i].  The array argument is parenthesised, as in
 * the other flavours of this macro, so that any pointer expression can be
 * passed safely.
 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
481 | | |
482 | | # elif defined(BN_UMULT_LOHI) |
/*
 * BN_UMULT_LOHI flavour of the comba accumulation macros: one
 * BN_UMULT_LOHI() invocation delivers both halves of the product.
 *
 * The carry out of the low-word addition can always be folded into the
 * high product word without overflow, because the high word of a
 * multiplication result is never all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {            \
        BN_ULONG lhs = (a), rhs = (b);                  \
        BN_ULONG prod_lo, prod_hi;                      \
        BN_UMULT_LOHI(prod_lo,prod_hi,lhs,rhs);         \
        c0 += prod_lo; prod_hi += (c0<prod_lo);         \
        c1 += prod_hi; c2 += (c1<prod_hi);              \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of a * b */
#  define mul_add_c2(a,b,c0,c1,c2)      do {            \
        BN_ULONG lhs = (a), rhs = (b);                  \
        BN_ULONG prod_lo, prod_hi, first_hi;            \
        BN_UMULT_LOHI(prod_lo,prod_hi,lhs,rhs);         \
        c0 += prod_lo; first_hi = prod_hi + (c0<prod_lo); \
        c1 += first_hi; c2 += (c1<first_hi);            \
        c0 += prod_lo; prod_hi += (c0<prod_lo);         \
        c1 += prod_hi; c2 += (c1<prod_hi);              \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2)       do {            \
        BN_ULONG lhs = (a)[i];                          \
        BN_ULONG prod_lo, prod_hi;                      \
        BN_UMULT_LOHI(prod_lo,prod_hi,lhs,lhs);         \
        c0 += prod_lo; prod_hi += (c0<prod_lo);         \
        c1 += prod_hi; c2 += (c1<prod_hi);              \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
515 | | |
516 | | # elif defined(BN_UMULT_HIGH) |
/*
 * BN_UMULT_HIGH flavour of the comba accumulation macros: the low product
 * word comes from an ordinary multiplication, the high word from
 * BN_UMULT_HIGH().
 *
 * The carry out of the low-word addition can always be folded into the
 * high product word without overflow, because the high word of a
 * multiplication result is never all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {            \
        BN_ULONG lhs = (a), rhs = (b);                  \
        BN_ULONG prod_lo = lhs * rhs;                   \
        BN_ULONG prod_hi = BN_UMULT_HIGH(lhs,rhs);      \
        c0 += prod_lo; prod_hi += (c0<prod_lo);         \
        c1 += prod_hi; c2 += (c1<prod_hi);              \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of a * b */
#  define mul_add_c2(a,b,c0,c1,c2)      do {            \
        BN_ULONG lhs = (a), rhs = (b), first_hi;        \
        BN_ULONG prod_lo = lhs * rhs;                   \
        BN_ULONG prod_hi = BN_UMULT_HIGH(lhs,rhs);      \
        c0 += prod_lo; first_hi = prod_hi + (c0<prod_lo); \
        c1 += first_hi; c2 += (c1<first_hi);            \
        c0 += prod_lo; prod_hi += (c0<prod_lo);         \
        c1 += prod_hi; c2 += (c1<prod_hi);              \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2)       do {            \
        BN_ULONG lhs = (a)[i];                          \
        BN_ULONG prod_lo = lhs * lhs;                   \
        BN_ULONG prod_hi = BN_UMULT_HIGH(lhs,lhs);      \
        c0 += prod_lo; prod_hi += (c0<prod_lo);         \
        c1 += prod_hi; c2 += (c1<prod_hi);              \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
549 | | |
550 | | # else /* !BN_LLONG */ |
/*
 * Fallback flavour of the comba accumulation macros, used when none of
 * BN_LLONG, BN_UMULT_LOHI or BN_UMULT_HIGH is available: operands are
 * split with LBITS()/HBITS() and combined by the mul64()/sqr64()
 * primitives (defined outside this file), which leave the product in
 * their first two arguments.
 *
 * The carry out of the low-word addition can always be folded into the
 * high product word without overflow, because the high word of a
 * multiplication result is never all-ones.
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {                    \
        BN_ULONG prod_lo = LBITS(a), prod_hi = HBITS(a);        \
        BN_ULONG b_lo = LBITS(b), b_hi = HBITS(b);              \
        mul64(prod_lo,prod_hi,b_lo,b_hi);                       \
        c0 = (c0+prod_lo)&BN_MASK2; prod_hi += (c0<prod_lo);    \
        c1 = (c1+prod_hi)&BN_MASK2; c2 += (c1<prod_hi);         \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two successive additions of a * b */
#  define mul_add_c2(a,b,c0,c1,c2)      do {                    \
        BN_ULONG first_hi;                                      \
        BN_ULONG prod_lo = LBITS(a), prod_hi = HBITS(a);        \
        BN_ULONG b_lo = LBITS(b), b_hi = HBITS(b);              \
        mul64(prod_lo,prod_hi,b_lo,b_hi);                       \
        first_hi = prod_hi;                                     \
        c0 = (c0+prod_lo)&BN_MASK2; first_hi += (c0<prod_lo);   \
        c1 = (c1+first_hi)&BN_MASK2; c2 += (c1<first_hi);       \
        c0 = (c0+prod_lo)&BN_MASK2; prod_hi += (c0<prod_lo);    \
        c1 = (c1+prod_hi)&BN_MASK2; c2 += (c1<prod_hi);         \
        } while(0)

/* (c2,c1,c0) += a[i] * a[i] */
#  define sqr_add_c(a,i,c0,c1,c2)       do {                    \
        BN_ULONG prod_lo, prod_hi;                              \
        sqr64(prod_lo,prod_hi,(a)[i]);                          \
        c0 = (c0+prod_lo)&BN_MASK2; prod_hi += (c0<prod_lo);    \
        c1 = (c1+prod_hi)&BN_MASK2; c2 += (c1<prod_hi);         \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
584 | | # endif /* !BN_LLONG */ |
585 | | |
586 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
587 | 30.1M | { |
588 | 30.1M | BN_ULONG c1, c2, c3; |
589 | | |
590 | 30.1M | c1 = 0; |
591 | 30.1M | c2 = 0; |
592 | 30.1M | c3 = 0; |
593 | 30.1M | mul_add_c(a[0], b[0], c1, c2, c3); |
594 | 30.1M | r[0] = c1; |
595 | 30.1M | c1 = 0; |
596 | 30.1M | mul_add_c(a[0], b[1], c2, c3, c1); |
597 | 30.1M | mul_add_c(a[1], b[0], c2, c3, c1); |
598 | 30.1M | r[1] = c2; |
599 | 30.1M | c2 = 0; |
600 | 30.1M | mul_add_c(a[2], b[0], c3, c1, c2); |
601 | 30.1M | mul_add_c(a[1], b[1], c3, c1, c2); |
602 | 30.1M | mul_add_c(a[0], b[2], c3, c1, c2); |
603 | 30.1M | r[2] = c3; |
604 | 30.1M | c3 = 0; |
605 | 30.1M | mul_add_c(a[0], b[3], c1, c2, c3); |
606 | 30.1M | mul_add_c(a[1], b[2], c1, c2, c3); |
607 | 30.1M | mul_add_c(a[2], b[1], c1, c2, c3); |
608 | 30.1M | mul_add_c(a[3], b[0], c1, c2, c3); |
609 | 30.1M | r[3] = c1; |
610 | 30.1M | c1 = 0; |
611 | 30.1M | mul_add_c(a[4], b[0], c2, c3, c1); |
612 | 30.1M | mul_add_c(a[3], b[1], c2, c3, c1); |
613 | 30.1M | mul_add_c(a[2], b[2], c2, c3, c1); |
614 | 30.1M | mul_add_c(a[1], b[3], c2, c3, c1); |
615 | 30.1M | mul_add_c(a[0], b[4], c2, c3, c1); |
616 | 30.1M | r[4] = c2; |
617 | 30.1M | c2 = 0; |
618 | 30.1M | mul_add_c(a[0], b[5], c3, c1, c2); |
619 | 30.1M | mul_add_c(a[1], b[4], c3, c1, c2); |
620 | 30.1M | mul_add_c(a[2], b[3], c3, c1, c2); |
621 | 30.1M | mul_add_c(a[3], b[2], c3, c1, c2); |
622 | 30.1M | mul_add_c(a[4], b[1], c3, c1, c2); |
623 | 30.1M | mul_add_c(a[5], b[0], c3, c1, c2); |
624 | 30.1M | r[5] = c3; |
625 | 30.1M | c3 = 0; |
626 | 30.1M | mul_add_c(a[6], b[0], c1, c2, c3); |
627 | 30.1M | mul_add_c(a[5], b[1], c1, c2, c3); |
628 | 30.1M | mul_add_c(a[4], b[2], c1, c2, c3); |
629 | 30.1M | mul_add_c(a[3], b[3], c1, c2, c3); |
630 | 30.1M | mul_add_c(a[2], b[4], c1, c2, c3); |
631 | 30.1M | mul_add_c(a[1], b[5], c1, c2, c3); |
632 | 30.1M | mul_add_c(a[0], b[6], c1, c2, c3); |
633 | 30.1M | r[6] = c1; |
634 | 30.1M | c1 = 0; |
635 | 30.1M | mul_add_c(a[0], b[7], c2, c3, c1); |
636 | 30.1M | mul_add_c(a[1], b[6], c2, c3, c1); |
637 | 30.1M | mul_add_c(a[2], b[5], c2, c3, c1); |
638 | 30.1M | mul_add_c(a[3], b[4], c2, c3, c1); |
639 | 30.1M | mul_add_c(a[4], b[3], c2, c3, c1); |
640 | 30.1M | mul_add_c(a[5], b[2], c2, c3, c1); |
641 | 30.1M | mul_add_c(a[6], b[1], c2, c3, c1); |
642 | 30.1M | mul_add_c(a[7], b[0], c2, c3, c1); |
643 | 30.1M | r[7] = c2; |
644 | 30.1M | c2 = 0; |
645 | 30.1M | mul_add_c(a[7], b[1], c3, c1, c2); |
646 | 30.1M | mul_add_c(a[6], b[2], c3, c1, c2); |
647 | 30.1M | mul_add_c(a[5], b[3], c3, c1, c2); |
648 | 30.1M | mul_add_c(a[4], b[4], c3, c1, c2); |
649 | 30.1M | mul_add_c(a[3], b[5], c3, c1, c2); |
650 | 30.1M | mul_add_c(a[2], b[6], c3, c1, c2); |
651 | 30.1M | mul_add_c(a[1], b[7], c3, c1, c2); |
652 | 30.1M | r[8] = c3; |
653 | 30.1M | c3 = 0; |
654 | 30.1M | mul_add_c(a[2], b[7], c1, c2, c3); |
655 | 30.1M | mul_add_c(a[3], b[6], c1, c2, c3); |
656 | 30.1M | mul_add_c(a[4], b[5], c1, c2, c3); |
657 | 30.1M | mul_add_c(a[5], b[4], c1, c2, c3); |
658 | 30.1M | mul_add_c(a[6], b[3], c1, c2, c3); |
659 | 30.1M | mul_add_c(a[7], b[2], c1, c2, c3); |
660 | 30.1M | r[9] = c1; |
661 | 30.1M | c1 = 0; |
662 | 30.1M | mul_add_c(a[7], b[3], c2, c3, c1); |
663 | 30.1M | mul_add_c(a[6], b[4], c2, c3, c1); |
664 | 30.1M | mul_add_c(a[5], b[5], c2, c3, c1); |
665 | 30.1M | mul_add_c(a[4], b[6], c2, c3, c1); |
666 | 30.1M | mul_add_c(a[3], b[7], c2, c3, c1); |
667 | 30.1M | r[10] = c2; |
668 | 30.1M | c2 = 0; |
669 | 30.1M | mul_add_c(a[4], b[7], c3, c1, c2); |
670 | 30.1M | mul_add_c(a[5], b[6], c3, c1, c2); |
671 | 30.1M | mul_add_c(a[6], b[5], c3, c1, c2); |
672 | 30.1M | mul_add_c(a[7], b[4], c3, c1, c2); |
673 | 30.1M | r[11] = c3; |
674 | 30.1M | c3 = 0; |
675 | 30.1M | mul_add_c(a[7], b[5], c1, c2, c3); |
676 | 30.1M | mul_add_c(a[6], b[6], c1, c2, c3); |
677 | 30.1M | mul_add_c(a[5], b[7], c1, c2, c3); |
678 | 30.1M | r[12] = c1; |
679 | 30.1M | c1 = 0; |
680 | 30.1M | mul_add_c(a[6], b[7], c2, c3, c1); |
681 | 30.1M | mul_add_c(a[7], b[6], c2, c3, c1); |
682 | 30.1M | r[13] = c2; |
683 | 30.1M | c2 = 0; |
684 | 30.1M | mul_add_c(a[7], b[7], c3, c1, c2); |
685 | 30.1M | r[14] = c3; |
686 | 30.1M | r[15] = c1; |
687 | 30.1M | } |
688 | | |
689 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
690 | 0 | { |
691 | 0 | BN_ULONG c1, c2, c3; |
692 | |
|
693 | 0 | c1 = 0; |
694 | 0 | c2 = 0; |
695 | 0 | c3 = 0; |
696 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
697 | 0 | r[0] = c1; |
698 | 0 | c1 = 0; |
699 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
700 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
701 | 0 | r[1] = c2; |
702 | 0 | c2 = 0; |
703 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
704 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
705 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
706 | 0 | r[2] = c3; |
707 | 0 | c3 = 0; |
708 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
709 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
710 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
711 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
712 | 0 | r[3] = c1; |
713 | 0 | c1 = 0; |
714 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
715 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
716 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
717 | 0 | r[4] = c2; |
718 | 0 | c2 = 0; |
719 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
720 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
721 | 0 | r[5] = c3; |
722 | 0 | c3 = 0; |
723 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
724 | 0 | r[6] = c1; |
725 | 0 | r[7] = c2; |
726 | 0 | } |
727 | | |
728 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
729 | 151M | { |
730 | 151M | BN_ULONG c1, c2, c3; |
731 | | |
732 | 151M | c1 = 0; |
733 | 151M | c2 = 0; |
734 | 151M | c3 = 0; |
735 | 151M | sqr_add_c(a, 0, c1, c2, c3); |
736 | 151M | r[0] = c1; |
737 | 151M | c1 = 0; |
738 | 151M | sqr_add_c2(a, 1, 0, c2, c3, c1); |
739 | 151M | r[1] = c2; |
740 | 151M | c2 = 0; |
741 | 151M | sqr_add_c(a, 1, c3, c1, c2); |
742 | 151M | sqr_add_c2(a, 2, 0, c3, c1, c2); |
743 | 151M | r[2] = c3; |
744 | 151M | c3 = 0; |
745 | 151M | sqr_add_c2(a, 3, 0, c1, c2, c3); |
746 | 151M | sqr_add_c2(a, 2, 1, c1, c2, c3); |
747 | 151M | r[3] = c1; |
748 | 151M | c1 = 0; |
749 | 151M | sqr_add_c(a, 2, c2, c3, c1); |
750 | 151M | sqr_add_c2(a, 3, 1, c2, c3, c1); |
751 | 151M | sqr_add_c2(a, 4, 0, c2, c3, c1); |
752 | 151M | r[4] = c2; |
753 | 151M | c2 = 0; |
754 | 151M | sqr_add_c2(a, 5, 0, c3, c1, c2); |
755 | 151M | sqr_add_c2(a, 4, 1, c3, c1, c2); |
756 | 151M | sqr_add_c2(a, 3, 2, c3, c1, c2); |
757 | 151M | r[5] = c3; |
758 | 151M | c3 = 0; |
759 | 151M | sqr_add_c(a, 3, c1, c2, c3); |
760 | 151M | sqr_add_c2(a, 4, 2, c1, c2, c3); |
761 | 151M | sqr_add_c2(a, 5, 1, c1, c2, c3); |
762 | 151M | sqr_add_c2(a, 6, 0, c1, c2, c3); |
763 | 151M | r[6] = c1; |
764 | 151M | c1 = 0; |
765 | 151M | sqr_add_c2(a, 7, 0, c2, c3, c1); |
766 | 151M | sqr_add_c2(a, 6, 1, c2, c3, c1); |
767 | 151M | sqr_add_c2(a, 5, 2, c2, c3, c1); |
768 | 151M | sqr_add_c2(a, 4, 3, c2, c3, c1); |
769 | 151M | r[7] = c2; |
770 | 151M | c2 = 0; |
771 | 151M | sqr_add_c(a, 4, c3, c1, c2); |
772 | 151M | sqr_add_c2(a, 5, 3, c3, c1, c2); |
773 | 151M | sqr_add_c2(a, 6, 2, c3, c1, c2); |
774 | 151M | sqr_add_c2(a, 7, 1, c3, c1, c2); |
775 | 151M | r[8] = c3; |
776 | 151M | c3 = 0; |
777 | 151M | sqr_add_c2(a, 7, 2, c1, c2, c3); |
778 | 151M | sqr_add_c2(a, 6, 3, c1, c2, c3); |
779 | 151M | sqr_add_c2(a, 5, 4, c1, c2, c3); |
780 | 151M | r[9] = c1; |
781 | 151M | c1 = 0; |
782 | 151M | sqr_add_c(a, 5, c2, c3, c1); |
783 | 151M | sqr_add_c2(a, 6, 4, c2, c3, c1); |
784 | 151M | sqr_add_c2(a, 7, 3, c2, c3, c1); |
785 | 151M | r[10] = c2; |
786 | 151M | c2 = 0; |
787 | 151M | sqr_add_c2(a, 7, 4, c3, c1, c2); |
788 | 151M | sqr_add_c2(a, 6, 5, c3, c1, c2); |
789 | 151M | r[11] = c3; |
790 | 151M | c3 = 0; |
791 | 151M | sqr_add_c(a, 6, c1, c2, c3); |
792 | 151M | sqr_add_c2(a, 7, 5, c1, c2, c3); |
793 | 151M | r[12] = c1; |
794 | 151M | c1 = 0; |
795 | 151M | sqr_add_c2(a, 7, 6, c2, c3, c1); |
796 | 151M | r[13] = c2; |
797 | 151M | c2 = 0; |
798 | 151M | sqr_add_c(a, 7, c3, c1, c2); |
799 | 151M | r[14] = c3; |
800 | 151M | r[15] = c1; |
801 | 151M | } |
802 | | |
803 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
804 | 510k | { |
805 | 510k | BN_ULONG c1, c2, c3; |
806 | | |
807 | 510k | c1 = 0; |
808 | 510k | c2 = 0; |
809 | 510k | c3 = 0; |
810 | 510k | sqr_add_c(a, 0, c1, c2, c3); |
811 | 510k | r[0] = c1; |
812 | 510k | c1 = 0; |
813 | 510k | sqr_add_c2(a, 1, 0, c2, c3, c1); |
814 | 510k | r[1] = c2; |
815 | 510k | c2 = 0; |
816 | 510k | sqr_add_c(a, 1, c3, c1, c2); |
817 | 510k | sqr_add_c2(a, 2, 0, c3, c1, c2); |
818 | 510k | r[2] = c3; |
819 | 510k | c3 = 0; |
820 | 510k | sqr_add_c2(a, 3, 0, c1, c2, c3); |
821 | 510k | sqr_add_c2(a, 2, 1, c1, c2, c3); |
822 | 510k | r[3] = c1; |
823 | 510k | c1 = 0; |
824 | 510k | sqr_add_c(a, 2, c2, c3, c1); |
825 | 510k | sqr_add_c2(a, 3, 1, c2, c3, c1); |
826 | 510k | r[4] = c2; |
827 | 510k | c2 = 0; |
828 | 510k | sqr_add_c2(a, 3, 2, c3, c1, c2); |
829 | 510k | r[5] = c3; |
830 | 510k | c3 = 0; |
831 | 510k | sqr_add_c(a, 3, c1, c2, c3); |
832 | 510k | r[6] = c1; |
833 | 510k | r[7] = c2; |
834 | 510k | } |
835 | | |
836 | | # ifdef OPENSSL_NO_ASM |
837 | | # ifdef OPENSSL_BN_ASM_MONT |
838 | | # include <alloca.h> |
839 | | /* |
840 | | * This is essentially reference implementation, which may or may not |
841 | | * result in performance improvement. E.g. on IA-32 this routine was |
842 | | * observed to give 40% faster rsa1024 private key operations and 10% |
843 | | * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only |
844 | | * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a |
845 | | * reference implementation, one to be used as starting point for |
846 | | * platform-specific assembler. Mentioned numbers apply to compiler |
847 | | * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and |
848 | | * can vary not only from platform to platform, but even for compiler |
849 | | * versions. Assembler vs. assembler improvement coefficients can |
850 | | * [and are known to] differ and are to be documented elsewhere. |
851 | | */ |
852 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
853 | | const BN_ULONG *np, const BN_ULONG *n0p, int num) |
854 | | { |
855 | | BN_ULONG c0, c1, ml, *tp, n0; |
856 | | # ifdef mul64 |
857 | | BN_ULONG mh; |
858 | | # endif |
859 | | volatile BN_ULONG *vp; |
860 | | int i = 0, j; |
861 | | |
862 | | # if 0 /* template for platform-specific |
863 | | * implementation */ |
864 | | if (ap == bp) |
865 | | return bn_sqr_mont(rp, ap, np, n0p, num); |
866 | | # endif |
867 | | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); |
868 | | |
869 | | n0 = *n0p; |
870 | | |
871 | | c0 = 0; |
872 | | ml = bp[0]; |
873 | | # ifdef mul64 |
874 | | mh = HBITS(ml); |
875 | | ml = LBITS(ml); |
876 | | for (j = 0; j < num; ++j) |
877 | | mul(tp[j], ap[j], ml, mh, c0); |
878 | | # else |
879 | | for (j = 0; j < num; ++j) |
880 | | mul(tp[j], ap[j], ml, c0); |
881 | | # endif |
882 | | |
883 | | tp[num] = c0; |
884 | | tp[num + 1] = 0; |
885 | | goto enter; |
886 | | |
887 | | for (i = 0; i < num; i++) { |
888 | | c0 = 0; |
889 | | ml = bp[i]; |
890 | | # ifdef mul64 |
891 | | mh = HBITS(ml); |
892 | | ml = LBITS(ml); |
893 | | for (j = 0; j < num; ++j) |
894 | | mul_add(tp[j], ap[j], ml, mh, c0); |
895 | | # else |
896 | | for (j = 0; j < num; ++j) |
897 | | mul_add(tp[j], ap[j], ml, c0); |
898 | | # endif |
899 | | c1 = (tp[num] + c0) & BN_MASK2; |
900 | | tp[num] = c1; |
901 | | tp[num + 1] = (c1 < c0 ? 1 : 0); |
902 | | enter: |
903 | | c1 = tp[0]; |
904 | | ml = (c1 * n0) & BN_MASK2; |
905 | | c0 = 0; |
906 | | # ifdef mul64 |
907 | | mh = HBITS(ml); |
908 | | ml = LBITS(ml); |
909 | | mul_add(c1, np[0], ml, mh, c0); |
910 | | # else |
911 | | mul_add(c1, ml, np[0], c0); |
912 | | # endif |
913 | | for (j = 1; j < num; j++) { |
914 | | c1 = tp[j]; |
915 | | # ifdef mul64 |
916 | | mul_add(c1, np[j], ml, mh, c0); |
917 | | # else |
918 | | mul_add(c1, ml, np[j], c0); |
919 | | # endif |
920 | | tp[j - 1] = c1 & BN_MASK2; |
921 | | } |
922 | | c1 = (tp[num] + c0) & BN_MASK2; |
923 | | tp[num - 1] = c1; |
924 | | tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); |
925 | | } |
926 | | |
927 | | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { |
928 | | c0 = bn_sub_words(rp, tp, np, num); |
929 | | if (tp[num] != 0 || c0 == 0) { |
930 | | for (i = 0; i < num + 2; i++) |
931 | | vp[i] = 0; |
932 | | return 1; |
933 | | } |
934 | | } |
935 | | for (i = 0; i < num; i++) |
936 | | rp[i] = tp[i], vp[i] = 0; |
937 | | vp[num] = 0; |
938 | | vp[num + 1] = 0; |
939 | | return 1; |
940 | | } |
941 | | # else |
942 | | /* |
943 | | * Return value of 0 indicates that multiplication/convolution was not |
944 | | * performed to signal the caller to fall down to alternative/original |
945 | | * code-path. |
946 | | */ |
947 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
948 | | const BN_ULONG *np, const BN_ULONG *n0, int num) |
949 | | { |
950 | | return 0; |
951 | | } |
952 | | # endif /* OPENSSL_BN_ASM_MONT */ |
953 | | # endif |
954 | | |
955 | | #else /* !BN_MUL_COMBA */ |
956 | | |
957 | | /* hmm... is it faster just to do a multiply? */ |
958 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
959 | | { |
960 | | BN_ULONG t[8]; |
961 | | bn_sqr_normal(r, a, 4, t); |
962 | | } |
963 | | |
964 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
965 | | { |
966 | | BN_ULONG t[16]; |
967 | | bn_sqr_normal(r, a, 8, t); |
968 | | } |
969 | | |
970 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
971 | | { |
972 | | r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); |
973 | | r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); |
974 | | r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); |
975 | | r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); |
976 | | } |
977 | | |
978 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
979 | | { |
980 | | r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); |
981 | | r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); |
982 | | r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); |
983 | | r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); |
984 | | r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); |
985 | | r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); |
986 | | r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); |
987 | | r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); |
988 | | } |
989 | | |
990 | | # ifdef OPENSSL_NO_ASM |
991 | | # ifdef OPENSSL_BN_ASM_MONT |
992 | | # include <alloca.h> |
993 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
994 | | const BN_ULONG *np, const BN_ULONG *n0p, int num) |
995 | | { |
996 | | BN_ULONG c0, c1, *tp, n0 = *n0p; |
997 | | volatile BN_ULONG *vp; |
998 | | int i = 0, j; |
999 | | |
1000 | | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); |
1001 | | |
1002 | | for (i = 0; i <= num; i++) |
1003 | | tp[i] = 0; |
1004 | | |
1005 | | for (i = 0; i < num; i++) { |
1006 | | c0 = bn_mul_add_words(tp, ap, num, bp[i]); |
1007 | | c1 = (tp[num] + c0) & BN_MASK2; |
1008 | | tp[num] = c1; |
1009 | | tp[num + 1] = (c1 < c0 ? 1 : 0); |
1010 | | |
1011 | | c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); |
1012 | | c1 = (tp[num] + c0) & BN_MASK2; |
1013 | | tp[num] = c1; |
1014 | | tp[num + 1] += (c1 < c0 ? 1 : 0); |
1015 | | for (j = 0; j <= num; j++) |
1016 | | tp[j] = tp[j + 1]; |
1017 | | } |
1018 | | |
1019 | | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { |
1020 | | c0 = bn_sub_words(rp, tp, np, num); |
1021 | | if (tp[num] != 0 || c0 == 0) { |
1022 | | for (i = 0; i < num + 2; i++) |
1023 | | vp[i] = 0; |
1024 | | return 1; |
1025 | | } |
1026 | | } |
1027 | | for (i = 0; i < num; i++) |
1028 | | rp[i] = tp[i], vp[i] = 0; |
1029 | | vp[num] = 0; |
1030 | | vp[num + 1] = 0; |
1031 | | return 1; |
1032 | | } |
1033 | | # else |
1034 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
1035 | | const BN_ULONG *np, const BN_ULONG *n0, int num) |
1036 | | { |
1037 | | return 0; |
1038 | | } |
1039 | | # endif /* OPENSSL_BN_ASM_MONT */ |
1040 | | # endif |
1041 | | |
1042 | | #endif /* !BN_MUL_COMBA */ |