/src/openssl/crypto/bn/bn_asm.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include <assert.h> |
11 | | #include <openssl/crypto.h> |
12 | | #include "internal/cryptlib.h" |
13 | | #include "bn_local.h" |
14 | | |
15 | | #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) |
16 | | |
17 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
18 | | BN_ULONG w) |
19 | | { |
20 | | BN_ULONG c1 = 0; |
21 | | |
22 | | assert(num >= 0); |
23 | | if (num <= 0) |
24 | | return c1; |
25 | | |
26 | | #ifndef OPENSSL_SMALL_FOOTPRINT |
27 | | while (num & ~3) { |
28 | | mul_add(rp[0], ap[0], w, c1); |
29 | | mul_add(rp[1], ap[1], w, c1); |
30 | | mul_add(rp[2], ap[2], w, c1); |
31 | | mul_add(rp[3], ap[3], w, c1); |
32 | | ap += 4; |
33 | | rp += 4; |
34 | | num -= 4; |
35 | | } |
36 | | #endif |
37 | | while (num) { |
38 | | mul_add(rp[0], ap[0], w, c1); |
39 | | ap++; |
40 | | rp++; |
41 | | num--; |
42 | | } |
43 | | |
44 | | return c1; |
45 | | } |
46 | | |
47 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
48 | | { |
49 | | BN_ULONG c1 = 0; |
50 | | |
51 | | assert(num >= 0); |
52 | | if (num <= 0) |
53 | | return c1; |
54 | | |
55 | | #ifndef OPENSSL_SMALL_FOOTPRINT |
56 | | while (num & ~3) { |
57 | | mul(rp[0], ap[0], w, c1); |
58 | | mul(rp[1], ap[1], w, c1); |
59 | | mul(rp[2], ap[2], w, c1); |
60 | | mul(rp[3], ap[3], w, c1); |
61 | | ap += 4; |
62 | | rp += 4; |
63 | | num -= 4; |
64 | | } |
65 | | #endif |
66 | | while (num) { |
67 | | mul(rp[0], ap[0], w, c1); |
68 | | ap++; |
69 | | rp++; |
70 | | num--; |
71 | | } |
72 | | return c1; |
73 | | } |
74 | | |
75 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
76 | | { |
77 | | assert(n >= 0); |
78 | | if (n <= 0) |
79 | | return; |
80 | | |
81 | | #ifndef OPENSSL_SMALL_FOOTPRINT |
82 | | while (n & ~3) { |
83 | | sqr(r[0], r[1], a[0]); |
84 | | sqr(r[2], r[3], a[1]); |
85 | | sqr(r[4], r[5], a[2]); |
86 | | sqr(r[6], r[7], a[3]); |
87 | | a += 4; |
88 | | r += 8; |
89 | | n -= 4; |
90 | | } |
91 | | #endif |
92 | | while (n) { |
93 | | sqr(r[0], r[1], a[0]); |
94 | | a++; |
95 | | r += 2; |
96 | | n--; |
97 | | } |
98 | | } |
99 | | |
100 | | #else /* !(defined(BN_LLONG) || \ |
101 | | * defined(BN_UMULT_HIGH)) */ |
102 | | |
103 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
104 | | BN_ULONG w) |
105 | 0 | { |
106 | 0 | BN_ULONG c = 0; |
107 | 0 | BN_ULONG bl, bh; |
108 | |
|
109 | 0 | assert(num >= 0); |
110 | 0 | if (num <= 0) |
111 | 0 | return (BN_ULONG)0; |
112 | | |
113 | 0 | bl = LBITS(w); |
114 | 0 | bh = HBITS(w); |
115 | |
|
116 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT |
117 | 0 | while (num & ~3) { |
118 | 0 | mul_add(rp[0], ap[0], bl, bh, c); |
119 | 0 | mul_add(rp[1], ap[1], bl, bh, c); |
120 | 0 | mul_add(rp[2], ap[2], bl, bh, c); |
121 | 0 | mul_add(rp[3], ap[3], bl, bh, c); |
122 | 0 | ap += 4; |
123 | 0 | rp += 4; |
124 | 0 | num -= 4; |
125 | 0 | } |
126 | 0 | #endif |
127 | 0 | while (num) { |
128 | 0 | mul_add(rp[0], ap[0], bl, bh, c); |
129 | 0 | ap++; |
130 | 0 | rp++; |
131 | 0 | num--; |
132 | 0 | } |
133 | 0 | return c; |
134 | 0 | } |
135 | | |
136 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
137 | 0 | { |
138 | 0 | BN_ULONG carry = 0; |
139 | 0 | BN_ULONG bl, bh; |
140 | |
|
141 | 0 | assert(num >= 0); |
142 | 0 | if (num <= 0) |
143 | 0 | return (BN_ULONG)0; |
144 | | |
145 | 0 | bl = LBITS(w); |
146 | 0 | bh = HBITS(w); |
147 | |
|
148 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT |
149 | 0 | while (num & ~3) { |
150 | 0 | mul(rp[0], ap[0], bl, bh, carry); |
151 | 0 | mul(rp[1], ap[1], bl, bh, carry); |
152 | 0 | mul(rp[2], ap[2], bl, bh, carry); |
153 | 0 | mul(rp[3], ap[3], bl, bh, carry); |
154 | 0 | ap += 4; |
155 | 0 | rp += 4; |
156 | 0 | num -= 4; |
157 | 0 | } |
158 | 0 | #endif |
159 | 0 | while (num) { |
160 | 0 | mul(rp[0], ap[0], bl, bh, carry); |
161 | 0 | ap++; |
162 | 0 | rp++; |
163 | 0 | num--; |
164 | 0 | } |
165 | 0 | return carry; |
166 | 0 | } |
167 | | |
168 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
169 | 0 | { |
170 | 0 | assert(n >= 0); |
171 | 0 | if (n <= 0) |
172 | 0 | return; |
173 | | |
174 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT |
175 | 0 | while (n & ~3) { |
176 | 0 | sqr64(r[0], r[1], a[0]); |
177 | 0 | sqr64(r[2], r[3], a[1]); |
178 | 0 | sqr64(r[4], r[5], a[2]); |
179 | 0 | sqr64(r[6], r[7], a[3]); |
180 | 0 | a += 4; |
181 | 0 | r += 8; |
182 | 0 | n -= 4; |
183 | 0 | } |
184 | 0 | #endif |
185 | 0 | while (n) { |
186 | 0 | sqr64(r[0], r[1], a[0]); |
187 | 0 | a++; |
188 | 0 | r += 2; |
189 | 0 | n--; |
190 | 0 | } |
191 | 0 | } |
192 | | |
193 | | #endif /* !(defined(BN_LLONG) || \ |
194 | | * defined(BN_UMULT_HIGH)) */ |
195 | | |
196 | | #if defined(BN_LLONG) && defined(BN_DIV2W) |
197 | | |
198 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
199 | | { |
200 | | return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d)); |
201 | | } |
202 | | |
203 | | #else |
204 | | |
205 | | /* Divide h,l by d and return the result. */ |
206 | | /* I need to test this some more :-( */ |
/*
 * Portable division of the two-word value (h:l) by d, producing a one-word
 * quotient built from two half-word digits (count == 2 passes of the outer
 * loop).  Returns BN_MASK2 when d is zero.
 *
 * NOTE(review): BN_BITS4 is used throughout as the half-word shift and
 * BN_MASK2h/BN_MASK2l as the high/low half-word masks; these are defined
 * outside this file -- confirm in bn_local.h.
 */
207 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
208 | 0 | { |
209 | 0 | BN_ULONG dh, dl, q, ret = 0, th, tl, t; |
210 | 0 | int i, count = 2; |
211 | |

/* Division by zero is reported as an all-ones quotient, not trapped. */
212 | 0 | if (d == 0) |
213 | 0 | return BN_MASK2; |
214 | | |
215 | 0 | i = BN_num_bits_word(d); |
216 | 0 | assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); |
217 | |

/* i becomes the left shift that moves d's top set bit to the word's top. */
218 | 0 | i = BN_BITS2 - i; |
219 | 0 | if (h >= d) |
220 | 0 | h -= d; |
221 | |

/* Apply the same shift to the divisor and to the (h:l) dividend pair. */
222 | 0 | if (i) { |
223 | 0 | d <<= i; |
224 | 0 | h = (h << i) | (l >> (BN_BITS2 - i)); |
225 | 0 | l <<= i; |
226 | 0 | } |
/* Split the shifted divisor into its high and low halves. */
227 | 0 | dh = (d & BN_MASK2h) >> BN_BITS4; |
228 | 0 | dl = (d & BN_MASK2l); |
229 | 0 | for (;;) { |
/* First guess for this digit: all-ones if the top halves match, else h/dh. */
230 | 0 | if ((h >> BN_BITS4) == dh) |
231 | 0 | q = BN_MASK2l; |
232 | 0 | else |
233 | 0 | q = h / dh; |
234 | |

235 | 0 | th = q * dh; |
236 | 0 | tl = dl * q; |
/* Step q down (adjusting th/tl with it) until the break test is met. */
237 | 0 | for (;;) { |
238 | 0 | t = h - th; |
239 | 0 | if ((t & BN_MASK2h) || ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) |
240 | 0 | break; |
241 | 0 | q--; |
242 | 0 | th -= dh; |
243 | 0 | tl -= dl; |
244 | 0 | } |
/* Fold tl's upper half into th and move its lower half up, giving (th:tl). */
245 | 0 | t = (tl >> BN_BITS4); |
246 | 0 | tl = (tl << BN_BITS4) & BN_MASK2h; |
247 | 0 | th += t; |
248 | |

/* Subtract (th:tl) from (h:l); th absorbs the borrow out of the low word. */
249 | 0 | if (l < tl) |
250 | 0 | th++; |
251 | 0 | l -= tl; |
/* If the high word would underflow, add d back once and lower the digit. */
252 | 0 | if (h < th) { |
253 | 0 | h += d; |
254 | 0 | q--; |
255 | 0 | } |
256 | 0 | h -= th; |
257 | |

258 | 0 | if (--count == 0) |
259 | 0 | break; |
260 | | |
/* First digit goes to the upper half of ret; shift (h:l) up half a word. */
261 | 0 | ret = q << BN_BITS4; |
262 | 0 | h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; |
263 | 0 | l = (l & BN_MASK2l) << BN_BITS4; |
264 | 0 | } |
/* Second digit fills the lower half. */
265 | 0 | ret |= q; |
266 | 0 | return ret; |
267 | 0 | } |
268 | | #endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */ |
269 | | |
270 | | #ifdef BN_LLONG |
271 | | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
272 | | int n) |
273 | | { |
274 | | BN_ULLONG ll = 0; |
275 | | |
276 | | assert(n >= 0); |
277 | | if (n <= 0) |
278 | | return (BN_ULONG)0; |
279 | | |
280 | | #ifndef OPENSSL_SMALL_FOOTPRINT |
281 | | while (n & ~3) { |
282 | | ll += (BN_ULLONG)a[0] + b[0]; |
283 | | r[0] = (BN_ULONG)ll & BN_MASK2; |
284 | | ll >>= BN_BITS2; |
285 | | ll += (BN_ULLONG)a[1] + b[1]; |
286 | | r[1] = (BN_ULONG)ll & BN_MASK2; |
287 | | ll >>= BN_BITS2; |
288 | | ll += (BN_ULLONG)a[2] + b[2]; |
289 | | r[2] = (BN_ULONG)ll & BN_MASK2; |
290 | | ll >>= BN_BITS2; |
291 | | ll += (BN_ULLONG)a[3] + b[3]; |
292 | | r[3] = (BN_ULONG)ll & BN_MASK2; |
293 | | ll >>= BN_BITS2; |
294 | | a += 4; |
295 | | b += 4; |
296 | | r += 4; |
297 | | n -= 4; |
298 | | } |
299 | | #endif |
300 | | while (n) { |
301 | | ll += (BN_ULLONG)a[0] + b[0]; |
302 | | r[0] = (BN_ULONG)ll & BN_MASK2; |
303 | | ll >>= BN_BITS2; |
304 | | a++; |
305 | | b++; |
306 | | r++; |
307 | | n--; |
308 | | } |
309 | | return (BN_ULONG)ll; |
310 | | } |
311 | | #else /* !BN_LLONG */ |
312 | | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
313 | | int n) |
314 | 0 | { |
315 | 0 | BN_ULONG c, l, t; |
316 | |
|
317 | 0 | assert(n >= 0); |
318 | 0 | if (n <= 0) |
319 | 0 | return (BN_ULONG)0; |
320 | | |
321 | 0 | c = 0; |
322 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT |
323 | 0 | while (n & ~3) { |
324 | 0 | t = a[0]; |
325 | 0 | t = (t + c) & BN_MASK2; |
326 | 0 | c = (t < c); |
327 | 0 | l = (t + b[0]) & BN_MASK2; |
328 | 0 | c += (l < t); |
329 | 0 | r[0] = l; |
330 | 0 | t = a[1]; |
331 | 0 | t = (t + c) & BN_MASK2; |
332 | 0 | c = (t < c); |
333 | 0 | l = (t + b[1]) & BN_MASK2; |
334 | 0 | c += (l < t); |
335 | 0 | r[1] = l; |
336 | 0 | t = a[2]; |
337 | 0 | t = (t + c) & BN_MASK2; |
338 | 0 | c = (t < c); |
339 | 0 | l = (t + b[2]) & BN_MASK2; |
340 | 0 | c += (l < t); |
341 | 0 | r[2] = l; |
342 | 0 | t = a[3]; |
343 | 0 | t = (t + c) & BN_MASK2; |
344 | 0 | c = (t < c); |
345 | 0 | l = (t + b[3]) & BN_MASK2; |
346 | 0 | c += (l < t); |
347 | 0 | r[3] = l; |
348 | 0 | a += 4; |
349 | 0 | b += 4; |
350 | 0 | r += 4; |
351 | 0 | n -= 4; |
352 | 0 | } |
353 | 0 | #endif |
354 | 0 | while (n) { |
355 | 0 | t = a[0]; |
356 | 0 | t = (t + c) & BN_MASK2; |
357 | 0 | c = (t < c); |
358 | 0 | l = (t + b[0]) & BN_MASK2; |
359 | 0 | c += (l < t); |
360 | 0 | r[0] = l; |
361 | 0 | a++; |
362 | 0 | b++; |
363 | 0 | r++; |
364 | 0 | n--; |
365 | 0 | } |
366 | 0 | return (BN_ULONG)c; |
367 | 0 | } |
368 | | #endif /* !BN_LLONG */ |
369 | | |
370 | | BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
371 | | int n) |
372 | 0 | { |
373 | 0 | BN_ULONG t1, t2; |
374 | 0 | int c = 0; |
375 | |
|
376 | 0 | assert(n >= 0); |
377 | 0 | if (n <= 0) |
378 | 0 | return (BN_ULONG)0; |
379 | | |
380 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT |
381 | 0 | while (n & ~3) { |
382 | 0 | t1 = a[0]; |
383 | 0 | t2 = (t1 - c) & BN_MASK2; |
384 | 0 | c = (t2 > t1); |
385 | 0 | t1 = b[0]; |
386 | 0 | t1 = (t2 - t1) & BN_MASK2; |
387 | 0 | r[0] = t1; |
388 | 0 | c += (t1 > t2); |
389 | 0 | t1 = a[1]; |
390 | 0 | t2 = (t1 - c) & BN_MASK2; |
391 | 0 | c = (t2 > t1); |
392 | 0 | t1 = b[1]; |
393 | 0 | t1 = (t2 - t1) & BN_MASK2; |
394 | 0 | r[1] = t1; |
395 | 0 | c += (t1 > t2); |
396 | 0 | t1 = a[2]; |
397 | 0 | t2 = (t1 - c) & BN_MASK2; |
398 | 0 | c = (t2 > t1); |
399 | 0 | t1 = b[2]; |
400 | 0 | t1 = (t2 - t1) & BN_MASK2; |
401 | 0 | r[2] = t1; |
402 | 0 | c += (t1 > t2); |
403 | 0 | t1 = a[3]; |
404 | 0 | t2 = (t1 - c) & BN_MASK2; |
405 | 0 | c = (t2 > t1); |
406 | 0 | t1 = b[3]; |
407 | 0 | t1 = (t2 - t1) & BN_MASK2; |
408 | 0 | r[3] = t1; |
409 | 0 | c += (t1 > t2); |
410 | 0 | a += 4; |
411 | 0 | b += 4; |
412 | 0 | r += 4; |
413 | 0 | n -= 4; |
414 | 0 | } |
415 | 0 | #endif |
416 | 0 | while (n) { |
417 | 0 | t1 = a[0]; |
418 | 0 | t2 = (t1 - c) & BN_MASK2; |
419 | 0 | c = (t2 > t1); |
420 | 0 | t1 = b[0]; |
421 | 0 | t1 = (t2 - t1) & BN_MASK2; |
422 | 0 | r[0] = t1; |
423 | 0 | c += (t1 > t2); |
424 | 0 | a++; |
425 | 0 | b++; |
426 | 0 | r++; |
427 | 0 | n--; |
428 | 0 | } |
429 | 0 | return c; |
430 | 0 | } |
431 | | |
432 | | #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
433 | | |
434 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
435 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
436 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
437 | | /* |
438 | | * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
439 | | * c=(c2,c1,c0) |
440 | | */ |
441 | | |
442 | | #ifdef BN_LLONG |
443 | | /* |
444 | | * Keep in mind that additions to multiplication result can not |
445 | | * overflow, because its high half cannot be all-ones. |
446 | | */ |
/*
 * BN_LLONG variant of mul_add_c: the product a*b is formed in a BN_ULLONG,
 * c0 is added to it, and the result is split with Lw()/Hw() (defined
 * outside this file; presumably low/high word extractors).  The low part
 * replaces c0, the high part is added to c1, and c2 is bumped when that
 * addition wraps (c1 < hi).
 */
447 | | #define mul_add_c(a, b, c0, c1, c2) \ |
448 | | do { \ |
449 | | BN_ULONG hi; \ |
450 | | BN_ULLONG t = (BN_ULLONG)(a) * (b); \ |
451 | | t += c0; /* no carry */ \ |
452 | | c0 = (BN_ULONG)Lw(t); \ |
453 | | hi = (BN_ULONG)Hw(t); \ |
454 | | c1 = (c1 + hi) & BN_MASK2; \ |
455 | | c2 += (c1 < hi); \ |
456 | | } while (0) |
457 | | |
/*
 * mul_add_c2 accumulates the same product twice: the first pass works on a
 * copy (tt = t + c0), the second adds the updated c0 to t itself, so the
 * product is never doubled in a single word.
 */
458 | | #define mul_add_c2(a, b, c0, c1, c2) \ |
459 | | do { \ |
460 | | BN_ULONG hi; \ |
461 | | BN_ULLONG t = (BN_ULLONG)(a) * (b); \ |
462 | | BN_ULLONG tt = t + c0; /* no carry */ \ |
463 | | c0 = (BN_ULONG)Lw(tt); \ |
464 | | hi = (BN_ULONG)Hw(tt); \ |
465 | | c1 = (c1 + hi) & BN_MASK2; \ |
466 | | c2 += (c1 < hi); \ |
467 | | t += c0; /* no carry */ \ |
468 | | c0 = (BN_ULONG)Lw(t); \ |
469 | | hi = (BN_ULONG)Hw(t); \ |
470 | | c1 = (c1 + hi) & BN_MASK2; \ |
471 | | c2 += (c1 < hi); \ |
472 | | } while (0) |
473 | | |
/* sqr_add_c is mul_add_c with both factors taken from a[i]. */
474 | | #define sqr_add_c(a, i, c0, c1, c2) \ |
475 | | do { \ |
476 | | BN_ULONG hi; \ |
477 | | BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \ |
478 | | t += c0; /* no carry */ \ |
479 | | c0 = (BN_ULONG)Lw(t); \ |
480 | | hi = (BN_ULONG)Hw(t); \ |
481 | | c1 = (c1 + hi) & BN_MASK2; \ |
482 | | c2 += (c1 < hi); \ |
483 | | } while (0) |
484 | | |
/* sqr_add_c2 forwards to mul_add_c2 with the factors a[i] and a[j]. */
485 | | #define sqr_add_c2(a, i, j, c0, c1, c2) \ |
486 | | mul_add_c2((a)[i], (a)[j], c0, c1, c2) |
487 | | |
488 | | #elif defined(BN_UMULT_LOHI) |
489 | | /* |
490 | | * Keep in mind that additions to hi can not overflow, because |
491 | | * the high word of a multiplication result cannot be all-ones. |
492 | | */ |
/*
 * BN_UMULT_LOHI variant of mul_add_c: BN_UMULT_LOHI() (defined outside this
 * file) supplies the lo/hi halves of ta*tb.  lo is added to c0, a wrap of
 * that addition (c0 < lo) is folded into hi, hi is added to c1, and a wrap
 * of that addition bumps c2.
 */
493 | | #define mul_add_c(a, b, c0, c1, c2) \ |
494 | | do { \ |
495 | | BN_ULONG ta = (a), tb = (b); \ |
496 | | BN_ULONG lo, hi; \ |
497 | | BN_UMULT_LOHI(lo, hi, ta, tb); \ |
498 | | c0 += lo; \ |
499 | | hi += (c0 < lo); \ |
500 | | c1 += hi; \ |
501 | | c2 += (c1 < hi); \ |
502 | | } while (0) |
503 | | |
/*
 * mul_add_c2 adds the same lo/hi pair twice.  The first pass uses tt as a
 * scratch copy of hi so that hi itself is still unmodified for the second.
 */
504 | | #define mul_add_c2(a, b, c0, c1, c2) \ |
505 | | do { \ |
506 | | BN_ULONG ta = (a), tb = (b); \ |
507 | | BN_ULONG lo, hi, tt; \ |
508 | | BN_UMULT_LOHI(lo, hi, ta, tb); \ |
509 | | c0 += lo; \ |
510 | | tt = hi + (c0 < lo); \ |
511 | | c1 += tt; \ |
512 | | c2 += (c1 < tt); \ |
513 | | c0 += lo; \ |
514 | | hi += (c0 < lo); \ |
515 | | c1 += hi; \ |
516 | | c2 += (c1 < hi); \ |
517 | | } while (0) |
518 | | |
/* sqr_add_c is mul_add_c with both factors equal to a[i]. */
519 | | #define sqr_add_c(a, i, c0, c1, c2) \ |
520 | | do { \ |
521 | | BN_ULONG ta = (a)[i]; \ |
522 | | BN_ULONG lo, hi; \ |
523 | | BN_UMULT_LOHI(lo, hi, ta, ta); \ |
524 | | c0 += lo; \ |
525 | | hi += (c0 < lo); \ |
526 | | c1 += hi; \ |
527 | | c2 += (c1 < hi); \ |
528 | | } while (0) |
529 | | |
/* sqr_add_c2 forwards to mul_add_c2 with the factors a[i] and a[j]. */
530 | | #define sqr_add_c2(a, i, j, c0, c1, c2) \ |
531 | | mul_add_c2((a)[i], (a)[j], c0, c1, c2) |
532 | | |
533 | | #elif defined(BN_UMULT_HIGH) |
534 | | /* |
535 | | * Keep in mind that additions to hi can not overflow, because |
536 | | * the high word of a multiplication result cannot be all-ones. |
537 | | */ |
/*
 * BN_UMULT_HIGH variant of mul_add_c: the low product word comes from the
 * plain multiplication ta * tb and the high word from BN_UMULT_HIGH()
 * (defined outside this file).  lo is added to c0, a wrap of that addition
 * is folded into hi, hi is added to c1, and a wrap there bumps c2.
 */
538 | | #define mul_add_c(a, b, c0, c1, c2) \ |
539 | | do { \ |
540 | | BN_ULONG ta = (a), tb = (b); \ |
541 | | BN_ULONG lo = ta * tb; \ |
542 | | BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \ |
543 | | c0 += lo; \ |
544 | | hi += (c0 < lo); \ |
545 | | c1 += hi; \ |
546 | | c2 += (c1 < hi); \ |
547 | | } while (0) |
548 | | |
/*
 * mul_add_c2 adds the same lo/hi pair twice.  The first pass uses tt as a
 * scratch copy of hi so that hi itself is still unmodified for the second.
 */
549 | | #define mul_add_c2(a, b, c0, c1, c2) \ |
550 | | do { \ |
551 | | BN_ULONG ta = (a), tb = (b), tt; \ |
552 | | BN_ULONG lo = ta * tb; \ |
553 | | BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \ |
554 | | c0 += lo; \ |
555 | | tt = hi + (c0 < lo); \ |
556 | | c1 += tt; \ |
557 | | c2 += (c1 < tt); \ |
558 | | c0 += lo; \ |
559 | | hi += (c0 < lo); \ |
560 | | c1 += hi; \ |
561 | | c2 += (c1 < hi); \ |
562 | | } while (0) |
563 | | |
/* sqr_add_c is mul_add_c with both factors equal to a[i]. */
564 | | #define sqr_add_c(a, i, c0, c1, c2) \ |
565 | | do { \ |
566 | | BN_ULONG ta = (a)[i]; \ |
567 | | BN_ULONG lo = ta * ta; \ |
568 | | BN_ULONG hi = BN_UMULT_HIGH(ta, ta); \ |
569 | | c0 += lo; \ |
570 | | hi += (c0 < lo); \ |
571 | | c1 += hi; \ |
572 | | c2 += (c1 < hi); \ |
573 | | } while (0) |
574 | | |
/* sqr_add_c2 forwards to mul_add_c2 with the factors a[i] and a[j]. */
575 | | #define sqr_add_c2(a, i, j, c0, c1, c2) \ |
576 | | mul_add_c2((a)[i], (a)[j], c0, c1, c2) |
577 | | |
578 | | #else /* !BN_LLONG */ |
579 | | /* |
580 | | * Keep in mind that additions to hi can not overflow, because |
581 | | * the high word of a multiplication result cannot be all-ones. |
582 | | */ |
/*
 * Half-word variant of mul_add_c: a and b are split with LBITS()/HBITS()
 * and passed to mul64() (all defined outside this file), which leaves the
 * product in lo/hi.  lo is added to c0, a wrap of that addition is folded
 * into hi, hi is added to c1, and a wrap there bumps c2.  Sums are masked
 * with BN_MASK2.
 */
583 | | #define mul_add_c(a, b, c0, c1, c2) \ |
584 | 0 | do { \ |
585 | 0 | BN_ULONG lo = LBITS(a), hi = HBITS(a); \ |
586 | 0 | BN_ULONG bl = LBITS(b), bh = HBITS(b); \ |
587 | 0 | mul64(lo, hi, bl, bh); \ |
588 | 0 | c0 = (c0 + lo) & BN_MASK2; \ |
589 | 0 | hi += (c0 < lo); \ |
590 | 0 | c1 = (c1 + hi) & BN_MASK2; \ |
591 | 0 | c2 += (c1 < hi); \ |
592 | 0 | } while (0) |
593 | | |
/*
 * mul_add_c2 adds the same lo/hi pair twice.  The first pass uses tt as a
 * scratch copy of hi so that hi itself is still unmodified for the second.
 */
594 | | #define mul_add_c2(a, b, c0, c1, c2) \ |
595 | 0 | do { \ |
596 | 0 | BN_ULONG tt; \ |
597 | 0 | BN_ULONG lo = LBITS(a), hi = HBITS(a); \ |
598 | 0 | BN_ULONG bl = LBITS(b), bh = HBITS(b); \ |
599 | 0 | mul64(lo, hi, bl, bh); \ |
600 | 0 | tt = hi; \ |
601 | 0 | c0 = (c0 + lo) & BN_MASK2; \ |
602 | 0 | tt += (c0 < lo); \ |
603 | 0 | c1 = (c1 + tt) & BN_MASK2; \ |
604 | 0 | c2 += (c1 < tt); \ |
605 | 0 | c0 = (c0 + lo) & BN_MASK2; \ |
606 | 0 | hi += (c0 < lo); \ |
607 | 0 | c1 = (c1 + hi) & BN_MASK2; \ |
608 | 0 | c2 += (c1 < hi); \ |
609 | 0 | } while (0) |
610 | | |
/* sqr_add_c obtains lo/hi for a[i] from sqr64() and accumulates them once. */
611 | | #define sqr_add_c(a, i, c0, c1, c2) \ |
612 | 0 | do { \ |
613 | 0 | BN_ULONG lo, hi; \ |
614 | 0 | sqr64(lo, hi, (a)[i]); \ |
615 | 0 | c0 = (c0 + lo) & BN_MASK2; \ |
616 | 0 | hi += (c0 < lo); \ |
617 | 0 | c1 = (c1 + hi) & BN_MASK2; \ |
618 | 0 | c2 += (c1 < hi); \ |
619 | 0 | } while (0) |
620 | | |
/* sqr_add_c2 forwards to mul_add_c2 with the factors a[i] and a[j]. */
621 | | #define sqr_add_c2(a, i, j, c0, c1, c2) \ |
622 | 0 | mul_add_c2((a)[i], (a)[j], c0, c1, c2) |
623 | | #endif /* !BN_LLONG */ |
624 | | |
/*
 * Fully unrolled 8x8-word multiplication: reads a[0..7] and b[0..7] and
 * writes the 16-word product to r[0..15], least significant word first.
 *
 * The result is produced one output word ("column") at a time.  For column
 * k, every a[i] * b[j] with i + j == k is added into a three-word
 * accumulator via mul_add_c().  The accumulator's lowest word is then
 * stored to r[k] and cleared, and the roles of c1/c2/c3 rotate by one so
 * that the cleared variable becomes the new top word for the next column.
 */
625 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
626 | 0 | { |
627 | 0 | BN_ULONG c1, c2, c3; |
628 | |

629 | 0 | c1 = 0; |
630 | 0 | c2 = 0; |
631 | 0 | c3 = 0; |
/* column 0: accumulator is (c3,c2,c1) */
632 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
633 | 0 | r[0] = c1; |
634 | 0 | c1 = 0; |
/* column 1: accumulator is (c1,c3,c2) */
635 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
636 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
637 | 0 | r[1] = c2; |
638 | 0 | c2 = 0; |
/* column 2: accumulator is (c2,c1,c3); the rotation repeats from here on */
639 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
640 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
641 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
642 | 0 | r[2] = c3; |
643 | 0 | c3 = 0; |
644 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
645 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
646 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
647 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
648 | 0 | r[3] = c1; |
649 | 0 | c1 = 0; |
650 | 0 | mul_add_c(a[4], b[0], c2, c3, c1); |
651 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
652 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
653 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
654 | 0 | mul_add_c(a[0], b[4], c2, c3, c1); |
655 | 0 | r[4] = c2; |
656 | 0 | c2 = 0; |
657 | 0 | mul_add_c(a[0], b[5], c3, c1, c2); |
658 | 0 | mul_add_c(a[1], b[4], c3, c1, c2); |
659 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
660 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
661 | 0 | mul_add_c(a[4], b[1], c3, c1, c2); |
662 | 0 | mul_add_c(a[5], b[0], c3, c1, c2); |
663 | 0 | r[5] = c3; |
664 | 0 | c3 = 0; |
665 | 0 | mul_add_c(a[6], b[0], c1, c2, c3); |
666 | 0 | mul_add_c(a[5], b[1], c1, c2, c3); |
667 | 0 | mul_add_c(a[4], b[2], c1, c2, c3); |
668 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
669 | 0 | mul_add_c(a[2], b[4], c1, c2, c3); |
670 | 0 | mul_add_c(a[1], b[5], c1, c2, c3); |
671 | 0 | mul_add_c(a[0], b[6], c1, c2, c3); |
672 | 0 | r[6] = c1; |
673 | 0 | c1 = 0; |
/* column 7 is the widest: all eight cross products */
674 | 0 | mul_add_c(a[0], b[7], c2, c3, c1); |
675 | 0 | mul_add_c(a[1], b[6], c2, c3, c1); |
676 | 0 | mul_add_c(a[2], b[5], c2, c3, c1); |
677 | 0 | mul_add_c(a[3], b[4], c2, c3, c1); |
678 | 0 | mul_add_c(a[4], b[3], c2, c3, c1); |
679 | 0 | mul_add_c(a[5], b[2], c2, c3, c1); |
680 | 0 | mul_add_c(a[6], b[1], c2, c3, c1); |
681 | 0 | mul_add_c(a[7], b[0], c2, c3, c1); |
682 | 0 | r[7] = c2; |
683 | 0 | c2 = 0; |
684 | 0 | mul_add_c(a[7], b[1], c3, c1, c2); |
685 | 0 | mul_add_c(a[6], b[2], c3, c1, c2); |
686 | 0 | mul_add_c(a[5], b[3], c3, c1, c2); |
687 | 0 | mul_add_c(a[4], b[4], c3, c1, c2); |
688 | 0 | mul_add_c(a[3], b[5], c3, c1, c2); |
689 | 0 | mul_add_c(a[2], b[6], c3, c1, c2); |
690 | 0 | mul_add_c(a[1], b[7], c3, c1, c2); |
691 | 0 | r[8] = c3; |
692 | 0 | c3 = 0; |
693 | 0 | mul_add_c(a[2], b[7], c1, c2, c3); |
694 | 0 | mul_add_c(a[3], b[6], c1, c2, c3); |
695 | 0 | mul_add_c(a[4], b[5], c1, c2, c3); |
696 | 0 | mul_add_c(a[5], b[4], c1, c2, c3); |
697 | 0 | mul_add_c(a[6], b[3], c1, c2, c3); |
698 | 0 | mul_add_c(a[7], b[2], c1, c2, c3); |
699 | 0 | r[9] = c1; |
700 | 0 | c1 = 0; |
701 | 0 | mul_add_c(a[7], b[3], c2, c3, c1); |
702 | 0 | mul_add_c(a[6], b[4], c2, c3, c1); |
703 | 0 | mul_add_c(a[5], b[5], c2, c3, c1); |
704 | 0 | mul_add_c(a[4], b[6], c2, c3, c1); |
705 | 0 | mul_add_c(a[3], b[7], c2, c3, c1); |
706 | 0 | r[10] = c2; |
707 | 0 | c2 = 0; |
708 | 0 | mul_add_c(a[4], b[7], c3, c1, c2); |
709 | 0 | mul_add_c(a[5], b[6], c3, c1, c2); |
710 | 0 | mul_add_c(a[6], b[5], c3, c1, c2); |
711 | 0 | mul_add_c(a[7], b[4], c3, c1, c2); |
712 | 0 | r[11] = c3; |
713 | 0 | c3 = 0; |
714 | 0 | mul_add_c(a[7], b[5], c1, c2, c3); |
715 | 0 | mul_add_c(a[6], b[6], c1, c2, c3); |
716 | 0 | mul_add_c(a[5], b[7], c1, c2, c3); |
717 | 0 | r[12] = c1; |
718 | 0 | c1 = 0; |
719 | 0 | mul_add_c(a[6], b[7], c2, c3, c1); |
720 | 0 | mul_add_c(a[7], b[6], c2, c3, c1); |
721 | 0 | r[13] = c2; |
722 | 0 | c2 = 0; |
/* last column: its low word is r[14], the remaining middle word is r[15] */
723 | 0 | mul_add_c(a[7], b[7], c3, c1, c2); |
724 | 0 | r[14] = c3; |
725 | 0 | r[15] = c1; |
726 | 0 | } |
727 | | |
/*
 * Fully unrolled 4x4-word multiplication: reads a[0..3] and b[0..3] and
 * writes the 8-word product to r[0..7], least significant word first.
 *
 * For output word k, every a[i] * b[j] with i + j == k is added into a
 * three-word accumulator via mul_add_c().  The accumulator's lowest word is
 * stored to r[k] and cleared, and the roles of c1/c2/c3 rotate by one.
 */
728 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
729 | 0 | { |
730 | 0 | BN_ULONG c1, c2, c3; |
731 | |

732 | 0 | c1 = 0; |
733 | 0 | c2 = 0; |
734 | 0 | c3 = 0; |
/* column 0 */
735 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
736 | 0 | r[0] = c1; |
737 | 0 | c1 = 0; |
/* column 1 */
738 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
739 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
740 | 0 | r[1] = c2; |
741 | 0 | c2 = 0; |
/* column 2 */
742 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
743 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
744 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
745 | 0 | r[2] = c3; |
746 | 0 | c3 = 0; |
/* column 3 (widest: all four cross products) */
747 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
748 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
749 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
750 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
751 | 0 | r[3] = c1; |
752 | 0 | c1 = 0; |
/* column 4 */
753 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
754 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
755 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
756 | 0 | r[4] = c2; |
757 | 0 | c2 = 0; |
/* column 5 */
758 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
759 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
760 | 0 | r[5] = c3; |
761 | 0 | c3 = 0; |
/* column 6; the remaining middle word becomes r[7] */
762 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
763 | 0 | r[6] = c1; |
764 | 0 | r[7] = c2; |
765 | 0 | } |
766 | | |
/*
 * Fully unrolled 8-word squaring: reads a[0..7] and writes the 16-word
 * square to r[0..15], least significant word first.
 *
 * Works column by column like bn_mul_comba8, but uses the symmetry of a
 * square.  For output word k, each pair i > j with i + j == k is added once
 * through sqr_add_c2() (which accumulates 2*a[i]*a[j]), and even columns
 * additionally get the diagonal term a[k/2]^2 through sqr_add_c().  After
 * each column the lowest accumulator word is stored to r[k] and cleared,
 * and the roles of c1/c2/c3 rotate by one.
 */
767 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
768 | 0 | { |
769 | 0 | BN_ULONG c1, c2, c3; |
770 | |

771 | 0 | c1 = 0; |
772 | 0 | c2 = 0; |
773 | 0 | c3 = 0; |
/* column 0: diagonal term only */
774 | 0 | sqr_add_c(a, 0, c1, c2, c3); |
775 | 0 | r[0] = c1; |
776 | 0 | c1 = 0; |
/* column 1: one doubled cross term */
777 | 0 | sqr_add_c2(a, 1, 0, c2, c3, c1); |
778 | 0 | r[1] = c2; |
779 | 0 | c2 = 0; |
/* column 2: diagonal plus one doubled cross term; pattern continues */
780 | 0 | sqr_add_c(a, 1, c3, c1, c2); |
781 | 0 | sqr_add_c2(a, 2, 0, c3, c1, c2); |
782 | 0 | r[2] = c3; |
783 | 0 | c3 = 0; |
784 | 0 | sqr_add_c2(a, 3, 0, c1, c2, c3); |
785 | 0 | sqr_add_c2(a, 2, 1, c1, c2, c3); |
786 | 0 | r[3] = c1; |
787 | 0 | c1 = 0; |
788 | 0 | sqr_add_c(a, 2, c2, c3, c1); |
789 | 0 | sqr_add_c2(a, 3, 1, c2, c3, c1); |
790 | 0 | sqr_add_c2(a, 4, 0, c2, c3, c1); |
791 | 0 | r[4] = c2; |
792 | 0 | c2 = 0; |
793 | 0 | sqr_add_c2(a, 5, 0, c3, c1, c2); |
794 | 0 | sqr_add_c2(a, 4, 1, c3, c1, c2); |
795 | 0 | sqr_add_c2(a, 3, 2, c3, c1, c2); |
796 | 0 | r[5] = c3; |
797 | 0 | c3 = 0; |
798 | 0 | sqr_add_c(a, 3, c1, c2, c3); |
799 | 0 | sqr_add_c2(a, 4, 2, c1, c2, c3); |
800 | 0 | sqr_add_c2(a, 5, 1, c1, c2, c3); |
801 | 0 | sqr_add_c2(a, 6, 0, c1, c2, c3); |
802 | 0 | r[6] = c1; |
803 | 0 | c1 = 0; |
804 | 0 | sqr_add_c2(a, 7, 0, c2, c3, c1); |
805 | 0 | sqr_add_c2(a, 6, 1, c2, c3, c1); |
806 | 0 | sqr_add_c2(a, 5, 2, c2, c3, c1); |
807 | 0 | sqr_add_c2(a, 4, 3, c2, c3, c1); |
808 | 0 | r[7] = c2; |
809 | 0 | c2 = 0; |
810 | 0 | sqr_add_c(a, 4, c3, c1, c2); |
811 | 0 | sqr_add_c2(a, 5, 3, c3, c1, c2); |
812 | 0 | sqr_add_c2(a, 6, 2, c3, c1, c2); |
813 | 0 | sqr_add_c2(a, 7, 1, c3, c1, c2); |
814 | 0 | r[8] = c3; |
815 | 0 | c3 = 0; |
816 | 0 | sqr_add_c2(a, 7, 2, c1, c2, c3); |
817 | 0 | sqr_add_c2(a, 6, 3, c1, c2, c3); |
818 | 0 | sqr_add_c2(a, 5, 4, c1, c2, c3); |
819 | 0 | r[9] = c1; |
820 | 0 | c1 = 0; |
821 | 0 | sqr_add_c(a, 5, c2, c3, c1); |
822 | 0 | sqr_add_c2(a, 6, 4, c2, c3, c1); |
823 | 0 | sqr_add_c2(a, 7, 3, c2, c3, c1); |
824 | 0 | r[10] = c2; |
825 | 0 | c2 = 0; |
826 | 0 | sqr_add_c2(a, 7, 4, c3, c1, c2); |
827 | 0 | sqr_add_c2(a, 6, 5, c3, c1, c2); |
828 | 0 | r[11] = c3; |
829 | 0 | c3 = 0; |
830 | 0 | sqr_add_c(a, 6, c1, c2, c3); |
831 | 0 | sqr_add_c2(a, 7, 5, c1, c2, c3); |
832 | 0 | r[12] = c1; |
833 | 0 | c1 = 0; |
834 | 0 | sqr_add_c2(a, 7, 6, c2, c3, c1); |
835 | 0 | r[13] = c2; |
836 | 0 | c2 = 0; |
/* last column: its low word is r[14], the remaining middle word is r[15] */
837 | 0 | sqr_add_c(a, 7, c3, c1, c2); |
838 | 0 | r[14] = c3; |
839 | 0 | r[15] = c1; |
840 | 0 | } |
841 | | |
/*
 * Fully unrolled 4-word squaring: reads a[0..3] and writes the 8-word
 * square to r[0..7], least significant word first.
 *
 * For output word k, each pair i > j with i + j == k is added once through
 * sqr_add_c2() (which accumulates 2*a[i]*a[j]); even columns additionally
 * get the diagonal term a[k/2]^2 through sqr_add_c().  After each column
 * the lowest accumulator word is stored to r[k] and cleared, and the roles
 * of c1/c2/c3 rotate by one.
 */
842 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
843 | 0 | { |
844 | 0 | BN_ULONG c1, c2, c3; |
845 | |

846 | 0 | c1 = 0; |
847 | 0 | c2 = 0; |
848 | 0 | c3 = 0; |
/* column 0 */
849 | 0 | sqr_add_c(a, 0, c1, c2, c3); |
850 | 0 | r[0] = c1; |
851 | 0 | c1 = 0; |
/* column 1 */
852 | 0 | sqr_add_c2(a, 1, 0, c2, c3, c1); |
853 | 0 | r[1] = c2; |
854 | 0 | c2 = 0; |
/* column 2 */
855 | 0 | sqr_add_c(a, 1, c3, c1, c2); |
856 | 0 | sqr_add_c2(a, 2, 0, c3, c1, c2); |
857 | 0 | r[2] = c3; |
858 | 0 | c3 = 0; |
/* column 3 */
859 | 0 | sqr_add_c2(a, 3, 0, c1, c2, c3); |
860 | 0 | sqr_add_c2(a, 2, 1, c1, c2, c3); |
861 | 0 | r[3] = c1; |
862 | 0 | c1 = 0; |
/* column 4 */
863 | 0 | sqr_add_c(a, 2, c2, c3, c1); |
864 | 0 | sqr_add_c2(a, 3, 1, c2, c3, c1); |
865 | 0 | r[4] = c2; |
866 | 0 | c2 = 0; |
/* column 5 */
867 | 0 | sqr_add_c2(a, 3, 2, c3, c1, c2); |
868 | 0 | r[5] = c3; |
869 | 0 | c3 = 0; |
/* column 6; the remaining middle word becomes r[7] */
870 | 0 | sqr_add_c(a, 3, c1, c2, c3); |
871 | 0 | r[6] = c1; |
872 | 0 | r[7] = c2; |
873 | 0 | } |
874 | | |
875 | | #ifdef OPENSSL_NO_ASM |
876 | | #ifdef OPENSSL_BN_ASM_MONT |
877 | | #include <alloca.h> |
878 | | /* |
879 | | * This is essentially reference implementation, which may or may not |
880 | | * result in performance improvement. E.g. on IA-32 this routine was |
881 | | * observed to give 40% faster rsa1024 private key operations and 10% |
882 | | * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only |
883 | | * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a |
884 | | * reference implementation, one to be used as starting point for |
885 | | * platform-specific assembler. Mentioned numbers apply to compiler |
886 | | * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and |
887 | | * can vary not only from platform to platform, but even for compiler |
888 | | * versions. Assembler vs. assembler improvement coefficients can |
889 | | * [and are known to] differ and are to be documented elsewhere. |
890 | | */ |
/*
 * Word-serial multiply-and-reduce over num-word operands ap, bp and modulus
 * np, with the per-word reduction factor read from *n0p.  The result is
 * written to rp[0..num-1]; the function always returns 1.
 *
 * Scratch space of num + 2 words comes from alloca() and is zeroed through
 * the volatile alias vp on every return path.
 *
 * Control flow: the product ap * bp[0] is computed up front with mul(), and
 * the code then jumps to the "enter" label inside the main loop.  Because i
 * is initialised to 0 at its declaration, the loop's own multiply-add step
 * is therefore skipped for i == 0 only.
 *
 * NOTE(review): num is not validated here; num <= 0 would index tp[num - 1]
 * and np[num - 1] out of range -- callers are presumably required to pass
 * num >= 1.
 */
891 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
892 | | const BN_ULONG *np, const BN_ULONG *n0p, int num) |
893 | | { |
894 | | BN_ULONG c0, c1, ml, *tp, n0; |
895 | | #ifdef mul64 |
896 | | BN_ULONG mh; |
897 | | #endif |
898 | | volatile BN_ULONG *vp; |
899 | | int i = 0, j; |
900 | | |
901 | | #if 0 /* template for platform-specific \ |
902 | | * implementation */ |
903 | | if (ap == bp) |
904 | | return bn_sqr_mont(rp, ap, np, n0p, num); |
905 | | #endif |
906 | | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); |
907 | | |
908 | | n0 = *n0p; |
909 | | |
/* Initial pass: tp[0..num-1] = ap * bp[0], carry word left in c0. */
910 | | c0 = 0; |
911 | | ml = bp[0]; |
912 | | #ifdef mul64 |
913 | | mh = HBITS(ml); |
914 | | ml = LBITS(ml); |
915 | | for (j = 0; j < num; ++j) |
916 | | mul(tp[j], ap[j], ml, mh, c0); |
917 | | #else |
918 | | for (j = 0; j < num; ++j) |
919 | | mul(tp[j], ap[j], ml, c0); |
920 | | #endif |
921 | | |
922 | | tp[num] = c0; |
923 | | tp[num + 1] = 0; |
/* Skip the accumulate step of the first iteration; it was done above. */
924 | | goto enter; |
925 | | |
926 | | for (i = 0; i < num; i++) { |
/* Accumulate ap * bp[i] into tp, extending the carry into tp[num], tp[num+1]. */
927 | | c0 = 0; |
928 | | ml = bp[i]; |
929 | | #ifdef mul64 |
930 | | mh = HBITS(ml); |
931 | | ml = LBITS(ml); |
932 | | for (j = 0; j < num; ++j) |
933 | | mul_add(tp[j], ap[j], ml, mh, c0); |
934 | | #else |
935 | | for (j = 0; j < num; ++j) |
936 | | mul_add(tp[j], ap[j], ml, c0); |
937 | | #endif |
938 | | c1 = (tp[num] + c0) & BN_MASK2; |
939 | | tp[num] = c1; |
940 | | tp[num + 1] = (c1 < c0 ? 1 : 0); |
941 | | enter: |
/*
 * Reduction step: ml = tp[0] * n0, then add ml * np to tp.  The word
 * computed for position 0 is not stored; each later result word is written
 * one slot lower (tp[j - 1]), which shifts tp down by one word.
 */
942 | | c1 = tp[0]; |
943 | | ml = (c1 * n0) & BN_MASK2; |
944 | | c0 = 0; |
945 | | #ifdef mul64 |
946 | | mh = HBITS(ml); |
947 | | ml = LBITS(ml); |
948 | | mul_add(c1, np[0], ml, mh, c0); |
949 | | #else |
950 | | mul_add(c1, ml, np[0], c0); |
951 | | #endif |
952 | | for (j = 1; j < num; j++) { |
953 | | c1 = tp[j]; |
954 | | #ifdef mul64 |
955 | | mul_add(c1, np[j], ml, mh, c0); |
956 | | #else |
957 | | mul_add(c1, ml, np[j], c0); |
958 | | #endif |
959 | | tp[j - 1] = c1 & BN_MASK2; |
960 | | } |
/* Fold the final carry into the two top words, shifted down likewise. */
961 | | c1 = (tp[num] + c0) & BN_MASK2; |
962 | | tp[num - 1] = c1; |
963 | | tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); |
964 | | } |
965 | | |
/*
 * Final correction: when tp may be >= np (overflow word set, or top word
 * not smaller than np's), write tp - np to rp.  That value is kept if the
 * overflow word is set or the subtraction did not borrow; otherwise
 * execution falls through and rp is overwritten with tp.
 */
966 | | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { |
967 | | c0 = bn_sub_words(rp, tp, np, num); |
968 | | if (tp[num] != 0 || c0 == 0) { |
969 | | for (i = 0; i < num + 2; i++) |
970 | | vp[i] = 0; |
971 | | return 1; |
972 | | } |
973 | | } |
/* Copy the result out and wipe the scratch words as they are consumed. */
974 | | for (i = 0; i < num; i++) |
975 | | rp[i] = tp[i], vp[i] = 0; |
976 | | vp[num] = 0; |
977 | | vp[num + 1] = 0; |
978 | | return 1; |
979 | | } |
980 | | #else |
981 | | /* |
982 | | * Return value of 0 indicates that multiplication/convolution was not |
983 | | * performed to signal the caller to fall down to alternative/original |
984 | | * code-path. |
985 | | */ |
986 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
987 | | const BN_ULONG *np, const BN_ULONG *n0, int num) |
988 | 0 | { |
989 | 0 | return 0; |
990 | 0 | } |
991 | | #endif /* OPENSSL_BN_ASM_MONT */ |
992 | | #endif |
993 | | |
994 | | #else /* !BN_MUL_COMBA */ |
995 | | |
996 | | /* hmm... is it faster just to do a multiply? */ |
997 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
998 | | { |
999 | | BN_ULONG t[8]; |
1000 | | bn_sqr_normal(r, a, 4, t); |
1001 | | } |
1002 | | |
1003 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
1004 | | { |
1005 | | BN_ULONG t[16]; |
1006 | | bn_sqr_normal(r, a, 8, t); |
1007 | | } |
1008 | | |
1009 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
1010 | | { |
1011 | | r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); |
1012 | | r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); |
1013 | | r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); |
1014 | | r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); |
1015 | | } |
1016 | | |
1017 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
1018 | | { |
1019 | | r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); |
1020 | | r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); |
1021 | | r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); |
1022 | | r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); |
1023 | | r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); |
1024 | | r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); |
1025 | | r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); |
1026 | | r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); |
1027 | | } |
1028 | | |
1029 | | #ifdef OPENSSL_NO_ASM |
1030 | | #ifdef OPENSSL_BN_ASM_MONT |
1031 | | #include <alloca.h> |
1032 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
1033 | | const BN_ULONG *np, const BN_ULONG *n0p, int num) |
1034 | | { |
1035 | | BN_ULONG c0, c1, *tp, n0 = *n0p; |
1036 | | volatile BN_ULONG *vp; |
1037 | | int i = 0, j; |
1038 | | |
1039 | | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); |
1040 | | |
1041 | | for (i = 0; i <= num; i++) |
1042 | | tp[i] = 0; |
1043 | | |
1044 | | for (i = 0; i < num; i++) { |
1045 | | c0 = bn_mul_add_words(tp, ap, num, bp[i]); |
1046 | | c1 = (tp[num] + c0) & BN_MASK2; |
1047 | | tp[num] = c1; |
1048 | | tp[num + 1] = (c1 < c0 ? 1 : 0); |
1049 | | |
1050 | | c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); |
1051 | | c1 = (tp[num] + c0) & BN_MASK2; |
1052 | | tp[num] = c1; |
1053 | | tp[num + 1] += (c1 < c0 ? 1 : 0); |
1054 | | for (j = 0; j <= num; j++) |
1055 | | tp[j] = tp[j + 1]; |
1056 | | } |
1057 | | |
1058 | | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { |
1059 | | c0 = bn_sub_words(rp, tp, np, num); |
1060 | | if (tp[num] != 0 || c0 == 0) { |
1061 | | for (i = 0; i < num + 2; i++) |
1062 | | vp[i] = 0; |
1063 | | return 1; |
1064 | | } |
1065 | | } |
1066 | | for (i = 0; i < num; i++) |
1067 | | rp[i] = tp[i], vp[i] = 0; |
1068 | | vp[num] = 0; |
1069 | | vp[num + 1] = 0; |
1070 | | return 1; |
1071 | | } |
1072 | | #else |
1073 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
1074 | | const BN_ULONG *np, const BN_ULONG *n0, int num) |
1075 | | { |
1076 | | return 0; |
1077 | | } |
1078 | | #endif /* OPENSSL_BN_ASM_MONT */ |
1079 | | #endif |
1080 | | |
1081 | | #endif /* !BN_MUL_COMBA */ |