/src/irssi/subprojects/openssl-1.1.1l/crypto/bn/bn_asm.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the OpenSSL license (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include <assert.h> |
11 | | #include <openssl/crypto.h> |
12 | | #include "internal/cryptlib.h" |
13 | | #include "bn_local.h" |
14 | | |
15 | | #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) |
16 | | |
17 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
18 | | BN_ULONG w) |
19 | | { |
20 | | BN_ULONG c1 = 0; |
21 | | |
22 | | assert(num >= 0); |
23 | | if (num <= 0) |
24 | | return c1; |
25 | | |
26 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
27 | | while (num & ~3) { |
28 | | mul_add(rp[0], ap[0], w, c1); |
29 | | mul_add(rp[1], ap[1], w, c1); |
30 | | mul_add(rp[2], ap[2], w, c1); |
31 | | mul_add(rp[3], ap[3], w, c1); |
32 | | ap += 4; |
33 | | rp += 4; |
34 | | num -= 4; |
35 | | } |
36 | | # endif |
37 | | while (num) { |
38 | | mul_add(rp[0], ap[0], w, c1); |
39 | | ap++; |
40 | | rp++; |
41 | | num--; |
42 | | } |
43 | | |
44 | | return c1; |
45 | | } |
46 | | |
47 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
48 | | { |
49 | | BN_ULONG c1 = 0; |
50 | | |
51 | | assert(num >= 0); |
52 | | if (num <= 0) |
53 | | return c1; |
54 | | |
55 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
56 | | while (num & ~3) { |
57 | | mul(rp[0], ap[0], w, c1); |
58 | | mul(rp[1], ap[1], w, c1); |
59 | | mul(rp[2], ap[2], w, c1); |
60 | | mul(rp[3], ap[3], w, c1); |
61 | | ap += 4; |
62 | | rp += 4; |
63 | | num -= 4; |
64 | | } |
65 | | # endif |
66 | | while (num) { |
67 | | mul(rp[0], ap[0], w, c1); |
68 | | ap++; |
69 | | rp++; |
70 | | num--; |
71 | | } |
72 | | return c1; |
73 | | } |
74 | | |
75 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
76 | | { |
77 | | assert(n >= 0); |
78 | | if (n <= 0) |
79 | | return; |
80 | | |
81 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
82 | | while (n & ~3) { |
83 | | sqr(r[0], r[1], a[0]); |
84 | | sqr(r[2], r[3], a[1]); |
85 | | sqr(r[4], r[5], a[2]); |
86 | | sqr(r[6], r[7], a[3]); |
87 | | a += 4; |
88 | | r += 8; |
89 | | n -= 4; |
90 | | } |
91 | | # endif |
92 | | while (n) { |
93 | | sqr(r[0], r[1], a[0]); |
94 | | a++; |
95 | | r += 2; |
96 | | n--; |
97 | | } |
98 | | } |
99 | | |
100 | | #else /* !(defined(BN_LLONG) || |
101 | | * defined(BN_UMULT_HIGH)) */ |
102 | | |
103 | | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, |
104 | | BN_ULONG w) |
105 | 0 | { |
106 | 0 | BN_ULONG c = 0; |
107 | 0 | BN_ULONG bl, bh; |
108 | |
|
109 | 0 | assert(num >= 0); |
110 | 0 | if (num <= 0) |
111 | 0 | return (BN_ULONG)0; |
112 | | |
113 | 0 | bl = LBITS(w); |
114 | 0 | bh = HBITS(w); |
115 | |
|
116 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT |
117 | 0 | while (num & ~3) { |
118 | 0 | mul_add(rp[0], ap[0], bl, bh, c); |
119 | 0 | mul_add(rp[1], ap[1], bl, bh, c); |
120 | 0 | mul_add(rp[2], ap[2], bl, bh, c); |
121 | 0 | mul_add(rp[3], ap[3], bl, bh, c); |
122 | 0 | ap += 4; |
123 | 0 | rp += 4; |
124 | 0 | num -= 4; |
125 | 0 | } |
126 | 0 | # endif |
127 | 0 | while (num) { |
128 | 0 | mul_add(rp[0], ap[0], bl, bh, c); |
129 | 0 | ap++; |
130 | 0 | rp++; |
131 | 0 | num--; |
132 | 0 | } |
133 | 0 | return c; |
134 | 0 | } |
135 | | |
136 | | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) |
137 | 0 | { |
138 | 0 | BN_ULONG carry = 0; |
139 | 0 | BN_ULONG bl, bh; |
140 | |
|
141 | 0 | assert(num >= 0); |
142 | 0 | if (num <= 0) |
143 | 0 | return (BN_ULONG)0; |
144 | | |
145 | 0 | bl = LBITS(w); |
146 | 0 | bh = HBITS(w); |
147 | |
|
148 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT |
149 | 0 | while (num & ~3) { |
150 | 0 | mul(rp[0], ap[0], bl, bh, carry); |
151 | 0 | mul(rp[1], ap[1], bl, bh, carry); |
152 | 0 | mul(rp[2], ap[2], bl, bh, carry); |
153 | 0 | mul(rp[3], ap[3], bl, bh, carry); |
154 | 0 | ap += 4; |
155 | 0 | rp += 4; |
156 | 0 | num -= 4; |
157 | 0 | } |
158 | 0 | # endif |
159 | 0 | while (num) { |
160 | 0 | mul(rp[0], ap[0], bl, bh, carry); |
161 | 0 | ap++; |
162 | 0 | rp++; |
163 | 0 | num--; |
164 | 0 | } |
165 | 0 | return carry; |
166 | 0 | } |
167 | | |
168 | | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) |
169 | 0 | { |
170 | 0 | assert(n >= 0); |
171 | 0 | if (n <= 0) |
172 | 0 | return; |
173 | | |
174 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT |
175 | 0 | while (n & ~3) { |
176 | 0 | sqr64(r[0], r[1], a[0]); |
177 | 0 | sqr64(r[2], r[3], a[1]); |
178 | 0 | sqr64(r[4], r[5], a[2]); |
179 | 0 | sqr64(r[6], r[7], a[3]); |
180 | 0 | a += 4; |
181 | 0 | r += 8; |
182 | 0 | n -= 4; |
183 | 0 | } |
184 | 0 | # endif |
185 | 0 | while (n) { |
186 | 0 | sqr64(r[0], r[1], a[0]); |
187 | 0 | a++; |
188 | 0 | r += 2; |
189 | 0 | n--; |
190 | 0 | } |
191 | 0 | } |
192 | | |
193 | | #endif /* !(defined(BN_LLONG) || |
194 | | * defined(BN_UMULT_HIGH)) */ |
195 | | |
196 | | #if defined(BN_LLONG) && defined(BN_DIV2W) |
197 | | |
198 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
199 | | { |
200 | | return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); |
201 | | } |
202 | | |
203 | | #else |
204 | | |
205 | | /* Divide h,l by d and return the result. */ |
206 | | /* I need to test this some more :-( */ |
207 | | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) |
208 | 0 | { |
209 | 0 | BN_ULONG dh, dl, q, ret = 0, th, tl, t; |
210 | 0 | int i, count = 2; |
211 | |
|
212 | 0 | if (d == 0) |
213 | 0 | return BN_MASK2; |
214 | | |
215 | 0 | i = BN_num_bits_word(d); |
216 | 0 | assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); |
217 | | |
218 | 0 | i = BN_BITS2 - i; |
219 | 0 | if (h >= d) |
220 | 0 | h -= d; |
221 | |
|
222 | 0 | if (i) { |
223 | 0 | d <<= i; |
224 | 0 | h = (h << i) | (l >> (BN_BITS2 - i)); |
225 | 0 | l <<= i; |
226 | 0 | } |
227 | 0 | dh = (d & BN_MASK2h) >> BN_BITS4; |
228 | 0 | dl = (d & BN_MASK2l); |
229 | 0 | for (;;) { |
230 | 0 | if ((h >> BN_BITS4) == dh) |
231 | 0 | q = BN_MASK2l; |
232 | 0 | else |
233 | 0 | q = h / dh; |
234 | |
|
235 | 0 | th = q * dh; |
236 | 0 | tl = dl * q; |
237 | 0 | for (;;) { |
238 | 0 | t = h - th; |
239 | 0 | if ((t & BN_MASK2h) || |
240 | 0 | ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) |
241 | 0 | break; |
242 | 0 | q--; |
243 | 0 | th -= dh; |
244 | 0 | tl -= dl; |
245 | 0 | } |
246 | 0 | t = (tl >> BN_BITS4); |
247 | 0 | tl = (tl << BN_BITS4) & BN_MASK2h; |
248 | 0 | th += t; |
249 | |
|
250 | 0 | if (l < tl) |
251 | 0 | th++; |
252 | 0 | l -= tl; |
253 | 0 | if (h < th) { |
254 | 0 | h += d; |
255 | 0 | q--; |
256 | 0 | } |
257 | 0 | h -= th; |
258 | |
|
259 | 0 | if (--count == 0) |
260 | 0 | break; |
261 | | |
262 | 0 | ret = q << BN_BITS4; |
263 | 0 | h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; |
264 | 0 | l = (l & BN_MASK2l) << BN_BITS4; |
265 | 0 | } |
266 | 0 | ret |= q; |
267 | 0 | return ret; |
268 | 0 | } |
269 | | #endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */ |
270 | | |
271 | | #ifdef BN_LLONG |
272 | | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
273 | | int n) |
274 | | { |
275 | | BN_ULLONG ll = 0; |
276 | | |
277 | | assert(n >= 0); |
278 | | if (n <= 0) |
279 | | return (BN_ULONG)0; |
280 | | |
281 | | # ifndef OPENSSL_SMALL_FOOTPRINT |
282 | | while (n & ~3) { |
283 | | ll += (BN_ULLONG) a[0] + b[0]; |
284 | | r[0] = (BN_ULONG)ll & BN_MASK2; |
285 | | ll >>= BN_BITS2; |
286 | | ll += (BN_ULLONG) a[1] + b[1]; |
287 | | r[1] = (BN_ULONG)ll & BN_MASK2; |
288 | | ll >>= BN_BITS2; |
289 | | ll += (BN_ULLONG) a[2] + b[2]; |
290 | | r[2] = (BN_ULONG)ll & BN_MASK2; |
291 | | ll >>= BN_BITS2; |
292 | | ll += (BN_ULLONG) a[3] + b[3]; |
293 | | r[3] = (BN_ULONG)ll & BN_MASK2; |
294 | | ll >>= BN_BITS2; |
295 | | a += 4; |
296 | | b += 4; |
297 | | r += 4; |
298 | | n -= 4; |
299 | | } |
300 | | # endif |
301 | | while (n) { |
302 | | ll += (BN_ULLONG) a[0] + b[0]; |
303 | | r[0] = (BN_ULONG)ll & BN_MASK2; |
304 | | ll >>= BN_BITS2; |
305 | | a++; |
306 | | b++; |
307 | | r++; |
308 | | n--; |
309 | | } |
310 | | return (BN_ULONG)ll; |
311 | | } |
312 | | #else /* !BN_LLONG */ |
313 | | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
314 | | int n) |
315 | 0 | { |
316 | 0 | BN_ULONG c, l, t; |
317 | |
|
318 | 0 | assert(n >= 0); |
319 | 0 | if (n <= 0) |
320 | 0 | return (BN_ULONG)0; |
321 | | |
322 | 0 | c = 0; |
323 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT |
324 | 0 | while (n & ~3) { |
325 | 0 | t = a[0]; |
326 | 0 | t = (t + c) & BN_MASK2; |
327 | 0 | c = (t < c); |
328 | 0 | l = (t + b[0]) & BN_MASK2; |
329 | 0 | c += (l < t); |
330 | 0 | r[0] = l; |
331 | 0 | t = a[1]; |
332 | 0 | t = (t + c) & BN_MASK2; |
333 | 0 | c = (t < c); |
334 | 0 | l = (t + b[1]) & BN_MASK2; |
335 | 0 | c += (l < t); |
336 | 0 | r[1] = l; |
337 | 0 | t = a[2]; |
338 | 0 | t = (t + c) & BN_MASK2; |
339 | 0 | c = (t < c); |
340 | 0 | l = (t + b[2]) & BN_MASK2; |
341 | 0 | c += (l < t); |
342 | 0 | r[2] = l; |
343 | 0 | t = a[3]; |
344 | 0 | t = (t + c) & BN_MASK2; |
345 | 0 | c = (t < c); |
346 | 0 | l = (t + b[3]) & BN_MASK2; |
347 | 0 | c += (l < t); |
348 | 0 | r[3] = l; |
349 | 0 | a += 4; |
350 | 0 | b += 4; |
351 | 0 | r += 4; |
352 | 0 | n -= 4; |
353 | 0 | } |
354 | 0 | # endif |
355 | 0 | while (n) { |
356 | 0 | t = a[0]; |
357 | 0 | t = (t + c) & BN_MASK2; |
358 | 0 | c = (t < c); |
359 | 0 | l = (t + b[0]) & BN_MASK2; |
360 | 0 | c += (l < t); |
361 | 0 | r[0] = l; |
362 | 0 | a++; |
363 | 0 | b++; |
364 | 0 | r++; |
365 | 0 | n--; |
366 | 0 | } |
367 | 0 | return (BN_ULONG)c; |
368 | 0 | } |
369 | | #endif /* !BN_LLONG */ |
370 | | |
371 | | BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, |
372 | | int n) |
373 | 0 | { |
374 | 0 | BN_ULONG t1, t2; |
375 | 0 | int c = 0; |
376 | |
|
377 | 0 | assert(n >= 0); |
378 | 0 | if (n <= 0) |
379 | 0 | return (BN_ULONG)0; |
380 | | |
381 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT |
382 | 0 | while (n & ~3) { |
383 | 0 | t1 = a[0]; |
384 | 0 | t2 = b[0]; |
385 | 0 | r[0] = (t1 - t2 - c) & BN_MASK2; |
386 | 0 | if (t1 != t2) |
387 | 0 | c = (t1 < t2); |
388 | 0 | t1 = a[1]; |
389 | 0 | t2 = b[1]; |
390 | 0 | r[1] = (t1 - t2 - c) & BN_MASK2; |
391 | 0 | if (t1 != t2) |
392 | 0 | c = (t1 < t2); |
393 | 0 | t1 = a[2]; |
394 | 0 | t2 = b[2]; |
395 | 0 | r[2] = (t1 - t2 - c) & BN_MASK2; |
396 | 0 | if (t1 != t2) |
397 | 0 | c = (t1 < t2); |
398 | 0 | t1 = a[3]; |
399 | 0 | t2 = b[3]; |
400 | 0 | r[3] = (t1 - t2 - c) & BN_MASK2; |
401 | 0 | if (t1 != t2) |
402 | 0 | c = (t1 < t2); |
403 | 0 | a += 4; |
404 | 0 | b += 4; |
405 | 0 | r += 4; |
406 | 0 | n -= 4; |
407 | 0 | } |
408 | 0 | #endif |
409 | 0 | while (n) { |
410 | 0 | t1 = a[0]; |
411 | 0 | t2 = b[0]; |
412 | 0 | r[0] = (t1 - t2 - c) & BN_MASK2; |
413 | 0 | if (t1 != t2) |
414 | 0 | c = (t1 < t2); |
415 | 0 | a++; |
416 | 0 | b++; |
417 | 0 | r++; |
418 | 0 | n--; |
419 | 0 | } |
420 | 0 | return c; |
421 | 0 | } |
422 | | |
423 | | #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) |
424 | | |
425 | | # undef bn_mul_comba8 |
426 | | # undef bn_mul_comba4 |
427 | | # undef bn_sqr_comba8 |
428 | | # undef bn_sqr_comba4 |
429 | | |
430 | | /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ |
431 | | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ |
432 | | /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ |
433 | | /* |
434 | | * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number |
435 | | * c=(c2,c1,c0) |
436 | | */ |
437 | | |
438 | | # ifdef BN_LLONG |
/*
 * BN_LLONG flavour of the comba accumulation helpers: the product is formed
 * in a double-width BN_ULLONG and split with Lw()/Hw().
 *
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 *
 * c0, c1 and c2 must be modifiable lvalues; together they form the
 * three-word accumulator (c2,c1,c0).
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/*
 * (c2,c1,c0) += a[i]^2
 *
 * The array argument is parenthesized before indexing, as in the other
 * flavours of this macro, so that an expression argument is indexed as a
 * whole.
 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
476 | | |
477 | | # elif defined(BN_UMULT_LOHI) |
/*
 * BN_UMULT_LOHI flavour of the comba accumulation helpers: both halves of
 * the product come from a single BN_UMULT_LOHI() invocation.
 *
 * Adding a carry to the high product word cannot overflow, because the
 * high word of a multiplication result cannot be all-ones.
 *
 * c0, c1 and c2 must be modifiable lvalues; together they form the
 * three-word accumulator (c2,c1,c0).
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {            \
        BN_ULONG f0 = (a), f1 = (b);                    \
        BN_ULONG p_lo, p_hi;                            \
        BN_UMULT_LOHI(p_lo, p_hi, f0, f1);              \
        c0 += p_lo;                                     \
        p_hi += (c0 < p_lo) ? 1 : 0;                    \
        c1 += p_hi;                                     \
        c2 += (c1 < p_hi) ? 1 : 0;                      \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {            \
        BN_ULONG f0 = (a), f1 = (b);                    \
        BN_ULONG p_lo, p_hi, first_hi;                  \
        BN_UMULT_LOHI(p_lo, p_hi, f0, f1);              \
        c0 += p_lo;                                     \
        first_hi = p_hi + ((c0 < p_lo) ? 1 : 0);        \
        c1 += first_hi;                                 \
        c2 += (c1 < first_hi) ? 1 : 0;                  \
        c0 += p_lo;                                     \
        p_hi += (c0 < p_lo) ? 1 : 0;                    \
        c1 += p_hi;                                     \
        c2 += (c1 < p_hi) ? 1 : 0;                      \
        } while(0)

/* (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {            \
        BN_ULONG f0 = (a)[i];                           \
        BN_ULONG p_lo, p_hi;                            \
        BN_UMULT_LOHI(p_lo, p_hi, f0, f0);              \
        c0 += p_lo;                                     \
        p_hi += (c0 < p_lo) ? 1 : 0;                    \
        c1 += p_hi;                                     \
        c2 += (c1 < p_hi) ? 1 : 0;                      \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
510 | | |
511 | | # elif defined(BN_UMULT_HIGH) |
/*
 * BN_UMULT_HIGH flavour of the comba accumulation helpers: the low product
 * word is an ordinary multiplication, the high word comes from
 * BN_UMULT_HIGH().
 *
 * Adding a carry to the high product word cannot overflow, because the
 * high word of a multiplication result cannot be all-ones.
 *
 * c0, c1 and c2 must be modifiable lvalues; together they form the
 * three-word accumulator (c2,c1,c0).
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {            \
        BN_ULONG f0 = (a), f1 = (b);                    \
        BN_ULONG p_lo = f0 * f1;                        \
        BN_ULONG p_hi = BN_UMULT_HIGH(f0, f1);          \
        c0 += p_lo;                                     \
        p_hi += (c0 < p_lo) ? 1 : 0;                    \
        c1 += p_hi;                                     \
        c2 += (c1 < p_hi) ? 1 : 0;                      \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {            \
        BN_ULONG f0 = (a), f1 = (b), first_hi;          \
        BN_ULONG p_lo = f0 * f1;                        \
        BN_ULONG p_hi = BN_UMULT_HIGH(f0, f1);          \
        c0 += p_lo;                                     \
        first_hi = p_hi + ((c0 < p_lo) ? 1 : 0);        \
        c1 += first_hi;                                 \
        c2 += (c1 < first_hi) ? 1 : 0;                  \
        c0 += p_lo;                                     \
        p_hi += (c0 < p_lo) ? 1 : 0;                    \
        c1 += p_hi;                                     \
        c2 += (c1 < p_hi) ? 1 : 0;                      \
        } while(0)

/* (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {            \
        BN_ULONG f0 = (a)[i];                           \
        BN_ULONG p_lo = f0 * f0;                        \
        BN_ULONG p_hi = BN_UMULT_HIGH(f0, f0);          \
        c0 += p_lo;                                     \
        p_hi += (c0 < p_lo) ? 1 : 0;                    \
        c1 += p_hi;                                     \
        c2 += (c1 < p_hi) ? 1 : 0;                      \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
544 | | |
545 | | # else /* !BN_LLONG */ |
/*
 * Fallback flavour of the comba accumulation helpers, used when none of
 * BN_LLONG, BN_UMULT_LOHI or BN_UMULT_HIGH is available: operands are split
 * with LBITS()/HBITS() and combined with mul64()/sqr64().
 *
 * Adding a carry to the high product word cannot overflow, because the
 * high word of a multiplication result cannot be all-ones.
 *
 * c0, c1 and c2 must be modifiable lvalues; together they form the
 * three-word accumulator (c2,c1,c0).
 */

/* (c2,c1,c0) += a * b */
#  define mul_add_c(a,b,c0,c1,c2)       do {            \
        BN_ULONG p_lo = LBITS(a), p_hi = HBITS(a);      \
        BN_ULONG y_lo = LBITS(b), y_hi = HBITS(b);      \
        mul64(p_lo, p_hi, y_lo, y_hi);                  \
        c0 = (c0 + p_lo) & BN_MASK2;                    \
        if (c0 < p_lo)                                  \
            p_hi++;                                     \
        c1 = (c1 + p_hi) & BN_MASK2;                    \
        if (c1 < p_hi)                                  \
            c2++;                                       \
        } while(0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#  define mul_add_c2(a,b,c0,c1,c2)      do {            \
        BN_ULONG first_hi;                              \
        BN_ULONG p_lo = LBITS(a), p_hi = HBITS(a);      \
        BN_ULONG y_lo = LBITS(b), y_hi = HBITS(b);      \
        mul64(p_lo, p_hi, y_lo, y_hi);                  \
        first_hi = p_hi;                                \
        c0 = (c0 + p_lo) & BN_MASK2;                    \
        if (c0 < p_lo)                                  \
            first_hi++;                                 \
        c1 = (c1 + first_hi) & BN_MASK2;                \
        if (c1 < first_hi)                              \
            c2++;                                       \
        c0 = (c0 + p_lo) & BN_MASK2;                    \
        if (c0 < p_lo)                                  \
            p_hi++;                                     \
        c1 = (c1 + p_hi) & BN_MASK2;                    \
        if (c1 < p_hi)                                  \
            c2++;                                       \
        } while(0)

/* (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2)       do {            \
        BN_ULONG p_lo, p_hi;                            \
        sqr64(p_lo, p_hi, (a)[i]);                      \
        c0 = (c0 + p_lo) & BN_MASK2;                    \
        if (c0 < p_lo)                                  \
            p_hi++;                                     \
        c1 = (c1 + p_hi) & BN_MASK2;                    \
        if (c1 < p_hi)                                  \
            c2++;                                       \
        } while(0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
579 | | # endif /* !BN_LLONG */ |
580 | | |
581 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
582 | 0 | { |
583 | 0 | BN_ULONG c1, c2, c3; |
584 | |
|
585 | 0 | c1 = 0; |
586 | 0 | c2 = 0; |
587 | 0 | c3 = 0; |
588 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
589 | 0 | r[0] = c1; |
590 | 0 | c1 = 0; |
591 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
592 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
593 | 0 | r[1] = c2; |
594 | 0 | c2 = 0; |
595 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
596 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
597 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
598 | 0 | r[2] = c3; |
599 | 0 | c3 = 0; |
600 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
601 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
602 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
603 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
604 | 0 | r[3] = c1; |
605 | 0 | c1 = 0; |
606 | 0 | mul_add_c(a[4], b[0], c2, c3, c1); |
607 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
608 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
609 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
610 | 0 | mul_add_c(a[0], b[4], c2, c3, c1); |
611 | 0 | r[4] = c2; |
612 | 0 | c2 = 0; |
613 | 0 | mul_add_c(a[0], b[5], c3, c1, c2); |
614 | 0 | mul_add_c(a[1], b[4], c3, c1, c2); |
615 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
616 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
617 | 0 | mul_add_c(a[4], b[1], c3, c1, c2); |
618 | 0 | mul_add_c(a[5], b[0], c3, c1, c2); |
619 | 0 | r[5] = c3; |
620 | 0 | c3 = 0; |
621 | 0 | mul_add_c(a[6], b[0], c1, c2, c3); |
622 | 0 | mul_add_c(a[5], b[1], c1, c2, c3); |
623 | 0 | mul_add_c(a[4], b[2], c1, c2, c3); |
624 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
625 | 0 | mul_add_c(a[2], b[4], c1, c2, c3); |
626 | 0 | mul_add_c(a[1], b[5], c1, c2, c3); |
627 | 0 | mul_add_c(a[0], b[6], c1, c2, c3); |
628 | 0 | r[6] = c1; |
629 | 0 | c1 = 0; |
630 | 0 | mul_add_c(a[0], b[7], c2, c3, c1); |
631 | 0 | mul_add_c(a[1], b[6], c2, c3, c1); |
632 | 0 | mul_add_c(a[2], b[5], c2, c3, c1); |
633 | 0 | mul_add_c(a[3], b[4], c2, c3, c1); |
634 | 0 | mul_add_c(a[4], b[3], c2, c3, c1); |
635 | 0 | mul_add_c(a[5], b[2], c2, c3, c1); |
636 | 0 | mul_add_c(a[6], b[1], c2, c3, c1); |
637 | 0 | mul_add_c(a[7], b[0], c2, c3, c1); |
638 | 0 | r[7] = c2; |
639 | 0 | c2 = 0; |
640 | 0 | mul_add_c(a[7], b[1], c3, c1, c2); |
641 | 0 | mul_add_c(a[6], b[2], c3, c1, c2); |
642 | 0 | mul_add_c(a[5], b[3], c3, c1, c2); |
643 | 0 | mul_add_c(a[4], b[4], c3, c1, c2); |
644 | 0 | mul_add_c(a[3], b[5], c3, c1, c2); |
645 | 0 | mul_add_c(a[2], b[6], c3, c1, c2); |
646 | 0 | mul_add_c(a[1], b[7], c3, c1, c2); |
647 | 0 | r[8] = c3; |
648 | 0 | c3 = 0; |
649 | 0 | mul_add_c(a[2], b[7], c1, c2, c3); |
650 | 0 | mul_add_c(a[3], b[6], c1, c2, c3); |
651 | 0 | mul_add_c(a[4], b[5], c1, c2, c3); |
652 | 0 | mul_add_c(a[5], b[4], c1, c2, c3); |
653 | 0 | mul_add_c(a[6], b[3], c1, c2, c3); |
654 | 0 | mul_add_c(a[7], b[2], c1, c2, c3); |
655 | 0 | r[9] = c1; |
656 | 0 | c1 = 0; |
657 | 0 | mul_add_c(a[7], b[3], c2, c3, c1); |
658 | 0 | mul_add_c(a[6], b[4], c2, c3, c1); |
659 | 0 | mul_add_c(a[5], b[5], c2, c3, c1); |
660 | 0 | mul_add_c(a[4], b[6], c2, c3, c1); |
661 | 0 | mul_add_c(a[3], b[7], c2, c3, c1); |
662 | 0 | r[10] = c2; |
663 | 0 | c2 = 0; |
664 | 0 | mul_add_c(a[4], b[7], c3, c1, c2); |
665 | 0 | mul_add_c(a[5], b[6], c3, c1, c2); |
666 | 0 | mul_add_c(a[6], b[5], c3, c1, c2); |
667 | 0 | mul_add_c(a[7], b[4], c3, c1, c2); |
668 | 0 | r[11] = c3; |
669 | 0 | c3 = 0; |
670 | 0 | mul_add_c(a[7], b[5], c1, c2, c3); |
671 | 0 | mul_add_c(a[6], b[6], c1, c2, c3); |
672 | 0 | mul_add_c(a[5], b[7], c1, c2, c3); |
673 | 0 | r[12] = c1; |
674 | 0 | c1 = 0; |
675 | 0 | mul_add_c(a[6], b[7], c2, c3, c1); |
676 | 0 | mul_add_c(a[7], b[6], c2, c3, c1); |
677 | 0 | r[13] = c2; |
678 | 0 | c2 = 0; |
679 | 0 | mul_add_c(a[7], b[7], c3, c1, c2); |
680 | 0 | r[14] = c3; |
681 | 0 | r[15] = c1; |
682 | 0 | } |
683 | | |
684 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
685 | 0 | { |
686 | 0 | BN_ULONG c1, c2, c3; |
687 | |
|
688 | 0 | c1 = 0; |
689 | 0 | c2 = 0; |
690 | 0 | c3 = 0; |
691 | 0 | mul_add_c(a[0], b[0], c1, c2, c3); |
692 | 0 | r[0] = c1; |
693 | 0 | c1 = 0; |
694 | 0 | mul_add_c(a[0], b[1], c2, c3, c1); |
695 | 0 | mul_add_c(a[1], b[0], c2, c3, c1); |
696 | 0 | r[1] = c2; |
697 | 0 | c2 = 0; |
698 | 0 | mul_add_c(a[2], b[0], c3, c1, c2); |
699 | 0 | mul_add_c(a[1], b[1], c3, c1, c2); |
700 | 0 | mul_add_c(a[0], b[2], c3, c1, c2); |
701 | 0 | r[2] = c3; |
702 | 0 | c3 = 0; |
703 | 0 | mul_add_c(a[0], b[3], c1, c2, c3); |
704 | 0 | mul_add_c(a[1], b[2], c1, c2, c3); |
705 | 0 | mul_add_c(a[2], b[1], c1, c2, c3); |
706 | 0 | mul_add_c(a[3], b[0], c1, c2, c3); |
707 | 0 | r[3] = c1; |
708 | 0 | c1 = 0; |
709 | 0 | mul_add_c(a[3], b[1], c2, c3, c1); |
710 | 0 | mul_add_c(a[2], b[2], c2, c3, c1); |
711 | 0 | mul_add_c(a[1], b[3], c2, c3, c1); |
712 | 0 | r[4] = c2; |
713 | 0 | c2 = 0; |
714 | 0 | mul_add_c(a[2], b[3], c3, c1, c2); |
715 | 0 | mul_add_c(a[3], b[2], c3, c1, c2); |
716 | 0 | r[5] = c3; |
717 | 0 | c3 = 0; |
718 | 0 | mul_add_c(a[3], b[3], c1, c2, c3); |
719 | 0 | r[6] = c1; |
720 | 0 | r[7] = c2; |
721 | 0 | } |
722 | | |
723 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
724 | 0 | { |
725 | 0 | BN_ULONG c1, c2, c3; |
726 | |
|
727 | 0 | c1 = 0; |
728 | 0 | c2 = 0; |
729 | 0 | c3 = 0; |
730 | 0 | sqr_add_c(a, 0, c1, c2, c3); |
731 | 0 | r[0] = c1; |
732 | 0 | c1 = 0; |
733 | 0 | sqr_add_c2(a, 1, 0, c2, c3, c1); |
734 | 0 | r[1] = c2; |
735 | 0 | c2 = 0; |
736 | 0 | sqr_add_c(a, 1, c3, c1, c2); |
737 | 0 | sqr_add_c2(a, 2, 0, c3, c1, c2); |
738 | 0 | r[2] = c3; |
739 | 0 | c3 = 0; |
740 | 0 | sqr_add_c2(a, 3, 0, c1, c2, c3); |
741 | 0 | sqr_add_c2(a, 2, 1, c1, c2, c3); |
742 | 0 | r[3] = c1; |
743 | 0 | c1 = 0; |
744 | 0 | sqr_add_c(a, 2, c2, c3, c1); |
745 | 0 | sqr_add_c2(a, 3, 1, c2, c3, c1); |
746 | 0 | sqr_add_c2(a, 4, 0, c2, c3, c1); |
747 | 0 | r[4] = c2; |
748 | 0 | c2 = 0; |
749 | 0 | sqr_add_c2(a, 5, 0, c3, c1, c2); |
750 | 0 | sqr_add_c2(a, 4, 1, c3, c1, c2); |
751 | 0 | sqr_add_c2(a, 3, 2, c3, c1, c2); |
752 | 0 | r[5] = c3; |
753 | 0 | c3 = 0; |
754 | 0 | sqr_add_c(a, 3, c1, c2, c3); |
755 | 0 | sqr_add_c2(a, 4, 2, c1, c2, c3); |
756 | 0 | sqr_add_c2(a, 5, 1, c1, c2, c3); |
757 | 0 | sqr_add_c2(a, 6, 0, c1, c2, c3); |
758 | 0 | r[6] = c1; |
759 | 0 | c1 = 0; |
760 | 0 | sqr_add_c2(a, 7, 0, c2, c3, c1); |
761 | 0 | sqr_add_c2(a, 6, 1, c2, c3, c1); |
762 | 0 | sqr_add_c2(a, 5, 2, c2, c3, c1); |
763 | 0 | sqr_add_c2(a, 4, 3, c2, c3, c1); |
764 | 0 | r[7] = c2; |
765 | 0 | c2 = 0; |
766 | 0 | sqr_add_c(a, 4, c3, c1, c2); |
767 | 0 | sqr_add_c2(a, 5, 3, c3, c1, c2); |
768 | 0 | sqr_add_c2(a, 6, 2, c3, c1, c2); |
769 | 0 | sqr_add_c2(a, 7, 1, c3, c1, c2); |
770 | 0 | r[8] = c3; |
771 | 0 | c3 = 0; |
772 | 0 | sqr_add_c2(a, 7, 2, c1, c2, c3); |
773 | 0 | sqr_add_c2(a, 6, 3, c1, c2, c3); |
774 | 0 | sqr_add_c2(a, 5, 4, c1, c2, c3); |
775 | 0 | r[9] = c1; |
776 | 0 | c1 = 0; |
777 | 0 | sqr_add_c(a, 5, c2, c3, c1); |
778 | 0 | sqr_add_c2(a, 6, 4, c2, c3, c1); |
779 | 0 | sqr_add_c2(a, 7, 3, c2, c3, c1); |
780 | 0 | r[10] = c2; |
781 | 0 | c2 = 0; |
782 | 0 | sqr_add_c2(a, 7, 4, c3, c1, c2); |
783 | 0 | sqr_add_c2(a, 6, 5, c3, c1, c2); |
784 | 0 | r[11] = c3; |
785 | 0 | c3 = 0; |
786 | 0 | sqr_add_c(a, 6, c1, c2, c3); |
787 | 0 | sqr_add_c2(a, 7, 5, c1, c2, c3); |
788 | 0 | r[12] = c1; |
789 | 0 | c1 = 0; |
790 | 0 | sqr_add_c2(a, 7, 6, c2, c3, c1); |
791 | 0 | r[13] = c2; |
792 | 0 | c2 = 0; |
793 | 0 | sqr_add_c(a, 7, c3, c1, c2); |
794 | 0 | r[14] = c3; |
795 | 0 | r[15] = c1; |
796 | 0 | } |
797 | | |
798 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
799 | 0 | { |
800 | 0 | BN_ULONG c1, c2, c3; |
801 | |
|
802 | 0 | c1 = 0; |
803 | 0 | c2 = 0; |
804 | 0 | c3 = 0; |
805 | 0 | sqr_add_c(a, 0, c1, c2, c3); |
806 | 0 | r[0] = c1; |
807 | 0 | c1 = 0; |
808 | 0 | sqr_add_c2(a, 1, 0, c2, c3, c1); |
809 | 0 | r[1] = c2; |
810 | 0 | c2 = 0; |
811 | 0 | sqr_add_c(a, 1, c3, c1, c2); |
812 | 0 | sqr_add_c2(a, 2, 0, c3, c1, c2); |
813 | 0 | r[2] = c3; |
814 | 0 | c3 = 0; |
815 | 0 | sqr_add_c2(a, 3, 0, c1, c2, c3); |
816 | 0 | sqr_add_c2(a, 2, 1, c1, c2, c3); |
817 | 0 | r[3] = c1; |
818 | 0 | c1 = 0; |
819 | 0 | sqr_add_c(a, 2, c2, c3, c1); |
820 | 0 | sqr_add_c2(a, 3, 1, c2, c3, c1); |
821 | 0 | r[4] = c2; |
822 | 0 | c2 = 0; |
823 | 0 | sqr_add_c2(a, 3, 2, c3, c1, c2); |
824 | 0 | r[5] = c3; |
825 | 0 | c3 = 0; |
826 | 0 | sqr_add_c(a, 3, c1, c2, c3); |
827 | 0 | r[6] = c1; |
828 | 0 | r[7] = c2; |
829 | 0 | } |
830 | | |
831 | | # ifdef OPENSSL_NO_ASM |
832 | | # ifdef OPENSSL_BN_ASM_MONT |
833 | | # include <alloca.h> |
834 | | /* |
835 | | * This is essentially reference implementation, which may or may not |
836 | | * result in performance improvement. E.g. on IA-32 this routine was |
837 | | * observed to give 40% faster rsa1024 private key operations and 10% |
838 | | * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only |
839 | | * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a |
840 | | * reference implementation, one to be used as starting point for |
841 | | * platform-specific assembler. Mentioned numbers apply to compiler |
842 | | * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and |
843 | | * can vary not only from platform to platform, but even for compiler |
844 | | * versions. Assembler vs. assembler improvement coefficients can |
845 | | * [and are known to] differ and are to be documented elsewhere. |
846 | | */ |
847 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
848 | | const BN_ULONG *np, const BN_ULONG *n0p, int num) |
849 | | { |
850 | | BN_ULONG c0, c1, ml, *tp, n0; |
851 | | # ifdef mul64 |
852 | | BN_ULONG mh; |
853 | | # endif |
854 | | volatile BN_ULONG *vp; |
855 | | int i = 0, j; |
856 | | |
857 | | # if 0 /* template for platform-specific |
858 | | * implementation */ |
859 | | if (ap == bp) |
860 | | return bn_sqr_mont(rp, ap, np, n0p, num); |
861 | | # endif |
862 | | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); |
863 | | |
864 | | n0 = *n0p; |
865 | | |
866 | | c0 = 0; |
867 | | ml = bp[0]; |
868 | | # ifdef mul64 |
869 | | mh = HBITS(ml); |
870 | | ml = LBITS(ml); |
871 | | for (j = 0; j < num; ++j) |
872 | | mul(tp[j], ap[j], ml, mh, c0); |
873 | | # else |
874 | | for (j = 0; j < num; ++j) |
875 | | mul(tp[j], ap[j], ml, c0); |
876 | | # endif |
877 | | |
878 | | tp[num] = c0; |
879 | | tp[num + 1] = 0; |
880 | | goto enter; |
881 | | |
882 | | for (i = 0; i < num; i++) { |
883 | | c0 = 0; |
884 | | ml = bp[i]; |
885 | | # ifdef mul64 |
886 | | mh = HBITS(ml); |
887 | | ml = LBITS(ml); |
888 | | for (j = 0; j < num; ++j) |
889 | | mul_add(tp[j], ap[j], ml, mh, c0); |
890 | | # else |
891 | | for (j = 0; j < num; ++j) |
892 | | mul_add(tp[j], ap[j], ml, c0); |
893 | | # endif |
894 | | c1 = (tp[num] + c0) & BN_MASK2; |
895 | | tp[num] = c1; |
896 | | tp[num + 1] = (c1 < c0 ? 1 : 0); |
897 | | enter: |
898 | | c1 = tp[0]; |
899 | | ml = (c1 * n0) & BN_MASK2; |
900 | | c0 = 0; |
901 | | # ifdef mul64 |
902 | | mh = HBITS(ml); |
903 | | ml = LBITS(ml); |
904 | | mul_add(c1, np[0], ml, mh, c0); |
905 | | # else |
906 | | mul_add(c1, ml, np[0], c0); |
907 | | # endif |
908 | | for (j = 1; j < num; j++) { |
909 | | c1 = tp[j]; |
910 | | # ifdef mul64 |
911 | | mul_add(c1, np[j], ml, mh, c0); |
912 | | # else |
913 | | mul_add(c1, ml, np[j], c0); |
914 | | # endif |
915 | | tp[j - 1] = c1 & BN_MASK2; |
916 | | } |
917 | | c1 = (tp[num] + c0) & BN_MASK2; |
918 | | tp[num - 1] = c1; |
919 | | tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); |
920 | | } |
921 | | |
922 | | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { |
923 | | c0 = bn_sub_words(rp, tp, np, num); |
924 | | if (tp[num] != 0 || c0 == 0) { |
925 | | for (i = 0; i < num + 2; i++) |
926 | | vp[i] = 0; |
927 | | return 1; |
928 | | } |
929 | | } |
930 | | for (i = 0; i < num; i++) |
931 | | rp[i] = tp[i], vp[i] = 0; |
932 | | vp[num] = 0; |
933 | | vp[num + 1] = 0; |
934 | | return 1; |
935 | | } |
936 | | # else |
937 | | /* |
938 | | * Return value of 0 indicates that multiplication/convolution was not |
939 | | * performed to signal the caller to fall down to alternative/original |
940 | | * code-path. |
941 | | */ |
942 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
943 | | const BN_ULONG *np, const BN_ULONG *n0, int num) |
944 | 0 | { |
945 | 0 | return 0; |
946 | 0 | } |
947 | | # endif /* OPENSSL_BN_ASM_MONT */ |
948 | | # endif |
949 | | |
950 | | #else /* !BN_MUL_COMBA */ |
951 | | |
952 | | /* hmm... is it faster just to do a multiply? */ |
953 | | # undef bn_sqr_comba4 |
954 | | # undef bn_sqr_comba8 |
955 | | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
956 | | { |
957 | | BN_ULONG t[8]; |
958 | | bn_sqr_normal(r, a, 4, t); |
959 | | } |
960 | | |
961 | | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
962 | | { |
963 | | BN_ULONG t[16]; |
964 | | bn_sqr_normal(r, a, 8, t); |
965 | | } |
966 | | |
967 | | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
968 | | { |
969 | | r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); |
970 | | r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); |
971 | | r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); |
972 | | r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); |
973 | | } |
974 | | |
975 | | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) |
976 | | { |
977 | | r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); |
978 | | r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); |
979 | | r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); |
980 | | r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); |
981 | | r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); |
982 | | r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); |
983 | | r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); |
984 | | r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); |
985 | | } |
986 | | |
987 | | # ifdef OPENSSL_NO_ASM |
988 | | # ifdef OPENSSL_BN_ASM_MONT |
989 | | # include <alloca.h> |
990 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
991 | | const BN_ULONG *np, const BN_ULONG *n0p, int num) |
992 | | { |
993 | | BN_ULONG c0, c1, *tp, n0 = *n0p; |
994 | | volatile BN_ULONG *vp; |
995 | | int i = 0, j; |
996 | | |
997 | | vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); |
998 | | |
999 | | for (i = 0; i <= num; i++) |
1000 | | tp[i] = 0; |
1001 | | |
1002 | | for (i = 0; i < num; i++) { |
1003 | | c0 = bn_mul_add_words(tp, ap, num, bp[i]); |
1004 | | c1 = (tp[num] + c0) & BN_MASK2; |
1005 | | tp[num] = c1; |
1006 | | tp[num + 1] = (c1 < c0 ? 1 : 0); |
1007 | | |
1008 | | c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); |
1009 | | c1 = (tp[num] + c0) & BN_MASK2; |
1010 | | tp[num] = c1; |
1011 | | tp[num + 1] += (c1 < c0 ? 1 : 0); |
1012 | | for (j = 0; j <= num; j++) |
1013 | | tp[j] = tp[j + 1]; |
1014 | | } |
1015 | | |
1016 | | if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { |
1017 | | c0 = bn_sub_words(rp, tp, np, num); |
1018 | | if (tp[num] != 0 || c0 == 0) { |
1019 | | for (i = 0; i < num + 2; i++) |
1020 | | vp[i] = 0; |
1021 | | return 1; |
1022 | | } |
1023 | | } |
1024 | | for (i = 0; i < num; i++) |
1025 | | rp[i] = tp[i], vp[i] = 0; |
1026 | | vp[num] = 0; |
1027 | | vp[num + 1] = 0; |
1028 | | return 1; |
1029 | | } |
1030 | | # else |
1031 | | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
1032 | | const BN_ULONG *np, const BN_ULONG *n0, int num) |
1033 | | { |
1034 | | return 0; |
1035 | | } |
1036 | | # endif /* OPENSSL_BN_ASM_MONT */ |
1037 | | # endif |
1038 | | |
1039 | | #endif /* !BN_MUL_COMBA */ |