Coverage Report

Created: 2025-12-10 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl/crypto/bn/bn_asm.c
Line
Count
Source
1
/*
2
 * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
#include <assert.h>
11
#include <openssl/crypto.h>
12
#include "internal/cryptlib.h"
13
#include "bn_local.h"
14
15
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
16
17
/*
 * Walk the num-word arrays rp and ap in step, applying one mul_add() per
 * word with multiplier w and a single carry word threaded through every
 * step.  Returns the carry left after the last word.  When num is not
 * positive nothing is touched and 0 is returned.
 *
 * NOTE(review): mul_add() is defined outside this chunk (presumably in
 * bn_local.h); it is assumed to update its first and last arguments in place.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    BN_ULONG w)
{
    BN_ULONG carry = 0;

    assert(num >= 0);
    if (num <= 0)
        return carry;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Bulk of the work, four words per iteration. */
    for (; num >= 4; num -= 4, ap += 4, rp += 4) {
        mul_add(rp[0], ap[0], w, carry);
        mul_add(rp[1], ap[1], w, carry);
        mul_add(rp[2], ap[2], w, carry);
        mul_add(rp[3], ap[3], w, carry);
    }
#endif
    /* Leftover words (every word when the unrolled loop is compiled out). */
    for (; num > 0; num--, ap++, rp++) {
        mul_add(rp[0], ap[0], w, carry);
    }

    return carry;
}
46
47
/*
 * Apply mul() to each of the num word pairs (rp[i], ap[i]) with multiplier
 * w, passing one carry word from step to step.  Returns the final carry, or
 * 0 without touching rp when num is not positive.
 *
 * NOTE(review): mul() is defined outside this chunk (presumably in
 * bn_local.h); it is assumed to store into its first argument and to update
 * the carry passed as its last argument.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;

    assert(num >= 0);
    if (num <= 0)
        return carry;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four words per iteration while at least four remain. */
    for (; num >= 4; num -= 4, ap += 4, rp += 4) {
        mul(rp[0], ap[0], w, carry);
        mul(rp[1], ap[1], w, carry);
        mul(rp[2], ap[2], w, carry);
        mul(rp[3], ap[3], w, carry);
    }
#endif
    /* Remaining words, one at a time. */
    for (; num > 0; num--, ap++, rp++) {
        mul(rp[0], ap[0], w, carry);
    }
    return carry;
}
74
75
/*
 * For each of the n input words a[i], invoke sqr() with the output pair
 * (r[2*i], r[2*i + 1]).  r therefore must hold 2*n words.  Does nothing
 * when n is not positive.
 *
 * NOTE(review): sqr() is defined outside this chunk (presumably in
 * bn_local.h); it is assumed to write the low and high words of the square
 * into its first and second arguments.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four inputs (eight outputs) per iteration. */
    for (; n >= 4; n -= 4, a += 4, r += 8) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
    }
#endif
    /* Remaining inputs, one at a time. */
    for (; n > 0; n--, a++, r += 2) {
        sqr(r[0], r[1], a[0]);
    }
}
99
100
#else /* !(defined(BN_LLONG) || \
101
       * defined(BN_UMULT_HIGH)) */
102
103
/*
 * Portable variant used when neither BN_LLONG nor BN_UMULT_HIGH is defined.
 * The multiplier w is split once with LBITS()/HBITS(), then one mul_add()
 * is applied per word pair (rp[i], ap[i]) with a carry word threaded through
 * all steps.  Returns the final carry, or 0 when num is not positive.
 *
 * NOTE(review): the five-argument mul_add() is defined outside this chunk;
 * it is assumed to update its first and last arguments in place.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    BN_ULONG w)
{
    BN_ULONG carry = 0;
    BN_ULONG w_lo, w_hi;

    assert(num >= 0);
    if (num <= 0)
        return (BN_ULONG)0;

    w_lo = LBITS(w);
    w_hi = HBITS(w);

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four words per iteration while at least four remain. */
    for (; num >= 4; num -= 4, ap += 4, rp += 4) {
        mul_add(rp[0], ap[0], w_lo, w_hi, carry);
        mul_add(rp[1], ap[1], w_lo, w_hi, carry);
        mul_add(rp[2], ap[2], w_lo, w_hi, carry);
        mul_add(rp[3], ap[3], w_lo, w_hi, carry);
    }
#endif
    /* Remaining words, one at a time. */
    for (; num > 0; num--, ap++, rp++) {
        mul_add(rp[0], ap[0], w_lo, w_hi, carry);
    }
    return carry;
}
135
136
/*
 * Portable variant used when neither BN_LLONG nor BN_UMULT_HIGH is defined.
 * Splits w into halves with LBITS()/HBITS() and applies one mul() per word
 * pair (rp[i], ap[i]), carrying a word between steps.  Returns the final
 * carry, or 0 when num is not positive.
 *
 * NOTE(review): the five-argument mul() is defined outside this chunk; it
 * is assumed to store into its first argument and update its last.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c = 0;
    BN_ULONG w_lo, w_hi;

    assert(num >= 0);
    if (num <= 0)
        return (BN_ULONG)0;

    w_lo = LBITS(w);
    w_hi = HBITS(w);

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four words per iteration while at least four remain. */
    for (; num >= 4; num -= 4, ap += 4, rp += 4) {
        mul(rp[0], ap[0], w_lo, w_hi, c);
        mul(rp[1], ap[1], w_lo, w_hi, c);
        mul(rp[2], ap[2], w_lo, w_hi, c);
        mul(rp[3], ap[3], w_lo, w_hi, c);
    }
#endif
    /* Remaining words, one at a time. */
    for (; num > 0; num--, ap++, rp++) {
        mul(rp[0], ap[0], w_lo, w_hi, c);
    }
    return c;
}
167
168
/*
 * Portable variant used when neither BN_LLONG nor BN_UMULT_HIGH is defined.
 * For each of the n input words a[i], invoke sqr64() with the output pair
 * (r[2*i], r[2*i + 1]); r must hold 2*n words.  Does nothing when n is not
 * positive.
 *
 * NOTE(review): sqr64() is defined outside this chunk; it is assumed to
 * write the low and high words of the square into its first two arguments.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four inputs (eight outputs) per iteration. */
    for (; n >= 4; n -= 4, a += 4, r += 8) {
        sqr64(r[0], r[1], a[0]);
        sqr64(r[2], r[3], a[1]);
        sqr64(r[4], r[5], a[2]);
        sqr64(r[6], r[7], a[3]);
    }
#endif
    /* Remaining inputs, one at a time. */
    for (; n > 0; n--, a++, r += 2) {
        sqr64(r[0], r[1], a[0]);
    }
}
192
193
#endif /* !(defined(BN_LLONG) || \
194
        * defined(BN_UMULT_HIGH)) */
195
196
#if defined(BN_LLONG) && defined(BN_DIV2W)
197
198
/*
 * Divide the two-word value (h:l) by d using the double-width BN_ULLONG
 * type and return the quotient truncated to one word.
 *
 * A zero divisor returns BN_MASK2 instead of executing a division by zero,
 * which is undefined behaviour in C.  This matches the portable
 * implementation of this function that is compiled when BN_LLONG and
 * BN_DIV2W are not both defined.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    if (d == 0)
        return BN_MASK2;

    return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d));
}
202
203
#else
204
205
/* Divide h,l by d and return the result. */
206
/* I need to test this some more :-( */
207
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
208
0
{
209
0
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
210
0
    int i, count = 2;
211
212
0
    if (d == 0)
213
0
        return BN_MASK2;
214
215
0
    i = BN_num_bits_word(d);
216
0
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
217
218
0
    i = BN_BITS2 - i;
219
0
    if (h >= d)
220
0
        h -= d;
221
222
0
    if (i) {
223
0
        d <<= i;
224
0
        h = (h << i) | (l >> (BN_BITS2 - i));
225
0
        l <<= i;
226
0
    }
227
0
    dh = (d & BN_MASK2h) >> BN_BITS4;
228
0
    dl = (d & BN_MASK2l);
229
0
    for (;;) {
230
0
        if ((h >> BN_BITS4) == dh)
231
0
            q = BN_MASK2l;
232
0
        else
233
0
            q = h / dh;
234
235
0
        th = q * dh;
236
0
        tl = dl * q;
237
0
        for (;;) {
238
0
            t = h - th;
239
0
            if ((t & BN_MASK2h) || ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
240
0
                break;
241
0
            q--;
242
0
            th -= dh;
243
0
            tl -= dl;
244
0
        }
245
0
        t = (tl >> BN_BITS4);
246
0
        tl = (tl << BN_BITS4) & BN_MASK2h;
247
0
        th += t;
248
249
0
        if (l < tl)
250
0
            th++;
251
0
        l -= tl;
252
0
        if (h < th) {
253
0
            h += d;
254
0
            q--;
255
0
        }
256
0
        h -= th;
257
258
0
        if (--count == 0)
259
0
            break;
260
261
0
        ret = q << BN_BITS4;
262
0
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
263
0
        l = (l & BN_MASK2l) << BN_BITS4;
264
0
    }
265
0
    ret |= q;
266
0
    return ret;
267
0
}
268
#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
269
270
#ifdef BN_LLONG
271
/*
 * r[i] = a[i] + b[i] + carry for i in [0, n), using the double-width
 * BN_ULLONG type to hold each sum together with its carry-out.  Returns the
 * carry out of the last word, or 0 when n is not positive.
 */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    int n)
{
    BN_ULLONG sum = 0; /* running sum; after each shift it holds the carry */

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four words per iteration while at least four remain. */
    for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
        sum += (BN_ULLONG)a[0] + b[0];
        r[0] = (BN_ULONG)sum & BN_MASK2;
        sum >>= BN_BITS2;

        sum += (BN_ULLONG)a[1] + b[1];
        r[1] = (BN_ULONG)sum & BN_MASK2;
        sum >>= BN_BITS2;

        sum += (BN_ULLONG)a[2] + b[2];
        r[2] = (BN_ULONG)sum & BN_MASK2;
        sum >>= BN_BITS2;

        sum += (BN_ULLONG)a[3] + b[3];
        r[3] = (BN_ULONG)sum & BN_MASK2;
        sum >>= BN_BITS2;
    }
#endif
    /* Remaining words, one at a time. */
    for (; n > 0; n--, a++, b++, r++) {
        sum += (BN_ULLONG)a[0] + b[0];
        r[0] = (BN_ULONG)sum & BN_MASK2;
        sum >>= BN_BITS2;
    }
    return (BN_ULONG)sum;
}
311
#else /* !BN_LLONG */
312
/*
 * r[i] = a[i] + b[i] + carry for i in [0, n), using only single-word
 * arithmetic: a carry-out is detected by the masked sum being smaller than
 * one of its operands.  Returns the carry out of the last word, or 0 when n
 * is not positive.
 */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    int n)
{
    BN_ULONG carry = 0;
    BN_ULONG partial; /* a[i] + incoming carry */
    BN_ULONG sum;     /* partial + b[i] */

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four words per iteration while at least four remain. */
    for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
        partial = (a[0] + carry) & BN_MASK2;
        carry = (partial < carry);
        sum = (partial + b[0]) & BN_MASK2;
        carry += (sum < partial);
        r[0] = sum;

        partial = (a[1] + carry) & BN_MASK2;
        carry = (partial < carry);
        sum = (partial + b[1]) & BN_MASK2;
        carry += (sum < partial);
        r[1] = sum;

        partial = (a[2] + carry) & BN_MASK2;
        carry = (partial < carry);
        sum = (partial + b[2]) & BN_MASK2;
        carry += (sum < partial);
        r[2] = sum;

        partial = (a[3] + carry) & BN_MASK2;
        carry = (partial < carry);
        sum = (partial + b[3]) & BN_MASK2;
        carry += (sum < partial);
        r[3] = sum;
    }
#endif
    /* Remaining words, one at a time. */
    for (; n > 0; n--, a++, b++, r++) {
        partial = (a[0] + carry) & BN_MASK2;
        carry = (partial < carry);
        sum = (partial + b[0]) & BN_MASK2;
        carry += (sum < partial);
        r[0] = sum;
    }
    return (BN_ULONG)carry;
}
368
#endif /* !BN_LLONG */
369
370
/*
 * r[i] = a[i] - b[i] - borrow for i in [0, n), using only single-word
 * arithmetic: a borrow is detected by the masked difference being larger
 * than the value it was subtracted from.  Returns the borrow out of the
 * last word, or 0 when n is not positive.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
    int n)
{
    BN_ULONG minuend; /* a[i] */
    BN_ULONG diff;    /* a[i] - incoming borrow */
    BN_ULONG out;     /* diff - b[i] */
    int borrow = 0;

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* Four words per iteration while at least four remain. */
    for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
        minuend = a[0];
        diff = (minuend - borrow) & BN_MASK2;
        borrow = (diff > minuend);
        out = (diff - b[0]) & BN_MASK2;
        r[0] = out;
        borrow += (out > diff);

        minuend = a[1];
        diff = (minuend - borrow) & BN_MASK2;
        borrow = (diff > minuend);
        out = (diff - b[1]) & BN_MASK2;
        r[1] = out;
        borrow += (out > diff);

        minuend = a[2];
        diff = (minuend - borrow) & BN_MASK2;
        borrow = (diff > minuend);
        out = (diff - b[2]) & BN_MASK2;
        r[2] = out;
        borrow += (out > diff);

        minuend = a[3];
        diff = (minuend - borrow) & BN_MASK2;
        borrow = (diff > minuend);
        out = (diff - b[3]) & BN_MASK2;
        r[3] = out;
        borrow += (out > diff);
    }
#endif
    /* Remaining words, one at a time. */
    for (; n > 0; n--, a++, b++, r++) {
        minuend = a[0];
        diff = (minuend - borrow) & BN_MASK2;
        borrow = (diff > minuend);
        out = (diff - b[0]) & BN_MASK2;
        r[0] = out;
        borrow += (out > diff);
    }
    return borrow;
}
431
432
#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
433
434
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
435
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
436
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
437
/*
438
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
439
 * c=(c2,c1,c0)
440
 */
441
442
#ifdef BN_LLONG
443
/*
 * BN_LLONG flavour of the comba accumulator helpers: each full product is
 * held in the double-width BN_ULLONG type and split with Lw()/Hw().
 *
 * (c2,c1,c0) is a three-word accumulator, least significant word first.
 * c0, c1 and c2 must be modifiable lvalues; every macro adds into them.
 *
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#define mul_add_c(a, b, c0, c1, c2)         \
    do {                                    \
        BN_ULONG hi;                        \
        BN_ULLONG t = (BN_ULLONG)(a) * (b); \
        t += c0; /* no carry */             \
        c0 = (BN_ULONG)Lw(t);               \
        hi = (BN_ULONG)Hw(t);               \
        c1 = (c1 + hi) & BN_MASK2;          \
        c2 += (c1 < hi);                    \
    } while (0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#define mul_add_c2(a, b, c0, c1, c2)          \
    do {                                      \
        BN_ULONG hi;                          \
        BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
        BN_ULLONG tt = t + c0; /* no carry */ \
        c0 = (BN_ULONG)Lw(tt);                \
        hi = (BN_ULONG)Hw(tt);                \
        c1 = (c1 + hi) & BN_MASK2;            \
        c2 += (c1 < hi);                      \
        t += c0; /* no carry */               \
        c0 = (BN_ULONG)Lw(t);                 \
        hi = (BN_ULONG)Hw(t);                 \
        c1 = (c1 + hi) & BN_MASK2;            \
        c2 += (c1 < hi);                      \
    } while (0)

/*
 * (c2,c1,c0) += a[i] * a[i].  The array argument is parenthesized so that a
 * pointer expression such as `p + 1` expands correctly, as in the other
 * variants of this macro.
 */
#define sqr_add_c(a, i, c0, c1, c2)               \
    do {                                          \
        BN_ULONG hi;                              \
        BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \
        t += c0; /* no carry */                   \
        c0 = (BN_ULONG)Lw(t);                     \
        hi = (BN_ULONG)Hw(t);                     \
        c1 = (c1 + hi) & BN_MASK2;                \
        c2 += (c1 < hi);                          \
    } while (0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#define sqr_add_c2(a, i, j, c0, c1, c2) \
    mul_add_c2((a)[i], (a)[j], c0, c1, c2)
487
488
#elif defined(BN_UMULT_LOHI)
489
/*
 * BN_UMULT_LOHI flavour of the comba accumulator helpers: the two halves of
 * each product come from a single BN_UMULT_LOHI() invocation.
 *
 * (c2,c1,c0) is a three-word accumulator, least significant word first.
 *
 * Keep in mind that additions to the high product word can not overflow,
 * because the high word of a multiplication result cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#define mul_add_c(a, b, c0, c1, c2)      \
    do {                                 \
        BN_ULONG fa = (a), fb = (b);     \
        BN_ULONG plo, phi;               \
        BN_UMULT_LOHI(plo, phi, fa, fb); \
        c0 += plo;                       \
        phi += (c0 < plo);               \
        c1 += phi;                       \
        c2 += (c1 < phi);                \
    } while (0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#define mul_add_c2(a, b, c0, c1, c2)     \
    do {                                 \
        BN_ULONG fa = (a), fb = (b);     \
        BN_ULONG plo, phi, phi1;         \
        BN_UMULT_LOHI(plo, phi, fa, fb); \
        c0 += plo;                       \
        phi1 = phi + (c0 < plo);         \
        c1 += phi1;                      \
        c2 += (c1 < phi1);               \
        c0 += plo;                       \
        phi += (c0 < plo);               \
        c1 += phi;                       \
        c2 += (c1 < phi);                \
    } while (0)

/* (c2,c1,c0) += a[i] * a[i] */
#define sqr_add_c(a, i, c0, c1, c2)      \
    do {                                 \
        BN_ULONG fa = (a)[i];            \
        BN_ULONG plo, phi;               \
        BN_UMULT_LOHI(plo, phi, fa, fa); \
        c0 += plo;                       \
        phi += (c0 < plo);               \
        c1 += phi;                       \
        c2 += (c1 < phi);                \
    } while (0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#define sqr_add_c2(a, i, j, c0, c1, c2) \
    mul_add_c2((a)[i], (a)[j], c0, c1, c2)
532
533
#elif defined(BN_UMULT_HIGH)
534
/*
 * BN_UMULT_HIGH flavour of the comba accumulator helpers: the low product
 * word is an ordinary (wrapping) multiply and the high word comes from
 * BN_UMULT_HIGH().
 *
 * (c2,c1,c0) is a three-word accumulator, least significant word first.
 *
 * Keep in mind that additions to the high product word can not overflow,
 * because the high word of a multiplication result cannot be all-ones.
 */

/* (c2,c1,c0) += a * b */
#define mul_add_c(a, b, c0, c1, c2)           \
    do {                                      \
        BN_ULONG fa = (a), fb = (b);          \
        BN_ULONG plo = fa * fb;               \
        BN_ULONG phi = BN_UMULT_HIGH(fa, fb); \
        c0 += plo;                            \
        phi += (c0 < plo);                    \
        c1 += phi;                            \
        c2 += (c1 < phi);                     \
    } while (0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#define mul_add_c2(a, b, c0, c1, c2)          \
    do {                                      \
        BN_ULONG fa = (a), fb = (b), phi1;    \
        BN_ULONG plo = fa * fb;               \
        BN_ULONG phi = BN_UMULT_HIGH(fa, fb); \
        c0 += plo;                            \
        phi1 = phi + (c0 < plo);              \
        c1 += phi1;                           \
        c2 += (c1 < phi1);                    \
        c0 += plo;                            \
        phi += (c0 < plo);                    \
        c1 += phi;                            \
        c2 += (c1 < phi);                     \
    } while (0)

/* (c2,c1,c0) += a[i] * a[i] */
#define sqr_add_c(a, i, c0, c1, c2)           \
    do {                                      \
        BN_ULONG fa = (a)[i];                 \
        BN_ULONG plo = fa * fa;               \
        BN_ULONG phi = BN_UMULT_HIGH(fa, fa); \
        c0 += plo;                            \
        phi += (c0 < plo);                    \
        c1 += phi;                            \
        c2 += (c1 < phi);                     \
    } while (0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#define sqr_add_c2(a, i, j, c0, c1, c2) \
    mul_add_c2((a)[i], (a)[j], c0, c1, c2)
577
578
#else /* !BN_LLONG && !BN_UMULT_LOHI && !BN_UMULT_HIGH */
579
/*
 * Portable flavour of the comba accumulator helpers: operands are split into
 * half-words with LBITS()/HBITS() and multiplied with mul64()/sqr64().
 *
 * (c2,c1,c0) is a three-word accumulator, least significant word first.
 *
 * Keep in mind that additions to the high product word can not overflow,
 * because the high word of a multiplication result cannot be all-ones.
 *
 * NOTE(review): mul64() and sqr64() are defined outside this chunk; they are
 * assumed to leave the low/high product words in their first two arguments.
 */

/* (c2,c1,c0) += a * b */
#define mul_add_c(a, b, c0, c1, c2)                  \
    do {                                             \
        BN_ULONG p_lo = LBITS(a), p_hi = HBITS(a);   \
        BN_ULONG b_lo = LBITS(b), b_hi = HBITS(b);   \
        mul64(p_lo, p_hi, b_lo, b_hi);               \
        c0 = (c0 + p_lo) & BN_MASK2;                 \
        p_hi += (c0 < p_lo);                         \
        c1 = (c1 + p_hi) & BN_MASK2;                 \
        c2 += (c1 < p_hi);                           \
    } while (0)

/* (c2,c1,c0) += 2 * a * b, done as two additions of the same product */
#define mul_add_c2(a, b, c0, c1, c2)                 \
    do {                                             \
        BN_ULONG p_hi1;                              \
        BN_ULONG p_lo = LBITS(a), p_hi = HBITS(a);   \
        BN_ULONG b_lo = LBITS(b), b_hi = HBITS(b);   \
        mul64(p_lo, p_hi, b_lo, b_hi);               \
        p_hi1 = p_hi;                                \
        c0 = (c0 + p_lo) & BN_MASK2;                 \
        p_hi1 += (c0 < p_lo);                        \
        c1 = (c1 + p_hi1) & BN_MASK2;                \
        c2 += (c1 < p_hi1);                          \
        c0 = (c0 + p_lo) & BN_MASK2;                 \
        p_hi += (c0 < p_lo);                         \
        c1 = (c1 + p_hi) & BN_MASK2;                 \
        c2 += (c1 < p_hi);                           \
    } while (0)

/* (c2,c1,c0) += a[i] * a[i] */
#define sqr_add_c(a, i, c0, c1, c2)    \
    do {                               \
        BN_ULONG p_lo, p_hi;           \
        sqr64(p_lo, p_hi, (a)[i]);     \
        c0 = (c0 + p_lo) & BN_MASK2;   \
        p_hi += (c0 < p_lo);           \
        c1 = (c1 + p_hi) & BN_MASK2;   \
        c2 += (c1 < p_hi);             \
    } while (0)

/* (c2,c1,c0) += 2 * a[i] * a[j] */
#define sqr_add_c2(a, i, j, c0, c1, c2) \
    mul_add_c2((a)[i], (a)[j], c0, c1, c2)
623
#endif /* BN_LLONG / BN_UMULT_LOHI / BN_UMULT_HIGH / portable */
624
625
/*
 * Fully unrolled 8x8-word multiplication: r[0..15] = a[0..7] * b[0..7].
 *
 * Output word k is built by feeding every pair a[i], b[j] with i + j == k to
 * mul_add_c().  Three accumulators rotate roles from column to column: the
 * one holding the finished word is stored to r[k] and cleared, and the other
 * two carry over into the next column.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

    /* column 0 */
    mul_add_c(a[0], b[0], acc0, acc1, acc2);
    r[0] = acc0; acc0 = 0;

    /* column 1 */
    mul_add_c(a[0], b[1], acc1, acc2, acc0);
    mul_add_c(a[1], b[0], acc1, acc2, acc0);
    r[1] = acc1; acc1 = 0;

    /* column 2 */
    mul_add_c(a[2], b[0], acc2, acc0, acc1);
    mul_add_c(a[1], b[1], acc2, acc0, acc1);
    mul_add_c(a[0], b[2], acc2, acc0, acc1);
    r[2] = acc2; acc2 = 0;

    /* column 3 */
    mul_add_c(a[0], b[3], acc0, acc1, acc2);
    mul_add_c(a[1], b[2], acc0, acc1, acc2);
    mul_add_c(a[2], b[1], acc0, acc1, acc2);
    mul_add_c(a[3], b[0], acc0, acc1, acc2);
    r[3] = acc0; acc0 = 0;

    /* column 4 */
    mul_add_c(a[4], b[0], acc1, acc2, acc0);
    mul_add_c(a[3], b[1], acc1, acc2, acc0);
    mul_add_c(a[2], b[2], acc1, acc2, acc0);
    mul_add_c(a[1], b[3], acc1, acc2, acc0);
    mul_add_c(a[0], b[4], acc1, acc2, acc0);
    r[4] = acc1; acc1 = 0;

    /* column 5 */
    mul_add_c(a[0], b[5], acc2, acc0, acc1);
    mul_add_c(a[1], b[4], acc2, acc0, acc1);
    mul_add_c(a[2], b[3], acc2, acc0, acc1);
    mul_add_c(a[3], b[2], acc2, acc0, acc1);
    mul_add_c(a[4], b[1], acc2, acc0, acc1);
    mul_add_c(a[5], b[0], acc2, acc0, acc1);
    r[5] = acc2; acc2 = 0;

    /* column 6 */
    mul_add_c(a[6], b[0], acc0, acc1, acc2);
    mul_add_c(a[5], b[1], acc0, acc1, acc2);
    mul_add_c(a[4], b[2], acc0, acc1, acc2);
    mul_add_c(a[3], b[3], acc0, acc1, acc2);
    mul_add_c(a[2], b[4], acc0, acc1, acc2);
    mul_add_c(a[1], b[5], acc0, acc1, acc2);
    mul_add_c(a[0], b[6], acc0, acc1, acc2);
    r[6] = acc0; acc0 = 0;

    /* column 7 */
    mul_add_c(a[0], b[7], acc1, acc2, acc0);
    mul_add_c(a[1], b[6], acc1, acc2, acc0);
    mul_add_c(a[2], b[5], acc1, acc2, acc0);
    mul_add_c(a[3], b[4], acc1, acc2, acc0);
    mul_add_c(a[4], b[3], acc1, acc2, acc0);
    mul_add_c(a[5], b[2], acc1, acc2, acc0);
    mul_add_c(a[6], b[1], acc1, acc2, acc0);
    mul_add_c(a[7], b[0], acc1, acc2, acc0);
    r[7] = acc1; acc1 = 0;

    /* column 8 */
    mul_add_c(a[7], b[1], acc2, acc0, acc1);
    mul_add_c(a[6], b[2], acc2, acc0, acc1);
    mul_add_c(a[5], b[3], acc2, acc0, acc1);
    mul_add_c(a[4], b[4], acc2, acc0, acc1);
    mul_add_c(a[3], b[5], acc2, acc0, acc1);
    mul_add_c(a[2], b[6], acc2, acc0, acc1);
    mul_add_c(a[1], b[7], acc2, acc0, acc1);
    r[8] = acc2; acc2 = 0;

    /* column 9 */
    mul_add_c(a[2], b[7], acc0, acc1, acc2);
    mul_add_c(a[3], b[6], acc0, acc1, acc2);
    mul_add_c(a[4], b[5], acc0, acc1, acc2);
    mul_add_c(a[5], b[4], acc0, acc1, acc2);
    mul_add_c(a[6], b[3], acc0, acc1, acc2);
    mul_add_c(a[7], b[2], acc0, acc1, acc2);
    r[9] = acc0; acc0 = 0;

    /* column 10 */
    mul_add_c(a[7], b[3], acc1, acc2, acc0);
    mul_add_c(a[6], b[4], acc1, acc2, acc0);
    mul_add_c(a[5], b[5], acc1, acc2, acc0);
    mul_add_c(a[4], b[6], acc1, acc2, acc0);
    mul_add_c(a[3], b[7], acc1, acc2, acc0);
    r[10] = acc1; acc1 = 0;

    /* column 11 */
    mul_add_c(a[4], b[7], acc2, acc0, acc1);
    mul_add_c(a[5], b[6], acc2, acc0, acc1);
    mul_add_c(a[6], b[5], acc2, acc0, acc1);
    mul_add_c(a[7], b[4], acc2, acc0, acc1);
    r[11] = acc2; acc2 = 0;

    /* column 12 */
    mul_add_c(a[7], b[5], acc0, acc1, acc2);
    mul_add_c(a[6], b[6], acc0, acc1, acc2);
    mul_add_c(a[5], b[7], acc0, acc1, acc2);
    r[12] = acc0; acc0 = 0;

    /* column 13 */
    mul_add_c(a[6], b[7], acc1, acc2, acc0);
    mul_add_c(a[7], b[6], acc1, acc2, acc0);
    r[13] = acc1; acc1 = 0;

    /* column 14, plus the final carry word */
    mul_add_c(a[7], b[7], acc2, acc0, acc1);
    r[14] = acc2;
    r[15] = acc0;
}
727
728
/*
 * Fully unrolled 4x4-word multiplication: r[0..7] = a[0..3] * b[0..3].
 *
 * Output word k is built by feeding every pair a[i], b[j] with i + j == k to
 * mul_add_c().  Three accumulators rotate roles from column to column: the
 * one holding the finished word is stored to r[k] and cleared, and the other
 * two carry over into the next column.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

    /* column 0 */
    mul_add_c(a[0], b[0], acc0, acc1, acc2);
    r[0] = acc0; acc0 = 0;

    /* column 1 */
    mul_add_c(a[0], b[1], acc1, acc2, acc0);
    mul_add_c(a[1], b[0], acc1, acc2, acc0);
    r[1] = acc1; acc1 = 0;

    /* column 2 */
    mul_add_c(a[2], b[0], acc2, acc0, acc1);
    mul_add_c(a[1], b[1], acc2, acc0, acc1);
    mul_add_c(a[0], b[2], acc2, acc0, acc1);
    r[2] = acc2; acc2 = 0;

    /* column 3 */
    mul_add_c(a[0], b[3], acc0, acc1, acc2);
    mul_add_c(a[1], b[2], acc0, acc1, acc2);
    mul_add_c(a[2], b[1], acc0, acc1, acc2);
    mul_add_c(a[3], b[0], acc0, acc1, acc2);
    r[3] = acc0; acc0 = 0;

    /* column 4 */
    mul_add_c(a[3], b[1], acc1, acc2, acc0);
    mul_add_c(a[2], b[2], acc1, acc2, acc0);
    mul_add_c(a[1], b[3], acc1, acc2, acc0);
    r[4] = acc1; acc1 = 0;

    /* column 5 */
    mul_add_c(a[2], b[3], acc2, acc0, acc1);
    mul_add_c(a[3], b[2], acc2, acc0, acc1);
    r[5] = acc2; acc2 = 0;

    /* column 6, plus the final carry word */
    mul_add_c(a[3], b[3], acc0, acc1, acc2);
    r[6] = acc0;
    r[7] = acc1;
}
766
767
/*
 * Fully unrolled 8-word squaring: r[0..15] = a[0..7]^2.
 *
 * Output word k collects a[i]*a[i] (when 2*i == k) via sqr_add_c() and each
 * cross product a[i]*a[j] with i + j == k, i > j, counted twice via
 * sqr_add_c2().  Three accumulators rotate roles from column to column: the
 * finished word is stored to r[k] and cleared, the other two carry over.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

    /* column 0 */
    sqr_add_c(a, 0, acc0, acc1, acc2);
    r[0] = acc0; acc0 = 0;

    /* column 1 */
    sqr_add_c2(a, 1, 0, acc1, acc2, acc0);
    r[1] = acc1; acc1 = 0;

    /* column 2 */
    sqr_add_c(a, 1, acc2, acc0, acc1);
    sqr_add_c2(a, 2, 0, acc2, acc0, acc1);
    r[2] = acc2; acc2 = 0;

    /* column 3 */
    sqr_add_c2(a, 3, 0, acc0, acc1, acc2);
    sqr_add_c2(a, 2, 1, acc0, acc1, acc2);
    r[3] = acc0; acc0 = 0;

    /* column 4 */
    sqr_add_c(a, 2, acc1, acc2, acc0);
    sqr_add_c2(a, 3, 1, acc1, acc2, acc0);
    sqr_add_c2(a, 4, 0, acc1, acc2, acc0);
    r[4] = acc1; acc1 = 0;

    /* column 5 */
    sqr_add_c2(a, 5, 0, acc2, acc0, acc1);
    sqr_add_c2(a, 4, 1, acc2, acc0, acc1);
    sqr_add_c2(a, 3, 2, acc2, acc0, acc1);
    r[5] = acc2; acc2 = 0;

    /* column 6 */
    sqr_add_c(a, 3, acc0, acc1, acc2);
    sqr_add_c2(a, 4, 2, acc0, acc1, acc2);
    sqr_add_c2(a, 5, 1, acc0, acc1, acc2);
    sqr_add_c2(a, 6, 0, acc0, acc1, acc2);
    r[6] = acc0; acc0 = 0;

    /* column 7 */
    sqr_add_c2(a, 7, 0, acc1, acc2, acc0);
    sqr_add_c2(a, 6, 1, acc1, acc2, acc0);
    sqr_add_c2(a, 5, 2, acc1, acc2, acc0);
    sqr_add_c2(a, 4, 3, acc1, acc2, acc0);
    r[7] = acc1; acc1 = 0;

    /* column 8 */
    sqr_add_c(a, 4, acc2, acc0, acc1);
    sqr_add_c2(a, 5, 3, acc2, acc0, acc1);
    sqr_add_c2(a, 6, 2, acc2, acc0, acc1);
    sqr_add_c2(a, 7, 1, acc2, acc0, acc1);
    r[8] = acc2; acc2 = 0;

    /* column 9 */
    sqr_add_c2(a, 7, 2, acc0, acc1, acc2);
    sqr_add_c2(a, 6, 3, acc0, acc1, acc2);
    sqr_add_c2(a, 5, 4, acc0, acc1, acc2);
    r[9] = acc0; acc0 = 0;

    /* column 10 */
    sqr_add_c(a, 5, acc1, acc2, acc0);
    sqr_add_c2(a, 6, 4, acc1, acc2, acc0);
    sqr_add_c2(a, 7, 3, acc1, acc2, acc0);
    r[10] = acc1; acc1 = 0;

    /* column 11 */
    sqr_add_c2(a, 7, 4, acc2, acc0, acc1);
    sqr_add_c2(a, 6, 5, acc2, acc0, acc1);
    r[11] = acc2; acc2 = 0;

    /* column 12 */
    sqr_add_c(a, 6, acc0, acc1, acc2);
    sqr_add_c2(a, 7, 5, acc0, acc1, acc2);
    r[12] = acc0; acc0 = 0;

    /* column 13 */
    sqr_add_c2(a, 7, 6, acc1, acc2, acc0);
    r[13] = acc1; acc1 = 0;

    /* column 14, plus the final carry word */
    sqr_add_c(a, 7, acc2, acc0, acc1);
    r[14] = acc2;
    r[15] = acc0;
}
841
842
/*
 * Fully unrolled 4-word squaring: r[0..7] = a[0..3]^2.
 *
 * Output word k collects a[i]*a[i] (when 2*i == k) via sqr_add_c() and each
 * cross product a[i]*a[j] with i + j == k, i > j, counted twice via
 * sqr_add_c2().  Three accumulators rotate roles from column to column: the
 * finished word is stored to r[k] and cleared, the other two carry over.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

    /* column 0 */
    sqr_add_c(a, 0, acc0, acc1, acc2);
    r[0] = acc0; acc0 = 0;

    /* column 1 */
    sqr_add_c2(a, 1, 0, acc1, acc2, acc0);
    r[1] = acc1; acc1 = 0;

    /* column 2 */
    sqr_add_c(a, 1, acc2, acc0, acc1);
    sqr_add_c2(a, 2, 0, acc2, acc0, acc1);
    r[2] = acc2; acc2 = 0;

    /* column 3 */
    sqr_add_c2(a, 3, 0, acc0, acc1, acc2);
    sqr_add_c2(a, 2, 1, acc0, acc1, acc2);
    r[3] = acc0; acc0 = 0;

    /* column 4 */
    sqr_add_c(a, 2, acc1, acc2, acc0);
    sqr_add_c2(a, 3, 1, acc1, acc2, acc0);
    r[4] = acc1; acc1 = 0;

    /* column 5 */
    sqr_add_c2(a, 3, 2, acc2, acc0, acc1);
    r[5] = acc2; acc2 = 0;

    /* column 6, plus the final carry word */
    sqr_add_c(a, 3, acc0, acc1, acc2);
    r[6] = acc0;
    r[7] = acc1;
}
874
875
#ifdef OPENSSL_NO_ASM
876
#ifdef OPENSSL_BN_ASM_MONT
877
#include <alloca.h>
878
/*
879
 * This is essentially reference implementation, which may or may not
880
 * result in performance improvement. E.g. on IA-32 this routine was
881
 * observed to give 40% faster rsa1024 private key operations and 10%
882
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
883
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
884
 * reference implementation, one to be used as starting point for
885
 * platform-specific assembler. Mentioned numbers apply to compiler
886
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
887
 * can vary not only from platform to platform, but even for compiler
888
 * versions. Assembler vs. assembler improvement coefficients can
889
 * [and are known to] differ and are to be documented elsewhere.
890
 */
891
/*
 * Reference C implementation of word-by-word Montgomery multiplication (see
 * the comment preceding this function for its performance caveats).
 *
 * rp receives num result words computed from ap, bp and the modulus np;
 * *n0p is the per-modulus constant used to derive each reduction factor
 * (presumably -1/np[0] modulo the word size; confirm against the caller).
 * The work buffer of num + 2 words is taken from the stack with alloca()
 * and is zeroed through a volatile pointer before returning.
 *
 * Always returns 1.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG carry, word, fac, *tp, n0;
#ifdef mul64
    BN_ULONG fac_hi;
#endif
    volatile BN_ULONG *wipe;
    int i, j;

#if 0 /* template for platform-specific implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#endif
    wipe = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    /*
     * First pass: plain multiply by bp[0], which also initialises the
     * work buffer.
     */
    carry = 0;
    fac = bp[0];
#ifdef mul64
    fac_hi = HBITS(fac);
    fac = LBITS(fac);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], fac, fac_hi, carry);
#else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], fac, carry);
#endif

    tp[num] = carry;
    tp[num + 1] = 0;

    for (i = 0;;) {
        /*
         * Reduction step: add a multiple of np chosen from tp[0] and n0,
         * storing the result shifted down by one word.
         */
        word = tp[0];
        fac = (word * n0) & BN_MASK2;
        carry = 0;
#ifdef mul64
        fac_hi = HBITS(fac);
        fac = LBITS(fac);
        mul_add(word, np[0], fac, fac_hi, carry);
#else
        mul_add(word, fac, np[0], carry);
#endif
        for (j = 1; j < num; j++) {
            word = tp[j];
#ifdef mul64
            mul_add(word, np[j], fac, fac_hi, carry);
#else
            mul_add(word, fac, np[j], carry);
#endif
            tp[j - 1] = word & BN_MASK2;
        }
        word = (tp[num] + carry) & BN_MASK2;
        tp[num - 1] = word;
        tp[num] = tp[num + 1] + (word < carry ? 1 : 0);

        if (++i >= num)
            break;

        /* Accumulate ap * bp[i] into the work buffer. */
        carry = 0;
        fac = bp[i];
#ifdef mul64
        fac_hi = HBITS(fac);
        fac = LBITS(fac);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], fac, fac_hi, carry);
#else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], fac, carry);
#endif
        word = (tp[num] + carry) & BN_MASK2;
        tp[num] = word;
        tp[num + 1] = (word < carry ? 1 : 0);
    }

    /*
     * Conditional final subtraction: if the top words suggest tp >= np,
     * compute rp = tp - np and keep it unless the subtraction borrowed
     * with no overflow word to absorb it.
     */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        carry = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || carry == 0) {
            for (i = 0; i < num + 2; i++)
                wipe[i] = 0;
            return 1;
        }
    }

    /* Otherwise the unreduced value is the result; copy it and wipe. */
    for (i = 0; i < num; i++) {
        rp[i] = tp[i];
        wipe[i] = 0;
    }
    wipe[num] = 0;
    wipe[num + 1] = 0;
    return 1;
}
980
#else
981
/*
982
 * Return value of 0 indicates that multiplication/convolution was not
983
 * performed to signal the caller to fall down to alternative/original
984
 * code-path.
985
 */
986
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
987
    const BN_ULONG *np, const BN_ULONG *n0, int num)
988
0
{
989
0
    return 0;
990
0
}
991
#endif /* OPENSSL_BN_ASM_MONT */
992
#endif
993
994
#else /* !BN_MUL_COMBA */
995
996
/* hmm... is it faster just to do a multiply? */
997
/*
 * Non-comba fallback: square the 4-word number a into r by delegating to
 * bn_sqr_normal() with an 8-word scratch buffer on the stack.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG scratch[8];

    bn_sqr_normal(r, a, 4, scratch);
}
1002
1003
/*
 * Non-comba fallback: square the 8-word number a into r by delegating to
 * bn_sqr_normal() with a 16-word scratch buffer on the stack.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG scratch[16];

    bn_sqr_normal(r, a, 8, scratch);
}
1008
1009
/*
 * Non-comba fallback for the 4x4-word multiply, r[0..7] = a[0..3] * b[0..3].
 * The first row is a plain multiply by b[0]; each following row i is
 * accumulated one word higher with bn_mul_add_words(), and the carry each
 * call returns becomes r[4 + i].
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    int row;

    r[4] = bn_mul_words(r, a, 4, b[0]);
    for (row = 1; row < 4; row++)
        r[4 + row] = bn_mul_add_words(r + row, a, 4, b[row]);
}
1016
1017
/*
 * Non-comba fallback for the 8x8-word multiply, r[0..15] = a[0..7] * b[0..7].
 * The first row is a plain multiply by b[0]; each following row i is
 * accumulated one word higher with bn_mul_add_words(), and the carry each
 * call returns becomes r[8 + i].
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    int row;

    r[8] = bn_mul_words(r, a, 8, b[0]);
    for (row = 1; row < 8; row++)
        r[8 + row] = bn_mul_add_words(r + row, a, 8, b[row]);
}
1028
1029
#ifdef OPENSSL_NO_ASM
1030
#ifdef OPENSSL_BN_ASM_MONT
1031
#include <alloca.h>
1032
/*
 * Reference C implementation of word-by-word Montgomery multiplication,
 * built on bn_mul_add_words() for builds without BN_MUL_COMBA.
 *
 * rp receives num result words computed from ap, bp and the modulus np;
 * *n0p is the per-modulus constant used to derive each reduction factor
 * (presumably -1/np[0] modulo the word size; confirm against the caller).
 * The work buffer of num + 2 words is taken from the stack with alloca()
 * and is zeroed through a volatile pointer before returning.
 *
 * Always returns 1.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG carry, top, *tp;
    BN_ULONG n0 = *n0p;
    volatile BN_ULONG *wipe;
    int i, j;

    wipe = tp = alloca((num + 2) * sizeof(BN_ULONG));

    /* Clear words 0..num; word num + 1 is written before it is first read. */
    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        /* tp += ap * bp[i], tracking overflow in the two extra words. */
        carry = bn_mul_add_words(tp, ap, num, bp[i]);
        top = (tp[num] + carry) & BN_MASK2;
        tp[num] = top;
        tp[num + 1] = (top < carry ? 1 : 0);

        /* tp += np * (tp[0] * n0): the reduction step. */
        carry = bn_mul_add_words(tp, np, num, tp[0] * n0);
        top = (tp[num] + carry) & BN_MASK2;
        tp[num] = top;
        tp[num + 1] += (top < carry ? 1 : 0);

        /* Shift the buffer down by one word. */
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    /*
     * Conditional final subtraction: if the top words suggest tp >= np,
     * compute rp = tp - np and keep it unless the subtraction borrowed
     * with no overflow word to absorb it.
     */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        carry = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || carry == 0) {
            for (i = 0; i < num + 2; i++)
                wipe[i] = 0;
            return 1;
        }
    }

    /* Otherwise the unreduced value is the result; copy it and wipe. */
    for (i = 0; i < num; i++) {
        rp[i] = tp[i];
        wipe[i] = 0;
    }
    wipe[num] = 0;
    wipe[num + 1] = 0;
    return 1;
}
1072
#else
1073
/*
 * Placeholder used when OPENSSL_BN_ASM_MONT is not defined.  It performs no
 * multiplication and always returns 0; all arguments are ignored.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    (void)rp;
    (void)ap;
    (void)bp;
    (void)np;
    (void)n0;
    (void)num;

    return 0;
}
1078
#endif /* OPENSSL_BN_ASM_MONT */
1079
#endif
1080
1081
#endif /* !BN_MUL_COMBA */