Coverage Report

Created: 2025-07-01 06:23

/src/irssi/subprojects/openssl-1.1.1l/crypto/bn/bn_asm.c

Line | Count | Source
   1 |   | /*
   2 |   |  * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 |   |  *
   4 |   |  * Licensed under the OpenSSL license (the "License").  You may not use
   5 |   |  * this file except in compliance with the License.  You can obtain a copy
   6 |   |  * in the file LICENSE in the source distribution or at
   7 |   |  * https://www.openssl.org/source/license.html
   8 |   |  */
   9 |   |
  10 |   | #include <assert.h>
  11 |   | #include <openssl/crypto.h>
  12 |   | #include "internal/cryptlib.h"
  13 |   | #include "bn_local.h"
  14 |   |
  15 |   | #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
  16 |   |
  17 |   | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  18 |   |                           BN_ULONG w)
  19 |   | {
  20 |   |     BN_ULONG c1 = 0;
  21 |   |
  22 |   |     assert(num >= 0);
  23 |   |     if (num <= 0)
  24 |   |         return c1;
  25 |   |
  26 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
  27 |   |     while (num & ~3) {
  28 |   |         mul_add(rp[0], ap[0], w, c1);
  29 |   |         mul_add(rp[1], ap[1], w, c1);
  30 |   |         mul_add(rp[2], ap[2], w, c1);
  31 |   |         mul_add(rp[3], ap[3], w, c1);
  32 |   |         ap += 4;
  33 |   |         rp += 4;
  34 |   |         num -= 4;
  35 |   |     }
  36 |   | # endif
  37 |   |     while (num) {
  38 |   |         mul_add(rp[0], ap[0], w, c1);
  39 |   |         ap++;
  40 |   |         rp++;
  41 |   |         num--;
  42 |   |     }
  43 |   |
  44 |   |     return c1;
  45 |   | }
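For readers skimming the report, the contract of bn_mul_add_words() can be restated in portable C. This is an illustrative sketch only (32-bit words standing in for BN_ULONG), not OpenSSL code:

#include <stdint.h>

/* rp[0..num-1] += ap[0..num-1] * w; the final carry word is returned.
 * The 64-bit intermediate cannot overflow: (2^32-1)^2 + 2*(2^32-1) < 2^64. */
static uint32_t mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                  int num, uint32_t w)
{
    uint64_t carry = 0;

    for (int i = 0; i < num; i++) {
        uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
        rp[i] = (uint32_t)t;
        carry = t >> 32;
    }
    return (uint32_t)carry;
}

The unrolled-by-four loop in the covered code above is only a scheduling optimization; it computes the same result as this straight loop.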
  46 |   |
  47 |   | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
  48 |   | {
  49 |   |     BN_ULONG c1 = 0;
  50 |   |
  51 |   |     assert(num >= 0);
  52 |   |     if (num <= 0)
  53 |   |         return c1;
  54 |   |
  55 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
  56 |   |     while (num & ~3) {
  57 |   |         mul(rp[0], ap[0], w, c1);
  58 |   |         mul(rp[1], ap[1], w, c1);
  59 |   |         mul(rp[2], ap[2], w, c1);
  60 |   |         mul(rp[3], ap[3], w, c1);
  61 |   |         ap += 4;
  62 |   |         rp += 4;
  63 |   |         num -= 4;
  64 |   |     }
  65 |   | # endif
  66 |   |     while (num) {
  67 |   |         mul(rp[0], ap[0], w, c1);
  68 |   |         ap++;
  69 |   |         rp++;
  70 |   |         num--;
  71 |   |     }
  72 |   |     return c1;
  73 |   | }
  74 |   |
  75 |   | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
  76 |   | {
  77 |   |     assert(n >= 0);
  78 |   |     if (n <= 0)
  79 |   |         return;
  80 |   |
  81 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
  82 |   |     while (n & ~3) {
  83 |   |         sqr(r[0], r[1], a[0]);
  84 |   |         sqr(r[2], r[3], a[1]);
  85 |   |         sqr(r[4], r[5], a[2]);
  86 |   |         sqr(r[6], r[7], a[3]);
  87 |   |         a += 4;
  88 |   |         r += 8;
  89 |   |         n -= 4;
  90 |   |     }
  91 |   | # endif
  92 |   |     while (n) {
  93 |   |         sqr(r[0], r[1], a[0]);
  94 |   |         a++;
  95 |   |         r += 2;
  96 |   |         n--;
  97 |   |     }
  98 |   | }
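Similarly, bn_sqr_words() writes the double-word square of each input word. A minimal restatement, again with 32-bit stand-in words and purely illustrative:

#include <stdint.h>

/* For i in [0, n): r[2*i] gets the low word of a[i]^2, r[2*i+1] the high word,
 * so r must have room for 2*n words. */
static void sqr_words_ref(uint32_t *r, const uint32_t *a, int n)
{
    for (int i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] * a[i];
        r[2 * i] = (uint32_t)t;
        r[2 * i + 1] = (uint32_t)(t >> 32);
    }
}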
  99 |   |
 100 |   | #else                           /* !(defined(BN_LLONG) ||
 101 |   |                                  * defined(BN_UMULT_HIGH)) */
 102 |   |
 103 |   | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 104 |   |                           BN_ULONG w)
 105 | 0 | {
 106 | 0 |     BN_ULONG c = 0;
 107 | 0 |     BN_ULONG bl, bh;
 108 |   |
 109 | 0 |     assert(num >= 0);
 110 | 0 |     if (num <= 0)
 111 | 0 |         return (BN_ULONG)0;
 112 |   |
 113 | 0 |     bl = LBITS(w);
 114 | 0 |     bh = HBITS(w);
 115 |   |
 116 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 117 | 0 |     while (num & ~3) {
 118 | 0 |         mul_add(rp[0], ap[0], bl, bh, c);
 119 | 0 |         mul_add(rp[1], ap[1], bl, bh, c);
 120 | 0 |         mul_add(rp[2], ap[2], bl, bh, c);
 121 | 0 |         mul_add(rp[3], ap[3], bl, bh, c);
 122 | 0 |         ap += 4;
 123 | 0 |         rp += 4;
 124 | 0 |         num -= 4;
 125 | 0 |     }
 126 | 0 | # endif
 127 | 0 |     while (num) {
 128 | 0 |         mul_add(rp[0], ap[0], bl, bh, c);
 129 | 0 |         ap++;
 130 | 0 |         rp++;
 131 | 0 |         num--;
 132 | 0 |     }
 133 | 0 |     return c;
 134 | 0 | }
 135 |   |
 136 |   | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 137 | 0 | {
 138 | 0 |     BN_ULONG carry = 0;
 139 | 0 |     BN_ULONG bl, bh;
 140 |   |
 141 | 0 |     assert(num >= 0);
 142 | 0 |     if (num <= 0)
 143 | 0 |         return (BN_ULONG)0;
 144 |   |
 145 | 0 |     bl = LBITS(w);
 146 | 0 |     bh = HBITS(w);
 147 |   |
 148 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 149 | 0 |     while (num & ~3) {
 150 | 0 |         mul(rp[0], ap[0], bl, bh, carry);
 151 | 0 |         mul(rp[1], ap[1], bl, bh, carry);
 152 | 0 |         mul(rp[2], ap[2], bl, bh, carry);
 153 | 0 |         mul(rp[3], ap[3], bl, bh, carry);
 154 | 0 |         ap += 4;
 155 | 0 |         rp += 4;
 156 | 0 |         num -= 4;
 157 | 0 |     }
 158 | 0 | # endif
 159 | 0 |     while (num) {
 160 | 0 |         mul(rp[0], ap[0], bl, bh, carry);
 161 | 0 |         ap++;
 162 | 0 |         rp++;
 163 | 0 |         num--;
 164 | 0 |     }
 165 | 0 |     return carry;
 166 | 0 | }
 167 |   |
 168 |   | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 169 | 0 | {
 170 | 0 |     assert(n >= 0);
 171 | 0 |     if (n <= 0)
 172 | 0 |         return;
 173 |   |
 174 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 175 | 0 |     while (n & ~3) {
 176 | 0 |         sqr64(r[0], r[1], a[0]);
 177 | 0 |         sqr64(r[2], r[3], a[1]);
 178 | 0 |         sqr64(r[4], r[5], a[2]);
 179 | 0 |         sqr64(r[6], r[7], a[3]);
 180 | 0 |         a += 4;
 181 | 0 |         r += 8;
 182 | 0 |         n -= 4;
 183 | 0 |     }
 184 | 0 | # endif
 185 | 0 |     while (n) {
 186 | 0 |         sqr64(r[0], r[1], a[0]);
 187 | 0 |         a++;
 188 | 0 |         r += 2;
 189 | 0 |         n--;
 190 | 0 |     }
 191 | 0 | }
 192 |   |
 193 |   | #endif                          /* !(defined(BN_LLONG) ||
 194 |   |                                  * defined(BN_UMULT_HIGH)) */
 195 |   |
 196 |   | #if defined(BN_LLONG) && defined(BN_DIV2W)
 197 |   |
 198 |   | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 199 |   | {
 200 |   |     return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
 201 |   | }
 202 |   |
 203 |   | #else
 204 |   |
 205 |   | /* Divide h,l by d and return the result. */
 206 |   | /* I need to test this some more :-( */
 207 |   | BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 208 | 0 | {
 209 | 0 |     BN_ULONG dh, dl, q, ret = 0, th, tl, t;
 210 | 0 |     int i, count = 2;
 211 |   |
 212 | 0 |     if (d == 0)
 213 | 0 |         return BN_MASK2;
 214 |   |
 215 | 0 |     i = BN_num_bits_word(d);
 216 | 0 |     assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
 217 |   |
 218 | 0 |     i = BN_BITS2 - i;
 219 | 0 |     if (h >= d)
 220 | 0 |         h -= d;
 221 |   |
 222 | 0 |     if (i) {
 223 | 0 |         d <<= i;
 224 | 0 |         h = (h << i) | (l >> (BN_BITS2 - i));
 225 | 0 |         l <<= i;
 226 | 0 |     }
 227 | 0 |     dh = (d & BN_MASK2h) >> BN_BITS4;
 228 | 0 |     dl = (d & BN_MASK2l);
 229 | 0 |     for (;;) {
 230 | 0 |         if ((h >> BN_BITS4) == dh)
 231 | 0 |             q = BN_MASK2l;
 232 | 0 |         else
 233 | 0 |             q = h / dh;
 234 |   |
 235 | 0 |         th = q * dh;
 236 | 0 |         tl = dl * q;
 237 | 0 |         for (;;) {
 238 | 0 |             t = h - th;
 239 | 0 |             if ((t & BN_MASK2h) ||
 240 | 0 |                 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
 241 | 0 |                 break;
 242 | 0 |             q--;
 243 | 0 |             th -= dh;
 244 | 0 |             tl -= dl;
 245 | 0 |         }
 246 | 0 |         t = (tl >> BN_BITS4);
 247 | 0 |         tl = (tl << BN_BITS4) & BN_MASK2h;
 248 | 0 |         th += t;
 249 |   |
 250 | 0 |         if (l < tl)
 251 | 0 |             th++;
 252 | 0 |         l -= tl;
 253 | 0 |         if (h < th) {
 254 | 0 |             h += d;
 255 | 0 |             q--;
 256 | 0 |         }
 257 | 0 |         h -= th;
 258 |   |
 259 | 0 |         if (--count == 0)
 260 | 0 |             break;
 261 |   |
 262 | 0 |         ret = q << BN_BITS4;
 263 | 0 |         h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
 264 | 0 |         l = (l & BN_MASK2l) << BN_BITS4;
 265 | 0 |     }
 266 | 0 |     ret |= q;
 267 | 0 |     return ret;
 268 | 0 | }
 269 |   | #endif                          /* !defined(BN_LLONG) && defined(BN_DIV2W) */
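The fallback bn_div_words() above computes the single-word quotient of the double word (h,l) divided by d. A small self-contained check of that contract, assuming a 64-bit BN_ULONG and a compiler that provides unsigned __int128 (both assumptions on my part, not stated in the report):

#include <stdint.h>
#include <stdio.h>

/* Reference quotient: floor(((h << 64) | l) / d).  The result only fits in a
 * single 64-bit word when h < d, which is the caller's usual precondition. */
static uint64_t div_words_ref(uint64_t h, uint64_t l, uint64_t d)
{
    unsigned __int128 n = ((unsigned __int128)h << 64) | l;
    return (uint64_t)(n / d);
}

int main(void)
{
    uint64_t h = 0x1, l = 0x23456789abcdef00ULL, d = 0x1000000000ULL;
    /* bn_div_words(h, l, d) is expected to return the same quotient. */
    printf("%llx\n", (unsigned long long)div_words_ref(h, l, d));
    return 0;
}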
 270 |   |
 271 |   | #ifdef BN_LLONG
 272 |   | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 273 |   |                       int n)
 274 |   | {
 275 |   |     BN_ULLONG ll = 0;
 276 |   |
 277 |   |     assert(n >= 0);
 278 |   |     if (n <= 0)
 279 |   |         return (BN_ULONG)0;
 280 |   |
 281 |   | # ifndef OPENSSL_SMALL_FOOTPRINT
 282 |   |     while (n & ~3) {
 283 |   |         ll += (BN_ULLONG) a[0] + b[0];
 284 |   |         r[0] = (BN_ULONG)ll & BN_MASK2;
 285 |   |         ll >>= BN_BITS2;
 286 |   |         ll += (BN_ULLONG) a[1] + b[1];
 287 |   |         r[1] = (BN_ULONG)ll & BN_MASK2;
 288 |   |         ll >>= BN_BITS2;
 289 |   |         ll += (BN_ULLONG) a[2] + b[2];
 290 |   |         r[2] = (BN_ULONG)ll & BN_MASK2;
 291 |   |         ll >>= BN_BITS2;
 292 |   |         ll += (BN_ULLONG) a[3] + b[3];
 293 |   |         r[3] = (BN_ULONG)ll & BN_MASK2;
 294 |   |         ll >>= BN_BITS2;
 295 |   |         a += 4;
 296 |   |         b += 4;
 297 |   |         r += 4;
 298 |   |         n -= 4;
 299 |   |     }
 300 |   | # endif
 301 |   |     while (n) {
 302 |   |         ll += (BN_ULLONG) a[0] + b[0];
 303 |   |         r[0] = (BN_ULONG)ll & BN_MASK2;
 304 |   |         ll >>= BN_BITS2;
 305 |   |         a++;
 306 |   |         b++;
 307 |   |         r++;
 308 |   |         n--;
 309 |   |     }
 310 |   |     return (BN_ULONG)ll;
 311 |   | }
 312 |   | #else                           /* !BN_LLONG */
 313 |   | BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 314 |   |                       int n)
 315 | 0 | {
 316 | 0 |     BN_ULONG c, l, t;
 317 |   |
 318 | 0 |     assert(n >= 0);
 319 | 0 |     if (n <= 0)
 320 | 0 |         return (BN_ULONG)0;
 321 |   |
 322 | 0 |     c = 0;
 323 | 0 | # ifndef OPENSSL_SMALL_FOOTPRINT
 324 | 0 |     while (n & ~3) {
 325 | 0 |         t = a[0];
 326 | 0 |         t = (t + c) & BN_MASK2;
 327 | 0 |         c = (t < c);
 328 | 0 |         l = (t + b[0]) & BN_MASK2;
 329 | 0 |         c += (l < t);
 330 | 0 |         r[0] = l;
 331 | 0 |         t = a[1];
 332 | 0 |         t = (t + c) & BN_MASK2;
 333 | 0 |         c = (t < c);
 334 | 0 |         l = (t + b[1]) & BN_MASK2;
 335 | 0 |         c += (l < t);
 336 | 0 |         r[1] = l;
 337 | 0 |         t = a[2];
 338 | 0 |         t = (t + c) & BN_MASK2;
 339 | 0 |         c = (t < c);
 340 | 0 |         l = (t + b[2]) & BN_MASK2;
 341 | 0 |         c += (l < t);
 342 | 0 |         r[2] = l;
 343 | 0 |         t = a[3];
 344 | 0 |         t = (t + c) & BN_MASK2;
 345 | 0 |         c = (t < c);
 346 | 0 |         l = (t + b[3]) & BN_MASK2;
 347 | 0 |         c += (l < t);
 348 | 0 |         r[3] = l;
 349 | 0 |         a += 4;
 350 | 0 |         b += 4;
 351 | 0 |         r += 4;
 352 | 0 |         n -= 4;
 353 | 0 |     }
 354 | 0 | # endif
 355 | 0 |     while (n) {
 356 | 0 |         t = a[0];
 357 | 0 |         t = (t + c) & BN_MASK2;
 358 | 0 |         c = (t < c);
 359 | 0 |         l = (t + b[0]) & BN_MASK2;
 360 | 0 |         c += (l < t);
 361 | 0 |         r[0] = l;
 362 | 0 |         a++;
 363 | 0 |         b++;
 364 | 0 |         r++;
 365 | 0 |         n--;
 366 | 0 |     }
 367 | 0 |     return (BN_ULONG)c;
 368 | 0 | }
 369 |   | #endif                          /* !BN_LLONG */
 370 |   |
 371 |   | BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 372 |   |                       int n)
 373 | 0 | {
 374 | 0 |     BN_ULONG t1, t2;
 375 | 0 |     int c = 0;
 376 |   |
 377 | 0 |     assert(n >= 0);
 378 | 0 |     if (n <= 0)
 379 | 0 |         return (BN_ULONG)0;
 380 |   |
 381 | 0 | #ifndef OPENSSL_SMALL_FOOTPRINT
 382 | 0 |     while (n & ~3) {
 383 | 0 |         t1 = a[0];
 384 | 0 |         t2 = b[0];
 385 | 0 |         r[0] = (t1 - t2 - c) & BN_MASK2;
 386 | 0 |         if (t1 != t2)
 387 | 0 |             c = (t1 < t2);
 388 | 0 |         t1 = a[1];
 389 | 0 |         t2 = b[1];
 390 | 0 |         r[1] = (t1 - t2 - c) & BN_MASK2;
 391 | 0 |         if (t1 != t2)
 392 | 0 |             c = (t1 < t2);
 393 | 0 |         t1 = a[2];
 394 | 0 |         t2 = b[2];
 395 | 0 |         r[2] = (t1 - t2 - c) & BN_MASK2;
 396 | 0 |         if (t1 != t2)
 397 | 0 |             c = (t1 < t2);
 398 | 0 |         t1 = a[3];
 399 | 0 |         t2 = b[3];
 400 | 0 |         r[3] = (t1 - t2 - c) & BN_MASK2;
 401 | 0 |         if (t1 != t2)
 402 | 0 |             c = (t1 < t2);
 403 | 0 |         a += 4;
 404 | 0 |         b += 4;
 405 | 0 |         r += 4;
 406 | 0 |         n -= 4;
 407 | 0 |     }
 408 | 0 | #endif
 409 | 0 |     while (n) {
 410 | 0 |         t1 = a[0];
 411 | 0 |         t2 = b[0];
 412 | 0 |         r[0] = (t1 - t2 - c) & BN_MASK2;
 413 | 0 |         if (t1 != t2)
 414 | 0 |             c = (t1 < t2);
 415 | 0 |         a++;
 416 | 0 |         b++;
 417 | 0 |         r++;
 418 | 0 |         n--;
 419 | 0 |     }
 420 | 0 |     return c;
 421 | 0 | }
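One subtlety in bn_sub_words() worth spelling out: the borrow is updated only when t1 != t2, because equal words simply propagate the incoming borrow. A single-step illustration (32-bit stand-in words, not the OpenSSL code):

#include <stdint.h>

/* Returns the borrow out of (t1 - t2 - c), using the same rule as the loop
 * above: if t1 == t2 the result is 0 - c, so the borrow out equals the borrow
 * in and c can be left untouched; otherwise it is simply (t1 < t2). */
static uint32_t sub_with_borrow(uint32_t t1, uint32_t t2, uint32_t c,
                                uint32_t *out)
{
    *out = t1 - t2 - c;          /* wraps mod 2^32, like "& BN_MASK2" */
    if (t1 != t2)
        c = (t1 < t2);
    return c;
}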
 422 |   |
 423 |   | #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 424 |   |
 425 |   | # undef bn_mul_comba8
 426 |   | # undef bn_mul_comba4
 427 |   | # undef bn_sqr_comba8
 428 |   | # undef bn_sqr_comba4
 429 |   |
 430 |   | /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 431 |   | /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 432 |   | /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
 433 |   | /*
 434 |   |  * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 435 |   |  * c=(c2,c1,c0)
 436 |   |  */
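The comment above describes a three-word accumulator c = (c2,c1,c0). As a concrete illustration of what mul_add_c() does (this is not the OpenSSL macro itself; 32-bit words and a 64-bit product are assumed):

#include <stdint.h>

/* c += a*b, where c is the three-word value (c2,c1,c0).  Adding c0 to the
 * 64-bit product cannot overflow, because the product's high half is at most
 * 2^32 - 2, which is exactly the "cannot be all-ones" remark below. */
static void mul_add_c_demo(uint32_t a, uint32_t b,
                           uint32_t *c0, uint32_t *c1, uint32_t *c2)
{
    uint64_t t = (uint64_t)a * b + *c0;
    uint32_t hi = (uint32_t)(t >> 32);

    *c0 = (uint32_t)t;
    *c1 += hi;
    if (*c1 < hi)               /* carry out of c1 ripples into c2 */
        (*c2)++;
}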
 437 |   |
 438 |   | # ifdef BN_LLONG
 439 |   | /*
 440 |   |  * Keep in mind that additions to multiplication result can not
 441 |   |  * overflow, because its high half cannot be all-ones.
 442 |   |  */
 443 |   | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 444 |   |         BN_ULONG hi;                            \
 445 |   |         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 446 |   |         t += c0;                /* no carry */  \
 447 |   |         c0 = (BN_ULONG)Lw(t);                   \
 448 |   |         hi = (BN_ULONG)Hw(t);                   \
 449 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 450 |   |         } while(0)
 451 |   |
 452 |   | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 453 |   |         BN_ULONG hi;                            \
 454 |   |         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
 455 |   |         BN_ULLONG tt = t+c0;    /* no carry */  \
 456 |   |         c0 = (BN_ULONG)Lw(tt);                  \
 457 |   |         hi = (BN_ULONG)Hw(tt);                  \
 458 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 459 |   |         t += c0;                /* no carry */  \
 460 |   |         c0 = (BN_ULONG)Lw(t);                   \
 461 |   |         hi = (BN_ULONG)Hw(t);                   \
 462 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 463 |   |         } while(0)
 464 |   |
 465 |   | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 466 |   |         BN_ULONG hi;                            \
 467 |   |         BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
 468 |   |         t += c0;                /* no carry */  \
 469 |   |         c0 = (BN_ULONG)Lw(t);                   \
 470 |   |         hi = (BN_ULONG)Hw(t);                   \
 471 |   |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 472 |   |         } while(0)
 473 |   |
 474 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 475 |   |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 476 |   |
 477 |   | # elif defined(BN_UMULT_LOHI)
 478 |   | /*
 479 |   |  * Keep in mind that additions to hi can not overflow, because
 480 |   |  * the high word of a multiplication result cannot be all-ones.
 481 |   |  */
 482 |   | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 483 |   |         BN_ULONG ta = (a), tb = (b);            \
 484 |   |         BN_ULONG lo, hi;                        \
 485 |   |         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 486 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 487 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 488 |   |         } while(0)
 489 |   |
 490 |   | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 491 |   |         BN_ULONG ta = (a), tb = (b);            \
 492 |   |         BN_ULONG lo, hi, tt;                    \
 493 |   |         BN_UMULT_LOHI(lo,hi,ta,tb);             \
 494 |   |         c0 += lo; tt = hi+((c0<lo)?1:0);        \
 495 |   |         c1 += tt; c2 += (c1<tt)?1:0;            \
 496 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 497 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 498 |   |         } while(0)
 499 |   |
 500 |   | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 501 |   |         BN_ULONG ta = (a)[i];                   \
 502 |   |         BN_ULONG lo, hi;                        \
 503 |   |         BN_UMULT_LOHI(lo,hi,ta,ta);             \
 504 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 505 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 506 |   |         } while(0)
 507 |   |
 508 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2)    \
 509 |   |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 510 |   |
 511 |   | # elif defined(BN_UMULT_HIGH)
 512 |   | /*
 513 |   |  * Keep in mind that additions to hi can not overflow, because
 514 |   |  * the high word of a multiplication result cannot be all-ones.
 515 |   |  */
 516 |   | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 517 |   |         BN_ULONG ta = (a), tb = (b);            \
 518 |   |         BN_ULONG lo = ta * tb;                  \
 519 |   |         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 520 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 521 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 522 |   |         } while(0)
 523 |   |
 524 |   | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 525 |   |         BN_ULONG ta = (a), tb = (b), tt;        \
 526 |   |         BN_ULONG lo = ta * tb;                  \
 527 |   |         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
 528 |   |         c0 += lo; tt = hi + ((c0<lo)?1:0);      \
 529 |   |         c1 += tt; c2 += (c1<tt)?1:0;            \
 530 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 531 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 532 |   |         } while(0)
 533 |   |
 534 |   | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 535 |   |         BN_ULONG ta = (a)[i];                   \
 536 |   |         BN_ULONG lo = ta * ta;                  \
 537 |   |         BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
 538 |   |         c0 += lo; hi += (c0<lo)?1:0;            \
 539 |   |         c1 += hi; c2 += (c1<hi)?1:0;            \
 540 |   |         } while(0)
 541 |   |
 542 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2)      \
 543 |   |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 544 |   |
 545 |   | # else                          /* !BN_LLONG */
 546 |   | /*
 547 |   |  * Keep in mind that additions to hi can not overflow, because
 548 |   |  * the high word of a multiplication result cannot be all-ones.
 549 |   |  */
 550 | 0 | #  define mul_add_c(a,b,c0,c1,c2)       do {    \
 551 | 0 |         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 552 | 0 |         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 553 | 0 |         mul64(lo,hi,bl,bh);                     \
 554 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 555 | 0 |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 556 | 0 |         } while(0)
 557 |   |
 558 | 0 | #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
 559 | 0 |         BN_ULONG tt;                            \
 560 | 0 |         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
 561 | 0 |         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
 562 | 0 |         mul64(lo,hi,bl,bh);                     \
 563 | 0 |         tt = hi;                                \
 564 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
 565 | 0 |         c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
 566 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 567 | 0 |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 568 | 0 |         } while(0)
 569 |   |
 570 | 0 | #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
 571 | 0 |         BN_ULONG lo, hi;                        \
 572 | 0 |         sqr64(lo,hi,(a)[i]);                    \
 573 | 0 |         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
 574 | 0 |         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
 575 | 0 |         } while(0)
 576 |   |
 577 |   | #  define sqr_add_c2(a,i,j,c0,c1,c2) \
 578 | 0 |         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 579 |   | # endif                         /* !BN_LLONG */
 580 |   |
 581 |   | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 582 | 0 | {
 583 | 0 |     BN_ULONG c1, c2, c3;
 584 |   |
 585 | 0 |     c1 = 0;
 586 | 0 |     c2 = 0;
 587 | 0 |     c3 = 0;
 588 | 0 |     mul_add_c(a[0], b[0], c1, c2, c3);
 589 | 0 |     r[0] = c1;
 590 | 0 |     c1 = 0;
 591 | 0 |     mul_add_c(a[0], b[1], c2, c3, c1);
 592 | 0 |     mul_add_c(a[1], b[0], c2, c3, c1);
 593 | 0 |     r[1] = c2;
 594 | 0 |     c2 = 0;
 595 | 0 |     mul_add_c(a[2], b[0], c3, c1, c2);
 596 | 0 |     mul_add_c(a[1], b[1], c3, c1, c2);
 597 | 0 |     mul_add_c(a[0], b[2], c3, c1, c2);
 598 | 0 |     r[2] = c3;
 599 | 0 |     c3 = 0;
 600 | 0 |     mul_add_c(a[0], b[3], c1, c2, c3);
 601 | 0 |     mul_add_c(a[1], b[2], c1, c2, c3);
 602 | 0 |     mul_add_c(a[2], b[1], c1, c2, c3);
 603 | 0 |     mul_add_c(a[3], b[0], c1, c2, c3);
 604 | 0 |     r[3] = c1;
 605 | 0 |     c1 = 0;
 606 | 0 |     mul_add_c(a[4], b[0], c2, c3, c1);
 607 | 0 |     mul_add_c(a[3], b[1], c2, c3, c1);
 608 | 0 |     mul_add_c(a[2], b[2], c2, c3, c1);
 609 | 0 |     mul_add_c(a[1], b[3], c2, c3, c1);
 610 | 0 |     mul_add_c(a[0], b[4], c2, c3, c1);
 611 | 0 |     r[4] = c2;
 612 | 0 |     c2 = 0;
 613 | 0 |     mul_add_c(a[0], b[5], c3, c1, c2);
 614 | 0 |     mul_add_c(a[1], b[4], c3, c1, c2);
 615 | 0 |     mul_add_c(a[2], b[3], c3, c1, c2);
 616 | 0 |     mul_add_c(a[3], b[2], c3, c1, c2);
 617 | 0 |     mul_add_c(a[4], b[1], c3, c1, c2);
 618 | 0 |     mul_add_c(a[5], b[0], c3, c1, c2);
 619 | 0 |     r[5] = c3;
 620 | 0 |     c3 = 0;
 621 | 0 |     mul_add_c(a[6], b[0], c1, c2, c3);
 622 | 0 |     mul_add_c(a[5], b[1], c1, c2, c3);
 623 | 0 |     mul_add_c(a[4], b[2], c1, c2, c3);
 624 | 0 |     mul_add_c(a[3], b[3], c1, c2, c3);
 625 | 0 |     mul_add_c(a[2], b[4], c1, c2, c3);
 626 | 0 |     mul_add_c(a[1], b[5], c1, c2, c3);
 627 | 0 |     mul_add_c(a[0], b[6], c1, c2, c3);
 628 | 0 |     r[6] = c1;
 629 | 0 |     c1 = 0;
 630 | 0 |     mul_add_c(a[0], b[7], c2, c3, c1);
 631 | 0 |     mul_add_c(a[1], b[6], c2, c3, c1);
 632 | 0 |     mul_add_c(a[2], b[5], c2, c3, c1);
 633 | 0 |     mul_add_c(a[3], b[4], c2, c3, c1);
 634 | 0 |     mul_add_c(a[4], b[3], c2, c3, c1);
 635 | 0 |     mul_add_c(a[5], b[2], c2, c3, c1);
 636 | 0 |     mul_add_c(a[6], b[1], c2, c3, c1);
 637 | 0 |     mul_add_c(a[7], b[0], c2, c3, c1);
 638 | 0 |     r[7] = c2;
 639 | 0 |     c2 = 0;
 640 | 0 |     mul_add_c(a[7], b[1], c3, c1, c2);
 641 | 0 |     mul_add_c(a[6], b[2], c3, c1, c2);
 642 | 0 |     mul_add_c(a[5], b[3], c3, c1, c2);
 643 | 0 |     mul_add_c(a[4], b[4], c3, c1, c2);
 644 | 0 |     mul_add_c(a[3], b[5], c3, c1, c2);
 645 | 0 |     mul_add_c(a[2], b[6], c3, c1, c2);
 646 | 0 |     mul_add_c(a[1], b[7], c3, c1, c2);
 647 | 0 |     r[8] = c3;
 648 | 0 |     c3 = 0;
 649 | 0 |     mul_add_c(a[2], b[7], c1, c2, c3);
 650 | 0 |     mul_add_c(a[3], b[6], c1, c2, c3);
 651 | 0 |     mul_add_c(a[4], b[5], c1, c2, c3);
 652 | 0 |     mul_add_c(a[5], b[4], c1, c2, c3);
 653 | 0 |     mul_add_c(a[6], b[3], c1, c2, c3);
 654 | 0 |     mul_add_c(a[7], b[2], c1, c2, c3);
 655 | 0 |     r[9] = c1;
 656 | 0 |     c1 = 0;
 657 | 0 |     mul_add_c(a[7], b[3], c2, c3, c1);
 658 | 0 |     mul_add_c(a[6], b[4], c2, c3, c1);
 659 | 0 |     mul_add_c(a[5], b[5], c2, c3, c1);
 660 | 0 |     mul_add_c(a[4], b[6], c2, c3, c1);
 661 | 0 |     mul_add_c(a[3], b[7], c2, c3, c1);
 662 | 0 |     r[10] = c2;
 663 | 0 |     c2 = 0;
 664 | 0 |     mul_add_c(a[4], b[7], c3, c1, c2);
 665 | 0 |     mul_add_c(a[5], b[6], c3, c1, c2);
 666 | 0 |     mul_add_c(a[6], b[5], c3, c1, c2);
 667 | 0 |     mul_add_c(a[7], b[4], c3, c1, c2);
 668 | 0 |     r[11] = c3;
 669 | 0 |     c3 = 0;
 670 | 0 |     mul_add_c(a[7], b[5], c1, c2, c3);
 671 | 0 |     mul_add_c(a[6], b[6], c1, c2, c3);
 672 | 0 |     mul_add_c(a[5], b[7], c1, c2, c3);
 673 | 0 |     r[12] = c1;
 674 | 0 |     c1 = 0;
 675 | 0 |     mul_add_c(a[6], b[7], c2, c3, c1);
 676 | 0 |     mul_add_c(a[7], b[6], c2, c3, c1);
 677 | 0 |     r[13] = c2;
 678 | 0 |     c2 = 0;
 679 | 0 |     mul_add_c(a[7], b[7], c3, c1, c2);
 680 | 0 |     r[14] = c3;
 681 | 0 |     r[15] = c1;
 682 | 0 | }
 683 |   |
 684 |   | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 685 | 0 | {
 686 | 0 |     BN_ULONG c1, c2, c3;
 687 |   |
 688 | 0 |     c1 = 0;
 689 | 0 |     c2 = 0;
 690 | 0 |     c3 = 0;
 691 | 0 |     mul_add_c(a[0], b[0], c1, c2, c3);
 692 | 0 |     r[0] = c1;
 693 | 0 |     c1 = 0;
 694 | 0 |     mul_add_c(a[0], b[1], c2, c3, c1);
 695 | 0 |     mul_add_c(a[1], b[0], c2, c3, c1);
 696 | 0 |     r[1] = c2;
 697 | 0 |     c2 = 0;
 698 | 0 |     mul_add_c(a[2], b[0], c3, c1, c2);
 699 | 0 |     mul_add_c(a[1], b[1], c3, c1, c2);
 700 | 0 |     mul_add_c(a[0], b[2], c3, c1, c2);
 701 | 0 |     r[2] = c3;
 702 | 0 |     c3 = 0;
 703 | 0 |     mul_add_c(a[0], b[3], c1, c2, c3);
 704 | 0 |     mul_add_c(a[1], b[2], c1, c2, c3);
 705 | 0 |     mul_add_c(a[2], b[1], c1, c2, c3);
 706 | 0 |     mul_add_c(a[3], b[0], c1, c2, c3);
 707 | 0 |     r[3] = c1;
 708 | 0 |     c1 = 0;
 709 | 0 |     mul_add_c(a[3], b[1], c2, c3, c1);
 710 | 0 |     mul_add_c(a[2], b[2], c2, c3, c1);
 711 | 0 |     mul_add_c(a[1], b[3], c2, c3, c1);
 712 | 0 |     r[4] = c2;
 713 | 0 |     c2 = 0;
 714 | 0 |     mul_add_c(a[2], b[3], c3, c1, c2);
 715 | 0 |     mul_add_c(a[3], b[2], c3, c1, c2);
 716 | 0 |     r[5] = c3;
 717 | 0 |     c3 = 0;
 718 | 0 |     mul_add_c(a[3], b[3], c1, c2, c3);
 719 | 0 |     r[6] = c1;
 720 | 0 |     r[7] = c2;
 721 | 0 | }
 722 |   |
 723 |   | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 724 | 0 | {
 725 | 0 |     BN_ULONG c1, c2, c3;
 726 |   |
 727 | 0 |     c1 = 0;
 728 | 0 |     c2 = 0;
 729 | 0 |     c3 = 0;
 730 | 0 |     sqr_add_c(a, 0, c1, c2, c3);
 731 | 0 |     r[0] = c1;
 732 | 0 |     c1 = 0;
 733 | 0 |     sqr_add_c2(a, 1, 0, c2, c3, c1);
 734 | 0 |     r[1] = c2;
 735 | 0 |     c2 = 0;
 736 | 0 |     sqr_add_c(a, 1, c3, c1, c2);
 737 | 0 |     sqr_add_c2(a, 2, 0, c3, c1, c2);
 738 | 0 |     r[2] = c3;
 739 | 0 |     c3 = 0;
 740 | 0 |     sqr_add_c2(a, 3, 0, c1, c2, c3);
 741 | 0 |     sqr_add_c2(a, 2, 1, c1, c2, c3);
 742 | 0 |     r[3] = c1;
 743 | 0 |     c1 = 0;
 744 | 0 |     sqr_add_c(a, 2, c2, c3, c1);
 745 | 0 |     sqr_add_c2(a, 3, 1, c2, c3, c1);
 746 | 0 |     sqr_add_c2(a, 4, 0, c2, c3, c1);
 747 | 0 |     r[4] = c2;
 748 | 0 |     c2 = 0;
 749 | 0 |     sqr_add_c2(a, 5, 0, c3, c1, c2);
 750 | 0 |     sqr_add_c2(a, 4, 1, c3, c1, c2);
 751 | 0 |     sqr_add_c2(a, 3, 2, c3, c1, c2);
 752 | 0 |     r[5] = c3;
 753 | 0 |     c3 = 0;
 754 | 0 |     sqr_add_c(a, 3, c1, c2, c3);
 755 | 0 |     sqr_add_c2(a, 4, 2, c1, c2, c3);
 756 | 0 |     sqr_add_c2(a, 5, 1, c1, c2, c3);
 757 | 0 |     sqr_add_c2(a, 6, 0, c1, c2, c3);
 758 | 0 |     r[6] = c1;
 759 | 0 |     c1 = 0;
 760 | 0 |     sqr_add_c2(a, 7, 0, c2, c3, c1);
 761 | 0 |     sqr_add_c2(a, 6, 1, c2, c3, c1);
 762 | 0 |     sqr_add_c2(a, 5, 2, c2, c3, c1);
 763 | 0 |     sqr_add_c2(a, 4, 3, c2, c3, c1);
 764 | 0 |     r[7] = c2;
 765 | 0 |     c2 = 0;
 766 | 0 |     sqr_add_c(a, 4, c3, c1, c2);
 767 | 0 |     sqr_add_c2(a, 5, 3, c3, c1, c2);
 768 | 0 |     sqr_add_c2(a, 6, 2, c3, c1, c2);
 769 | 0 |     sqr_add_c2(a, 7, 1, c3, c1, c2);
 770 | 0 |     r[8] = c3;
 771 | 0 |     c3 = 0;
 772 | 0 |     sqr_add_c2(a, 7, 2, c1, c2, c3);
 773 | 0 |     sqr_add_c2(a, 6, 3, c1, c2, c3);
 774 | 0 |     sqr_add_c2(a, 5, 4, c1, c2, c3);
 775 | 0 |     r[9] = c1;
 776 | 0 |     c1 = 0;
 777 | 0 |     sqr_add_c(a, 5, c2, c3, c1);
 778 | 0 |     sqr_add_c2(a, 6, 4, c2, c3, c1);
 779 | 0 |     sqr_add_c2(a, 7, 3, c2, c3, c1);
 780 | 0 |     r[10] = c2;
 781 | 0 |     c2 = 0;
 782 | 0 |     sqr_add_c2(a, 7, 4, c3, c1, c2);
 783 | 0 |     sqr_add_c2(a, 6, 5, c3, c1, c2);
 784 | 0 |     r[11] = c3;
 785 | 0 |     c3 = 0;
 786 | 0 |     sqr_add_c(a, 6, c1, c2, c3);
 787 | 0 |     sqr_add_c2(a, 7, 5, c1, c2, c3);
 788 | 0 |     r[12] = c1;
 789 | 0 |     c1 = 0;
 790 | 0 |     sqr_add_c2(a, 7, 6, c2, c3, c1);
 791 | 0 |     r[13] = c2;
 792 | 0 |     c2 = 0;
 793 | 0 |     sqr_add_c(a, 7, c3, c1, c2);
 794 | 0 |     r[14] = c3;
 795 | 0 |     r[15] = c1;
 796 | 0 | }
 797 |   |
 798 |   | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 799 | 0 | {
 800 | 0 |     BN_ULONG c1, c2, c3;
 801 |   |
 802 | 0 |     c1 = 0;
 803 | 0 |     c2 = 0;
 804 | 0 |     c3 = 0;
 805 | 0 |     sqr_add_c(a, 0, c1, c2, c3);
 806 | 0 |     r[0] = c1;
 807 | 0 |     c1 = 0;
 808 | 0 |     sqr_add_c2(a, 1, 0, c2, c3, c1);
 809 | 0 |     r[1] = c2;
 810 | 0 |     c2 = 0;
 811 | 0 |     sqr_add_c(a, 1, c3, c1, c2);
 812 | 0 |     sqr_add_c2(a, 2, 0, c3, c1, c2);
 813 | 0 |     r[2] = c3;
 814 | 0 |     c3 = 0;
 815 | 0 |     sqr_add_c2(a, 3, 0, c1, c2, c3);
 816 | 0 |     sqr_add_c2(a, 2, 1, c1, c2, c3);
 817 | 0 |     r[3] = c1;
 818 | 0 |     c1 = 0;
 819 | 0 |     sqr_add_c(a, 2, c2, c3, c1);
 820 | 0 |     sqr_add_c2(a, 3, 1, c2, c3, c1);
 821 | 0 |     r[4] = c2;
 822 | 0 |     c2 = 0;
 823 | 0 |     sqr_add_c2(a, 3, 2, c3, c1, c2);
 824 | 0 |     r[5] = c3;
 825 | 0 |     c3 = 0;
 826 | 0 |     sqr_add_c(a, 3, c1, c2, c3);
 827 | 0 |     r[6] = c1;
 828 | 0 |     r[7] = c2;
 829 | 0 | }
 830 |   |
 831 |   | # ifdef OPENSSL_NO_ASM
 832 |   | #  ifdef OPENSSL_BN_ASM_MONT
 833 |   | #   include <alloca.h>
 834 |   | /*
 835 |   |  * This is essentially reference implementation, which may or may not
 836 |   |  * result in performance improvement. E.g. on IA-32 this routine was
 837 |   |  * observed to give 40% faster rsa1024 private key operations and 10%
 838 |   |  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 839 |   |  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 840 |   |  * reference implementation, one to be used as starting point for
 841 |   |  * platform-specific assembler. Mentioned numbers apply to compiler
 842 |   |  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 843 |   |  * can vary not only from platform to platform, but even for compiler
 844 |   |  * versions. Assembler vs. assembler improvement coefficients can
 845 |   |  * [and are known to] differ and are to be documented elsewhere.
 846 |   |  */
 847 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 848 |   |                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 849 |   | {
 850 |   |     BN_ULONG c0, c1, ml, *tp, n0;
 851 |   | #   ifdef mul64
 852 |   |     BN_ULONG mh;
 853 |   | #   endif
 854 |   |     volatile BN_ULONG *vp;
 855 |   |     int i = 0, j;
 856 |   |
 857 |   | #   if 0                        /* template for platform-specific
 858 |   |                                  * implementation */
 859 |   |     if (ap == bp)
 860 |   |         return bn_sqr_mont(rp, ap, np, n0p, num);
 861 |   | #   endif
 862 |   |     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
 863 |   |
 864 |   |     n0 = *n0p;
 865 |   |
 866 |   |     c0 = 0;
 867 |   |     ml = bp[0];
 868 |   | #   ifdef mul64
 869 |   |     mh = HBITS(ml);
 870 |   |     ml = LBITS(ml);
 871 |   |     for (j = 0; j < num; ++j)
 872 |   |         mul(tp[j], ap[j], ml, mh, c0);
 873 |   | #   else
 874 |   |     for (j = 0; j < num; ++j)
 875 |   |         mul(tp[j], ap[j], ml, c0);
 876 |   | #   endif
 877 |   |
 878 |   |     tp[num] = c0;
 879 |   |     tp[num + 1] = 0;
 880 |   |     goto enter;
 881 |   |
 882 |   |     for (i = 0; i < num; i++) {
 883 |   |         c0 = 0;
 884 |   |         ml = bp[i];
 885 |   | #   ifdef mul64
 886 |   |         mh = HBITS(ml);
 887 |   |         ml = LBITS(ml);
 888 |   |         for (j = 0; j < num; ++j)
 889 |   |             mul_add(tp[j], ap[j], ml, mh, c0);
 890 |   | #   else
 891 |   |         for (j = 0; j < num; ++j)
 892 |   |             mul_add(tp[j], ap[j], ml, c0);
 893 |   | #   endif
 894 |   |         c1 = (tp[num] + c0) & BN_MASK2;
 895 |   |         tp[num] = c1;
 896 |   |         tp[num + 1] = (c1 < c0 ? 1 : 0);
 897 |   |  enter:
 898 |   |         c1 = tp[0];
 899 |   |         ml = (c1 * n0) & BN_MASK2;
 900 |   |         c0 = 0;
 901 |   | #   ifdef mul64
 902 |   |         mh = HBITS(ml);
 903 |   |         ml = LBITS(ml);
 904 |   |         mul_add(c1, np[0], ml, mh, c0);
 905 |   | #   else
 906 |   |         mul_add(c1, ml, np[0], c0);
 907 |   | #   endif
 908 |   |         for (j = 1; j < num; j++) {
 909 |   |             c1 = tp[j];
 910 |   | #   ifdef mul64
 911 |   |             mul_add(c1, np[j], ml, mh, c0);
 912 |   | #   else
 913 |   |             mul_add(c1, ml, np[j], c0);
 914 |   | #   endif
 915 |   |             tp[j - 1] = c1 & BN_MASK2;
 916 |   |         }
 917 |   |         c1 = (tp[num] + c0) & BN_MASK2;
 918 |   |         tp[num - 1] = c1;
 919 |   |         tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
 920 |   |     }
 921 |   |
 922 |   |     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
 923 |   |         c0 = bn_sub_words(rp, tp, np, num);
 924 |   |         if (tp[num] != 0 || c0 == 0) {
 925 |   |             for (i = 0; i < num + 2; i++)
 926 |   |                 vp[i] = 0;
 927 |   |             return 1;
 928 |   |         }
 929 |   |     }
 930 |   |     for (i = 0; i < num; i++)
 931 |   |         rp[i] = tp[i], vp[i] = 0;
 932 |   |     vp[num] = 0;
 933 |   |     vp[num + 1] = 0;
 934 |   |     return 1;
 935 |   | }
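For orientation, the quantity this reference routine is meant to produce is the word-level Montgomery product. Under the usual convention (an assumption here; the precise contract is fixed by the callers in OpenSSL's bn_mont.c, which this report does not show):

r \equiv a \cdot b \cdot R^{-1} \pmod{n}, \qquad
R = 2^{\mathrm{BN\_BITS2} \cdot num}, \qquad
n_0 \equiv -n^{-1} \pmod{2^{\mathrm{BN\_BITS2}}}

where n0 is the single precomputed word passed in via n0p and the trailing conditional bn_sub_words() performs the final reduction below n.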
 936 |   | #  else
 937 |   | /*
 938 |   |  * Return value of 0 indicates that multiplication/convolution was not
 939 |   |  * performed to signal the caller to fall down to alternative/original
 940 |   |  * code-path.
 941 |   |  */
 942 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 943 |   |                 const BN_ULONG *np, const BN_ULONG *n0, int num)
 944 | 0 | {
 945 | 0 |     return 0;
 946 | 0 | }
 947 |   | #  endif                        /* OPENSSL_BN_ASM_MONT */
 948 |   | # endif
 949 |   |
 950 |   | #else                           /* !BN_MUL_COMBA */
 951 |   |
 952 |   | /* hmm... is it faster just to do a multiply? */
 953 |   | # undef bn_sqr_comba4
 954 |   | # undef bn_sqr_comba8
 955 |   | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 956 |   | {
 957 |   |     BN_ULONG t[8];
 958 |   |     bn_sqr_normal(r, a, 4, t);
 959 |   | }
 960 |   |
 961 |   | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 962 |   | {
 963 |   |     BN_ULONG t[16];
 964 |   |     bn_sqr_normal(r, a, 8, t);
 965 |   | }
 966 |   |
 967 |   | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 968 |   | {
 969 |   |     r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
 970 |   |     r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
 971 |   |     r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
 972 |   |     r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
 973 |   | }
 974 |   |
 975 |   | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 976 |   | {
 977 |   |     r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
 978 |   |     r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
 979 |   |     r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
 980 |   |     r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
 981 |   |     r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
 982 |   |     r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
 983 |   |     r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
 984 |   |     r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
 985 |   | }
 986 |   |
 987 |   | # ifdef OPENSSL_NO_ASM
 988 |   | #  ifdef OPENSSL_BN_ASM_MONT
 989 |   | #   include <alloca.h>
 990 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 991 |   |                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 992 |   | {
 993 |   |     BN_ULONG c0, c1, *tp, n0 = *n0p;
 994 |   |     volatile BN_ULONG *vp;
 995 |   |     int i = 0, j;
 996 |   |
 997 |   |     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
 998 |   |
 999 |   |     for (i = 0; i <= num; i++)
1000 |   |         tp[i] = 0;
1001 |   |
1002 |   |     for (i = 0; i < num; i++) {
1003 |   |         c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1004 |   |         c1 = (tp[num] + c0) & BN_MASK2;
1005 |   |         tp[num] = c1;
1006 |   |         tp[num + 1] = (c1 < c0 ? 1 : 0);
1007 |   |
1008 |   |         c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1009 |   |         c1 = (tp[num] + c0) & BN_MASK2;
1010 |   |         tp[num] = c1;
1011 |   |         tp[num + 1] += (c1 < c0 ? 1 : 0);
1012 |   |         for (j = 0; j <= num; j++)
1013 |   |             tp[j] = tp[j + 1];
1014 |   |     }
1015 |   |
1016 |   |     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1017 |   |         c0 = bn_sub_words(rp, tp, np, num);
1018 |   |         if (tp[num] != 0 || c0 == 0) {
1019 |   |             for (i = 0; i < num + 2; i++)
1020 |   |                 vp[i] = 0;
1021 |   |             return 1;
1022 |   |         }
1023 |   |     }
1024 |   |     for (i = 0; i < num; i++)
1025 |   |         rp[i] = tp[i], vp[i] = 0;
1026 |   |     vp[num] = 0;
1027 |   |     vp[num + 1] = 0;
1028 |   |     return 1;
1029 |   | }
1030 |   | #  else
1031 |   | int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1032 |   |                 const BN_ULONG *np, const BN_ULONG *n0, int num)
1033 |   | {
1034 |   |     return 0;
1035 |   | }
1036 |   | #  endif                        /* OPENSSL_BN_ASM_MONT */
1037 |   | # endif
1038 |   |
1039 |   | #endif                          /* !BN_MUL_COMBA */