Coverage Report

Created: 2026-03-31 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/blst/src/ec_mult.h
Line
Count
Source
1
/*
2
 * Copyright Supranational LLC
3
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
4
 * SPDX-License-Identifier: Apache-2.0
5
 */
6
#ifndef __BLS12_381_ASM_EC_MULT_H__
7
#define __BLS12_381_ASM_EC_MULT_H__
8
9
#include "point.h"
10
11
/* Works up to 9 bits */
12
static limb_t get_wval(const byte *d, size_t off, size_t bits)
13
70.4k
{
14
70.4k
    size_t top = off + bits - 1;
15
70.4k
    limb_t ret;
16
17
70.4k
    ret = ((limb_t)d[top / 8] << 8) | d[off / 8];
18
19
70.4k
    return ret >> (off%8);
20
70.4k
}
21
22
/* Works up to 25 bits. */
23
static limb_t get_wval_limb(const byte *d, size_t off, size_t bits)
24
0
{
25
0
    size_t i, top = (off + bits - 1)/8;
26
0
    limb_t ret, mask = (limb_t)0 - 1;
27
28
0
    d   += off/8;
29
0
    top -= off/8-1;
30
31
    /* this is not about constant-time-ness, but branch optimization */
32
0
    for (ret=0, i=0; i<4;) {
33
0
        ret |= (*d & mask) << (8*i);
34
0
        mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1));
35
0
        d += 1 & mask;
36
0
    }
37
38
0
    return ret >> (off%8);
39
0
}
40
41
/*
42
 * Window value encoding that utilizes the fact that -P is trivially
43
 * calculated, which allows to halve the size of pre-computed table,
44
 * is attributed to A. D. Booth, hence the name of the subroutines...
45
 */
46
static limb_t booth_encode(limb_t wval, size_t sz)
47
71.1k
{
48
71.1k
    limb_t mask = 0 - (wval >> sz);     /* "sign" bit -> mask */
49
71.1k
    launder(mask);
50
51
71.1k
    wval = (wval + 1) >> 1;
52
71.1k
    wval = (wval ^ mask) - mask;
53
54
    /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */
55
71.1k
    return wval;
56
71.1k
}
57
58
/*
59
 * Key feature of these constant-time subroutines is that they tolerate
60
 * zeros in most significant bit positions of the scalar[s], or in other
61
 * words, zero-padded scalar values. This means that one can and should
62
 * pass order's bit-length, which is customarily publicly known, instead
63
 * of the factual scalars' bit-lengths. This is facilitated by point
64
 * addition subroutines implemented to handle points at infinity, which
65
 * are encoded as Z==0. [Doubling algorithms handle such points at
66
 * infinity "naturally," since resulting Z is product of original Z.]
67
 */
68
#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \
69
static bool_t ptype##_gather_booth_w##SZ(ptype *restrict p, \
70
                                         const ptype table[1<<(SZ-1)], \
71
71.1k
                                         limb_t booth_idx) \
72
71.1k
{ \
73
71.1k
    size_t i; \
74
71.1k
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
75
71.1k
\
76
71.1k
    booth_idx &= (1<<SZ) - 1; \
77
71.1k
    vec_copy(p, table, sizeof(ptype)); \
78
71.1k
    /* ~6% with -Os, ~2% with -O3 ... */\
79
1.13M
    for (i = 2; i <= 1<<(SZ-1); i++) \
80
1.06M
        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
81
71.1k
\
82
71.1k
    ptype##_cneg(p, booth_sign); \
83
71.1k
\
84
71.1k
    return byte_is_zero((byte)booth_idx); \
85
71.1k
} \
server.c:POINTonE1_gather_booth_w4
Line
Count
Source
71
304
                                         limb_t booth_idx) \
72
304
{ \
73
304
    size_t i; \
74
304
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
75
304
\
76
304
    booth_idx &= (1<<SZ) - 1; \
77
304
    vec_copy(p, table, sizeof(ptype)); \
78
304
    /* ~6% with -Os, ~2% with -O3 ... */\
79
2.43k
    for (i = 2; i <= 1<<(SZ-1); i++) \
80
2.12k
        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
81
304
\
82
304
    ptype##_cneg(p, booth_sign); \
83
304
\
84
304
    return byte_is_zero((byte)booth_idx); \
85
304
} \
server.c:POINTonE1_gather_booth_w5
Line
Count
Source
71
43.4k
                                         limb_t booth_idx) \
72
43.4k
{ \
73
43.4k
    size_t i; \
74
43.4k
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
75
43.4k
\
76
43.4k
    booth_idx &= (1<<SZ) - 1; \
77
43.4k
    vec_copy(p, table, sizeof(ptype)); \
78
43.4k
    /* ~6% with -Os, ~2% with -O3 ... */\
79
694k
    for (i = 2; i <= 1<<(SZ-1); i++) \
80
651k
        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
81
43.4k
\
82
43.4k
    ptype##_cneg(p, booth_sign); \
83
43.4k
\
84
43.4k
    return byte_is_zero((byte)booth_idx); \
85
43.4k
} \
server.c:POINTonE2_gather_booth_w4
Line
Count
Source
71
262
                                         limb_t booth_idx) \
72
262
{ \
73
262
    size_t i; \
74
262
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
75
262
\
76
262
    booth_idx &= (1<<SZ) - 1; \
77
262
    vec_copy(p, table, sizeof(ptype)); \
78
262
    /* ~6% with -Os, ~2% with -O3 ... */\
79
2.09k
    for (i = 2; i <= 1<<(SZ-1); i++) \
80
1.83k
        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
81
262
\
82
262
    ptype##_cneg(p, booth_sign); \
83
262
\
84
262
    return byte_is_zero((byte)booth_idx); \
85
262
} \
server.c:POINTonE2_gather_booth_w5
Line
Count
Source
71
27.1k
                                         limb_t booth_idx) \
72
27.1k
{ \
73
27.1k
    size_t i; \
74
27.1k
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
75
27.1k
\
76
27.1k
    booth_idx &= (1<<SZ) - 1; \
77
27.1k
    vec_copy(p, table, sizeof(ptype)); \
78
27.1k
    /* ~6% with -Os, ~2% with -O3 ... */\
79
434k
    for (i = 2; i <= 1<<(SZ-1); i++) \
80
407k
        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
81
27.1k
\
82
27.1k
    ptype##_cneg(p, booth_sign); \
83
27.1k
\
84
27.1k
    return byte_is_zero((byte)booth_idx); \
85
27.1k
} \
86
\
87
308
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
88
308
{ \
89
308
    size_t i, j; \
90
308
                                      /* row[-1] is implicit infinity */\
91
308
    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
92
308
    ptype##_double(&row[1],  point);                /* row[1]=p*(1+1) */\
93
2.26k
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
94
1.95k
        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
95
1.95k
        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
96
308
}                                                   /* row[4] ...     */\
server.c:POINTonE1_precompute_w4
Line
Count
Source
87
26
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
88
26
{ \
89
26
    size_t i, j; \
90
26
                                      /* row[-1] is implicit infinity */\
91
26
    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
92
26
    ptype##_double(&row[1],  point);                /* row[1]=p*(1+1) */\
93
104
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
94
78
        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
95
78
        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
96
26
}                                                   /* row[4] ...     */\
server.c:POINTonE1_precompute_w5
Line
Count
Source
87
102
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
88
102
{ \
89
102
    size_t i, j; \
90
102
                                      /* row[-1] is implicit infinity */\
91
102
    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
92
102
    ptype##_double(&row[1],  point);                /* row[1]=p*(1+1) */\
93
816
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
94
714
        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
95
714
        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
96
102
}                                                   /* row[4] ...     */\
server.c:POINTonE2_precompute_w4
Line
Count
Source
87
24
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
88
24
{ \
89
24
    size_t i, j; \
90
24
                                      /* row[-1] is implicit infinity */\
91
24
    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
92
24
    ptype##_double(&row[1],  point);                /* row[1]=p*(1+1) */\
93
96
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
94
72
        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
95
72
        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
96
24
}                                                   /* row[4] ...     */\
server.c:POINTonE2_precompute_w5
Line
Count
Source
87
156
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
88
156
{ \
89
156
    size_t i, j; \
90
156
                                      /* row[-1] is implicit infinity */\
91
156
    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
92
156
    ptype##_double(&row[1],  point);                /* row[1]=p*(1+1) */\
93
1.24k
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
94
1.09k
        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
95
1.09k
        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
96
156
}                                                   /* row[4] ...     */\
97
\
98
static void ptype##s_mult_w##SZ(ptype *ret, \
99
                                const ptype *points[], size_t npoints, \
100
                                const byte *scalars[], size_t bits, \
101
154
                                ptype table[][1<<(SZ-1)]) \
102
154
{ \
103
154
    limb_t wmask, wval; \
104
154
    size_t i, j, window, nbytes; \
105
154
    const byte *scalar, **scalar_s = scalars; \
106
154
    ptype sum[1], row[1]; \
107
154
    bool_t sum_is_inf, row_is_inf, ret_is_inf; \
108
154
\
109
154
    if (table == NULL) \
110
154
        table = (ptype (*)[1<<(SZ-1)])alloca((1<<(SZ-1)) * sizeof(ptype) * \
111
154
                                             npoints); \
112
154
\
113
154
    if (points != NULL) { \
114
0
        const ptype *point = NULL; \
115
0
        for (i = 0; i < npoints; i++) \
116
0
            point = *points ? *points++ : point+1, \
117
0
            ptype##_precompute_w##SZ(table[i], point); \
118
0
    } \
119
154
\
120
154
    nbytes = (bits + 7)/8; /* convert |bits| to bytes */ \
121
154
    scalar = *scalar_s++; \
122
154
\
123
154
    /* top excess bits modulo target window size */ \
124
154
    window = bits % SZ; /* yes, it may be zero */ \
125
154
    wmask = ((limb_t)1 << (window + 1)) - 1; \
126
154
\
127
154
    bits -= window; \
128
154
    if (bits > 0) \
129
154
        wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
130
154
    else \
131
154
        wval = (scalar[0] << 1) & wmask; \
132
154
\
133
154
    wval = booth_encode(wval, SZ); \
134
154
    ret_is_inf = ptype##_gather_booth_w##SZ(ret, table[0], wval); \
135
154
\
136
154
    i = 1; \
137
2.52k
    while (bits > 0) { \
138
9.68k
        for (; i < npoints; i++) { \
139
7.31k
            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
140
7.31k
            wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
141
7.31k
            wval = booth_encode(wval, SZ); \
142
7.31k
            row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \
143
7.31k
            ptype##_dadd(sum, ret, row, NULL); \
144
7.31k
            ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
145
7.31k
            sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
146
7.31k
            ret_is_inf |= sum_is_inf; \
147
7.31k
            row_is_inf |= sum_is_inf; \
148
7.31k
            ptype##_ccopy(ret, row, ret_is_inf); \
149
7.31k
            ret_is_inf &= row_is_inf; \
150
7.31k
        } \
151
2.36k
\
152
14.2k
        for (j = 0; j < SZ; j++) \
153
11.8k
            ptype##_double(ret, ret); \
154
2.36k
\
155
2.36k
        window = SZ; \
156
2.36k
        wmask = ((limb_t)1 << (window + 1)) - 1; \
157
2.36k
        bits -= window; \
158
2.36k
        i = 0; scalar_s = scalars; \
159
2.36k
    } \
160
154
\
161
690
    for (; i < npoints; i++) { \
162
536
        scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
163
536
        wval = (scalar[0] << 1) & wmask; \
164
536
        wval = booth_encode(wval, SZ); \
165
536
        row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \
166
536
        ptype##_dadd(sum, ret, row, NULL); \
167
536
        ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
168
536
        sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
169
536
        ret_is_inf |= sum_is_inf; \
170
536
        row_is_inf |= sum_is_inf; \
171
536
        ptype##_ccopy(ret, row, ret_is_inf); \
172
536
        ret_is_inf &= row_is_inf; \
173
536
    } \
174
154
\
175
154
    vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \
176
154
} \
Unexecuted instantiation: server.c:POINTonE1s_mult_w4
Unexecuted instantiation: server.c:POINTonE2s_mult_w4
177
\
178
static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \
179
154
                               const byte *scalar, size_t bits) \
180
154
{ \
181
154
    limb_t wmask, wval; \
182
154
    size_t j, window; \
183
154
    ptype sum[1], row[1]; \
184
154
    bool_t sum_is_inf, row_is_inf, ret_is_inf; \
185
154
    ptype table[1<<(SZ-1)]; \
186
154
\
187
154
    ptype##_precompute_w##SZ(table, point); \
188
154
\
189
154
    /* top excess bits modulo target window size */ \
190
154
    window = bits % SZ;  /* yes, it may be zero */ \
191
154
    wmask = ((limb_t)1 << (window + 1)) - 1; \
192
154
\
193
154
    bits -= window; \
194
154
    wval = bits ? get_wval(scalar, bits - 1, window + 1) \
195
154
                : (limb_t)scalar[0] << 1; \
196
154
    wval &= wmask; \
197
154
    wval = booth_encode(wval, SZ); \
198
154
    ret_is_inf = ptype##_gather_booth_w##SZ(ret, table, wval); \
199
154
\
200
63.1k
    while (bits > 0) { \
201
377k
        for (j = 0; j < SZ; j++) \
202
314k
            ptype##_double(ret, ret); \
203
63.0k
\
204
63.0k
        window = SZ; \
205
63.0k
        wmask = ((limb_t)1 << (window + 1)) - 1; \
206
63.0k
        bits -= window; \
207
63.0k
\
208
63.0k
        wval = bits ? get_wval(scalar, bits - 1, window + 1) \
209
63.0k
                    : (limb_t)scalar[0] << 1; \
210
63.0k
        wval &= wmask; \
211
63.0k
        wval = booth_encode(wval, SZ); \
212
63.0k
        row_is_inf = ptype##_gather_booth_w##SZ(row, table, wval); \
213
63.0k
        ptype##_dadd(sum, ret, row, NULL); \
214
63.0k
        ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
215
63.0k
        sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
216
63.0k
        ret_is_inf |= sum_is_inf; \
217
63.0k
        row_is_inf |= sum_is_inf; \
218
63.0k
        ptype##_ccopy(ret, row, ret_is_inf); \
219
63.0k
        ret_is_inf &= row_is_inf; \
220
63.0k
    } \
221
154
\
222
154
    vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \
223
154
}
224
225
#if 0
226
/* ~50%, or ~2x[!] slower than w5... */
227
#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
228
static void ptype##_mult_ladder(ptype *ret, const ptype *p, \
229
                                const byte *scalar, size_t bits) \
230
{ \
231
    ptype sum[1]; \
232
    bool_t bit, pbit = 0; \
233
\
234
    vec_copy(sum, p, sizeof(ptype)); \
235
    vec_zero(ret, sizeof(ptype));   /* infinity */ \
236
\
237
    while (bits--) { \
238
        bit = is_bit_set(scalar, bits); \
239
        bit ^= pbit; \
240
        ptype##_cswap(ret, sum, bit); \
241
        ptype##_add(sum, sum, ret); \
242
        ptype##_double(ret, ret); \
243
        pbit ^= bit; \
244
    } \
245
    ptype##_cswap(ret, sum, pbit); \
246
}
247
#else
248
/* >40% better performance than above, [and ~30% slower than w5]... */
249
#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
250
static void ptype##_mult_ladder(ptype *out, const ptype *p, \
251
                                const byte *scalar, size_t bits) \
252
{ \
253
    ptype##xz sum[1]; \
254
    ptype##xz pxz[1]; \
255
    ptype##xz ret[1]; \
256
    bool_t bit, pbit = 0; \
257
\
258
    ptype##xz_ladder_pre(pxz, p); \
259
    vec_copy(sum, pxz, sizeof(ptype##xz)); \
260
    vec_zero(ret, sizeof(ptype##xz));   /* infinity */ \
261
\
262
    while (bits--) { \
263
        bit = is_bit_set(scalar, bits); \
264
        bit ^= pbit; \
265
        ptype##xz_cswap(ret, sum, bit); \
266
        ptype##xz_ladder_step(ret, sum, pxz); \
267
        pbit ^= bit; \
268
    } \
269
    ptype##xz_cswap(ret, sum, pbit); \
270
    ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \
271
}
272
#endif
273
274
/*
275
 * Sole reason for existence of this implementation is that addition
276
 * with affine point renders a share of multiplications redundant by
277
 * virtue of Z==1. And since pre-defined generator point can be and
278
 * customarily is instantiated affine, it would be hardly appropriate
279
 * to pass on this opportunity. Though while it's faster than the
280
 * generic ladder implementation, by ~25%, it's not faster than XZ one
281
 * above, <15% slower. Just in case, it's faster than generic ladder
282
 * even if one accounts for prior conversion to affine coordinates,
283
 * so that choice [for resource-constrained case] is actually between
284
 * this plus said conversion and XZ ladder...
285
 *
286
 * To summarize, if ptype##_mult_w5 executed in one unit of time, then
287
 * - naive ptype##_mult_ladder would execute in ~2;
288
 * - XZ version above - in ~1.4;
289
 * - ptype##_affine_mult_ladder below - in ~1.65;
290
 * - [small-footprint ptype##_to_affine would run in ~0.18].
291
 *
292
 * Caveat lector, |p_affine|*(order+2) produces wrong result, because
293
 * addition doesn't handle doubling. Indeed, P*(order+1) is P and it
294
 * fails to add with itself producing infinity in last addition. But
295
 * as long as |scalar| is reduced modulo order, as it should be, it's
296
 * not a problem...
297
 */
298
#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \
299
static void ptype##_affine_mult_ladder(ptype *ret, \
300
                                       const ptype##_affine *p_affine, \
301
                                       const byte *scalar, size_t bits) \
302
{ \
303
    ptype sum[1]; \
304
    bool_t bit; \
305
\
306
    vec_zero(ret, sizeof(ptype));   /* infinity */ \
307
\
308
    while (bits--) { \
309
        ptype##_double(ret, ret); \
310
        ptype##_add_affine(sum, ret, p_affine); \
311
        bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \
312
        ptype##_ccopy(ret, sum, bit); \
313
    } \
314
}
315
#endif