Line | Count | Source |
1 | | /* |
2 | | * Copyright Supranational LLC |
3 | | * Licensed under the Apache License, Version 2.0, see LICENSE for details. |
4 | | * SPDX-License-Identifier: Apache-2.0 |
5 | | */ |
6 | | #ifndef __BLS12_381_ASM_EC_MULT_H__ |
7 | | #define __BLS12_381_ASM_EC_MULT_H__ |
8 | | |
9 | | #include "point.h" |
10 | | |
11 | | /* Works up to 9 bits */ |
12 | | static limb_t get_wval(const byte *d, size_t off, size_t bits) |
13 | 70.4k | { |
14 | 70.4k | size_t top = off + bits - 1; |
15 | 70.4k | limb_t ret; |
16 | | |
17 | 70.4k | ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; |
18 | | |
19 | 70.4k | return ret >> (off%8); |
20 | 70.4k | } |
21 | | |
/* Works up to 25 bits. */
/*
 * Extract a little-endian window of |bits| bits starting at bit |off| of
 * byte string |d|.  Unlike get_wval() above, this gathers up to four
 * consecutive bytes, accommodating 24 whole bits plus up to 7 bits of
 * intra-byte offset.  Excess high bits are left for the caller to mask.
 */
static limb_t get_wval_limb(const byte *d, size_t off, size_t bits)
{
    size_t i, top = (off + bits - 1)/8;
    limb_t ret, mask = (limb_t)0 - 1;

    d += off/8;
    top -= off/8-1;     /* number of bytes to gather, in 1..4 */

    /* this is not about constant-time-ness, but branch optimization */
    for (ret=0, i=0; i<4;) {
        ret |= (*d & mask) << (8*i);
        /* mask stays all-ones while ++i < top (the unsigned subtraction
         * underflows, setting the sign bit), then drops to zero */
        mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1));
        /* advance only while bytes remain, so the byte past the window
         * is never read; later iterations OR in zero via |mask| */
        d += 1 & mask;
    }

    return ret >> (off%8);  /* discard the intra-byte offset */
}
40 | | |
/*
 * Window value encoding that utilizes the fact that -P is trivially
 * calculated, which allows to halve the size of pre-computed table,
 * is attributed to A. D. Booth, hence the name of the subroutines...
 */
/*
 * |wval| is an (sz+1)-bit raw window [bit below the window, sz window
 * bits]; the bit above (at position sz) acts as the "sign".  Returns the
 * digit magnitude in the low bits with the sign replicated above them,
 * all computed branch-free.
 */
static limb_t booth_encode(limb_t wval, size_t sz)
{
    limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */
    launder(mask);  /* NOTE(review): presumably a compiler value barrier
                     * protecting the constant-time mask — see point.h */

    wval = (wval + 1) >> 1;         /* round window to digit magnitude */
    wval = (wval ^ mask) - mask;    /* conditional negation, branch-free */

    /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */
    return wval;
}
57 | | |
58 | | /* |
59 | | * Key feature of these constant-time subroutines is that they tolerate |
60 | | * zeros in most significant bit positions of the scalar[s], or in other |
61 | | * words, zero-padded scalar values. This means that one can and should |
62 | | * pass order's bit-length, which is customarily publicly known, instead |
63 | | * of the factual scalars' bit-lengths. This is facilitated by point |
64 | | * addition subroutines implemented to handle points at infinity, which |
65 | | * are encoded as Z==0. [Doubling algorithms handle such points at |
66 | | * infinity "naturally," since resulting Z is product of original Z.] |
67 | | */ |
/*
 * Instantiates, for point type |ptype| and SZ-bit windows, a family of
 * constant-time fixed-window scalar-multiplication subroutines.  The
 * pre-computed table holds only p*1 .. p*2^(SZ-1); Booth encoding (see
 * booth_encode above) provides the negative half for free.
 */
#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \
/* Fetch the table entry for Booth digit |booth_idx| by scanning every \
 * entry with conditional copies, then conditionally negate it.  Returns \
 * non-zero when the digit is zero, i.e. the gathered point is the \
 * (implicit) point at infinity. */\
static bool_t ptype##_gather_booth_w##SZ(ptype *restrict p, \
                                         const ptype table[1<<(SZ-1)], \
                                         limb_t booth_idx) \
{ \
    size_t i; \
    bool_t booth_sign = (booth_idx >> SZ) & 1; \
\
    booth_idx &= (1<<SZ) - 1; \
    vec_copy(p, table, sizeof(ptype)); \
    /* ~6% with -Os, ~2% with -O3 ... */\
    for (i = 2; i <= 1<<(SZ-1); i++) \
        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(i ^ booth_idx))); \
\
    ptype##_cneg(p, booth_sign); \
\
    return byte_is_zero((byte)booth_idx); \
} \
\
/* Fill |row| with point*1 .. point*2^(SZ-1): each odd multiple is an \
 * addition, each even one a doubling of the half-index entry. */\
static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
{ \
    size_t i, j; \
    /* row[-1] is implicit infinity */\
    vec_copy(&row[0], point, sizeof(ptype));        /* row[0]=p*1     */\
    ptype##_double(&row[1], point);                 /* row[1]=p*(1+1) */\
    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
        ptype##_add(&row[i], &row[j], &row[j-1]),   /* row[2]=p*(2+1) */\
        ptype##_double(&row[i+1], &row[j]);         /* row[3]=p*(2+2) */\
}                                                   /* row[4] ...     */\
\
/* Interleaved multi-point multiplication: ret = sum(points[i]*scalars[i]). \
 * A NULL entry in |points|/|scalars| means "previous entry plus stride", \
 * i.e. the points/scalars are laid out contiguously.  |table| may be \
 * NULL, in which case the per-point tables are built on the stack. */\
static void ptype##s_mult_w##SZ(ptype *ret, \
                                const ptype *points[], size_t npoints, \
                                const byte *scalars[], size_t bits, \
                                ptype table[][1<<(SZ-1)]) \
{ \
    limb_t wmask, wval; \
    size_t i, j, window, nbytes; \
    const byte *scalar, **scalar_s = scalars; \
    ptype sum[1], row[1]; \
    bool_t sum_is_inf, row_is_inf, ret_is_inf; \
\
    if (table == NULL) \
        table = (ptype (*)[1<<(SZ-1)])alloca((1<<(SZ-1)) * sizeof(ptype) * \
                                             npoints); \
\
    if (points != NULL) { \
        const ptype *point = NULL; \
        for (i = 0; i < npoints; i++) \
            point = *points ? *points++ : point+1, \
            ptype##_precompute_w##SZ(table[i], point); \
    } \
\
    nbytes = (bits + 7)/8; /* convert |bits| to bytes */ \
    scalar = *scalar_s++; \
\
    /* top excess bits modulo target window size */ \
    window = bits % SZ; /* yes, it may be zero */ \
    wmask = ((limb_t)1 << (window + 1)) - 1; \
\
    bits -= window; \
    if (bits > 0) \
        wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
    else \
        wval = (scalar[0] << 1) & wmask; \
\
    wval = booth_encode(wval, SZ); \
    ret_is_inf = ptype##_gather_booth_w##SZ(ret, table[0], wval); \
\
    i = 1; \
    while (bits > 0) { \
        for (; i < npoints; i++) { \
            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
            wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
            wval = booth_encode(wval, SZ); \
            row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \
            /* branch-free accumulation tolerating infinity (Z==0) in \
             * either addend as well as in the resulting sum */ \
            ptype##_dadd(sum, ret, row, NULL); \
            ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
            sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
            ret_is_inf |= sum_is_inf; \
            row_is_inf |= sum_is_inf; \
            ptype##_ccopy(ret, row, ret_is_inf); \
            ret_is_inf &= row_is_inf; \
        } \
\
        for (j = 0; j < SZ; j++) \
            ptype##_double(ret, ret); \
\
        window = SZ; \
        wmask = ((limb_t)1 << (window + 1)) - 1; \
        bits -= window; \
        i = 0; scalar_s = scalars; \
    } \
\
    /* last, least-significant window for any remaining points */ \
    for (; i < npoints; i++) { \
        scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
        wval = (scalar[0] << 1) & wmask; \
        wval = booth_encode(wval, SZ); \
        row_is_inf = ptype##_gather_booth_w##SZ(row, table[i], wval); \
        ptype##_dadd(sum, ret, row, NULL); \
        ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
        sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
        ret_is_inf |= sum_is_inf; \
        row_is_inf |= sum_is_inf; \
        ptype##_ccopy(ret, row, ret_is_inf); \
        ret_is_inf &= row_is_inf; \
    } \
\
    vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \
} \
\
/* Single-point constant-time multiplication, ret = point*scalar, \
 * consuming the zero-padded scalar in SZ-bit Booth windows. */\
static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \
                               const byte *scalar, size_t bits) \
{ \
    limb_t wmask, wval; \
    size_t j, window; \
    ptype sum[1], row[1]; \
    bool_t sum_is_inf, row_is_inf, ret_is_inf; \
    ptype table[1<<(SZ-1)]; \
\
    ptype##_precompute_w##SZ(table, point); \
\
    /* top excess bits modulo target window size */ \
    window = bits % SZ; /* yes, it may be zero */ \
    wmask = ((limb_t)1 << (window + 1)) - 1; \
\
    bits -= window; \
    wval = bits ? get_wval(scalar, bits - 1, window + 1) \
                : (limb_t)scalar[0] << 1; \
    wval &= wmask; \
    wval = booth_encode(wval, SZ); \
    ret_is_inf = ptype##_gather_booth_w##SZ(ret, table, wval); \
\
    while (bits > 0) { \
        for (j = 0; j < SZ; j++) \
            ptype##_double(ret, ret); \
\
        window = SZ; \
        wmask = ((limb_t)1 << (window + 1)) - 1; \
        bits -= window; \
\
        wval = bits ? get_wval(scalar, bits - 1, window + 1) \
                    : (limb_t)scalar[0] << 1; \
        wval &= wmask; \
        wval = booth_encode(wval, SZ); \
        row_is_inf = ptype##_gather_booth_w##SZ(row, table, wval); \
        /* same infinity-tolerant accumulation as in the multi-point \
         * subroutine above */ \
        ptype##_dadd(sum, ret, row, NULL); \
        ptype##_ccopy(ret, sum, (ret_is_inf | row_is_inf) ^ 1); \
        sum_is_inf = vec_is_zero(ret->Z, sizeof(ret->Z)); \
        ret_is_inf |= sum_is_inf; \
        row_is_inf |= sum_is_inf; \
        ptype##_ccopy(ret, row, ret_is_inf); \
        ret_is_inf &= row_is_inf; \
    } \
\
    vec_czero(ret->Z, sizeof(ret->Z), ret_is_inf); \
}
224 | | |
#if 0
/* ~50%, or ~2x[!] slower than w5... */
/*
 * Textbook constant-time Montgomery ladder over the full point type.
 * The |pbit| bookkeeping implements "lazy" conditional swaps: rather
 * than swapping in and back out every iteration, a swap is performed
 * only when the current scalar bit differs from the previous one, with
 * the final deferred swap resolved after the loop.
 */
#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
static void ptype##_mult_ladder(ptype *ret, const ptype *p, \
                                const byte *scalar, size_t bits) \
{ \
    ptype sum[1]; \
    bool_t bit, pbit = 0; \
\
    vec_copy(sum, p, sizeof(ptype)); \
    vec_zero(ret, sizeof(ptype)); /* infinity */ \
\
    while (bits--) { \
        bit = is_bit_set(scalar, bits); \
        bit ^= pbit; /* swap only on bit transitions */ \
        ptype##_cswap(ret, sum, bit); \
        ptype##_add(sum, sum, ret); \
        ptype##_double(ret, ret); \
        pbit ^= bit; \
    } \
    ptype##_cswap(ret, sum, pbit); /* resolve the last deferred swap */ \
}
#else
/* >40% better performance than above, [and ~30% slower than w5]... */
/*
 * Montgomery ladder in XZ coordinates: the Y coordinate is dropped for
 * the duration of the ladder and reconstructed by ptype##xz_ladder_post,
 * which receives the input point's Y.  Uses the same lazy-cswap trick
 * as the naive variant above.
 */
#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
static void ptype##_mult_ladder(ptype *out, const ptype *p, \
                                const byte *scalar, size_t bits) \
{ \
    ptype##xz sum[1]; \
    ptype##xz pxz[1]; \
    ptype##xz ret[1]; \
    bool_t bit, pbit = 0; \
\
    ptype##xz_ladder_pre(pxz, p); \
    vec_copy(sum, pxz, sizeof(ptype##xz)); \
    vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \
\
    while (bits--) { \
        bit = is_bit_set(scalar, bits); \
        bit ^= pbit; /* swap only on bit transitions */ \
        ptype##xz_cswap(ret, sum, bit); \
        ptype##xz_ladder_step(ret, sum, pxz); \
        pbit ^= bit; \
    } \
    ptype##xz_cswap(ret, sum, pbit); /* resolve the last deferred swap */ \
    ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \
}
#endif
273 | | |
274 | | /* |
275 | | * Sole reason for existence of this implementation is that addition |
276 | | * with affine point renders a share of multiplications redundant by |
277 | | * virtue of Z==1. And since pre-defined generator point can be and |
278 | | * customarily is instantiated affine, it would be hardly appropriate |
279 | | * to pass on this opportunity. Though while it's faster than the |
280 | | * generic ladder implementation, by ~25%, it's not faster than XZ one |
281 | | * above, <15% slower. Just in case, it's faster than generic ladder |
282 | | * even if one accounts for prior conversion to affine coordinates, |
283 | | * so that choice [for resource-constrained case] is actually between |
284 | | * this plus said conversion and XZ ladder... |
285 | | * |
286 | | * To summarize, if ptype##_mult_w5 executed in one unit of time, then |
287 | | * - naive ptype##_mult_ladder would execute in ~2; |
288 | | * - XZ version above - in ~1.4; |
289 | | * - ptype##_affine_mult_ladder below - in ~1.65; |
290 | | * - [small-footprint ptype##_to_affine would run in ~0.18]. |
291 | | * |
292 | | * Caveat lector, |p_affine|*(order+2) produces wrong result, because |
293 | | * addition doesn't handle doubling. Indeed, P*(order+1) is P and it |
294 | | * fails to add with itself producing infinity in last addition. But |
295 | | * as long as |scalar| is reduced modulo order, as it should be, it's |
296 | | * not a problem... |
297 | | */ |
/*
 * ret = p_affine * scalar, via a left-to-right double-and-add-always
 * ladder; the addition is performed unconditionally and the result is
 * taken with a conditional copy, so the sequence of point operations is
 * independent of the scalar's bits.  See the caveat above about scalars
 * not reduced modulo the group order.
 */
#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \
static void ptype##_affine_mult_ladder(ptype *ret, \
                                       const ptype##_affine *p_affine, \
                                       const byte *scalar, size_t bits) \
{ \
    ptype sum[1]; \
    bool_t bit; \
\
    vec_zero(ret, sizeof(ptype)); /* infinity */ \
\
    while (bits--) { \
        ptype##_double(ret, ret); \
        ptype##_add_affine(sum, ret, p_affine); \
        /* |scalar| is a byte string (as in the ladders above), so the \
         * bit must be indexed at byte granularity; the previous \
         * scalar[bits / LIMB_T_BITS] indexing read the wrong byte for \
         * any bit index >= 8 and shifted a promoted byte by up to \
         * LIMB_T_BITS-1, which is undefined behavior */ \
        bit = is_bit_set(scalar, bits); \
        ptype##_ccopy(ret, sum, bit); \
    } \
}
315 | | #endif |