Line | Count | Source |
1 | | /* |
2 | | * Copyright Supranational LLC |
3 | | * Licensed under the Apache License, Version 2.0, see LICENSE for details. |
4 | | * SPDX-License-Identifier: Apache-2.0 |
5 | | */ |
6 | | #ifndef __BLS12_381_ASM_VECT_H__ |
7 | | #define __BLS12_381_ASM_VECT_H__ |
8 | | |
9 | | #include <stddef.h> |
10 | | |
/*
 * Machine-word ("limb") selection.  limb_t is the unsigned integer type
 * all field arithmetic operates on, and LIMB_T_BITS is its width in bits.
 * The width has to be a preprocessor-visible constant (see the comment
 * further down), hence the explicit per-platform #if chain.
 */
#if defined(__x86_64__) || defined(__aarch64__)
/* These are available even in ILP32 flavours, but even then they are
 * capable of performing 64-bit operations as efficiently as in *P64. */
typedef unsigned long long limb_t;
# define LIMB_T_BITS    64

#elif defined(_WIN64)   /* Win64 is P64 */
typedef unsigned __int64 limb_t;
# define LIMB_T_BITS    64

#elif defined(__BLST_NO_ASM__) || defined(__wasm64__)
typedef unsigned int limb_t;
# define LIMB_T_BITS    32
# ifndef __BLST_NO_ASM__
#  define __BLST_NO_ASM__
# endif

#else                   /* 32 bits on 32-bit platforms, 64 - on 64-bit */
typedef unsigned long limb_t;
# ifdef _LP64
#  define LIMB_T_BITS   64
# else
#  define LIMB_T_BITS   32
#  define __BLST_NO_ASM__
# endif
#endif

/*
 * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor
 * knows nothing about sizeof(anything)...
 *
 * TO_LIMB_T expands a 64-bit constant into one limb (64-bit limbs) or a
 * low,high pair of limbs (32-bit limbs) for use in static initializers.
 * Arguments are parenthesized in the expansion so that expression
 * arguments (e.g. TO_LIMB_T(A + B)) expand correctly (CERT PRE01-C).
 */
#if LIMB_T_BITS == 64
# define TO_LIMB_T(limb64)     (limb64)
#else
# define TO_LIMB_T(limb64)     (limb_t)(limb64),(limb_t)((limb64)>>32)
#endif

/* Number of limbs needed to hold |bits| bits; |bits| must be a multiple
 * of LIMB_T_BITS. */
#define NLIMBS(bits)   ((bits)/LIMB_T_BITS)
49 | | |
/* Fixed-width limb vectors used throughout the BLS12-381 arithmetic. */
typedef limb_t vec256[NLIMBS(256)];
typedef limb_t vec512[NLIMBS(512)];
typedef limb_t vec384[NLIMBS(384)];
typedef limb_t vec768[NLIMBS(768)];
typedef vec384 vec384x[2];      /* 0 is "real" part, 1 is "imaginary" */
55 | | |
typedef unsigned char byte;
/*
 * Expand a 64-bit value into its 8 bytes, least-significant first, for
 * use in static byte-array initializers.  The argument is parenthesized
 * in every use so that expression arguments (e.g. TO_BYTES(A + B))
 * expand correctly (CERT PRE01-C).
 */
#define TO_BYTES(limb64)   (byte)(limb64),(byte)((limb64)>>8),\
                           (byte)((limb64)>>16),(byte)((limb64)>>24),\
                           (byte)((limb64)>>32),(byte)((limb64)>>40),\
                           (byte)((limb64)>>48),(byte)((limb64)>>56)
/* 256-bit value as a 32-byte string. */
typedef byte pow256[256/8];
62 | | |
/*
 * Internal Boolean type, Boolean by value, hence safe to cast to or
 * reinterpret as 'bool'.
 */
typedef limb_t bool_t;  /* holds 0 or 1 in a full limb */
68 | | |
/*
 * Assembly subroutines...
 */
/* When targeting ADX-capable x86_64 (mulx/adcx/adox available) and a
 * portable or no-asm build was not requested, route these entry points
 * to their mulx-based assembly implementations. */
#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__)\
 && !defined(__BLST_NO_ASM__)
# define mul_mont_sparse_256 mulx_mont_sparse_256
# define sqr_mont_sparse_256 sqrx_mont_sparse_256
# define from_mont_256 fromx_mont_256
# define redc_mont_256 redcx_mont_256
# define mul_mont_384 mulx_mont_384
# define sqr_mont_384 sqrx_mont_384
# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384
# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383
# define mul_384 mulx_384
# define sqr_384 sqrx_384
# define redc_mont_384 redcx_mont_384
# define from_mont_384 fromx_mont_384
# define sgn0_pty_mont_384 sgn0x_pty_mont_384
# define sgn0_pty_mont_384x sgn0x_pty_mont_384x
# define ct_inverse_mod_384 ctx_inverse_mod_384
#endif
90 | | |
/*
 * 256-bit Montgomery arithmetic.  By the usual Montgomery convention n0
 * would be -p^-1 mod 2^LIMB_T_BITS -- NOTE(review): confirm against the
 * assembly sources.
 */
void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b,
                         const vec256 p, limb_t n0);
void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0);
void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0);
void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0);

/* Modular add/sub/scale/shift helpers on 256-bit vectors, modulus p. */
void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p);
void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p);
void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p);
void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p);
void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p);
void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p);
bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p,
                            const vec256 one);
/* Byte-string (pow256) variants; result is returned as a limb-wide flag. */
limb_t check_mod_256(const pow256 a, const vec256 p);
limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b,
                           const vec256 p);
limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b,
                           const vec256 p);
110 | | |
/* Best-effort prefetch of len bytes at ptr into cache. */
void vec_prefetch(const void *ptr, size_t len);

/*
 * 384-bit Montgomery arithmetic (base field).  n0 follows the same
 * convention as the 256-bit routines above.  The sqr_n_mul variants
 * perform count squarings followed by one multiplication by b.
 */
void mul_mont_384(vec384 ret, const vec384 a, const vec384 b,
                  const vec384 p, limb_t n0);
void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0);
void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count,
                        const vec384 p, limb_t n0, const vec384 b);
void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
                        const vec384 p, limb_t n0, const vec384 b);

/* Plain (non-reduced) 384x384->768-bit multiply/square. */
void mul_384(vec768 ret, const vec384 a, const vec384 b);
void sqr_384(vec768 ret, const vec384 a);
void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0);
void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0);
/* Sign-0/parity queries on (Montgomery- or plain-form) field elements. */
limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0);
limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0);
limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p);
limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p);

/* Modular add/sub/scale/shift helpers on 384-bit vectors, modulus p. */
void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p);
void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p);
void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p);
void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p);
void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p);
void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p);
/* ct_* names suggest constant-time inversion/Legendre routines; the
 * double-width ret and the modx argument are asm-side details --
 * NOTE(review): see the corresponding assembly for exact contracts. */
void ct_inverse_mod_384(vec768 ret, const vec384 inp, const vec384 mod,
                        const vec384 modx);
void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
                        const vec256 modx);
bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod);
143 | | |
/* Extension-field (vec384x) routines also come in mulx-based flavours.
 * NOTE(review): unlike the alias block at the top of this header, this
 * one does not also test !defined(__BLST_NO_ASM__); confirm the build
 * cannot combine __ADX__ with __BLST_NO_ASM__. */
#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__)
# define mul_mont_384x mulx_mont_384x
# define sqr_mont_384x sqrx_mont_384x
# define sqr_mont_382x sqrx_mont_382x
# define mul_382x mulx_382x
# define sqr_382x sqrx_382x
#endif
151 | | |
/*
 * Arithmetic on vec384x ([real, imaginary] pairs) over modulus p.
 * The 382x variants and the vec768 ret[2] forms are lazy-reduction
 * building blocks -- NOTE(review): see assembly for exact bounds.
 */
void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
                   const vec384 p, limb_t n0);
void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0);
void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0);
void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p);
void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p);

void add_mod_384x(vec384x ret, const vec384x a, const vec384x b,
                  const vec384 p);
void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b,
                  const vec384 p);
void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p);
void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p);
void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p);
/* Double-width (768-bit) modular add/sub, used with the lazy-reduction
 * products above. */
void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
                     const vec384 p);
void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b,
                     const vec384 p);
170 | | |
/*
 * C subroutines
 */
/* Forward declarations for file-local helpers defined in the including
 * translation unit(s). */
static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow,
                         size_t pow_bits, const vec384 p, limb_t n0);
static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow,
                          size_t pow_bits, const vec384 p, limb_t n0);
static void div_by_zz(limb_t val[]);
static void div_by_z(limb_t val[]);

/* Integer type wide enough to hold a pointer; fall back to const void *
 * on compilers that don't expose __UINTPTR_TYPE__. */
#ifdef __UINTPTR_TYPE__
typedef __UINTPTR_TYPE__ uptr_t;
#else
typedef const void *uptr_t;
#endif
186 | | |
/* Map the C99 'restrict' qualifier to a compiler-specific equivalent
 * (or nothing) when compiling as pre-C99. */
#if !defined(restrict)
# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
#  if defined(__GNUC__) && __GNUC__>=2
#   define restrict __restrict__
#  elif defined(_MSC_VER)
#   define restrict __restrict
#  else
#   define restrict
#  endif
# endif
#endif
198 | | |
/* Likewise map C99 'inline' for pre-C99 C compilers (C++ has it natively). */
#if !defined(inline) && !defined(__cplusplus)
# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901
#  if defined(__GNUC__) && __GNUC__>=2
#   define inline __inline__
#  elif defined(_MSC_VER)
#   define inline __inline
#  else
#   define inline
#  endif
# endif
#endif
210 | | |
/* Optimization barrier: forces var through a register so the compiler
 * cannot propagate its value and turn constant-time masking back into
 * data-dependent branches.  No-op on compilers without GNU inline asm. */
#if defined(__GNUC__) || defined(__clang__)
# define launder(var) __asm__ __volatile__("" : "+r"(var))
#else
# define launder(var)
#endif
216 | | |
/* Return bit i of byte string v (bit 0 = least-significant bit of v[0])
 * as 0 or 1; laundered so callers can use it in constant-time selects. */
static inline bool_t is_bit_set(const byte *v, size_t i)
{
    bool_t ret = (v[i/8] >> (i%8)) & 1;
    launder(ret);
    return ret;
}
223 | | |
/* Constant-time: 1 if c == 0, else 0.  (c - 1) borrows into the top limb
 * bit only when c is 0. */
static inline bool_t byte_is_zero(unsigned char c)
{
    limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1);
    launder(ret);
    return ret;
}
230 | | |
231 | | static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) |
232 | 4 | { |
233 | 4 | unsigned char acc; |
234 | 4 | size_t i; |
235 | | |
236 | 192 | for (acc = 0, i = 0; i < num; i++) |
237 | 188 | acc |= a[i]; |
238 | | |
239 | 4 | return byte_is_zero(acc); |
240 | 4 | } |
241 | | |
/* Constant-time conditional swap: exchange the num bytes (whole limbs)
 * at a and b when cbit is 1, leave both untouched when 0.  mask is
 * all-ones or all-zeros, so both buffers are always read and written --
 * the memory access pattern is independent of cbit. */
static inline void vec_cswap(void *restrict a, void *restrict b, size_t num,
                             bool_t cbit)
{
    limb_t ai, *ap = (limb_t *)a;
    limb_t bi, *bp = (limb_t *)b;
    limb_t xorm, mask;
    size_t i;

    launder(cbit);              /* keep the 0/1 value opaque to the optimizer */
    mask = (limb_t)0 - cbit;    /* 0 -> 0, 1 -> all-ones */

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++) {
        xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask;
        ap[i] = ai ^ xorm;
        bp[i] = bi ^ xorm;
    }
}
261 | | |
/* ret = bit ? a : b */
void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a);
void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a);
/* Constant-time select: copy num bytes from a to ret when sel_a is 1,
 * from b when 0.  The fixed sizes used by this library dispatch to the
 * assembly helpers above; anything else falls through to a masked
 * limb-wise select (both inputs always read, no branch on sel_a). */
static inline void vec_select(void *ret, const void *a, const void *b,
                              size_t num, bool_t sel_a)
{
    launder(sel_a);
#ifndef __BLST_NO_ASM__
    if (num == 32)          vec_select_32(ret, a, b, sel_a);
    else if (num == 48)     vec_select_48(ret, a, b, sel_a);
    else if (num == 96)     vec_select_96(ret, a, b, sel_a);
    else if (num == 144)    vec_select_144(ret, a, b, sel_a);
    else if (num == 192)    vec_select_192(ret, a, b, sel_a);
    else if (num == 288)    vec_select_288(ret, a, b, sel_a);
#else
    if (0) ;                /* keep the trailing 'else' well-formed */
#endif
    else {
        limb_t bi;
        volatile limb_t *rp = (limb_t *)ret;    /* volatile: stores must happen */
        const limb_t *ap = (const limb_t *)a;
        const limb_t *bp = (const limb_t *)b;
        limb_t xorm, mask = (limb_t)0 - sel_a;  /* 0 -> 0, 1 -> all-ones */
        size_t i;

        num /= sizeof(limb_t);

        for (i = 0; i < num; i++) {
            xorm = (ap[i] ^ (bi = bp[i])) & mask;
            rp[i] = bi ^ xorm;
        }
    }
}
299 | | |
/* Constant-time: 1 if l == 0, else 0.  ~l & (l - 1) has its top bit set
 * only when l is 0. */
static inline bool_t is_zero(limb_t l)
{
    limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1);
    launder(ret);
    return ret;
}
306 | | |
/* Constant-time: 1 if all num bytes (whole limbs) at a are zero, else 0.
 * Multiple-of-16 sizes take the assembly fast path; the fallback always
 * scans the entire buffer -- no early exit. */
static inline bool_t vec_is_zero(const void *a, size_t num)
{
    const limb_t *ap = (const limb_t *)a;
    limb_t acc;
    size_t i;

#ifndef __BLST_NO_ASM__
    bool_t vec_is_zero_16x(const void *a, size_t num);
    if ((num & 15) == 0)
        return vec_is_zero_16x(a, num);
#endif

    num /= sizeof(limb_t);

    for (acc = 0, i = 0; i < num; i++)
        acc |= ap[i];

    return is_zero(acc);
}
326 | | |
/* Constant-time: 1 if the num bytes (whole limbs) at a and b are equal,
 * else 0.  Multiple-of-16 sizes take the assembly fast path; the
 * fallback XOR-accumulates the full buffers -- no early exit. */
static inline bool_t vec_is_equal(const void *a, const void *b, size_t num)
{
    const limb_t *ap = (const limb_t *)a;
    const limb_t *bp = (const limb_t *)b;
    limb_t acc;
    size_t i;

#ifndef __BLST_NO_ASM__
    bool_t vec_is_equal_16x(const void *a, const void *b, size_t num);
    if ((num & 15) == 0)
        return vec_is_equal_16x(a, b, num);
#endif

    num /= sizeof(limb_t);

    for (acc = 0, i = 0; i < num; i++)
        acc |= ap[i] ^ bp[i];

    return is_zero(acc);
}
347 | | |
/* Conditionally negate a vec384x element mod p: apply cneg_mod_384 to
 * the real (index 0) and imaginary (index 1) components with the same
 * flag. */
static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag,
                                 const vec384 p)
{
    cneg_mod_384(ret[0], a[0], flag, p);
    cneg_mod_384(ret[1], a[1], flag, p);
}
354 | | |
355 | | static inline void vec_copy(void *restrict ret, const void *a, size_t num) |
356 | 90.3k | { |
357 | 90.3k | limb_t *rp = (limb_t *)ret; |
358 | 90.3k | const limb_t *ap = (const limb_t *)a; |
359 | 90.3k | size_t i; |
360 | | |
361 | 90.3k | num /= sizeof(limb_t); |
362 | | |
363 | 2.18M | for (i = 0; i < num; i++) |
364 | 2.09M | rp[i] = ap[i]; |
365 | 90.3k | } |
366 | | |
/* Zero num bytes (whole limbs) at ret.  The volatile stores plus the
 * asm memory clobber keep the compiler from eliding the wipe as a dead
 * store, making this usable for scrubbing sensitive data. */
static inline void vec_zero(void *ret, size_t num)
{
    volatile limb_t *rp = (volatile limb_t *)ret;
    size_t i;

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++)
        rp[i] = 0;

#if defined(__GNUC__) || defined(__clang__)
    __asm__ __volatile__("" : : "r"(ret) : "memory");
#endif
}
381 | | |
/* Constant-time conditional zero: clear num bytes (whole limbs) at ret
 * when cbit is 1, leave them intact when 0.  Every limb is read and
 * written regardless of cbit. */
static inline void vec_czero(void *ret, size_t num, bool_t cbit)
{
    limb_t *rp = (limb_t *)ret;
    size_t i;
    limb_t mask;

    launder(cbit);
    mask = (limb_t)0 - (cbit^1);    /* cbit 1 -> 0 (zeroes), 0 -> all-ones (keeps) */

    num /= sizeof(limb_t);

    for (i = 0; i < num; i++)
        rp[i] &= mask;
}
396 | | |
/*
 * Some compilers get arguably overzealous(*) when passing pointer to
 * multi-dimensional array [such as vec384x] as 'const' argument.
 * General direction seems to be to legitimize such constification,
 * so it's argued that suppressing the warning is appropriate.
 *
 * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm
 */
#if defined(__INTEL_COMPILER)
# pragma warning(disable:167)
# pragma warning(disable:556)
#elif defined(__GNUC__) && !defined(__clang__) && (__STDC_VERSION__-0) < 202311
# pragma GCC diagnostic ignored "-Wpedantic"
#elif defined(_MSC_VER)
/* 4127: conditional expression is constant; 4189: unused local. */
# pragma warning(disable: 4127 4189)
#endif
413 | | |
/* stdlib.h is only available on hosted (non-freestanding, non-wasm)
 * targets. */
#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0
# include <stdlib.h>
#endif

/* Per-toolchain spelling of alloca. */
#if defined(__GNUC__)
# ifndef alloca
#  define alloca(s) __builtin_alloca(s)
# endif
#elif defined(__sun)
# include <alloca.h>
#elif defined(_WIN32)
# include <malloc.h>
# ifndef alloca
#  define alloca(s) _alloca(s)
# endif
#endif
430 | | |
431 | | #endif /* __BLS12_381_ASM_VECT_H__ */ |