Line data Source code
1 : /* Included by fd_bn254_g1.c and fd_bn254_g2.c, should not be used elsewhere. */
2 :
3 : /* Rundown of the BN254 GLV implementation:
4 :
5 : Context: https://bitcointalk.org/index.php?topic=3238.msg45565#msg45565
6 :
7 : BN254 is a "pairing-friendly" curve E: y^2 = x^3 + 3 over a prime field
8 : Fp, where:
9 : p = 21888242871839275222246405745257275088696311157297823662689037894645226208583
10 :
11 : The group G1 = E(Fp) has prime order:
12 : r = 21888242871839275222246405745257275088548364400416034343698204186575808495617
13 :
14 : A naive approach to scalar multiplication, [s]P, requires ~256 doublings
15 : and ~128 additions (double-and-add) in the worst case. The GLV method
16 : takes advantage of an easy to compute endomorphism to cut this roughly
17 : in half.
18 :
19 : For BN254, there exists a cube root of unity beta in Fp such that:
20 : phi: (x, y) -> (beta * x, y)
21 : is an endomorphism of E.
22 : That is, if point P = (x, y) is on the curve, then phi(P) = (beta * x, y)
23 : is also on the curve, since (beta*x)^3 = beta^3 * x^3) = x^3,
24 : so the curve equation is preserved.
25 :
26 : phi satisfies phi(P) = [lambda]P where lambda is a cube root of unity
27 : modulo r (the group order we defined above^):
28 : lambda = 4407920970296243842393367215006156084916469457145843978461
29 : (a root of X^2 + X + 1 = 0 mod r)
30 :
31 : Computing phi(P) = (beta * x, y) costs just one FP multiplication, but
32 : is equivalent to a full scalar multiplication by the ~254-bit scalar lambda.
33 :
34 : We want to "decompose" the input scalar in a way where a majority of the
35 : effort is re-used by the lambda computation, leaving the remaining operations
36 : smaller and faster to perform. Given a scalar s, we want to find small
37 : k1, k2 (each ~128-bit) such that:
38 : s = k1 + k2 * lambda (mod r)
39 : Then: [s]P = [k1]P + [k2](lambda * P) = [k1]P + [k2]phi(P).
40 :
41 : Using the "Straus-Shamir trick", https://pmc.ncbi.nlm.nih.gov/articles/PMC9028562/#app1-sensors-22-03083,
42 : this requires only ~128 doublings + ~128 additions instead of ~256 doublings.
43 :
44 : To decompose the scalar, we have a 2-dimensional lattice
45 : L = { (a, b) in Z^2 : a + b*lambda = 0 (mod r) }
46 :
47 : A reduced basis of L is given by three magnitudes:
48 : N_A = 147946756881789319000765030803803410728 (~128 bits, 2 limbs)
49 : N_B = 9931322734385697763 (~64 bits, 1 limb)
50 : N_C = 147946756881789319010696353538189108491 (~128 bits, 2 limbs)
51 :
52 : These are arranged differently depending on the group.
53 : G1:
54 : | +N_A +N_B |
55 : | -N_B +N_C |
56 : G2:
57 : | -N_C -N_B |
58 : | +N_B -N_A |
59 :
60 : We avoid big integer divisions by r using Babai's algorithm with
61 : precomputed fixed-point inverses:
62 : b1 = (s * g1) >> 256, where for group:
63 : G1: g1 = round(2^256 * N_C / r)
64 : G2: g1 = round(2^256 * N_A / r)
65 : b2 = (s * g2) >> 256 g2 = round(2^256 * N_B / r), 66-bit, 2 limbs)
66 :
67 : Then:
68 : G1: k1 = s - b1*N_A - b2*N_B, k2 = b1*N_B - b2*N_C
69 : G2: k1 = s - b1*N_C - b2*N_B, k2 = b2*N_A - b1*N_B
70 :
71 : For G1, k1 >= 0 always, k2 may be negative.
72 : For G2, both k1 and k2 may be negative. */
73 :
74 : /* beta in Montgomery form.
75 : 0x30644e72e131a0295e6dd9e7e0acccb0c28f069fbb966e3de4bd44e5607cfd48 */
76 : const fd_bn254_fp_t fd_bn254_const_beta_mont[1] = {{{
77 : 0x3350c88e13e80b9cUL, 0x7dce557cdb5e56b9UL, 0x6001b4b8b615564aUL, 0x2682e617020217e0UL
78 : }}};
79 :
80 : /* Lattice constants, see glv.py */
81 : const ulong na[ 2 ] = { 0x8211bbeb7d4f1128UL, 0x6f4d8248eeb859fcUL };
82 : const ulong nb[ 1 ] = { 0x89d3256894d213e3UL };
83 : const ulong nc[ 2 ] = { 0x0be4e1541221250bUL, 0x6f4d8248eeb859fdUL };
84 :
85 : /* g2 = round(2^256 * N_B / r), 66-bit (2 limbs). Same for G1 and G2. */
86 : const ulong g2_const[ 2 ] = { 0xd91d232ec7e0b3d7UL, 0x0000000000000002UL };
87 :
88 : /* Multiply 4-limb scalar s by a 3-limb constant g.
89 : Returns top 3 limbs. */
90 : static inline void
91 : fd_bn254_glv_sxg3( ulong out[ 3 ],
92 : fd_bn254_scalar_t const * s,
93 361 : ulong const g[ 3 ] ) {
94 361 : uint128 s0 = s->limbs[0];
95 361 : uint128 s1 = s->limbs[1];
96 361 : uint128 s2 = s->limbs[2];
97 361 : uint128 s3 = s->limbs[3];
98 361 : uint128 acc;
99 361 : acc = s0 * g[ 0 ];
100 361 : acc = s1 * g[ 0 ] + s0 * g[ 1 ] + (ulong)(acc >> 64);
101 361 : acc = s2 * g[ 0 ] + s1 * g[ 1 ] + s0 * g[ 2 ] + (ulong)(acc >> 64);
102 361 : acc = s3 * g[ 0 ] + s2 * g[ 1 ] + s1 * g[ 2 ] + (ulong)(acc >> 64);
103 361 : acc = s3 * g[ 1 ] + s2 * g[ 2 ] + (ulong)(acc >> 64); out[ 0 ] = (ulong)acc;
104 361 : acc = s3 * g[ 2 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
105 361 : acc = (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
106 361 : }
107 :
108 : /* Same, but for a 2-limb constant g. */
109 : static inline void
110 : fd_bn254_glv_sxg2( ulong out[ 2 ],
111 : fd_bn254_scalar_t const * s,
112 361 : ulong const g[ 2 ] ) {
113 361 : uint128 s0 = s->limbs[0];
114 361 : uint128 s1 = s->limbs[1];
115 361 : uint128 s2 = s->limbs[2];
116 361 : uint128 s3 = s->limbs[3];
117 361 : uint128 acc;
118 361 : acc = s0 * g[ 0 ];
119 361 : acc = s1 * g[ 0 ] + s0 * g[ 1 ] + (ulong)(acc >> 64);
120 361 : acc = s2 * g[ 0 ] + s1 * g[ 1 ] + (ulong)(acc >> 64);
121 361 : acc = s3 * g[ 0 ] + s2 * g[ 1 ] + (ulong)(acc >> 64);
122 361 : acc = s3 * g[ 1 ] + (ulong)(acc >> 64); out[ 0 ] = (ulong)acc;
123 361 : acc = (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
124 361 : }
125 :
126 : /* Multiply 3-limb a by 2-limb n, store low 4 limbs into out. */
127 : static inline void
128 : fd_bn254_glv_mul3x2( ulong out[ 4 ],
129 : ulong const a[ 3 ],
130 361 : ulong const n[ 2 ] ) {
131 361 : uint128 acc;
132 361 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
133 361 : acc = (uint128)a[ 1 ] * n[ 0 ] + (uint128)a[ 0 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
134 361 : acc = (uint128)a[ 2 ] * n[ 0 ] + (uint128)a[ 1 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
135 361 : acc = (uint128)a[ 2 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 3 ] = (ulong)acc;
136 361 : }
137 :
138 : /* Multiply 3-limb by 1-limb, store into 4-limb. */
139 : static inline void
140 : fd_bn254_glv_mul3x1( ulong out[ 4 ],
141 : ulong const a[ 3 ],
142 361 : ulong const n[ 1 ] ) {
143 361 : uint128 acc;
144 361 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
145 361 : acc = (uint128)a[ 1 ] * n[ 0 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
146 361 : acc = (uint128)a[ 2 ] * n[ 0 ] + (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
147 361 : acc = (ulong)(acc >> 64); out[ 3 ] = (ulong)acc;
148 361 : }
149 :
150 : /* Multiply 2-limb by 2-limb, store into 4-limb. */
151 : static inline void
152 : fd_bn254_glv_mul2x2( ulong out[ 4 ],
153 : ulong const a[ 2 ],
154 361 : ulong const n[ 2 ] ) {
155 361 : uint128 acc;
156 361 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
157 361 : acc = (uint128)a[ 1 ] * n[ 0 ] + (uint128)a[ 0 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
158 361 : acc = (uint128)a[ 1 ] * n[ 1 ] + (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
159 361 : acc = (ulong)(acc >> 64); out[ 3 ] = (ulong)acc;
160 361 : }
161 :
162 : /* Multiply 2-limb by 1-limb, stores 3-limbs. */
163 : static inline void
164 : fd_bn254_glv_mul2x1( ulong out[ 3 ],
165 : ulong const a[ 2 ],
166 362 : ulong const n[ 1 ] ) {
167 362 : uint128 acc;
168 362 : acc = (uint128)a[ 0 ] * n[ 0 ]; out[ 0 ] = (ulong)acc;
169 362 : acc = (uint128)a[ 1 ] * n[ 0 ] + (ulong)(acc >> 64); out[ 1 ] = (ulong)acc;
170 362 : acc = (ulong)(acc >> 64); out[ 2 ] = (ulong)acc;
171 362 : }
172 :
173 : /* 4-limb addition: out = a + b. Returns carry. */
174 : static inline ulong
175 : fd_bn254_glv_add4( ulong out[ 4 ],
176 : ulong const a[ 4 ],
177 361 : ulong const b[ 4 ] ) {
178 361 : ulong carry = 0;
179 1805 : for( int j = 0; j < 4; j++ ) {
180 1444 : uint128 acc = (uint128)a[ j ] + b[ j ] + carry;
181 1444 : out[ j ] = (ulong)acc;
182 1444 : carry = (ulong)(acc >> 64);
183 1444 : }
184 361 : return carry;
185 361 : }
186 :
187 : /* 4-limb subtraction: out = a - b. Returns borrow (1 if a < b). */
188 : static inline ulong
189 : fd_bn254_glv_sub4( ulong out[ 4 ],
190 : ulong const a[ 4 ],
191 724 : ulong const b[ 4 ] ) {
192 724 : ulong borrow = 0;
193 3616 : for( int j = 0; j < 4; j++ ) {
194 2892 : ulong av = a[ j ];
195 2892 : ulong bv = b[ j ];
196 2892 : ulong diff = av - bv - borrow;
197 2892 : borrow = ( av < bv || (borrow && av == bv) ) ? 1UL : 0UL;
198 2892 : out[ j ] = diff;
199 2892 : }
200 724 : return borrow;
201 724 : }
202 :
203 : /* Two's complement negation of a 4-limb value in-place. */
204 : static inline void
205 0 : fd_bn254_glv_negate4( ulong v[ 4 ] ) {
206 0 : ulong carry = 1;
207 0 : for( int j = 0; j < 4; j++ ) {
208 0 : uint128 sum = (uint128)(~v[ j ]) + carry;
209 0 : v[ j ] = (ulong)sum;
210 0 : carry = (ulong)(sum >> 64);
211 0 : }
212 0 : }
|