/src/BearSSL/src/symcipher/poly1305_ctmul.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> |
3 | | * |
4 | | * Permission is hereby granted, free of charge, to any person obtaining |
5 | | * a copy of this software and associated documentation files (the |
6 | | * "Software"), to deal in the Software without restriction, including |
7 | | * without limitation the rights to use, copy, modify, merge, publish, |
8 | | * distribute, sublicense, and/or sell copies of the Software, and to |
9 | | * permit persons to whom the Software is furnished to do so, subject to |
10 | | * the following conditions: |
11 | | * |
12 | | * The above copyright notice and this permission notice shall be |
13 | | * included in all copies or substantial portions of the Software. |
14 | | * |
15 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
16 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
17 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
18 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
19 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
20 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
21 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | |
25 | | #include "inner.h" |
26 | | |
/*
 * Absorb 'len' bytes read from 'data' into the Poly1305 accumulator
 * 'acc', under the key words 'r'. Both 'acc' and 'r' are arrays of
 * five 26-bit limbs, least significant limb first; each limb is
 * allowed to carry one extra bit (i.e. to use 27 bits).
 *
 * The data is consumed by blocks of 16 bytes; a trailing partial block
 * is right-padded with zeros before being processed.
 *
 * On return, every limb of 'acc' fits on 26 bits, with the exception
 * of acc[1], which may be larger by a very small amount.
 */
static void
poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
{
	/*
	 * Representation: a 130-bit value is held as five limbs of 26
	 * bits each, which leaves headroom for carries in 32-bit and
	 * 64-bit words.
	 *
	 * The structure follows the public-domain code from:
	 *    https://github.com/floodyberry/poly1305-donna
	 *
	 * Computations are modulo p = 2^130-5, so anything that lands
	 * at weight 2^130 or above folds back into the low limbs with
	 * a factor of 5 (x*2^130 = x*5 mod p).
	 */
	const uint32_t limb_mask = 0x03FFFFFF;
	const unsigned char *src;
	uint32_t h0, h1, h2, h3, h4;
	uint32_t k0, k1, k2, k3, k4;
	uint32_t k1x5, k2x5, k3x5, k4x5;

	src = data;

	h0 = acc[0];
	h1 = acc[1];
	h2 = acc[2];
	h3 = acc[3];
	h4 = acc[4];

	k0 = r[0];
	k1 = r[1];
	k2 = r[2];
	k3 = r[3];
	k4 = r[4];

	/*
	 * Key limbs pre-multiplied by 5; they are used for the partial
	 * products whose weight reaches 2^130.
	 */
	k1x5 = 5 * k1;
	k2x5 = 5 * k2;
	k3x5 = 5 * k3;
	k4x5 = 5 * k4;

	while (len > 0) {
		unsigned char padded[16];
		uint64_t d0, d1, d2, d3, d4;

		/*
		 * Short final block: work on a zero-padded copy, and
		 * account for a full block.
		 */
		if (len < 16) {
			memset(padded, 0, sizeof padded);
			memcpy(padded, src, len);
			src = padded;
			len = 16;
		}

		/*
		 * Split the 16-byte block into 26-bit limbs and add it
		 * to the accumulator. The top limb also receives the
		 * extra "high bit" (bit 24 of that limb).
		 */
		h0 += br_dec32le(src) & limb_mask;
		h1 += (br_dec32le(src + 3) >> 2) & limb_mask;
		h2 += (br_dec32le(src + 6) >> 4) & limb_mask;
		h3 += (br_dec32le(src + 9) >> 6) & limb_mask;
		h4 += (br_dec32le(src + 12) >> 8) | 0x01000000;

		/*
		 * Schoolbook multiplication of the accumulator by the
		 * key; high partial products use the 5*k values, which
		 * performs most of the modular reduction on the fly.
		 */
#define MUL64(x, y)   ((uint64_t)(x) * (uint64_t)(y))

		d0 = MUL64(h0, k0) + MUL64(h1, k4x5) + MUL64(h2, k3x5)
			+ MUL64(h3, k2x5) + MUL64(h4, k1x5);
		d1 = MUL64(h0, k1) + MUL64(h1, k0) + MUL64(h2, k4x5)
			+ MUL64(h3, k3x5) + MUL64(h4, k2x5);
		d2 = MUL64(h0, k2) + MUL64(h1, k1) + MUL64(h2, k0)
			+ MUL64(h3, k4x5) + MUL64(h4, k3x5);
		d3 = MUL64(h0, k3) + MUL64(h1, k2) + MUL64(h2, k1)
			+ MUL64(h3, k0) + MUL64(h4, k4x5);
		d4 = MUL64(h0, k4) + MUL64(h1, k3) + MUL64(h2, k2)
			+ MUL64(h3, k1) + MUL64(h4, k0);

#undef MUL64

		/*
		 * Carry propagation (partial reduction): each 64-bit
		 * word keeps its low 26 bits and hands the rest to the
		 * next one. This keeps all values small enough that no
		 * carry overflow can happen on the next iteration.
		 */
		h0 = (uint32_t)d0 & limb_mask;
		d1 += d0 >> 26;
		h1 = (uint32_t)d1 & limb_mask;
		d2 += d1 >> 26;
		h2 = (uint32_t)d2 & limb_mask;
		d3 += d2 >> 26;
		h3 = (uint32_t)d3 & limb_mask;
		d4 += d3 >> 26;
		h4 = (uint32_t)d4 & limb_mask;

		/*
		 * What overflows the top limb has weight 2^130: it
		 * comes back into the low limb with a factor of 5. The
		 * resulting extra bits of h0 go to h1, which is why h1
		 * may end up slightly above 26 bits.
		 */
		h0 += (uint32_t)(d4 >> 26) * 5;
		h1 += h0 >> 26;
		h0 &= limb_mask;

		src += 16;
		len -= 16;
	}

	acc[0] = h0;
	acc[1] = h1;
	acc[2] = h2;
	acc[3] = h3;
	acc[4] = h4;
}
145 | | |
146 | | /* see bearssl_block.h */ |
147 | | void |
148 | | br_poly1305_ctmul_run(const void *key, const void *iv, |
149 | | void *data, size_t len, const void *aad, size_t aad_len, |
150 | | void *tag, br_chacha20_run ichacha, int encrypt) |
151 | 832 | { |
152 | 832 | unsigned char pkey[32], foot[16]; |
153 | 832 | uint32_t r[5], acc[5], cc, ctl, hi; |
154 | 832 | uint64_t w; |
155 | 832 | int i; |
156 | | |
157 | | /* |
158 | | * Compute the MAC key. The 'r' value is the first 16 bytes of |
159 | | * pkey[]. |
160 | | */ |
161 | 832 | memset(pkey, 0, sizeof pkey); |
162 | 832 | ichacha(key, iv, 0, pkey, sizeof pkey); |
163 | | |
164 | | /* |
165 | | * If encrypting, ChaCha20 must run first, followed by Poly1305. |
166 | | * When decrypting, the operations are reversed. |
167 | | */ |
168 | 832 | if (encrypt) { |
169 | 394 | ichacha(key, iv, 1, data, len); |
170 | 394 | } |
171 | | |
172 | | /* |
173 | | * Run Poly1305. We must process the AAD, then ciphertext, then |
174 | | * the footer (with the lengths). Note that the AAD and ciphertext |
175 | | * are meant to be padded with zeros up to the next multiple of 16, |
176 | | * and the length of the footer is 16 bytes as well. |
177 | | */ |
178 | | |
179 | | /* |
180 | | * Decode the 'r' value into 26-bit words, with the "clamping" |
181 | | * operation applied. |
182 | | */ |
183 | 832 | r[0] = br_dec32le(pkey) & 0x03FFFFFF; |
184 | 832 | r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; |
185 | 832 | r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; |
186 | 832 | r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; |
187 | 832 | r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; |
188 | | |
189 | | /* |
190 | | * Accumulator is 0. |
191 | | */ |
192 | 832 | memset(acc, 0, sizeof acc); |
193 | | |
194 | | /* |
195 | | * Process the additional authenticated data, ciphertext, and |
196 | | * footer in due order. |
197 | | */ |
198 | 832 | br_enc64le(foot, (uint64_t)aad_len); |
199 | 832 | br_enc64le(foot + 8, (uint64_t)len); |
200 | 832 | poly1305_inner(acc, r, aad, aad_len); |
201 | 832 | poly1305_inner(acc, r, data, len); |
202 | 832 | poly1305_inner(acc, r, foot, sizeof foot); |
203 | | |
204 | | /* |
205 | | * Finalise modular reduction. This is done with carry propagation |
206 | | * and applying the '2^130 = -5 mod p' rule. Note that the output |
207 | | * of poly1035_inner() is already mostly reduced, since only |
208 | | * acc[1] may be (very slightly) above 2^26. A single loop back |
209 | | * to acc[1] will be enough to make the value fit in 130 bits. |
210 | | */ |
211 | 832 | cc = 0; |
212 | 5.82k | for (i = 1; i <= 6; i ++) { |
213 | 4.99k | int j; |
214 | | |
215 | 4.99k | j = (i >= 5) ? i - 5 : i; |
216 | 4.99k | acc[j] += cc; |
217 | 4.99k | cc = acc[j] >> 26; |
218 | 4.99k | acc[j] &= 0x03FFFFFF; |
219 | 4.99k | } |
220 | | |
221 | | /* |
222 | | * We may still have a value in the 2^130-5..2^130-1 range, in |
223 | | * which case we must reduce it again. The code below selects, |
224 | | * in constant-time, between 'acc' and 'acc-p', |
225 | | */ |
226 | 832 | ctl = GT(acc[0], 0x03FFFFFA); |
227 | 4.16k | for (i = 1; i < 5; i ++) { |
228 | 3.32k | ctl &= EQ(acc[i], 0x03FFFFFF); |
229 | 3.32k | } |
230 | 832 | cc = 5; |
231 | 4.99k | for (i = 0; i < 5; i ++) { |
232 | 4.16k | uint32_t t; |
233 | | |
234 | 4.16k | t = (acc[i] + cc); |
235 | 4.16k | cc = t >> 26; |
236 | 4.16k | t &= 0x03FFFFFF; |
237 | 4.16k | acc[i] = MUX(ctl, t, acc[i]); |
238 | 4.16k | } |
239 | | |
240 | | /* |
241 | | * Convert back the accumulator to 32-bit words, and add the |
242 | | * 's' value (second half of pkey[]). That addition is done |
243 | | * modulo 2^128. |
244 | | */ |
245 | 832 | w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16); |
246 | 832 | br_enc32le((unsigned char *)tag, (uint32_t)w); |
247 | 832 | w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20); |
248 | 832 | br_enc32le((unsigned char *)tag + 4, (uint32_t)w); |
249 | 832 | w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24); |
250 | 832 | br_enc32le((unsigned char *)tag + 8, (uint32_t)w); |
251 | 832 | hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28); |
252 | 832 | br_enc32le((unsigned char *)tag + 12, hi); |
253 | | |
254 | | /* |
255 | | * If decrypting, then ChaCha20 runs _after_ Poly1305. |
256 | | */ |
257 | 832 | if (!encrypt) { |
258 | 438 | ichacha(key, iv, 1, data, len); |
259 | 438 | } |
260 | 832 | } |