Coverage Report

Created: 2024-11-21 06:38

/src/BearSSL/src/symcipher/poly1305_ctmul.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24
25
#include "inner.h"
26
27
/*
28
 * Perform the inner processing of blocks for Poly1305. The accumulator
29
 * and the r key are provided as arrays of 26-bit words (these words
30
 * are allowed to have an extra bit, i.e. use 27 bits).
31
 *
32
 * On output, all accumulator words fit in 26 bits, except acc[1], which
33
 * may be slightly larger (but by a very small amount only).
34
 */
35
static void
36
poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
37
2.49k
{
38
  /*
39
   * Implementation notes: we split the 130-bit values into five
40
   * 26-bit words. This gives us some space for carries.
41
   *
42
   * This code is inspired by the public-domain code available
43
   * on:
44
   *      https://github.com/floodyberry/poly1305-donna
45
   *
46
   * Since we compute modulo 2^130-5, the "upper words" become
47
   * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
48
   */
49
2.49k
  const unsigned char *buf;
50
2.49k
  uint32_t a0, a1, a2, a3, a4;
51
2.49k
  uint32_t r0, r1, r2, r3, r4;
52
2.49k
  uint32_t u1, u2, u3, u4;
53
54
2.49k
  r0 = r[0];
55
2.49k
  r1 = r[1];
56
2.49k
  r2 = r[2];
57
2.49k
  r3 = r[3];
58
2.49k
  r4 = r[4];
59
60
2.49k
  u1 = r1 * 5;
61
2.49k
  u2 = r2 * 5;
62
2.49k
  u3 = r3 * 5;
63
2.49k
  u4 = r4 * 5;
64
65
2.49k
  a0 = acc[0];
66
2.49k
  a1 = acc[1];
67
2.49k
  a2 = acc[2];
68
2.49k
  a3 = acc[3];
69
2.49k
  a4 = acc[4];
70
71
2.49k
  buf = data;
72
17.2k
  while (len > 0) {
73
14.7k
    uint64_t w0, w1, w2, w3, w4;
74
14.7k
    uint64_t c;
75
14.7k
    unsigned char tmp[16];
76
77
    /*
78
     * If there is a partial block, right-pad it with zeros.
79
     */
80
14.7k
    if (len < 16) {
81
543
      memset(tmp, 0, sizeof tmp);
82
543
      memcpy(tmp, buf, len);
83
543
      buf = tmp;
84
543
      len = 16;
85
543
    }
86
87
    /*
88
     * Decode next block and apply the "high bit"; that value
89
     * is added to the accumulator.
90
     */
91
14.7k
    a0 += br_dec32le(buf) & 0x03FFFFFF;
92
14.7k
    a1 += (br_dec32le(buf +  3) >> 2) & 0x03FFFFFF;
93
14.7k
    a2 += (br_dec32le(buf +  6) >> 4) & 0x03FFFFFF;
94
14.7k
    a3 += (br_dec32le(buf +  9) >> 6) & 0x03FFFFFF;
95
14.7k
    a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000;
96
97
    /*
98
     * Compute multiplication.
99
     */
100
368k
#define M(x, y)   ((uint64_t)(x) * (uint64_t)(y))
101
102
14.7k
    w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1);
103
14.7k
    w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2);
104
14.7k
    w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3);
105
14.7k
    w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4);
106
14.7k
    w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0);
107
108
14.7k
#undef M
109
    /*
110
     * Perform some (partial) modular reduction. This step is
111
     * enough to keep values in ranges such that there won't
112
     * be carry overflows. Most of the reduction was done in
113
     * the multiplication step (by using the 'u*' values, and
114
     * using the fact that 2^130 = 5 mod p); here we perform
115
     * some carry propagation.
116
     */
117
14.7k
    c = w0 >> 26;
118
14.7k
    a0 = (uint32_t)w0 & 0x3FFFFFF;
119
14.7k
    w1 += c;
120
14.7k
    c = w1 >> 26;
121
14.7k
    a1 = (uint32_t)w1 & 0x3FFFFFF;
122
14.7k
    w2 += c;
123
14.7k
    c = w2 >> 26;
124
14.7k
    a2 = (uint32_t)w2 & 0x3FFFFFF;
125
14.7k
    w3 += c;
126
14.7k
    c = w3 >> 26;
127
14.7k
    a3 = (uint32_t)w3 & 0x3FFFFFF;
128
14.7k
    w4 += c;
129
14.7k
    c = w4 >> 26;
130
14.7k
    a4 = (uint32_t)w4 & 0x3FFFFFF;
131
14.7k
    a0 += (uint32_t)c * 5;
132
14.7k
    a1 += a0 >> 26;
133
14.7k
    a0 &= 0x3FFFFFF;
134
135
14.7k
    buf += 16;
136
14.7k
    len -= 16;
137
14.7k
  }
138
139
2.49k
  acc[0] = a0;
140
2.49k
  acc[1] = a1;
141
2.49k
  acc[2] = a2;
142
2.49k
  acc[3] = a3;
143
2.49k
  acc[4] = a4;
144
2.49k
}
145
146
/* see bearssl_block.h */
147
void
148
br_poly1305_ctmul_run(const void *key, const void *iv,
149
  void *data, size_t len, const void *aad, size_t aad_len,
150
  void *tag, br_chacha20_run ichacha, int encrypt)
151
832
{
152
832
  unsigned char pkey[32], foot[16];
153
832
  uint32_t r[5], acc[5], cc, ctl, hi;
154
832
  uint64_t w;
155
832
  int i;
156
157
  /*
158
   * Compute the MAC key. The 'r' value is the first 16 bytes of
159
   * pkey[].
160
   */
161
832
  memset(pkey, 0, sizeof pkey);
162
832
  ichacha(key, iv, 0, pkey, sizeof pkey);
163
164
  /*
165
   * If encrypting, ChaCha20 must run first, followed by Poly1305.
166
   * When decrypting, the operations are reversed.
167
   */
168
832
  if (encrypt) {
169
394
    ichacha(key, iv, 1, data, len);
170
394
  }
171
172
  /*
173
   * Run Poly1305. We must process the AAD, then ciphertext, then
174
   * the footer (with the lengths). Note that the AAD and ciphertext
175
   * are meant to be padded with zeros up to the next multiple of 16,
176
   * and the length of the footer is 16 bytes as well.
177
   */
178
179
  /*
180
   * Decode the 'r' value into 26-bit words, with the "clamping"
181
   * operation applied.
182
   */
183
832
  r[0] = br_dec32le(pkey) & 0x03FFFFFF;
184
832
  r[1] = (br_dec32le(pkey +  3) >> 2) & 0x03FFFF03;
185
832
  r[2] = (br_dec32le(pkey +  6) >> 4) & 0x03FFC0FF;
186
832
  r[3] = (br_dec32le(pkey +  9) >> 6) & 0x03F03FFF;
187
832
  r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
188
189
  /*
190
   * Accumulator is 0.
191
   */
192
832
  memset(acc, 0, sizeof acc);
193
194
  /*
195
   * Process the additional authenticated data, ciphertext, and
196
   * footer in due order.
197
   */
198
832
  br_enc64le(foot, (uint64_t)aad_len);
199
832
  br_enc64le(foot + 8, (uint64_t)len);
200
832
  poly1305_inner(acc, r, aad, aad_len);
201
832
  poly1305_inner(acc, r, data, len);
202
832
  poly1305_inner(acc, r, foot, sizeof foot);
203
204
  /*
205
   * Finalise modular reduction. This is done with carry propagation
206
   * and applying the '2^130 = 5 mod p' rule. Note that the output
207
   * of poly1305_inner() is already mostly reduced, since only
208
   * acc[1] may be (very slightly) above 2^26. A single loop back
209
   * to acc[1] will be enough to make the value fit in 130 bits.
210
   */
211
832
  cc = 0;
212
5.82k
  for (i = 1; i <= 6; i ++) {
213
4.99k
    int j;
214
215
4.99k
    j = (i >= 5) ? i - 5 : i;
216
4.99k
    acc[j] += cc;
217
4.99k
    cc = acc[j] >> 26;
218
4.99k
    acc[j] &= 0x03FFFFFF;
219
4.99k
  }
220
221
  /*
222
   * We may still have a value in the 2^130-5..2^130-1 range, in
223
   * which case we must reduce it again. The code below selects,
224
   * in constant time, between 'acc' and 'acc-p'.
225
   */
226
832
  ctl = GT(acc[0], 0x03FFFFFA);
227
4.16k
  for (i = 1; i < 5; i ++) {
228
3.32k
    ctl &= EQ(acc[i], 0x03FFFFFF);
229
3.32k
  }
230
832
  cc = 5;
231
4.99k
  for (i = 0; i < 5; i ++) {
232
4.16k
    uint32_t t;
233
234
4.16k
    t = (acc[i] + cc);
235
4.16k
    cc = t >> 26;
236
4.16k
    t &= 0x03FFFFFF;
237
4.16k
    acc[i] = MUX(ctl, t, acc[i]);
238
4.16k
  }
239
240
  /*
241
   * Convert back the accumulator to 32-bit words, and add the
242
   * 's' value (second half of pkey[]). That addition is done
243
   * modulo 2^128.
244
   */
245
832
  w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16);
246
832
  br_enc32le((unsigned char *)tag, (uint32_t)w);
247
832
  w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20);
248
832
  br_enc32le((unsigned char *)tag + 4, (uint32_t)w);
249
832
  w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24);
250
832
  br_enc32le((unsigned char *)tag + 8, (uint32_t)w);
251
832
  hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28);
252
832
  br_enc32le((unsigned char *)tag + 12, hi);
253
254
  /*
255
   * If decrypting, then ChaCha20 runs _after_ Poly1305.
256
   */
257
832
  if (!encrypt) {
258
438
    ichacha(key, iv, 1, data, len);
259
438
  }
260
832
}