Coverage Report

Created: 2023-06-07 07:13

/src/boringssl/crypto/poly1305/poly1305_vec.c
Every instrumented line in this file has an execution count of 0; nothing in this vectorized Poly1305 implementation was exercised.

Source:
/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size

#include <openssl/poly1305.h>

#include <assert.h>

#include "../internal.h"


#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  //  80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;        //   8 bytes
  uint64_t leftover;       //   8 bytes
  uint8_t buffer[64];      //  64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static_assert(sizeof(struct poly1305_state_internal_t) + 63 <=
                  sizeof(poly1305_state),
              "poly1305_state isn't large enough to hold aligned "
              "poly1305_state_internal_t");

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
  t0 = CRYPTO_load_u64_le(key + 0);
  t1 = CRYPTO_load_u64_le(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = CRYPTO_load_u32_le(key + 16);
  p->R23.d[3] = CRYPTO_load_u32_le(key + 20);
  p->R24.d[1] = CRYPTO_load_u32_le(key + 24);
  p->R24.d[3] = CRYPTO_load_u32_le(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // Work around a C language bug. See https://crbug.com/1019588.
  if (bytes == 0) {
    return;
  }

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = CRYPTO_load_u64_le(m + 0);
  t1 = CRYPTO_load_u64_le(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

// final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = CRYPTO_load_u64_le(m + 0);
  t1 = CRYPTO_load_u64_le(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1)+c;

  CRYPTO_store_u64_le(mac + 0, ((h0) | (h1 << 44)));
  CRYPTO_store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64
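
Since every count in this report is 0, the natural follow-up is a test program that drives this path. Below is a minimal sketch, assuming an x86-64 build in which BORINGSSL_HAS_UINT128 is defined and BoringSSL's crypto library is linked; the all-zero key and 96-byte message are illustrative values only, chosen so that the first update exceeds 32 bytes (entering the accelerated branch) and then supplies one full 64-byte block to poly1305_blocks.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <openssl/poly1305.h>

int main(void) {
  // Illustrative inputs only: a 32-byte all-zero key and a 96-byte all-zero
  // message. Any key/message of these sizes exercises the same code paths.
  static const uint8_t key[32] = {0};
  static const uint8_t msg[96] = {0};
  uint8_t mac[16];

  poly1305_state st;
  CRYPTO_poly1305_init(&st, key);                 // clamps r, stores the pad, zeroes H
  CRYPTO_poly1305_update(&st, msg, sizeof(msg));  // 96 bytes: first block + one 64-byte block
  CRYPTO_poly1305_finish(&st, mac);               // combines the lanes and emits the 16-byte tag

  for (size_t i = 0; i < sizeof(mac); i++) {
    printf("%02x", mac[i]);
  }
  printf("\n");
  return 0;
}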