Coverage Report

Created: 2026-02-16 07:12

/src/boringssl/crypto/poly1305/poly1305_vec.cc
Line | Count | Source
1
// Copyright 2014 The BoringSSL Authors
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// This implementation of poly1305 is by Andrew Moon
16
// (https://github.com/floodyberry/poly1305-donna) and released as public
17
// domain. It implements SIMD vectorization based on the algorithm described in
18
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
19
// block size
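For orientation (not part of the source), the grouping this comment refers to works out as follows. Applying the scalar recurrence h <- (h + m_i) * r mod 2^130 - 5 to four 16-byte blocks m1..m4 (each block including its 2^128 padding bit) expands to

    h' = h*r^4 + m1*r^4 + m2*r^3 + m3*r^2 + m4*r

The two 64-bit SSE2 lanes below accumulate alternating blocks: poly1305_first_block seeds the lanes with [m1, m2], each 64-byte iteration of poly1305_blocks computes H = H*[r^4, r^4] + [m_i, m_{i+1}]*[r^2, r^2] + [m_{i+2}, m_{i+3}], and poly1305_combine multiplies the lanes by [r^2, r] and sums them, which reproduces the scalar result.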
20
21
#include <openssl/poly1305.h>
22
23
#include <assert.h>
24
25
#include "../internal.h"
26
27
28
#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)
29
30
#include <emmintrin.h>
31
32
33
using namespace bssl;
34
35
namespace {
36
37
typedef __m128i xmmi;
38
39
alignas(16) const uint32_t poly1305_x64_sse2_message_mask[4] = {
40
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
41
alignas(16) const uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
42
alignas(16) const uint32_t poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0,
43
                                                           (1 << 24), 0};
44
45
0
uint128_t add128(uint128_t a, uint128_t b) { return a + b; }
46
47
0
uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }
48
49
0
uint128_t mul64x64_128(uint64_t a, uint64_t b) { return (uint128_t)a * b; }
50
51
0
uint64_t lo128(uint128_t a) { return (uint64_t)a; }
52
53
0
uint64_t shr128(uint128_t v, const int shift) { return (uint64_t)(v >> shift); }
54
55
0
uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
56
0
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
57
0
}
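A standalone sketch of how these helpers are used later, in the scalar tail of CRYPTO_poly1305_finish, to split a 16-byte little-endian block into 44-bit limbs. Illustrative only; load64le is a hypothetical stand-in for CRYPTO_load_u64_le, and uint128_t is assumed to be unsigned __int128 as implied by BORINGSSL_HAS_UINT128.

#include <stdint.h>

typedef unsigned __int128 u128;  // assumed equivalent of uint128_t above

// Hypothetical little-endian load, standing in for CRYPTO_load_u64_le.
static uint64_t load64le(const uint8_t *p) {
  uint64_t v = 0;
  for (int i = 0; i < 8; i++) {
    v |= (uint64_t)p[i] << (8 * i);
  }
  return v;
}

// Split a 128-bit block into limbs of 44, 44 and 40 bits (the 2^128 pad bit
// is added separately by the caller, as in the finish routine below).
void split_into_44bit_limbs(const uint8_t block[16], uint64_t limb[3]) {
  uint64_t t0 = load64le(block + 0);
  uint64_t t1 = load64le(block + 8);
  limb[0] = t0 & UINT64_C(0xfffffffffff);             // bits  0..43
  limb[1] = (uint64_t)((((u128)t1 << 64) | t0) >> 44) &
            UINT64_C(0xfffffffffff);                  // bits 44..87
  limb[2] = t1 >> 24;                                 // bits 88..127
}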
58
59
typedef struct poly1305_power_t {
60
  union {
61
    xmmi v;
62
    uint64_t u[2];
63
    uint32_t d[4];
64
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
65
} poly1305_power;
66
67
typedef struct poly1305_state_internal_t {
68
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
69
                          bytes of free storage */
70
  union {
71
    xmmi H[5];  //  80 bytes
72
    uint64_t HH[10];
73
  };
74
  // uint64_t r0,r1,r2;       [24 bytes]
75
  // uint64_t pad0,pad1;      [16 bytes]
76
  uint64_t started;        //   8 bytes
77
  uint64_t leftover;       //   8 bytes
78
  uint8_t buffer[64];      //  64 bytes
79
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
80
                              alignment = 511 bytes raw */
81
82
static_assert(sizeof(struct poly1305_state_internal_t) + 63 <=
83
                  sizeof(poly1305_state),
84
              "poly1305_state isn't large enough to hold aligned "
85
              "poly1305_state_internal_t");
86
87
0
poly1305_state_internal *poly1305_aligned_state(poly1305_state *state) {
88
0
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
89
0
}
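The cast-and-mask above is the usual round-up-to-alignment idiom; a minimal restatement (illustrative only):

#include <stdint.h>

// Round an address value up to the next multiple of 64.
static uint64_t round_up_to_64(uint64_t addr) {
  return (addr + 63) & ~(uint64_t)63;
}
// e.g. round_up_to_64(0x1001) == 0x1040 and round_up_to_64(0x1040) == 0x1040;
// the static_assert above guarantees poly1305_state leaves 63 spare bytes to
// absorb this shift.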
90
91
0
size_t poly1305_min(size_t a, size_t b) { return (a < b) ? a : b; }
92
93
}  // namespace
94
95
0
void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
96
0
  poly1305_state_internal *st = poly1305_aligned_state(state);
97
0
  poly1305_power *p;
98
0
  uint64_t r0, r1, r2;
99
0
  uint64_t t0, t1;
100
101
  // clamp key
102
0
  t0 = CRYPTO_load_u64_le(key + 0);
103
0
  t1 = CRYPTO_load_u64_le(key + 8);
104
0
  r0 = t0 & 0xffc0fffffff;
105
0
  t0 >>= 44;
106
0
  t0 |= t1 << 20;
107
0
  r1 = t0 & 0xfffffc0ffff;
108
0
  t1 >>= 24;
109
0
  r2 = t1 & 0x00ffffffc0f;
110
111
  // store r in un-used space of st->P[1]
112
0
  p = &st->P[1];
113
0
  p->R20.d[1] = (uint32_t)(r0);
114
0
  p->R20.d[3] = (uint32_t)(r0 >> 32);
115
0
  p->R21.d[1] = (uint32_t)(r1);
116
0
  p->R21.d[3] = (uint32_t)(r1 >> 32);
117
0
  p->R22.d[1] = (uint32_t)(r2);
118
0
  p->R22.d[3] = (uint32_t)(r2 >> 32);
119
120
  // store pad
121
0
  p->R23.d[1] = CRYPTO_load_u32_le(key + 16);
122
0
  p->R23.d[3] = CRYPTO_load_u32_le(key + 20);
123
0
  p->R24.d[1] = CRYPTO_load_u32_le(key + 24);
124
0
  p->R24.d[3] = CRYPTO_load_u32_le(key + 28);
125
126
  // H = 0
127
0
  st->H[0] = _mm_setzero_si128();
128
0
  st->H[1] = _mm_setzero_si128();
129
0
  st->H[2] = _mm_setzero_si128();
130
0
  st->H[3] = _mm_setzero_si128();
131
0
  st->H[4] = _mm_setzero_si128();
132
133
0
  st->started = 0;
134
0
  st->leftover = 0;
135
0
}
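For reference, the three masks applied to r0, r1 and r2 above (0xffc0fffffff, 0xfffffc0ffff, 0x00ffffffc0f) are the standard Poly1305 key clamp expressed on the 44/44/42-bit limbs. A byte-wise sketch of the same clamp, in the RFC 8439 formulation (illustrative only):

#include <stdint.h>

// Clear the top 4 bits of r[3], r[7], r[11], r[15] and the low 2 bits of
// r[4], r[8], r[12] -- the same bits the limb masks above clear.
static void clamp_r_bytes(uint8_t r[16]) {
  r[3] &= 15;
  r[7] &= 15;
  r[11] &= 15;
  r[15] &= 15;
  r[4] &= 252;
  r[8] &= 252;
  r[12] &= 252;
}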
136
137
namespace {
138
139
0
void poly1305_first_block(poly1305_state_internal *st, const uint8_t *m) {
140
0
  const xmmi MMASK =
141
0
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
142
0
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
143
0
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
144
0
  xmmi T5, T6;
145
0
  poly1305_power *p;
146
0
  uint128_t d[3];
147
0
  uint64_t r0, r1, r2;
148
0
  uint64_t r20, r21, r22, s22;
149
0
  uint64_t pad0, pad1;
150
0
  uint64_t c;
151
0
  uint64_t i;
152
153
  // pull out stored info
154
0
  p = &st->P[1];
155
156
0
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
157
0
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
158
0
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
159
0
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
160
0
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
161
162
  // compute powers r^2,r^4
163
0
  r20 = r0;
164
0
  r21 = r1;
165
0
  r22 = r2;
166
0
  for (i = 0; i < 2; i++) {
167
0
    s22 = r22 * (5 << 2);
168
169
0
    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
170
0
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
171
0
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));
172
173
0
    r20 = lo128(d[0]) & 0xfffffffffff;
174
0
    c = shr128(d[0], 44);
175
0
    d[1] = add128_64(d[1], c);
176
0
    r21 = lo128(d[1]) & 0xfffffffffff;
177
0
    c = shr128(d[1], 44);
178
0
    d[2] = add128_64(d[2], c);
179
0
    r22 = lo128(d[2]) & 0x3ffffffffff;
180
0
    c = shr128(d[2], 42);
181
0
    r20 += c * 5;
182
0
    c = (r20 >> 44);
183
0
    r20 = r20 & 0xfffffffffff;
184
0
    r21 += c;
185
186
0
    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20) & 0x3ffffff),
187
0
                                 _MM_SHUFFLE(1, 0, 1, 0));
188
0
    p->R21.v = _mm_shuffle_epi32(
189
0
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
190
0
        _MM_SHUFFLE(1, 0, 1, 0));
191
0
    p->R22.v =
192
0
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
193
0
                          _MM_SHUFFLE(1, 0, 1, 0));
194
0
    p->R23.v = _mm_shuffle_epi32(
195
0
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
196
0
        _MM_SHUFFLE(1, 0, 1, 0));
197
0
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
198
0
                                 _MM_SHUFFLE(1, 0, 1, 0));
199
0
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
200
0
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
201
0
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
202
0
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
203
0
    p--;
204
0
  }
205
206
  // put saved info back
207
0
  p = &st->P[1];
208
0
  p->R20.d[1] = (uint32_t)(r0);
209
0
  p->R20.d[3] = (uint32_t)(r0 >> 32);
210
0
  p->R21.d[1] = (uint32_t)(r1);
211
0
  p->R21.d[3] = (uint32_t)(r1 >> 32);
212
0
  p->R22.d[1] = (uint32_t)(r2);
213
0
  p->R22.d[3] = (uint32_t)(r2 >> 32);
214
0
  p->R23.d[1] = (uint32_t)(pad0);
215
0
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
216
0
  p->R24.d[1] = (uint32_t)(pad1);
217
0
  p->R24.d[3] = (uint32_t)(pad1 >> 32);
218
219
  // H = [Mx,My]
220
0
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
221
0
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
222
0
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
223
0
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
224
0
  st->H[0] = _mm_and_si128(MMASK, T5);
225
0
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
226
0
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
227
0
  st->H[2] = _mm_and_si128(MMASK, T5);
228
0
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
229
0
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
230
0
}
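The _mm_cvtsi32_si128/_mm_shuffle_epi32 sequence that fills R20..R24 above is a radix change: the value held in r20/r21/r22 as 44/44/42-bit limbs is re-split into five 26-bit limbs so that each fits a 32-bit SSE2 lane with headroom for carries. A scalar sketch of that conversion (illustrative only):

#include <stdint.h>

// Re-split a 130-bit value from 44/44/42-bit limbs into five 26-bit limbs.
static void radix_44_to_26(uint64_t r0, uint64_t r1, uint64_t r2,
                           uint32_t out[5]) {
  out[0] = (uint32_t)(r0) & 0x3ffffff;                        // bits   0.. 25
  out[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;   // bits  26.. 51
  out[2] = (uint32_t)(r1 >> 8) & 0x3ffffff;                   // bits  52.. 77
  out[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;   // bits  78..103
  out[4] = (uint32_t)(r2 >> 16);                              // bits 104..129
}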
231
232
void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
233
0
                     size_t bytes) {
234
0
  const xmmi MMASK =
235
0
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
236
0
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
237
0
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
238
239
0
  poly1305_power *p;
240
0
  xmmi H0, H1, H2, H3, H4;
241
0
  xmmi T0, T1, T2, T3, T4, T5, T6;
242
0
  xmmi M0, M1, M2, M3, M4;
243
0
  xmmi C1, C2;
244
245
0
  H0 = st->H[0];
246
0
  H1 = st->H[1];
247
0
  H2 = st->H[2];
248
0
  H3 = st->H[3];
249
0
  H4 = st->H[4];
250
251
0
  while (bytes >= 64) {
252
    // H *= [r^4,r^4]
253
0
    p = &st->P[0];
254
0
    T0 = _mm_mul_epu32(H0, p->R20.v);
255
0
    T1 = _mm_mul_epu32(H0, p->R21.v);
256
0
    T2 = _mm_mul_epu32(H0, p->R22.v);
257
0
    T3 = _mm_mul_epu32(H0, p->R23.v);
258
0
    T4 = _mm_mul_epu32(H0, p->R24.v);
259
0
    T5 = _mm_mul_epu32(H1, p->S24.v);
260
0
    T6 = _mm_mul_epu32(H1, p->R20.v);
261
0
    T0 = _mm_add_epi64(T0, T5);
262
0
    T1 = _mm_add_epi64(T1, T6);
263
0
    T5 = _mm_mul_epu32(H2, p->S23.v);
264
0
    T6 = _mm_mul_epu32(H2, p->S24.v);
265
0
    T0 = _mm_add_epi64(T0, T5);
266
0
    T1 = _mm_add_epi64(T1, T6);
267
0
    T5 = _mm_mul_epu32(H3, p->S22.v);
268
0
    T6 = _mm_mul_epu32(H3, p->S23.v);
269
0
    T0 = _mm_add_epi64(T0, T5);
270
0
    T1 = _mm_add_epi64(T1, T6);
271
0
    T5 = _mm_mul_epu32(H4, p->S21.v);
272
0
    T6 = _mm_mul_epu32(H4, p->S22.v);
273
0
    T0 = _mm_add_epi64(T0, T5);
274
0
    T1 = _mm_add_epi64(T1, T6);
275
0
    T5 = _mm_mul_epu32(H1, p->R21.v);
276
0
    T6 = _mm_mul_epu32(H1, p->R22.v);
277
0
    T2 = _mm_add_epi64(T2, T5);
278
0
    T3 = _mm_add_epi64(T3, T6);
279
0
    T5 = _mm_mul_epu32(H2, p->R20.v);
280
0
    T6 = _mm_mul_epu32(H2, p->R21.v);
281
0
    T2 = _mm_add_epi64(T2, T5);
282
0
    T3 = _mm_add_epi64(T3, T6);
283
0
    T5 = _mm_mul_epu32(H3, p->S24.v);
284
0
    T6 = _mm_mul_epu32(H3, p->R20.v);
285
0
    T2 = _mm_add_epi64(T2, T5);
286
0
    T3 = _mm_add_epi64(T3, T6);
287
0
    T5 = _mm_mul_epu32(H4, p->S23.v);
288
0
    T6 = _mm_mul_epu32(H4, p->S24.v);
289
0
    T2 = _mm_add_epi64(T2, T5);
290
0
    T3 = _mm_add_epi64(T3, T6);
291
0
    T5 = _mm_mul_epu32(H1, p->R23.v);
292
0
    T4 = _mm_add_epi64(T4, T5);
293
0
    T5 = _mm_mul_epu32(H2, p->R22.v);
294
0
    T4 = _mm_add_epi64(T4, T5);
295
0
    T5 = _mm_mul_epu32(H3, p->R21.v);
296
0
    T4 = _mm_add_epi64(T4, T5);
297
0
    T5 = _mm_mul_epu32(H4, p->R20.v);
298
0
    T4 = _mm_add_epi64(T4, T5);
299
300
    // H += [Mx,My]*[r^2,r^2]
301
0
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
302
0
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
303
0
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
304
0
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
305
0
    M0 = _mm_and_si128(MMASK, T5);
306
0
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
307
0
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
308
0
    M2 = _mm_and_si128(MMASK, T5);
309
0
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
310
0
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
311
312
0
    p = &st->P[1];
313
0
    T5 = _mm_mul_epu32(M0, p->R20.v);
314
0
    T6 = _mm_mul_epu32(M0, p->R21.v);
315
0
    T0 = _mm_add_epi64(T0, T5);
316
0
    T1 = _mm_add_epi64(T1, T6);
317
0
    T5 = _mm_mul_epu32(M1, p->S24.v);
318
0
    T6 = _mm_mul_epu32(M1, p->R20.v);
319
0
    T0 = _mm_add_epi64(T0, T5);
320
0
    T1 = _mm_add_epi64(T1, T6);
321
0
    T5 = _mm_mul_epu32(M2, p->S23.v);
322
0
    T6 = _mm_mul_epu32(M2, p->S24.v);
323
0
    T0 = _mm_add_epi64(T0, T5);
324
0
    T1 = _mm_add_epi64(T1, T6);
325
0
    T5 = _mm_mul_epu32(M3, p->S22.v);
326
0
    T6 = _mm_mul_epu32(M3, p->S23.v);
327
0
    T0 = _mm_add_epi64(T0, T5);
328
0
    T1 = _mm_add_epi64(T1, T6);
329
0
    T5 = _mm_mul_epu32(M4, p->S21.v);
330
0
    T6 = _mm_mul_epu32(M4, p->S22.v);
331
0
    T0 = _mm_add_epi64(T0, T5);
332
0
    T1 = _mm_add_epi64(T1, T6);
333
0
    T5 = _mm_mul_epu32(M0, p->R22.v);
334
0
    T6 = _mm_mul_epu32(M0, p->R23.v);
335
0
    T2 = _mm_add_epi64(T2, T5);
336
0
    T3 = _mm_add_epi64(T3, T6);
337
0
    T5 = _mm_mul_epu32(M1, p->R21.v);
338
0
    T6 = _mm_mul_epu32(M1, p->R22.v);
339
0
    T2 = _mm_add_epi64(T2, T5);
340
0
    T3 = _mm_add_epi64(T3, T6);
341
0
    T5 = _mm_mul_epu32(M2, p->R20.v);
342
0
    T6 = _mm_mul_epu32(M2, p->R21.v);
343
0
    T2 = _mm_add_epi64(T2, T5);
344
0
    T3 = _mm_add_epi64(T3, T6);
345
0
    T5 = _mm_mul_epu32(M3, p->S24.v);
346
0
    T6 = _mm_mul_epu32(M3, p->R20.v);
347
0
    T2 = _mm_add_epi64(T2, T5);
348
0
    T3 = _mm_add_epi64(T3, T6);
349
0
    T5 = _mm_mul_epu32(M4, p->S23.v);
350
0
    T6 = _mm_mul_epu32(M4, p->S24.v);
351
0
    T2 = _mm_add_epi64(T2, T5);
352
0
    T3 = _mm_add_epi64(T3, T6);
353
0
    T5 = _mm_mul_epu32(M0, p->R24.v);
354
0
    T4 = _mm_add_epi64(T4, T5);
355
0
    T5 = _mm_mul_epu32(M1, p->R23.v);
356
0
    T4 = _mm_add_epi64(T4, T5);
357
0
    T5 = _mm_mul_epu32(M2, p->R22.v);
358
0
    T4 = _mm_add_epi64(T4, T5);
359
0
    T5 = _mm_mul_epu32(M3, p->R21.v);
360
0
    T4 = _mm_add_epi64(T4, T5);
361
0
    T5 = _mm_mul_epu32(M4, p->R20.v);
362
0
    T4 = _mm_add_epi64(T4, T5);
363
364
    // H += [Mx,My]
365
0
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
366
0
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
367
0
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
368
0
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
369
0
    M0 = _mm_and_si128(MMASK, T5);
370
0
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
371
0
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
372
0
    M2 = _mm_and_si128(MMASK, T5);
373
0
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
374
0
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
375
376
0
    T0 = _mm_add_epi64(T0, M0);
377
0
    T1 = _mm_add_epi64(T1, M1);
378
0
    T2 = _mm_add_epi64(T2, M2);
379
0
    T3 = _mm_add_epi64(T3, M3);
380
0
    T4 = _mm_add_epi64(T4, M4);
381
382
    // reduce
383
0
    C1 = _mm_srli_epi64(T0, 26);
384
0
    C2 = _mm_srli_epi64(T3, 26);
385
0
    T0 = _mm_and_si128(T0, MMASK);
386
0
    T3 = _mm_and_si128(T3, MMASK);
387
0
    T1 = _mm_add_epi64(T1, C1);
388
0
    T4 = _mm_add_epi64(T4, C2);
389
0
    C1 = _mm_srli_epi64(T1, 26);
390
0
    C2 = _mm_srli_epi64(T4, 26);
391
0
    T1 = _mm_and_si128(T1, MMASK);
392
0
    T4 = _mm_and_si128(T4, MMASK);
393
0
    T2 = _mm_add_epi64(T2, C1);
394
0
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
395
0
    C1 = _mm_srli_epi64(T2, 26);
396
0
    C2 = _mm_srli_epi64(T0, 26);
397
0
    T2 = _mm_and_si128(T2, MMASK);
398
0
    T0 = _mm_and_si128(T0, MMASK);
399
0
    T3 = _mm_add_epi64(T3, C1);
400
0
    T1 = _mm_add_epi64(T1, C2);
401
0
    C1 = _mm_srli_epi64(T3, 26);
402
0
    T3 = _mm_and_si128(T3, MMASK);
403
0
    T4 = _mm_add_epi64(T4, C1);
404
405
    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
406
0
    H0 = T0;
407
0
    H1 = T1;
408
0
    H2 = T2;
409
0
    H3 = T3;
410
0
    H4 = T4;
411
412
0
    m += 64;
413
0
    bytes -= 64;
414
0
  }
415
416
0
  st->H[0] = H0;
417
0
  st->H[1] = H1;
418
0
  st->H[2] = H2;
419
0
  st->H[3] = H3;
420
0
  st->H[4] = H4;
421
0
}
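Each 64-bit lane of the _mm_mul_epu32/_mm_add_epi64 cascade above computes the same 5-limb schoolbook product modulo 2^130 - 5; limbs that would land at 2^130 and above wrap around scaled by 5, which is why the S2x = R2x*5 values are precomputed. A scalar sketch of one lane's multiply plus a carry pass (the vector code interleaves two carry chains; a single sequential pass is shown here, illustrative only):

#include <stdint.h>

// out = (h * r) mod 2^130 - 5, all values as five 26-bit limbs (partially
// reduced: out limbs may slightly exceed 26 bits, as in the code above).
static void mul_mod_p(const uint32_t h[5], const uint32_t r[5],
                      uint32_t out[5]) {
  uint64_t s1 = (uint64_t)r[1] * 5, s2 = (uint64_t)r[2] * 5,
           s3 = (uint64_t)r[3] * 5, s4 = (uint64_t)r[4] * 5;
  uint64_t t0 = (uint64_t)h[0] * r[0] + (uint64_t)h[1] * s4 +
                (uint64_t)h[2] * s3 + (uint64_t)h[3] * s2 + (uint64_t)h[4] * s1;
  uint64_t t1 = (uint64_t)h[0] * r[1] + (uint64_t)h[1] * r[0] +
                (uint64_t)h[2] * s4 + (uint64_t)h[3] * s3 + (uint64_t)h[4] * s2;
  uint64_t t2 = (uint64_t)h[0] * r[2] + (uint64_t)h[1] * r[1] +
                (uint64_t)h[2] * r[0] + (uint64_t)h[3] * s4 + (uint64_t)h[4] * s3;
  uint64_t t3 = (uint64_t)h[0] * r[3] + (uint64_t)h[1] * r[2] +
                (uint64_t)h[2] * r[1] + (uint64_t)h[3] * r[0] + (uint64_t)h[4] * s4;
  uint64_t t4 = (uint64_t)h[0] * r[4] + (uint64_t)h[1] * r[3] +
                (uint64_t)h[2] * r[2] + (uint64_t)h[3] * r[1] + (uint64_t)h[4] * r[0];
  uint64_t c;
  c = t0 >> 26; t0 &= 0x3ffffff; t1 += c;
  c = t1 >> 26; t1 &= 0x3ffffff; t2 += c;
  c = t2 >> 26; t2 &= 0x3ffffff; t3 += c;
  c = t3 >> 26; t3 &= 0x3ffffff; t4 += c;
  c = t4 >> 26; t4 &= 0x3ffffff; t0 += c * 5;
  c = t0 >> 26; t0 &= 0x3ffffff; t1 += c;
  out[0] = (uint32_t)t0; out[1] = (uint32_t)t1; out[2] = (uint32_t)t2;
  out[3] = (uint32_t)t3; out[4] = (uint32_t)t4;
}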
422
423
size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
424
0
                        size_t bytes) {
425
0
  const xmmi MMASK =
426
0
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
427
0
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
428
0
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
429
430
0
  poly1305_power *p;
431
0
  xmmi H0, H1, H2, H3, H4;
432
0
  xmmi M0, M1, M2, M3, M4;
433
0
  xmmi T0, T1, T2, T3, T4, T5, T6;
434
0
  xmmi C1, C2;
435
436
0
  uint64_t r0, r1, r2;
437
0
  uint64_t t0, t1, t2, t3, t4;
438
0
  uint64_t c;
439
0
  size_t consumed = 0;
440
441
0
  H0 = st->H[0];
442
0
  H1 = st->H[1];
443
0
  H2 = st->H[2];
444
0
  H3 = st->H[3];
445
0
  H4 = st->H[4];
446
447
  // p = [r^2,r^2]
448
0
  p = &st->P[1];
449
450
0
  if (bytes >= 32) {
451
    // H *= [r^2,r^2]
452
0
    T0 = _mm_mul_epu32(H0, p->R20.v);
453
0
    T1 = _mm_mul_epu32(H0, p->R21.v);
454
0
    T2 = _mm_mul_epu32(H0, p->R22.v);
455
0
    T3 = _mm_mul_epu32(H0, p->R23.v);
456
0
    T4 = _mm_mul_epu32(H0, p->R24.v);
457
0
    T5 = _mm_mul_epu32(H1, p->S24.v);
458
0
    T6 = _mm_mul_epu32(H1, p->R20.v);
459
0
    T0 = _mm_add_epi64(T0, T5);
460
0
    T1 = _mm_add_epi64(T1, T6);
461
0
    T5 = _mm_mul_epu32(H2, p->S23.v);
462
0
    T6 = _mm_mul_epu32(H2, p->S24.v);
463
0
    T0 = _mm_add_epi64(T0, T5);
464
0
    T1 = _mm_add_epi64(T1, T6);
465
0
    T5 = _mm_mul_epu32(H3, p->S22.v);
466
0
    T6 = _mm_mul_epu32(H3, p->S23.v);
467
0
    T0 = _mm_add_epi64(T0, T5);
468
0
    T1 = _mm_add_epi64(T1, T6);
469
0
    T5 = _mm_mul_epu32(H4, p->S21.v);
470
0
    T6 = _mm_mul_epu32(H4, p->S22.v);
471
0
    T0 = _mm_add_epi64(T0, T5);
472
0
    T1 = _mm_add_epi64(T1, T6);
473
0
    T5 = _mm_mul_epu32(H1, p->R21.v);
474
0
    T6 = _mm_mul_epu32(H1, p->R22.v);
475
0
    T2 = _mm_add_epi64(T2, T5);
476
0
    T3 = _mm_add_epi64(T3, T6);
477
0
    T5 = _mm_mul_epu32(H2, p->R20.v);
478
0
    T6 = _mm_mul_epu32(H2, p->R21.v);
479
0
    T2 = _mm_add_epi64(T2, T5);
480
0
    T3 = _mm_add_epi64(T3, T6);
481
0
    T5 = _mm_mul_epu32(H3, p->S24.v);
482
0
    T6 = _mm_mul_epu32(H3, p->R20.v);
483
0
    T2 = _mm_add_epi64(T2, T5);
484
0
    T3 = _mm_add_epi64(T3, T6);
485
0
    T5 = _mm_mul_epu32(H4, p->S23.v);
486
0
    T6 = _mm_mul_epu32(H4, p->S24.v);
487
0
    T2 = _mm_add_epi64(T2, T5);
488
0
    T3 = _mm_add_epi64(T3, T6);
489
0
    T5 = _mm_mul_epu32(H1, p->R23.v);
490
0
    T4 = _mm_add_epi64(T4, T5);
491
0
    T5 = _mm_mul_epu32(H2, p->R22.v);
492
0
    T4 = _mm_add_epi64(T4, T5);
493
0
    T5 = _mm_mul_epu32(H3, p->R21.v);
494
0
    T4 = _mm_add_epi64(T4, T5);
495
0
    T5 = _mm_mul_epu32(H4, p->R20.v);
496
0
    T4 = _mm_add_epi64(T4, T5);
497
498
    // H += [Mx,My]
499
0
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
500
0
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
501
0
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
502
0
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
503
0
    M0 = _mm_and_si128(MMASK, T5);
504
0
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
505
0
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
506
0
    M2 = _mm_and_si128(MMASK, T5);
507
0
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
508
0
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
509
510
0
    T0 = _mm_add_epi64(T0, M0);
511
0
    T1 = _mm_add_epi64(T1, M1);
512
0
    T2 = _mm_add_epi64(T2, M2);
513
0
    T3 = _mm_add_epi64(T3, M3);
514
0
    T4 = _mm_add_epi64(T4, M4);
515
516
    // reduce
517
0
    C1 = _mm_srli_epi64(T0, 26);
518
0
    C2 = _mm_srli_epi64(T3, 26);
519
0
    T0 = _mm_and_si128(T0, MMASK);
520
0
    T3 = _mm_and_si128(T3, MMASK);
521
0
    T1 = _mm_add_epi64(T1, C1);
522
0
    T4 = _mm_add_epi64(T4, C2);
523
0
    C1 = _mm_srli_epi64(T1, 26);
524
0
    C2 = _mm_srli_epi64(T4, 26);
525
0
    T1 = _mm_and_si128(T1, MMASK);
526
0
    T4 = _mm_and_si128(T4, MMASK);
527
0
    T2 = _mm_add_epi64(T2, C1);
528
0
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
529
0
    C1 = _mm_srli_epi64(T2, 26);
530
0
    C2 = _mm_srli_epi64(T0, 26);
531
0
    T2 = _mm_and_si128(T2, MMASK);
532
0
    T0 = _mm_and_si128(T0, MMASK);
533
0
    T3 = _mm_add_epi64(T3, C1);
534
0
    T1 = _mm_add_epi64(T1, C2);
535
0
    C1 = _mm_srli_epi64(T3, 26);
536
0
    T3 = _mm_and_si128(T3, MMASK);
537
0
    T4 = _mm_add_epi64(T4, C1);
538
539
    // H = (H*[r^2,r^2] + [Mx,My])
540
0
    H0 = T0;
541
0
    H1 = T1;
542
0
    H2 = T2;
543
0
    H3 = T3;
544
0
    H4 = T4;
545
546
0
    consumed = 32;
547
0
  }
548
549
  // finalize, H *= [r^2,r]
550
0
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
551
0
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
552
0
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
553
554
0
  p->R20.d[2] = (uint32_t)(r0) & 0x3ffffff;
555
0
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
556
0
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
557
0
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
558
0
  p->R24.d[2] = (uint32_t)((r2 >> 16));
559
0
  p->S21.d[2] = p->R21.d[2] * 5;
560
0
  p->S22.d[2] = p->R22.d[2] * 5;
561
0
  p->S23.d[2] = p->R23.d[2] * 5;
562
0
  p->S24.d[2] = p->R24.d[2] * 5;
563
564
  // H *= [r^2,r]
565
0
  T0 = _mm_mul_epu32(H0, p->R20.v);
566
0
  T1 = _mm_mul_epu32(H0, p->R21.v);
567
0
  T2 = _mm_mul_epu32(H0, p->R22.v);
568
0
  T3 = _mm_mul_epu32(H0, p->R23.v);
569
0
  T4 = _mm_mul_epu32(H0, p->R24.v);
570
0
  T5 = _mm_mul_epu32(H1, p->S24.v);
571
0
  T6 = _mm_mul_epu32(H1, p->R20.v);
572
0
  T0 = _mm_add_epi64(T0, T5);
573
0
  T1 = _mm_add_epi64(T1, T6);
574
0
  T5 = _mm_mul_epu32(H2, p->S23.v);
575
0
  T6 = _mm_mul_epu32(H2, p->S24.v);
576
0
  T0 = _mm_add_epi64(T0, T5);
577
0
  T1 = _mm_add_epi64(T1, T6);
578
0
  T5 = _mm_mul_epu32(H3, p->S22.v);
579
0
  T6 = _mm_mul_epu32(H3, p->S23.v);
580
0
  T0 = _mm_add_epi64(T0, T5);
581
0
  T1 = _mm_add_epi64(T1, T6);
582
0
  T5 = _mm_mul_epu32(H4, p->S21.v);
583
0
  T6 = _mm_mul_epu32(H4, p->S22.v);
584
0
  T0 = _mm_add_epi64(T0, T5);
585
0
  T1 = _mm_add_epi64(T1, T6);
586
0
  T5 = _mm_mul_epu32(H1, p->R21.v);
587
0
  T6 = _mm_mul_epu32(H1, p->R22.v);
588
0
  T2 = _mm_add_epi64(T2, T5);
589
0
  T3 = _mm_add_epi64(T3, T6);
590
0
  T5 = _mm_mul_epu32(H2, p->R20.v);
591
0
  T6 = _mm_mul_epu32(H2, p->R21.v);
592
0
  T2 = _mm_add_epi64(T2, T5);
593
0
  T3 = _mm_add_epi64(T3, T6);
594
0
  T5 = _mm_mul_epu32(H3, p->S24.v);
595
0
  T6 = _mm_mul_epu32(H3, p->R20.v);
596
0
  T2 = _mm_add_epi64(T2, T5);
597
0
  T3 = _mm_add_epi64(T3, T6);
598
0
  T5 = _mm_mul_epu32(H4, p->S23.v);
599
0
  T6 = _mm_mul_epu32(H4, p->S24.v);
600
0
  T2 = _mm_add_epi64(T2, T5);
601
0
  T3 = _mm_add_epi64(T3, T6);
602
0
  T5 = _mm_mul_epu32(H1, p->R23.v);
603
0
  T4 = _mm_add_epi64(T4, T5);
604
0
  T5 = _mm_mul_epu32(H2, p->R22.v);
605
0
  T4 = _mm_add_epi64(T4, T5);
606
0
  T5 = _mm_mul_epu32(H3, p->R21.v);
607
0
  T4 = _mm_add_epi64(T4, T5);
608
0
  T5 = _mm_mul_epu32(H4, p->R20.v);
609
0
  T4 = _mm_add_epi64(T4, T5);
610
611
0
  C1 = _mm_srli_epi64(T0, 26);
612
0
  C2 = _mm_srli_epi64(T3, 26);
613
0
  T0 = _mm_and_si128(T0, MMASK);
614
0
  T3 = _mm_and_si128(T3, MMASK);
615
0
  T1 = _mm_add_epi64(T1, C1);
616
0
  T4 = _mm_add_epi64(T4, C2);
617
0
  C1 = _mm_srli_epi64(T1, 26);
618
0
  C2 = _mm_srli_epi64(T4, 26);
619
0
  T1 = _mm_and_si128(T1, MMASK);
620
0
  T4 = _mm_and_si128(T4, MMASK);
621
0
  T2 = _mm_add_epi64(T2, C1);
622
0
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
623
0
  C1 = _mm_srli_epi64(T2, 26);
624
0
  C2 = _mm_srli_epi64(T0, 26);
625
0
  T2 = _mm_and_si128(T2, MMASK);
626
0
  T0 = _mm_and_si128(T0, MMASK);
627
0
  T3 = _mm_add_epi64(T3, C1);
628
0
  T1 = _mm_add_epi64(T1, C2);
629
0
  C1 = _mm_srli_epi64(T3, 26);
630
0
  T3 = _mm_and_si128(T3, MMASK);
631
0
  T4 = _mm_add_epi64(T4, C1);
632
633
  // H = H[0]+H[1]
634
0
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
635
0
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
636
0
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
637
0
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
638
0
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
639
640
0
  t0 = _mm_cvtsi128_si32(H0);
641
0
  c = (t0 >> 26);
642
0
  t0 &= 0x3ffffff;
643
0
  t1 = _mm_cvtsi128_si32(H1) + c;
644
0
  c = (t1 >> 26);
645
0
  t1 &= 0x3ffffff;
646
0
  t2 = _mm_cvtsi128_si32(H2) + c;
647
0
  c = (t2 >> 26);
648
0
  t2 &= 0x3ffffff;
649
0
  t3 = _mm_cvtsi128_si32(H3) + c;
650
0
  c = (t3 >> 26);
651
0
  t3 &= 0x3ffffff;
652
0
  t4 = _mm_cvtsi128_si32(H4) + c;
653
0
  c = (t4 >> 26);
654
0
  t4 &= 0x3ffffff;
655
0
  t0 = t0 + (c * 5);
656
0
  c = (t0 >> 26);
657
0
  t0 &= 0x3ffffff;
658
0
  t1 = t1 + c;
659
660
0
  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
661
0
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
662
0
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);
663
664
0
  return consumed;
665
0
}
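The three HH[] stores above undo the radix change performed in poly1305_first_block: the lane-summed 26-bit limbs t0..t4 are packed back into the 44/44/42-bit limbs that the scalar tail of CRYPTO_poly1305_finish works in. A standalone sketch (illustrative only):

#include <stdint.h>

// Pack five 26-bit limbs (low to high) back into 44/44/42-bit limbs.
static void radix_26_to_44(const uint64_t t[5], uint64_t out[3]) {
  out[0] = (t[0] | (t[1] << 26)) & UINT64_C(0xfffffffffff);          // bits   0.. 43
  out[1] = ((t[1] >> 18) | (t[2] << 8) | (t[3] << 34)) &
           UINT64_C(0xfffffffffff);                                  // bits  44.. 87
  out[2] = ((t[3] >> 10) | (t[4] << 16)) & UINT64_C(0x3ffffffffff);  // bits  88..129
}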
666
667
}  // namespace
668
669
void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
670
0
                            size_t bytes) {
671
0
  poly1305_state_internal *st = poly1305_aligned_state(state);
672
0
  size_t want;
673
674
  // Work around a C language bug. See https://crbug.com/1019588.
675
0
  if (bytes == 0) {
676
0
    return;
677
0
  }
678
679
  // need at least 32 initial bytes to start the accelerated branch
680
0
  if (!st->started) {
681
0
    if ((st->leftover == 0) && (bytes > 32)) {
682
0
      poly1305_first_block(st, m);
683
0
      m += 32;
684
0
      bytes -= 32;
685
0
    } else {
686
0
      want = poly1305_min(32 - st->leftover, bytes);
687
0
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
688
0
      bytes -= want;
689
0
      m += want;
690
0
      st->leftover += want;
691
0
      if ((st->leftover < 32) || (bytes == 0)) {
692
0
        return;
693
0
      }
694
0
      poly1305_first_block(st, st->buffer);
695
0
      st->leftover = 0;
696
0
    }
697
0
    st->started = 1;
698
0
  }
699
700
  // handle leftover
701
0
  if (st->leftover) {
702
0
    want = poly1305_min(64 - st->leftover, bytes);
703
0
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
704
0
    bytes -= want;
705
0
    m += want;
706
0
    st->leftover += want;
707
0
    if (st->leftover < 64) {
708
0
      return;
709
0
    }
710
0
    poly1305_blocks(st, st->buffer, 64);
711
0
    st->leftover = 0;
712
0
  }
713
714
  // process 64 byte blocks
715
0
  if (bytes >= 64) {
716
0
    want = (bytes & ~63);
717
0
    poly1305_blocks(st, m, want);
718
0
    m += want;
719
0
    bytes -= want;
720
0
  }
721
722
0
  if (bytes) {
723
0
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
724
0
    st->leftover += bytes;
725
0
  }
726
0
}
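For context, the buffering logic above backs the streaming interface declared in <openssl/poly1305.h>. A minimal usage sketch (assumes a BoringSSL build; the wrapper name is hypothetical):

#include <stddef.h>
#include <stdint.h>

#include <openssl/poly1305.h>

// One-shot MAC over a contiguous message, built from the streaming calls.
static void poly1305_mac(const uint8_t key[32], const uint8_t *msg, size_t len,
                         uint8_t mac[16]) {
  poly1305_state st;
  CRYPTO_poly1305_init(&st, key);
  CRYPTO_poly1305_update(&st, msg, len);  // may be split across many calls
  CRYPTO_poly1305_finish(&st, mac);
}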
727
728
0
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
729
0
  poly1305_state_internal *st = poly1305_aligned_state(state);
730
0
  size_t leftover = st->leftover;
731
0
  uint8_t *m = st->buffer;
732
0
  uint128_t d[3];
733
0
  uint64_t h0, h1, h2;
734
0
  uint64_t t0, t1;
735
0
  uint64_t g0, g1, g2, c, nc;
736
0
  uint64_t r0, r1, r2, s1, s2;
737
0
  poly1305_power *p;
738
739
0
  if (st->started) {
740
0
    size_t consumed = poly1305_combine(st, m, leftover);
741
0
    leftover -= consumed;
742
0
    m += consumed;
743
0
  }
744
745
  // st->HH will either be 0 or have the combined result
746
0
  h0 = st->HH[0];
747
0
  h1 = st->HH[1];
748
0
  h2 = st->HH[2];
749
750
0
  p = &st->P[1];
751
0
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
752
0
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
753
0
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
754
0
  s1 = r1 * (5 << 2);
755
0
  s2 = r2 * (5 << 2);
756
757
0
  if (leftover < 16) {
758
0
    goto poly1305_donna_atmost15bytes;
759
0
  }
760
761
0
poly1305_donna_atleast16bytes:
762
0
  t0 = CRYPTO_load_u64_le(m + 0);
763
0
  t1 = CRYPTO_load_u64_le(m + 8);
764
0
  h0 += t0 & 0xfffffffffff;
765
0
  t0 = shr128_pair(t1, t0, 44);
766
0
  h1 += t0 & 0xfffffffffff;
767
0
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);
768
769
0
poly1305_donna_mul:
770
0
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
771
0
                mul64x64_128(h2, s1));
772
0
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
773
0
                mul64x64_128(h2, s2));
774
0
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
775
0
                mul64x64_128(h2, r0));
776
0
  h0 = lo128(d[0]) & 0xfffffffffff;
777
0
  c = shr128(d[0], 44);
778
0
  d[1] = add128_64(d[1], c);
779
0
  h1 = lo128(d[1]) & 0xfffffffffff;
780
0
  c = shr128(d[1], 44);
781
0
  d[2] = add128_64(d[2], c);
782
0
  h2 = lo128(d[2]) & 0x3ffffffffff;
783
0
  c = shr128(d[2], 42);
784
0
  h0 += c * 5;
785
786
0
  m += 16;
787
0
  leftover -= 16;
788
0
  if (leftover >= 16) {
789
0
    goto poly1305_donna_atleast16bytes;
790
0
  }
791
792
// final bytes
793
0
poly1305_donna_atmost15bytes:
794
0
  if (!leftover) {
795
0
    goto poly1305_donna_finish;
796
0
  }
797
798
0
  m[leftover++] = 1;
799
0
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
800
0
  leftover = 16;
801
802
0
  t0 = CRYPTO_load_u64_le(m + 0);
803
0
  t1 = CRYPTO_load_u64_le(m + 8);
804
0
  h0 += t0 & 0xfffffffffff;
805
0
  t0 = shr128_pair(t1, t0, 44);
806
0
  h1 += t0 & 0xfffffffffff;
807
0
  h2 += (t1 >> 24);
808
809
0
  goto poly1305_donna_mul;
810
811
0
poly1305_donna_finish:
812
0
  c = (h0 >> 44);
813
0
  h0 &= 0xfffffffffff;
814
0
  h1 += c;
815
0
  c = (h1 >> 44);
816
0
  h1 &= 0xfffffffffff;
817
0
  h2 += c;
818
0
  c = (h2 >> 42);
819
0
  h2 &= 0x3ffffffffff;
820
0
  h0 += c * 5;
821
822
0
  g0 = h0 + 5;
823
0
  c = (g0 >> 44);
824
0
  g0 &= 0xfffffffffff;
825
0
  g1 = h1 + c;
826
0
  c = (g1 >> 44);
827
0
  g1 &= 0xfffffffffff;
828
0
  g2 = h2 + c - ((uint64_t)1 << 42);
829
830
0
  c = (g2 >> 63) - 1;
831
0
  nc = ~c;
832
0
  h0 = (h0 & nc) | (g0 & c);
833
0
  h1 = (h1 & nc) | (g1 & c);
834
0
  h2 = (h2 & nc) | (g2 & c);
835
836
  // pad
837
0
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
838
0
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
839
0
  h0 += (t0 & 0xfffffffffff);
840
0
  c = (h0 >> 44);
841
0
  h0 &= 0xfffffffffff;
842
0
  t0 = shr128_pair(t1, t0, 44);
843
0
  h1 += (t0 & 0xfffffffffff) + c;
844
0
  c = (h1 >> 44);
845
0
  h1 &= 0xfffffffffff;
846
0
  t1 = (t1 >> 24);
847
0
  h2 += (t1) + c;
848
849
0
  CRYPTO_store_u64_le(mac + 0, ((h0) | (h1 << 44)));
850
0
  CRYPTO_store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
851
0
}
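The g0/g1/g2 block above is the standard constant-time final reduction: g = h + 5 - 2^130 is formed, and a mask derived from the borrow out of the top limb selects either h (when h < 2^130 - 5) or g (the fully reduced value) without branching. The same selection in isolation (illustrative sketch):

#include <stdint.h>

// h holds 44/44/42-bit limbs; replace it with h mod (2^130 - 5), branch-free.
static void poly1305_final_reduce(uint64_t h[3]) {
  uint64_t g0 = h[0] + 5;
  uint64_t c = g0 >> 44;
  g0 &= UINT64_C(0xfffffffffff);
  uint64_t g1 = h[1] + c;
  c = g1 >> 44;
  g1 &= UINT64_C(0xfffffffffff);
  uint64_t g2 = h[2] + c - (UINT64_C(1) << 42);
  // Bit 63 of g2 is set iff the subtraction borrowed, i.e. h < 2^130 - 5.
  uint64_t keep_g = (g2 >> 63) - 1;  // all-ones when g is the reduced value
  h[0] = (h[0] & ~keep_g) | (g0 & keep_g);
  h[1] = (h[1] & ~keep_g) | (g1 & keep_g);
  h[2] = (h[2] & ~keep_g) | (g2 & keep_g);
}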
852
853
#endif  // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64