Coverage Report

Created: 2022-12-08 06:10

/src/libgcrypt/cipher/poly1305.c
Line
Count
Source (jump to first uncovered line)
1
/* poly1305.c  -  Poly1305 internals and generic implementation
2
 * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3
 *
4
 * This file is part of Libgcrypt.
5
 *
6
 * Libgcrypt is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU Lesser General Public License as
8
 * published by the Free Software Foundation; either version 2.1 of
9
 * the License, or (at your option) any later version.
10
 *
11
 * Libgcrypt is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18
 */
19
20
#include <config.h>
21
#include <stdio.h>
22
#include <stdlib.h>
23
#include <string.h>
24
25
#include "types.h"
26
#include "g10lib.h"
27
#include "cipher.h"
28
#include "bufhelp.h"
29
#include "poly1305-internal.h"
30
31
#include "mpi-internal.h"
32
#include "longlong.h"
33
34
35
static const char *selftest (void);
36
37
38
#undef HAVE_ASM_POLY1305_BLOCKS
39
40
41
#undef USE_MPI_64BIT
42
#undef USE_MPI_32BIT
43
#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64)
44
# define USE_MPI_64BIT 1
45
#elif BYTES_PER_MPI_LIMB == 4
46
# define USE_MPI_32BIT 1
47
#else
48
# error please implement for this limb size.
49
#endif
50
51
52
/* USE_S390X_ASM indicates whether to enable zSeries code. */
53
#undef USE_S390X_ASM
54
#if BYTES_PER_MPI_LIMB == 8
55
# if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
56
#  if defined(HAVE_GCC_INLINE_ASM_S390X)
57
#   define USE_S390X_ASM 1
58
#  endif /* HAVE_GCC_INLINE_ASM_S390X */
59
# endif
60
#endif
61
62
63
/* AMD64 Assembly implementations use SystemV ABI, ABI conversion and
64
 * additional stack to store XMM6-XMM15 needed on Win64. */
65
#undef ASM_FUNC_ABI
66
#undef ASM_FUNC_WRAPPER_ATTR
67
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
68
# define ASM_FUNC_ABI __attribute__((sysv_abi))
69
# define ASM_FUNC_WRAPPER_ATTR __attribute__((noinline))
70
#else
71
# define ASM_FUNC_ABI
72
# define ASM_FUNC_WRAPPER_ATTR
73
#endif
74
75
76
#ifdef USE_S390X_ASM
77
78
#define HAVE_ASM_POLY1305_BLOCKS 1
79
80
extern unsigned int _gcry_poly1305_s390x_blocks1(void *state,
81
             const byte *buf, size_t len,
82
             byte high_pad);
83
84
static unsigned int
85
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
86
     byte high_pad)
87
{
88
  return _gcry_poly1305_s390x_blocks1(&ctx->state, buf, len, high_pad);
89
}
90
91
#endif /* USE_S390X_ASM */
92
93
94
#ifdef POLY1305_USE_AVX512
95
96
extern unsigned int
97
_gcry_poly1305_amd64_avx512_blocks(const void *msg, const u64 msg_len,
98
           void *hash, const void *key) ASM_FUNC_ABI;
99
100
ASM_FUNC_WRAPPER_ATTR static unsigned int
101
poly1305_amd64_avx512_blocks(poly1305_context_t *ctx, const byte *buf,
102
           size_t len)
103
0
{
104
0
  POLY1305_STATE *st = &ctx->state;
105
0
  return _gcry_poly1305_amd64_avx512_blocks(buf, len, st->h, st->r);
106
0
}
107
108
#endif /* POLY1305_USE_AVX512 */
109
110
111
#ifdef POLY1305_USE_PPC_VEC
112
113
extern unsigned int
114
gcry_poly1305_p10le_4blocks(unsigned char *key, const byte *m, size_t len);
115
116
#endif /* POLY1305_USE_PPC_VEC */
117
118
119
static void poly1305_init (poly1305_context_t *ctx,
120
         const byte key[POLY1305_KEYLEN])
121
0
{
122
0
  POLY1305_STATE *st = &ctx->state;
123
0
  unsigned int features = _gcry_get_hw_features ();
124
125
0
#ifdef POLY1305_USE_AVX512
126
0
  ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
127
0
#endif
128
129
#ifdef POLY1305_USE_PPC_VEC
130
  ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
131
# ifdef ENABLE_FORCE_SOFT_HWFEATURES
132
  /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10.
133
   * Actual implementation works with HWF_PPC_ARCH_3_00 also. */
134
  ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0;
135
# endif
136
#endif
137
138
0
  (void)features;
139
140
0
  ctx->leftover = 0;
141
142
0
  st->h[0] = 0;
143
0
  st->h[1] = 0;
144
0
  st->h[2] = 0;
145
0
  st->h[3] = 0;
146
0
  st->h[4] = 0;
147
148
0
  st->r[0] = buf_get_le32(key + 0)  & 0x0fffffff;
149
0
  st->r[1] = buf_get_le32(key + 4)  & 0x0ffffffc;
150
0
  st->r[2] = buf_get_le32(key + 8)  & 0x0ffffffc;
151
0
  st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc;
152
153
0
  st->k[0] = buf_get_le32(key + 16);
154
0
  st->k[1] = buf_get_le32(key + 20);
155
0
  st->k[2] = buf_get_le32(key + 24);
156
0
  st->k[3] = buf_get_le32(key + 28);
157
0
}
158
159
160
#ifdef USE_MPI_64BIT
161
162
#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4
163
164
/* A += B (armv8/aarch64): three-limb addition A2:A1:A0 += B2:B1:B0 using
 * an adds/adcs/adc carry chain.  The A operands are read and written,
 * the B operands are only read. */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("adds %[a0], %[b0], %[a0]\n" \
               "adcs %[a1], %[b1], %[a1]\n" \
               "adc  %[a2], %[b2], %[a2]\n" \
               : [a0] "+r" (A0), [a1] "+r" (A1), [a2] "+r" (A2) \
               : [b0] "r" (B0), [b1] "r" (B1), [b2] "r" (B2) \
               : "cc" )
172
173
#endif /* __aarch64__ */
174
175
#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4
176
177
/* A += B (x86-64): three-limb addition A2:A1:A0 += B2:B1:B0 using an
 * addq/adcq/adcq carry chain.  The A operands are read and written,
 * the B operands are only read. */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("addq %[b0], %[a0]\n" \
               "adcq %[b1], %[a1]\n" \
               "adcq %[b2], %[a2]\n" \
               : [a0] "+r" (A0), [a1] "+r" (A1), [a2] "+r" (A2) \
               : [b0] "g" (B0), [b1] "g" (B1), [b2] "g" (B2) \
               : "cc" )
185
186
#endif /* __x86_64__ */
187
188
#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4
189
190
/* A += B (ppc64): three-limb addition A2:A1:A0 += B2:B1:B0 using an
 * addc/adde/adde carry chain.  The A operands are read and written,
 * the B operands are only read. */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("addc %[a0], %[b0], %[a0]\n" \
               "adde %[a1], %[b1], %[a1]\n" \
               "adde %[a2], %[b2], %[a2]\n" \
               : [a0] "+r" (A0), [a1] "+r" (A1), [a2] "+r" (A2) \
               : [b0] "r" (B0), [b1] "r" (B1), [b2] "r" (B2) \
               : "cc" )
198
199
#endif /* __powerpc__ */
200
201
#ifndef ADD_1305_64
/* A += B (generic fallback, used when no architecture specific variant
 * was defined above).  Built from two-limb add_ssaaaa steps: the carry
 * out of the lowest limb is captured separately and folded into the
 * upper two limbs afterwards. */
#  define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
    u64 cy_lo; \
    /* cy_lo:A0 = A0 + B0 */ \
    add_ssaaaa(cy_lo, A0, 0, A0, 0, B0); \
    /* A2:A1 += B2:B1 */ \
    add_ssaaaa(A2, A1, A2, A1, B2, B1); \
    /* A2:A1 += carry from the lowest limb */ \
    add_ssaaaa(A2, A1, A2, A1, 0, cy_lo); \
  } while (0)
#endif
210
211
/* H = H * R mod 2^130-5 (partially reduced).
 *
 * H is held in three limbs H2:H1:H0, R in two limbs R1:R0 and R1_MULT5 is
 * the caller supplied precomputed value used for the product terms that
 * wrap around the modulus.  On exit H2 holds only the two lowest bits of
 * the top partial sum; the bits above them are multiplied by 5 and folded
 * back into the lowest limb. */
#define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
    u64 p0_lo, p0_hi, p1_lo, p1_hi; \
    u64 q0_lo, q0_hi, q1_lo, q1_hi; \
    \
    /* low column: h0 * r0 + h1 * r1 (mod 2^130-5) */ \
    umul_ppmm(p0_hi, p0_lo, H0, R0); \
    umul_ppmm(q0_hi, q0_lo, H1, R1_MULT5); \
    add_ssaaaa(p0_hi, p0_lo, p0_hi, p0_lo, q0_hi, q0_lo); \
    \
    /* high column: h0 * r1 + h1 * r0 */ \
    umul_ppmm(p1_hi, p1_lo, H0, R1); \
    umul_ppmm(q1_hi, q1_lo, H1, R0); \
    add_ssaaaa(p1_hi, p1_lo, p1_hi, p1_lo, q1_hi, q1_lo); \
    \
    /* contributions of the small top limb h2 */ \
    q1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
    q1_hi = H2 * R0;       /* h2 * r0 */ \
    /* note: the high word of this sum lands in H0, the low word in H1 */ \
    add_ssaaaa(H0, H1, p1_hi, p1_lo, q1_hi, q1_lo); \
    \
    /* carry propagation */ \
    H2 = H0 & 3; \
    H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
    ADD_1305_64(H2, H1, H0, (u64)0, p0_hi, p0_lo); \
  } while (0)
234
235
#ifndef HAVE_ASM_POLY1305_BLOCKS
236
237
static unsigned int
238
poly1305_blocks_generic (poly1305_context_t *ctx, const byte *buf, size_t len,
239
       byte high_pad)
240
0
{
241
0
  POLY1305_STATE *st = &ctx->state;
242
0
  u64 r0, r1, r1_mult5;
243
0
  u64 h0, h1, h2;
244
0
  u64 m0, m1, m2;
245
246
0
  m2 = high_pad;
247
248
0
  h0 = st->h[0] + ((u64)st->h[1] << 32);
249
0
  h1 = st->h[2] + ((u64)st->h[3] << 32);
250
0
  h2 = st->h[4];
251
252
0
  r0 = st->r[0] + ((u64)st->r[1] << 32);
253
0
  r1 = st->r[2] + ((u64)st->r[3] << 32);
254
255
0
  r1_mult5 = (r1 >> 2) + r1;
256
257
0
  m0 = buf_get_le64(buf + 0);
258
0
  m1 = buf_get_le64(buf + 8);
259
0
  buf += POLY1305_BLOCKSIZE;
260
0
  len -= POLY1305_BLOCKSIZE;
261
262
0
  while (len >= POLY1305_BLOCKSIZE)
263
0
    {
264
      /* a = h + m */
265
0
      ADD_1305_64(h2, h1, h0, m2, m1, m0);
266
267
0
      m0 = buf_get_le64(buf + 0);
268
0
      m1 = buf_get_le64(buf + 8);
269
270
      /* h = a * r (partial mod 2^130-5) */
271
0
      MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
272
273
0
      buf += POLY1305_BLOCKSIZE;
274
0
      len -= POLY1305_BLOCKSIZE;
275
0
    }
276
277
  /* a = h + m */
278
0
  ADD_1305_64(h2, h1, h0, m2, m1, m0);
279
280
  /* h = a * r (partial mod 2^130-5) */
281
0
  MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
282
283
0
  st->h[0] = h0;
284
0
  st->h[1] = h0 >> 32;
285
0
  st->h[2] = h1;
286
0
  st->h[3] = h1 >> 32;
287
0
  st->h[4] = h2;
288
289
0
  return 6 * sizeof (void *) + 18 * sizeof (u64);
290
0
}
291
292
static unsigned int
293
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
294
     byte high_pad)
295
0
{
296
0
#ifdef POLY1305_USE_AVX512
297
0
  if ((high_pad & ctx->use_avx512) != 0)
298
0
    return poly1305_amd64_avx512_blocks(ctx, buf, len);
299
0
#endif
300
301
0
  return poly1305_blocks_generic(ctx, buf, len, high_pad);
302
0
}
303
304
#endif /* !HAVE_ASM_POLY1305_BLOCKS */
305
306
static unsigned int poly1305_final (poly1305_context_t *ctx,
307
            byte mac[POLY1305_TAGLEN])
308
0
{
309
0
  POLY1305_STATE *st = &ctx->state;
310
0
  unsigned int burn = 0;
311
0
  u64 u, carry;
312
0
  u64 k0, k1;
313
0
  u64 h0, h1;
314
0
  u64 h2;
315
316
  /* process the remaining block */
317
0
  if (ctx->leftover)
318
0
    {
319
0
      ctx->buffer[ctx->leftover++] = 1;
320
0
      if (ctx->leftover < POLY1305_BLOCKSIZE)
321
0
  {
322
0
    memset (&ctx->buffer[ctx->leftover], 0,
323
0
      POLY1305_BLOCKSIZE - ctx->leftover);
324
0
    ctx->leftover = POLY1305_BLOCKSIZE;
325
0
  }
326
0
      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
327
0
    }
328
329
0
  h0 = st->h[0] + ((u64)st->h[1] << 32);
330
0
  h1 = st->h[2] + ((u64)st->h[3] << 32);
331
0
  h2 = st->h[4];
332
333
0
  k0 = st->k[0] + ((u64)st->k[1] << 32);
334
0
  k1 = st->k[2] + ((u64)st->k[3] << 32);
335
336
  /* check if h is more than 2^130-5, by adding 5. */
337
0
  add_ssaaaa(carry, u, 0, h0, 0, 5);
338
0
  add_ssaaaa(carry, u, 0, carry, 0, h1);
339
0
  u = (carry + h2) >> 2; /* u == 0 or 1 */
340
341
  /* minus 2^130-5 ... (+5) */
342
0
  u = (-u) & 5;
343
0
  add_ssaaaa(h1, h0, h1, h0, 0, u);
344
345
  /* add high part of key + h */
346
0
  add_ssaaaa(h1, h0, h1, h0, k1, k0);
347
0
  buf_put_le64(mac + 0, h0);
348
0
  buf_put_le64(mac + 8, h1);
349
350
  /* burn_stack */
351
0
  return 4 * sizeof (void *) + 7 * sizeof (u64) + burn;
352
0
}
353
354
#endif /* USE_MPI_64BIT */
355
356
#ifdef USE_MPI_32BIT
357
358
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
359
360
/* HI:LO += A * B (arm): a single umlal instruction.  HI and LO are tied
 * to the output registers via matching constraints so their current
 * values are the accumulator inputs. */
#define UMUL_ADD_32(HI, LO, A, B) \
      __asm__ ("umlal %[lo], %[hi], %[a], %[b]" \
               : [hi] "=r" (HI), [lo] "=r" (LO) \
               : "0" (HI), "1" (LO), [a] "r" (A), [b] "r" (B) )
365
366
/* A += B (arm): five-limb addition A4:A3:A2:A1:A0 += B4:B3:B2:B1:B0.
 *
 * Two variants are provided:
 *  - With __GCC_ASM_FLAG_OUTPUTS__, one asm statement adds the four low
 *    limbs with an adds/adcs chain and exports the resulting carry flag
 *    through the "=@cccs" flag output into __carry; the top limb is then
 *    finished in C as A4 += B4 + __carry.
 *  - Otherwise the addition is split over two asm statements.  __carry
 *    starts out holding B0; after the first two limbs have been added the
 *    carry flag is rotated into bit 31 of __carry with rrx, and the second
 *    statement shifts it back out with lsls before continuing the adcs/adc
 *    chain for the remaining three limbs.
 */
367
#ifdef __GCC_ASM_FLAG_OUTPUTS__
368
#  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
369
      u32 __carry; \
370
      __asm__ ("adds %0, %0, %5\n" \
371
         "adcs %1, %1, %6\n" \
372
         "adcs %2, %2, %7\n" \
373
         "adcs %3, %3, %8\n" \
374
         : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), \
375
           "=@cccs" (__carry) \
376
         : "r" (B0), "r" (B1), "r" (B2), "r" (B3) \
377
         : ); \
378
      (A4) += (B4) + __carry; \
379
    } while (0)
380
#else
381
#  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
382
      u32 __carry = (B0); \
383
      __asm__ ("adds %0, %0, %2\n" \
384
         "adcs %1, %1, %3\n" \
385
         "rrx %2, %2\n" /* carry to 31th bit */ \
386
         : "+r" (A0), "+r" (A1), "+r" (__carry) \
387
         : "r" (B1), "r" (0) \
388
         : "cc" ); \
389
      __asm__ ("lsls %0, %0, #1\n" /* carry from 31th bit */ \
390
         "adcs %1, %1, %4\n" \
391
         "adcs %2, %2, %5\n" \
392
         "adc  %3, %3, %6\n" \
393
         : "+r" (__carry), "+r" (A2), "+r" (A3), "+r" (A4) \
394
         : "r" (B2), "r" (B3), "r" (B4) \
395
         : "cc" ); \
396
    } while (0)
397
#endif
398
399
#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
400
401
#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 5
402
/* Note: ADD_1305_32 below does not compile on GCC-4.7 */
403
404
/* A += B (i386): five-limb addition A4:A3:A2:A1:A0 += B4:B3:B2:B1:B0
 * using an addl/adcl carry chain.  The A operands are read and written,
 * the B operands are only read. */
#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
      __asm__ ("addl %[b0], %[a0]\n" \
               "adcl %[b1], %[a1]\n" \
               "adcl %[b2], %[a2]\n" \
               "adcl %[b3], %[a3]\n" \
               "adcl %[b4], %[a4]\n" \
               : [a0] "+r" (A0), [a1] "+r" (A1), [a2] "+r" (A2), \
                 [a3] "+r" (A3), [a4] "+r" (A4) \
               : [b0] "g" (B0), [b1] "g" (B1), [b2] "g" (B2), \
                 [b3] "g" (B3), [b4] "g" (B4) \
               : "cc" )
414
415
#endif /* __i386__ */
416
417
#ifndef UMUL_ADD_32
/* HI:LO += A * B (generic fallback, used when no architecture specific
 * variant was defined above): full product via umul_ppmm followed by a
 * two-limb add_ssaaaa into the accumulator. */
#  define UMUL_ADD_32(HI, LO, A, B) do { \
    u32 prod_lo, prod_hi; \
    umul_ppmm(prod_hi, prod_lo, A, B); \
    add_ssaaaa(HI, LO, HI, LO, prod_hi, prod_lo); \
  } while (0)
#endif
425
426
#ifndef ADD_1305_32
/* A += B (generic fallback, used when no architecture specific variant
 * was defined above).  Five-limb addition assembled from two-limb
 * add_ssaaaa steps; the carry of each of the three low limbs is kept in
 * its own temporary and folded into the next limb. */
#  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
    u32 cy0, cy1, cy2; \
    /* limb 0 */ \
    add_ssaaaa(cy0, A0, 0, A0, 0, B0); \
    /* limb 1, then carry from limb 0 */ \
    add_ssaaaa(cy1, A1, 0, A1, 0, B1); \
    add_ssaaaa(cy1, A1, cy1, A1, 0, cy0); \
    /* limb 2, then carry from limb 1 */ \
    add_ssaaaa(cy2, A2, 0, A2, 0, B2); \
    add_ssaaaa(cy2, A2, cy2, A2, 0, cy1); \
    /* limbs 4:3, then carry from limb 2 */ \
    add_ssaaaa(A4, A3, A4, A3, B4, B3); \
    add_ssaaaa(A4, A3, A4, A3, 0, cy2); \
  } while (0)
#endif
439
440
/* H = H * R mod 2^130-5 (partially reduced), 32-bit limb variant.
 *
 * H is held in five limbs H4..H0 and R in four limbs R3..R0; the
 * Rn_MULT5 arguments are caller supplied precomputed values used for the
 * product terms that wrap around the modulus.  Four 64-bit column sums
 * p0..p3 are accumulated.
 *
 * The statement order matters: H1, H2, H3 and H4 are overwritten with
 * the high words of the column sums as soon as their old values are no
 * longer needed. */
#define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
                        R3_MULT5, R2_MULT5, R1_MULT5) do { \
    u32 p0_lo, p0_hi, p1_lo, p1_hi, p2_lo, p2_hi, p3_lo, p3_hi; \
    u32 q_lo, q_hi; \
    \
    /* terms with h0 start the four column sums */ \
    umul_ppmm(p0_hi, p0_lo, H0, R0);  /* h0 * r0 */ \
    umul_ppmm(p1_hi, p1_lo, H0, R1);  /* h0 * r1 */ \
    umul_ppmm(p2_hi, p2_lo, H0, R2);  /* h0 * r2 */ \
    umul_ppmm(p3_hi, p3_lo, H0, R3);  /* h0 * r3 */ \
    \
    /* terms with h1 */ \
    UMUL_ADD_32(p0_hi, p0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \
    UMUL_ADD_32(p1_hi, p1_lo, H1, R0);       /* h1 * r0 */ \
    UMUL_ADD_32(p2_hi, p2_lo, H1, R1);       /* h1 * r1 */ \
    UMUL_ADD_32(p3_hi, p3_lo, H1, R2);       /* h1 * r2 */ \
    \
    /* terms with h2 */ \
    UMUL_ADD_32(p0_hi, p0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \
    UMUL_ADD_32(p1_hi, p1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \
    UMUL_ADD_32(p2_hi, p2_lo, H2, R0);       /* h2 * r0 */ \
    UMUL_ADD_32(p3_hi, p3_lo, H2, R1);       /* h2 * r1 */ \
    \
    /* terms with h3; column 0 is complete after the first one */ \
    UMUL_ADD_32(p0_hi, p0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \
    H1 = p0_hi; \
    UMUL_ADD_32(p1_hi, p1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \
    UMUL_ADD_32(p2_hi, p2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \
    UMUL_ADD_32(p3_hi, p3_lo, H3, R0);       /* h3 * r0 */ \
    \
    /* terms with the small top limb h4 (single-word products) */ \
    q_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \
    q_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \
    add_ssaaaa(H2, p1_lo, p1_hi, p1_lo, 0, q_lo); \
    add_ssaaaa(H3, p2_lo, p2_hi, p2_lo, 0, q_hi); \
    q_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \
    q_hi = H4 * R0;       /* h4 * r0 */ \
    add_ssaaaa(H4, p3_lo, p3_hi, p3_lo, q_hi, q_lo); \
    \
    /* carry propagation */ \
    H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \
    H4 &= 3; \
    ADD_1305_32(H4, H3, H2, H1, H0, 0, p3_lo, p2_lo, p1_lo, p0_lo); \
  } while (0)
481
482
#ifndef HAVE_ASM_POLY1305_BLOCKS
483
484
static unsigned int
485
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
486
     byte high_pad)
487
{
488
  POLY1305_STATE *st = &ctx->state;
489
  u32 r1_mult5, r2_mult5, r3_mult5;
490
  u32 h0, h1, h2, h3, h4;
491
  u32 m0, m1, m2, m3, m4;
492
493
  m4 = high_pad;
494
495
  h0 = st->h[0];
496
  h1 = st->h[1];
497
  h2 = st->h[2];
498
  h3 = st->h[3];
499
  h4 = st->h[4];
500
501
  r1_mult5 = (st->r[1] >> 2) + st->r[1];
502
  r2_mult5 = (st->r[2] >> 2) + st->r[2];
503
  r3_mult5 = (st->r[3] >> 2) + st->r[3];
504
505
  while (len >= POLY1305_BLOCKSIZE)
506
    {
507
      m0 = buf_get_le32(buf + 0);
508
      m1 = buf_get_le32(buf + 4);
509
      m2 = buf_get_le32(buf + 8);
510
      m3 = buf_get_le32(buf + 12);
511
512
      /* a = h + m */
513
      ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0);
514
515
      /* h = a * r (partial mod 2^130-5) */
516
      MUL_MOD_1305_32(h4, h3, h2, h1, h0,
517
          st->r[3], st->r[2], st->r[1], st->r[0],
518
          r3_mult5, r2_mult5, r1_mult5);
519
520
      buf += POLY1305_BLOCKSIZE;
521
      len -= POLY1305_BLOCKSIZE;
522
    }
523
524
  st->h[0] = h0;
525
  st->h[1] = h1;
526
  st->h[2] = h2;
527
  st->h[3] = h3;
528
  st->h[4] = h4;
529
530
  return 6 * sizeof (void *) + 28 * sizeof (u32);
531
}
532
533
#endif /* !HAVE_ASM_POLY1305_BLOCKS */
534
535
static unsigned int poly1305_final (poly1305_context_t *ctx,
536
            byte mac[POLY1305_TAGLEN])
537
{
538
  POLY1305_STATE *st = &ctx->state;
539
  unsigned int burn = 0;
540
  u32 carry, tmp0, tmp1, tmp2, u;
541
  u32 h4, h3, h2, h1, h0;
542
543
  /* process the remaining block */
544
  if (ctx->leftover)
545
    {
546
      ctx->buffer[ctx->leftover++] = 1;
547
      if (ctx->leftover < POLY1305_BLOCKSIZE)
548
  {
549
    memset (&ctx->buffer[ctx->leftover], 0,
550
      POLY1305_BLOCKSIZE - ctx->leftover);
551
    ctx->leftover = POLY1305_BLOCKSIZE;
552
  }
553
      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
554
    }
555
556
  h0 = st->h[0];
557
  h1 = st->h[1];
558
  h2 = st->h[2];
559
  h3 = st->h[3];
560
  h4 = st->h[4];
561
562
  /* check if h is more than 2^130-5, by adding 5. */
563
  add_ssaaaa(carry, tmp0, 0, h0, 0, 5);
564
  add_ssaaaa(carry, tmp0, 0, carry, 0, h1);
565
  add_ssaaaa(carry, tmp0, 0, carry, 0, h2);
566
  add_ssaaaa(carry, tmp0, 0, carry, 0, h3);
567
  u = (carry + h4) >> 2; /* u == 0 or 1 */
568
569
  /* minus 2^130-5 ... (+5) */
570
  u = (-u) & 5;
571
  add_ssaaaa(carry, h0, 0, h0, 0, u);
572
  add_ssaaaa(carry, h1, 0, h1, 0, carry);
573
  add_ssaaaa(carry, h2, 0, h2, 0, carry);
574
  add_ssaaaa(carry, h3, 0, h3, 0, carry);
575
576
  /* add high part of key + h */
577
  add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]);
578
  add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]);
579
  add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0);
580
  add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]);
581
  add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1);
582
  add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]);
583
  h3 += tmp2;
584
585
  buf_put_le32(mac + 0, h0);
586
  buf_put_le32(mac + 4, h1);
587
  buf_put_le32(mac + 8, h2);
588
  buf_put_le32(mac + 12, h3);
589
590
  /* burn_stack */
591
  return 4 * sizeof (void *) + 10 * sizeof (u32) + burn;
592
}
593
594
#endif /* USE_MPI_32BIT */
595
596
597
unsigned int
598
_gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
599
          size_t bytes)
600
0
{
601
0
  unsigned int burn = 0;
602
0
  unsigned int nburn;
603
604
  /* handle leftover */
605
0
  if (ctx->leftover)
606
0
    {
607
0
      size_t want = (POLY1305_BLOCKSIZE - ctx->leftover);
608
0
      if (want > bytes)
609
0
  want = bytes;
610
0
      buf_cpy (ctx->buffer + ctx->leftover, m, want);
611
0
      bytes -= want;
612
0
      m += want;
613
0
      ctx->leftover += want;
614
0
      if (ctx->leftover < POLY1305_BLOCKSIZE)
615
0
  return 0;
616
0
      nburn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
617
0
      burn = nburn > burn ? nburn : burn;
618
0
      ctx->leftover = 0;
619
0
    }
620
621
#ifdef POLY1305_USE_PPC_VEC
622
  /* PPC-P10/little-endian: bulk process multiples of eight blocks */
623
  if (ctx->use_p10 && bytes >= POLY1305_BLOCKSIZE * 8)
624
    {
625
      size_t nblks = bytes / (POLY1305_BLOCKSIZE * 8);
626
      size_t len = nblks * (POLY1305_BLOCKSIZE * 8);
627
      POLY1305_STATE *st = &ctx->state;
628
      nburn = gcry_poly1305_p10le_4blocks ((unsigned char *) st, m, len);
629
      burn = nburn > burn ? nburn : burn;
630
      m += len;
631
      bytes -= len;
632
    }
633
#endif /* POLY1305_USE_PPC_VEC */
634
635
  /* process full blocks */
636
0
  if (bytes >= POLY1305_BLOCKSIZE)
637
0
    {
638
0
      size_t nblks = bytes / POLY1305_BLOCKSIZE;
639
0
      nburn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
640
0
      burn = nburn > burn ? nburn : burn;
641
0
      m += nblks * POLY1305_BLOCKSIZE;
642
0
      bytes -= nblks * POLY1305_BLOCKSIZE;
643
0
    }
644
645
  /* store leftover */
646
0
  if (bytes)
647
0
    {
648
0
      buf_cpy (ctx->buffer + ctx->leftover, m, bytes);
649
0
      ctx->leftover += bytes;
650
0
    }
651
652
0
  return burn;
653
0
}
654
655
656
void
657
_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
658
0
{
659
0
  unsigned int burn;
660
661
0
  burn = _gcry_poly1305_update_burn (ctx, m, bytes);
662
663
0
  if (burn)
664
0
    _gcry_burn_stack (burn);
665
0
}
666
667
668
void
669
_gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN])
670
0
{
671
0
  unsigned int burn;
672
673
0
  burn = poly1305_final (ctx, mac);
674
675
0
  _gcry_burn_stack (burn);
676
0
}
677
678
679
gcry_err_code_t
680
_gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
681
         size_t keylen)
682
0
{
683
0
  static int initialized;
684
0
  static const char *selftest_failed;
685
686
0
  if (!initialized)
687
0
    {
688
0
      initialized = 1;
689
0
      selftest_failed = selftest ();
690
0
      if (selftest_failed)
691
0
  log_error ("Poly1305 selftest failed (%s)\n", selftest_failed);
692
0
    }
693
694
0
  if (keylen != POLY1305_KEYLEN)
695
0
    return GPG_ERR_INV_KEYLEN;
696
697
0
  if (selftest_failed)
698
0
    return GPG_ERR_SELFTEST_FAILED;
699
700
0
  poly1305_init (ctx, key);
701
702
0
  return 0;
703
0
}
704
705
706
static void
707
poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes,
708
         const byte * key)
709
0
{
710
0
  poly1305_context_t ctx;
711
712
0
  memset (&ctx, 0, sizeof (ctx));
713
714
0
  _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN);
715
0
  _gcry_poly1305_update (&ctx, m, bytes);
716
0
  _gcry_poly1305_finish (&ctx, mac);
717
718
0
  wipememory (&ctx, sizeof (ctx));
719
0
}
720
721
722
static const char *
723
selftest (void)
724
0
{
725
  /* example from nacl */
726
0
  static const byte nacl_key[POLY1305_KEYLEN] = {
727
0
    0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
728
0
    0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
729
0
    0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
730
0
    0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80,
731
0
  };
732
733
0
  static const byte nacl_msg[131] = {
734
0
    0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
735
0
    0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
736
0
    0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
737
0
    0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
738
0
    0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
739
0
    0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
740
0
    0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
741
0
    0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
742
0
    0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
743
0
    0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
744
0
    0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
745
0
    0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
746
0
    0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
747
0
    0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
748
0
    0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
749
0
    0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
750
0
    0xe3, 0x55, 0xa5
751
0
  };
752
753
0
  static const byte nacl_mac[16] = {
754
0
    0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
755
0
    0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
756
0
  };
757
758
  /* generates a final value of (2^130 - 2) == 3 */
759
0
  static const byte wrap_key[POLY1305_KEYLEN] = {
760
0
    0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
761
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
762
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
763
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
764
0
  };
765
766
0
  static const byte wrap_msg[16] = {
767
0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
768
0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
769
0
  };
770
771
0
  static const byte wrap_mac[16] = {
772
0
    0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
773
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
774
0
  };
775
776
  /* mac of the macs of messages of length 0 to 256, where the key and messages
777
   * have all their values set to the length
778
   */
779
0
  static const byte total_key[POLY1305_KEYLEN] = {
780
0
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
781
0
    0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9,
782
0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
783
0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
784
0
  };
785
786
0
  static const byte total_mac[16] = {
787
0
    0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd,
788
0
    0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39
789
0
  };
790
791
0
  poly1305_context_t ctx;
792
0
  poly1305_context_t total_ctx;
793
0
  byte all_key[POLY1305_KEYLEN];
794
0
  byte all_msg[256];
795
0
  byte mac[16];
796
0
  size_t i, j;
797
798
0
  memset (&ctx, 0, sizeof (ctx));
799
0
  memset (&total_ctx, 0, sizeof (total_ctx));
800
801
0
  memset (mac, 0, sizeof (mac));
802
0
  poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key);
803
0
  if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
804
0
    return "Poly1305 test 1 failed.";
805
806
  /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so
807
   * make sure everything still works varying between them */
808
0
  memset (mac, 0, sizeof (mac));
809
0
  _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN);
810
0
  _gcry_poly1305_update (&ctx, nacl_msg + 0, 32);
811
0
  _gcry_poly1305_update (&ctx, nacl_msg + 32, 64);
812
0
  _gcry_poly1305_update (&ctx, nacl_msg + 96, 16);
813
0
  _gcry_poly1305_update (&ctx, nacl_msg + 112, 8);
814
0
  _gcry_poly1305_update (&ctx, nacl_msg + 120, 4);
815
0
  _gcry_poly1305_update (&ctx, nacl_msg + 124, 2);
816
0
  _gcry_poly1305_update (&ctx, nacl_msg + 126, 1);
817
0
  _gcry_poly1305_update (&ctx, nacl_msg + 127, 1);
818
0
  _gcry_poly1305_update (&ctx, nacl_msg + 128, 1);
819
0
  _gcry_poly1305_update (&ctx, nacl_msg + 129, 1);
820
0
  _gcry_poly1305_update (&ctx, nacl_msg + 130, 1);
821
0
  _gcry_poly1305_finish (&ctx, mac);
822
0
  if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
823
0
    return "Poly1305 test 2 failed.";
824
825
0
  memset (mac, 0, sizeof (mac));
826
0
  poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key);
827
0
  if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0)
828
0
    return "Poly1305 test 3 failed.";
829
830
0
  _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN);
831
0
  for (i = 0; i < 256; i++)
832
0
    {
833
      /* set key and message to 'i,i,i..' */
834
0
      for (j = 0; j < sizeof (all_key); j++)
835
0
  all_key[j] = i;
836
0
      for (j = 0; j < i; j++)
837
0
  all_msg[j] = i;
838
0
      poly1305_auth (mac, all_msg, i, all_key);
839
0
      _gcry_poly1305_update (&total_ctx, mac, 16);
840
0
    }
841
0
  _gcry_poly1305_finish (&total_ctx, mac);
842
0
  if (memcmp (total_mac, mac, sizeof (total_mac)) != 0)
843
0
    return "Poly1305 test 4 failed.";
844
845
0
  return NULL;
846
0
}