Coverage Report

Created: 2022-12-08 06:10

/src/libgcrypt/cipher/chacha20.c
Line
Count
Source (jump to first uncovered line)
1
/* chacha20.c  -  Bernstein's ChaCha20 cipher
2
 * Copyright (C) 2014,2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3
 *
4
 * This file is part of Libgcrypt.
5
 *
6
 * Libgcrypt is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU Lesser general Public License as
8
 * published by the Free Software Foundation; either version 2.1 of
9
 * the License, or (at your option) any later version.
10
 *
11
 * Libgcrypt is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18
 *
19
 * For a description of the algorithm, see:
20
 *   http://cr.yp.to/chacha.html
21
 */
22
23
/*
24
 * Based on D. J. Bernstein reference implementation at
25
 * http://cr.yp.to/chacha.html:
26
 *
27
 * chacha-regs.c version 20080118
28
 * D. J. Bernstein
29
 * Public domain.
30
 */
31
32
#include <config.h>
33
#include <stdio.h>
34
#include <stdlib.h>
35
#include <string.h>
36
#include "types.h"
37
#include "g10lib.h"
38
#include "cipher.h"
39
#include "cipher-internal.h"
40
#include "bufhelp.h"
41
42
43
0
#define CHACHA20_MIN_KEY_SIZE 16        /* Bytes.  */
44
0
#define CHACHA20_MAX_KEY_SIZE 32        /* Bytes.  */
45
0
#define CHACHA20_BLOCK_SIZE   64        /* Bytes.  */
46
0
#define CHACHA20_MIN_IV_SIZE   8        /* Bytes.  */
47
0
#define CHACHA20_MAX_IV_SIZE  12        /* Bytes.  */
48
0
#define CHACHA20_CTR_SIZE     16        /* Bytes.  */
49
50
51
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
52
#undef USE_SSSE3
53
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
54
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
55
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
56
# define USE_SSSE3 1
57
#endif
58
59
/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
60
#undef USE_AVX2
61
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
62
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
63
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
64
# define USE_AVX2 1
65
#endif
66
67
/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
68
#undef USE_AVX512
69
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
70
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
71
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
72
# define USE_AVX512 1
73
#endif
74
75
/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
76
#undef USE_ARMV7_NEON
77
#ifdef ENABLE_NEON_SUPPORT
78
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
79
     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
80
     && defined(HAVE_GCC_INLINE_ASM_NEON)
81
#  define USE_ARMV7_NEON 1
82
# endif
83
#endif
84
85
/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
86
 * code. */
87
#undef USE_AARCH64_SIMD
88
#ifdef ENABLE_NEON_SUPPORT
89
# if defined(__AARCH64EL__) \
90
       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
91
       && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
92
#  define USE_AARCH64_SIMD 1
93
# endif
94
#endif
95
96
/* USE_PPC_VEC indicates whether to enable PowerPC vector
97
 * accelerated code. */
98
#undef USE_PPC_VEC
99
#ifdef ENABLE_PPC_CRYPTO_SUPPORT
100
# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
101
     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
102
#  if __GNUC__ >= 4
103
#   define USE_PPC_VEC 1
104
#  endif
105
# endif
106
#endif
107
108
/* USE_S390X_VX indicates whether to enable zSeries code. */
109
#undef USE_S390X_VX
110
#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
111
# if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
112
#  define USE_S390X_VX 1
113
# endif /* USE_S390X_VX */
114
#endif
115
116
/* Assembly implementations use SystemV ABI, ABI conversion and additional
117
 * stack to store XMM6-XMM15 needed on Win64. */
118
#undef ASM_FUNC_ABI
119
#undef ASM_EXTRA_STACK
120
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
121
# define ASM_FUNC_ABI __attribute__((sysv_abi))
122
#else
123
# define ASM_FUNC_ABI
124
#endif
125
126
127
typedef struct CHACHA20_context_s
128
{
129
  u32 input[16];
130
  unsigned char pad[CHACHA20_BLOCK_SIZE];
131
  unsigned int unused; /* bytes in the pad.  */
132
  unsigned int use_ssse3:1;
133
  unsigned int use_avx2:1;
134
  unsigned int use_avx512:1;
135
  unsigned int use_neon:1;
136
  unsigned int use_ppc:1;
137
  unsigned int use_p10:1;
138
  unsigned int use_s390x:1;
139
} CHACHA20_context_t;
140
141
142
#ifdef USE_SSSE3
143
144
unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
145
            const byte *src,
146
            size_t nblks) ASM_FUNC_ABI;
147
148
unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
149
            const byte *src,
150
            size_t nblks) ASM_FUNC_ABI;
151
152
unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
153
    u32 *state, byte *dst, const byte *src, size_t nblks,
154
    void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
155
156
unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
157
    u32 *state, byte *dst, const byte *src, size_t nblks,
158
    void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
159
160
#endif /* USE_SSSE3 */
161
162
#ifdef USE_AVX2
163
164
unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
165
                 const byte *src,
166
                 size_t nblks) ASM_FUNC_ABI;
167
168
unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
169
    u32 *state, byte *dst, const byte *src, size_t nblks,
170
    void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
171
172
#endif /* USE_AVX2 */
173
174
#ifdef USE_AVX512
175
176
unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst,
177
              const byte *src,
178
              size_t nblks) ASM_FUNC_ABI;
179
180
#endif /* USE_AVX2 */
181
182
#ifdef USE_PPC_VEC
183
184
#ifndef WORDS_BIGENDIAN
185
unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst,
186
             const byte *src,
187
             size_t len);
188
#endif
189
190
unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
191
           const byte *src,
192
           size_t nblks);
193
194
unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
195
           const byte *src,
196
           size_t nblks);
197
198
#undef USE_PPC_VEC_POLY1305
199
#if SIZEOF_UNSIGNED_LONG == 8
200
#define USE_PPC_VEC_POLY1305 1
201
unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
202
    u32 *state, byte *dst, const byte *src, size_t nblks,
203
    POLY1305_STATE *st, const byte *poly1305_src);
204
#endif /* SIZEOF_UNSIGNED_LONG == 8 */
205
206
#endif /* USE_PPC_VEC */
207
208
#ifdef USE_S390X_VX
209
210
unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst,
211
               const byte *src, size_t nblks);
212
213
unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst,
214
             const byte *src, size_t nblks);
215
216
#undef USE_S390X_VX_POLY1305
217
#if SIZEOF_UNSIGNED_LONG == 8
218
#define USE_S390X_VX_POLY1305 1
219
unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8(
220
    u32 *state, byte *dst, const byte *src, size_t nblks,
221
    POLY1305_STATE *st, const byte *poly1305_src);
222
223
unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
224
    u32 *state, byte *dst, const byte *src, size_t nblks,
225
    POLY1305_STATE *st, const byte *poly1305_src);
226
#endif /* SIZEOF_UNSIGNED_LONG == 8 */
227
228
#endif /* USE_S390X_VX */
229
230
#ifdef USE_ARMV7_NEON
231
232
unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
233
                 const byte *src,
234
                 size_t nblks);
235
236
#endif /* USE_ARMV7_NEON */
237
238
#ifdef USE_AARCH64_SIMD
239
240
unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
241
              const byte *src, size_t nblks);
242
243
unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
244
    u32 *state, byte *dst, const byte *src, size_t nblks,
245
    void *poly1305_state, const byte *poly1305_src);
246
247
#endif /* USE_AARCH64_SIMD */
248
249
250
static const char *selftest (void);
251

252
253
0
#define ROTATE(v,c) (rol(v,c))
254
#define XOR(v,w)  ((v) ^ (w))
255
0
#define PLUS(v,w) ((u32)((v) + (w)))
256
0
#define PLUSONE(v)  (PLUS((v),1))
257
258
#define QUARTERROUND(a,b,c,d) \
259
0
  a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
260
0
  c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
261
0
  a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
262
0
  c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
263
264
#define BUF_XOR_LE32(dst, src, offset, x) \
265
0
  buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
266
267
static unsigned int
268
do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
269
0
{
270
0
  u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
271
0
  unsigned int i;
272
273
0
  while (nblks)
274
0
    {
275
0
      x0 = input[0];
276
0
      x1 = input[1];
277
0
      x2 = input[2];
278
0
      x3 = input[3];
279
0
      x4 = input[4];
280
0
      x5 = input[5];
281
0
      x6 = input[6];
282
0
      x7 = input[7];
283
0
      x8 = input[8];
284
0
      x9 = input[9];
285
0
      x10 = input[10];
286
0
      x11 = input[11];
287
0
      x12 = input[12];
288
0
      x13 = input[13];
289
0
      x14 = input[14];
290
0
      x15 = input[15];
291
292
0
      for (i = 20; i > 0; i -= 2)
293
0
  {
294
0
    QUARTERROUND(x0, x4,  x8, x12)
295
0
    QUARTERROUND(x1, x5,  x9, x13)
296
0
    QUARTERROUND(x2, x6, x10, x14)
297
0
    QUARTERROUND(x3, x7, x11, x15)
298
0
    QUARTERROUND(x0, x5, x10, x15)
299
0
    QUARTERROUND(x1, x6, x11, x12)
300
0
    QUARTERROUND(x2, x7,  x8, x13)
301
0
    QUARTERROUND(x3, x4,  x9, x14)
302
0
  }
303
304
0
      x0 = PLUS(x0, input[0]);
305
0
      x1 = PLUS(x1, input[1]);
306
0
      x2 = PLUS(x2, input[2]);
307
0
      x3 = PLUS(x3, input[3]);
308
0
      x4 = PLUS(x4, input[4]);
309
0
      x5 = PLUS(x5, input[5]);
310
0
      x6 = PLUS(x6, input[6]);
311
0
      x7 = PLUS(x7, input[7]);
312
0
      x8 = PLUS(x8, input[8]);
313
0
      x9 = PLUS(x9, input[9]);
314
0
      x10 = PLUS(x10, input[10]);
315
0
      x11 = PLUS(x11, input[11]);
316
0
      x12 = PLUS(x12, input[12]);
317
0
      x13 = PLUS(x13, input[13]);
318
0
      x14 = PLUS(x14, input[14]);
319
0
      x15 = PLUS(x15, input[15]);
320
321
0
      input[12] = PLUSONE(input[12]);
322
0
      input[13] = PLUS(input[13], !input[12]);
323
324
0
      BUF_XOR_LE32(dst, src, 0, x0);
325
0
      BUF_XOR_LE32(dst, src, 4, x1);
326
0
      BUF_XOR_LE32(dst, src, 8, x2);
327
0
      BUF_XOR_LE32(dst, src, 12, x3);
328
0
      BUF_XOR_LE32(dst, src, 16, x4);
329
0
      BUF_XOR_LE32(dst, src, 20, x5);
330
0
      BUF_XOR_LE32(dst, src, 24, x6);
331
0
      BUF_XOR_LE32(dst, src, 28, x7);
332
0
      BUF_XOR_LE32(dst, src, 32, x8);
333
0
      BUF_XOR_LE32(dst, src, 36, x9);
334
0
      BUF_XOR_LE32(dst, src, 40, x10);
335
0
      BUF_XOR_LE32(dst, src, 44, x11);
336
0
      BUF_XOR_LE32(dst, src, 48, x12);
337
0
      BUF_XOR_LE32(dst, src, 52, x13);
338
0
      BUF_XOR_LE32(dst, src, 56, x14);
339
0
      BUF_XOR_LE32(dst, src, 60, x15);
340
341
0
      src += CHACHA20_BLOCK_SIZE;
342
0
      dst += CHACHA20_BLOCK_SIZE;
343
0
      nblks--;
344
0
    }
345
346
  /* burn_stack */
347
0
  return (17 * sizeof(u32) + 6 * sizeof(void *));
348
0
}
349
350
351
static unsigned int
352
chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
353
     size_t nblks)
354
0
{
355
0
#ifdef USE_SSSE3
356
0
  if (ctx->use_ssse3)
357
0
    {
358
0
      return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
359
0
    }
360
0
#endif
361
362
#ifdef USE_PPC_VEC
363
  if (ctx->use_ppc)
364
    {
365
      return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
366
    }
367
#endif
368
369
#ifdef USE_S390X_VX
370
  if (ctx->use_s390x)
371
    {
372
      return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks);
373
    }
374
#endif
375
376
0
  return do_chacha20_blocks (ctx->input, dst, src, nblks);
377
0
}
378
379
380
static void
381
chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
382
                   unsigned int keylen)
383
0
{
384
0
  static const char sigma[16] = "expand 32-byte k";
385
0
  static const char tau[16] = "expand 16-byte k";
386
0
  const char *constants;
387
388
0
  ctx->input[4] = buf_get_le32(key + 0);
389
0
  ctx->input[5] = buf_get_le32(key + 4);
390
0
  ctx->input[6] = buf_get_le32(key + 8);
391
0
  ctx->input[7] = buf_get_le32(key + 12);
392
0
  if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
393
0
    {
394
0
      key += 16;
395
0
      constants = sigma;
396
0
    }
397
0
  else /* 128 bits */
398
0
    {
399
0
      constants = tau;
400
0
    }
401
0
  ctx->input[8] = buf_get_le32(key + 0);
402
0
  ctx->input[9] = buf_get_le32(key + 4);
403
0
  ctx->input[10] = buf_get_le32(key + 8);
404
0
  ctx->input[11] = buf_get_le32(key + 12);
405
0
  ctx->input[0] = buf_get_le32(constants + 0);
406
0
  ctx->input[1] = buf_get_le32(constants + 4);
407
0
  ctx->input[2] = buf_get_le32(constants + 8);
408
0
  ctx->input[3] = buf_get_le32(constants + 12);
409
0
}
410
411
412
static void
413
chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
414
0
{
415
0
  if (ivlen == CHACHA20_CTR_SIZE)
416
0
    {
417
0
      ctx->input[12] = buf_get_le32 (iv + 0);
418
0
      ctx->input[13] = buf_get_le32 (iv + 4);
419
0
      ctx->input[14] = buf_get_le32 (iv + 8);
420
0
      ctx->input[15] = buf_get_le32 (iv + 12);
421
0
    }
422
0
  else if (ivlen == CHACHA20_MAX_IV_SIZE)
423
0
    {
424
0
      ctx->input[12] = 0;
425
0
      ctx->input[13] = buf_get_le32 (iv + 0);
426
0
      ctx->input[14] = buf_get_le32 (iv + 4);
427
0
      ctx->input[15] = buf_get_le32 (iv + 8);
428
0
    }
429
0
  else if (ivlen == CHACHA20_MIN_IV_SIZE)
430
0
    {
431
0
      ctx->input[12] = 0;
432
0
      ctx->input[13] = 0;
433
0
      ctx->input[14] = buf_get_le32 (iv + 0);
434
0
      ctx->input[15] = buf_get_le32 (iv + 4);
435
0
    }
436
0
  else
437
0
    {
438
0
      ctx->input[12] = 0;
439
0
      ctx->input[13] = 0;
440
0
      ctx->input[14] = 0;
441
0
      ctx->input[15] = 0;
442
0
    }
443
0
}
444
445
446
static void
447
chacha20_setiv (void *context, const byte *iv, size_t ivlen)
448
0
{
449
0
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
450
451
  /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */
452
0
  if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
453
0
      && ivlen != CHACHA20_CTR_SIZE)
454
0
    log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
455
456
0
  if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
457
0
             || ivlen == CHACHA20_CTR_SIZE))
458
0
    chacha20_ivsetup (ctx, iv, ivlen);
459
0
  else
460
0
    chacha20_ivsetup (ctx, NULL, 0);
461
462
  /* Reset the unused pad bytes counter.  */
463
0
  ctx->unused = 0;
464
0
}
465
466
467
static gcry_err_code_t
468
chacha20_do_setkey (CHACHA20_context_t *ctx,
469
                    const byte *key, unsigned int keylen)
470
0
{
471
0
  static int initialized;
472
0
  static const char *selftest_failed;
473
0
  unsigned int features = _gcry_get_hw_features ();
474
475
0
  if (!initialized)
476
0
    {
477
0
      initialized = 1;
478
0
      selftest_failed = selftest ();
479
0
      if (selftest_failed)
480
0
        log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
481
0
    }
482
0
  if (selftest_failed)
483
0
    return GPG_ERR_SELFTEST_FAILED;
484
485
0
  if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
486
0
    return GPG_ERR_INV_KEYLEN;
487
488
0
#ifdef USE_SSSE3
489
0
  ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
490
0
#endif
491
0
#ifdef USE_AVX512
492
0
  ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
493
0
#endif
494
0
#ifdef USE_AVX2
495
0
  ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
496
0
#endif
497
#ifdef USE_ARMV7_NEON
498
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
499
#endif
500
#ifdef USE_AARCH64_SIMD
501
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
502
#endif
503
#ifdef USE_PPC_VEC
504
  ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
505
# ifndef WORDS_BIGENDIAN
506
  ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
507
#  ifdef ENABLE_FORCE_SOFT_HWFEATURES
508
  /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10.
509
   * Actual implementation works with HWF_PPC_ARCH_3_00 also. */
510
  ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0;
511
#  endif
512
# endif
513
#endif
514
#ifdef USE_S390X_VX
515
  ctx->use_s390x = (features & HWF_S390X_VX) != 0;
516
#endif
517
518
0
  (void)features;
519
520
0
  chacha20_keysetup (ctx, key, keylen);
521
522
  /* We default to a zero nonce.  */
523
0
  chacha20_setiv (ctx, NULL, 0);
524
525
0
  return 0;
526
0
}
527
528
529
static gcry_err_code_t
530
chacha20_setkey (void *context, const byte *key, unsigned int keylen,
531
                 cipher_bulk_ops_t *bulk_ops)
532
0
{
533
0
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
534
0
  gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
535
0
  (void)bulk_ops;
536
0
  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
537
0
  return rc;
538
0
}
539
540
541
static unsigned int
542
do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
543
         const byte *inbuf, size_t length)
544
0
{
545
0
  static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
546
0
  unsigned int nburn, burn = 0;
547
548
0
#ifdef USE_AVX512
549
0
  if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16)
550
0
    {
551
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
552
0
      nblocks -= nblocks % 16;
553
0
      nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf,
554
0
               nblocks);
555
0
      burn = nburn > burn ? nburn : burn;
556
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
557
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
558
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
559
0
    }
560
0
#endif
561
562
0
#ifdef USE_AVX2
563
0
  if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
564
0
    {
565
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
566
0
      nblocks -= nblocks % 8;
567
0
      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
568
0
            nblocks);
569
0
      burn = nburn > burn ? nburn : burn;
570
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
571
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
572
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
573
0
    }
574
0
#endif
575
576
0
#ifdef USE_SSSE3
577
0
  if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
578
0
    {
579
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
580
0
      nblocks -= nblocks % 4;
581
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
582
0
             nblocks);
583
0
      burn = nburn > burn ? nburn : burn;
584
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
585
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
586
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
587
0
    }
588
0
#endif
589
590
#ifdef USE_ARMV7_NEON
591
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
592
    {
593
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
594
      nblocks -= nblocks % 4;
595
      nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
596
            nblocks);
597
      burn = nburn > burn ? nburn : burn;
598
      length -= nblocks * CHACHA20_BLOCK_SIZE;
599
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
600
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
601
    }
602
#endif
603
604
#ifdef USE_AARCH64_SIMD
605
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
606
    {
607
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
608
      nblocks -= nblocks % 4;
609
      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
610
               nblocks);
611
      burn = nburn > burn ? nburn : burn;
612
      length -= nblocks * CHACHA20_BLOCK_SIZE;
613
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
614
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
615
    }
616
#endif
617
618
#ifdef USE_PPC_VEC
619
  if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
620
    {
621
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
622
      nblocks -= nblocks % 4;
623
#ifndef WORDS_BIGENDIAN
624
      /*
625
       * A workaround to skip counter overflow. This is rare.
626
       */
627
      if (ctx->use_p10 && nblocks >= 8
628
          && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
629
        {
630
          size_t len = nblocks * CHACHA20_BLOCK_SIZE;
631
          nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len);
632
        }
633
      else
634
#endif
635
        {
636
          nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf,
637
                                              nblocks);
638
        }
639
      burn = nburn > burn ? nburn : burn;
640
      length -= nblocks * CHACHA20_BLOCK_SIZE;
641
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
642
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
643
    }
644
#endif
645
646
#ifdef USE_S390X_VX
647
  if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8)
648
    {
649
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
650
      nblocks -= nblocks % 8;
651
      nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf,
652
                nblocks);
653
      burn = nburn > burn ? nburn : burn;
654
      length -= nblocks * CHACHA20_BLOCK_SIZE;
655
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
656
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
657
    }
658
#endif
659
660
0
  if (length >= CHACHA20_BLOCK_SIZE)
661
0
    {
662
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
663
0
      nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
664
0
      burn = nburn > burn ? nburn : burn;
665
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
666
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
667
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
668
0
    }
669
670
0
  if (length > 0)
671
0
    {
672
0
      nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
673
0
      burn = nburn > burn ? nburn : burn;
674
675
0
      buf_xor (outbuf, inbuf, ctx->pad, length);
676
0
      ctx->unused = CHACHA20_BLOCK_SIZE - length;
677
0
    }
678
679
0
  if (burn)
680
0
    burn += 5 * sizeof(void *);
681
682
0
  return burn;
683
0
}
684
685
686
static void
687
chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
688
                         size_t length)
689
0
{
690
0
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
691
0
  unsigned int nburn, burn = 0;
692
693
0
  if (!length)
694
0
    return;
695
696
0
  if (ctx->unused)
697
0
    {
698
0
      unsigned char *p = ctx->pad;
699
0
      size_t n;
700
701
0
      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
702
703
0
      n = ctx->unused;
704
0
      if (n > length)
705
0
        n = length;
706
707
0
      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
708
0
      length -= n;
709
0
      outbuf += n;
710
0
      inbuf += n;
711
0
      ctx->unused -= n;
712
713
0
      if (!length)
714
0
        return;
715
0
      gcry_assert (!ctx->unused);
716
0
    }
717
718
0
  nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length);
719
0
  burn = nburn > burn ? nburn : burn;
720
721
0
  if (burn)
722
0
    _gcry_burn_stack (burn);
723
0
}
724
725
726
gcry_err_code_t
727
_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
728
        const byte *inbuf, size_t length)
729
0
{
730
0
  CHACHA20_context_t *ctx = (void *) &c->context.c;
731
0
  unsigned int nburn, burn = 0;
732
0
  byte *authptr = NULL;
733
734
0
  if (!length)
735
0
    return 0;
736
737
0
  if (ctx->unused)
738
0
    {
739
0
      unsigned char *p = ctx->pad;
740
0
      size_t n;
741
742
0
      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
743
744
0
      n = ctx->unused;
745
0
      if (n > length)
746
0
        n = length;
747
748
0
      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
749
0
      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n);
750
0
      burn = nburn > burn ? nburn : burn;
751
0
      length -= n;
752
0
      outbuf += n;
753
0
      inbuf += n;
754
0
      ctx->unused -= n;
755
756
0
      if (!length)
757
0
  {
758
0
    if (burn)
759
0
      _gcry_burn_stack (burn);
760
761
0
    return 0;
762
0
  }
763
0
      gcry_assert (!ctx->unused);
764
0
    }
765
766
0
  gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
767
768
0
  if (0)
769
0
    { }
770
0
#ifdef USE_AVX512
771
0
  else if (ctx->use_avx512)
772
0
    {
773
      /* Skip stitched chacha20-poly1305 for AVX512. */
774
0
      authptr = NULL;
775
0
    }
776
0
#endif
777
0
#ifdef USE_AVX2
778
0
  else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
779
0
    {
780
0
      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8);
781
0
      burn = nburn > burn ? nburn : burn;
782
783
0
      authptr = outbuf;
784
0
      length -= 8 * CHACHA20_BLOCK_SIZE;
785
0
      outbuf += 8 * CHACHA20_BLOCK_SIZE;
786
0
      inbuf  += 8 * CHACHA20_BLOCK_SIZE;
787
0
    }
788
0
#endif
789
0
#ifdef USE_SSSE3
790
0
  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
791
0
    {
792
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4);
793
0
      burn = nburn > burn ? nburn : burn;
794
795
0
      authptr = outbuf;
796
0
      length -= 4 * CHACHA20_BLOCK_SIZE;
797
0
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
798
0
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
799
0
    }
800
0
  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2)
801
0
    {
802
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2);
803
0
      burn = nburn > burn ? nburn : burn;
804
805
0
      authptr = outbuf;
806
0
      length -= 2 * CHACHA20_BLOCK_SIZE;
807
0
      outbuf += 2 * CHACHA20_BLOCK_SIZE;
808
0
      inbuf  += 2 * CHACHA20_BLOCK_SIZE;
809
0
    }
810
0
  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
811
0
    {
812
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);
813
0
      burn = nburn > burn ? nburn : burn;
814
815
0
      authptr = outbuf;
816
0
      length -= 1 * CHACHA20_BLOCK_SIZE;
817
0
      outbuf += 1 * CHACHA20_BLOCK_SIZE;
818
0
      inbuf  += 1 * CHACHA20_BLOCK_SIZE;
819
0
    }
820
0
#endif
821
#ifdef USE_AARCH64_SIMD
822
  else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
823
    {
824
      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4);
825
      burn = nburn > burn ? nburn : burn;
826
827
      authptr = outbuf;
828
      length -= 4 * CHACHA20_BLOCK_SIZE;
829
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
830
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
831
    }
832
#endif
833
#ifdef USE_PPC_VEC_POLY1305
834
  else if (ctx->use_ppc && ctx->use_p10)
835
    {
836
      /* Skip stitched chacha20-poly1305 for P10. */
837
      authptr = NULL;
838
    }
839
  else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
840
    {
841
      nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
842
      burn = nburn > burn ? nburn : burn;
843
844
      authptr = outbuf;
845
      length -= 4 * CHACHA20_BLOCK_SIZE;
846
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
847
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
848
    }
849
#endif
850
#ifdef USE_S390X_VX_POLY1305
851
  else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8)
852
    {
853
      nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8);
854
      burn = nburn > burn ? nburn : burn;
855
856
      authptr = outbuf;
857
      length -= 8 * CHACHA20_BLOCK_SIZE;
858
      outbuf += 8 * CHACHA20_BLOCK_SIZE;
859
      inbuf  += 8 * CHACHA20_BLOCK_SIZE;
860
    }
861
  else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4)
862
    {
863
      nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4);
864
      burn = nburn > burn ? nburn : burn;
865
866
      authptr = outbuf;
867
      length -= 4 * CHACHA20_BLOCK_SIZE;
868
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
869
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
870
    }
871
  else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2)
872
    {
873
      nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2);
874
      burn = nburn > burn ? nburn : burn;
875
876
      authptr = outbuf;
877
      length -= 2 * CHACHA20_BLOCK_SIZE;
878
      outbuf += 2 * CHACHA20_BLOCK_SIZE;
879
      inbuf  += 2 * CHACHA20_BLOCK_SIZE;
880
    }
881
  else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE)
882
    {
883
      nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1);
884
      burn = nburn > burn ? nburn : burn;
885
886
      authptr = outbuf;
887
      length -= 1 * CHACHA20_BLOCK_SIZE;
888
      outbuf += 1 * CHACHA20_BLOCK_SIZE;
889
      inbuf  += 1 * CHACHA20_BLOCK_SIZE;
890
    }
891
#endif
892
893
0
  if (authptr)
894
0
    {
895
0
      size_t authoffset = outbuf - authptr;
896
897
0
#ifdef USE_AVX2
898
0
      if (ctx->use_avx2 &&
899
0
    length >= 8 * CHACHA20_BLOCK_SIZE &&
900
0
    authoffset >= 8 * CHACHA20_BLOCK_SIZE)
901
0
  {
902
0
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
903
0
    nblocks -= nblocks % 8;
904
905
0
    nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
906
0
          ctx->input, outbuf, inbuf, nblocks,
907
0
          &c->u_mode.poly1305.ctx.state, authptr);
908
0
    burn = nburn > burn ? nburn : burn;
909
910
0
    length  -= nblocks * CHACHA20_BLOCK_SIZE;
911
0
    outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
912
0
    inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
913
0
    authptr += nblocks * CHACHA20_BLOCK_SIZE;
914
0
  }
915
0
#endif
916
917
0
#ifdef USE_SSSE3
918
0
      if (ctx->use_ssse3)
919
0
  {
920
0
    if (length >= 4 * CHACHA20_BLOCK_SIZE &&
921
0
        authoffset >= 4 * CHACHA20_BLOCK_SIZE)
922
0
      {
923
0
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
924
0
        nblocks -= nblocks % 4;
925
926
0
        nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
927
0
        ctx->input, outbuf, inbuf, nblocks,
928
0
        &c->u_mode.poly1305.ctx.state, authptr);
929
0
        burn = nburn > burn ? nburn : burn;
930
931
0
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
932
0
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
933
0
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
934
0
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
935
0
      }
936
937
0
    if (length >= CHACHA20_BLOCK_SIZE &&
938
0
        authoffset >= CHACHA20_BLOCK_SIZE)
939
0
      {
940
0
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
941
942
0
        nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
943
0
        ctx->input, outbuf, inbuf, nblocks,
944
0
        &c->u_mode.poly1305.ctx.state, authptr);
945
0
        burn = nburn > burn ? nburn : burn;
946
947
0
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
948
0
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
949
0
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
950
0
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
951
0
      }
952
0
  }
953
0
#endif
954
955
#ifdef USE_AARCH64_SIMD
956
      if (ctx->use_neon &&
957
    length >= 4 * CHACHA20_BLOCK_SIZE &&
958
    authoffset >= 4 * CHACHA20_BLOCK_SIZE)
959
  {
960
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
961
    nblocks -= nblocks % 4;
962
963
    nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
964
          ctx->input, outbuf, inbuf, nblocks,
965
          &c->u_mode.poly1305.ctx.state, authptr);
966
    burn = nburn > burn ? nburn : burn;
967
968
    length  -= nblocks * CHACHA20_BLOCK_SIZE;
969
    outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
970
    inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
971
    authptr += nblocks * CHACHA20_BLOCK_SIZE;
972
  }
973
#endif
974
975
#ifdef USE_PPC_VEC_POLY1305
976
      if (ctx->use_ppc &&
977
    length >= 4 * CHACHA20_BLOCK_SIZE &&
978
    authoffset >= 4 * CHACHA20_BLOCK_SIZE)
979
  {
980
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
981
    nblocks -= nblocks % 4;
982
983
    nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
984
          ctx->input, outbuf, inbuf, nblocks,
985
          &c->u_mode.poly1305.ctx.state, authptr);
986
    burn = nburn > burn ? nburn : burn;
987
988
    length  -= nblocks * CHACHA20_BLOCK_SIZE;
989
    outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
990
    inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
991
    authptr += nblocks * CHACHA20_BLOCK_SIZE;
992
  }
993
#endif
994
995
#ifdef USE_S390X_VX_POLY1305
996
      if (ctx->use_s390x)
997
  {
998
    if (length >= 8 * CHACHA20_BLOCK_SIZE &&
999
        authoffset >= 8 * CHACHA20_BLOCK_SIZE)
1000
      {
1001
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1002
        nblocks -= nblocks % 8;
1003
1004
        burn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
1005
        ctx->input, outbuf, inbuf, nblocks,
1006
        &c->u_mode.poly1305.ctx.state, authptr);
1007
        burn = nburn > burn ? nburn : burn;
1008
1009
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
1010
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1011
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
1012
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
1013
      }
1014
1015
    if (length >= CHACHA20_BLOCK_SIZE &&
1016
        authoffset >= CHACHA20_BLOCK_SIZE)
1017
      {
1018
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1019
1020
        burn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
1021
        ctx->input, outbuf, inbuf, nblocks,
1022
        &c->u_mode.poly1305.ctx.state, authptr);
1023
        burn = nburn > burn ? nburn : burn;
1024
1025
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
1026
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1027
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
1028
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
1029
      }
1030
  }
1031
#endif
1032
1033
0
      if (authoffset > 0)
1034
0
  {
1035
0
    _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
1036
0
    authptr += authoffset;
1037
0
    authoffset = 0;
1038
0
  }
1039
1040
0
      gcry_assert(authptr == outbuf);
1041
0
    }
1042
1043
0
  while (length)
1044
0
    {
1045
0
      size_t currlen = length;
1046
1047
      /* Since checksumming is done after encryption, process input in 24KiB
1048
       * chunks to keep data loaded in L1 cache for checksumming.  However
1049
       * only do splitting if input is large enough so that last chunks does
1050
       * not end up being short. */
1051
0
      if (currlen > 32 * 1024)
1052
0
  currlen = 24 * 1024;
1053
1054
0
      nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
1055
0
      burn = nburn > burn ? nburn : burn;
1056
1057
0
      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf,
1058
0
            currlen);
1059
0
      burn = nburn > burn ? nburn : burn;
1060
1061
0
      outbuf += currlen;
1062
0
      inbuf += currlen;
1063
0
      length -= currlen;
1064
0
    }
1065
1066
0
  if (burn)
1067
0
    _gcry_burn_stack (burn);
1068
1069
0
  return 0;
1070
0
}
1071
1072
1073
/* ChaCha20-Poly1305 AEAD decryption path: the Poly1305 authenticator is
 * updated over the ciphertext *before* the ChaCha20 XOR produces the
 * plaintext (note every stitched call below passes INBUF as the auth
 * pointer).  Where available, "stitched" per-architecture assembly
 * implementations interleave ChaCha20 and Poly1305 for speed; the
 * remainder is handled by a generic chunked loop.  Always returns 0;
 * tag verification happens elsewhere in the mode layer.  */
gcry_err_code_t
_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
        const byte *inbuf, size_t length)
{
  CHACHA20_context_t *ctx = (void *) &c->context.c;
  /* nburn: stack depth reported by each primitive; burn keeps the max so
   * a single _gcry_burn_stack at the end wipes the deepest frame.  */
  unsigned int nburn, burn = 0;
#if defined(USE_AVX512) || defined(USE_PPC_VEC_POLY1305)                  \
  || defined(USE_AVX2) || defined(USE_SSSE3) || defined(USE_AARCH64_SIMD) \
  || defined(USE_S390X_VX_POLY1305)
  int skip_stitched = 0;
#endif

  if (!length)
    return 0;

  /* Consume keystream bytes left in ctx->pad from a previous call that
   * ended mid-block; Poly1305 is fed the ciphertext first, then the
   * XOR with the tail of the pad decrypts in place.  */
  if (ctx->unused)
    {
      unsigned char *p = ctx->pad;
      size_t n;

      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);

      n = ctx->unused;
      if (n > length)
        n = length;

      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n);
      burn = nburn > burn ? nburn : burn;
      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
      length -= n;
      outbuf += n;
      inbuf += n;
      ctx->unused -= n;

      if (!length)
        {
          if (burn)
            _gcry_burn_stack (burn);

          return 0;
        }
      gcry_assert (!ctx->unused);
    }

  /* Stitched code requires the poly1305 state to be block-aligned.  */
  gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);

#ifdef USE_AVX512
  if (ctx->use_avx512)
    {
      /* Skip stitched chacha20-poly1305 for AVX512. */
      skip_stitched = 1;
    }
#endif
#ifdef USE_PPC_VEC_POLY1305
  if (ctx->use_ppc && ctx->use_p10)
    {
      /* Skip stitched chacha20-poly1305 for P10. */
      skip_stitched = 1;
    }
#endif

  /* Stitched fast paths, widest first.  Each consumes whole multiples
   * of its block group and advances length/outbuf/inbuf accordingly;
   * any remainder falls through to the generic loop below.  */
#ifdef USE_AVX2
  if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 8;

      nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
                  ctx->input, outbuf, inbuf, nblocks,
                  &c->u_mode.poly1305.ctx.state, inbuf);
      burn = nburn > burn ? nburn : burn;

      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_SSSE3
  if (!skip_stitched && ctx->use_ssse3)
    {
      if (length >= 4 * CHACHA20_BLOCK_SIZE)
        {
          size_t nblocks = length / CHACHA20_BLOCK_SIZE;
          nblocks -= nblocks % 4;

          nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
                      ctx->input, outbuf, inbuf, nblocks,
                      &c->u_mode.poly1305.ctx.state, inbuf);
          burn = nburn > burn ? nburn : burn;

          length -= nblocks * CHACHA20_BLOCK_SIZE;
          outbuf += nblocks * CHACHA20_BLOCK_SIZE;
          inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
        }

      /* Handle remaining whole blocks one at a time.  */
      if (length >= CHACHA20_BLOCK_SIZE)
        {
          size_t nblocks = length / CHACHA20_BLOCK_SIZE;

          nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
                      ctx->input, outbuf, inbuf, nblocks,
                      &c->u_mode.poly1305.ctx.state, inbuf);
          burn = nburn > burn ? nburn : burn;

          length -= nblocks * CHACHA20_BLOCK_SIZE;
          outbuf += nblocks * CHACHA20_BLOCK_SIZE;
          inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
        }
    }
#endif

#ifdef USE_AARCH64_SIMD
  if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;

      nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
                  ctx->input, outbuf, inbuf, nblocks,
                  &c->u_mode.poly1305.ctx.state, inbuf);
      burn = nburn > burn ? nburn : burn;

      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_PPC_VEC_POLY1305
  /* skip stitch for p10 */
  if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
    {
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 4;

      nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
                  ctx->input, outbuf, inbuf, nblocks,
                  &c->u_mode.poly1305.ctx.state, inbuf);
      burn = nburn > burn ? nburn : burn;

      length -= nblocks * CHACHA20_BLOCK_SIZE;
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
    }
#endif

#ifdef USE_S390X_VX_POLY1305
  if (!skip_stitched && ctx->use_s390x)
    {
      if (length >= 8 * CHACHA20_BLOCK_SIZE)
        {
          size_t nblocks = length / CHACHA20_BLOCK_SIZE;
          nblocks -= nblocks % 8;

          nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
                      ctx->input, outbuf, inbuf, nblocks,
                      &c->u_mode.poly1305.ctx.state, inbuf);
          burn = nburn > burn ? nburn : burn;

          length -= nblocks * CHACHA20_BLOCK_SIZE;
          outbuf += nblocks * CHACHA20_BLOCK_SIZE;
          inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
        }

      if (length >= CHACHA20_BLOCK_SIZE)
        {
          size_t nblocks = length / CHACHA20_BLOCK_SIZE;

          nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
                      ctx->input, outbuf, inbuf, nblocks,
                      &c->u_mode.poly1305.ctx.state, inbuf);
          burn = nburn > burn ? nburn : burn;

          length -= nblocks * CHACHA20_BLOCK_SIZE;
          outbuf += nblocks * CHACHA20_BLOCK_SIZE;
          inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
        }
    }
#endif

  /* Generic tail: authenticate then decrypt chunk by chunk.  Any final
   * partial block leaves its unused keystream in ctx->pad via
   * do_chacha20_encrypt_stream_tail.  */
  while (length)
    {
      size_t currlen = length;

      /* Since checksumming is done before decryption, process input in 24KiB
       * chunks to keep data loaded in L1 cache for decryption.  However only
       * do splitting if input is large enough so that last chunks does not
       * end up being short. */
      if (currlen > 32 * 1024)
        currlen = 24 * 1024;

      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
                                          currlen);
      burn = nburn > burn ? nburn : burn;

      nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
      burn = nburn > burn ? nburn : burn;

      outbuf += currlen;
      inbuf += currlen;
      length -= currlen;
    }

  if (burn)
    _gcry_burn_stack (burn);

  return 0;
}
1282
1283
1284
static const char *
1285
selftest (void)
1286
0
{
1287
0
  byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
1288
0
  CHACHA20_context_t *ctx;
1289
0
  byte scratch[127 + 1];
1290
0
  byte buf[512 + 64 + 4];
1291
0
  int i;
1292
1293
  /* From draft-strombergson-chacha-test-vectors */
1294
0
  static byte key_1[] = {
1295
0
    0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
1296
0
    0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
1297
0
    0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
1298
0
    0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
1299
0
  };
1300
0
  static const byte nonce_1[] =
1301
0
    { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
1302
0
  static const byte plaintext_1[127] = {
1303
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1304
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1305
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1306
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1307
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1308
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1309
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1310
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1311
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1312
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1313
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1314
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1315
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1316
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1317
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1318
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1319
0
  };
1320
0
  static const byte ciphertext_1[127] = {
1321
0
    0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
1322
0
    0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
1323
0
    0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
1324
0
    0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
1325
0
    0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
1326
0
    0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
1327
0
    0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
1328
0
    0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
1329
0
    0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
1330
0
    0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
1331
0
    0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
1332
0
    0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
1333
0
    0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
1334
0
    0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
1335
0
    0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
1336
0
    0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
1337
0
  };
1338
1339
  /* 16-byte alignment required for amd64 implementation. */
1340
0
  ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
1341
1342
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1343
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1344
0
  scratch[sizeof (scratch) - 1] = 0;
1345
0
  chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
1346
0
  if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
1347
0
    return "ChaCha20 encryption test 1 failed.";
1348
0
  if (scratch[sizeof (scratch) - 1])
1349
0
    return "ChaCha20 wrote too much.";
1350
0
  chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
1351
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1352
0
  chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
1353
0
  if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
1354
0
    return "ChaCha20 decryption test 1 failed.";
1355
1356
0
  for (i = 0; i < sizeof buf; i++)
1357
0
    buf[i] = i;
1358
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1359
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1360
  /*encrypt */
1361
0
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
1362
  /*decrypt */
1363
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1364
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1365
0
  chacha20_encrypt_stream (ctx, buf, buf, 1);
1366
0
  chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
1367
0
  chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1,
1368
0
                           buf + (sizeof buf) - 1, 1);
1369
0
  for (i = 0; i < sizeof buf; i++)
1370
0
    if (buf[i] != (byte) i)
1371
0
      return "ChaCha20 encryption test 2 failed.";
1372
1373
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1374
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1375
  /* encrypt */
1376
0
  for (i = 0; i < sizeof buf; i++)
1377
0
    chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
1378
  /* decrypt */
1379
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1380
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1381
0
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
1382
0
  for (i = 0; i < sizeof buf; i++)
1383
0
    if (buf[i] != (byte) i)
1384
0
      return "ChaCha20 encryption test 3 failed.";
1385
1386
0
  return NULL;
1387
0
}
1388
1389
1390
/* Cipher specification registering ChaCha20 with the libgcrypt cipher
 * framework.  Blocksize is 1 because ChaCha20 is a stream cipher; the
 * same stream function serves both encrypt and decrypt since XOR with
 * the keystream is its own inverse.
 * NOTE(review): the field order follows gcry_cipher_spec_t (declared in
 * cipher.h); the NULL slots are presumed to be the unused block-mode
 * and extra-info callbacks -- verify against the struct declaration.  */
gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
  GCRY_CIPHER_CHACHA20,
  {0, 0},                       /* flags */
  "CHACHA20",                   /* name */
  NULL,                         /* aliases */
  NULL,                         /* oids */
  1,                            /* blocksize in bytes. */
  CHACHA20_MAX_KEY_SIZE * 8,    /* standard key length in bits. */
  sizeof (CHACHA20_context_t),  /* context size */
  chacha20_setkey,              /* setkey */
  NULL,
  NULL,
  chacha20_encrypt_stream,      /* stream encrypt */
  chacha20_encrypt_stream,      /* stream decrypt (same XOR operation) */
  NULL,
  NULL,
  chacha20_setiv                /* setiv */
};