Coverage Report

Created: 2024-11-21 07:03

/src/libgcrypt/cipher/chacha20.c
Line | Count | Source
1
/* chacha20.c  -  Bernstein's ChaCha20 cipher
2
 * Copyright (C) 2014,2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3
 *
4
 * This file is part of Libgcrypt.
5
 *
6
 * Libgcrypt is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU Lesser General Public License as
8
 * published by the Free Software Foundation; either version 2.1 of
9
 * the License, or (at your option) any later version.
10
 *
11
 * Libgcrypt is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18
 *
19
 * For a description of the algorithm, see:
20
 *   http://cr.yp.to/chacha.html
21
 */
22
23
/*
24
 * Based on D. J. Bernstein reference implementation at
25
 * http://cr.yp.to/chacha.html:
26
 *
27
 * chacha-regs.c version 20080118
28
 * D. J. Bernstein
29
 * Public domain.
30
 */
31
32
#include <config.h>
33
#include <stdio.h>
34
#include <stdlib.h>
35
#include <string.h>
36
#include "types.h"
37
#include "g10lib.h"
38
#include "cipher.h"
39
#include "cipher-internal.h"
40
#include "bufhelp.h"
41
42
43
0
#define CHACHA20_MIN_KEY_SIZE 16        /* Bytes.  */
44
0
#define CHACHA20_MAX_KEY_SIZE 32        /* Bytes.  */
45
0
#define CHACHA20_BLOCK_SIZE   64        /* Bytes.  */
46
0
#define CHACHA20_MIN_IV_SIZE   8        /* Bytes.  */
47
0
#define CHACHA20_MAX_IV_SIZE  12        /* Bytes.  */
48
0
#define CHACHA20_CTR_SIZE     16        /* Bytes.  */
49
50
51
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
52
#undef USE_SSSE3
53
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
54
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
55
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
56
# define USE_SSSE3 1
57
#endif
58
59
/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
60
#undef USE_AVX2
61
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
62
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
63
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
64
# define USE_AVX2 1
65
#endif
66
67
/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
68
#undef USE_AVX512
69
#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
70
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
71
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
72
# define USE_AVX512 1
73
#endif
74
75
/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
76
#undef USE_ARMV7_NEON
77
#ifdef ENABLE_NEON_SUPPORT
78
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
79
     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
80
     && defined(HAVE_GCC_INLINE_ASM_NEON)
81
#  define USE_ARMV7_NEON 1
82
# endif
83
#endif
84
85
/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
86
 * code. */
87
#undef USE_AARCH64_SIMD
88
#ifdef ENABLE_NEON_SUPPORT
89
# if defined(__AARCH64EL__) \
90
       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
91
       && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
92
#  define USE_AARCH64_SIMD 1
93
# endif
94
#endif
95
96
/* USE_PPC_VEC indicates whether to enable PowerPC vector
97
 * accelerated code. */
98
#undef USE_PPC_VEC
99
#ifdef ENABLE_PPC_CRYPTO_SUPPORT
100
# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
101
     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
102
#  if __GNUC__ >= 4
103
#   define USE_PPC_VEC 1
104
#  endif
105
# endif
106
#endif
107
108
/* USE_S390X_VX indicates whether to enable zSeries code. */
109
#undef USE_S390X_VX
110
#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
111
# if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
112
#  define USE_S390X_VX 1
113
# endif /* USE_S390X_VX */
114
#endif
115
116
/* Assembly implementations use the SysV ABI; on Win64 this requires an ABI
117
 * conversion and extra stack space to save XMM6-XMM15. */
118
#undef ASM_FUNC_ABI
119
#undef ASM_EXTRA_STACK
120
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
121
# define ASM_FUNC_ABI __attribute__((sysv_abi))
122
#else
123
# define ASM_FUNC_ABI
124
#endif
125
126
127
typedef struct CHACHA20_context_s
128
{
129
  u32 input[16];
130
  unsigned char pad[CHACHA20_BLOCK_SIZE];
131
  unsigned int unused; /* bytes in the pad.  */
132
  unsigned int use_ssse3:1;
133
  unsigned int use_avx2:1;
134
  unsigned int use_avx512:1;
135
  unsigned int use_neon:1;
136
  unsigned int use_ppc:1;
137
  unsigned int use_p9:1;
138
  unsigned int use_p10:1;
139
  unsigned int use_s390x:1;
140
} CHACHA20_context_t;
141
142
143
#ifdef USE_SSSE3
144
145
unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
146
            const byte *src,
147
            size_t nblks) ASM_FUNC_ABI;
148
149
unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
150
            const byte *src,
151
            size_t nblks) ASM_FUNC_ABI;
152
153
unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
154
    u32 *state, byte *dst, const byte *src, size_t nblks,
155
    void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
156
157
unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
158
    u32 *state, byte *dst, const byte *src, size_t nblks,
159
    void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
160
161
#endif /* USE_SSSE3 */
162
163
#ifdef USE_AVX2
164
165
unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
166
                 const byte *src,
167
                 size_t nblks) ASM_FUNC_ABI;
168
169
unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
170
    u32 *state, byte *dst, const byte *src, size_t nblks,
171
    void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
172
173
#endif /* USE_AVX2 */
174
175
#ifdef USE_AVX512
176
177
unsigned int _gcry_chacha20_amd64_avx512_blocks(u32 *state, byte *dst,
178
                                                const byte *src,
179
                                                size_t nblks) ASM_FUNC_ABI;
180
181
#endif /* USE_AVX512 */
182
183
#ifdef USE_PPC_VEC
184
185
#ifndef WORDS_BIGENDIAN
186
unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst,
187
             const byte *src,
188
             size_t len);
189
#endif
190
191
unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
192
           const byte *src,
193
           size_t nblks);
194
195
unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
196
           const byte *src,
197
           size_t nblks);
198
199
unsigned int _gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst,
200
           const byte *src,
201
           size_t nblks);
202
203
unsigned int _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst,
204
           const byte *src,
205
           size_t nblks);
206
207
#undef USE_PPC_VEC_POLY1305
208
#if SIZEOF_UNSIGNED_LONG == 8
209
#define USE_PPC_VEC_POLY1305 1
210
unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
211
    u32 *state, byte *dst, const byte *src, size_t nblks,
212
    POLY1305_STATE *st, const byte *poly1305_src);
213
214
unsigned int _gcry_chacha20_poly1305_ppc9_blocks4(
215
    u32 *state, byte *dst, const byte *src, size_t nblks,
216
    POLY1305_STATE *st, const byte *poly1305_src);
217
#endif /* SIZEOF_UNSIGNED_LONG == 8 */
218
219
#endif /* USE_PPC_VEC */
220
221
#ifdef USE_S390X_VX
222
223
unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst,
224
               const byte *src, size_t nblks);
225
226
unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst,
227
             const byte *src, size_t nblks);
228
229
#undef USE_S390X_VX_POLY1305
230
#if SIZEOF_UNSIGNED_LONG == 8
231
#define USE_S390X_VX_POLY1305 1
232
unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8(
233
    u32 *state, byte *dst, const byte *src, size_t nblks,
234
    POLY1305_STATE *st, const byte *poly1305_src);
235
236
unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
237
    u32 *state, byte *dst, const byte *src, size_t nblks,
238
    POLY1305_STATE *st, const byte *poly1305_src);
239
#endif /* SIZEOF_UNSIGNED_LONG == 8 */
240
241
#endif /* USE_S390X_VX */
242
243
#ifdef USE_ARMV7_NEON
244
245
unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
246
                 const byte *src,
247
                 size_t nblks);
248
249
#endif /* USE_ARMV7_NEON */
250
251
#ifdef USE_AARCH64_SIMD
252
253
unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
254
              const byte *src, size_t nblks);
255
256
unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
257
    u32 *state, byte *dst, const byte *src, size_t nblks,
258
    void *poly1305_state, const byte *poly1305_src);
259
260
#endif /* USE_AARCH64_SIMD */
261
262
263
static const char *selftest (void);
264

265
266
0
#define ROTATE(v,c) (rol(v,c))
267
#define XOR(v,w)  ((v) ^ (w))
268
0
#define PLUS(v,w) ((u32)((v) + (w)))
269
0
#define PLUSONE(v)  (PLUS((v),1))
270
271
#define QUARTERROUND(a,b,c,d) \
272
0
  a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
273
0
  c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
274
0
  a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
275
0
  c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
276
277
#define BUF_XOR_LE32(dst, src, offset, x) \
278
0
  buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
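
For reference, a minimal standalone sketch of the quarter round defined above, checked against the test vector published in RFC 7539, section 2.1.1. The ROTL32 helper stands in for libgcrypt's rol(); the rest mirrors QUARTERROUND; expected output values are as given in the RFC.

#include <stdint.h>
#include <stdio.h>

#define ROTL32(v,c) (((v) << (c)) | ((v) >> (32 - (c))))
#define QR(a,b,c,d) \
  a += b; d = ROTL32(d ^ a, 16); \
  c += d; b = ROTL32(b ^ c, 12); \
  a += b; d = ROTL32(d ^ a,  8); \
  c += d; b = ROTL32(b ^ c,  7);

int main (void)
{
  /* Input from RFC 7539, section 2.1.1.  */
  uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
  QR (a, b, c, d);
  /* Expected per the RFC: a=ea2a92f4 b=cb1cf8ce c=4581472e d=5881c4bb  */
  printf ("%08x %08x %08x %08x\n", a, b, c, d);
  return 0;
}
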
279
280
static unsigned int
281
do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
282
0
{
283
0
  u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
284
0
  unsigned int i;
285
286
0
  while (nblks)
287
0
    {
288
0
      x0 = input[0];
289
0
      x1 = input[1];
290
0
      x2 = input[2];
291
0
      x3 = input[3];
292
0
      x4 = input[4];
293
0
      x5 = input[5];
294
0
      x6 = input[6];
295
0
      x7 = input[7];
296
0
      x8 = input[8];
297
0
      x9 = input[9];
298
0
      x10 = input[10];
299
0
      x11 = input[11];
300
0
      x12 = input[12];
301
0
      x13 = input[13];
302
0
      x14 = input[14];
303
0
      x15 = input[15];
304
305
0
      for (i = 20; i > 0; i -= 2)
306
0
  {
307
0
    QUARTERROUND(x0, x4,  x8, x12)
308
0
    QUARTERROUND(x1, x5,  x9, x13)
309
0
    QUARTERROUND(x2, x6, x10, x14)
310
0
    QUARTERROUND(x3, x7, x11, x15)
311
0
    QUARTERROUND(x0, x5, x10, x15)
312
0
    QUARTERROUND(x1, x6, x11, x12)
313
0
    QUARTERROUND(x2, x7,  x8, x13)
314
0
    QUARTERROUND(x3, x4,  x9, x14)
315
0
  }
316
317
0
      x0 = PLUS(x0, input[0]);
318
0
      x1 = PLUS(x1, input[1]);
319
0
      x2 = PLUS(x2, input[2]);
320
0
      x3 = PLUS(x3, input[3]);
321
0
      x4 = PLUS(x4, input[4]);
322
0
      x5 = PLUS(x5, input[5]);
323
0
      x6 = PLUS(x6, input[6]);
324
0
      x7 = PLUS(x7, input[7]);
325
0
      x8 = PLUS(x8, input[8]);
326
0
      x9 = PLUS(x9, input[9]);
327
0
      x10 = PLUS(x10, input[10]);
328
0
      x11 = PLUS(x11, input[11]);
329
0
      x12 = PLUS(x12, input[12]);
330
0
      x13 = PLUS(x13, input[13]);
331
0
      x14 = PLUS(x14, input[14]);
332
0
      x15 = PLUS(x15, input[15]);
333
334
0
      input[12] = PLUSONE(input[12]);
335
0
      input[13] = PLUS(input[13], !input[12]);
336
337
0
      BUF_XOR_LE32(dst, src, 0, x0);
338
0
      BUF_XOR_LE32(dst, src, 4, x1);
339
0
      BUF_XOR_LE32(dst, src, 8, x2);
340
0
      BUF_XOR_LE32(dst, src, 12, x3);
341
0
      BUF_XOR_LE32(dst, src, 16, x4);
342
0
      BUF_XOR_LE32(dst, src, 20, x5);
343
0
      BUF_XOR_LE32(dst, src, 24, x6);
344
0
      BUF_XOR_LE32(dst, src, 28, x7);
345
0
      BUF_XOR_LE32(dst, src, 32, x8);
346
0
      BUF_XOR_LE32(dst, src, 36, x9);
347
0
      BUF_XOR_LE32(dst, src, 40, x10);
348
0
      BUF_XOR_LE32(dst, src, 44, x11);
349
0
      BUF_XOR_LE32(dst, src, 48, x12);
350
0
      BUF_XOR_LE32(dst, src, 52, x13);
351
0
      BUF_XOR_LE32(dst, src, 56, x14);
352
0
      BUF_XOR_LE32(dst, src, 60, x15);
353
354
0
      src += CHACHA20_BLOCK_SIZE;
355
0
      dst += CHACHA20_BLOCK_SIZE;
356
0
      nblks--;
357
0
    }
358
359
  /* burn_stack */
360
0
  return (17 * sizeof(u32) + 6 * sizeof(void *));
361
0
}
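
The counter update above packs a 64-bit block counter into input[12] (low word) and input[13] (high word). The !input[12] term is the carry: after PLUSONE, the low word is zero exactly when it just wrapped. A minimal sketch of the same trick, with illustrative local names only:

#include <stdint.h>
#include <assert.h>

int main (void)
{
  uint32_t ctr_lo = 0xffffffffu;  /* stands in for input[12] */
  uint32_t ctr_hi = 7;            /* stands in for input[13] */

  ctr_lo = ctr_lo + 1;            /* PLUSONE: wraps to 0 */
  ctr_hi = ctr_hi + !ctr_lo;      /* carry propagates: 7 -> 8 */

  assert (ctr_lo == 0 && ctr_hi == 8);
  return 0;
}
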
362
363
364
static unsigned int
365
chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
366
     size_t nblks)
367
0
{
368
0
#ifdef USE_AVX512
369
0
  if (ctx->use_avx512)
370
0
    {
371
0
      return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks);
372
0
    }
373
0
#endif
374
375
0
#ifdef USE_SSSE3
376
0
  if (ctx->use_ssse3)
377
0
    {
378
0
      return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
379
0
    }
380
0
#endif
381
382
#ifdef USE_PPC_VEC
383
  if (ctx->use_ppc)
384
    {
385
      if (ctx->use_p9)
386
  return _gcry_chacha20_ppc9_blocks1(ctx->input, dst, src, nblks);
387
      else
388
  return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
389
    }
390
#endif
391
392
#ifdef USE_S390X_VX
393
  if (ctx->use_s390x)
394
    {
395
      return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks);
396
    }
397
#endif
398
399
0
  return do_chacha20_blocks (ctx->input, dst, src, nblks);
400
0
}
401
402
403
static void
404
chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
405
                   unsigned int keylen)
406
0
{
407
0
  static const char sigma[16] = "expand 32-byte k";
408
0
  static const char tau[16] = "expand 16-byte k";
409
0
  const char *constants;
410
411
0
  ctx->input[4] = buf_get_le32(key + 0);
412
0
  ctx->input[5] = buf_get_le32(key + 4);
413
0
  ctx->input[6] = buf_get_le32(key + 8);
414
0
  ctx->input[7] = buf_get_le32(key + 12);
415
0
  if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
416
0
    {
417
0
      key += 16;
418
0
      constants = sigma;
419
0
    }
420
0
  else /* 128 bits */
421
0
    {
422
0
      constants = tau;
423
0
    }
424
0
  ctx->input[8] = buf_get_le32(key + 0);
425
0
  ctx->input[9] = buf_get_le32(key + 4);
426
0
  ctx->input[10] = buf_get_le32(key + 8);
427
0
  ctx->input[11] = buf_get_le32(key + 12);
428
0
  ctx->input[0] = buf_get_le32(constants + 0);
429
0
  ctx->input[1] = buf_get_le32(constants + 4);
430
0
  ctx->input[2] = buf_get_le32(constants + 8);
431
0
  ctx->input[3] = buf_get_le32(constants + 12);
432
0
}
433
434
435
static void
436
chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
437
0
{
438
0
  if (ivlen == CHACHA20_CTR_SIZE)
439
0
    {
440
0
      ctx->input[12] = buf_get_le32 (iv + 0);
441
0
      ctx->input[13] = buf_get_le32 (iv + 4);
442
0
      ctx->input[14] = buf_get_le32 (iv + 8);
443
0
      ctx->input[15] = buf_get_le32 (iv + 12);
444
0
    }
445
0
  else if (ivlen == CHACHA20_MAX_IV_SIZE)
446
0
    {
447
0
      ctx->input[12] = 0;
448
0
      ctx->input[13] = buf_get_le32 (iv + 0);
449
0
      ctx->input[14] = buf_get_le32 (iv + 4);
450
0
      ctx->input[15] = buf_get_le32 (iv + 8);
451
0
    }
452
0
  else if (ivlen == CHACHA20_MIN_IV_SIZE)
453
0
    {
454
0
      ctx->input[12] = 0;
455
0
      ctx->input[13] = 0;
456
0
      ctx->input[14] = buf_get_le32 (iv + 0);
457
0
      ctx->input[15] = buf_get_le32 (iv + 4);
458
0
    }
459
0
  else
460
0
    {
461
0
      ctx->input[12] = 0;
462
0
      ctx->input[13] = 0;
463
0
      ctx->input[14] = 0;
464
0
      ctx->input[15] = 0;
465
0
    }
466
0
}
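
Taken together, chacha20_keysetup and chacha20_ivsetup produce the usual ChaCha20 state layout. A sketch for the common 96-bit (CHACHA20_MAX_IV_SIZE) nonce case; for the 64-bit nonce the counter occupies words 12-13 and the nonce words 14-15, and for CHACHA20_CTR_SIZE all of words 12-15 come from the caller-supplied IV:

/*
 * input[ 0.. 3]  constants ("expand 32-byte k" for 256-bit keys,
 *                "expand 16-byte k" for 128-bit keys), as LE words
 * input[ 4..11]  key bytes 0..31 (or the 16-byte key used twice), LE words
 * input[12]      32-bit block counter, initialized to 0
 * input[13..15]  96-bit nonce, as LE words
 */
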
467
468
469
static void
470
chacha20_setiv (void *context, const byte *iv, size_t ivlen)
471
0
{
472
0
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
473
474
  /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonces. */
475
0
  if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
476
0
      && ivlen != CHACHA20_CTR_SIZE)
477
0
    log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
478
479
0
  if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
480
0
             || ivlen == CHACHA20_CTR_SIZE))
481
0
    chacha20_ivsetup (ctx, iv, ivlen);
482
0
  else
483
0
    chacha20_ivsetup (ctx, NULL, 0);
484
485
  /* Reset the unused pad bytes counter.  */
486
0
  ctx->unused = 0;
487
0
}
488
489
490
static gcry_err_code_t
491
chacha20_do_setkey (CHACHA20_context_t *ctx,
492
                    const byte *key, unsigned int keylen)
493
0
{
494
0
  static int initialized;
495
0
  static const char *selftest_failed;
496
0
  unsigned int features = _gcry_get_hw_features ();
497
498
0
  if (!initialized)
499
0
    {
500
0
      initialized = 1;
501
0
      selftest_failed = selftest ();
502
0
      if (selftest_failed)
503
0
        log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
504
0
    }
505
0
  if (selftest_failed)
506
0
    return GPG_ERR_SELFTEST_FAILED;
507
508
0
  if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
509
0
    return GPG_ERR_INV_KEYLEN;
510
511
0
#ifdef USE_SSSE3
512
0
  ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
513
0
#endif
514
0
#ifdef USE_AVX512
515
0
  ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
516
0
#endif
517
0
#ifdef USE_AVX2
518
0
  ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
519
0
#endif
520
#ifdef USE_ARMV7_NEON
521
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
522
#endif
523
#ifdef USE_AARCH64_SIMD
524
  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
525
#endif
526
#ifdef USE_PPC_VEC
527
  ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
528
  ctx->use_p9  = (features & HWF_PPC_ARCH_3_00) != 0;
529
# ifndef WORDS_BIGENDIAN
530
  ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
531
#  ifdef ENABLE_FORCE_SOFT_HWFEATURES
532
  /* HWF_PPC_ARCH_3_10 above is used as a soft HW-feature indicator for P10;
533
   * the actual implementation also works with HWF_PPC_ARCH_3_00. */
534
  ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0;
535
#  endif
536
# endif
537
#endif
538
#ifdef USE_S390X_VX
539
  ctx->use_s390x = (features & HWF_S390X_VX) != 0;
540
#endif
541
542
0
  (void)features;
543
544
0
  chacha20_keysetup (ctx, key, keylen);
545
546
  /* We default to a zero nonce.  */
547
0
  chacha20_setiv (ctx, NULL, 0);
548
549
0
  return 0;
550
0
}
551
552
553
static gcry_err_code_t
554
chacha20_setkey (void *context, const byte *key, unsigned int keylen,
555
                 cipher_bulk_ops_t *bulk_ops)
556
0
{
557
0
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
558
0
  gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
559
0
  (void)bulk_ops;
560
0
  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
561
0
  return rc;
562
0
}
563
564
565
static unsigned int
566
do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
567
         const byte *inbuf, size_t length)
568
0
{
569
0
  static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
570
0
  unsigned int nburn, burn = 0;
571
572
0
#ifdef USE_AVX512
573
0
  if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE)
574
0
    {
575
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
576
0
      nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf,
577
0
                                                 nblocks);
578
0
      burn = nburn > burn ? nburn : burn;
579
0
      length %= CHACHA20_BLOCK_SIZE;
580
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
581
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
582
0
    }
583
0
#endif
584
585
0
#ifdef USE_AVX2
586
0
  if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
587
0
    {
588
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
589
0
      nblocks -= nblocks % 8;
590
0
      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
591
0
            nblocks);
592
0
      burn = nburn > burn ? nburn : burn;
593
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
594
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
595
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
596
0
    }
597
0
#endif
598
599
0
#ifdef USE_SSSE3
600
0
  if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
601
0
    {
602
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
603
0
      nblocks -= nblocks % 4;
604
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
605
0
             nblocks);
606
0
      burn = nburn > burn ? nburn : burn;
607
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
608
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
609
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
610
0
    }
611
0
#endif
612
613
#ifdef USE_ARMV7_NEON
614
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
615
    {
616
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
617
      nblocks -= nblocks % 4;
618
      nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
619
            nblocks);
620
      burn = nburn > burn ? nburn : burn;
621
      length -= nblocks * CHACHA20_BLOCK_SIZE;
622
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
623
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
624
    }
625
#endif
626
627
#ifdef USE_AARCH64_SIMD
628
  if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
629
    {
630
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
631
      nblocks -= nblocks % 4;
632
      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
633
               nblocks);
634
      burn = nburn > burn ? nburn : burn;
635
      length -= nblocks * CHACHA20_BLOCK_SIZE;
636
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
637
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
638
    }
639
#endif
640
641
#ifdef USE_PPC_VEC
642
  if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
643
    {
644
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
645
      nblocks -= nblocks % 4;
646
      if (0)
647
        {}
648
#ifndef WORDS_BIGENDIAN
649
      /*
650
       * Workaround: skip the P10 path if the 32-bit counter would overflow (rare).
651
       */
652
      else if (ctx->use_p10 && nblocks >= 8
653
               && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
654
        {
655
          size_t len = nblocks * CHACHA20_BLOCK_SIZE;
656
          nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len);
657
        }
658
#endif
659
      else if (ctx->use_p9)
660
        {
661
          nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf,
662
                                              nblocks);
663
        }
664
      else
665
        {
666
          nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf,
667
                                              nblocks);
668
        }
669
      burn = nburn > burn ? nburn : burn;
670
      length -= nblocks * CHACHA20_BLOCK_SIZE;
671
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
672
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
673
    }
674
#endif
675
676
#ifdef USE_S390X_VX
677
  if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8)
678
    {
679
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
680
      nblocks -= nblocks % 8;
681
      nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf,
682
                nblocks);
683
      burn = nburn > burn ? nburn : burn;
684
      length -= nblocks * CHACHA20_BLOCK_SIZE;
685
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
686
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
687
    }
688
#endif
689
690
0
  if (length >= CHACHA20_BLOCK_SIZE)
691
0
    {
692
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
693
0
      nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
694
0
      burn = nburn > burn ? nburn : burn;
695
0
      length %= CHACHA20_BLOCK_SIZE;
696
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
697
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
698
0
    }
699
700
0
  if (length > 0)
701
0
    {
702
0
      nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
703
0
      burn = nburn > burn ? nburn : burn;
704
705
0
      buf_xor (outbuf, inbuf, ctx->pad, length);
706
0
      ctx->unused = CHACHA20_BLOCK_SIZE - length;
707
0
    }
708
709
0
  if (burn)
710
0
    burn += 5 * sizeof(void *);
711
712
0
  return burn;
713
0
}
714
715
716
static void
717
chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
718
                         size_t length)
719
0
{
720
0
  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
721
0
  unsigned int nburn, burn = 0;
722
723
0
  if (!length)
724
0
    return;
725
726
0
  if (ctx->unused)
727
0
    {
728
0
      unsigned char *p = ctx->pad;
729
0
      size_t n;
730
731
0
      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
732
733
0
      n = ctx->unused;
734
0
      if (n > length)
735
0
        n = length;
736
737
0
      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
738
0
      length -= n;
739
0
      outbuf += n;
740
0
      inbuf += n;
741
0
      ctx->unused -= n;
742
743
0
      if (!length)
744
0
        return;
745
0
      gcry_assert (!ctx->unused);
746
0
    }
747
748
0
  nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length);
749
0
  burn = nburn > burn ? nburn : burn;
750
751
0
  if (burn)
752
0
    _gcry_burn_stack (burn);
753
0
}
754
755
756
gcry_err_code_t
757
_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
758
        const byte *inbuf, size_t length)
759
0
{
760
0
  CHACHA20_context_t *ctx = (void *) &c->context.c;
761
0
  unsigned int nburn, burn = 0;
762
0
  byte *authptr = NULL;
763
764
0
  if (!length)
765
0
    return 0;
766
767
0
  if (ctx->unused)
768
0
    {
769
0
      unsigned char *p = ctx->pad;
770
0
      size_t n;
771
772
0
      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
773
774
0
      n = ctx->unused;
775
0
      if (n > length)
776
0
        n = length;
777
778
0
      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
779
0
      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n);
780
0
      burn = nburn > burn ? nburn : burn;
781
0
      length -= n;
782
0
      outbuf += n;
783
0
      inbuf += n;
784
0
      ctx->unused -= n;
785
786
0
      if (!length)
787
0
  {
788
0
    if (burn)
789
0
      _gcry_burn_stack (burn);
790
791
0
    return 0;
792
0
  }
793
0
      gcry_assert (!ctx->unused);
794
0
    }
795
796
0
  gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
797
798
0
  if (0)
799
0
    { }
800
0
#ifdef USE_AVX512
801
0
  else if (ctx->use_avx512)
802
0
    {
803
      /* Skip stitched chacha20-poly1305 for AVX512. */
804
0
      authptr = NULL;
805
0
    }
806
0
#endif
807
0
#ifdef USE_AVX2
808
0
  else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
809
0
    {
810
0
      nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8);
811
0
      burn = nburn > burn ? nburn : burn;
812
813
0
      authptr = outbuf;
814
0
      length -= 8 * CHACHA20_BLOCK_SIZE;
815
0
      outbuf += 8 * CHACHA20_BLOCK_SIZE;
816
0
      inbuf  += 8 * CHACHA20_BLOCK_SIZE;
817
0
    }
818
0
#endif
819
0
#ifdef USE_SSSE3
820
0
  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
821
0
    {
822
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4);
823
0
      burn = nburn > burn ? nburn : burn;
824
825
0
      authptr = outbuf;
826
0
      length -= 4 * CHACHA20_BLOCK_SIZE;
827
0
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
828
0
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
829
0
    }
830
0
  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2)
831
0
    {
832
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2);
833
0
      burn = nburn > burn ? nburn : burn;
834
835
0
      authptr = outbuf;
836
0
      length -= 2 * CHACHA20_BLOCK_SIZE;
837
0
      outbuf += 2 * CHACHA20_BLOCK_SIZE;
838
0
      inbuf  += 2 * CHACHA20_BLOCK_SIZE;
839
0
    }
840
0
  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
841
0
    {
842
0
      nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);
843
0
      burn = nburn > burn ? nburn : burn;
844
845
0
      authptr = outbuf;
846
0
      length -= 1 * CHACHA20_BLOCK_SIZE;
847
0
      outbuf += 1 * CHACHA20_BLOCK_SIZE;
848
0
      inbuf  += 1 * CHACHA20_BLOCK_SIZE;
849
0
    }
850
0
#endif
851
#ifdef USE_AARCH64_SIMD
852
  else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
853
    {
854
      nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4);
855
      burn = nburn > burn ? nburn : burn;
856
857
      authptr = outbuf;
858
      length -= 4 * CHACHA20_BLOCK_SIZE;
859
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
860
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
861
    }
862
#endif
863
#ifdef USE_PPC_VEC_POLY1305
864
  else if (ctx->use_ppc && ctx->use_p10)
865
    {
866
      /* Skip stitched chacha20-poly1305 for P10. */
867
      authptr = NULL;
868
    }
869
  else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
870
    {
871
      if (ctx->use_p9)
872
        nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, 4);
873
      else
874
  nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
875
      burn = nburn > burn ? nburn : burn;
876
877
      authptr = outbuf;
878
      length -= 4 * CHACHA20_BLOCK_SIZE;
879
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
880
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
881
    }
882
#endif
883
#ifdef USE_S390X_VX_POLY1305
884
  else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8)
885
    {
886
      nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8);
887
      burn = nburn > burn ? nburn : burn;
888
889
      authptr = outbuf;
890
      length -= 8 * CHACHA20_BLOCK_SIZE;
891
      outbuf += 8 * CHACHA20_BLOCK_SIZE;
892
      inbuf  += 8 * CHACHA20_BLOCK_SIZE;
893
    }
894
  else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4)
895
    {
896
      nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4);
897
      burn = nburn > burn ? nburn : burn;
898
899
      authptr = outbuf;
900
      length -= 4 * CHACHA20_BLOCK_SIZE;
901
      outbuf += 4 * CHACHA20_BLOCK_SIZE;
902
      inbuf  += 4 * CHACHA20_BLOCK_SIZE;
903
    }
904
  else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2)
905
    {
906
      nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2);
907
      burn = nburn > burn ? nburn : burn;
908
909
      authptr = outbuf;
910
      length -= 2 * CHACHA20_BLOCK_SIZE;
911
      outbuf += 2 * CHACHA20_BLOCK_SIZE;
912
      inbuf  += 2 * CHACHA20_BLOCK_SIZE;
913
    }
914
  else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE)
915
    {
916
      nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1);
917
      burn = nburn > burn ? nburn : burn;
918
919
      authptr = outbuf;
920
      length -= 1 * CHACHA20_BLOCK_SIZE;
921
      outbuf += 1 * CHACHA20_BLOCK_SIZE;
922
      inbuf  += 1 * CHACHA20_BLOCK_SIZE;
923
    }
924
#endif
925
926
0
  if (authptr)
927
0
    {
928
0
      size_t authoffset = outbuf - authptr;
929
930
0
#ifdef USE_AVX2
931
0
      if (ctx->use_avx2 &&
932
0
    length >= 8 * CHACHA20_BLOCK_SIZE &&
933
0
    authoffset >= 8 * CHACHA20_BLOCK_SIZE)
934
0
  {
935
0
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
936
0
    nblocks -= nblocks % 8;
937
938
0
    nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
939
0
          ctx->input, outbuf, inbuf, nblocks,
940
0
          &c->u_mode.poly1305.ctx.state, authptr);
941
0
    burn = nburn > burn ? nburn : burn;
942
943
0
    length  -= nblocks * CHACHA20_BLOCK_SIZE;
944
0
    outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
945
0
    inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
946
0
    authptr += nblocks * CHACHA20_BLOCK_SIZE;
947
0
  }
948
0
#endif
949
950
0
#ifdef USE_SSSE3
951
0
      if (ctx->use_ssse3)
952
0
  {
953
0
    if (length >= 4 * CHACHA20_BLOCK_SIZE &&
954
0
        authoffset >= 4 * CHACHA20_BLOCK_SIZE)
955
0
      {
956
0
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
957
0
        nblocks -= nblocks % 4;
958
959
0
        nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
960
0
        ctx->input, outbuf, inbuf, nblocks,
961
0
        &c->u_mode.poly1305.ctx.state, authptr);
962
0
        burn = nburn > burn ? nburn : burn;
963
964
0
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
965
0
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
966
0
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
967
0
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
968
0
      }
969
970
0
    if (length >= CHACHA20_BLOCK_SIZE &&
971
0
        authoffset >= CHACHA20_BLOCK_SIZE)
972
0
      {
973
0
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
974
975
0
        nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
976
0
        ctx->input, outbuf, inbuf, nblocks,
977
0
        &c->u_mode.poly1305.ctx.state, authptr);
978
0
        burn = nburn > burn ? nburn : burn;
979
980
0
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
981
0
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
982
0
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
983
0
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
984
0
      }
985
0
  }
986
0
#endif
987
988
#ifdef USE_AARCH64_SIMD
989
      if (ctx->use_neon &&
990
    length >= 4 * CHACHA20_BLOCK_SIZE &&
991
    authoffset >= 4 * CHACHA20_BLOCK_SIZE)
992
  {
993
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
994
    nblocks -= nblocks % 4;
995
996
    nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
997
          ctx->input, outbuf, inbuf, nblocks,
998
          &c->u_mode.poly1305.ctx.state, authptr);
999
    burn = nburn > burn ? nburn : burn;
1000
1001
    length  -= nblocks * CHACHA20_BLOCK_SIZE;
1002
    outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1003
    inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
1004
    authptr += nblocks * CHACHA20_BLOCK_SIZE;
1005
  }
1006
#endif
1007
1008
#ifdef USE_PPC_VEC_POLY1305
1009
      if (ctx->use_ppc &&
1010
    length >= 4 * CHACHA20_BLOCK_SIZE &&
1011
    authoffset >= 4 * CHACHA20_BLOCK_SIZE)
1012
  {
1013
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1014
    nblocks -= nblocks % 4;
1015
1016
    if (ctx->use_p9)
1017
      nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
1018
          ctx->input, outbuf, inbuf, nblocks,
1019
          &c->u_mode.poly1305.ctx.state, authptr);
1020
    else
1021
      nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
1022
          ctx->input, outbuf, inbuf, nblocks,
1023
          &c->u_mode.poly1305.ctx.state, authptr);
1024
    burn = nburn > burn ? nburn : burn;
1025
1026
    length  -= nblocks * CHACHA20_BLOCK_SIZE;
1027
    outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1028
    inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
1029
    authptr += nblocks * CHACHA20_BLOCK_SIZE;
1030
  }
1031
#endif
1032
1033
#ifdef USE_S390X_VX_POLY1305
1034
      if (ctx->use_s390x)
1035
  {
1036
    if (length >= 8 * CHACHA20_BLOCK_SIZE &&
1037
        authoffset >= 8 * CHACHA20_BLOCK_SIZE)
1038
      {
1039
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1040
        nblocks -= nblocks % 8;
1041
1042
        nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
1043
        ctx->input, outbuf, inbuf, nblocks,
1044
        &c->u_mode.poly1305.ctx.state, authptr);
1045
        burn = nburn > burn ? nburn : burn;
1046
1047
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
1048
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1049
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
1050
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
1051
      }
1052
1053
    if (length >= CHACHA20_BLOCK_SIZE &&
1054
        authoffset >= CHACHA20_BLOCK_SIZE)
1055
      {
1056
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1057
1058
        nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
1059
        ctx->input, outbuf, inbuf, nblocks,
1060
        &c->u_mode.poly1305.ctx.state, authptr);
1061
        burn = nburn > burn ? nburn : burn;
1062
1063
        length  -= nblocks * CHACHA20_BLOCK_SIZE;
1064
        outbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1065
        inbuf   += nblocks * CHACHA20_BLOCK_SIZE;
1066
        authptr += nblocks * CHACHA20_BLOCK_SIZE;
1067
      }
1068
  }
1069
#endif
1070
1071
0
      if (authoffset > 0)
1072
0
  {
1073
0
    _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
1074
0
    authptr += authoffset;
1075
0
    authoffset = 0;
1076
0
  }
1077
1078
0
      gcry_assert(authptr == outbuf);
1079
0
    }
1080
1081
0
  while (length)
1082
0
    {
1083
0
      size_t currlen = length;
1084
1085
      /* Since checksumming is done after encryption, process input in 24KiB
1086
       * chunks to keep data loaded in L1 cache for checksumming.  However
1087
       * only do splitting if the input is large enough that the last chunk
1088
       * does not end up being short. */
1089
0
      if (currlen > 32 * 1024)
1090
0
  currlen = 24 * 1024;
1091
1092
0
      nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
1093
0
      burn = nburn > burn ? nburn : burn;
1094
1095
0
      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf,
1096
0
            currlen);
1097
0
      burn = nburn > burn ? nburn : burn;
1098
1099
0
      outbuf += currlen;
1100
0
      inbuf += currlen;
1101
0
      length -= currlen;
1102
0
    }
1103
1104
0
  if (burn)
1105
0
    _gcry_burn_stack (burn);
1106
1107
0
  return 0;
1108
0
}
1109
1110
1111
gcry_err_code_t
1112
_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
1113
        const byte *inbuf, size_t length)
1114
0
{
1115
0
  CHACHA20_context_t *ctx = (void *) &c->context.c;
1116
0
  unsigned int nburn, burn = 0;
1117
0
#if defined(USE_AVX512) || defined(USE_PPC_VEC_POLY1305)                  \
1118
0
  || defined(USE_AVX2) || defined(USE_SSSE3) || defined(USE_AARCH64_SIMD) \
1119
0
  || defined(USE_S390X_VX_POLY1305)
1120
0
  int skip_stitched = 0;
1121
0
#endif
1122
1123
0
  if (!length)
1124
0
    return 0;
1125
1126
0
  if (ctx->unused)
1127
0
    {
1128
0
      unsigned char *p = ctx->pad;
1129
0
      size_t n;
1130
1131
0
      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
1132
1133
0
      n = ctx->unused;
1134
0
      if (n > length)
1135
0
        n = length;
1136
1137
0
      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n);
1138
0
      burn = nburn > burn ? nburn : burn;
1139
0
      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
1140
0
      length -= n;
1141
0
      outbuf += n;
1142
0
      inbuf += n;
1143
0
      ctx->unused -= n;
1144
1145
0
      if (!length)
1146
0
  {
1147
0
    if (burn)
1148
0
      _gcry_burn_stack (burn);
1149
1150
0
    return 0;
1151
0
  }
1152
0
      gcry_assert (!ctx->unused);
1153
0
    }
1154
1155
0
  gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
1156
1157
0
#ifdef USE_AVX512
1158
0
  if (ctx->use_avx512)
1159
0
    {
1160
      /* Skip stitched chacha20-poly1305 for AVX512. */
1161
0
      skip_stitched = 1;
1162
0
    }
1163
0
#endif
1164
#ifdef USE_PPC_VEC_POLY1305
1165
  if (ctx->use_ppc && ctx->use_p10)
1166
    {
1167
      /* Skip stitched chacha20-poly1305 for P10. */
1168
      skip_stitched = 1;
1169
    }
1170
#endif
1171
1172
0
#ifdef USE_AVX2
1173
0
  if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
1174
0
    {
1175
0
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1176
0
      nblocks -= nblocks % 8;
1177
1178
0
      nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
1179
0
      ctx->input, outbuf, inbuf, nblocks,
1180
0
      &c->u_mode.poly1305.ctx.state, inbuf);
1181
0
      burn = nburn > burn ? nburn : burn;
1182
1183
0
      length -= nblocks * CHACHA20_BLOCK_SIZE;
1184
0
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1185
0
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1186
0
    }
1187
0
#endif
1188
1189
0
#ifdef USE_SSSE3
1190
0
  if (!skip_stitched && ctx->use_ssse3)
1191
0
    {
1192
0
      if (length >= 4 * CHACHA20_BLOCK_SIZE)
1193
0
  {
1194
0
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1195
0
    nblocks -= nblocks % 4;
1196
1197
0
    nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
1198
0
          ctx->input, outbuf, inbuf, nblocks,
1199
0
          &c->u_mode.poly1305.ctx.state, inbuf);
1200
0
    burn = nburn > burn ? nburn : burn;
1201
1202
0
    length -= nblocks * CHACHA20_BLOCK_SIZE;
1203
0
    outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1204
0
    inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1205
0
  }
1206
1207
0
      if (length >= CHACHA20_BLOCK_SIZE)
1208
0
  {
1209
0
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1210
1211
0
    nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
1212
0
          ctx->input, outbuf, inbuf, nblocks,
1213
0
          &c->u_mode.poly1305.ctx.state, inbuf);
1214
0
    burn = nburn > burn ? nburn : burn;
1215
1216
0
    length -= nblocks * CHACHA20_BLOCK_SIZE;
1217
0
    outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1218
0
    inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1219
0
  }
1220
0
    }
1221
0
#endif
1222
1223
#ifdef USE_AARCH64_SIMD
1224
  if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
1225
    {
1226
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1227
      nblocks -= nblocks % 4;
1228
1229
      nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
1230
      ctx->input, outbuf, inbuf, nblocks,
1231
      &c->u_mode.poly1305.ctx.state, inbuf);
1232
      burn = nburn > burn ? nburn : burn;
1233
1234
      length -= nblocks * CHACHA20_BLOCK_SIZE;
1235
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1236
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1237
    }
1238
#endif
1239
1240
#ifdef USE_PPC_VEC_POLY1305
1241
  /* The stitched path was already excluded for P10 via skip_stitched above. */
1242
  if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
1243
    {
1244
      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1245
      nblocks -= nblocks % 4;
1246
1247
      if (ctx->use_p9)
1248
  nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
1249
        ctx->input, outbuf, inbuf, nblocks,
1250
        &c->u_mode.poly1305.ctx.state, inbuf);
1251
      else
1252
  nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
1253
        ctx->input, outbuf, inbuf, nblocks,
1254
        &c->u_mode.poly1305.ctx.state, inbuf);
1255
      burn = nburn > burn ? nburn : burn;
1256
1257
      length -= nblocks * CHACHA20_BLOCK_SIZE;
1258
      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1259
      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1260
    }
1261
#endif
1262
1263
#ifdef USE_S390X_VX_POLY1305
1264
  if (!skip_stitched && ctx->use_s390x)
1265
    {
1266
      if (length >= 8 * CHACHA20_BLOCK_SIZE)
1267
  {
1268
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1269
    nblocks -= nblocks % 8;
1270
1271
    nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
1272
          ctx->input, outbuf, inbuf, nblocks,
1273
          &c->u_mode.poly1305.ctx.state, inbuf);
1274
    burn = nburn > burn ? nburn : burn;
1275
1276
    length -= nblocks * CHACHA20_BLOCK_SIZE;
1277
    outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1278
    inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1279
  }
1280
1281
      if (length >= CHACHA20_BLOCK_SIZE)
1282
  {
1283
    size_t nblocks = length / CHACHA20_BLOCK_SIZE;
1284
1285
    nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
1286
          ctx->input, outbuf, inbuf, nblocks,
1287
          &c->u_mode.poly1305.ctx.state, inbuf);
1288
    burn = nburn > burn ? nburn : burn;
1289
1290
    length -= nblocks * CHACHA20_BLOCK_SIZE;
1291
    outbuf += nblocks * CHACHA20_BLOCK_SIZE;
1292
    inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
1293
  }
1294
    }
1295
#endif
1296
1297
0
  while (length)
1298
0
    {
1299
0
      size_t currlen = length;
1300
1301
      /* Since checksumming is done before decryption, process input in 24KiB
1302
       * chunks to keep data loaded in L1 cache for decryption.  However only
1303
       * do splitting if the input is large enough that the last chunk does
1304
       * not end up being short. */
1305
0
      if (currlen > 32 * 1024)
1306
0
  currlen = 24 * 1024;
1307
1308
0
      nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
1309
0
            currlen);
1310
0
      burn = nburn > burn ? nburn : burn;
1311
1312
0
      nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
1313
0
      burn = nburn > burn ? nburn : burn;
1314
1315
0
      outbuf += currlen;
1316
0
      inbuf += currlen;
1317
0
      length -= currlen;
1318
0
    }
1319
1320
0
  if (burn)
1321
0
    _gcry_burn_stack (burn);
1322
1323
0
  return 0;
1324
0
}
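
The encrypt/decrypt routines above are reached through libgcrypt's Poly1305 AEAD mode rather than called directly. A hedged sketch of typical caller-side usage (error handling elided; buffer names are illustrative):

#include <gcrypt.h>

/* ChaCha20-Poly1305 AEAD encryption sketch; check every gcry_error_t in
 * real code.  key is 32 bytes, nonce is 12 bytes, tag is 16 bytes. */
void encrypt_aead (const unsigned char *key, const unsigned char *nonce,
                   const unsigned char *aad, size_t aadlen,
                   unsigned char *buf, size_t buflen, unsigned char *tag)
{
  gcry_cipher_hd_t hd;

  gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_POLY1305, 0);
  gcry_cipher_setkey (hd, key, 32);
  gcry_cipher_setiv (hd, nonce, 12);
  gcry_cipher_authenticate (hd, aad, aadlen);      /* additional data first */
  gcry_cipher_encrypt (hd, buf, buflen, NULL, 0);  /* in-place encryption */
  gcry_cipher_gettag (hd, tag, 16);
  gcry_cipher_close (hd);
}
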
1325
1326
1327
static const char *
1328
selftest (void)
1329
0
{
1330
0
  byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
1331
0
  CHACHA20_context_t *ctx;
1332
0
  byte scratch[127 + 1];
1333
0
  byte buf[512 + 64 + 4];
1334
0
  int i;
1335
1336
  /* From draft-strombergson-chacha-test-vectors */
1337
0
  static byte key_1[] = {
1338
0
    0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
1339
0
    0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
1340
0
    0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
1341
0
    0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
1342
0
  };
1343
0
  static const byte nonce_1[] =
1344
0
    { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
1345
0
  static const byte plaintext_1[127] = {
1346
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1347
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1348
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1349
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1350
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1351
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1352
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1353
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1354
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1355
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1356
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1357
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1358
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1359
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1360
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1361
0
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1362
0
  };
1363
0
  static const byte ciphertext_1[127] = {
1364
0
    0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
1365
0
    0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
1366
0
    0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
1367
0
    0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
1368
0
    0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
1369
0
    0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
1370
0
    0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
1371
0
    0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
1372
0
    0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
1373
0
    0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
1374
0
    0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
1375
0
    0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
1376
0
    0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
1377
0
    0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
1378
0
    0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
1379
0
    0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
1380
0
  };
1381
1382
  /* 16-byte alignment required for amd64 implementation. */
1383
0
  ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
1384
1385
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1386
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1387
0
  scratch[sizeof (scratch) - 1] = 0;
1388
0
  chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
1389
0
  if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
1390
0
    return "ChaCha20 encryption test 1 failed.";
1391
0
  if (scratch[sizeof (scratch) - 1])
1392
0
    return "ChaCha20 wrote too much.";
1393
0
  chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
1394
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1395
0
  chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
1396
0
  if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
1397
0
    return "ChaCha20 decryption test 1 failed.";
1398
1399
0
  for (i = 0; i < sizeof buf; i++)
1400
0
    buf[i] = i;
1401
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1402
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1403
  /* encrypt */
1404
0
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
1405
  /* decrypt */
1406
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1407
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1408
0
  chacha20_encrypt_stream (ctx, buf, buf, 1);
1409
0
  chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
1410
0
  chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1,
1411
0
                           buf + (sizeof buf) - 1, 1);
1412
0
  for (i = 0; i < sizeof buf; i++)
1413
0
    if (buf[i] != (byte) i)
1414
0
      return "ChaCha20 encryption test 2 failed.";
1415
1416
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1417
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1418
  /* encrypt */
1419
0
  for (i = 0; i < sizeof buf; i++)
1420
0
    chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
1421
  /* decrypt */
1422
0
  chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
1423
0
  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
1424
0
  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
1425
0
  for (i = 0; i < sizeof buf; i++)
1426
0
    if (buf[i] != (byte) i)
1427
0
      return "ChaCha20 encryption test 3 failed.";
1428
1429
0
  return NULL;
1430
0
}
1431
1432
1433
gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
1434
  GCRY_CIPHER_CHACHA20,
1435
  {0, 0},                       /* flags */
1436
  "CHACHA20",                   /* name */
1437
  NULL,                         /* aliases */
1438
  NULL,                         /* oids */
1439
  1,                            /* blocksize in bytes. */
1440
  CHACHA20_MAX_KEY_SIZE * 8,    /* standard key length in bits. */
1441
  sizeof (CHACHA20_context_t),
1442
  chacha20_setkey,
1443
  NULL,
1444
  NULL,
1445
  chacha20_encrypt_stream,
1446
  chacha20_encrypt_stream,
1447
  NULL,
1448
  NULL,
1449
  chacha20_setiv
1450
};
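
For plain ChaCha20 (the spec registered above), typical usage goes through the stream-cipher mode of the public API; a hedged sketch with error handling elided and illustrative names:

#include <gcrypt.h>

/* Sketch: XOR a buffer with the ChaCha20 keystream in place.
 * key is 32 bytes; a 12-byte nonce is used here (chacha20_setiv above
 * also accepts 8- and 16-byte IVs). */
void chacha20_xor_inplace (const unsigned char *key,
                           const unsigned char *nonce,
                           unsigned char *buf, size_t buflen)
{
  gcry_cipher_hd_t hd;

  gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
  gcry_cipher_setkey (hd, key, 32);
  gcry_cipher_setiv (hd, nonce, 12);
  gcry_cipher_encrypt (hd, buf, buflen, NULL, 0);
  gcry_cipher_close (hd);
}
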