Coverage Report

Created: 2022-08-24 06:28

/src/wolfssl-normal-math/wolfcrypt/src/poly1305.c
Line
Count
Source (jump to first uncovered line)
1
/* poly1305.c
2
 *
3
 * Copyright (C) 2006-2022 wolfSSL Inc.
4
 *
5
 * This file is part of wolfSSL.
6
 *
7
 * wolfSSL is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * wolfSSL is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20
 */
21
/*
22
23
DESCRIPTION
24
This library contains an implementation of the Poly1305 authenticator.
25
26
Based on the public domain implementations by Andrew Moon
27
and Daniel J. Bernstein
28
29
*/
30
31
32
#ifdef HAVE_CONFIG_H
33
    #include <config.h>
34
#endif
35
36
#include <wolfssl/wolfcrypt/settings.h>
37
38
#ifdef HAVE_POLY1305
39
#include <wolfssl/wolfcrypt/poly1305.h>
40
#include <wolfssl/wolfcrypt/error-crypt.h>
41
#include <wolfssl/wolfcrypt/logging.h>
42
#include <wolfssl/wolfcrypt/cpuid.h>
43
#ifdef NO_INLINE
44
    #include <wolfssl/wolfcrypt/misc.h>
45
#else
46
    #define WOLFSSL_MISC_INCLUDED
47
    #include <wolfcrypt/src/misc.c>
48
#endif
49
#ifdef CHACHA_AEAD_TEST
50
    #include <stdio.h>
51
#endif
52
53
#ifdef _MSC_VER
54
    /* 4127 warning constant while(1)  */
55
    #pragma warning(disable: 4127)
56
#endif
57
58
#ifdef USE_INTEL_SPEEDUP
59
    /* x86 SIMD intrinsic headers, only pulled in for the Intel speed-up build */
    #include <emmintrin.h>
60
    #include <immintrin.h>
61
62
    /* Compilers that report themselves as GCC 4.8 or older get
     * NO_AVX2_SUPPORT forced on. The #undef first avoids redefining the
     * macro when the build already set it. */
    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
63
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
64
        #undef  NO_AVX2_SUPPORT
65
        #define NO_AVX2_SUPPORT
66
    #endif
67
    /* clang 3.5 or older: AVX2 disabled. Any newer clang: NO_AVX2_SUPPORT is
     * removed again if it is defined at this point (presumably because clang
     * also defines __GNUC__ and so can trip the GCC check above - TODO
     * confirm). Note this also clears a NO_AVX2_SUPPORT supplied by the
     * build configuration when compiling with a newer clang. */
    #if defined(__clang__) && ((__clang_major__ < 3) || \
68
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
69
        #define NO_AVX2_SUPPORT
70
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
71
        #undef NO_AVX2_SUPPORT
72
    #endif
73
74
    /* AVX1 code is always enabled in this build; AVX2 only when the
     * checks above did not leave NO_AVX2_SUPPORT defined. */
    #define HAVE_INTEL_AVX1
75
    #ifndef NO_AVX2_SUPPORT
76
        #define HAVE_INTEL_AVX2
77
    #endif
78
#endif
79
80
#ifdef USE_INTEL_SPEEDUP
81
/* Cached result of cpuid_get_flags(); filled in by wc_Poly1305SetKey and
 * tested with IS_INTEL_AVX2() to choose between the AVX and AVX2 routines. */
static word32 intel_flags = 0;
82
/* Non-zero once intel_flags has been populated. No locking is visible around
 * the lazy initialisation in this file. */
static word32 cpu_flags_set = 0;
83
#endif
84
85
#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
    /* Compiler-specific spelling of "do not inline this function". */
    #if defined(_MSC_VER)
        #define POLY1305_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define POLY1305_NOINLINE __attribute__((noinline))
    #else
        #define POLY1305_NOINLINE
    #endif

    /* 128-bit helpers used by the 64-bit poly1305_blocks() implementation:
     *   MUL(out, x, y)   out = x * y        (64 x 64 -> 128 bit product)
     *   ADD(out, in)     out += in          (128 bit += 128 bit)
     *   ADDLO(out, in)   out += in          (128 bit += 64 bit)
     *   SHR(in, shift)   low 64 bits of (in >> shift)
     *   LO(in)           low 64 bits of in
     * Every macro expands to a single expression or a do { } while (0)
     * statement, so each one is safe inside unbraced if/else bodies, and all
     * arguments are parenthesised. */
    #if defined(_MSC_VER)
        #include <intrin.h>

        /* MSVC has no native 128-bit integer; emulate with two 64-bit halves */
        typedef struct word128 {
            word64 lo;
            word64 hi;
        } word128;

        #define MUL(out, x, y) \
            do { (out).lo = _umul128((x), (y), &(out).hi); } while (0)
        /* (out).lo < poly_t_ after the addition means the low half wrapped,
         * i.e. a carry into the high half */
        #define ADD(out, in) \
            do { word64 poly_t_ = (out).lo; (out).lo += (in).lo; \
                 (out).hi += ((out).lo < poly_t_) + (in).hi; } while (0)
        #define ADDLO(out, in) \
            do { word64 poly_t_ = (out).lo; (out).lo += (in); \
                 (out).hi += ((out).lo < poly_t_); } while (0)
        #define SHR(in, shift) (__shiftright128((in).lo, (in).hi, (shift)))
        #define LO(in) ((in).lo)

    #elif defined(__GNUC__)
        #if defined(__SIZEOF_INT128__)
            PEDANTIC_EXTENSION typedef unsigned __int128 word128;
        #else
            typedef unsigned word128 __attribute__((mode(TI)));
        #endif

        #define MUL(out, x, y) ((out) = ((word128)(x) * (y)))
        #define ADD(out, in) ((out) += (in))
        #define ADDLO(out, in) ((out) += (in))
        #define SHR(in, shift) ((word64)((in) >> (shift)))
        #define LO(in) ((word64)(in))
    #endif
#endif
124
125
#ifdef USE_INTEL_SPEEDUP
126
#ifdef __cplusplus
    extern "C" {
#endif

/* Prototypes for routines that are not defined in this file (external
 * implementations used by the USE_INTEL_SPEEDUP build). */

#ifdef HAVE_INTEL_AVX1
/* Absorb a single 16-byte block.
 *
 * ctx  Poly1305 context to update.
 * m    Pointer to one block of message data.
 */
extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);

/* Absorb several consecutive 16-byte blocks.
 *
 * ctx    Poly1305 context to update.
 * m      Pointer to the message blocks.
 * bytes  Number of bytes to process (n * 16).
 */
extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
                                size_t bytes);

/* Install the key and initialize the context.
 *
 * ctx  Poly1305 context to initialize.
 * key  Key data. The earlier note here said 16 bytes; the only caller in
 *      this file, wc_Poly1305SetKey, hands over its 32-byte key buffer
 *      unchanged - TODO confirm how many bytes the routine reads.
 */
extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);

/* Produce the final authentication data and zero the private data held in
 * the context.
 *
 * ctx  Poly1305 context to finish.
 * mac  Output buffer with room for 16 bytes.
 */
extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
#endif

#ifdef HAVE_INTEL_AVX2
/* Absorb several consecutive 16-byte blocks (AVX2 variant).
 *
 * ctx    Poly1305 context to update.
 * m      Pointer to the message blocks.
 * bytes  Number of bytes to process (n * 16).
 */
extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
                                 size_t bytes);

/* Compute R^1, R^2, R^3 and R^4 and keep them in the context.
 *
 * ctx    Poly1305 context to update.
 */
extern void poly1305_calc_powers_avx2(Poly1305* ctx);

/* Install the key and initialize the context. Uses the AVX set-key routine
 * because finalization also goes through AVX code.
 *
 * ctx  Poly1305 context to initialize.
 * key  Key data. The earlier note here said 16 bytes; the only caller in
 *      this file, wc_Poly1305SetKey, hands over its 32-byte key buffer
 *      unchanged - TODO confirm how many bytes the routine reads.
 */
extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);

/* Produce the final authentication data and zero the private data held in
 * the context. Uses the AVX final routine to process the last blocks.
 *
 * ctx  Poly1305 context to finish.
 * mac  Output buffer with room for 16 bytes of authentication data.
 */
extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
#endif

#ifdef __cplusplus
    }  /* extern "C" */
#endif
196
197
#elif defined(POLY130564)
198
#ifndef WOLFSSL_ARMASM
199
    static word64 U8TO64(const byte* p)
200
0
    {
201
0
        return
202
0
            (((word64)(p[0] & 0xff)      ) |
203
0
             ((word64)(p[1] & 0xff) <<  8) |
204
0
             ((word64)(p[2] & 0xff) << 16) |
205
0
             ((word64)(p[3] & 0xff) << 24) |
206
0
             ((word64)(p[4] & 0xff) << 32) |
207
0
             ((word64)(p[5] & 0xff) << 40) |
208
0
             ((word64)(p[6] & 0xff) << 48) |
209
0
             ((word64)(p[7] & 0xff) << 56));
210
0
    }
211
212
0
    /* Store v into p[0..7] in little-endian byte order
     * (least significant byte first). */
    static void U64TO8(byte* p, word64 v) {
        int idx;

        for (idx = 0; idx < 8; idx++) {
            p[idx] = (byte)(v & 0xff);
            v >>= 8;
        }
    }
222
#endif/* WOLFSSL_ARMASM */
223
#else /* if not 64 bit then use 32 bit */
224
225
    static word32 U8TO32(const byte *p)
226
    {
227
        return
228
            (((word32)(p[0] & 0xff)      ) |
229
             ((word32)(p[1] & 0xff) <<  8) |
230
             ((word32)(p[2] & 0xff) << 16) |
231
             ((word32)(p[3] & 0xff) << 24));
232
    }
233
234
    /* Store v into p[0..3] in little-endian byte order
     * (least significant byte first). */
    static void U32TO8(byte *p, word32 v) {
        int idx;

        for (idx = 0; idx < 4; idx++) {
            p[idx] = (byte)(v & 0xff);
            v >>= 8;
        }
    }
240
#endif
241
242
/* convert 32-bit unsigned to little endian 64 bit type as byte array */
243
static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
244
0
{
245
#ifndef WOLFSSL_X86_64_BUILD
246
    outLe64[0] = (byte)(inLe32  & 0x000000FF);
247
    outLe64[1] = (byte)((inLe32 & 0x0000FF00) >> 8);
248
    outLe64[2] = (byte)((inLe32 & 0x00FF0000) >> 16);
249
    outLe64[3] = (byte)((inLe32 & 0xFF000000) >> 24);
250
    outLe64[4] = 0;
251
    outLe64[5] = 0;
252
    outLe64[6] = 0;
253
    outLe64[7] = 0;
254
#else
255
0
    *(word64*)outLe64 = inLe32;
256
0
#endif
257
0
}
258
259
260
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
261
/*
262
This local function operates on a message with a given number of bytes
263
with a given ctx pointer to a Poly1305 structure.
264
*/
265
static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
266
                     size_t bytes)
267
0
{
268
#ifdef USE_INTEL_SPEEDUP
269
    /* AVX2 is handled in wc_Poly1305Update. */
270
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
271
    poly1305_blocks_avx(ctx, m, bytes);
272
    RESTORE_VECTOR_REGISTERS();
273
    return 0;
274
#elif defined(POLY130564)
275
0
    const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
276
0
    word64 r0,r1,r2;
277
0
    word64 s1,s2;
278
0
    word64 h0,h1,h2;
279
0
    word64 c;
280
0
    word128 d0,d1,d2,d;
281
282
0
    r0 = ctx->r[0];
283
0
    r1 = ctx->r[1];
284
0
    r2 = ctx->r[2];
285
286
0
    h0 = ctx->h[0];
287
0
    h1 = ctx->h[1];
288
0
    h2 = ctx->h[2];
289
290
0
    s1 = r1 * (5 << 2);
291
0
    s2 = r2 * (5 << 2);
292
293
0
    while (bytes >= POLY1305_BLOCK_SIZE) {
294
0
        word64 t0,t1;
295
296
        /* h += m[i] */
297
0
        t0 = U8TO64(&m[0]);
298
0
        t1 = U8TO64(&m[8]);
299
300
0
        h0 += (( t0                    ) & 0xfffffffffff);
301
0
        h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
302
0
        h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;
303
304
        /* h *= r */
305
0
        MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
306
0
        MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
307
0
        MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
308
309
        /* (partial) h %= p */
310
0
                      c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
311
0
        ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
312
0
        ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
313
0
        h0  += c * 5; c = (h0 >> 44);  h0 =    h0  & 0xfffffffffff;
314
0
        h1  += c;
315
316
0
        m += POLY1305_BLOCK_SIZE;
317
0
        bytes -= POLY1305_BLOCK_SIZE;
318
0
    }
319
320
0
    ctx->h[0] = h0;
321
0
    ctx->h[1] = h1;
322
0
    ctx->h[2] = h2;
323
324
0
    return 0;
325
326
#else /* if not 64 bit then use 32 bit */
327
    const word32 hibit = (ctx->finished) ? 0 : ((word32)1 << 24); /* 1 << 128 */
328
    word32 r0,r1,r2,r3,r4;
329
    word32 s1,s2,s3,s4;
330
    word32 h0,h1,h2,h3,h4;
331
    word64 d0,d1,d2,d3,d4;
332
    word32 c;
333
334
335
    r0 = ctx->r[0];
336
    r1 = ctx->r[1];
337
    r2 = ctx->r[2];
338
    r3 = ctx->r[3];
339
    r4 = ctx->r[4];
340
341
    s1 = r1 * 5;
342
    s2 = r2 * 5;
343
    s3 = r3 * 5;
344
    s4 = r4 * 5;
345
346
    h0 = ctx->h[0];
347
    h1 = ctx->h[1];
348
    h2 = ctx->h[2];
349
    h3 = ctx->h[3];
350
    h4 = ctx->h[4];
351
352
    while (bytes >= POLY1305_BLOCK_SIZE) {
353
        /* h += m[i] */
354
        h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
355
        h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
356
        h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
357
        h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
358
        h4 += (U8TO32(m+12) >> 8) | hibit;
359
360
        /* h *= r */
361
        d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
362
             ((word64)h3 * s2) + ((word64)h4 * s1);
363
        d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
364
             ((word64)h3 * s3) + ((word64)h4 * s2);
365
        d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
366
             ((word64)h3 * s4) + ((word64)h4 * s3);
367
        d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
368
             ((word64)h3 * r0) + ((word64)h4 * s4);
369
        d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
370
             ((word64)h3 * r1) + ((word64)h4 * r0);
371
372
        /* (partial) h %= p */
373
                      c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
374
        d1 += c;      c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
375
        d2 += c;      c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
376
        d3 += c;      c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
377
        d4 += c;      c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
378
        h0 += c * 5;  c =  (h0 >> 26); h0 =                h0 & 0x3ffffff;
379
        h1 += c;
380
381
        m += POLY1305_BLOCK_SIZE;
382
        bytes -= POLY1305_BLOCK_SIZE;
383
    }
384
385
    ctx->h[0] = h0;
386
    ctx->h[1] = h1;
387
    ctx->h[2] = h2;
388
    ctx->h[3] = h3;
389
    ctx->h[4] = h4;
390
391
    return 0;
392
393
#endif /* end of 64 bit cpu blocks or 32 bit cpu */
394
0
}
395
396
/*
397
This local function is used for the last call when a message with a given
398
number of bytes is less than the block size.
399
*/
400
static int poly1305_block(Poly1305* ctx, const unsigned char *m)
401
0
{
402
#ifdef USE_INTEL_SPEEDUP
403
    /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
404
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
405
    poly1305_block_avx(ctx, m);
406
    RESTORE_VECTOR_REGISTERS();
407
    return 0;
408
#else
409
0
    return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
410
0
#endif
411
0
}
412
#endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
413
414
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
415
/* Load a one-time key into ctx and reset the MAC state.
 *
 * ctx    Poly1305 context to initialize.
 * key    Key bytes; the first 16 are clamped into r, the last 16 are kept
 *        as the pad that is added during finalization.
 * keySz  Length of key in bytes; must be 32.
 *
 * Returns 0 on success, BAD_FUNC_ARG when ctx or key is NULL or keySz is
 * not 32, or the value produced by SAVE_VECTOR_REGISTERS on failure in the
 * Intel speed-up build.
 */
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
#if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
    word64 t0,t1;
#endif

    if (key == NULL)
        return BAD_FUNC_ARG;

#ifdef CHACHA_AEAD_TEST
    /* Debug dump of the key. Kept in its own scope so the loop counter is
     * declared at the start of a block, which also builds on compilers that
     * reject declarations after statements. */
    {
        word32 k;
        printf("Poly key used:\n");
        for (k = 0; k < keySz; k++) {
            printf("%02x", key[k]);
            if ((k+1) % 8 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

    if (keySz != 32 || ctx == NULL)
        return BAD_FUNC_ARG;

#ifdef USE_INTEL_SPEEDUP
    /* query the CPU capabilities once and cache them for later calls */
    if (!cpu_flags_set) {
        intel_flags = cpuid_get_flags();
        cpu_flags_set = 1;
    }
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_setkey_avx2(ctx, key);
    else
    #endif
        poly1305_setkey_avx(ctx, key);
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    t0 = U8TO64(key + 0);
    t1 = U8TO64(key + 8);

    /* split the clamped r into limbs of 44, 44 and 42 bits */
    ctx->r[0] = ( t0                    ) & 0xffc0fffffff;
    ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
    ctx->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;

    /* h (accumulator) = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO64(key + 16);
    ctx->pad[1] = U8TO64(key + 24);

    ctx->leftover = 0;
    ctx->finished = 0;

#else /* if not 64 bit then use 32 bit */

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    /* split the clamped r into five limbs of 26 bits */
    ctx->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
    ctx->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
    ctx->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
    ctx->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
    ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;

    /* h = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO32(key + 16);
    ctx->pad[1] = U8TO32(key + 20);
    ctx->pad[2] = U8TO32(key + 24);
    ctx->pad[3] = U8TO32(key + 28);

    ctx->leftover = 0;
    ctx->finished = 0;

#endif

    return 0;
}
502
503
/* Finish the MAC computation and write the 16-byte result to mac.
 *
 * Any buffered partial block is terminated with a 1 byte, zero filled and
 * absorbed with ctx->finished set. The accumulator is then fully reduced,
 * the saved pad is added and the low 128 bits are stored little-endian.
 * Afterwards h, r and pad in the context are set to zero.
 *
 * ctx  Poly1305 context that had a key set.
 * mac  Output buffer with room for 16 bytes.
 *
 * Returns 0 on success, BAD_FUNC_ARG when ctx or mac is NULL, or the value
 * produced by SAVE_VECTOR_REGISTERS on failure in the Intel speed-up build.
 */
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
#ifdef USE_INTEL_SPEEDUP
#elif defined(POLY130564)

    word64 h0, h1, h2;
    word64 g0, g1, g2;
    word64 carry;
    word64 pick;
    word64 pad0, pad1;
    word64 macLo, macHi;

#else

    word32 h0, h1, h2, h3, h4;
    word32 g0, g1, g2, g3, g4;
    word32 carry;
    word32 mask;
    word64 f;

#endif

    if (ctx == NULL || mac == NULL)
        return BAD_FUNC_ARG;

#ifdef USE_INTEL_SPEEDUP
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_final_avx2(ctx, mac);
    else
    #endif
        poly1305_final_avx(ctx, mac);
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* process the remaining block */
    if (ctx->leftover) {
        size_t pos = ctx->leftover;

        ctx->buffer[pos] = 1;
        pos = pos + 1;
        while (pos < POLY1305_BLOCK_SIZE) {
            ctx->buffer[pos] = 0;
            pos++;
        }
        ctx->finished = 1;
        poly1305_block(ctx, ctx->buffer);
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

    carry = (h1 >> 44);
    h1 &= 0xfffffffffff;

    h2 += carry;
    carry = (h2 >> 42);
    h2 &= 0x3ffffffffff;

    h0 += carry * 5;
    carry = (h0 >> 44);
    h0 &= 0xfffffffffff;

    h1 += carry;
    carry = (h1 >> 44);
    h1 &= 0xfffffffffff;

    h2 += carry;
    carry = (h2 >> 42);
    h2 &= 0x3ffffffffff;

    h0 += carry * 5;
    carry = (h0 >> 44);
    h0 &= 0xfffffffffff;

    h1 += carry;

    /* compute h + -p */
    g0 = h0 + 5;
    carry = (g0 >> 44);
    g0 &= 0xfffffffffff;

    g1 = h1 + carry;
    carry = (g1 >> 44);
    g1 &= 0xfffffffffff;

    g2 = h2 + carry - ((word64)1 << 42);

    /* select h if h < p, or h + -p if h >= p.
     * pick is all ones when the top bit of g2 is clear, else zero; the
     * choice is made with masks instead of a branch. */
    pick = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
    g0 &= pick;
    g1 &= pick;
    g2 &= pick;
    pick = ~pick;
    h0 = (h0 & pick) | g0;
    h1 = (h1 & pick) | g1;
    h2 = (h2 & pick) | g2;

    /* h = (h + pad) */
    pad0 = ctx->pad[0];
    pad1 = ctx->pad[1];

    h0 += (pad0 & 0xfffffffffff);
    carry = (h0 >> 44);
    h0 &= 0xfffffffffff;

    h1 += (((pad0 >> 44) | (pad1 << 20)) & 0xfffffffffff) + carry;
    carry = (h1 >> 44);
    h1 &= 0xfffffffffff;

    h2 += ((pad1 >> 24) & 0x3ffffffffff) + carry;
    h2 &= 0x3ffffffffff;

    /* mac = h % (2^128) */
    macLo = (h0 | (h1 << 44));
    macHi = ((h1 >> 20) | (h2 << 24));

    U64TO8(mac + 0, macLo);
    U64TO8(mac + 8, macHi);

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->r[2] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;

#else /* if not 64 bit then use 32 bit */

    /* process the remaining block */
    if (ctx->leftover) {
        size_t pos = ctx->leftover;

        ctx->buffer[pos] = 1;
        pos = pos + 1;
        while (pos < POLY1305_BLOCK_SIZE) {
            ctx->buffer[pos] = 0;
            pos++;
        }
        ctx->finished = 1;
        poly1305_block(ctx, ctx->buffer);
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

    carry = h1 >> 26;
    h1 = h1 & 0x3ffffff;

    h2 += carry;
    carry = h2 >> 26;
    h2 = h2 & 0x3ffffff;

    h3 += carry;
    carry = h3 >> 26;
    h3 = h3 & 0x3ffffff;

    h4 += carry;
    carry = h4 >> 26;
    h4 = h4 & 0x3ffffff;

    h0 += carry * 5;
    carry = h0 >> 26;
    h0 = h0 & 0x3ffffff;

    h1 += carry;

    /* compute h + -p */
    g0 = h0 + 5;
    carry = g0 >> 26;
    g0 &= 0x3ffffff;

    g1 = h1 + carry;
    carry = g1 >> 26;
    g1 &= 0x3ffffff;

    g2 = h2 + carry;
    carry = g2 >> 26;
    g2 &= 0x3ffffff;

    g3 = h3 + carry;
    carry = g3 >> 26;
    g3 &= 0x3ffffff;

    g4 = h4 + carry - ((word32)1 << 26);

    /* select h if h < p, or h + -p if h >= p.
     * mask is all ones when the top bit of g4 is clear, else zero; the
     * choice is made with masks instead of a branch. */
    mask = ((word32)g4 >> ((sizeof(word32) * 8) - 1)) - 1;
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    g3 &= mask;
    g4 &= mask;
    mask = ~mask;
    h0 = (h0 & mask) | g0;
    h1 = (h1 & mask) | g1;
    h2 = (h2 & mask) | g2;
    h3 = (h3 & mask) | g3;
    h4 = (h4 & mask) | g4;

    /* h = h % (2^128) */
    h0 = (h0 | (h1 << 26)) & 0xffffffff;
    h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
    h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
    h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;

    /* mac = (h + pad) % (2^128) */
    f = (word64)h0 + ctx->pad[0];
    h0 = (word32)f;
    f = (word64)h1 + ctx->pad[1] + (f >> 32);
    h1 = (word32)f;
    f = (word64)h2 + ctx->pad[2] + (f >> 32);
    h2 = (word32)f;
    f = (word64)h3 + ctx->pad[3] + (f >> 32);
    h3 = (word32)f;

    U32TO8(mac + 0, h0);
    U32TO8(mac + 4, h1);
    U32TO8(mac + 8, h2);
    U32TO8(mac + 12, h3);

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->r[2] = 0;
    ctx->r[3] = 0;
    ctx->r[4] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;
    ctx->pad[2] = 0;
    ctx->pad[3] = 0;

#endif

    return 0;
}
685
#endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
686
687
688
/* Feed message data into the MAC computation.
 *
 * Data is buffered in ctx->buffer until a full block is available; complete
 * blocks are absorbed immediately and any tail is kept for the next call or
 * for finalization.
 *
 * ctx    Poly1305 context that had a key set.
 * m      Message data; may be NULL only when bytes is 0.
 * bytes  Number of bytes at m. A length of 0 is valid and does nothing.
 *
 * Returns 0 on success, BAD_FUNC_ARG when ctx is NULL or m is NULL with a
 * non-zero length, or the error reported by the block routines (for example
 * when the vector registers cannot be saved in the Intel speed-up build).
 */
int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
{
    size_t i;

    if (ctx == NULL || (m == NULL && bytes > 0))
        return BAD_FUNC_ARG;

    if (bytes == 0) {
        /* valid, but do nothing */
        return 0;
    }
#ifdef CHACHA_AEAD_TEST
    /* Debug dump of the input. Kept in its own scope so the loop counter is
     * declared at the start of a block, which also builds on compilers that
     * reject declarations after statements. */
    {
        word32 k;
        printf("Raw input to poly:\n");
        for (k = 0; k < bytes; k++) {
            printf("%02x", m[k]);
            if ((k+1) % 16 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#ifdef USE_INTEL_SPEEDUP
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags)) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);

        /* handle leftover */

        if (ctx->leftover) {
            size_t want = sizeof(ctx->buffer) - ctx->leftover;
            if (want > bytes)
                want = bytes;

            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < sizeof(ctx->buffer)) {
                RESTORE_VECTOR_REGISTERS();
                return 0;
            }

            if (!ctx->started)
                poly1305_calc_powers_avx2(ctx);
            poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= sizeof(ctx->buffer)) {
            size_t want = bytes & ~(sizeof(ctx->buffer) - 1);

            if (!ctx->started)
                poly1305_calc_powers_avx2(ctx);
            poly1305_blocks_avx2(ctx, m, want);
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
        RESTORE_VECTOR_REGISTERS();
    }
    else
    #endif
#endif
    {
        /* handle leftover */
        if (ctx->leftover) {
            size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
            if (want > bytes)
                want = bytes;
            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < POLY1305_BLOCK_SIZE)
                return 0;
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
            /* The block routine can fail (e.g. vector registers could not
             * be saved). Report that instead of silently dropping the
             * buffered block, matching the handling of poly1305_blocks
             * below. */
            {
                int ret;
                ret = poly1305_block(ctx, ctx->buffer);
                if (ret != 0)
                    return ret;
            }
#else
            poly1305_block(ctx, ctx->buffer);
#endif
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= POLY1305_BLOCK_SIZE) {
            /* round down to a whole number of blocks */
            size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
            int ret;
            ret = poly1305_blocks(ctx, m, want);
            if (ret != 0)
                return ret;
#else
            poly1305_blocks(ctx, m, want);
#endif
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
    }

    return 0;
}
802
803
/*  Takes a Poly1305 struct that has a key loaded and pads the provided length
804
    ctx        : Initialized Poly1305 struct to use
805
    lenToPad   : Current number of bytes updated that needs padding to 16
806
 */
807
int wc_Poly1305_Pad(Poly1305* ctx, word32 lenToPad)
808
0
{
809
0
    int ret = 0;
810
0
    word32 paddingLen;
811
0
    byte padding[WC_POLY1305_PAD_SZ - 1];
812
813
0
    if (ctx == NULL) {
814
0
        return BAD_FUNC_ARG;
815
0
    }
816
0
    if (lenToPad == 0) {
817
0
        return 0; /* nothing needs to be done */
818
0
    }
819
820
0
    XMEMSET(padding, 0, sizeof(padding));
821
822
    /* Pad length to 16 bytes */
823
0
    paddingLen = (-(int)lenToPad) & (WC_POLY1305_PAD_SZ - 1);
824
0
    if ((paddingLen > 0) && (paddingLen < WC_POLY1305_PAD_SZ)) {
825
0
        ret = wc_Poly1305Update(ctx, padding, paddingLen);
826
0
    }
827
0
    return ret;
828
0
}
829
830
/*  Takes a Poly1305 struct that has a key loaded and adds the AEAD length
831
    encoding in 64-bit little endian
832
    aadSz      : Size of the additional authentication data
833
    dataSz     : Size of the plaintext or ciphertext
834
 */
835
int wc_Poly1305_EncodeSizes(Poly1305* ctx, word32 aadSz, word32 dataSz)
836
0
{
837
0
    int ret;
838
0
    byte little64[16]; /* sizeof(word64) * 2 */
839
840
0
    if (ctx == NULL) {
841
0
        return BAD_FUNC_ARG;
842
0
    }
843
844
0
    XMEMSET(little64, 0, sizeof(little64));
845
846
    /* size of additional data and input data as little endian 64 bit types */
847
0
    u32tole64(aadSz,  little64);
848
0
    u32tole64(dataSz, little64 + 8);
849
0
    ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
850
851
0
    return ret;
852
0
}
853
854
#ifdef WORD64_AVAILABLE
/*  Same as wc_Poly1305_EncodeSizes but takes 64-bit sizes. The two values
    are fed to the MAC as 64-bit little endian values, AAD size first.
    ctx        : Initialized Poly1305 struct to use
    aadSz      : Size of the additional authentication data
    dataSz     : Size of the plaintext or ciphertext
    Returns BAD_FUNC_ARG when ctx is NULL, otherwise the result of
    wc_Poly1305Update.
 */
int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz, word64 dataSz)
{
    word64 lengths[2];

    if (ctx == NULL) {
        return BAD_FUNC_ARG;
    }

#ifdef BIG_ENDIAN_ORDER
    /* swap so the in-memory byte order is little endian */
    lengths[0] = ByteReverseWord64(aadSz);
    lengths[1] = ByteReverseWord64(dataSz);
#else
    lengths[0] = aadSz;
    lengths[1] = dataSz;
#endif

    return wc_Poly1305Update(ctx, (byte *)lengths, sizeof(lengths));
}
#endif
877
878
/*  Takes in an initialized Poly1305 struct that has a key loaded and creates
879
    a MAC (tag) using recent TLS AEAD padding scheme.
880
    ctx        : Initialized Poly1305 struct to use
881
    additional : Additional data to use
882
    addSz      : Size of additional buffer
883
    input      : Input buffer to create tag from
884
    sz         : Size of input buffer
885
    tag        : Buffer to hold created tag
886
    tagSz      : Size of input tag buffer (must be at least
887
                 WC_POLY1305_MAC_SZ(16))
888
 */
889
int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional, word32 addSz,
890
                    const byte* input, word32 sz, byte* tag, word32 tagSz)
891
0
{
892
0
    int ret;
893
894
    /* sanity check on arguments */
895
0
    if (ctx == NULL || input == NULL || tag == NULL ||
896
0
                                                   tagSz < WC_POLY1305_MAC_SZ) {
897
0
        return BAD_FUNC_ARG;
898
0
    }
899
900
    /* additional allowed to be 0 */
901
0
    if (addSz > 0) {
902
0
        if (additional == NULL)
903
0
            return BAD_FUNC_ARG;
904
905
        /* additional data plus padding */
906
0
        if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
907
0
            return ret;
908
0
        }
909
        /* pad additional data */
910
0
        if ((ret = wc_Poly1305_Pad(ctx, addSz)) != 0) {
911
0
            return ret;
912
0
        }
913
0
    }
914
915
    /* input plus padding */
916
0
    if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
917
0
        return ret;
918
0
    }
919
    /* pad input data */
920
0
    if ((ret = wc_Poly1305_Pad(ctx, sz)) != 0) {
921
0
        return ret;
922
0
    }
923
924
    /* encode size of AAD and input data as little endian 64 bit types */
925
0
    if ((ret = wc_Poly1305_EncodeSizes(ctx, addSz, sz)) != 0) {
926
0
        return ret;
927
0
    }
928
929
    /* Finalize the auth tag */
930
0
    ret = wc_Poly1305Final(ctx, tag);
931
932
0
    return ret;
933
934
0
}
935
#endif /* HAVE_POLY1305 */