Coverage Report

Created: 2025-07-23 06:59

/src/wolfssl-sp-math-all/wolfcrypt/src/poly1305.c
Line
Count
Source (jump to first uncovered line)
1
/* poly1305.c
2
 *
3
 * Copyright (C) 2006-2025 wolfSSL Inc.
4
 *
5
 * This file is part of wolfSSL.
6
 *
7
 * wolfSSL is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * wolfSSL is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20
 */
21
/*
22
23
DESCRIPTION
24
This library contains implementation for the Poly1305 authenticator.
25
26
Based off the public domain implementations by Andrew Moon
27
and Daniel J. Bernstein
28
29
*/
30
31
32
/*
33
 * WOLFSSL_W64_WRAPPER Uses wrappers around word64 types for a system that does
34
 *                     not have word64 available. As expected it reduces
35
 *                     performance. Benchmarks collected July 2024 show
36
 *                     303.004 MiB/s with and 1874.194 MiB/s without.
37
 */
38
39
#include <wolfssl/wolfcrypt/libwolfssl_sources.h>
40
41
#ifdef HAVE_POLY1305
42
#include <wolfssl/wolfcrypt/poly1305.h>
43
#include <wolfssl/wolfcrypt/cpuid.h>
44
#ifdef NO_INLINE
45
    #include <wolfssl/wolfcrypt/misc.h>
46
#else
47
    #define WOLFSSL_MISC_INCLUDED
48
    #include <wolfcrypt/src/misc.c>
49
#endif
50
#ifdef CHACHA_AEAD_TEST
51
    #include <stdio.h>
52
#endif
53
54
#ifdef _MSC_VER
55
    /* 4127 warning constant while(1)  */
56
    #pragma warning(disable: 4127)
57
#endif
58
59
#ifdef USE_INTEL_POLY1305_SPEEDUP
60
    #include <emmintrin.h>
61
    #include <immintrin.h>
62
63
    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
64
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
65
        #undef  NO_AVX2_SUPPORT
66
        #define NO_AVX2_SUPPORT
67
    #endif
68
    #if defined(__clang__) && ((__clang_major__ < 3) || \
69
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
70
        #define NO_AVX2_SUPPORT
71
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
72
        #undef NO_AVX2_SUPPORT
73
    #endif
74
    #if defined(_MSC_VER) && (_MSC_VER <= 1900)
75
        #undef  NO_AVX2_SUPPORT
76
        #define NO_AVX2_SUPPORT
77
    #endif
78
79
    #define HAVE_INTEL_AVX1
80
    #ifndef NO_AVX2_SUPPORT
81
        #define HAVE_INTEL_AVX2
82
    #endif
83
#endif
84
85
#ifdef USE_INTEL_POLY1305_SPEEDUP
86
static word32 intel_flags = 0;
87
static word32 cpu_flags_set = 0;
88
#endif
89
90
#if defined(USE_INTEL_POLY1305_SPEEDUP) || defined(POLY130564)
91
    #if defined(_MSC_VER)
92
        #define POLY1305_NOINLINE __declspec(noinline)
93
    #elif defined(__GNUC__)
94
        #define POLY1305_NOINLINE __attribute__((noinline))
95
    #else
96
        #define POLY1305_NOINLINE
97
    #endif
98
99
    #if defined(_MSC_VER)
100
        #include <intrin.h>
101
102
        typedef struct word128 {
103
            word64 lo;
104
            word64 hi;
105
        } word128;
106
107
        #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
108
        #define ADD(out, in) { word64 t = out.lo; out.lo += in.lo; \
109
                               out.hi += (out.lo < t) + in.hi; }
110
        #define ADDLO(out, in) { word64 t = out.lo; out.lo += in; \
111
                                 out.hi += (out.lo < t); }
112
        #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
113
        #define LO(in) (in.lo)
114
115
    #elif defined(__GNUC__)
116
        #if defined(__SIZEOF_INT128__)
117
            PEDANTIC_EXTENSION typedef unsigned __int128 word128;
118
        #else
119
            typedef unsigned word128 __attribute__((mode(TI)));
120
        #endif
121
122
508k
        #define MUL(out, x, y) out = ((word128)(x) * (y))
123
339k
        #define ADD(out, in) (out) += (in)
124
113k
        #define ADDLO(out, in) (out) += (in)
125
169k
        #define SHR(in, shift) (word64)((in) >> (shift))
126
169k
        #define LO(in) (word64)(in)
127
    #endif
128
#endif
129
130
#ifdef USE_INTEL_POLY1305_SPEEDUP
131
#ifdef __cplusplus
132
    extern "C" {
133
#endif
134
135
#ifdef HAVE_INTEL_AVX1
136
/* Process one block (16 bytes) of data.
137
 *
138
 * ctx  Poly1305 context.
139
 * m    One block of message data.
140
 */
141
extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
142
/* Process multiple blocks (n * 16 bytes) of data.
143
 *
144
 * ctx    Poly1305 context.
145
 * m      Blocks of message data.
146
 * bytes  The number of bytes to process.
147
 */
148
extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
149
                                size_t bytes);
150
/* Set the key to use when processing data.
151
 * Initialize the context.
152
 *
153
 * ctx  Poly1305 context.
154
 * key  The key data (16 bytes).
155
 */
156
extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
157
/* Calculate the final result - authentication data.
158
 * Zeros out the private data in the context.
159
 *
160
 * ctx  Poly1305 context.
161
 * mac  Buffer to hold 16 bytes.
162
 */
163
extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
164
#endif
165
166
#ifdef HAVE_INTEL_AVX2
167
/* Process multiple blocks (n * 16 bytes) of data.
168
 *
169
 * ctx    Poly1305 context.
170
 * m      Blocks of message data.
171
 * bytes  The number of bytes to process.
172
 */
173
extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
174
                                 size_t bytes);
175
/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
176
 *
177
 * ctx    Poly1305 context.
178
 */
179
extern void poly1305_calc_powers_avx2(Poly1305* ctx);
180
/* Set the key to use when processing data.
181
 * Initialize the context.
182
 * Calls AVX set key function as final function calls AVX code.
183
 *
184
 * ctx  Poly1305 context.
185
 * key  The key data (16 bytes).
186
 */
187
extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
188
/* Calculate the final result - authentication data.
189
 * Zeros out the private data in the context.
190
 * Calls AVX final function to quickly process last blocks.
191
 *
192
 * ctx  Poly1305 context.
193
 * mac  Buffer to hold 16 bytes - authentication data.
194
 */
195
extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
196
#endif
197
198
#ifdef __cplusplus
199
    }  /* extern "C" */
200
#endif
201
202
#elif defined(POLY130564)
203
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
204
    static word64 U8TO64(const byte* p)
205
127k
    {
206
127k
        return
207
127k
            (((word64)(p[0] & 0xff)      ) |
208
127k
             ((word64)(p[1] & 0xff) <<  8) |
209
127k
             ((word64)(p[2] & 0xff) << 16) |
210
127k
             ((word64)(p[3] & 0xff) << 24) |
211
127k
             ((word64)(p[4] & 0xff) << 32) |
212
127k
             ((word64)(p[5] & 0xff) << 40) |
213
127k
             ((word64)(p[6] & 0xff) << 48) |
214
127k
             ((word64)(p[7] & 0xff) << 56));
215
127k
    }
216
217
7.30k
    /* Write the 64-bit word v to p[0..7], least significant byte first
     * (little-endian store). */
    static void U64TO8(byte* p, word64 v)
    {
        int idx;

        for (idx = 0; idx < 8; idx++) {
            p[idx] = (byte)(v >> (8 * idx));
        }
    }
227
#endif/* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
228
/* if not 64 bit then use 32 bit */
229
#elif !defined(WOLFSSL_ARMASM)
230
231
    static word32 U8TO32(const byte *p)
232
    {
233
        return
234
            (((word32)(p[0] & 0xff)      ) |
235
             ((word32)(p[1] & 0xff) <<  8) |
236
             ((word32)(p[2] & 0xff) << 16) |
237
             ((word32)(p[3] & 0xff) << 24));
238
    }
239
240
    /* Write the 32-bit word v to p[0..3], least significant byte first
     * (little-endian store). */
    static void U32TO8(byte *p, word32 v)
    {
        int idx;

        for (idx = 0; idx < 4; idx++) {
            p[idx] = (byte)((v >> (8 * idx)) & 0xff);
        }
    }
246
#endif
247
248
/* convert 32-bit unsigned to little endian 64 bit type as byte array */
249
static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
250
7.27k
{
251
#ifndef WOLFSSL_X86_64_BUILD
252
    outLe64[0] = (byte)(inLe32  & 0x000000FF);
253
    outLe64[1] = (byte)((inLe32 & 0x0000FF00) >> 8);
254
    outLe64[2] = (byte)((inLe32 & 0x00FF0000) >> 16);
255
    outLe64[3] = (byte)((inLe32 & 0xFF000000) >> 24);
256
    outLe64[4] = 0;
257
    outLe64[5] = 0;
258
    outLe64[6] = 0;
259
    outLe64[7] = 0;
260
#else
261
7.27k
    *(word64*)outLe64 = inLe32;
262
7.27k
#endif
263
7.27k
}
264
265
266
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
267
/*
268
This local function operates on a message with a given number of bytes
269
with a given ctx pointer to a Poly1305 structure.
270
*/
271
static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
272
                     size_t bytes)
273
11.5k
{
274
#ifdef USE_INTEL_POLY1305_SPEEDUP
275
    /* AVX2 is handled in wc_Poly1305Update. */
276
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
277
    poly1305_blocks_avx(ctx, m, bytes);
278
    RESTORE_VECTOR_REGISTERS();
279
    return 0;
280
#elif defined(POLY130564)
281
11.5k
    const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
282
11.5k
    word64 r0,r1,r2;
283
11.5k
    word64 s1,s2;
284
11.5k
    word64 h0,h1,h2;
285
11.5k
    word64 c;
286
11.5k
    word128 d0,d1,d2,d;
287
288
11.5k
    r0 = ctx->r[0];
289
11.5k
    r1 = ctx->r[1];
290
11.5k
    r2 = ctx->r[2];
291
292
11.5k
    h0 = ctx->h[0];
293
11.5k
    h1 = ctx->h[1];
294
11.5k
    h2 = ctx->h[2];
295
296
11.5k
    s1 = r1 * (5 << 2);
297
11.5k
    s2 = r2 * (5 << 2);
298
299
68.1k
    while (bytes >= POLY1305_BLOCK_SIZE) {
300
56.5k
        word64 t0,t1;
301
302
        /* h += m[i] */
303
56.5k
        t0 = U8TO64(&m[0]);
304
56.5k
        t1 = U8TO64(&m[8]);
305
306
56.5k
        h0 += (( t0                    ) & 0xfffffffffff);
307
56.5k
        h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
308
56.5k
        h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;
309
310
        /* h *= r */
311
56.5k
        MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
312
56.5k
        MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
313
56.5k
        MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
314
315
        /* (partial) h %= p */
316
56.5k
                      c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
317
56.5k
        ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
318
56.5k
        ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
319
56.5k
        h0  += c * 5; c = (h0 >> 44);  h0 =    h0  & 0xfffffffffff;
320
56.5k
        h1  += c;
321
322
56.5k
        m += POLY1305_BLOCK_SIZE;
323
56.5k
        bytes -= POLY1305_BLOCK_SIZE;
324
56.5k
    }
325
326
11.5k
    ctx->h[0] = h0;
327
11.5k
    ctx->h[1] = h1;
328
11.5k
    ctx->h[2] = h2;
329
330
11.5k
    return 0;
331
332
#else /* if not 64 bit then use 32 bit */
333
    const word32 hibit = (ctx->finished) ? 0 : ((word32)1 << 24); /* 1 << 128 */
334
    word32 r0,r1,r2,r3,r4;
335
    word32 s1,s2,s3,s4;
336
    word32 h0,h1,h2,h3,h4;
337
    word32 c;
338
#ifdef WOLFSSL_W64_WRAPPER
339
    #ifdef WOLFSSL_SMALL_STACK
340
    w64wrapper* d;
341
342
    d = (w64wrapper*)XMALLOC(5 * sizeof(w64wrapper), NULL,
343
        DYNAMIC_TYPE_TMP_BUFFER);
344
    if (d == NULL) {
345
        return MEMORY_E;
346
    }
347
    #else
348
    w64wrapper d[5];
349
    #endif
350
#else
351
    word64 d0,d1,d2,d3,d4;
352
#endif
353
354
355
    r0 = ctx->r[0];
356
    r1 = ctx->r[1];
357
    r2 = ctx->r[2];
358
    r3 = ctx->r[3];
359
    r4 = ctx->r[4];
360
361
    s1 = r1 * 5;
362
    s2 = r2 * 5;
363
    s3 = r3 * 5;
364
    s4 = r4 * 5;
365
366
    h0 = ctx->h[0];
367
    h1 = ctx->h[1];
368
    h2 = ctx->h[2];
369
    h3 = ctx->h[3];
370
    h4 = ctx->h[4];
371
372
    while (bytes >= POLY1305_BLOCK_SIZE) {
373
        /* h += m[i] */
374
        h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
375
        h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
376
        h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
377
        h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
378
        h4 += (U8TO32(m+12) >> 8) | hibit;
379
380
        /* h *= r */
381
#ifdef WOLFSSL_W64_WRAPPER
382
        {
383
            w64wrapper tmp;
384
385
            d[0] = w64Mul(h0, r0); tmp = w64Mul(h1, s4);
386
            d[0] = w64Add(d[0], tmp, NULL); tmp = w64Mul(h2, s3);
387
            d[0] = w64Add(d[0], tmp, NULL); tmp = w64Mul(h3, s2);
388
            d[0] = w64Add(d[0], tmp, NULL); tmp = w64Mul(h4, s1);
389
            d[0] = w64Add(d[0], tmp, NULL);
390
391
            d[1] = w64Mul(h0, r1); tmp = w64Mul(h1, r0);
392
            d[1] = w64Add(d[1], tmp, NULL); tmp = w64Mul(h2, s4);
393
            d[1] = w64Add(d[1], tmp, NULL); tmp = w64Mul(h3, s3);
394
            d[1] = w64Add(d[1], tmp, NULL); tmp = w64Mul(h4, s2);
395
            d[1] = w64Add(d[1], tmp, NULL);
396
397
            d[2] = w64Mul(h0, r2); tmp = w64Mul(h1, r1);
398
            d[2] = w64Add(d[2], tmp, NULL); tmp = w64Mul(h2, r0);
399
            d[2] = w64Add(d[2], tmp, NULL); tmp = w64Mul(h3, s4);
400
            d[2] = w64Add(d[2], tmp, NULL); tmp = w64Mul(h4, s3);
401
            d[2] = w64Add(d[2], tmp, NULL);
402
403
            d[3] = w64Mul(h0, r3); tmp = w64Mul(h1, r2);
404
            d[3] = w64Add(d[3], tmp, NULL); tmp = w64Mul(h2, r1);
405
            d[3] = w64Add(d[3], tmp, NULL); tmp = w64Mul(h3, r0);
406
            d[3] = w64Add(d[3], tmp, NULL); tmp = w64Mul(h4, s4);
407
            d[3] = w64Add(d[3], tmp, NULL);
408
409
            d[4] = w64Mul(h0, r4); tmp = w64Mul(h1, r3);
410
            d[4] = w64Add(d[4], tmp, NULL); tmp = w64Mul(h2, r2);
411
            d[4] = w64Add(d[4], tmp, NULL); tmp = w64Mul(h3, r1);
412
            d[4] = w64Add(d[4], tmp, NULL); tmp = w64Mul(h4, r0);
413
            d[4] = w64Add(d[4], tmp, NULL);
414
        }
415
#else
416
        d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
417
             ((word64)h3 * s2) + ((word64)h4 * s1);
418
        d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
419
             ((word64)h3 * s3) + ((word64)h4 * s2);
420
        d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
421
             ((word64)h3 * s4) + ((word64)h4 * s3);
422
        d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
423
             ((word64)h3 * r0) + ((word64)h4 * s4);
424
        d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
425
             ((word64)h3 * r1) + ((word64)h4 * r0);
426
#endif
427
428
        /* (partial) h %= p */
429
#ifdef WOLFSSL_W64_WRAPPER
430
        c = w64GetLow32(w64ShiftRight(d[0], 26));
431
        h0 = w64GetLow32(d[0]) & 0x3ffffff;
432
        d[1] = w64Add32(d[1], c, NULL);
433
        c = w64GetLow32(w64ShiftRight(d[1], 26));
434
        h1 = w64GetLow32(d[1]) & 0x3ffffff;
435
        d[2] = w64Add32(d[2], c, NULL);
436
        c = w64GetLow32(w64ShiftRight(d[2], 26));
437
        h2 = w64GetLow32(d[2]) & 0x3ffffff;
438
        d[3] = w64Add32(d[3], c, NULL);
439
        c = w64GetLow32(w64ShiftRight(d[3], 26));
440
        h3 = w64GetLow32(d[3]) & 0x3ffffff;
441
        d[4] = w64Add32(d[4], c, NULL);
442
        c = w64GetLow32(w64ShiftRight(d[4], 26));
443
        h4 = w64GetLow32(d[4]) & 0x3ffffff;
444
#else
445
                      c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
446
        d1 += c;      c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
447
        d2 += c;      c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
448
        d3 += c;      c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
449
        d4 += c;      c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
450
#endif
451
        h0 += c * 5;  c =  (h0 >> 26); h0 =                h0 & 0x3ffffff;
452
        h1 += c;
453
454
        m += POLY1305_BLOCK_SIZE;
455
        bytes -= POLY1305_BLOCK_SIZE;
456
    }
457
458
    ctx->h[0] = h0;
459
    ctx->h[1] = h1;
460
    ctx->h[2] = h2;
461
    ctx->h[3] = h3;
462
    ctx->h[4] = h4;
463
464
#if defined(WOLFSSL_W64_WRAPPER) && defined(WOLFSSL_SMALL_STACK)
465
    XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
466
#endif
467
468
    return 0;
469
470
#endif /* end of 64 bit cpu blocks or 32 bit cpu */
471
11.5k
}
472
473
/*
474
This local function is used for the last call when a message with a given
475
number of bytes is less than the block size.
476
*/
477
static int poly1305_block(Poly1305* ctx, const unsigned char *m)
478
7.08k
{
479
#ifdef USE_INTEL_POLY1305_SPEEDUP
480
    /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
481
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
482
    poly1305_block_avx(ctx, m);
483
    RESTORE_VECTOR_REGISTERS();
484
    return 0;
485
#else
486
7.08k
    return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
487
7.08k
#endif
488
7.08k
}
489
490
/* Load a key into a Poly1305 structure and reset it for a new message.
 *
 * ctx    Poly1305 structure to initialize.
 * key    Key data; the first 16 bytes are clamped into r and the last 16
 *        bytes are saved as the pad.
 * keySz  Length of key in bytes; must be 32.
 *
 * Returns 0 on success and BAD_FUNC_ARG when ctx or key is NULL or keySz is
 * not 32. The Intel build may also return early from SAVE_VECTOR_REGISTERS.
 *
 * All arguments are validated before the CHACHA_AEAD_TEST key dump runs, and
 * the dump lives in its own block so that its loop counter is declared at the
 * start of a block rather than after a statement.
 */
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
#if defined(POLY130564) && !defined(USE_INTEL_POLY1305_SPEEDUP)
    word64 t0,t1;
#endif

    if (ctx == NULL || key == NULL || keySz != 32)
        return BAD_FUNC_ARG;

#ifdef CHACHA_AEAD_TEST
    {
        word32 k;

        printf("Poly key used:\n");
        for (k = 0; k < keySz; k++) {
            printf("%02x", key[k]);
            if ((k+1) % 8 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#ifdef USE_INTEL_POLY1305_SPEEDUP
    /* Query the CPU feature flags once and cache them for later calls. */
    if (!cpu_flags_set) {
        intel_flags = cpuid_get_flags();
        cpu_flags_set = 1;
    }
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_setkey_avx2(ctx, key);
    else
    #endif
        poly1305_setkey_avx(ctx, key);
    RESTORE_VECTOR_REGISTERS();
    ctx->started = 0;
#elif defined(POLY130564)

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    t0 = U8TO64(key + 0);
    t1 = U8TO64(key + 8);

    ctx->r[0] = ( t0                    ) & 0xffc0fffffff;
    ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
    ctx->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;

    /* h (accumulator) = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO64(key + 16);
    ctx->pad[1] = U8TO64(key + 24);

    ctx->leftover = 0;
    ctx->finished = 0;

#else /* if not 64 bit then use 32 bit */

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    ctx->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
    ctx->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
    ctx->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
    ctx->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
    ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;

    /* h = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO32(key + 16);
    ctx->pad[1] = U8TO32(key + 20);
    ctx->pad[2] = U8TO32(key + 24);
    ctx->pad[3] = U8TO32(key + 28);

    ctx->leftover = 0;
    ctx->finished = 0;

#endif

    return 0;
}
578
579
/* Finish the Poly1305 computation and write the 16-byte authenticator.
 *
 * ctx  Poly1305 structure holding the accumulated state.
 * mac  Buffer that receives 16 bytes of authentication data.
 *
 * Returns 0 on success and BAD_FUNC_ARG when ctx or mac is NULL. In the C
 * implementations, buffered leftover bytes are padded with a 1 byte then
 * zeros and processed as a final block; if that processing fails, its error
 * code is returned and no MAC is written. The Intel build may return early
 * from SAVE_VECTOR_REGISTERS.
 *
 * On success the C implementations clear h, r and pad in ctx.
 */
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
#ifdef USE_INTEL_POLY1305_SPEEDUP
#elif defined(POLY130564)

    word64 h0,h1,h2,c;
    word64 g0,g1,g2;
    word64 t0,t1;

#else

    word32 h0,h1,h2,h3,h4,c;
    word32 g0,g1,g2,g3,g4;
#ifdef WOLFSSL_W64_WRAPPER
    w64wrapper f;
#else
    word64 f;
#endif
    word32 mask;

#endif

    if (ctx == NULL || mac == NULL)
        return BAD_FUNC_ARG;

#ifdef USE_INTEL_POLY1305_SPEEDUP
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_final_avx2(ctx, mac);
    else
    #endif
        poly1305_final_avx(ctx, mac);
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* process the remaining block */
    if (ctx->leftover) {
        size_t i = ctx->leftover;
        int ret;

        ctx->buffer[i] = 1;
        for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
            ctx->buffer[i] = 0;
        ctx->finished = 1;
        /* Do not emit a MAC if the final block could not be processed. */
        ret = poly1305_block(ctx, ctx->buffer);
        if (ret != 0)
            return ret;
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

                 c = (h1 >> 44); h1 &= 0xfffffffffff;
    h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
    h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
    h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
    h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
    h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
    h1 += c;

    /* compute h + -p */
    g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
    g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
    g2 = h2 + c - ((word64)1 << 42);

    /* select h if h < p, or h + -p if h >= p */
    /* c is all ones when the top bit of g2 is clear, otherwise zero. */
    c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
    g0 &= c;
    g1 &= c;
    g2 &= c;
    c = ~c;
    h0 = (h0 & c) | g0;
    h1 = (h1 & c) | g1;
    h2 = (h2 & c) | g2;

    /* h = (h + pad) */
    t0 = ctx->pad[0];
    t1 = ctx->pad[1];

    h0 += (( t0                    ) & 0xfffffffffff)    ;
    c = (h0 >> 44); h0 &= 0xfffffffffff;
    h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
    c = (h1 >> 44); h1 &= 0xfffffffffff;
    h2 += (((t1 >> 24)             ) & 0x3ffffffffff) + c;
    h2 &= 0x3ffffffffff;

    /* mac = h % (2^128) */
    h0 = ((h0      ) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));

    U64TO8(mac + 0, h0);
    U64TO8(mac + 8, h1);

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->r[2] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;

#else /* if not 64 bit then use 32 bit */

    /* process the remaining block */
    if (ctx->leftover) {
        size_t i = ctx->leftover;
        int ret;

        ctx->buffer[i++] = 1;
        for (; i < POLY1305_BLOCK_SIZE; i++)
            ctx->buffer[i] = 0;
        ctx->finished = 1;
        /* poly1305_blocks() can fail here (MEMORY_E with WOLFSSL_W64_WRAPPER
         * and WOLFSSL_SMALL_STACK); do not emit a MAC in that case. */
        ret = poly1305_block(ctx, ctx->buffer);
        if (ret != 0)
            return ret;
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

                 c = h1 >> 26; h1 = h1 & 0x3ffffff;
    h2 +=     c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
    h3 +=     c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
    h4 +=     c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
    h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
    h1 +=     c;

    /* compute h + -p */
    g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
    g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
    g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
    g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
    g4 = h4 + c - ((word32)1 << 26);

    /* select h if h < p, or h + -p if h >= p */
    /* mask is all ones when the top bit of g4 is clear, otherwise zero. */
    mask = ((word32)g4 >> ((sizeof(word32) * 8) - 1)) - 1;
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    g3 &= mask;
    g4 &= mask;
    mask = ~mask;
    h0 = (h0 & mask) | g0;
    h1 = (h1 & mask) | g1;
    h2 = (h2 & mask) | g2;
    h3 = (h3 & mask) | g3;
    h4 = (h4 & mask) | g4;

    /* h = h % (2^128) */
    h0 = ((h0      ) | (h1 << 26)) & 0xffffffff;
    h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
    h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
    h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;

    /* mac = (h + pad) % (2^128) */
#ifdef WOLFSSL_W64_WRAPPER
    f = w64From32(0, h0);
    f = w64Add32(f, ctx->pad[0], NULL);
    h0 = w64GetLow32(f);

    f = w64ShiftRight(f, 32);
    f = w64Add32(f, h1, NULL);
    f = w64Add32(f, ctx->pad[1], NULL);
    h1 = w64GetLow32(f);

    f = w64ShiftRight(f, 32);
    f = w64Add32(f, h2, NULL);
    f = w64Add32(f, ctx->pad[2], NULL);
    h2 = w64GetLow32(f);

    f = w64ShiftRight(f, 32);
    f = w64Add32(f, h3, NULL);
    f = w64Add32(f, ctx->pad[3], NULL);
    h3 = w64GetLow32(f);
#else
    f = (word64)h0 + ctx->pad[0]            ; h0 = (word32)f;
    f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f;
    f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f;
    f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f;
#endif

    U32TO8(mac + 0, h0);
    U32TO8(mac + 4, h1);
    U32TO8(mac + 8, h2);
    U32TO8(mac + 12, h3);

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->r[2] = 0;
    ctx->r[3] = 0;
    ctx->r[4] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;
    ctx->pad[2] = 0;
    ctx->pad[3] = 0;

#endif

    return 0;
}
786
#endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
787
788
789
/* Feed message data into a Poly1305 computation.
 *
 * ctx    Poly1305 structure with a key loaded.
 * m      Message data; may be NULL only when bytes is 0.
 * bytes  Number of bytes at m.
 *
 * Data is consumed in three steps: top up and process any partially filled
 * ctx->buffer, process the complete blocks directly from m, then stash the
 * remaining tail in ctx->buffer for the next call.
 *
 * Returns 0 on success (including bytes == 0, which does nothing) and
 * BAD_FUNC_ARG when ctx is NULL or m is NULL with bytes > 0. In the generic
 * path of non-ARMASM/non-RISCV builds, a non-zero result from
 * poly1305_block() or poly1305_blocks() is returned to the caller. The AVX2
 * path may return early from SAVE_VECTOR_REGISTERS.
 */
int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
{
    size_t i;

    if (ctx == NULL || (m == NULL && bytes > 0))
        return BAD_FUNC_ARG;

    if (bytes == 0) {
        /* valid, but do nothing */
        return 0;
    }
#ifdef CHACHA_AEAD_TEST
    {
        /* Own block so the counter is declared at the start of a block. */
        word32 k;

        printf("Raw input to poly:\n");
        for (k = 0; k < bytes; k++) {
            printf("%02x", m[k]);
            if ((k+1) % 16 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \
    !defined(WOLFSSL_ARMASM_NO_NEON)
    /* handle leftover */
    if (ctx->leftover) {
        size_t want = sizeof(ctx->buffer) - ctx->leftover;
        if (want > bytes)
            want = bytes;

        for (i = 0; i < want; i++)
            ctx->buffer[ctx->leftover + i] = m[i];
        bytes -= (word32)want;
        m += want;
        ctx->leftover += want;
        if (ctx->leftover < sizeof(ctx->buffer)) {
            return 0;
        }

        poly1305_blocks(ctx, ctx->buffer, sizeof(ctx->buffer));
        ctx->leftover = 0;
    }

    /* process full blocks */
    if (bytes >= sizeof(ctx->buffer)) {
        size_t want = bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1);

        poly1305_blocks(ctx, m, want);
        m += want;
        bytes -= (word32)want;
    }

    /* store leftover */
    if (bytes) {
        for (i = 0; i < bytes; i++)
            ctx->buffer[ctx->leftover + i] = m[i];
        ctx->leftover += bytes;
    }
#else
#ifdef USE_INTEL_POLY1305_SPEEDUP
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags)) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);

        /* handle leftover */
        if (ctx->leftover) {
            size_t want = sizeof(ctx->buffer) - ctx->leftover;
            if (want > bytes)
                want = bytes;

            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < sizeof(ctx->buffer)) {
                RESTORE_VECTOR_REGISTERS();
                return 0;
            }

            /* The powers of r are computed once, before the first blocks. */
            if (!ctx->started) {
                poly1305_calc_powers_avx2(ctx);
                ctx->started = 1;
            }
            poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= sizeof(ctx->buffer)) {
            size_t want = bytes & ~(sizeof(ctx->buffer) - 1);

            if (!ctx->started) {
                poly1305_calc_powers_avx2(ctx);
                ctx->started = 1;
            }
            poly1305_blocks_avx2(ctx, m, want);
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
        RESTORE_VECTOR_REGISTERS();
    }
    else
    #endif
#endif
    {
        /* handle leftover */
        if (ctx->leftover) {
            size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
            if (want > bytes)
                want = bytes;
            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < POLY1305_BLOCK_SIZE)
                return 0;
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
            {
                /* Checked the same way as the poly1305_blocks() call below. */
                int ret = poly1305_block(ctx, ctx->buffer);
                if (ret != 0)
                    return ret;
            }
#else
            poly1305_block(ctx, ctx->buffer);
#endif
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= POLY1305_BLOCK_SIZE) {
            size_t want = ((size_t)bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1));
#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM)
            int ret;
            ret = poly1305_blocks(ctx, m, want);
            if (ret != 0)
                return ret;
#else
            poly1305_blocks(ctx, m, want);
#endif
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
    }
#endif

    return 0;
}
944
945
/*  Takes a Poly1305 struct that has a key loaded and pads the provided length
946
    ctx        : Initialized Poly1305 struct to use
947
    lenToPad   : Current number of bytes updated that needs padding to 16
948
 */
949
int wc_Poly1305_Pad(Poly1305* ctx, word32 lenToPad)
950
7.27k
{
951
7.27k
    int ret = 0;
952
7.27k
    word32 paddingLen;
953
7.27k
    byte padding[WC_POLY1305_PAD_SZ - 1];
954
955
7.27k
    if (ctx == NULL) {
956
0
        return BAD_FUNC_ARG;
957
0
    }
958
7.27k
    if (lenToPad == 0) {
959
7
        return 0; /* nothing needs to be done */
960
7
    }
961
962
7.26k
    XMEMSET(padding, 0, sizeof(padding));
963
964
    /* Pad length to 16 bytes */
965
7.26k
    paddingLen = (-(int)lenToPad) & (WC_POLY1305_PAD_SZ - 1);
966
7.26k
    if ((paddingLen > 0) && (paddingLen < WC_POLY1305_PAD_SZ)) {
967
7.04k
        ret = wc_Poly1305Update(ctx, padding, paddingLen);
968
7.04k
    }
969
7.26k
    return ret;
970
7.27k
}
971
972
/*  Takes a Poly1305 struct that has a key loaded and adds the AEAD length
973
    encoding in 64-bit little endian
974
    aadSz      : Size of the additional authentication data
975
    dataSz     : Size of the plaintext or ciphertext
976
 */
977
int wc_Poly1305_EncodeSizes(Poly1305* ctx, word32 aadSz, word32 dataSz)
978
3.63k
{
979
3.63k
    int ret;
980
3.63k
    byte little64[16]; /* sizeof(word64) * 2 */
981
982
3.63k
    if (ctx == NULL) {
983
0
        return BAD_FUNC_ARG;
984
0
    }
985
986
3.63k
    XMEMSET(little64, 0, sizeof(little64));
987
988
    /* size of additional data and input data as little endian 64 bit types */
989
3.63k
    u32tole64(aadSz,  little64);
990
3.63k
    u32tole64(dataSz, little64 + 8);
991
3.63k
    ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
992
993
3.63k
    return ret;
994
3.63k
}
995
996
#ifdef WORD64_AVAILABLE
997
/*  64-bit variant of wc_Poly1305_EncodeSizes: feeds aadSz followed by dataSz
    into the Poly1305 struct as two 64-bit words, byte-reversed first when
    BIG_ENDIAN_ORDER is defined.

    ctx        : Initialized Poly1305 struct to use
    aadSz      : Size of the additional authentication data
    dataSz     : Size of the plaintext or ciphertext

    Returns BAD_FUNC_ARG when ctx is NULL, otherwise the value returned by
    wc_Poly1305Update.
 */
int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz, word64 dataSz)
{
    word64 lengths[2];

    if (ctx == NULL) {
        return BAD_FUNC_ARG;
    }

    lengths[0] = aadSz;
    lengths[1] = dataSz;
#ifdef BIG_ENDIAN_ORDER
    /* swap in place so the bytes handed to the update are reversed */
    lengths[0] = ByteReverseWord64(lengths[0]);
    lengths[1] = ByteReverseWord64(lengths[1]);
#endif

    return wc_Poly1305Update(ctx, (byte *)lengths, sizeof(lengths));
}
1018
#endif
1019
1020
/*  Takes in an initialized Poly1305 struct that has a key loaded and creates
1021
    a MAC (tag) using recent TLS AEAD padding scheme.
1022
    ctx        : Initialized Poly1305 struct to use
1023
    additional : Additional data to use
1024
    addSz      : Size of additional buffer
1025
    input      : Input buffer to create tag from
1026
    sz         : Size of input buffer
1027
    tag        : Buffer to hold created tag
1028
    tagSz      : Size of input tag buffer (must be at least
1029
                 WC_POLY1305_MAC_SZ(16))
1030
 */
1031
int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional, word32 addSz,
1032
                    const byte* input, word32 sz, byte* tag, word32 tagSz)
1033
3.63k
{
1034
3.63k
    int ret;
1035
1036
    /* sanity check on arguments */
1037
3.63k
    if (ctx == NULL || input == NULL || tag == NULL ||
1038
3.63k
                                                   tagSz < WC_POLY1305_MAC_SZ) {
1039
0
        return BAD_FUNC_ARG;
1040
0
    }
1041
1042
    /* additional allowed to be 0 */
1043
3.63k
    if (addSz > 0) {
1044
3.63k
        if (additional == NULL)
1045
0
            return BAD_FUNC_ARG;
1046
1047
        /* additional data plus padding */
1048
3.63k
        if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
1049
0
            return ret;
1050
0
        }
1051
        /* pad additional data */
1052
3.63k
        if ((ret = wc_Poly1305_Pad(ctx, addSz)) != 0) {
1053
0
            return ret;
1054
0
        }
1055
3.63k
    }
1056
1057
    /* input plus padding */
1058
3.63k
    if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
1059
0
        return ret;
1060
0
    }
1061
    /* pad input data */
1062
3.63k
    if ((ret = wc_Poly1305_Pad(ctx, sz)) != 0) {
1063
0
        return ret;
1064
0
    }
1065
1066
    /* encode size of AAD and input data as little endian 64 bit types */
1067
3.63k
    if ((ret = wc_Poly1305_EncodeSizes(ctx, addSz, sz)) != 0) {
1068
0
        return ret;
1069
0
    }
1070
1071
    /* Finalize the auth tag */
1072
3.63k
    ret = wc_Poly1305Final(ctx, tag);
1073
1074
3.63k
    return ret;
1075
1076
3.63k
}
1077
#endif /* HAVE_POLY1305 */