Coverage Report

Created: 2024-05-21 06:33

/src/openssl/crypto/sha/keccak1600.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
#include <openssl/e_os2.h>
11
#include <string.h>
12
#include <assert.h>
13
14
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
15
                   size_t r);
16
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
17
18
#if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
19
20
/*
21
 * Choose some sensible defaults
22
 */
23
#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
24
    !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
25
# define KECCAK_2X      /* default to KECCAK_2X variant */
26
#endif
27
28
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
29
    (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
30
    defined(__mips) || defined(__riscv) || defined(__s390__) || \
31
    defined(__EMSCRIPTEN__)
32
/*
33
 * These don't have "and with complement" instruction, so minimize amount
34
 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
35
 */
36
# define KECCAK_COMPLEMENTING_TRANSFORM
37
#endif
38
39
#if defined(__x86_64__) || defined(__aarch64__) || \
40
    defined(__mips64) || defined(__ia64) || \
41
    (defined(__VMS) && !defined(__vax))
42
/*
43
 * These are available even in ILP32 flavours, but even then they are
44
 * capable of performing 64-bit operations as efficiently as in *P64.
45
 * Since it's not given that we can use sizeof(void *), just shunt it.
46
 */
47
787M
# define BIT_INTERLEAVE (0)
48
#else
49
# define BIT_INTERLEAVE (sizeof(void *) < 8)
50
#endif
51
52
0
#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
53
54
static uint64_t ROL64(uint64_t val, int offset)
55
774M
{
56
774M
    if (offset == 0) {
57
0
        return val;
58
774M
    } else if (!BIT_INTERLEAVE) {
59
774M
        return (val << offset) | (val >> (64-offset));
60
774M
    } else {
61
0
        uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
62
63
0
        if (offset & 1) {
64
0
            uint32_t tmp = hi;
65
66
0
            offset >>= 1;
67
0
            hi = ROL32(lo, offset);
68
0
            lo = ROL32(tmp, offset + 1);
69
0
        } else {
70
0
            offset >>= 1;
71
0
            lo = ROL32(lo, offset);
72
0
            hi = ROL32(hi, offset);
73
0
        }
74
75
0
        return ((uint64_t)hi << 32) | lo;
76
0
    }
77
774M
}
78
79
static const unsigned char rhotates[5][5] = {
80
    {  0,  1, 62, 28, 27 },
81
    { 36, 44,  6, 55, 20 },
82
    {  3, 10, 43, 25, 39 },
83
    { 41, 45, 15, 21,  8 },
84
    { 18,  2, 61, 56, 14 }
85
};
86
87
static const uint64_t iotas[] = {
88
    BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
89
    BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
90
    BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
91
    BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
92
    BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
93
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
94
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
95
    BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
96
    BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
97
    BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
98
    BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
99
    BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
100
    BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
101
    BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
102
    BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
103
    BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
104
    BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
105
    BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
106
    BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
107
    BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
108
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
109
    BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
110
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
111
    BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
112
};
113
114
#if defined(KECCAK_REF)
115
/*
116
 * This is straightforward or "maximum clarity" implementation aiming
117
 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
118
 * Permutation-Based Hash and Extendible-Output Functions" as much as
119
 * possible. With one caveat. Because of the way C stores matrices,
120
 * references to A[x,y] in the specification are presented as A[y][x].
121
 * Implementation unrolls inner x-loops so that modulo 5 operations are
122
 * explicitly pre-computed.
123
 */
124
static void Theta(uint64_t A[5][5])
125
{
126
    uint64_t C[5], D[5];
127
    size_t y;
128
129
    C[0] = A[0][0];
130
    C[1] = A[0][1];
131
    C[2] = A[0][2];
132
    C[3] = A[0][3];
133
    C[4] = A[0][4];
134
135
    for (y = 1; y < 5; y++) {
136
        C[0] ^= A[y][0];
137
        C[1] ^= A[y][1];
138
        C[2] ^= A[y][2];
139
        C[3] ^= A[y][3];
140
        C[4] ^= A[y][4];
141
    }
142
143
    D[0] = ROL64(C[1], 1) ^ C[4];
144
    D[1] = ROL64(C[2], 1) ^ C[0];
145
    D[2] = ROL64(C[3], 1) ^ C[1];
146
    D[3] = ROL64(C[4], 1) ^ C[2];
147
    D[4] = ROL64(C[0], 1) ^ C[3];
148
149
    for (y = 0; y < 5; y++) {
150
        A[y][0] ^= D[0];
151
        A[y][1] ^= D[1];
152
        A[y][2] ^= D[2];
153
        A[y][3] ^= D[3];
154
        A[y][4] ^= D[4];
155
    }
156
}
157
158
static void Rho(uint64_t A[5][5])
159
{
160
    size_t y;
161
162
    for (y = 0; y < 5; y++) {
163
        A[y][0] = ROL64(A[y][0], rhotates[y][0]);
164
        A[y][1] = ROL64(A[y][1], rhotates[y][1]);
165
        A[y][2] = ROL64(A[y][2], rhotates[y][2]);
166
        A[y][3] = ROL64(A[y][3], rhotates[y][3]);
167
        A[y][4] = ROL64(A[y][4], rhotates[y][4]);
168
    }
169
}
170
171
static void Pi(uint64_t A[5][5])
172
{
173
    uint64_t T[5][5];
174
175
    /*
176
     * T = A
177
     * A[y][x] = T[x][(3*y+x)%5]
178
     */
179
    memcpy(T, A, sizeof(T));
180
181
    A[0][0] = T[0][0];
182
    A[0][1] = T[1][1];
183
    A[0][2] = T[2][2];
184
    A[0][3] = T[3][3];
185
    A[0][4] = T[4][4];
186
187
    A[1][0] = T[0][3];
188
    A[1][1] = T[1][4];
189
    A[1][2] = T[2][0];
190
    A[1][3] = T[3][1];
191
    A[1][4] = T[4][2];
192
193
    A[2][0] = T[0][1];
194
    A[2][1] = T[1][2];
195
    A[2][2] = T[2][3];
196
    A[2][3] = T[3][4];
197
    A[2][4] = T[4][0];
198
199
    A[3][0] = T[0][4];
200
    A[3][1] = T[1][0];
201
    A[3][2] = T[2][1];
202
    A[3][3] = T[3][2];
203
    A[3][4] = T[4][3];
204
205
    A[4][0] = T[0][2];
206
    A[4][1] = T[1][3];
207
    A[4][2] = T[2][4];
208
    A[4][3] = T[3][0];
209
    A[4][4] = T[4][1];
210
}
211
212
static void Chi(uint64_t A[5][5])
213
{
214
    uint64_t C[5];
215
    size_t y;
216
217
    for (y = 0; y < 5; y++) {
218
        C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
219
        C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
220
        C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
221
        C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
222
        C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
223
224
        A[y][0] = C[0];
225
        A[y][1] = C[1];
226
        A[y][2] = C[2];
227
        A[y][3] = C[3];
228
        A[y][4] = C[4];
229
    }
230
}
231
232
static void Iota(uint64_t A[5][5], size_t i)
233
{
234
    assert(i < (sizeof(iotas) / sizeof(iotas[0])));
235
    A[0][0] ^= iotas[i];
236
}
237
238
static void KeccakF1600(uint64_t A[5][5])
239
{
240
    size_t i;
241
242
    for (i = 0; i < 24; i++) {
243
        Theta(A);
244
        Rho(A);
245
        Pi(A);
246
        Chi(A);
247
        Iota(A, i);
248
    }
249
}
250
251
#elif defined(KECCAK_1X)
252
/*
253
 * This implementation is optimization of above code featuring unroll
254
 * of even y-loops, their fusion and code motion. It also minimizes
255
 * temporary storage. Compiler would normally do all these things for
256
 * you, purpose of manual optimization is to provide "unobscured"
257
 * reference for assembly implementation [in case this approach is
258
 * chosen for implementation on some platform]. In the nutshell it's
259
 * equivalent of "plane-per-plane processing" approach discussed in
260
 * section 2.4 of "Keccak implementation overview".
261
 */
262
static void Round(uint64_t A[5][5], size_t i)
263
{
264
    uint64_t C[5], E[2];        /* registers */
265
    uint64_t D[5], T[2][5];     /* memory    */
266
267
    assert(i < (sizeof(iotas) / sizeof(iotas[0])));
268
269
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
270
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
271
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
272
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
273
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
274
275
#if defined(__arm__)
276
    D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
277
    D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
278
    D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
279
    D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
280
    D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
281
282
    T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
283
    T[0][1] = A[0][1] ^ E[0]; /* D[1] */
284
    T[0][2] = A[0][2] ^ C[1]; /* D[2] */
285
    T[0][3] = A[0][3] ^ C[2]; /* D[3] */
286
    T[0][4] = A[0][4] ^ E[1]; /* D[4] */
287
288
    C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
289
    C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
290
    C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
291
    C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
292
    C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
293
#else
294
    D[0] = ROL64(C[1], 1) ^ C[4];
295
    D[1] = ROL64(C[2], 1) ^ C[0];
296
    D[2] = ROL64(C[3], 1) ^ C[1];
297
    D[3] = ROL64(C[4], 1) ^ C[2];
298
    D[4] = ROL64(C[0], 1) ^ C[3];
299
300
    T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
301
    T[0][1] = A[0][1] ^ D[1];
302
    T[0][2] = A[0][2] ^ D[2];
303
    T[0][3] = A[0][3] ^ D[3];
304
    T[0][4] = A[0][4] ^ D[4];
305
306
    C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
307
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
308
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
309
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
310
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
311
#endif
312
    A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
313
    A[0][1] = C[1] ^ (~C[2] & C[3]);
314
    A[0][2] = C[2] ^ (~C[3] & C[4]);
315
    A[0][3] = C[3] ^ (~C[4] & C[0]);
316
    A[0][4] = C[4] ^ (~C[0] & C[1]);
317
318
    T[1][0] = A[1][0] ^ (C[3] = D[0]);
319
    T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
320
    T[1][2] = A[1][2] ^ (E[0] = D[2]);
321
    T[1][3] = A[1][3] ^ (E[1] = D[3]);
322
    T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
323
324
    C[0] = ROL64(T[0][3],        rhotates[0][3]);
325
    C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]);   /* D[4] */
326
    C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]);   /* D[0] */
327
    C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]);   /* D[1] */
328
    C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]);   /* D[2] */
329
330
    A[1][0] = C[0] ^ (~C[1] & C[2]);
331
    A[1][1] = C[1] ^ (~C[2] & C[3]);
332
    A[1][2] = C[2] ^ (~C[3] & C[4]);
333
    A[1][3] = C[3] ^ (~C[4] & C[0]);
334
    A[1][4] = C[4] ^ (~C[0] & C[1]);
335
336
    C[0] = ROL64(T[0][1],        rhotates[0][1]);
337
    C[1] = ROL64(T[1][2],        rhotates[1][2]);
338
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
339
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
340
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
341
342
    A[2][0] = C[0] ^ (~C[1] & C[2]);
343
    A[2][1] = C[1] ^ (~C[2] & C[3]);
344
    A[2][2] = C[2] ^ (~C[3] & C[4]);
345
    A[2][3] = C[3] ^ (~C[4] & C[0]);
346
    A[2][4] = C[4] ^ (~C[0] & C[1]);
347
348
    C[0] = ROL64(T[0][4],        rhotates[0][4]);
349
    C[1] = ROL64(T[1][0],        rhotates[1][0]);
350
    C[2] = ROL64(T[1][1],        rhotates[2][1]); /* originally A[2][1] */
351
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
352
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
353
354
    A[3][0] = C[0] ^ (~C[1] & C[2]);
355
    A[3][1] = C[1] ^ (~C[2] & C[3]);
356
    A[3][2] = C[2] ^ (~C[3] & C[4]);
357
    A[3][3] = C[3] ^ (~C[4] & C[0]);
358
    A[3][4] = C[4] ^ (~C[0] & C[1]);
359
360
    C[0] = ROL64(T[0][2],        rhotates[0][2]);
361
    C[1] = ROL64(T[1][3],        rhotates[1][3]);
362
    C[2] = ROL64(T[1][4],        rhotates[2][4]); /* originally A[2][4] */
363
    C[3] = ROL64(T[0][0],        rhotates[3][0]); /* originally A[3][0] */
364
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
365
366
    A[4][0] = C[0] ^ (~C[1] & C[2]);
367
    A[4][1] = C[1] ^ (~C[2] & C[3]);
368
    A[4][2] = C[2] ^ (~C[3] & C[4]);
369
    A[4][3] = C[3] ^ (~C[4] & C[0]);
370
    A[4][4] = C[4] ^ (~C[0] & C[1]);
371
}
372
373
static void KeccakF1600(uint64_t A[5][5])
374
{
375
    size_t i;
376
377
    for (i = 0; i < 24; i++) {
378
        Round(A, i);
379
    }
380
}
381
382
#elif defined(KECCAK_1X_ALT)
383
/*
384
 * This is variant of above KECCAK_1X that reduces requirement for
385
 * temporary storage even further, but at cost of more updates to A[][].
386
 * It's less suitable if A[][] is memory bound, but better if it's
387
 * register bound.
388
 */
389
390
static void Round(uint64_t A[5][5], size_t i)
391
{
392
    uint64_t C[5], D[5];
393
394
    assert(i < (sizeof(iotas) / sizeof(iotas[0])));
395
396
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
397
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
398
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
399
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
400
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
401
402
    D[1] = C[0] ^  ROL64(C[2], 1);
403
    D[2] = C[1] ^  ROL64(C[3], 1);
404
    D[3] = C[2] ^= ROL64(C[4], 1);
405
    D[4] = C[3] ^= ROL64(C[0], 1);
406
    D[0] = C[4] ^= ROL64(C[1], 1);
407
408
    A[0][1] ^= D[1];
409
    A[1][1] ^= D[1];
410
    A[2][1] ^= D[1];
411
    A[3][1] ^= D[1];
412
    A[4][1] ^= D[1];
413
414
    A[0][2] ^= D[2];
415
    A[1][2] ^= D[2];
416
    A[2][2] ^= D[2];
417
    A[3][2] ^= D[2];
418
    A[4][2] ^= D[2];
419
420
    A[0][3] ^= C[2];
421
    A[1][3] ^= C[2];
422
    A[2][3] ^= C[2];
423
    A[3][3] ^= C[2];
424
    A[4][3] ^= C[2];
425
426
    A[0][4] ^= C[3];
427
    A[1][4] ^= C[3];
428
    A[2][4] ^= C[3];
429
    A[3][4] ^= C[3];
430
    A[4][4] ^= C[3];
431
432
    A[0][0] ^= C[4];
433
    A[1][0] ^= C[4];
434
    A[2][0] ^= C[4];
435
    A[3][0] ^= C[4];
436
    A[4][0] ^= C[4];
437
438
    C[1] = A[0][1];
439
    C[2] = A[0][2];
440
    C[3] = A[0][3];
441
    C[4] = A[0][4];
442
443
    A[0][1] = ROL64(A[1][1], rhotates[1][1]);
444
    A[0][2] = ROL64(A[2][2], rhotates[2][2]);
445
    A[0][3] = ROL64(A[3][3], rhotates[3][3]);
446
    A[0][4] = ROL64(A[4][4], rhotates[4][4]);
447
448
    A[1][1] = ROL64(A[1][4], rhotates[1][4]);
449
    A[2][2] = ROL64(A[2][3], rhotates[2][3]);
450
    A[3][3] = ROL64(A[3][2], rhotates[3][2]);
451
    A[4][4] = ROL64(A[4][1], rhotates[4][1]);
452
453
    A[1][4] = ROL64(A[4][2], rhotates[4][2]);
454
    A[2][3] = ROL64(A[3][4], rhotates[3][4]);
455
    A[3][2] = ROL64(A[2][1], rhotates[2][1]);
456
    A[4][1] = ROL64(A[1][3], rhotates[1][3]);
457
458
    A[4][2] = ROL64(A[2][4], rhotates[2][4]);
459
    A[3][4] = ROL64(A[4][3], rhotates[4][3]);
460
    A[2][1] = ROL64(A[1][2], rhotates[1][2]);
461
    A[1][3] = ROL64(A[3][1], rhotates[3][1]);
462
463
    A[2][4] = ROL64(A[4][0], rhotates[4][0]);
464
    A[4][3] = ROL64(A[3][0], rhotates[3][0]);
465
    A[1][2] = ROL64(A[2][0], rhotates[2][0]);
466
    A[3][1] = ROL64(A[1][0], rhotates[1][0]);
467
468
    A[1][0] = ROL64(C[3],    rhotates[0][3]);
469
    A[2][0] = ROL64(C[1],    rhotates[0][1]);
470
    A[3][0] = ROL64(C[4],    rhotates[0][4]);
471
    A[4][0] = ROL64(C[2],    rhotates[0][2]);
472
473
    C[0] = A[0][0];
474
    C[1] = A[1][0];
475
    D[0] = A[0][1];
476
    D[1] = A[1][1];
477
478
    A[0][0] ^= (~A[0][1] & A[0][2]);
479
    A[1][0] ^= (~A[1][1] & A[1][2]);
480
    A[0][1] ^= (~A[0][2] & A[0][3]);
481
    A[1][1] ^= (~A[1][2] & A[1][3]);
482
    A[0][2] ^= (~A[0][3] & A[0][4]);
483
    A[1][2] ^= (~A[1][3] & A[1][4]);
484
    A[0][3] ^= (~A[0][4] & C[0]);
485
    A[1][3] ^= (~A[1][4] & C[1]);
486
    A[0][4] ^= (~C[0]    & D[0]);
487
    A[1][4] ^= (~C[1]    & D[1]);
488
489
    C[2] = A[2][0];
490
    C[3] = A[3][0];
491
    D[2] = A[2][1];
492
    D[3] = A[3][1];
493
494
    A[2][0] ^= (~A[2][1] & A[2][2]);
495
    A[3][0] ^= (~A[3][1] & A[3][2]);
496
    A[2][1] ^= (~A[2][2] & A[2][3]);
497
    A[3][1] ^= (~A[3][2] & A[3][3]);
498
    A[2][2] ^= (~A[2][3] & A[2][4]);
499
    A[3][2] ^= (~A[3][3] & A[3][4]);
500
    A[2][3] ^= (~A[2][4] & C[2]);
501
    A[3][3] ^= (~A[3][4] & C[3]);
502
    A[2][4] ^= (~C[2]    & D[2]);
503
    A[3][4] ^= (~C[3]    & D[3]);
504
505
    C[4] = A[4][0];
506
    D[4] = A[4][1];
507
508
    A[4][0] ^= (~A[4][1] & A[4][2]);
509
    A[4][1] ^= (~A[4][2] & A[4][3]);
510
    A[4][2] ^= (~A[4][3] & A[4][4]);
511
    A[4][3] ^= (~A[4][4] & C[4]);
512
    A[4][4] ^= (~C[4]    & D[4]);
513
    A[0][0] ^= iotas[i];
514
}
515
516
static void KeccakF1600(uint64_t A[5][5])
517
{
518
    size_t i;
519
520
    for (i = 0; i < 24; i++) {
521
        Round(A, i);
522
    }
523
}
524
525
#elif defined(KECCAK_2X)
526
/*
527
 * This implementation is variant of KECCAK_1X above with outer-most
528
 * round loop unrolled twice. This allows to take temporary storage
529
 * out of round procedure and simplify references to it by alternating
530
 * it with actual data (see round loop below). Originally it was meant
531
 * rather as reference for an assembly implementation, but it seems to
532
 * play best with compilers [as well as provide best instruction per
533
 * processed byte ratio at minimal round unroll factor]...
534
 */
535
static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
536
26.7M
{
537
26.7M
    uint64_t C[5], D[5];
538
539
26.7M
    assert(i < (sizeof(iotas) / sizeof(iotas[0])));
540
541
26.7M
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
542
26.7M
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
543
26.7M
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
544
26.7M
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
545
26.7M
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
546
547
26.7M
    D[0] = ROL64(C[1], 1) ^ C[4];
548
26.7M
    D[1] = ROL64(C[2], 1) ^ C[0];
549
26.7M
    D[2] = ROL64(C[3], 1) ^ C[1];
550
26.7M
    D[3] = ROL64(C[4], 1) ^ C[2];
551
26.7M
    D[4] = ROL64(C[0], 1) ^ C[3];
552
553
26.7M
    C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
554
26.7M
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
555
26.7M
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
556
26.7M
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
557
26.7M
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
558
559
26.7M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
560
26.7M
    R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
561
26.7M
    R[0][1] = C[1] ^ (~C[2] | C[3]);
562
26.7M
    R[0][2] = C[2] ^ ( C[3] & C[4]);
563
26.7M
    R[0][3] = C[3] ^ ( C[4] | C[0]);
564
26.7M
    R[0][4] = C[4] ^ ( C[0] & C[1]);
565
#else
566
    R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
567
    R[0][1] = C[1] ^ (~C[2] & C[3]);
568
    R[0][2] = C[2] ^ (~C[3] & C[4]);
569
    R[0][3] = C[3] ^ (~C[4] & C[0]);
570
    R[0][4] = C[4] ^ (~C[0] & C[1]);
571
#endif
572
573
26.7M
    C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
574
26.7M
    C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
575
26.7M
    C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
576
26.7M
    C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
577
26.7M
    C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
578
579
26.7M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
580
26.7M
    R[1][0] = C[0] ^ (C[1] |  C[2]);
581
26.7M
    R[1][1] = C[1] ^ (C[2] &  C[3]);
582
26.7M
    R[1][2] = C[2] ^ (C[3] | ~C[4]);
583
26.7M
    R[1][3] = C[3] ^ (C[4] |  C[0]);
584
26.7M
    R[1][4] = C[4] ^ (C[0] &  C[1]);
585
#else
586
    R[1][0] = C[0] ^ (~C[1] & C[2]);
587
    R[1][1] = C[1] ^ (~C[2] & C[3]);
588
    R[1][2] = C[2] ^ (~C[3] & C[4]);
589
    R[1][3] = C[3] ^ (~C[4] & C[0]);
590
    R[1][4] = C[4] ^ (~C[0] & C[1]);
591
#endif
592
593
26.7M
    C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
594
26.7M
    C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
595
26.7M
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
596
26.7M
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
597
26.7M
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
598
599
26.7M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
600
26.7M
    R[2][0] =  C[0] ^ ( C[1] | C[2]);
601
26.7M
    R[2][1] =  C[1] ^ ( C[2] & C[3]);
602
26.7M
    R[2][2] =  C[2] ^ (~C[3] & C[4]);
603
26.7M
    R[2][3] = ~C[3] ^ ( C[4] | C[0]);
604
26.7M
    R[2][4] =  C[4] ^ ( C[0] & C[1]);
605
#else
606
    R[2][0] = C[0] ^ (~C[1] & C[2]);
607
    R[2][1] = C[1] ^ (~C[2] & C[3]);
608
    R[2][2] = C[2] ^ (~C[3] & C[4]);
609
    R[2][3] = C[3] ^ (~C[4] & C[0]);
610
    R[2][4] = C[4] ^ (~C[0] & C[1]);
611
#endif
612
613
26.7M
    C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
614
26.7M
    C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
615
26.7M
    C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
616
26.7M
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
617
26.7M
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
618
619
26.7M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
620
26.7M
    R[3][0] =  C[0] ^ ( C[1] & C[2]);
621
26.7M
    R[3][1] =  C[1] ^ ( C[2] | C[3]);
622
26.7M
    R[3][2] =  C[2] ^ (~C[3] | C[4]);
623
26.7M
    R[3][3] = ~C[3] ^ ( C[4] & C[0]);
624
26.7M
    R[3][4] =  C[4] ^ ( C[0] | C[1]);
625
#else
626
    R[3][0] = C[0] ^ (~C[1] & C[2]);
627
    R[3][1] = C[1] ^ (~C[2] & C[3]);
628
    R[3][2] = C[2] ^ (~C[3] & C[4]);
629
    R[3][3] = C[3] ^ (~C[4] & C[0]);
630
    R[3][4] = C[4] ^ (~C[0] & C[1]);
631
#endif
632
633
26.7M
    C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
634
26.7M
    C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
635
26.7M
    C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
636
26.7M
    C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
637
26.7M
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
638
639
26.7M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
640
26.7M
    R[4][0] =  C[0] ^ (~C[1] & C[2]);
641
26.7M
    R[4][1] = ~C[1] ^ ( C[2] | C[3]);
642
26.7M
    R[4][2] =  C[2] ^ ( C[3] & C[4]);
643
26.7M
    R[4][3] =  C[3] ^ ( C[4] | C[0]);
644
26.7M
    R[4][4] =  C[4] ^ ( C[0] & C[1]);
645
#else
646
    R[4][0] = C[0] ^ (~C[1] & C[2]);
647
    R[4][1] = C[1] ^ (~C[2] & C[3]);
648
    R[4][2] = C[2] ^ (~C[3] & C[4]);
649
    R[4][3] = C[3] ^ (~C[4] & C[0]);
650
    R[4][4] = C[4] ^ (~C[0] & C[1]);
651
#endif
652
26.7M
}
653
654
static void KeccakF1600(uint64_t A[5][5])
655
1.11M
{
656
1.11M
    uint64_t T[5][5];
657
1.11M
    size_t i;
658
659
1.11M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
660
1.11M
    A[0][1] = ~A[0][1];
661
1.11M
    A[0][2] = ~A[0][2];
662
1.11M
    A[1][3] = ~A[1][3];
663
1.11M
    A[2][2] = ~A[2][2];
664
1.11M
    A[3][2] = ~A[3][2];
665
1.11M
    A[4][0] = ~A[4][0];
666
1.11M
#endif
667
668
14.4M
    for (i = 0; i < 24; i += 2) {
669
13.3M
        Round(T, A, i);
670
13.3M
        Round(A, T, i + 1);
671
13.3M
    }
672
673
1.11M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
674
1.11M
    A[0][1] = ~A[0][1];
675
1.11M
    A[0][2] = ~A[0][2];
676
1.11M
    A[1][3] = ~A[1][3];
677
1.11M
    A[2][2] = ~A[2][2];
678
1.11M
    A[3][2] = ~A[3][2];
679
1.11M
    A[4][0] = ~A[4][0];
680
1.11M
#endif
681
1.11M
}
682
683
#else   /* define KECCAK_INPLACE to compile this code path */
684
/*
685
 * This implementation is KECCAK_1X from above combined 4 times with
686
 * a twist that allows to omit temporary storage and perform in-place
687
 * processing. It's discussed in section 2.5 of "Keccak implementation
688
 * overview". It's likely to be best suited for processors with large
689
 * register bank... On the other hand processor with large register
690
 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
691
 * more compact...
692
 */
693
static void FourRounds(uint64_t A[5][5], size_t i)
694
{
695
    uint64_t B[5], C[5], D[5];
696
697
    assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
698
699
    /* Round 4*n */
700
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
701
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
702
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
703
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
704
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
705
706
    D[0] = ROL64(C[1], 1) ^ C[4];
707
    D[1] = ROL64(C[2], 1) ^ C[0];
708
    D[2] = ROL64(C[3], 1) ^ C[1];
709
    D[3] = ROL64(C[4], 1) ^ C[2];
710
    D[4] = ROL64(C[0], 1) ^ C[3];
711
712
    B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
713
    B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
714
    B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
715
    B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
716
    B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
717
718
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
719
    C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
720
    C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
721
    C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
722
    C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
723
724
    B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
725
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
726
    B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
727
    B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
728
    B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
729
730
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
731
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
732
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
733
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
734
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
735
736
    B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
737
    B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
738
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
739
    B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
740
    B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
741
742
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
743
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
744
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
745
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
746
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
747
748
    B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
749
    B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
750
    B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
751
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
752
    B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
753
754
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
755
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
756
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
757
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
758
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
759
760
    B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
761
    B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
762
    B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
763
    B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
764
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
765
766
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
767
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
768
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
769
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
770
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
771
772
    /* Round 4*n+1 */
773
    D[0] = ROL64(C[1], 1) ^ C[4];
774
    D[1] = ROL64(C[2], 1) ^ C[0];
775
    D[2] = ROL64(C[3], 1) ^ C[1];
776
    D[3] = ROL64(C[4], 1) ^ C[2];
777
    D[4] = ROL64(C[0], 1) ^ C[3];
778
779
    B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
780
    B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
781
    B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
782
    B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
783
    B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
784
785
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
786
    C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
787
    C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
788
    C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
789
    C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
790
791
    B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
792
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
793
    B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
794
    B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
795
    B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
796
797
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
798
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
799
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
800
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
801
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
802
803
    B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
804
    B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
805
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
806
    B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
807
    B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
808
809
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
810
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
811
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
812
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
813
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
814
815
    B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
816
    B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
817
    B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
818
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
819
    B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
820
821
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
822
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
823
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
824
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
825
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
826
827
    B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
828
    B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
829
    B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
830
    B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
831
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
832
833
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
834
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
835
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
836
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
837
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
838
839
    /* Round 4*n+2 */
840
    D[0] = ROL64(C[1], 1) ^ C[4];
841
    D[1] = ROL64(C[2], 1) ^ C[0];
842
    D[2] = ROL64(C[3], 1) ^ C[1];
843
    D[3] = ROL64(C[4], 1) ^ C[2];
844
    D[4] = ROL64(C[0], 1) ^ C[3];
845
846
    B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
847
    B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
848
    B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
849
    B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
850
    B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
851
852
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
853
    C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
854
    C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
855
    C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
856
    C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
857
858
    B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
859
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
860
    B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
861
    B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
862
    B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
863
864
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
865
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
866
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
867
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
868
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
869
870
    B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
871
    B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
872
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
873
    B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
874
    B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
875
876
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
877
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
878
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
879
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
880
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
881
882
    B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
883
    B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
884
    B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
885
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
886
    B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
887
888
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
889
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
890
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
891
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
892
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
893
894
    B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
895
    B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
896
    B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
897
    B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
898
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
899
900
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
901
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
902
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
903
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
904
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
905
906
    /* Round 4*n+3 */
907
    D[0] = ROL64(C[1], 1) ^ C[4];
908
    D[1] = ROL64(C[2], 1) ^ C[0];
909
    D[2] = ROL64(C[3], 1) ^ C[1];
910
    D[3] = ROL64(C[4], 1) ^ C[2];
911
    D[4] = ROL64(C[0], 1) ^ C[3];
912
913
    B[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
914
    B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
915
    B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
916
    B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
917
    B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
918
919
    /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
920
    /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
921
    /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
922
    /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
923
    /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
924
925
    B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
926
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
927
    B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
928
    B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
929
    B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
930
931
    /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
932
    /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
933
    /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
934
    /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
935
    /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
936
937
    B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
938
    B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
939
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
940
    B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
941
    B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
942
943
    /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
944
    /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
945
    /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
946
    /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
947
    /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
948
949
    B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
950
    B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
951
    B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
952
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
953
    B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
954
955
    /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
956
    /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
957
    /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
958
    /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
959
    /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
960
961
    B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
962
    B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
963
    B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
964
    B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
965
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
966
967
    /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
968
    /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
969
    /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
970
    /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
971
    /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
972
}
973
974
static void KeccakF1600(uint64_t A[5][5])
975
{
976
    size_t i;
977
978
    for (i = 0; i < 24; i += 4) {
979
        FourRounds(A, i);
980
    }
981
}
982
983
#endif
984
985
static uint64_t BitInterleave(uint64_t Ai)
986
13.3M
{
987
13.3M
    if (BIT_INTERLEAVE) {
988
0
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
989
0
        uint32_t t0, t1;
990
991
0
        t0 = lo & 0x55555555;
992
0
        t0 |= t0 >> 1;  t0 &= 0x33333333;
993
0
        t0 |= t0 >> 2;  t0 &= 0x0f0f0f0f;
994
0
        t0 |= t0 >> 4;  t0 &= 0x00ff00ff;
995
0
        t0 |= t0 >> 8;  t0 &= 0x0000ffff;
996
997
0
        t1 = hi & 0x55555555;
998
0
        t1 |= t1 >> 1;  t1 &= 0x33333333;
999
0
        t1 |= t1 >> 2;  t1 &= 0x0f0f0f0f;
1000
0
        t1 |= t1 >> 4;  t1 &= 0x00ff00ff;
1001
0
        t1 |= t1 >> 8;  t1 <<= 16;
1002
1003
0
        lo &= 0xaaaaaaaa;
1004
0
        lo |= lo << 1;  lo &= 0xcccccccc;
1005
0
        lo |= lo << 2;  lo &= 0xf0f0f0f0;
1006
0
        lo |= lo << 4;  lo &= 0xff00ff00;
1007
0
        lo |= lo << 8;  lo >>= 16;
1008
1009
0
        hi &= 0xaaaaaaaa;
1010
0
        hi |= hi << 1;  hi &= 0xcccccccc;
1011
0
        hi |= hi << 2;  hi &= 0xf0f0f0f0;
1012
0
        hi |= hi << 4;  hi &= 0xff00ff00;
1013
0
        hi |= hi << 8;  hi &= 0xffff0000;
1014
1015
0
        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1016
0
    }
1017
1018
13.3M
    return Ai;
1019
13.3M
}
1020
1021
static uint64_t BitDeinterleave(uint64_t Ai)
1022
320
{
1023
320
    if (BIT_INTERLEAVE) {
1024
0
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1025
0
        uint32_t t0, t1;
1026
1027
0
        t0 = lo & 0x0000ffff;
1028
0
        t0 |= t0 << 8;  t0 &= 0x00ff00ff;
1029
0
        t0 |= t0 << 4;  t0 &= 0x0f0f0f0f;
1030
0
        t0 |= t0 << 2;  t0 &= 0x33333333;
1031
0
        t0 |= t0 << 1;  t0 &= 0x55555555;
1032
1033
0
        t1 = hi << 16;
1034
0
        t1 |= t1 >> 8;  t1 &= 0xff00ff00;
1035
0
        t1 |= t1 >> 4;  t1 &= 0xf0f0f0f0;
1036
0
        t1 |= t1 >> 2;  t1 &= 0xcccccccc;
1037
0
        t1 |= t1 >> 1;  t1 &= 0xaaaaaaaa;
1038
1039
0
        lo >>= 16;
1040
0
        lo |= lo << 8;  lo &= 0x00ff00ff;
1041
0
        lo |= lo << 4;  lo &= 0x0f0f0f0f;
1042
0
        lo |= lo << 2;  lo &= 0x33333333;
1043
0
        lo |= lo << 1;  lo &= 0x55555555;
1044
1045
0
        hi &= 0xffff0000;
1046
0
        hi |= hi >> 8;  hi &= 0xff00ff00;
1047
0
        hi |= hi >> 4;  hi &= 0xf0f0f0f0;
1048
0
        hi |= hi >> 2;  hi &= 0xcccccccc;
1049
0
        hi |= hi >> 1;  hi &= 0xaaaaaaaa;
1050
1051
0
        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1052
0
    }
1053
1054
320
    return Ai;
1055
320
}
1056
1057
/*
1058
 * SHA3_absorb can be called multiple times, but at each invocation
1059
 * largest multiple of |r| out of |len| bytes are processed. Then
1060
 * remaining amount of bytes is returned. This is done to spare caller
1061
 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1062
 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1063
 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1064
 * padding and intermediate sub-block buffering, byte- or bitwise, is
1065
 * caller's responsibility.
1066
 */
1067
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1068
                   size_t r)
1069
228
{
1070
228
    uint64_t *A_flat = (uint64_t *)A;
1071
228
    size_t i, w = r / 8;
1072
1073
228
    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1074
1075
1.11M
    while (len >= r) {
1076
14.4M
        for (i = 0; i < w; i++) {
1077
13.3M
            uint64_t Ai = (uint64_t)inp[0]       | (uint64_t)inp[1] << 8  |
1078
13.3M
                          (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1079
13.3M
                          (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1080
13.3M
                          (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1081
13.3M
            inp += 8;
1082
1083
13.3M
            A_flat[i] ^= BitInterleave(Ai);
1084
13.3M
        }
1085
1.11M
        KeccakF1600(A);
1086
1.11M
        len -= r;
1087
1.11M
    }
1088
1089
228
    return len;
1090
228
}
1091
1092
/*
1093
 * SHA3_squeeze may be called after SHA3_absorb to generate |out| hash value of
1094
 * |len| bytes.
1095
 * If multiple SHA3_squeeze calls are required the output length |len| must be a
1096
 * multiple of the blocksize, with |next| being 0 on the first call and 1 on
1097
 * subsequent calls. It is the callers responsibility to buffer the results.
1098
 * When only a single call to SHA3_squeeze is required, |len| can be any size
1099
 * and |next| must be 0.
1100
 */
1101
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
1102
                  int next)
1103
57
{
1104
57
    uint64_t *A_flat = (uint64_t *)A;
1105
57
    size_t i, w = r / 8;
1106
1107
57
    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1108
1109
91
    while (len != 0) {
1110
57
        if (next)
1111
0
            KeccakF1600(A);
1112
57
        next = 1;
1113
354
        for (i = 0; i < w && len != 0; i++) {
1114
320
            uint64_t Ai = BitDeinterleave(A_flat[i]);
1115
1116
320
            if (len < 8) {
1117
115
                for (i = 0; i < len; i++) {
1118
92
                    *out++ = (unsigned char)Ai;
1119
92
                    Ai >>= 8;
1120
92
                }
1121
23
                return;
1122
23
            }
1123
1124
297
            out[0] = (unsigned char)(Ai);
1125
297
            out[1] = (unsigned char)(Ai >> 8);
1126
297
            out[2] = (unsigned char)(Ai >> 16);
1127
297
            out[3] = (unsigned char)(Ai >> 24);
1128
297
            out[4] = (unsigned char)(Ai >> 32);
1129
297
            out[5] = (unsigned char)(Ai >> 40);
1130
297
            out[6] = (unsigned char)(Ai >> 48);
1131
297
            out[7] = (unsigned char)(Ai >> 56);
1132
297
            out += 8;
1133
297
            len -= 8;
1134
297
        }
1135
57
    }
1136
57
}
1137
#endif
1138
1139
#ifdef SELFTEST
1140
/*
1141
 * Post-padding one-shot implementations would look as following:
1142
 *
1143
 * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1144
 * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1145
 * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1146
 * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1147
 * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
1148
 * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
1149
 */
1150
1151
void SHA3_sponge(const unsigned char *inp, size_t len,
1152
                 unsigned char *out, size_t d, size_t r)
1153
{
1154
    uint64_t A[5][5];
1155
1156
    memset(A, 0, sizeof(A));
1157
    SHA3_absorb(A, inp, len, r);
1158
    SHA3_squeeze(A, out, d, r);
1159
}
1160
1161
# include <stdio.h>
1162
1163
int main(void)
1164
{
1165
    /*
1166
     * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1167
     */
1168
    unsigned char test[168] = { '\xf3', '\x3' };
1169
    unsigned char out[512];
1170
    size_t i;
1171
    static const unsigned char result[512] = {
1172
        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1173
        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1174
        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1175
        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1176
        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1177
        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1178
        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1179
        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1180
        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1181
        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1182
        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1183
        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1184
        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1185
        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1186
        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1187
        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1188
        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1189
        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1190
        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1191
        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1192
        0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1193
        0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1194
        0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1195
        0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1196
        0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1197
        0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1198
        0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1199
        0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1200
        0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1201
        0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1202
        0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1203
        0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1204
        0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1205
        0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1206
        0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1207
        0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1208
        0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1209
        0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1210
        0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1211
        0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1212
        0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1213
        0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1214
        0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1215
        0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1216
        0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1217
        0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1218
        0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1219
        0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1220
        0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1221
        0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1222
        0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1223
        0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1224
        0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1225
        0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1226
        0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1227
        0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1228
        0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1229
        0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1230
        0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1231
        0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1232
        0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1233
        0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1234
        0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1235
        0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1236
    };
1237
1238
    test[167] = '\x80';
1239
    SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1240
1241
    /*
1242
     * Rationale behind keeping output [formatted as below] is that
1243
     * one should be able to redirect it to a file, then copy-n-paste
1244
     * final "output val" from official example to another file, and
1245
     * compare the two with diff(1).
1246
     */
1247
    for (i = 0; i < sizeof(out);) {
1248
        printf("%02X", out[i]);
1249
        printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1250
    }
1251
1252
    if (memcmp(out, result, sizeof(out))) {
1253
        fprintf(stderr, "failure\n");
1254
        return 1;
1255
    } else {
1256
        fprintf(stderr, "success\n");
1257
        return 0;
1258
    }
1259
}
1260
#endif