Coverage Report

Created: 2026-01-07 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl/crypto/sha/keccak1600.c
Line
Count
Source
1
/*
2
 * Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
#include <openssl/e_os2.h>
11
#include <string.h>
12
#include <assert.h>
13
14
#include "internal/nelem.h"
15
16
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
17
    size_t r);
18
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
19
20
#if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21
22
/*
23
 * Choose some sensible defaults
24
 */
25
#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
26
#define KECCAK_2X /* default to KECCAK_2X variant */
27
#endif
28
29
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__EMSCRIPTEN__)
30
/*
31
 * These don't have "and with complement" instruction, so minimize amount
32
 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
33
 */
34
#define KECCAK_COMPLEMENTING_TRANSFORM
35
#endif
36
37
#if defined(__x86_64__) || defined(__aarch64__) || defined(__mips64) || defined(__ia64) || (defined(__VMS) && !defined(__vax))
38
/*
39
 * These are available even in ILP32 flavours, but even then they are
40
 * capable of performing 64-bit operations as efficiently as in *P64.
41
 * Since it's not given that we can use sizeof(void *), just shunt it.
42
 */
43
798M
#define BIT_INTERLEAVE (0)
44
#else
45
#define BIT_INTERLEAVE (sizeof(void *) < 8)
46
#endif
47
48
0
#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
49
50
static uint64_t ROL64(uint64_t val, int offset)
51
782M
{
52
782M
    if (offset == 0) {
53
0
        return val;
54
782M
    } else if (!BIT_INTERLEAVE) {
55
782M
        return (val << offset) | (val >> (64 - offset));
56
782M
    } else {
57
0
        uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
58
59
0
        if (offset & 1) {
60
0
            uint32_t tmp = hi;
61
62
0
            offset >>= 1;
63
0
            hi = ROL32(lo, offset);
64
0
            lo = ROL32(tmp, offset + 1);
65
0
        } else {
66
0
            offset >>= 1;
67
0
            lo = ROL32(lo, offset);
68
0
            hi = ROL32(hi, offset);
69
0
        }
70
71
0
        return ((uint64_t)hi << 32) | lo;
72
0
    }
73
782M
}
74
75
static const unsigned char rhotates[5][5] = {
76
    { 0, 1, 62, 28, 27 },
77
    { 36, 44, 6, 55, 20 },
78
    { 3, 10, 43, 25, 39 },
79
    { 41, 45, 15, 21, 8 },
80
    { 18, 2, 61, 56, 14 }
81
};
82
83
static const uint64_t iotas[] = {
84
    BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
85
    BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
86
    BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
87
    BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
88
    BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
89
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
90
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
91
    BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
92
    BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
93
    BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
94
    BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
95
    BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
96
    BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
97
    BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
98
    BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
99
    BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
100
    BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
101
    BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
102
    BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
103
    BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
104
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
105
    BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
106
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
107
    BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
108
};
109
110
#if defined(KECCAK_REF)
111
/*
112
 * This is straightforward or "maximum clarity" implementation aiming
113
 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
114
 * Permutation-Based Hash and Extendible-Output Functions" as much as
115
 * possible. With one caveat. Because of the way C stores matrices,
116
 * references to A[x,y] in the specification are presented as A[y][x].
117
 * Implementation unrolls inner x-loops so that modulo 5 operations are
118
 * explicitly pre-computed.
119
 */
120
static void Theta(uint64_t A[5][5])
121
{
122
    uint64_t C[5], D[5];
123
    size_t y;
124
125
    C[0] = A[0][0];
126
    C[1] = A[0][1];
127
    C[2] = A[0][2];
128
    C[3] = A[0][3];
129
    C[4] = A[0][4];
130
131
    for (y = 1; y < 5; y++) {
132
        C[0] ^= A[y][0];
133
        C[1] ^= A[y][1];
134
        C[2] ^= A[y][2];
135
        C[3] ^= A[y][3];
136
        C[4] ^= A[y][4];
137
    }
138
139
    D[0] = ROL64(C[1], 1) ^ C[4];
140
    D[1] = ROL64(C[2], 1) ^ C[0];
141
    D[2] = ROL64(C[3], 1) ^ C[1];
142
    D[3] = ROL64(C[4], 1) ^ C[2];
143
    D[4] = ROL64(C[0], 1) ^ C[3];
144
145
    for (y = 0; y < 5; y++) {
146
        A[y][0] ^= D[0];
147
        A[y][1] ^= D[1];
148
        A[y][2] ^= D[2];
149
        A[y][3] ^= D[3];
150
        A[y][4] ^= D[4];
151
    }
152
}
153
154
static void Rho(uint64_t A[5][5])
155
{
156
    size_t y;
157
158
    for (y = 0; y < 5; y++) {
159
        A[y][0] = ROL64(A[y][0], rhotates[y][0]);
160
        A[y][1] = ROL64(A[y][1], rhotates[y][1]);
161
        A[y][2] = ROL64(A[y][2], rhotates[y][2]);
162
        A[y][3] = ROL64(A[y][3], rhotates[y][3]);
163
        A[y][4] = ROL64(A[y][4], rhotates[y][4]);
164
    }
165
}
166
167
static void Pi(uint64_t A[5][5])
168
{
169
    uint64_t T[5][5];
170
171
    /*
172
     * T = A
173
     * A[y][x] = T[x][(3*y+x)%5]
174
     */
175
    memcpy(T, A, sizeof(T));
176
177
    A[0][0] = T[0][0];
178
    A[0][1] = T[1][1];
179
    A[0][2] = T[2][2];
180
    A[0][3] = T[3][3];
181
    A[0][4] = T[4][4];
182
183
    A[1][0] = T[0][3];
184
    A[1][1] = T[1][4];
185
    A[1][2] = T[2][0];
186
    A[1][3] = T[3][1];
187
    A[1][4] = T[4][2];
188
189
    A[2][0] = T[0][1];
190
    A[2][1] = T[1][2];
191
    A[2][2] = T[2][3];
192
    A[2][3] = T[3][4];
193
    A[2][4] = T[4][0];
194
195
    A[3][0] = T[0][4];
196
    A[3][1] = T[1][0];
197
    A[3][2] = T[2][1];
198
    A[3][3] = T[3][2];
199
    A[3][4] = T[4][3];
200
201
    A[4][0] = T[0][2];
202
    A[4][1] = T[1][3];
203
    A[4][2] = T[2][4];
204
    A[4][3] = T[3][0];
205
    A[4][4] = T[4][1];
206
}
207
208
static void Chi(uint64_t A[5][5])
209
{
210
    uint64_t C[5];
211
    size_t y;
212
213
    for (y = 0; y < 5; y++) {
214
        C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
215
        C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
216
        C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
217
        C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
218
        C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
219
220
        A[y][0] = C[0];
221
        A[y][1] = C[1];
222
        A[y][2] = C[2];
223
        A[y][3] = C[3];
224
        A[y][4] = C[4];
225
    }
226
}
227
228
static void Iota(uint64_t A[5][5], size_t i)
229
{
230
    assert(i < OSSL_NELEM(iotas));
231
    A[0][0] ^= iotas[i];
232
}
233
234
static void KeccakF1600(uint64_t A[5][5])
235
{
236
    size_t i;
237
238
    for (i = 0; i < 24; i++) {
239
        Theta(A);
240
        Rho(A);
241
        Pi(A);
242
        Chi(A);
243
        Iota(A, i);
244
    }
245
}
246
247
#elif defined(KECCAK_1X)
248
/*
249
 * This implementation is optimization of above code featuring unroll
250
 * of even y-loops, their fusion and code motion. It also minimizes
251
 * temporary storage. Compiler would normally do all these things for
252
 * you, purpose of manual optimization is to provide "unobscured"
253
 * reference for assembly implementation [in case this approach is
254
 * chosen for implementation on some platform]. In the nutshell it's
255
 * equivalent of "plane-per-plane processing" approach discussed in
256
 * section 2.4 of "Keccak implementation overview".
257
 */
258
static void Round(uint64_t A[5][5], size_t i)
259
{
260
    uint64_t C[5], E[2]; /* registers */
261
    uint64_t D[5], T[2][5]; /* memory    */
262
263
    assert(i < OSSL_NELEM(iotas));
264
265
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
266
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
267
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
268
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
269
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
270
271
#if defined(__arm__)
272
    D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
273
    D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
274
    D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
275
    D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
276
    D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
277
278
    T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
279
    T[0][1] = A[0][1] ^ E[0]; /* D[1] */
280
    T[0][2] = A[0][2] ^ C[1]; /* D[2] */
281
    T[0][3] = A[0][3] ^ C[2]; /* D[3] */
282
    T[0][4] = A[0][4] ^ E[1]; /* D[4] */
283
284
    C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
285
    C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
286
    C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
287
    C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
288
    C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
289
#else
290
    D[0] = ROL64(C[1], 1) ^ C[4];
291
    D[1] = ROL64(C[2], 1) ^ C[0];
292
    D[2] = ROL64(C[3], 1) ^ C[1];
293
    D[3] = ROL64(C[4], 1) ^ C[2];
294
    D[4] = ROL64(C[0], 1) ^ C[3];
295
296
    T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
297
    T[0][1] = A[0][1] ^ D[1];
298
    T[0][2] = A[0][2] ^ D[2];
299
    T[0][3] = A[0][3] ^ D[3];
300
    T[0][4] = A[0][4] ^ D[4];
301
302
    C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
303
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
304
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
305
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
306
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
307
#endif
308
    A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
309
    A[0][1] = C[1] ^ (~C[2] & C[3]);
310
    A[0][2] = C[2] ^ (~C[3] & C[4]);
311
    A[0][3] = C[3] ^ (~C[4] & C[0]);
312
    A[0][4] = C[4] ^ (~C[0] & C[1]);
313
314
    T[1][0] = A[1][0] ^ (C[3] = D[0]);
315
    T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
316
    T[1][2] = A[1][2] ^ (E[0] = D[2]);
317
    T[1][3] = A[1][3] ^ (E[1] = D[3]);
318
    T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
319
320
    C[0] = ROL64(T[0][3], rhotates[0][3]);
321
    C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
322
    C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
323
    C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
324
    C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
325
326
    A[1][0] = C[0] ^ (~C[1] & C[2]);
327
    A[1][1] = C[1] ^ (~C[2] & C[3]);
328
    A[1][2] = C[2] ^ (~C[3] & C[4]);
329
    A[1][3] = C[3] ^ (~C[4] & C[0]);
330
    A[1][4] = C[4] ^ (~C[0] & C[1]);
331
332
    C[0] = ROL64(T[0][1], rhotates[0][1]);
333
    C[1] = ROL64(T[1][2], rhotates[1][2]);
334
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
335
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
336
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
337
338
    A[2][0] = C[0] ^ (~C[1] & C[2]);
339
    A[2][1] = C[1] ^ (~C[2] & C[3]);
340
    A[2][2] = C[2] ^ (~C[3] & C[4]);
341
    A[2][3] = C[3] ^ (~C[4] & C[0]);
342
    A[2][4] = C[4] ^ (~C[0] & C[1]);
343
344
    C[0] = ROL64(T[0][4], rhotates[0][4]);
345
    C[1] = ROL64(T[1][0], rhotates[1][0]);
346
    C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
347
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
348
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
349
350
    A[3][0] = C[0] ^ (~C[1] & C[2]);
351
    A[3][1] = C[1] ^ (~C[2] & C[3]);
352
    A[3][2] = C[2] ^ (~C[3] & C[4]);
353
    A[3][3] = C[3] ^ (~C[4] & C[0]);
354
    A[3][4] = C[4] ^ (~C[0] & C[1]);
355
356
    C[0] = ROL64(T[0][2], rhotates[0][2]);
357
    C[1] = ROL64(T[1][3], rhotates[1][3]);
358
    C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
359
    C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
360
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
361
362
    A[4][0] = C[0] ^ (~C[1] & C[2]);
363
    A[4][1] = C[1] ^ (~C[2] & C[3]);
364
    A[4][2] = C[2] ^ (~C[3] & C[4]);
365
    A[4][3] = C[3] ^ (~C[4] & C[0]);
366
    A[4][4] = C[4] ^ (~C[0] & C[1]);
367
}
368
369
static void KeccakF1600(uint64_t A[5][5])
370
{
371
    size_t i;
372
373
    for (i = 0; i < 24; i++) {
374
        Round(A, i);
375
    }
376
}
377
378
#elif defined(KECCAK_1X_ALT)
379
/*
380
 * This is variant of above KECCAK_1X that reduces requirement for
381
 * temporary storage even further, but at cost of more updates to A[][].
382
 * It's less suitable if A[][] is memory bound, but better if it's
383
 * register bound.
384
 */
385
386
static void Round(uint64_t A[5][5], size_t i)
387
{
388
    uint64_t C[5], D[5];
389
390
    assert(i < OSSL_NELEM(iotas));
391
392
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
393
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
394
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
395
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
396
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
397
398
    D[1] = C[0] ^ ROL64(C[2], 1);
399
    D[2] = C[1] ^ ROL64(C[3], 1);
400
    D[3] = C[2] ^= ROL64(C[4], 1);
401
    D[4] = C[3] ^= ROL64(C[0], 1);
402
    D[0] = C[4] ^= ROL64(C[1], 1);
403
404
    A[0][1] ^= D[1];
405
    A[1][1] ^= D[1];
406
    A[2][1] ^= D[1];
407
    A[3][1] ^= D[1];
408
    A[4][1] ^= D[1];
409
410
    A[0][2] ^= D[2];
411
    A[1][2] ^= D[2];
412
    A[2][2] ^= D[2];
413
    A[3][2] ^= D[2];
414
    A[4][2] ^= D[2];
415
416
    A[0][3] ^= C[2];
417
    A[1][3] ^= C[2];
418
    A[2][3] ^= C[2];
419
    A[3][3] ^= C[2];
420
    A[4][3] ^= C[2];
421
422
    A[0][4] ^= C[3];
423
    A[1][4] ^= C[3];
424
    A[2][4] ^= C[3];
425
    A[3][4] ^= C[3];
426
    A[4][4] ^= C[3];
427
428
    A[0][0] ^= C[4];
429
    A[1][0] ^= C[4];
430
    A[2][0] ^= C[4];
431
    A[3][0] ^= C[4];
432
    A[4][0] ^= C[4];
433
434
    C[1] = A[0][1];
435
    C[2] = A[0][2];
436
    C[3] = A[0][3];
437
    C[4] = A[0][4];
438
439
    A[0][1] = ROL64(A[1][1], rhotates[1][1]);
440
    A[0][2] = ROL64(A[2][2], rhotates[2][2]);
441
    A[0][3] = ROL64(A[3][3], rhotates[3][3]);
442
    A[0][4] = ROL64(A[4][4], rhotates[4][4]);
443
444
    A[1][1] = ROL64(A[1][4], rhotates[1][4]);
445
    A[2][2] = ROL64(A[2][3], rhotates[2][3]);
446
    A[3][3] = ROL64(A[3][2], rhotates[3][2]);
447
    A[4][4] = ROL64(A[4][1], rhotates[4][1]);
448
449
    A[1][4] = ROL64(A[4][2], rhotates[4][2]);
450
    A[2][3] = ROL64(A[3][4], rhotates[3][4]);
451
    A[3][2] = ROL64(A[2][1], rhotates[2][1]);
452
    A[4][1] = ROL64(A[1][3], rhotates[1][3]);
453
454
    A[4][2] = ROL64(A[2][4], rhotates[2][4]);
455
    A[3][4] = ROL64(A[4][3], rhotates[4][3]);
456
    A[2][1] = ROL64(A[1][2], rhotates[1][2]);
457
    A[1][3] = ROL64(A[3][1], rhotates[3][1]);
458
459
    A[2][4] = ROL64(A[4][0], rhotates[4][0]);
460
    A[4][3] = ROL64(A[3][0], rhotates[3][0]);
461
    A[1][2] = ROL64(A[2][0], rhotates[2][0]);
462
    A[3][1] = ROL64(A[1][0], rhotates[1][0]);
463
464
    A[1][0] = ROL64(C[3], rhotates[0][3]);
465
    A[2][0] = ROL64(C[1], rhotates[0][1]);
466
    A[3][0] = ROL64(C[4], rhotates[0][4]);
467
    A[4][0] = ROL64(C[2], rhotates[0][2]);
468
469
    C[0] = A[0][0];
470
    C[1] = A[1][0];
471
    D[0] = A[0][1];
472
    D[1] = A[1][1];
473
474
    A[0][0] ^= (~A[0][1] & A[0][2]);
475
    A[1][0] ^= (~A[1][1] & A[1][2]);
476
    A[0][1] ^= (~A[0][2] & A[0][3]);
477
    A[1][1] ^= (~A[1][2] & A[1][3]);
478
    A[0][2] ^= (~A[0][3] & A[0][4]);
479
    A[1][2] ^= (~A[1][3] & A[1][4]);
480
    A[0][3] ^= (~A[0][4] & C[0]);
481
    A[1][3] ^= (~A[1][4] & C[1]);
482
    A[0][4] ^= (~C[0] & D[0]);
483
    A[1][4] ^= (~C[1] & D[1]);
484
485
    C[2] = A[2][0];
486
    C[3] = A[3][0];
487
    D[2] = A[2][1];
488
    D[3] = A[3][1];
489
490
    A[2][0] ^= (~A[2][1] & A[2][2]);
491
    A[3][0] ^= (~A[3][1] & A[3][2]);
492
    A[2][1] ^= (~A[2][2] & A[2][3]);
493
    A[3][1] ^= (~A[3][2] & A[3][3]);
494
    A[2][2] ^= (~A[2][3] & A[2][4]);
495
    A[3][2] ^= (~A[3][3] & A[3][4]);
496
    A[2][3] ^= (~A[2][4] & C[2]);
497
    A[3][3] ^= (~A[3][4] & C[3]);
498
    A[2][4] ^= (~C[2] & D[2]);
499
    A[3][4] ^= (~C[3] & D[3]);
500
501
    C[4] = A[4][0];
502
    D[4] = A[4][1];
503
504
    A[4][0] ^= (~A[4][1] & A[4][2]);
505
    A[4][1] ^= (~A[4][2] & A[4][3]);
506
    A[4][2] ^= (~A[4][3] & A[4][4]);
507
    A[4][3] ^= (~A[4][4] & C[4]);
508
    A[4][4] ^= (~C[4] & D[4]);
509
    A[0][0] ^= iotas[i];
510
}
511
512
static void KeccakF1600(uint64_t A[5][5])
513
{
514
    size_t i;
515
516
    for (i = 0; i < 24; i++) {
517
        Round(A, i);
518
    }
519
}
520
521
#elif defined(KECCAK_2X)
522
/*
523
 * This implementation is variant of KECCAK_1X above with outer-most
524
 * round loop unrolled twice. This allows to take temporary storage
525
 * out of round procedure and simplify references to it by alternating
526
 * it with actual data (see round loop below). Originally it was meant
527
 * rather as reference for an assembly implementation, but it seems to
528
 * play best with compilers [as well as provide best instruction per
529
 * processed byte ratio at minimal round unroll factor]...
530
 */
531
static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
532
26.9M
{
533
26.9M
    uint64_t C[5], D[5];
534
535
26.9M
    assert(i < OSSL_NELEM(iotas));
536
537
26.9M
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
538
26.9M
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
539
26.9M
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
540
26.9M
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
541
26.9M
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
542
543
26.9M
    D[0] = ROL64(C[1], 1) ^ C[4];
544
26.9M
    D[1] = ROL64(C[2], 1) ^ C[0];
545
26.9M
    D[2] = ROL64(C[3], 1) ^ C[1];
546
26.9M
    D[3] = ROL64(C[4], 1) ^ C[2];
547
26.9M
    D[4] = ROL64(C[0], 1) ^ C[3];
548
549
26.9M
    C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
550
26.9M
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
551
26.9M
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
552
26.9M
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
553
26.9M
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
554
555
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
556
26.9M
    R[0][0] = C[0] ^ (C[1] | C[2]) ^ iotas[i];
557
26.9M
    R[0][1] = C[1] ^ (~C[2] | C[3]);
558
26.9M
    R[0][2] = C[2] ^ (C[3] & C[4]);
559
26.9M
    R[0][3] = C[3] ^ (C[4] | C[0]);
560
26.9M
    R[0][4] = C[4] ^ (C[0] & C[1]);
561
#else
562
    R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
563
    R[0][1] = C[1] ^ (~C[2] & C[3]);
564
    R[0][2] = C[2] ^ (~C[3] & C[4]);
565
    R[0][3] = C[3] ^ (~C[4] & C[0]);
566
    R[0][4] = C[4] ^ (~C[0] & C[1]);
567
#endif
568
569
26.9M
    C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
570
26.9M
    C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
571
26.9M
    C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
572
26.9M
    C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
573
26.9M
    C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
574
575
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
576
26.9M
    R[1][0] = C[0] ^ (C[1] | C[2]);
577
26.9M
    R[1][1] = C[1] ^ (C[2] & C[3]);
578
26.9M
    R[1][2] = C[2] ^ (C[3] | ~C[4]);
579
26.9M
    R[1][3] = C[3] ^ (C[4] | C[0]);
580
26.9M
    R[1][4] = C[4] ^ (C[0] & C[1]);
581
#else
582
    R[1][0] = C[0] ^ (~C[1] & C[2]);
583
    R[1][1] = C[1] ^ (~C[2] & C[3]);
584
    R[1][2] = C[2] ^ (~C[3] & C[4]);
585
    R[1][3] = C[3] ^ (~C[4] & C[0]);
586
    R[1][4] = C[4] ^ (~C[0] & C[1]);
587
#endif
588
589
26.9M
    C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
590
26.9M
    C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
591
26.9M
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
592
26.9M
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
593
26.9M
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
594
595
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
596
26.9M
    R[2][0] = C[0] ^ (C[1] | C[2]);
597
26.9M
    R[2][1] = C[1] ^ (C[2] & C[3]);
598
26.9M
    R[2][2] = C[2] ^ (~C[3] & C[4]);
599
26.9M
    R[2][3] = ~C[3] ^ (C[4] | C[0]);
600
26.9M
    R[2][4] = C[4] ^ (C[0] & C[1]);
601
#else
602
    R[2][0] = C[0] ^ (~C[1] & C[2]);
603
    R[2][1] = C[1] ^ (~C[2] & C[3]);
604
    R[2][2] = C[2] ^ (~C[3] & C[4]);
605
    R[2][3] = C[3] ^ (~C[4] & C[0]);
606
    R[2][4] = C[4] ^ (~C[0] & C[1]);
607
#endif
608
609
26.9M
    C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
610
26.9M
    C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
611
26.9M
    C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
612
26.9M
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
613
26.9M
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
614
615
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
616
26.9M
    R[3][0] = C[0] ^ (C[1] & C[2]);
617
26.9M
    R[3][1] = C[1] ^ (C[2] | C[3]);
618
26.9M
    R[3][2] = C[2] ^ (~C[3] | C[4]);
619
26.9M
    R[3][3] = ~C[3] ^ (C[4] & C[0]);
620
26.9M
    R[3][4] = C[4] ^ (C[0] | C[1]);
621
#else
622
    R[3][0] = C[0] ^ (~C[1] & C[2]);
623
    R[3][1] = C[1] ^ (~C[2] & C[3]);
624
    R[3][2] = C[2] ^ (~C[3] & C[4]);
625
    R[3][3] = C[3] ^ (~C[4] & C[0]);
626
    R[3][4] = C[4] ^ (~C[0] & C[1]);
627
#endif
628
629
26.9M
    C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
630
26.9M
    C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
631
26.9M
    C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
632
26.9M
    C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
633
26.9M
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
634
635
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
636
26.9M
    R[4][0] = C[0] ^ (~C[1] & C[2]);
637
26.9M
    R[4][1] = ~C[1] ^ (C[2] | C[3]);
638
26.9M
    R[4][2] = C[2] ^ (C[3] & C[4]);
639
26.9M
    R[4][3] = C[3] ^ (C[4] | C[0]);
640
26.9M
    R[4][4] = C[4] ^ (C[0] & C[1]);
641
#else
642
    R[4][0] = C[0] ^ (~C[1] & C[2]);
643
    R[4][1] = C[1] ^ (~C[2] & C[3]);
644
    R[4][2] = C[2] ^ (~C[3] & C[4]);
645
    R[4][3] = C[3] ^ (~C[4] & C[0]);
646
    R[4][4] = C[4] ^ (~C[0] & C[1]);
647
#endif
648
26.9M
}
649
650
static void KeccakF1600(uint64_t A[5][5])
651
1.12M
{
652
1.12M
    uint64_t T[5][5];
653
1.12M
    size_t i;
654
655
1.12M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
656
1.12M
    A[0][1] = ~A[0][1];
657
1.12M
    A[0][2] = ~A[0][2];
658
1.12M
    A[1][3] = ~A[1][3];
659
1.12M
    A[2][2] = ~A[2][2];
660
1.12M
    A[3][2] = ~A[3][2];
661
1.12M
    A[4][0] = ~A[4][0];
662
1.12M
#endif
663
664
14.6M
    for (i = 0; i < 24; i += 2) {
665
13.4M
        Round(T, A, i);
666
13.4M
        Round(A, T, i + 1);
667
13.4M
    }
668
669
1.12M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
670
1.12M
    A[0][1] = ~A[0][1];
671
1.12M
    A[0][2] = ~A[0][2];
672
1.12M
    A[1][3] = ~A[1][3];
673
1.12M
    A[2][2] = ~A[2][2];
674
1.12M
    A[3][2] = ~A[3][2];
675
1.12M
    A[4][0] = ~A[4][0];
676
1.12M
#endif
677
1.12M
}
678
679
#else /* define KECCAK_INPLACE to compile this code path */
680
/*
681
 * This implementation is KECCAK_1X from above combined 4 times with
682
 * a twist that allows to omit temporary storage and perform in-place
683
 * processing. It's discussed in section 2.5 of "Keccak implementation
684
 * overview". It's likely to be best suited for processors with large
685
 * register bank... On the other hand processor with large register
686
 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
687
 * more compact...
688
 */
689
static void FourRounds(uint64_t A[5][5], size_t i)
690
{
691
    uint64_t B[5], C[5], D[5];
692
693
    assert(i <= OSSL_NELEM(iotas) - 4);
694
695
    /* Round 4*n */
696
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
697
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
698
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
699
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
700
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
701
702
    D[0] = ROL64(C[1], 1) ^ C[4];
703
    D[1] = ROL64(C[2], 1) ^ C[0];
704
    D[2] = ROL64(C[3], 1) ^ C[1];
705
    D[3] = ROL64(C[4], 1) ^ C[2];
706
    D[4] = ROL64(C[0], 1) ^ C[3];
707
708
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
709
    B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
710
    B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
711
    B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
712
    B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
713
714
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
715
    C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
716
    C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
717
    C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
718
    C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
719
720
    B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
721
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
722
    B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
723
    B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
724
    B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
725
726
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
727
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
728
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
729
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
730
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
731
732
    B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
733
    B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
734
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
735
    B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
736
    B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
737
738
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
739
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
740
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
741
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
742
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
743
744
    B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
745
    B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
746
    B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
747
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
748
    B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
749
750
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
751
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
752
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
753
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
754
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
755
756
    B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
757
    B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
758
    B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
759
    B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
760
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
761
762
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
763
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
764
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
765
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
766
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
767
768
    /* Round 4*n+1 */
769
    D[0] = ROL64(C[1], 1) ^ C[4];
770
    D[1] = ROL64(C[2], 1) ^ C[0];
771
    D[2] = ROL64(C[3], 1) ^ C[1];
772
    D[3] = ROL64(C[4], 1) ^ C[2];
773
    D[4] = ROL64(C[0], 1) ^ C[3];
774
775
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
776
    B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
777
    B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
778
    B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
779
    B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
780
781
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
782
    C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
783
    C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
784
    C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
785
    C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
786
787
    B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
788
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
789
    B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
790
    B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
791
    B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
792
793
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
794
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
795
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
796
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
797
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
798
799
    B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
800
    B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
801
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
802
    B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
803
    B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
804
805
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
806
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
807
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
808
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
809
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
810
811
    B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
812
    B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
813
    B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
814
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
815
    B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
816
817
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
818
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
819
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
820
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
821
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
822
823
    B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
824
    B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
825
    B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
826
    B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
827
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
828
829
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
830
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
831
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
832
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
833
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
834
835
    /* Round 4*n+2 */
836
    D[0] = ROL64(C[1], 1) ^ C[4];
837
    D[1] = ROL64(C[2], 1) ^ C[0];
838
    D[2] = ROL64(C[3], 1) ^ C[1];
839
    D[3] = ROL64(C[4], 1) ^ C[2];
840
    D[4] = ROL64(C[0], 1) ^ C[3];
841
842
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
843
    B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
844
    B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
845
    B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
846
    B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
847
848
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
849
    C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
850
    C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
851
    C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
852
    C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
853
854
    B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
855
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
856
    B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
857
    B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
858
    B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
859
860
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
861
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
862
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
863
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
864
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
865
866
    B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
867
    B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
868
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
869
    B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
870
    B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
871
872
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
873
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
874
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
875
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
876
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
877
878
    B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
879
    B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
880
    B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
881
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
882
    B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
883
884
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
885
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
886
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
887
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
888
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
889
890
    B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
891
    B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
892
    B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
893
    B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
894
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
895
896
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
897
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
898
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
899
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
900
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
901
902
    /* Round 4*n+3 */
903
    D[0] = ROL64(C[1], 1) ^ C[4];
904
    D[1] = ROL64(C[2], 1) ^ C[0];
905
    D[2] = ROL64(C[3], 1) ^ C[1];
906
    D[3] = ROL64(C[4], 1) ^ C[2];
907
    D[4] = ROL64(C[0], 1) ^ C[3];
908
909
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
910
    B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
911
    B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
912
    B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
913
    B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
914
915
    /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
916
    /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
917
    /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
918
    /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
919
    /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
920
921
    B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
922
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
923
    B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
924
    B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
925
    B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
926
927
    /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
928
    /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
929
    /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
930
    /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
931
    /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
932
933
    B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
934
    B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
935
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
936
    B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
937
    B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
938
939
    /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
940
    /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
941
    /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
942
    /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
943
    /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
944
945
    B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
946
    B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
947
    B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
948
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
949
    B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
950
951
    /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
952
    /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
953
    /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
954
    /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
955
    /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
956
957
    B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
958
    B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
959
    B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
960
    B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
961
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
962
963
    /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
964
    /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
965
    /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
966
    /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
967
    /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
968
}
969
970
/*
 * Apply the full 24-round Keccak-f[1600] permutation to the state |A|,
 * executed as six passes over the four-way unrolled round function.
 */
static void KeccakF1600(uint64_t A[5][5])
{
    size_t round;

    for (round = 0; round < 24; round += 4)
        FourRounds(A, round);
}
978
979
#endif
980
981
/*
 * Convert a 64-bit lane to bit-interleaved form: the even-indexed bits
 * of |Ai| are gathered into the low 32-bit half of the result and the
 * odd-indexed bits into the high half.  When BIT_INTERLEAVE is 0 (the
 * 64-bit targets listed earlier in this file) the whole body is dead
 * code and this is the identity function.
 * NOTE(review): the interleaved form presumably lets ROL64 be computed
 * with two independent 32-bit rotations on 32-bit platforms — confirm
 * against the ROL64 definition earlier in this file.
 */
static uint64_t BitInterleave(uint64_t Ai)
{
    if (BIT_INTERLEAVE) {
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
        uint32_t t0, t1;

        /* t0 <- even-indexed bits of |lo|, compacted into its low 16 bits */
        t0 = lo & 0x55555555;
        t0 |= t0 >> 1;
        t0 &= 0x33333333;
        t0 |= t0 >> 2;
        t0 &= 0x0f0f0f0f;
        t0 |= t0 >> 4;
        t0 &= 0x00ff00ff;
        t0 |= t0 >> 8;
        t0 &= 0x0000ffff;

        /* t1 <- even-indexed bits of |hi|, compacted, then moved to bits 16..31 */
        t1 = hi & 0x55555555;
        t1 |= t1 >> 1;
        t1 &= 0x33333333;
        t1 |= t1 >> 2;
        t1 &= 0x0f0f0f0f;
        t1 |= t1 >> 4;
        t1 &= 0x00ff00ff;
        t1 |= t1 >> 8;
        t1 <<= 16;

        /* lo <- odd-indexed bits of |lo|, compacted into its low 16 bits */
        lo &= 0xaaaaaaaa;
        lo |= lo << 1;
        lo &= 0xcccccccc;
        lo |= lo << 2;
        lo &= 0xf0f0f0f0;
        lo |= lo << 4;
        lo &= 0xff00ff00;
        lo |= lo << 8;
        lo >>= 16;

        /* hi <- odd-indexed bits of |hi|, compacted into its high 16 bits */
        hi &= 0xaaaaaaaa;
        hi |= hi << 1;
        hi &= 0xcccccccc;
        hi |= hi << 2;
        hi &= 0xf0f0f0f0;
        hi |= hi << 4;
        hi &= 0xff00ff00;
        hi |= hi << 8;
        hi &= 0xffff0000;

        /* reassemble: odd bits in the high word, even bits in the low word */
        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
    }

    return Ai;
}
1032
1033
/*
 * Inverse of BitInterleave: spread the low 32-bit half of |Ai| back
 * onto the even-indexed bit positions and the high half onto the
 * odd-indexed positions.  When BIT_INTERLEAVE is 0 this is the
 * identity function, matching BitInterleave.
 */
static uint64_t BitDeinterleave(uint64_t Ai)
{
    if (BIT_INTERLEAVE) {
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
        uint32_t t0, t1;

        /* t0 <- low 16 bits of |lo|, expanded onto even bit positions */
        t0 = lo & 0x0000ffff;
        t0 |= t0 << 8;
        t0 &= 0x00ff00ff;
        t0 |= t0 << 4;
        t0 &= 0x0f0f0f0f;
        t0 |= t0 << 2;
        t0 &= 0x33333333;
        t0 |= t0 << 1;
        t0 &= 0x55555555;

        /* t1 <- low 16 bits of |hi|, expanded onto odd bit positions */
        t1 = hi << 16;
        t1 |= t1 >> 8;
        t1 &= 0xff00ff00;
        t1 |= t1 >> 4;
        t1 &= 0xf0f0f0f0;
        t1 |= t1 >> 2;
        t1 &= 0xcccccccc;
        t1 |= t1 >> 1;
        t1 &= 0xaaaaaaaa;

        /* lo <- high 16 bits of |lo|, expanded onto even bit positions */
        lo >>= 16;
        lo |= lo << 8;
        lo &= 0x00ff00ff;
        lo |= lo << 4;
        lo &= 0x0f0f0f0f;
        lo |= lo << 2;
        lo &= 0x33333333;
        lo |= lo << 1;
        lo &= 0x55555555;

        /* hi <- high 16 bits of |hi|, expanded onto odd bit positions */
        hi &= 0xffff0000;
        hi |= hi >> 8;
        hi &= 0xff00ff00;
        hi |= hi >> 4;
        hi &= 0xf0f0f0f0;
        hi |= hi >> 2;
        hi &= 0xcccccccc;
        hi |= hi >> 1;
        hi &= 0xaaaaaaaa;

        /* reassemble the de-interleaved 64-bit lane */
        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
    }

    return Ai;
}
1084
1085
/*
1086
 * SHA3_absorb can be called multiple times, but at each invocation
1087
 * largest multiple of |r| out of |len| bytes are processed. Then
1088
 * the remaining number of bytes is returned. This is done to spare the
 * caller the trouble of calculating the largest multiple of |r|. |r| can be viewed
1090
 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1091
 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1092
 * padding and intermediate sub-block buffering, byte- or bitwise, is
1093
 * caller's responsibility.
1094
 */
1095
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
    size_t r)
{
    uint64_t *lanes = (uint64_t *)A;
    size_t lane, b, w = r / 8;

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    /* Consume whole |r|-byte blocks; the remainder is returned to the caller. */
    while (len >= r) {
        for (lane = 0; lane < w; lane++) {
            uint64_t Ai = 0;

            /* Little-endian load of the next 8 input bytes. */
            for (b = 0; b < 8; b++)
                Ai |= (uint64_t)inp[b] << (8 * b);
            inp += 8;

            lanes[lane] ^= BitInterleave(Ai);
        }
        KeccakF1600(A);
        len -= r;
    }

    return len;
}
1116
1117
/*
1118
 * SHA3_squeeze may be called after SHA3_absorb to generate |out| hash value of
1119
 * |len| bytes.
1120
 * If multiple SHA3_squeeze calls are required the output length |len| must be a
1121
 * multiple of the blocksize, with |next| being 0 on the first call and 1 on
1122
 * subsequent calls. It is the caller's responsibility to buffer the results.
1123
 * When only a single call to SHA3_squeeze is required, |len| can be any size
1124
 * and |next| must be 0.
1125
 */
1126
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
    int next)
{
    uint64_t *lanes = (uint64_t *)A;
    size_t lane, b, w = r / 8;

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len != 0) {
        /* On every block after the first, permute before extracting. */
        if (next)
            KeccakF1600(A);
        next = 1;

        for (lane = 0; lane < w && len != 0; lane++) {
            uint64_t Ai = BitDeinterleave(lanes[lane]);
            size_t chunk = len < 8 ? len : 8;

            /* Little-endian store of up to 8 bytes of this lane. */
            for (b = 0; b < chunk; b++)
                out[b] = (unsigned char)(Ai >> (8 * b));

            out += chunk;
            len -= chunk;
        }
    }
}
1162
#endif
1163
1164
#ifdef SELFTEST
1165
/*
1166
 * Post-padding one-shot implementations would look as following:
1167
 *
1168
 * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1169
 * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1170
 * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1171
 * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1172
 * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
1173
 * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
1174
 */
1175
1176
/*
 * One-shot sponge: absorb |len| bytes from |inp| (which must already be
 * padded to a whole number of |r|-byte blocks) into a zeroed state and
 * squeeze |d| bytes of output into |out|.
 */
void SHA3_sponge(const unsigned char *inp, size_t len,
    unsigned char *out, size_t d, size_t r)
{
    uint64_t A[5][5];

    memset(A, 0, sizeof(A));
    SHA3_absorb(A, inp, len, r);
    /*
     * Single squeeze call, so |next| is 0.  (The original code passed
     * only four arguments, which does not match the five-parameter
     * SHA3_squeeze prototype declared at the top of this file and
     * would fail to compile with SELFTEST defined.)
     */
    SHA3_squeeze(A, out, d, r, 0);
}
1185
1186
#include <stdio.h>
1187
1188
/*
 * Self-test driver: runs the NIST SHAKE128 example on a 5-bit message,
 * prints the 512-byte output in the same 16-bytes-per-line layout as
 * the official example, and compares it against the expected value.
 * Returns 0 on success, 1 on mismatch.
 */
int main(void)
{
    /*
     * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
     */
    /*
     * |test| is one full SHAKE128 rate block (168 bytes) holding the
     * pre-padded message: byte 0 and byte 1 carry the 5 message bits
     * plus the start of the padding, the final byte (set below) carries
     * the closing pad bit.  NOTE(review): exact bit layout per the
     * FIPS 202 pad10*1 rule — confirm against the standard.
     */
    unsigned char test[168] = { '\xf3', '\x3' };
    unsigned char out[512];
    size_t i;
    /* Expected 512 bytes of SHAKE128 output from the NIST example. */
    static const unsigned char result[512] = {
        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
        0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
        0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
        0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
        0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
        0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
        0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
        0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
        0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
        0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
        0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
        0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
        0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
        0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
        0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
        0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
        0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
        0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
        0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
        0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
        0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
        0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
        0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
        0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
        0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
        0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
        0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
        0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
        0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
        0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
        0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
        0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
        0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
        0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
        0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
        0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
        0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
        0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
        0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
        0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
        0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
        0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
        0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
        0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
        0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
    };

    /* Final padding byte completes the pad10*1-padded block. */
    test[167] = '\x80';
    SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));

    /*
     * Rationale behind keeping output [formatted as below] is that
     * one should be able to redirect it to a file, then copy-n-paste
     * final "output val" from official example to another file, and
     * compare the two with diff(1).
     */
    for (i = 0; i < sizeof(out);) {
        printf("%02X", out[i]);
        /* 16 bytes per line, space-separated, no trailing blank line */
        printf(++i % 16 && i != sizeof(out) ? " " : "\n");
    }

    if (memcmp(out, result, sizeof(out))) {
        fprintf(stderr, "failure\n");
        return 1;
    } else {
        fprintf(stderr, "success\n");
        return 0;
    }
}
1285
#endif