Coverage Report

Created: 2026-01-07 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl/crypto/sha/keccak1600.c
Line
Count
Source
1
/*
2
 * Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
#include <openssl/e_os2.h>
11
#include <string.h>
12
#include <assert.h>
13
14
#include "internal/nelem.h"
15
16
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
17
    size_t r);
18
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
19
20
#if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21
22
/*
23
 * Choose some sensible defaults
24
 */
25
#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
26
#define KECCAK_2X /* default to KECCAK_2X variant */
27
#endif
28
29
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__EMSCRIPTEN__)
30
/*
31
 * These don't have "and with complement" instruction, so minimize amount
32
 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
33
 */
34
#define KECCAK_COMPLEMENTING_TRANSFORM
35
#endif
36
37
#if defined(__x86_64__) || defined(__aarch64__) || defined(__mips64) || defined(__ia64) || (defined(__VMS) && !defined(__vax))
38
/*
39
 * These are available even in ILP32 flavours, but even then they are
40
 * capable of performing 64-bit operations as efficiently as in *P64.
41
 * Since it's not given that we can use sizeof(void *), just shunt it.
42
 */
43
798M
#define BIT_INTERLEAVE (0)
44
#else
45
#define BIT_INTERLEAVE (sizeof(void *) < 8)
46
#endif
47
48
0
#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
49
50
static uint64_t ROL64(uint64_t val, int offset)
51
782M
{
52
782M
    if (offset == 0) {
53
0
        return val;
54
782M
    } else if (!BIT_INTERLEAVE) {
55
782M
        return (val << offset) | (val >> (64 - offset));
56
782M
    } else {
57
0
        uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
58
59
0
        if (offset & 1) {
60
0
            uint32_t tmp = hi;
61
62
0
            offset >>= 1;
63
0
            hi = ROL32(lo, offset);
64
0
            lo = ROL32(tmp, offset + 1);
65
0
        } else {
66
0
            offset >>= 1;
67
0
            lo = ROL32(lo, offset);
68
0
            hi = ROL32(hi, offset);
69
0
        }
70
71
0
        return ((uint64_t)hi << 32) | lo;
72
0
    }
73
782M
}
74
75
static const unsigned char rhotates[5][5] = {
76
    { 0, 1, 62, 28, 27 },
77
    { 36, 44, 6, 55, 20 },
78
    { 3, 10, 43, 25, 39 },
79
    { 41, 45, 15, 21, 8 },
80
    { 18, 2, 61, 56, 14 }
81
};
82
83
static const uint64_t iotas[] = {
84
    BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
85
    BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
86
    BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
87
    BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
88
    BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
89
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
90
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
91
    BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
92
    BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
93
    BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
94
    BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
95
    BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
96
    BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
97
    BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
98
    BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
99
    BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
100
    BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
101
    BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
102
    BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
103
    BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
104
    BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
105
    BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
106
    BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
107
    BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
108
};
109
110
#if defined(KECCAK_REF)
111
/*
112
 * This is straightforward or "maximum clarity" implementation aiming
113
 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
114
 * Permutation-Based Hash and Extendible-Output Functions" as much as
115
 * possible. With one caveat. Because of the way C stores matrices,
116
 * references to A[x,y] in the specification are presented as A[y][x].
117
 * Implementation unrolls inner x-loops so that modulo 5 operations are
118
 * explicitly pre-computed.
119
 */
120
static void Theta(uint64_t A[5][5])
121
{
122
    uint64_t C[5], D[5];
123
    size_t y;
124
125
    C[0] = A[0][0];
126
    C[1] = A[0][1];
127
    C[2] = A[0][2];
128
    C[3] = A[0][3];
129
    C[4] = A[0][4];
130
131
    for (y = 1; y < 5; y++) {
132
        C[0] ^= A[y][0];
133
        C[1] ^= A[y][1];
134
        C[2] ^= A[y][2];
135
        C[3] ^= A[y][3];
136
        C[4] ^= A[y][4];
137
    }
138
139
    D[0] = ROL64(C[1], 1) ^ C[4];
140
    D[1] = ROL64(C[2], 1) ^ C[0];
141
    D[2] = ROL64(C[3], 1) ^ C[1];
142
    D[3] = ROL64(C[4], 1) ^ C[2];
143
    D[4] = ROL64(C[0], 1) ^ C[3];
144
145
    for (y = 0; y < 5; y++) {
146
        A[y][0] ^= D[0];
147
        A[y][1] ^= D[1];
148
        A[y][2] ^= D[2];
149
        A[y][3] ^= D[3];
150
        A[y][4] ^= D[4];
151
    }
152
}
153
154
static void Rho(uint64_t A[5][5])
155
{
156
    size_t y;
157
158
    for (y = 0; y < 5; y++) {
159
        A[y][0] = ROL64(A[y][0], rhotates[y][0]);
160
        A[y][1] = ROL64(A[y][1], rhotates[y][1]);
161
        A[y][2] = ROL64(A[y][2], rhotates[y][2]);
162
        A[y][3] = ROL64(A[y][3], rhotates[y][3]);
163
        A[y][4] = ROL64(A[y][4], rhotates[y][4]);
164
    }
165
}
166
167
static void Pi(uint64_t A[5][5])
168
{
169
    uint64_t T[5][5];
170
171
    /*
172
     * T = A
173
     * A[y][x] = T[x][(3*y+x)%5]
174
     */
175
    memcpy(T, A, sizeof(T));
176
177
    A[0][0] = T[0][0];
178
    A[0][1] = T[1][1];
179
    A[0][2] = T[2][2];
180
    A[0][3] = T[3][3];
181
    A[0][4] = T[4][4];
182
183
    A[1][0] = T[0][3];
184
    A[1][1] = T[1][4];
185
    A[1][2] = T[2][0];
186
    A[1][3] = T[3][1];
187
    A[1][4] = T[4][2];
188
189
    A[2][0] = T[0][1];
190
    A[2][1] = T[1][2];
191
    A[2][2] = T[2][3];
192
    A[2][3] = T[3][4];
193
    A[2][4] = T[4][0];
194
195
    A[3][0] = T[0][4];
196
    A[3][1] = T[1][0];
197
    A[3][2] = T[2][1];
198
    A[3][3] = T[3][2];
199
    A[3][4] = T[4][3];
200
201
    A[4][0] = T[0][2];
202
    A[4][1] = T[1][3];
203
    A[4][2] = T[2][4];
204
    A[4][3] = T[3][0];
205
    A[4][4] = T[4][1];
206
}
207
208
static void Chi(uint64_t A[5][5])
209
{
210
    uint64_t C[5];
211
    size_t y;
212
213
    for (y = 0; y < 5; y++) {
214
        C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
215
        C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
216
        C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
217
        C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
218
        C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
219
220
        A[y][0] = C[0];
221
        A[y][1] = C[1];
222
        A[y][2] = C[2];
223
        A[y][3] = C[3];
224
        A[y][4] = C[4];
225
    }
226
}
227
228
static void Iota(uint64_t A[5][5], size_t i)
229
{
230
    assert(i < OSSL_NELEM(iotas));
231
    A[0][0] ^= iotas[i];
232
}
233
234
static void KeccakF1600(uint64_t A[5][5])
235
{
236
    size_t i;
237
238
    for (i = 0; i < 24; i++) {
239
        Theta(A);
240
        Rho(A);
241
        Pi(A);
242
        Chi(A);
243
        Iota(A, i);
244
    }
245
}
246
247
#elif defined(KECCAK_1X)
248
/*
249
 * This implementation is optimization of above code featuring unroll
250
 * of even y-loops, their fusion and code motion. It also minimizes
251
 * temporary storage. Compiler would normally do all these things for
252
 * you, purpose of manual optimization is to provide "unobscured"
253
 * reference for assembly implementation [in case this approach is
254
 * chosen for implementation on some platform]. In the nutshell it's
255
 * equivalent of "plane-per-plane processing" approach discussed in
256
 * section 2.4 of "Keccak implementation overview".
257
 */
258
static void Round(uint64_t A[5][5], size_t i)
259
{
260
    uint64_t C[5], E[2]; /* registers */
261
    uint64_t D[5], T[2][5]; /* memory    */
262
263
    assert(i < OSSL_NELEM(iotas));
264
265
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
266
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
267
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
268
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
269
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
270
271
#if defined(__arm__)
272
    D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
273
    D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
274
    D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
275
    D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
276
    D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
277
278
    T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
279
    T[0][1] = A[0][1] ^ E[0]; /* D[1] */
280
    T[0][2] = A[0][2] ^ C[1]; /* D[2] */
281
    T[0][3] = A[0][3] ^ C[2]; /* D[3] */
282
    T[0][4] = A[0][4] ^ E[1]; /* D[4] */
283
284
    C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
285
    C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
286
    C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
287
    C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
288
    C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
289
#else
290
    D[0] = ROL64(C[1], 1) ^ C[4];
291
    D[1] = ROL64(C[2], 1) ^ C[0];
292
    D[2] = ROL64(C[3], 1) ^ C[1];
293
    D[3] = ROL64(C[4], 1) ^ C[2];
294
    D[4] = ROL64(C[0], 1) ^ C[3];
295
296
    T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
297
    T[0][1] = A[0][1] ^ D[1];
298
    T[0][2] = A[0][2] ^ D[2];
299
    T[0][3] = A[0][3] ^ D[3];
300
    T[0][4] = A[0][4] ^ D[4];
301
302
    C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
303
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
304
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
305
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
306
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
307
#endif
308
    A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
309
    A[0][1] = C[1] ^ (~C[2] & C[3]);
310
    A[0][2] = C[2] ^ (~C[3] & C[4]);
311
    A[0][3] = C[3] ^ (~C[4] & C[0]);
312
    A[0][4] = C[4] ^ (~C[0] & C[1]);
313
314
    T[1][0] = A[1][0] ^ (C[3] = D[0]);
315
    T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
316
    T[1][2] = A[1][2] ^ (E[0] = D[2]);
317
    T[1][3] = A[1][3] ^ (E[1] = D[3]);
318
    T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
319
320
    C[0] = ROL64(T[0][3], rhotates[0][3]);
321
    C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
322
    C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
323
    C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
324
    C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
325
326
    A[1][0] = C[0] ^ (~C[1] & C[2]);
327
    A[1][1] = C[1] ^ (~C[2] & C[3]);
328
    A[1][2] = C[2] ^ (~C[3] & C[4]);
329
    A[1][3] = C[3] ^ (~C[4] & C[0]);
330
    A[1][4] = C[4] ^ (~C[0] & C[1]);
331
332
    C[0] = ROL64(T[0][1], rhotates[0][1]);
333
    C[1] = ROL64(T[1][2], rhotates[1][2]);
334
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
335
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
336
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
337
338
    A[2][0] = C[0] ^ (~C[1] & C[2]);
339
    A[2][1] = C[1] ^ (~C[2] & C[3]);
340
    A[2][2] = C[2] ^ (~C[3] & C[4]);
341
    A[2][3] = C[3] ^ (~C[4] & C[0]);
342
    A[2][4] = C[4] ^ (~C[0] & C[1]);
343
344
    C[0] = ROL64(T[0][4], rhotates[0][4]);
345
    C[1] = ROL64(T[1][0], rhotates[1][0]);
346
    C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
347
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
348
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
349
350
    A[3][0] = C[0] ^ (~C[1] & C[2]);
351
    A[3][1] = C[1] ^ (~C[2] & C[3]);
352
    A[3][2] = C[2] ^ (~C[3] & C[4]);
353
    A[3][3] = C[3] ^ (~C[4] & C[0]);
354
    A[3][4] = C[4] ^ (~C[0] & C[1]);
355
356
    C[0] = ROL64(T[0][2], rhotates[0][2]);
357
    C[1] = ROL64(T[1][3], rhotates[1][3]);
358
    C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
359
    C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
360
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
361
362
    A[4][0] = C[0] ^ (~C[1] & C[2]);
363
    A[4][1] = C[1] ^ (~C[2] & C[3]);
364
    A[4][2] = C[2] ^ (~C[3] & C[4]);
365
    A[4][3] = C[3] ^ (~C[4] & C[0]);
366
    A[4][4] = C[4] ^ (~C[0] & C[1]);
367
}
368
369
static void KeccakF1600(uint64_t A[5][5])
370
{
371
    size_t i;
372
373
    for (i = 0; i < 24; i++) {
374
        Round(A, i);
375
    }
376
}
377
378
#elif defined(KECCAK_1X_ALT)
379
/*
380
 * This is variant of above KECCAK_1X that reduces requirement for
381
 * temporary storage even further, but at cost of more updates to A[][].
382
 * It's less suitable if A[][] is memory bound, but better if it's
383
 * register bound.
384
 */
385
386
static void Round(uint64_t A[5][5], size_t i)
387
{
388
    uint64_t C[5], D[5];
389
390
    assert(i < OSSL_NELEM(iotas));
391
392
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
393
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
394
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
395
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
396
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
397
398
    D[1] = C[0] ^ ROL64(C[2], 1);
399
    D[2] = C[1] ^ ROL64(C[3], 1);
400
    D[3] = C[2] ^= ROL64(C[4], 1);
401
    D[4] = C[3] ^= ROL64(C[0], 1);
402
    D[0] = C[4] ^= ROL64(C[1], 1);
403
404
    A[0][1] ^= D[1];
405
    A[1][1] ^= D[1];
406
    A[2][1] ^= D[1];
407
    A[3][1] ^= D[1];
408
    A[4][1] ^= D[1];
409
410
    A[0][2] ^= D[2];
411
    A[1][2] ^= D[2];
412
    A[2][2] ^= D[2];
413
    A[3][2] ^= D[2];
414
    A[4][2] ^= D[2];
415
416
    A[0][3] ^= C[2];
417
    A[1][3] ^= C[2];
418
    A[2][3] ^= C[2];
419
    A[3][3] ^= C[2];
420
    A[4][3] ^= C[2];
421
422
    A[0][4] ^= C[3];
423
    A[1][4] ^= C[3];
424
    A[2][4] ^= C[3];
425
    A[3][4] ^= C[3];
426
    A[4][4] ^= C[3];
427
428
    A[0][0] ^= C[4];
429
    A[1][0] ^= C[4];
430
    A[2][0] ^= C[4];
431
    A[3][0] ^= C[4];
432
    A[4][0] ^= C[4];
433
434
    C[1] = A[0][1];
435
    C[2] = A[0][2];
436
    C[3] = A[0][3];
437
    C[4] = A[0][4];
438
439
    A[0][1] = ROL64(A[1][1], rhotates[1][1]);
440
    A[0][2] = ROL64(A[2][2], rhotates[2][2]);
441
    A[0][3] = ROL64(A[3][3], rhotates[3][3]);
442
    A[0][4] = ROL64(A[4][4], rhotates[4][4]);
443
444
    A[1][1] = ROL64(A[1][4], rhotates[1][4]);
445
    A[2][2] = ROL64(A[2][3], rhotates[2][3]);
446
    A[3][3] = ROL64(A[3][2], rhotates[3][2]);
447
    A[4][4] = ROL64(A[4][1], rhotates[4][1]);
448
449
    A[1][4] = ROL64(A[4][2], rhotates[4][2]);
450
    A[2][3] = ROL64(A[3][4], rhotates[3][4]);
451
    A[3][2] = ROL64(A[2][1], rhotates[2][1]);
452
    A[4][1] = ROL64(A[1][3], rhotates[1][3]);
453
454
    A[4][2] = ROL64(A[2][4], rhotates[2][4]);
455
    A[3][4] = ROL64(A[4][3], rhotates[4][3]);
456
    A[2][1] = ROL64(A[1][2], rhotates[1][2]);
457
    A[1][3] = ROL64(A[3][1], rhotates[3][1]);
458
459
    A[2][4] = ROL64(A[4][0], rhotates[4][0]);
460
    A[4][3] = ROL64(A[3][0], rhotates[3][0]);
461
    A[1][2] = ROL64(A[2][0], rhotates[2][0]);
462
    A[3][1] = ROL64(A[1][0], rhotates[1][0]);
463
464
    A[1][0] = ROL64(C[3], rhotates[0][3]);
465
    A[2][0] = ROL64(C[1], rhotates[0][1]);
466
    A[3][0] = ROL64(C[4], rhotates[0][4]);
467
    A[4][0] = ROL64(C[2], rhotates[0][2]);
468
469
    C[0] = A[0][0];
470
    C[1] = A[1][0];
471
    D[0] = A[0][1];
472
    D[1] = A[1][1];
473
474
    A[0][0] ^= (~A[0][1] & A[0][2]);
475
    A[1][0] ^= (~A[1][1] & A[1][2]);
476
    A[0][1] ^= (~A[0][2] & A[0][3]);
477
    A[1][1] ^= (~A[1][2] & A[1][3]);
478
    A[0][2] ^= (~A[0][3] & A[0][4]);
479
    A[1][2] ^= (~A[1][3] & A[1][4]);
480
    A[0][3] ^= (~A[0][4] & C[0]);
481
    A[1][3] ^= (~A[1][4] & C[1]);
482
    A[0][4] ^= (~C[0] & D[0]);
483
    A[1][4] ^= (~C[1] & D[1]);
484
485
    C[2] = A[2][0];
486
    C[3] = A[3][0];
487
    D[2] = A[2][1];
488
    D[3] = A[3][1];
489
490
    A[2][0] ^= (~A[2][1] & A[2][2]);
491
    A[3][0] ^= (~A[3][1] & A[3][2]);
492
    A[2][1] ^= (~A[2][2] & A[2][3]);
493
    A[3][1] ^= (~A[3][2] & A[3][3]);
494
    A[2][2] ^= (~A[2][3] & A[2][4]);
495
    A[3][2] ^= (~A[3][3] & A[3][4]);
496
    A[2][3] ^= (~A[2][4] & C[2]);
497
    A[3][3] ^= (~A[3][4] & C[3]);
498
    A[2][4] ^= (~C[2] & D[2]);
499
    A[3][4] ^= (~C[3] & D[3]);
500
501
    C[4] = A[4][0];
502
    D[4] = A[4][1];
503
504
    A[4][0] ^= (~A[4][1] & A[4][2]);
505
    A[4][1] ^= (~A[4][2] & A[4][3]);
506
    A[4][2] ^= (~A[4][3] & A[4][4]);
507
    A[4][3] ^= (~A[4][4] & C[4]);
508
    A[4][4] ^= (~C[4] & D[4]);
509
    A[0][0] ^= iotas[i];
510
}
511
512
static void KeccakF1600(uint64_t A[5][5])
513
{
514
    size_t i;
515
516
    for (i = 0; i < 24; i++) {
517
        Round(A, i);
518
    }
519
}
520
521
#elif defined(KECCAK_2X)
522
/*
523
 * This implementation is variant of KECCAK_1X above with outer-most
524
 * round loop unrolled twice. This allows to take temporary storage
525
 * out of round procedure and simplify references to it by alternating
526
 * it with actual data (see round loop below). Originally it was meant
527
 * rather as reference for an assembly implementation, but it seems to
528
 * play best with compilers [as well as provide best instruction per
529
 * processed byte ratio at minimal round unroll factor]...
530
 */
531
static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
532
26.9M
{
533
26.9M
    uint64_t C[5], D[5];
534
535
26.9M
    assert(i < OSSL_NELEM(iotas));
536
537
26.9M
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
538
26.9M
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
539
26.9M
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
540
26.9M
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
541
26.9M
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
542
543
26.9M
    D[0] = ROL64(C[1], 1) ^ C[4];
544
26.9M
    D[1] = ROL64(C[2], 1) ^ C[0];
545
26.9M
    D[2] = ROL64(C[3], 1) ^ C[1];
546
26.9M
    D[3] = ROL64(C[4], 1) ^ C[2];
547
26.9M
    D[4] = ROL64(C[0], 1) ^ C[3];
548
549
26.9M
    C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
550
26.9M
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
551
26.9M
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
552
26.9M
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
553
26.9M
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
554
555
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
556
26.9M
    R[0][0] = C[0] ^ (C[1] | C[2]) ^ iotas[i];
557
26.9M
    R[0][1] = C[1] ^ (~C[2] | C[3]);
558
26.9M
    R[0][2] = C[2] ^ (C[3] & C[4]);
559
26.9M
    R[0][3] = C[3] ^ (C[4] | C[0]);
560
26.9M
    R[0][4] = C[4] ^ (C[0] & C[1]);
561
#else
562
    R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
563
    R[0][1] = C[1] ^ (~C[2] & C[3]);
564
    R[0][2] = C[2] ^ (~C[3] & C[4]);
565
    R[0][3] = C[3] ^ (~C[4] & C[0]);
566
    R[0][4] = C[4] ^ (~C[0] & C[1]);
567
#endif
568
569
26.9M
    C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
570
26.9M
    C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
571
26.9M
    C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
572
26.9M
    C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
573
26.9M
    C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
574
575
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
576
26.9M
    R[1][0] = C[0] ^ (C[1] | C[2]);
577
26.9M
    R[1][1] = C[1] ^ (C[2] & C[3]);
578
26.9M
    R[1][2] = C[2] ^ (C[3] | ~C[4]);
579
26.9M
    R[1][3] = C[3] ^ (C[4] | C[0]);
580
26.9M
    R[1][4] = C[4] ^ (C[0] & C[1]);
581
#else
582
    R[1][0] = C[0] ^ (~C[1] & C[2]);
583
    R[1][1] = C[1] ^ (~C[2] & C[3]);
584
    R[1][2] = C[2] ^ (~C[3] & C[4]);
585
    R[1][3] = C[3] ^ (~C[4] & C[0]);
586
    R[1][4] = C[4] ^ (~C[0] & C[1]);
587
#endif
588
589
26.9M
    C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
590
26.9M
    C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
591
26.9M
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
592
26.9M
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
593
26.9M
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
594
595
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
596
26.9M
    R[2][0] = C[0] ^ (C[1] | C[2]);
597
26.9M
    R[2][1] = C[1] ^ (C[2] & C[3]);
598
26.9M
    R[2][2] = C[2] ^ (~C[3] & C[4]);
599
26.9M
    R[2][3] = ~C[3] ^ (C[4] | C[0]);
600
26.9M
    R[2][4] = C[4] ^ (C[0] & C[1]);
601
#else
602
    R[2][0] = C[0] ^ (~C[1] & C[2]);
603
    R[2][1] = C[1] ^ (~C[2] & C[3]);
604
    R[2][2] = C[2] ^ (~C[3] & C[4]);
605
    R[2][3] = C[3] ^ (~C[4] & C[0]);
606
    R[2][4] = C[4] ^ (~C[0] & C[1]);
607
#endif
608
609
26.9M
    C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
610
26.9M
    C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
611
26.9M
    C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
612
26.9M
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
613
26.9M
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
614
615
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
616
26.9M
    R[3][0] = C[0] ^ (C[1] & C[2]);
617
26.9M
    R[3][1] = C[1] ^ (C[2] | C[3]);
618
26.9M
    R[3][2] = C[2] ^ (~C[3] | C[4]);
619
26.9M
    R[3][3] = ~C[3] ^ (C[4] & C[0]);
620
26.9M
    R[3][4] = C[4] ^ (C[0] | C[1]);
621
#else
622
    R[3][0] = C[0] ^ (~C[1] & C[2]);
623
    R[3][1] = C[1] ^ (~C[2] & C[3]);
624
    R[3][2] = C[2] ^ (~C[3] & C[4]);
625
    R[3][3] = C[3] ^ (~C[4] & C[0]);
626
    R[3][4] = C[4] ^ (~C[0] & C[1]);
627
#endif
628
629
26.9M
    C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
630
26.9M
    C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
631
26.9M
    C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
632
26.9M
    C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
633
26.9M
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
634
635
26.9M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
636
26.9M
    R[4][0] = C[0] ^ (~C[1] & C[2]);
637
26.9M
    R[4][1] = ~C[1] ^ (C[2] | C[3]);
638
26.9M
    R[4][2] = C[2] ^ (C[3] & C[4]);
639
26.9M
    R[4][3] = C[3] ^ (C[4] | C[0]);
640
26.9M
    R[4][4] = C[4] ^ (C[0] & C[1]);
641
#else
642
    R[4][0] = C[0] ^ (~C[1] & C[2]);
643
    R[4][1] = C[1] ^ (~C[2] & C[3]);
644
    R[4][2] = C[2] ^ (~C[3] & C[4]);
645
    R[4][3] = C[3] ^ (~C[4] & C[0]);
646
    R[4][4] = C[4] ^ (~C[0] & C[1]);
647
#endif
648
26.9M
}
649
650
static void KeccakF1600(uint64_t A[5][5])
651
1.12M
{
652
1.12M
    uint64_t T[5][5];
653
1.12M
    size_t i;
654
655
1.12M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
656
1.12M
    A[0][1] = ~A[0][1];
657
1.12M
    A[0][2] = ~A[0][2];
658
1.12M
    A[1][3] = ~A[1][3];
659
1.12M
    A[2][2] = ~A[2][2];
660
1.12M
    A[3][2] = ~A[3][2];
661
1.12M
    A[4][0] = ~A[4][0];
662
1.12M
#endif
663
664
14.6M
    for (i = 0; i < 24; i += 2) {
665
13.4M
        Round(T, A, i);
666
13.4M
        Round(A, T, i + 1);
667
13.4M
    }
668
669
1.12M
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
670
1.12M
    A[0][1] = ~A[0][1];
671
1.12M
    A[0][2] = ~A[0][2];
672
1.12M
    A[1][3] = ~A[1][3];
673
1.12M
    A[2][2] = ~A[2][2];
674
1.12M
    A[3][2] = ~A[3][2];
675
1.12M
    A[4][0] = ~A[4][0];
676
1.12M
#endif
677
1.12M
}
678
679
#else /* define KECCAK_INPLACE to compile this code path */
680
/*
681
 * This implementation is KECCAK_1X from above combined 4 times with
682
 * a twist that allows to omit temporary storage and perform in-place
683
 * processing. It's discussed in section 2.5 of "Keccak implementation
684
 * overview". It's likely to be best suited for processors with large
685
 * register bank... On the other hand processor with large register
686
 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
687
 * more compact...
688
 */
689
static void FourRounds(uint64_t A[5][5], size_t i)
690
{
691
    uint64_t B[5], C[5], D[5];
692
693
    assert(i <= OSSL_NELEM(iotas) - 4);
694
695
    /* Round 4*n */
696
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
697
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
698
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
699
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
700
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
701
702
    D[0] = ROL64(C[1], 1) ^ C[4];
703
    D[1] = ROL64(C[2], 1) ^ C[0];
704
    D[2] = ROL64(C[3], 1) ^ C[1];
705
    D[3] = ROL64(C[4], 1) ^ C[2];
706
    D[4] = ROL64(C[0], 1) ^ C[3];
707
708
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
709
    B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
710
    B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
711
    B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
712
    B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
713
714
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
715
    C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
716
    C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
717
    C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
718
    C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
719
720
    B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
721
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
722
    B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
723
    B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
724
    B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
725
726
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
727
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
728
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
729
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
730
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
731
732
    B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
733
    B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
734
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
735
    B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
736
    B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
737
738
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
739
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
740
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
741
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
742
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
743
744
    B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
745
    B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
746
    B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
747
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
748
    B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
749
750
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
751
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
752
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
753
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
754
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
755
756
    B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
757
    B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
758
    B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
759
    B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
760
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
761
762
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
763
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
764
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
765
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
766
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
767
768
    /* Round 4*n+1 */
769
    D[0] = ROL64(C[1], 1) ^ C[4];
770
    D[1] = ROL64(C[2], 1) ^ C[0];
771
    D[2] = ROL64(C[3], 1) ^ C[1];
772
    D[3] = ROL64(C[4], 1) ^ C[2];
773
    D[4] = ROL64(C[0], 1) ^ C[3];
774
775
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
776
    B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
777
    B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
778
    B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
779
    B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
780
781
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
782
    C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
783
    C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
784
    C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
785
    C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
786
787
    B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
788
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
789
    B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
790
    B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
791
    B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
792
793
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
794
    C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
795
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
796
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
797
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
798
799
    B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
800
    B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
801
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
802
    B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
803
    B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
804
805
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
806
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
807
    C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
808
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
809
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
810
811
    B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
812
    B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
813
    B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
814
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
815
    B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
816
817
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
818
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
819
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
820
    C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
821
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
822
823
    B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
824
    B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
825
    B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
826
    B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
827
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
828
829
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
830
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
831
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
832
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
833
    C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
834
835
    /* Round 4*n+2 */
836
    D[0] = ROL64(C[1], 1) ^ C[4];
837
    D[1] = ROL64(C[2], 1) ^ C[0];
838
    D[2] = ROL64(C[3], 1) ^ C[1];
839
    D[3] = ROL64(C[4], 1) ^ C[2];
840
    D[4] = ROL64(C[0], 1) ^ C[3];
841
842
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
843
    B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
844
    B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
845
    B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
846
    B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
847
848
    C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
849
    C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
850
    C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
851
    C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
852
    C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
853
854
    B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
855
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
856
    B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
857
    B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
858
    B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
859
860
    C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
861
    C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
862
    C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
863
    C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
864
    C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
865
866
    B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
867
    B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
868
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
869
    B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
870
    B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
871
872
    C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
873
    C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
874
    C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
875
    C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
876
    C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
877
878
    B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
879
    B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
880
    B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
881
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
882
    B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
883
884
    C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
885
    C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
886
    C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
887
    C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
888
    C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
889
890
    B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
891
    B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
892
    B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
893
    B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
894
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
895
896
    C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
897
    C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
898
    C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
899
    C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
900
    C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
901
902
    /* Round 4*n+3 */
903
    D[0] = ROL64(C[1], 1) ^ C[4];
904
    D[1] = ROL64(C[2], 1) ^ C[0];
905
    D[2] = ROL64(C[3], 1) ^ C[1];
906
    D[3] = ROL64(C[4], 1) ^ C[2];
907
    D[4] = ROL64(C[0], 1) ^ C[3];
908
909
    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
910
    B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
911
    B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
912
    B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
913
    B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
914
915
    /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
916
    /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
917
    /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
918
    /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
919
    /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
920
921
    B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
922
    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
923
    B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
924
    B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
925
    B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
926
927
    /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
928
    /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
929
    /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
930
    /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
931
    /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
932
933
    B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
934
    B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
935
    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
936
    B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
937
    B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
938
939
    /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
940
    /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
941
    /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
942
    /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
943
    /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
944
945
    B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
946
    B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
947
    B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
948
    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
949
    B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
950
951
    /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
952
    /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
953
    /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
954
    /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
955
    /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
956
957
    B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
958
    B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
959
    B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
960
    B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
961
    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
962
963
    /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
964
    /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
965
    /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
966
    /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
967
    /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
968
}
969
970
/*
 * Apply the full 24-round Keccak-f[1600] permutation to the state |A|,
 * executed as six passes over the four-way unrolled round function.
 */
static void KeccakF1600(uint64_t A[5][5])
{
    size_t round;

    for (round = 0; round < 24; round += 4)
        FourRounds(A, round);
}
978
979
#endif
980
981
/*
 * Convert a 64-bit lane to bit-interleaved form: the even-indexed bits
 * of |Ai| are gathered into the low 32-bit half of the result and the
 * odd-indexed bits into the high half.  When BIT_INTERLEAVE is 0 (the
 * 64-bit targets listed earlier in this file) the whole body is dead
 * code and this is the identity function.
 * NOTE(review): the interleaved form presumably lets ROL64 be computed
 * with two independent 32-bit rotations on 32-bit platforms — confirm
 * against the ROL64 definition earlier in this file.
 */
static uint64_t BitInterleave(uint64_t Ai)
{
    if (BIT_INTERLEAVE) {
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
        uint32_t t0, t1;

        /* t0 <- even-indexed bits of |lo|, compacted into its low 16 bits */
        t0 = lo & 0x55555555;
        t0 |= t0 >> 1;
        t0 &= 0x33333333;
        t0 |= t0 >> 2;
        t0 &= 0x0f0f0f0f;
        t0 |= t0 >> 4;
        t0 &= 0x00ff00ff;
        t0 |= t0 >> 8;
        t0 &= 0x0000ffff;

        /* t1 <- even-indexed bits of |hi|, compacted, then moved to bits 16..31 */
        t1 = hi & 0x55555555;
        t1 |= t1 >> 1;
        t1 &= 0x33333333;
        t1 |= t1 >> 2;
        t1 &= 0x0f0f0f0f;
        t1 |= t1 >> 4;
        t1 &= 0x00ff00ff;
        t1 |= t1 >> 8;
        t1 <<= 16;

        /* lo <- odd-indexed bits of |lo|, compacted into its low 16 bits */
        lo &= 0xaaaaaaaa;
        lo |= lo << 1;
        lo &= 0xcccccccc;
        lo |= lo << 2;
        lo &= 0xf0f0f0f0;
        lo |= lo << 4;
        lo &= 0xff00ff00;
        lo |= lo << 8;
        lo >>= 16;

        /* hi <- odd-indexed bits of |hi|, compacted into its high 16 bits */
        hi &= 0xaaaaaaaa;
        hi |= hi << 1;
        hi &= 0xcccccccc;
        hi |= hi << 2;
        hi &= 0xf0f0f0f0;
        hi |= hi << 4;
        hi &= 0xff00ff00;
        hi |= hi << 8;
        hi &= 0xffff0000;

        /* reassemble: odd bits in the high word, even bits in the low word */
        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
    }

    return Ai;
}
1032
1033
/*
 * Inverse of BitInterleave: spread the low 32-bit half of |Ai| back
 * onto the even-indexed bit positions and the high half onto the
 * odd-indexed positions.  When BIT_INTERLEAVE is 0 this is the
 * identity function, matching BitInterleave.
 */
static uint64_t BitDeinterleave(uint64_t Ai)
{
    if (BIT_INTERLEAVE) {
        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
        uint32_t t0, t1;

        /* t0 <- low 16 bits of |lo|, expanded onto even bit positions */
        t0 = lo & 0x0000ffff;
        t0 |= t0 << 8;
        t0 &= 0x00ff00ff;
        t0 |= t0 << 4;
        t0 &= 0x0f0f0f0f;
        t0 |= t0 << 2;
        t0 &= 0x33333333;
        t0 |= t0 << 1;
        t0 &= 0x55555555;

        /* t1 <- low 16 bits of |hi|, expanded onto odd bit positions */
        t1 = hi << 16;
        t1 |= t1 >> 8;
        t1 &= 0xff00ff00;
        t1 |= t1 >> 4;
        t1 &= 0xf0f0f0f0;
        t1 |= t1 >> 2;
        t1 &= 0xcccccccc;
        t1 |= t1 >> 1;
        t1 &= 0xaaaaaaaa;

        /* lo <- high 16 bits of |lo|, expanded onto even bit positions */
        lo >>= 16;
        lo |= lo << 8;
        lo &= 0x00ff00ff;
        lo |= lo << 4;
        lo &= 0x0f0f0f0f;
        lo |= lo << 2;
        lo &= 0x33333333;
        lo |= lo << 1;
        lo &= 0x55555555;

        /* hi <- high 16 bits of |hi|, expanded onto odd bit positions */
        hi &= 0xffff0000;
        hi |= hi >> 8;
        hi &= 0xff00ff00;
        hi |= hi >> 4;
        hi &= 0xf0f0f0f0;
        hi |= hi >> 2;
        hi &= 0xcccccccc;
        hi |= hi >> 1;
        hi &= 0xaaaaaaaa;

        /* reassemble the de-interleaved 64-bit lane */
        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
    }

    return Ai;
}
1084
1085
/*
1086
 * SHA3_absorb can be called multiple times, but at each invocation
1087
 * largest multiple of |r| out of |len| bytes are processed. Then
1088
 * the remaining number of bytes is returned. This is done to spare the
 * caller the trouble of calculating the largest multiple of |r|. |r| can be viewed
1090
 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1091
 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1092
 * padding and intermediate sub-block buffering, byte- or bitwise, is
1093
 * caller's responsibility.
1094
 */
1095
size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
    size_t r)
{
    uint64_t *lanes = (uint64_t *)A;
    size_t lane, b, w = r / 8;

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    /* Consume whole |r|-byte blocks; the remainder is returned to the caller. */
    while (len >= r) {
        for (lane = 0; lane < w; lane++) {
            uint64_t Ai = 0;

            /* Little-endian load of the next 8 input bytes. */
            for (b = 0; b < 8; b++)
                Ai |= (uint64_t)inp[b] << (8 * b);
            inp += 8;

            lanes[lane] ^= BitInterleave(Ai);
        }
        KeccakF1600(A);
        len -= r;
    }

    return len;
}
1116
1117
/*
1118
 * SHA3_squeeze may be called after SHA3_absorb to generate |out| hash value of
1119
 * |len| bytes.
1120
 * If multiple SHA3_squeeze calls are required the output length |len| must be a
1121
 * multiple of the blocksize, with |next| being 0 on the first call and 1 on
1122
 * subsequent calls. It is the caller's responsibility to buffer the results.
1123
 * When only a single call to SHA3_squeeze is required, |len| can be any size
1124
 * and |next| must be 0.
1125
 */
1126
void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
    int next)
{
    uint64_t *lanes = (uint64_t *)A;
    size_t lane, b, w = r / 8;

    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len != 0) {
        /* On every block after the first, permute before extracting. */
        if (next)
            KeccakF1600(A);
        next = 1;

        for (lane = 0; lane < w && len != 0; lane++) {
            uint64_t Ai = BitDeinterleave(lanes[lane]);
            size_t chunk = len < 8 ? len : 8;

            /* Little-endian store of up to 8 bytes of this lane. */
            for (b = 0; b < chunk; b++)
                out[b] = (unsigned char)(Ai >> (8 * b));

            out += chunk;
            len -= chunk;
        }
    }
}
1162
#endif
1163
1164
#ifdef SELFTEST
1165
/*
1166
 * Post-padding one-shot implementations would look as following:
1167
 *
1168
 * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1169
 * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1170
 * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1171
 * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1172
 * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
1173
 * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
1174
 */
1175
1176
/*
 * One-shot sponge: absorb |len| bytes from |inp| (which must already be
 * padded to a whole number of |r|-byte blocks) into a zeroed state and
 * squeeze |d| bytes of output into |out|.
 */
void SHA3_sponge(const unsigned char *inp, size_t len,
    unsigned char *out, size_t d, size_t r)
{
    uint64_t A[5][5];

    memset(A, 0, sizeof(A));
    SHA3_absorb(A, inp, len, r);
    /*
     * Single squeeze call, so |next| is 0.  (The original code passed
     * only four arguments, which does not match the five-parameter
     * SHA3_squeeze prototype declared at the top of this file and
     * would fail to compile with SELFTEST defined.)
     */
    SHA3_squeeze(A, out, d, r, 0);
}
1185
1186
#include <stdio.h>
1187
1188
/*
 * Self-test driver: runs the NIST SHAKE128 example on a 5-bit message,
 * prints the 512-byte output in the same 16-bytes-per-line layout as
 * the official example, and compares it against the expected value.
 * Returns 0 on success, 1 on mismatch.
 */
int main(void)
{
    /*
     * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
     */
    /*
     * |test| is one full SHAKE128 rate block (168 bytes) holding the
     * pre-padded message: byte 0 and byte 1 carry the 5 message bits
     * plus the start of the padding, the final byte (set below) carries
     * the closing pad bit.  NOTE(review): exact bit layout per the
     * FIPS 202 pad10*1 rule — confirm against the standard.
     */
    unsigned char test[168] = { '\xf3', '\x3' };
    unsigned char out[512];
    size_t i;
    /* Expected 512 bytes of SHAKE128 output from the NIST example. */
    static const unsigned char result[512] = {
        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
        0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
        0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
        0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
        0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
        0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
        0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
        0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
        0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
        0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
        0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
        0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
        0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
        0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
        0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
        0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
        0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
        0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
        0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
        0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
        0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
        0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
        0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
        0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
        0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
        0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
        0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
        0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
        0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
        0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
        0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
        0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
        0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
        0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
        0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
        0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
        0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
        0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
        0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
        0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
        0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
        0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
        0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
        0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
        0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
    };

    /* Final padding byte completes the pad10*1-padded block. */
    test[167] = '\x80';
    SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));

    /*
     * Rationale behind keeping output [formatted as below] is that
     * one should be able to redirect it to a file, then copy-n-paste
     * final "output val" from official example to another file, and
     * compare the two with diff(1).
     */
    for (i = 0; i < sizeof(out);) {
        printf("%02X", out[i]);
        /* 16 bytes per line, space-separated, no trailing blank line */
        printf(++i % 16 && i != sizeof(out) ? " " : "\n");
    }

    if (memcmp(out, result, sizeof(out))) {
        fprintf(stderr, "failure\n");
        return 1;
    } else {
        fprintf(stderr, "success\n");
        return 0;
    }
}
1285
#endif