/src/php-src/ext/hash/hash_sha_sse2.c
Line | Count | Source |
1 | | /*- |
2 | | * Copyright 2021 Tarsnap Backup Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions |
7 | | * are met: |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * 2. Redistributions in binary form must reproduce the above copyright |
11 | | * notice, this list of conditions and the following disclaimer in the |
12 | | * documentation and/or other materials provided with the distribution. |
13 | | * |
14 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
15 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
17 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
18 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
19 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
20 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
21 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
22 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
23 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
24 | | * SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | #include "php_hash.h" |
28 | | #include "php_hash_sha.h" |
29 | | |
30 | | #ifdef __SSE2__ |
31 | | # include <emmintrin.h> |
32 | | |
33 | | /* Original implementation from libcperciva follows. |
34 | | * |
35 | | * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility. |
36 | | */ |
37 | | |
38 | | /** |
39 | | * mm_bswap_epi32(a): |
40 | | * Byte-swap each 32-bit word. |
41 | | */ |
42 | | static inline __m128i |
43 | | mm_bswap_epi32(__m128i a) |
44 | 0 | { |
45 | | |
46 | | /* Swap bytes in each 16-bit word. */ |
47 | 0 | a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8)); |
48 | | |
49 | | /* Swap the two 16-bit words within each 32-bit word. */ |
50 | 0 | a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
51 | 0 | a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
52 | | |
53 | 0 | return (a); |
54 | 0 | } |
55 | | |
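/*
 * Reference note (illustrative sketch, not part of the original file): the
 * shift/shuffle sequence above applies the usual scalar 32-bit byte swap to
 * each of the four lanes, i.e. per lane it computes the equivalent of the
 * hypothetical helper below (not compiled).
 */
#if 0
static inline uint32_t
bswap32_ref(uint32_t x)
{

	return ((x >> 24) | ((x >> 8) & 0x0000ff00) |
	    ((x << 8) & 0x00ff0000) | (x << 24));
}
#endif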
56 | | /* SHA256 round constants. */ |
57 | | static const uint32_t Krnd[64] = { |
58 | | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
59 | | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
60 | | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
61 | | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
62 | | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
63 | | 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
64 | | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
65 | | 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
66 | | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
67 | | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
68 | | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
69 | | 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
70 | | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
71 | | 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
72 | | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
73 | | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
74 | | }; |
75 | | |
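/*
 * These are the standard FIPS 180-4 round constants: the first 32 bits of
 * the fractional parts of the cube roots of the first 64 primes.  The
 * stand-alone program below is an illustrative sketch which regenerates the
 * table (not compiled, not part of the original file).
 */
#if 0
#include <inttypes.h>
#include <math.h>
#include <stdio.h>

int
main(void)
{
	unsigned int n, d;
	int i, prime;
	double f;

	for (i = 0, n = 2; i < 64; n++) {
		/* Trial-division primality test. */
		for (prime = 1, d = 2; d * d <= n; d++)
			if (n % d == 0)
				prime = 0;
		if (!prime)
			continue;

		/* Fractional part of the cube root, truncated to 32 bits. */
		f = cbrt((double)n);
		f -= floor(f);
		printf("0x%08" PRIx32 "%s", (uint32_t)(f * 4294967296.0),
		    (++i % 4) ? ", " : ",\n");
	}

	return (0);
}
#endif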
76 | | /* Elementary functions used by SHA256 */ |
77 | 0 | #define Ch(x, y, z) ((x & (y ^ z)) ^ z) |
78 | 0 | #define Maj(x, y, z) ((x & (y | z)) | (y & z)) |
79 | 0 | #define ROTR(x, n) ((x >> n) | (x << (32 - n))) |
80 | 0 | #define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) |
81 | 0 | #define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) |
82 | | |
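/*
 * The Ch and Maj forms above are operation-reduced rewrites of the textbook
 * FIPS 180-4 definitions Ch(x, y, z) = (x & y) ^ (~x & z) and
 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z); both pairs agree bit for bit.
 * The exhaustive check below is an illustrative sketch only (not compiled,
 * not part of the original file).
 */
#if 0
#include <assert.h>

static void
check_ch_maj(void)
{
	uint32_t x, y, z;

	/* Ch and Maj are bitwise, so checking single-bit inputs suffices. */
	for (x = 0; x <= 1; x++)
		for (y = 0; y <= 1; y++)
			for (z = 0; z <= 1; z++) {
				assert(Ch(x, y, z) == ((x & y) ^ (~x & z)));
				assert(Maj(x, y, z) ==
				    ((x & y) ^ (x & z) ^ (y & z)));
			}
}
#endif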
83 | | /* SHA256 round function */ |
84 | | #define RND(a, b, c, d, e, f, g, h, k) \ |
85 | 0 | h += S1(e) + Ch(e, f, g) + k; \ |
86 | 0 | d += h; \ |
87 | 0 | h += S0(a) + Maj(a, b, c) |
88 | | |
89 | | /* Adjusted round function for rotating state */ |
90 | | #define RNDr(S, W, i, ii) \ |
91 | 0 | RND(S[(64 - i) % 8], S[(65 - i) % 8], \ |
92 | 0 | S[(66 - i) % 8], S[(67 - i) % 8], \ |
93 | 0 | S[(68 - i) % 8], S[(69 - i) % 8], \ |
94 | 0 | S[(70 - i) % 8], S[(71 - i) % 8], \ |
95 | 0 | W[i + ii] + Krnd[i + ii]) |
96 | | |
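/*
 * Note on RNDr (not from the original source): rather than shuffling the
 * eight working variables after every round (h = g; g = f; ...), RNDr
 * rotates which element of S[] plays each role.  Expanding the index
 * arithmetic for the first two rounds of a block:
 *
 *   i = 0:  RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], ...)
 *   i = 1:  RND(S[7], S[0], S[1], S[2], S[3], S[4], S[5], S[6], ...)
 *
 * so the value that served as a in one round serves as b in the next, and
 * so on around the ring, while RND itself updates only d and h in place.
 */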
97 | | /* Message schedule computation */ |
98 | 0 | #define SHR32(x, n) (_mm_srli_epi32(x, n)) |
99 | 0 | #define ROTR32(x, n) (_mm_or_si128(SHR32(x, n), _mm_slli_epi32(x, (32-n)))) |
100 | 0 | #define s0_128(x) _mm_xor_si128(_mm_xor_si128( \ |
101 | 0 | ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3)) |
102 | | |
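/*
 * For reference, the scalar FIPS 180-4 message-schedule functions which
 * s0_128 above and the s1_128_* helpers below vectorize are (illustrative
 * sketch; the names s0_scalar/s1_scalar are not part of the original file):
 */
#if 0
#define s0_scalar(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1_scalar(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))
#endif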
103 | | static inline __m128i |
104 | | s1_128_high(__m128i a) |
105 | 0 | { |
106 | 0 | __m128i b; |
107 | 0 | __m128i c; |
108 | | |
109 | | /* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */ |
110 | 0 | b = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0)); |
111 | 0 | c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19)); |
112 | | |
113 | | /* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */ |
114 | 0 | c = _mm_xor_si128(c, _mm_srli_epi32(b, 10)); |
115 | | |
116 | | /* Shuffle good data back and zero unwanted lanes. */ |
117 | 0 | c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0)); |
118 | 0 | c = _mm_slli_si128(c, 8); |
119 | | |
120 | 0 | return (c); |
121 | 0 | } |
122 | | |
123 | | static inline __m128i |
124 | | s1_128_low(__m128i a) |
125 | 0 | { |
126 | 0 | __m128i b; |
127 | 0 | __m128i c; |
128 | | |
129 | | /* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */ |
130 | 0 | b = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2)); |
131 | 0 | c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19)); |
132 | | |
133 | | /* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */ |
134 | 0 | c = _mm_xor_si128(c, _mm_srli_epi32(b, 10)); |
135 | | |
136 | | /* Shuffle good data back and zero unwanted lanes. */ |
137 | 0 | c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0)); |
138 | 0 | c = _mm_srli_si128(c, 8); |
139 | | |
140 | 0 | return (c); |
141 | 0 | } |
142 | | |
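/*
 * Note (not from the original source): s1_128_high and s1_128_low together
 * evaluate the scalar sigma_1 function (s1_scalar above) on two words at a
 * time.  SSE2 has no packed 32-bit rotate, so each word is duplicated into
 * both halves of a 64-bit lane and rotated via a 64-bit logical shift.  The
 * work is split into a "low" and a "high" half because, within one MSG4
 * step below, the sigma_1 inputs for the two upper output words (W[j + 2]
 * and W[j + 3]) are W[j] and W[j + 1], which only become available once the
 * lower half of the new schedule block has been computed.
 */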
143 | | /** |
144 | | * SPAN_ONE_THREE(a, b): |
145 | | * Combine the upper three words of ${a} with the lowest word of ${b}. This |
146 | | * could also be thought of as returning bits [159:32] of the 256-bit value |
147 | | * consisting of (b[127:0] a[127:0]). In other words, set: |
148 | | * dst[31:0] := a[63:32] |
149 | | * dst[63:32] := a[95:64] |
150 | | * dst[95:64] := a[127:96] |
151 | | * dst[127:96] := b[31:0] |
152 | | */ |
153 | 0 | #define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128( \ |
154 | 0 | _mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))), \ |
155 | 0 | _MM_SHUFFLE(0, 3, 2, 1))) |
156 | | |
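/*
 * Note (not from the original source): on SSSE3 and later the same lane
 * combination could be done with a single _mm_alignr_epi8(b, a, 4); the
 * _mm_move_ss/_mm_shuffle_epi32 pair above is the SSE2-only substitute.
 */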
157 | | /** |
158 | | * MSG4(X0, X1, X2, X3): |
159 | | * Calculate the next four values of the message schedule. If we define |
160 | | * ${W[j]} as the first unknown value in the message schedule, then the input |
161 | | * arguments are: |
162 | | * X0 = W[j - 16] : W[j - 13] |
163 | | * X1 = W[j - 12] : W[j - 9] |
164 | | * X2 = W[j - 8] : W[j - 5] |
165 | | * X3 = W[j - 4] : W[j - 1] |
166 | | * This function therefore calculates: |
167 | | * X4 = W[j + 0] : W[j + 3] |
168 | | */ |
169 | | static inline __m128i |
170 | | MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3) |
171 | 0 | { |
172 | 0 | __m128i X4; |
173 | 0 | __m128i Xj_minus_seven, Xj_minus_fifteen; |
174 | | |
175 | | /* Set up variables which span X values. */ |
176 | 0 | Xj_minus_seven = SPAN_ONE_THREE(X2, X3); |
177 | 0 | Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1); |
178 | | |
179 | | /* Begin computing X4. */ |
180 | 0 | X4 = _mm_add_epi32(X0, Xj_minus_seven); |
181 | 0 | X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen)); |
182 | | |
183 | | /* First half of s1. */ |
184 | 0 | X4 = _mm_add_epi32(X4, s1_128_low(X3)); |
185 | | |
186 | | /* Second half of s1; this depends on the above value of X4. */ |
187 | 0 | X4 = _mm_add_epi32(X4, s1_128_high(X4)); |
188 | | |
189 | 0 | return (X4); |
190 | 0 | } |
191 | | |
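/*
 * MSG4 is a four-wide vectorization of the scalar FIPS 180-4 message
 * schedule recurrence
 *
 *   W[j] = s1(W[j - 2]) + W[j - 7] + s0(W[j - 15]) + W[j - 16]
 *
 * A scalar sketch of one MSG4 step follows (illustrative only, not part of
 * the original file; it assumes the s0_scalar/s1_scalar definitions
 * sketched earlier).
 */
#if 0
static void
MSG4_ref(uint32_t W[64], int j)
{
	int k;

	for (k = j; k < j + 4; k++)
		W[k] = s1_scalar(W[k - 2]) + W[k - 7] +
		    s0_scalar(W[k - 15]) + W[k - 16];
}
#endif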
192 | | /** |
193 | | * SHA256_Transform_sse2(state, block, W, S): |
194 | | * Compute the SHA256 block compression function, transforming ${state} using |
195 | | * the data in ${block}. This implementation uses x86 SSE2 instructions, and |
196 | | * should only be used when __SSE2__ is defined (i.e. the compiler is |
197 | | * targeting SSE2). The arrays W and S may be filled with sensitive data, |
198 | | * and should be cleared by the caller. |
199 | | */ |
200 | | void |
201 | | SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8], |
202 | | const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64], |
203 | | uint32_t S[PHP_STATIC_RESTRICT 8]) |
204 | 0 | { |
205 | 0 | __m128i Y[4]; |
206 | 0 | int i; |
207 | | |
208 | | /* 1. Prepare the first part of the message schedule W. */ |
209 | 0 | Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0])); |
210 | 0 | _mm_storeu_si128((__m128i *)&W[0], Y[0]); |
211 | 0 | Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16])); |
212 | 0 | _mm_storeu_si128((__m128i *)&W[4], Y[1]); |
213 | 0 | Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32])); |
214 | 0 | _mm_storeu_si128((__m128i *)&W[8], Y[2]); |
215 | 0 | Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48])); |
216 | 0 | _mm_storeu_si128((__m128i *)&W[12], Y[3]); |
217 | | |
218 | | /* 2. Initialize working variables. */ |
219 | 0 | memcpy(S, state, 32); |
220 | | |
221 | | /* 3. Mix. */ |
222 | 0 | for (i = 0; i < 64; i += 16) { |
223 | 0 | RNDr(S, W, 0, i); |
224 | 0 | RNDr(S, W, 1, i); |
225 | 0 | RNDr(S, W, 2, i); |
226 | 0 | RNDr(S, W, 3, i); |
227 | 0 | RNDr(S, W, 4, i); |
228 | 0 | RNDr(S, W, 5, i); |
229 | 0 | RNDr(S, W, 6, i); |
230 | 0 | RNDr(S, W, 7, i); |
231 | 0 | RNDr(S, W, 8, i); |
232 | 0 | RNDr(S, W, 9, i); |
233 | 0 | RNDr(S, W, 10, i); |
234 | 0 | RNDr(S, W, 11, i); |
235 | 0 | RNDr(S, W, 12, i); |
236 | 0 | RNDr(S, W, 13, i); |
237 | 0 | RNDr(S, W, 14, i); |
238 | 0 | RNDr(S, W, 15, i); |
239 | | |
240 | 0 | if (i == 48) |
241 | 0 | break; |
242 | 0 | Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]); |
243 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]); |
244 | 0 | Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]); |
245 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]); |
246 | 0 | Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]); |
247 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]); |
248 | 0 | Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]); |
249 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]); |
250 | 0 | } |
251 | | |
252 | | /* 4. Mix local working variables into global state. */ |
253 | 0 | for (i = 0; i < 8; i++) |
254 | 0 | state[i] += S[i]; |
255 | 0 | } |
256 | | |
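/*
 * Minimal caller sketch (illustrative only, not part of the original file):
 * it assumes SSE2 availability has already been established and shows the
 * caller wiping the W and S scratch buffers afterwards, as the contract
 * above requires.  ZEND_SECURE_ZERO is assumed to be available from the
 * Zend headers; a plain memset could be optimized away.
 */
#if 0
static void
example_transform_one_block(uint32_t state[8], const uint8_t block[64])
{
	uint32_t W[64];
	uint32_t S[8];

	SHA256_Transform_sse2(state, block, W, S);

	/* Scratch space may hold sensitive data; wipe it. */
	ZEND_SECURE_ZERO(W, sizeof(W));
	ZEND_SECURE_ZERO(S, sizeof(S));
}
#endif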
257 | | #endif |