/src/php-src/ext/hash/hash_sha_sse2.c
Line | Count | Source |
1 | | /*- |
2 | | * Copyright 2021 Tarsnap Backup Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions |
7 | | * are met: |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * 2. Redistributions in binary form must reproduce the above copyright |
11 | | * notice, this list of conditions and the following disclaimer in the |
12 | | * documentation and/or other materials provided with the distribution. |
13 | | * |
14 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
15 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
17 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
18 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
19 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
20 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
21 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
22 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
23 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
24 | | * SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | #include "php_hash.h" |
28 | | #include "php_hash_sha.h" |
29 | | |
30 | | #ifdef __SSE2__ |
31 | | # include <emmintrin.h> |
32 | | |
33 | | /* Original implementation from libcperciva follows. |
34 | | * |
35 | | * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility. |
36 | | */ |
37 | | |
38 | | /** |
39 | | * mm_bswap_epi32(a): |
40 | | * Byte-swap each 32-bit word. |
41 | | */ |
42 | | static inline __m128i |
43 | | mm_bswap_epi32(__m128i a) |
44 | 0 | { |
45 | | |
46 | | /* Swap bytes in each 16-bit word. */ |
47 | 0 | a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8)); |
48 | | |
49 | | /* Swap the two 16-bit words within each 32-bit word. */ |
50 | 0 | a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
51 | 0 | a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
52 | | |
53 | 0 | return (a); |
54 | 0 | } |
55 | | |
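/*
 * Reference note (illustrative sketch, not part of the original file): the
 * shift/shuffle sequence above applies the usual scalar 32-bit byte swap to
 * each of the four lanes, i.e. per lane it computes the equivalent of the
 * hypothetical helper below (not compiled).
 */
#if 0
static inline uint32_t
bswap32_ref(uint32_t x)
{

	return ((x >> 24) | ((x >> 8) & 0x0000ff00) |
	    ((x << 8) & 0x00ff0000) | (x << 24));
}
#endif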
56 | | /* SHA256 round constants. */ |
57 | | static const uint32_t Krnd[64] = { |
58 | | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
59 | | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
60 | | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
61 | | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, |
62 | | 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, |
63 | | 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, |
64 | | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, |
65 | | 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, |
66 | | 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, |
67 | | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, |
68 | | 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, |
69 | | 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, |
70 | | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, |
71 | | 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, |
72 | | 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, |
73 | | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
74 | | }; |
75 | | |
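/*
 * These are the standard FIPS 180-4 round constants: the first 32 bits of
 * the fractional parts of the cube roots of the first 64 primes.  The
 * stand-alone program below is an illustrative sketch which regenerates the
 * table (not compiled, not part of the original file).
 */
#if 0
#include <inttypes.h>
#include <math.h>
#include <stdio.h>

int
main(void)
{
	unsigned int n, d;
	int i, prime;
	double f;

	for (i = 0, n = 2; i < 64; n++) {
		/* Trial-division primality test. */
		for (prime = 1, d = 2; d * d <= n; d++)
			if (n % d == 0)
				prime = 0;
		if (!prime)
			continue;

		/* Fractional part of the cube root, truncated to 32 bits. */
		f = cbrt((double)n);
		f -= floor(f);
		printf("0x%08" PRIx32 "%s", (uint32_t)(f * 4294967296.0),
		    (++i % 4) ? ", " : ",\n");
	}

	return (0);
}
#endif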
76 | | /* Elementary functions used by SHA256 */ |
77 | 0 | #define Ch(x, y, z) ((x & (y ^ z)) ^ z) |
78 | 0 | #define Maj(x, y, z) ((x & (y | z)) | (y & z)) |
79 | 0 | #define ROTR(x, n) ((x >> n) | (x << (32 - n))) |
80 | 0 | #define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) |
81 | 0 | #define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) |
82 | | |
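/*
 * The Ch and Maj forms above are operation-reduced rewrites of the textbook
 * FIPS 180-4 definitions Ch(x, y, z) = (x & y) ^ (~x & z) and
 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z); both pairs agree bit for bit.
 * The exhaustive check below is an illustrative sketch only (not compiled,
 * not part of the original file).
 */
#if 0
#include <assert.h>

static void
check_ch_maj(void)
{
	uint32_t x, y, z;

	/* Ch and Maj are bitwise, so checking single-bit inputs suffices. */
	for (x = 0; x <= 1; x++)
		for (y = 0; y <= 1; y++)
			for (z = 0; z <= 1; z++) {
				assert(Ch(x, y, z) == ((x & y) ^ (~x & z)));
				assert(Maj(x, y, z) ==
				    ((x & y) ^ (x & z) ^ (y & z)));
			}
}
#endif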
83 | | /* SHA256 round function */ |
84 | | #define RND(a, b, c, d, e, f, g, h, k) \ |
85 | 0 | h += S1(e) + Ch(e, f, g) + k; \ |
86 | 0 | d += h; \ |
87 | 0 | h += S0(a) + Maj(a, b, c) |
88 | | |
89 | | /* Adjusted round function for rotating state */ |
90 | | #define RNDr(S, W, i, ii) \ |
91 | 0 | RND(S[(64 - i) % 8], S[(65 - i) % 8], \ |
92 | 0 | S[(66 - i) % 8], S[(67 - i) % 8], \ |
93 | 0 | S[(68 - i) % 8], S[(69 - i) % 8], \ |
94 | 0 | S[(70 - i) % 8], S[(71 - i) % 8], \ |
95 | 0 | W[i + ii] + Krnd[i + ii]) |
96 | | |
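/*
 * Note on RNDr (not from the original source): rather than shuffling the
 * eight working variables after every round (h = g; g = f; ...), RNDr
 * rotates which element of S[] plays each role.  Expanding the index
 * arithmetic for the first two rounds of a block:
 *
 *   i = 0:  RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], ...)
 *   i = 1:  RND(S[7], S[0], S[1], S[2], S[3], S[4], S[5], S[6], ...)
 *
 * so the value that served as a in one round serves as b in the next, and
 * so on around the ring, while RND itself updates only d and h in place.
 */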
97 | | /* Message schedule computation */ |
98 | 0 | #define SHR32(x, n) (_mm_srli_epi32(x, n)) |
99 | 0 | #define ROTR32(x, n) (_mm_or_si128(SHR32(x, n), _mm_slli_epi32(x, (32-n)))) |
100 | 0 | #define s0_128(x) _mm_xor_si128(_mm_xor_si128( \ |
101 | 0 | ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3)) |
102 | | |
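/*
 * For reference, the scalar FIPS 180-4 message-schedule functions which
 * s0_128 above and the s1_128_* helpers below vectorize are (illustrative
 * sketch; the names s0_scalar/s1_scalar are not part of the original file):
 */
#if 0
#define s0_scalar(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1_scalar(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))
#endif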
103 | | static inline __m128i |
104 | | s1_128_high(__m128i a) |
105 | 0 | { |
106 | 0 | __m128i b; |
107 | 0 | __m128i c; |
108 | | |
109 | | /* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */ |
110 | 0 | b = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0)); |
111 | 0 | c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19)); |
112 | | |
113 | | /* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */ |
114 | 0 | c = _mm_xor_si128(c, _mm_srli_epi32(b, 10)); |
115 | | |
116 | | /* Shuffle good data back and zero unwanted lanes. */ |
117 | 0 | c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0)); |
118 | 0 | c = _mm_slli_si128(c, 8); |
119 | | |
120 | 0 | return (c); |
121 | 0 | } |
122 | | |
123 | | static inline __m128i |
124 | | s1_128_low(__m128i a) |
125 | 0 | { |
126 | 0 | __m128i b; |
127 | 0 | __m128i c; |
128 | | |
129 | | /* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */ |
130 | 0 | b = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2)); |
131 | 0 | c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19)); |
132 | | |
133 | | /* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */ |
134 | 0 | c = _mm_xor_si128(c, _mm_srli_epi32(b, 10)); |
135 | | |
136 | | /* Shuffle good data back and zero unwanted lanes. */ |
137 | 0 | c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0)); |
138 | 0 | c = _mm_srli_si128(c, 8); |
139 | | |
140 | 0 | return (c); |
141 | 0 | } |
142 | | |
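/*
 * Note (not from the original source): s1_128_high and s1_128_low together
 * evaluate the scalar sigma_1 function (s1_scalar above) on two words at a
 * time.  SSE2 has no packed 32-bit rotate, so each word is duplicated into
 * both halves of a 64-bit lane and rotated via a 64-bit logical shift.  The
 * work is split into a "low" and a "high" half because, within one MSG4
 * step below, the sigma_1 inputs for the two upper output words (W[j + 2]
 * and W[j + 3]) are W[j] and W[j + 1], which only become available once the
 * lower half of the new schedule block has been computed.
 */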
143 | | /** |
144 | | * SPAN_ONE_THREE(a, b): |
145 | | * Combine the upper three words of ${a} with the lowest word of ${b}. This |
146 | | * could also be thought of as returning bits [159:32] of the 256-bit value |
147 | | * consisting of (b[127:0] a[127:0]). In other words, set: |
148 | | * dst[31:0] := a[63:32] |
149 | | * dst[63:32] := a[95:64] |
150 | | * dst[95:64] := a[127:96] |
151 | | * dst[127:96] := b[31:0] |
152 | | */ |
153 | 0 | #define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128( \ |
154 | 0 | _mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))), \ |
155 | 0 | _MM_SHUFFLE(0, 3, 2, 1))) |
156 | | |
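/*
 * Note (not from the original source): on SSSE3 and later the same lane
 * combination could be done with a single _mm_alignr_epi8(b, a, 4); the
 * _mm_move_ss/_mm_shuffle_epi32 pair above is the SSE2-only substitute.
 */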
157 | | /** |
158 | | * MSG4(X0, X1, X2, X3): |
159 | | * Calculate the next four values of the message schedule. If we define |
160 | | * ${W[j]} as the first unknown value in the message schedule, then the input |
161 | | * arguments are: |
162 | | * X0 = W[j - 16] : W[j - 13] |
163 | | * X1 = W[j - 12] : W[j - 9] |
164 | | * X2 = W[j - 8] : W[j - 5] |
165 | | * X3 = W[j - 4] : W[j - 1] |
166 | | * This function therefore calculates: |
167 | | * X4 = W[j + 0] : W[j + 3] |
168 | | */ |
169 | | static inline __m128i |
170 | | MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3) |
171 | 0 | { |
172 | 0 | __m128i X4; |
173 | 0 | __m128i Xj_minus_seven, Xj_minus_fifteen; |
174 | | |
175 | | /* Set up variables which span X values. */ |
176 | 0 | Xj_minus_seven = SPAN_ONE_THREE(X2, X3); |
177 | 0 | Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1); |
178 | | |
179 | | /* Begin computing X4. */ |
180 | 0 | X4 = _mm_add_epi32(X0, Xj_minus_seven); |
181 | 0 | X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen)); |
182 | | |
183 | | /* First half of s1. */ |
184 | 0 | X4 = _mm_add_epi32(X4, s1_128_low(X3)); |
185 | | |
186 | | /* Second half of s1; this depends on the above value of X4. */ |
187 | 0 | X4 = _mm_add_epi32(X4, s1_128_high(X4)); |
188 | | |
189 | 0 | return (X4); |
190 | 0 | } |
191 | | |
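/*
 * MSG4 is a four-wide vectorization of the scalar FIPS 180-4 message
 * schedule recurrence
 *
 *   W[j] = s1(W[j - 2]) + W[j - 7] + s0(W[j - 15]) + W[j - 16]
 *
 * A scalar sketch of one MSG4 step follows (illustrative only, not part of
 * the original file; it assumes the s0_scalar/s1_scalar definitions
 * sketched earlier).
 */
#if 0
static void
MSG4_ref(uint32_t W[64], int j)
{
	int k;

	for (k = j; k < j + 4; k++)
		W[k] = s1_scalar(W[k - 2]) + W[k - 7] +
		    s0_scalar(W[k - 15]) + W[k - 16];
}
#endif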
192 | | /** |
193 | | * SHA256_Transform_sse2(state, block, W, S): |
194 | | * Compute the SHA256 block compression function, transforming ${state} using |
195 | | * the data in ${block}. This implementation uses x86 SSE2 instructions, and |
196 | | * should only be used when __SSE2__ is defined (i.e. the compiler is |
197 | | * targeting SSE2). The arrays W and S may be filled with sensitive data, |
198 | | * and should be cleared by the caller. |
199 | | */ |
200 | | void |
201 | | SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8], |
202 | | const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64], |
203 | | uint32_t S[PHP_STATIC_RESTRICT 8]) |
204 | 0 | { |
205 | 0 | __m128i Y[4]; |
206 | 0 | int i; |
207 | | |
208 | | /* 1. Prepare the first part of the message schedule W. */ |
209 | 0 | Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0])); |
210 | 0 | _mm_storeu_si128((__m128i *)&W[0], Y[0]); |
211 | 0 | Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16])); |
212 | 0 | _mm_storeu_si128((__m128i *)&W[4], Y[1]); |
213 | 0 | Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32])); |
214 | 0 | _mm_storeu_si128((__m128i *)&W[8], Y[2]); |
215 | 0 | Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48])); |
216 | 0 | _mm_storeu_si128((__m128i *)&W[12], Y[3]); |
217 | | |
218 | | /* 2. Initialize working variables. */ |
219 | 0 | memcpy(S, state, 32); |
220 | | |
221 | | /* 3. Mix. */ |
222 | 0 | for (i = 0; i < 64; i += 16) { |
223 | 0 | RNDr(S, W, 0, i); |
224 | 0 | RNDr(S, W, 1, i); |
225 | 0 | RNDr(S, W, 2, i); |
226 | 0 | RNDr(S, W, 3, i); |
227 | 0 | RNDr(S, W, 4, i); |
228 | 0 | RNDr(S, W, 5, i); |
229 | 0 | RNDr(S, W, 6, i); |
230 | 0 | RNDr(S, W, 7, i); |
231 | 0 | RNDr(S, W, 8, i); |
232 | 0 | RNDr(S, W, 9, i); |
233 | 0 | RNDr(S, W, 10, i); |
234 | 0 | RNDr(S, W, 11, i); |
235 | 0 | RNDr(S, W, 12, i); |
236 | 0 | RNDr(S, W, 13, i); |
237 | 0 | RNDr(S, W, 14, i); |
238 | 0 | RNDr(S, W, 15, i); |
239 | | |
240 | 0 | if (i == 48) |
241 | 0 | break; |
242 | 0 | Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]); |
243 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]); |
244 | 0 | Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]); |
245 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]); |
246 | 0 | Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]); |
247 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]); |
248 | 0 | Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]); |
249 | 0 | _mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]); |
250 | 0 | } |
251 | | |
252 | | /* 4. Mix local working variables into global state. */ |
253 | 0 | for (i = 0; i < 8; i++) |
254 | 0 | state[i] += S[i]; |
255 | 0 | } |
256 | | |
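/*
 * Minimal caller sketch (illustrative only, not part of the original file):
 * it assumes SSE2 availability has already been established and shows the
 * caller wiping the W and S scratch buffers afterwards, as the contract
 * above requires.  ZEND_SECURE_ZERO is assumed to be available from the
 * Zend headers; a plain memset could be optimized away.
 */
#if 0
static void
example_transform_one_block(uint32_t state[8], const uint8_t block[64])
{
	uint32_t W[64];
	uint32_t S[8];

	SHA256_Transform_sse2(state, block, W, S);

	/* Scratch space may hold sensitive data; wipe it. */
	ZEND_SECURE_ZERO(W, sizeof(W));
	ZEND_SECURE_ZERO(S, sizeof(S));
}
#endif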
257 | | #endif |