/src/php-src/ext/hash/hash_sha_ni.c
Line | Count | Source |
1 | | /*- |
2 | | * Copyright 2018 Tarsnap Backup Inc. |
3 | | * All rights reserved. |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions |
7 | | * are met: |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * 2. Redistributions in binary form must reproduce the above copyright |
11 | | * notice, this list of conditions and the following disclaimer in the |
12 | | * documentation and/or other materials provided with the distribution. |
13 | | * |
14 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
15 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
17 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
18 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
19 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
20 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
21 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
22 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
23 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
24 | | * SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | #include "php_hash.h" |
28 | | #include "php_hash_sha.h" |
29 | | |
30 | | #if defined(PHP_HASH_INTRIN_SHA_NATIVE) || defined(PHP_HASH_INTRIN_SHA_RESOLVER) |
31 | | |
32 | | # include <immintrin.h> |
33 | | |
/*
 * Forward declarations used only when PHP_HASH_INTRIN_SHA_RESOLVER and
 * HAVE_FUNC_ATTRIBUTE_TARGET are both defined: they attach target("ssse3")
 * to be32dec_128 and target("ssse3,sha") to SHA256_Transform_shani, so the
 * definitions further down are compiled with those instruction sets enabled.
 * NOTE(review): presumably the resolver build does not enable SSSE3/SHA for
 * the whole translation unit -- confirm against the build configuration.
 */
# if defined(PHP_HASH_INTRIN_SHA_RESOLVER) && defined(HAVE_FUNC_ATTRIBUTE_TARGET)
static __m128i be32dec_128(const uint8_t * src) __attribute__((target("ssse3")));
void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64]) __attribute__((target("ssse3,sha")));
# endif
38 | | |
39 | | /* Original implementation from libcperciva follows. |
40 | | * |
41 | | * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility. |
42 | | */ |
43 | | |
44 | | /** |
45 | | * This code uses intrinsics from the following feature sets: |
46 | | * SHANI: _mm_sha256msg1_epu32, _mm_sha256msg2_epu32, _mm_sha256rnds2_epu32 |
47 | | * SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8 |
48 | | * SSE2: Everything else |
49 | | * |
50 | | * The SSSE3 intrinsics could be avoided at a slight cost by using a few SSE2 |
51 | | * instructions in their place; we have not done this since to our knowledge |
52 | | * there are presently no CPUs which support the SHANI instruction set but do |
53 | | * not support SSSE3. |
54 | | */ |
55 | | |
/*
 * Read 16 bytes from src with an unaligned load and return them as four
 * 32-bit lanes, each holding the corresponding 4 input bytes decoded as a
 * big-endian word.
 */
static __m128i
be32dec_128(const uint8_t * src)
{
	/* Shuffle control, lowest byte first: reverses the bytes of each lane. */
	const __m128i bswap32_ctl = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4,
	    11, 10, 9, 8, 15, 14, 13, 12);
	const __m128i raw = _mm_loadu_si128((const __m128i *)src);

	return _mm_shuffle_epi8(raw, bswap32_ctl);
}
70 | | |
/*
 * Reinterpret an unsigned 32-bit constant as the int32_t with the same bit
 * pattern.  Constants below 0x80000000 pass through unchanged; larger ones
 * are computed as -(0xffffffff - a) - 1, which stays inside the int32_t range
 * at every step.  The argument must be an unsuffixed integer constant, as
 * UINT32_C/INT32_C require.
 */
#define I32(a) ((UINT32_C(a) < UINT32_C(0x80000000)) ? \
	(int32_t)INT32_C(a) : \
	-(int32_t)(UINT32_C(0xffffffff) - UINT32_C(a)) - 1)

/*
 * Build a vector from four unsigned 32-bit constants, passed to
 * _mm_set_epi32 in the order given (so d ends up in the lowest lane).
 */
#define IMM4(a, b, c, d) \
	_mm_set_epi32(I32(a), I32(b), I32(c), I32(d))
77 | | |
/*
 * Run four rounds of SHA256 on the state pair S[0]/S[1], consuming the four
 * message-schedule words in W and the round constants K0..K3 (which must be
 * plain integer constants, since they are expanded through IMM4).
 */
#define RND4(S, W, K0, K1, K2, K3) do { \
	/* Schedule words plus round constants; K0 sits in the lowest lane. */ \
	__m128i rnd4_wk = _mm_add_epi32(W, IMM4(K3, K2, K1, K0)); \
	\
	/* Two rounds driven by the low two words; the result lands in S[1]. */ \
	S[1] = _mm_sha256rnds2_epu32(S[1], S[0], rnd4_wk); \
	\
	/* Two more rounds driven by the upper two words, moved down 8 bytes. */ \
	S[0] = _mm_sha256rnds2_epu32(S[0], S[1], _mm_srli_si128(rnd4_wk, 8)); \
} while (0)
92 | | |
/*
 * Compute the ith set of four message-schedule words.  W is an array of four
 * vectors; slot (i + 0) % 4 is both the first input and the destination, and
 * the other three slots are only read.
 */
#define MSG4(W, i) do { \
	W[(i + 0) % 4] = _mm_sha256msg2_epu32( \
	    _mm_add_epi32( \
		_mm_sha256msg1_epu32(W[(i + 0) % 4], W[(i + 1) % 4]), \
		_mm_alignr_epi8(W[(i + 3) % 4], W[(i + 2) % 4], 4)), \
	    W[(i + 3) % 4]); \
} while (0)
100 | | |
/*
 * Perform rounds 4*i .. 4*i+3 of SHA256 using schedule slot W[i % 4], then,
 * for i below 12, refill that same slot with schedule set i + 4.  The rounds
 * must run first because MSG4 overwrites the slot RND4 has just consumed.
 */
#define RNDMSG(S, W, i, K0, K1, K2, K3) do { \
	RND4(S, W[i % 4], K0, K1, K2, K3); \
	if (i < 12) { \
		MSG4(W, i + 4); \
	} \
} while (0)
107 | | |
108 | | /** |
109 | | * SHA256_Transform_shani(state, block): |
110 | | * Compute the SHA256 block compression function, transforming ${state} using |
111 | | * the data in ${block}. This implementation uses x86 SHANI and SSSE3 |
112 | | * instructions, and should only be used if CPUSUPPORT_X86_SHANI and _SSSE3 |
113 | | * are defined and cpusupport_x86_shani() and _ssse3() return nonzero. |
114 | | */ |
115 | | void |
116 | | SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], |
117 | | const uint8_t block[PHP_STATIC_RESTRICT 64]) |
118 | 43.2k | { |
119 | 43.2k | __m128i S3210, S7654; |
120 | 43.2k | __m128i S0123, S4567; |
121 | 43.2k | __m128i S0145, S2367; |
122 | 43.2k | __m128i W[4]; |
123 | 43.2k | __m128i S[2]; |
124 | | |
125 | | /* Load state. */ |
126 | 43.2k | S3210 = _mm_loadu_si128((const __m128i *)&state[0]); |
127 | 43.2k | S7654 = _mm_loadu_si128((const __m128i *)&state[4]); |
128 | | |
129 | | /* Shuffle the 8 32-bit values into the order we need them. */ |
130 | 43.2k | S0123 = _mm_shuffle_epi32(S3210, 0x1B); |
131 | 43.2k | S4567 = _mm_shuffle_epi32(S7654, 0x1B); |
132 | 43.2k | S0145 = _mm_unpackhi_epi64(S4567, S0123); |
133 | 43.2k | S2367 = _mm_unpacklo_epi64(S4567, S0123); |
134 | | |
135 | | /* Load input block; this is the start of the message schedule. */ |
136 | 43.2k | W[0] = be32dec_128(&block[0]); |
137 | 43.2k | W[1] = be32dec_128(&block[16]); |
138 | 43.2k | W[2] = be32dec_128(&block[32]); |
139 | 43.2k | W[3] = be32dec_128(&block[48]); |
140 | | |
141 | | /* Initialize working variables. */ |
142 | 43.2k | S[0] = S0145; |
143 | 43.2k | S[1] = S2367; |
144 | | |
145 | | /* Perform 64 rounds, 4 at a time. */ |
146 | 43.2k | RNDMSG(S, W, 0, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5); |
147 | 43.2k | RNDMSG(S, W, 1, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5); |
148 | 43.2k | RNDMSG(S, W, 2, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3); |
149 | 43.2k | RNDMSG(S, W, 3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174); |
150 | 43.2k | RNDMSG(S, W, 4, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc); |
151 | 43.2k | RNDMSG(S, W, 5, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da); |
152 | 43.2k | RNDMSG(S, W, 6, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7); |
153 | 43.2k | RNDMSG(S, W, 7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967); |
154 | 43.2k | RNDMSG(S, W, 8, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13); |
155 | 43.2k | RNDMSG(S, W, 9, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85); |
156 | 43.2k | RNDMSG(S, W, 10, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3); |
157 | 43.2k | RNDMSG(S, W, 11, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070); |
158 | 43.2k | RNDMSG(S, W, 12, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5); |
159 | 43.2k | RNDMSG(S, W, 13, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3); |
160 | 43.2k | RNDMSG(S, W, 14, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208); |
161 | 43.2k | RNDMSG(S, W, 15, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2); |
162 | | |
163 | | /* Mix local working variables into global state. */ |
164 | 43.2k | S0145 = _mm_add_epi32(S0145, S[0]); |
165 | 43.2k | S2367 = _mm_add_epi32(S2367, S[1]); |
166 | | |
167 | | /* Shuffle state back to the original word order and store. */ |
168 | 43.2k | S0123 = _mm_unpackhi_epi64(S2367, S0145); |
169 | 43.2k | S4567 = _mm_unpacklo_epi64(S2367, S0145); |
170 | 43.2k | S3210 = _mm_shuffle_epi32(S0123, 0x1B); |
171 | 43.2k | S7654 = _mm_shuffle_epi32(S4567, 0x1B); |
172 | 43.2k | _mm_storeu_si128((__m128i *)&state[0], S3210); |
173 | 43.2k | _mm_storeu_si128((__m128i *)&state[4], S7654); |
174 | 43.2k | } |
175 | | |
176 | | #endif |