/src/botan/src/lib/hash/sha1/sha1_sse2/sha1_sse2.cpp
Line | Count | Source |
1 | | /* |
2 | | * SHA-1 using SSE2 |
3 | | * Based on public domain code by Dean Gaudet |
4 | | * (http://arctic.org/~dean/crypto/sha1.html) |
5 | | * (C) 2009-2011,2023 Jack Lloyd |
6 | | * |
7 | | * Botan is released under the Simplified BSD License (see license.txt) |
8 | | */ |
9 | | |
10 | | #include <botan/internal/sha1.h> |
11 | | |
12 | | #include <botan/internal/bit_ops.h> |
13 | | #include <botan/internal/rotate.h> |
14 | | #include <botan/internal/simd_32.h> |
15 | | #include <botan/internal/stl_util.h> |
16 | | #include <emmintrin.h> |
17 | | |
18 | | namespace Botan { |
19 | | |
20 | | namespace SHA1_SSE2_F { |
21 | | |
22 | | namespace { |
23 | | |
24 | | /* |
| | Message schedule expansion, four words at a time. |
| | |
25 | | For each multiple of 4, t, we want to calculate this: |
26 | | |
27 | | W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); |
28 | | W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1); |
29 | | W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1); |
30 | | W[t+3] = rol(W[t] ^ W[t-5] ^ W[t-11] ^ W[t-13], 1); |
31 | | |
32 | | we'll actually calculate this: |
| | (W[t+3] depends on W[t], which is produced in the same vector, so it is |
| | first computed with 0 in place of W[t] and patched afterwards) |
33 | | |
34 | | W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1); |
35 | | W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1); |
36 | | W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1); |
37 | | W[t+3] = rol( 0 ^ W[t-5] ^ W[t-11] ^ W[t-13], 1); |
38 | | W[t+3] ^= rol(W[t+0], 1); |
39 | | |
40 | | the parameters are: |
41 | | |
42 | | XW0 = W[t-16]..W[t-13] (passed by reference, overwritten) |
43 | | XW1 = W[t-12]..W[t- 9] |
44 | | XW2 = W[t- 8]..W[t- 5] |
45 | | XW3 = W[t- 4]..W[t- 1] |
| | K = constant vector added to the returned words (callers pass a round constant) |
46 | | |
47 | | and on output: |
48 | | return value = W[t]..W[t+3] + K |
49 | | XW0 = W[t]..W[t+3] |
50 | | */ |
51 | 3.94M | BOTAN_FORCE_INLINE SIMD_4x32 prep(SIMD_4x32& XW0, SIMD_4x32 XW1, SIMD_4x32 XW2, SIMD_4x32 XW3, SIMD_4x32 K) { |
52 | 3.94M | SIMD_4x32 T0 = XW0; |
53 | | /* XW3 shifted by one element; per the scheme above this should be (W[t-3], W[t-2], W[t-1], 0) - shift_elems_right is defined in simd_32.h, not shown here */ |
54 | 3.94M | SIMD_4x32 T2 = XW3.shift_elems_right<1>(); |
55 | | /* swap the 64-bit halves of XW0 so that its high half (W[t-14], W[t-13]) ends up in the low 64 bits */ |
56 | 3.94M | SIMD_4x32 T1 = SIMD_4x32(_mm_shuffle_epi32(XW0.raw(), _MM_SHUFFLE(1, 0, 3, 2))); |
57 | | /* replace the high 64 bits of T1 with the low 64 bits of XW1: T1 = (W[t-14], W[t-13], W[t-12], W[t-11]) */ |
58 | 3.94M | T1 = SIMD_4x32(_mm_unpacklo_epi64(T1.raw(), XW1.raw())); |
59 | | |
60 | 3.94M | T0 ^= T1; |
61 | 3.94M | T2 ^= XW2; |
62 | 3.94M | T0 ^= T2; |
63 | | /* unrotated W[t]..W[t+2] in T0 ... still need W[t+3]: its lane was built with 0 in place of W[t]. Below, the unrotated W[t] is moved to the last lane (presumably what shift_elems_left<3> does - confirm in simd_32.h) and rotated by 2, which equals rol(W[t], 1) */ |
64 | | |
65 | 3.94M | T2 = T0.shift_elems_left<3>(); |
66 | 3.94M | T0 = T0.rotl<1>(); |
67 | 3.94M | T2 = T2.rotl<2>(); |
68 | | |
69 | 3.94M | T0 ^= T2; /* T0 now has W[t+3] as well, i.e. the complete W[t]..W[t+3] */ |
70 | | |
71 | 3.94M | XW0 = T0; |
72 | 3.94M | return T0 + K; |
73 | 3.94M | } |
74 | | |
75 | | /* |
76 | | * SHA-1 F1 Function |
| | * |
| | * One scalar round step: adds choose(B, C, D) + msg + rotl<5>(A) into E and |
| | * then replaces B with rotl<30>(B). B and E are updated through their |
| | * references; A, C and D are only read. |
| | * |
| | * In this file msg is always an element of a schedule vector that already |
| | * has the round constant added (W + K), and F1 is used for the first 20 |
| | * rounds of each block. |
| | * choose() is declared in a header not shown here; presumably the SHA-1 |
| | * "Ch" select function - confirm. |
77 | | */ |
78 | 4.93M | inline void F1(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) { |
79 | 4.93M | E += choose(B, C, D) + msg + rotl<5>(A); |
80 | 4.93M | B = rotl<30>(B); |
81 | 4.93M | } |
82 | | |
83 | | /* |
84 | | * SHA-1 F2 Function |
| | * |
| | * One scalar round step using the parity function: adds |
| | * (B ^ C ^ D) + msg + rotl<5>(A) into E and then replaces B with |
| | * rotl<30>(B). B and E are updated through their references; A, C and D |
| | * are only read. |
| | * |
| | * In this file msg is always an element of a schedule vector that already |
| | * has the round constant added (W + K), and F2 is used for rounds 20-39 |
| | * of each block. |
85 | | */ |
86 | 4.93M | inline void F2(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) { |
87 | 4.93M | E += (B ^ C ^ D) + msg + rotl<5>(A); |
88 | 4.93M | B = rotl<30>(B); |
89 | 4.93M | } |
90 | | |
91 | | /* |
92 | | * SHA-1 F3 Function |
| | * |
| | * One scalar round step: adds majority(B, C, D) + msg + rotl<5>(A) into E |
| | * and then replaces B with rotl<30>(B). B and E are updated through their |
| | * references; A, C and D are only read. |
| | * |
| | * In this file msg is always an element of a schedule vector that already |
| | * has the round constant added (W + K), and F3 is used for rounds 40-59 |
| | * of each block. |
| | * majority() is declared in a header not shown here; presumably the |
| | * bitwise majority-of-three function - confirm. |
93 | | */ |
94 | 4.93M | inline void F3(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) { |
95 | 4.93M | E += majority(B, C, D) + msg + rotl<5>(A); |
96 | 4.93M | B = rotl<30>(B); |
97 | 4.93M | } |
98 | | |
99 | | /* |
100 | | * SHA-1 F4 Function |
| | * |
| | * One scalar round step using the parity function: adds |
| | * (B ^ C ^ D) + msg + rotl<5>(A) into E and then replaces B with |
| | * rotl<30>(B). B and E are updated through their references; A, C and D |
| | * are only read. The body is the same expression as F2; only the rounds |
| | * it is used for (and hence the constant folded into msg) differ. |
| | * |
| | * In this file msg is always an element of a schedule vector that already |
| | * has the round constant added (W + K), and F4 is used for rounds 60-79 |
| | * of each block. |
101 | | */ |
102 | 4.93M | inline void F4(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) { |
103 | 4.93M | E += (B ^ C ^ D) + msg + rotl<5>(A); |
104 | 4.93M | B = rotl<30>(B); |
105 | 4.93M | } |
106 | | |
107 | | } // namespace |
108 | | |
109 | | } // namespace SHA1_SSE2_F |
110 | | |
111 | | /* |
112 | | * SHA-1 Compression Function using SSE for message expansion |
| | * |
| | * Takes `blocks` consecutive chunks of block_bytes bytes from `input` and |
| | * folds each one into digest[0..4]. The working variables A..E start as a |
| | * copy of the digest; after every block they are added into the digest and |
| | * reloaded from it. |
| | * |
| | * Only the message schedule is vectorised. W0..W3 start as the 16 words of |
| | * the block (read with load_be, presumably big-endian loads - defined in |
| | * simd_32.h, not shown here) and P0..P3 hold those words plus K00_19. The |
| | * 80 rounds themselves are scalar: each group of four rounds spills one P |
| | * vector to PT with store_le and passes PT[0..3] to F1..F4. |
| | * |
| | * After a group has consumed a P vector, prep() refills that P/W pair with |
| | * the four schedule words needed 16 rounds later. The constant handed to |
| | * prep() is therefore the one for the rounds where its result is consumed: |
| | * K00_19 once (words 16-19), then K20_39, K40_59 and K60_79 five times |
| | * each. The last four groups (rounds 64-79) need no further prep() call. |
| | * |
| | * Rounds 0-19 use F1, 20-39 F2, 40-59 F3 and 60-79 F4. Instead of moving |
| | * values between A..E after each round, the call sites rotate the argument |
| | * order (A,B,C,D,E -> E,A,B,C,D -> D,E,A,B,C -> ...); after 80 rounds the |
| | * rotation is back at its starting position. |
| | * |
| | * NOTE(review): there is no explicit length check here; the function |
| | * relies on in.take(block_bytes) for inputs shorter than |
| | * blocks * block_bytes. BufferSlicer's behaviour in that case is not |
| | * visible in this file - confirm. |
113 | | */ |
114 | | //static |
115 | 58.1k | BOTAN_FUNC_ISA("sse2") void SHA_1::sse2_compress_n(digest_type& digest, std::span<const uint8_t> input, size_t blocks) { |
116 | 58.1k | using namespace SHA1_SSE2_F; |
117 | | |
118 | 58.1k | const SIMD_4x32 K00_19 = SIMD_4x32::splat(0x5A827999); |
119 | 58.1k | const SIMD_4x32 K20_39 = SIMD_4x32::splat(0x6ED9EBA1); |
120 | 58.1k | const SIMD_4x32 K40_59 = SIMD_4x32::splat(0x8F1BBCDC); |
121 | 58.1k | const SIMD_4x32 K60_79 = SIMD_4x32::splat(0xCA62C1D6); |
122 | | |
123 | 58.1k | uint32_t A = digest[0], B = digest[1], C = digest[2], D = digest[3], E = digest[4]; |
124 | | |
125 | 58.1k | BufferSlicer in(input); |
126 | | |
127 | 304k | for(size_t i = 0; i != blocks; ++i) { |
128 | 246k | uint32_t PT[4]; |
129 | | |
130 | 246k | const auto block = in.take(block_bytes); |
131 | | |
132 | 246k | SIMD_4x32 W0 = SIMD_4x32::load_be(&block[0]); |
133 | 246k | SIMD_4x32 W1 = SIMD_4x32::load_be(&block[16]); |
134 | 246k | SIMD_4x32 W2 = SIMD_4x32::load_be(&block[32]); |
135 | 246k | SIMD_4x32 W3 = SIMD_4x32::load_be(&block[48]); |
136 | | |
137 | 246k | SIMD_4x32 P0 = W0 + K00_19; |
138 | 246k | SIMD_4x32 P1 = W1 + K00_19; |
139 | 246k | SIMD_4x32 P2 = W2 + K00_19; |
140 | 246k | SIMD_4x32 P3 = W3 + K00_19; |
141 | | |
142 | 246k | SIMD_4x32(P0).store_le(PT); |
143 | 246k | F1(A, B, C, D, E, PT[0]); |
144 | 246k | F1(E, A, B, C, D, PT[1]); |
145 | 246k | F1(D, E, A, B, C, PT[2]); |
146 | 246k | F1(C, D, E, A, B, PT[3]); |
147 | 246k | P0 = prep(W0, W1, W2, W3, K00_19); |
148 | | |
149 | 246k | SIMD_4x32(P1).store_le(PT); |
150 | 246k | F1(B, C, D, E, A, PT[0]); |
151 | 246k | F1(A, B, C, D, E, PT[1]); |
152 | 246k | F1(E, A, B, C, D, PT[2]); |
153 | 246k | F1(D, E, A, B, C, PT[3]); |
154 | 246k | P1 = prep(W1, W2, W3, W0, K20_39); |
155 | | |
156 | 246k | SIMD_4x32(P2).store_le(PT); |
157 | 246k | F1(C, D, E, A, B, PT[0]); |
158 | 246k | F1(B, C, D, E, A, PT[1]); |
159 | 246k | F1(A, B, C, D, E, PT[2]); |
160 | 246k | F1(E, A, B, C, D, PT[3]); |
161 | 246k | P2 = prep(W2, W3, W0, W1, K20_39); |
162 | | |
163 | 246k | SIMD_4x32(P3).store_le(PT); |
164 | 246k | F1(D, E, A, B, C, PT[0]); |
165 | 246k | F1(C, D, E, A, B, PT[1]); |
166 | 246k | F1(B, C, D, E, A, PT[2]); |
167 | 246k | F1(A, B, C, D, E, PT[3]); |
168 | 246k | P3 = prep(W3, W0, W1, W2, K20_39); |
169 | | |
170 | 246k | SIMD_4x32(P0).store_le(PT); |
171 | 246k | F1(E, A, B, C, D, PT[0]); |
172 | 246k | F1(D, E, A, B, C, PT[1]); |
173 | 246k | F1(C, D, E, A, B, PT[2]); |
174 | 246k | F1(B, C, D, E, A, PT[3]); |
175 | 246k | P0 = prep(W0, W1, W2, W3, K20_39); |
176 | | |
177 | 246k | SIMD_4x32(P1).store_le(PT); |
178 | 246k | F2(A, B, C, D, E, PT[0]); |
179 | 246k | F2(E, A, B, C, D, PT[1]); |
180 | 246k | F2(D, E, A, B, C, PT[2]); |
181 | 246k | F2(C, D, E, A, B, PT[3]); |
182 | 246k | P1 = prep(W1, W2, W3, W0, K20_39); |
183 | | |
184 | 246k | SIMD_4x32(P2).store_le(PT); |
185 | 246k | F2(B, C, D, E, A, PT[0]); |
186 | 246k | F2(A, B, C, D, E, PT[1]); |
187 | 246k | F2(E, A, B, C, D, PT[2]); |
188 | 246k | F2(D, E, A, B, C, PT[3]); |
189 | 246k | P2 = prep(W2, W3, W0, W1, K40_59); |
190 | | |
191 | 246k | SIMD_4x32(P3).store_le(PT); |
192 | 246k | F2(C, D, E, A, B, PT[0]); |
193 | 246k | F2(B, C, D, E, A, PT[1]); |
194 | 246k | F2(A, B, C, D, E, PT[2]); |
195 | 246k | F2(E, A, B, C, D, PT[3]); |
196 | 246k | P3 = prep(W3, W0, W1, W2, K40_59); |
197 | | |
198 | 246k | SIMD_4x32(P0).store_le(PT); |
199 | 246k | F2(D, E, A, B, C, PT[0]); |
200 | 246k | F2(C, D, E, A, B, PT[1]); |
201 | 246k | F2(B, C, D, E, A, PT[2]); |
202 | 246k | F2(A, B, C, D, E, PT[3]); |
203 | 246k | P0 = prep(W0, W1, W2, W3, K40_59); |
204 | | |
205 | 246k | SIMD_4x32(P1).store_le(PT); |
206 | 246k | F2(E, A, B, C, D, PT[0]); |
207 | 246k | F2(D, E, A, B, C, PT[1]); |
208 | 246k | F2(C, D, E, A, B, PT[2]); |
209 | 246k | F2(B, C, D, E, A, PT[3]); |
210 | 246k | P1 = prep(W1, W2, W3, W0, K40_59); |
211 | | |
212 | 246k | SIMD_4x32(P2).store_le(PT); |
213 | 246k | F3(A, B, C, D, E, PT[0]); |
214 | 246k | F3(E, A, B, C, D, PT[1]); |
215 | 246k | F3(D, E, A, B, C, PT[2]); |
216 | 246k | F3(C, D, E, A, B, PT[3]); |
217 | 246k | P2 = prep(W2, W3, W0, W1, K40_59); |
218 | | |
219 | 246k | SIMD_4x32(P3).store_le(PT); |
220 | 246k | F3(B, C, D, E, A, PT[0]); |
221 | 246k | F3(A, B, C, D, E, PT[1]); |
222 | 246k | F3(E, A, B, C, D, PT[2]); |
223 | 246k | F3(D, E, A, B, C, PT[3]); |
224 | 246k | P3 = prep(W3, W0, W1, W2, K60_79); |
225 | | |
226 | 246k | SIMD_4x32(P0).store_le(PT); |
227 | 246k | F3(C, D, E, A, B, PT[0]); |
228 | 246k | F3(B, C, D, E, A, PT[1]); |
229 | 246k | F3(A, B, C, D, E, PT[2]); |
230 | 246k | F3(E, A, B, C, D, PT[3]); |
231 | 246k | P0 = prep(W0, W1, W2, W3, K60_79); |
232 | | |
233 | 246k | SIMD_4x32(P1).store_le(PT); |
234 | 246k | F3(D, E, A, B, C, PT[0]); |
235 | 246k | F3(C, D, E, A, B, PT[1]); |
236 | 246k | F3(B, C, D, E, A, PT[2]); |
237 | 246k | F3(A, B, C, D, E, PT[3]); |
238 | 246k | P1 = prep(W1, W2, W3, W0, K60_79); |
239 | | |
240 | 246k | SIMD_4x32(P2).store_le(PT); |
241 | 246k | F3(E, A, B, C, D, PT[0]); |
242 | 246k | F3(D, E, A, B, C, PT[1]); |
243 | 246k | F3(C, D, E, A, B, PT[2]); |
244 | 246k | F3(B, C, D, E, A, PT[3]); |
245 | 246k | P2 = prep(W2, W3, W0, W1, K60_79); |
246 | | |
247 | 246k | SIMD_4x32(P3).store_le(PT); |
248 | 246k | F4(A, B, C, D, E, PT[0]); |
249 | 246k | F4(E, A, B, C, D, PT[1]); |
250 | 246k | F4(D, E, A, B, C, PT[2]); |
251 | 246k | F4(C, D, E, A, B, PT[3]); |
252 | 246k | P3 = prep(W3, W0, W1, W2, K60_79); |
253 | | |
254 | 246k | SIMD_4x32(P0).store_le(PT); |
255 | 246k | F4(B, C, D, E, A, PT[0]); |
256 | 246k | F4(A, B, C, D, E, PT[1]); |
257 | 246k | F4(E, A, B, C, D, PT[2]); |
258 | 246k | F4(D, E, A, B, C, PT[3]); |
259 | | |
260 | 246k | SIMD_4x32(P1).store_le(PT); |
261 | 246k | F4(C, D, E, A, B, PT[0]); |
262 | 246k | F4(B, C, D, E, A, PT[1]); |
263 | 246k | F4(A, B, C, D, E, PT[2]); |
264 | 246k | F4(E, A, B, C, D, PT[3]); |
265 | | |
266 | 246k | SIMD_4x32(P2).store_le(PT); |
267 | 246k | F4(D, E, A, B, C, PT[0]); |
268 | 246k | F4(C, D, E, A, B, PT[1]); |
269 | 246k | F4(B, C, D, E, A, PT[2]); |
270 | 246k | F4(A, B, C, D, E, PT[3]); |
271 | | |
272 | 246k | SIMD_4x32(P3).store_le(PT); |
273 | 246k | F4(E, A, B, C, D, PT[0]); |
274 | 246k | F4(D, E, A, B, C, PT[1]); |
275 | 246k | F4(C, D, E, A, B, PT[2]); |
276 | 246k | F4(B, C, D, E, A, PT[3]); |
277 | | |
278 | 246k | A = (digest[0] += A); |
279 | 246k | B = (digest[1] += B); |
280 | 246k | C = (digest[2] += C); |
281 | 246k | D = (digest[3] += D); |
282 | 246k | E = (digest[4] += E); |
283 | 246k | } |
284 | 58.1k | } |
285 | | |
286 | | } // namespace Botan |