Coverage Report

Created: 2024-11-29 06:10

/src/botan/src/lib/hash/sha1/sha1_sse2/sha1_sse2.cpp
Line
Count
Source
1
/*
2
* SHA-1 using SSE2
3
* Based on public domain code by Dean Gaudet
4
*    (http://arctic.org/~dean/crypto/sha1.html)
5
* (C) 2009-2011,2023 Jack Lloyd
6
*
7
* Botan is released under the Simplified BSD License (see license.txt)
8
*/
9
10
#include <botan/internal/sha1.h>
11
12
#include <botan/internal/bit_ops.h>
13
#include <botan/internal/rotate.h>
14
#include <botan/internal/simd_32.h>
15
#include <botan/internal/stl_util.h>
16
#include <emmintrin.h>
17
18
namespace Botan {
19
20
namespace SHA1_SSE2_F {
21
22
namespace {
23
24
/*
25
SHA-1 message schedule expansion, four words per call. For each multiple of 4, t, we want to calculate this:
26
27
W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
28
W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
29
W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
30
W[t+3] = rol(W[t]   ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
31
32
W[t+3] needs W[t], which is produced by the same vector operation, so we'll actually calculate this:
33
34
W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
35
W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
36
W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
37
W[t+3] = rol(  0    ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
38
W[t+3] ^= rol(W[t+0], 1);
39
40
the parameters are (each XWn is a vector holding the four consecutive words starting at the given index):
41
42
W0 = &W[t-16];
43
W1 = &W[t-12];
44
W2 = &W[t- 8];
45
W3 = &W[t- 4];
46
47
and on output:
48
prepared = W0 + K (the return value: the new W0 with K added to every lane)
49
W0 = W[t]..W[t+3] (XW0 is overwritten through the reference)
50
*/
51
3.94M
BOTAN_FORCE_INLINE SIMD_4x32 prep(SIMD_4x32& XW0, SIMD_4x32 XW1, SIMD_4x32 XW2, SIMD_4x32 XW3, SIMD_4x32 K) {
52
3.94M
   SIMD_4x32 T0 = XW0;  // W[t-16]..W[t-13]
53
   /* XW3 holds W[t-4]..W[t-1]; shift by one element so the lanes line up with W[t-3]..W[t-1] (vacated lane presumably zero-filled, matching the "0" term above -- confirm against SIMD_4x32) */
54
3.94M
   SIMD_4x32 T2 = XW3.shift_elems_right<1>();
55
   /* get high 64-bits of XW0 into low 64-bits (the shuffle swaps the two 64-bit halves) */
56
3.94M
   SIMD_4x32 T1 = SIMD_4x32(_mm_shuffle_epi32(XW0.raw(), _MM_SHUFFLE(1, 0, 3, 2)));
57
   /* replace the high 64-bits of T1 with the low 64-bits of XW1, giving W[t-14]..W[t-11] */
58
3.94M
   T1 = SIMD_4x32(_mm_unpacklo_epi64(T1.raw(), XW1.raw()));
59
60
3.94M
   T0 ^= T1;  // W[t-16+i] ^ W[t-14+i]
61
3.94M
   T2 ^= XW2;  // shifted XW3 ^ W[t-8]..W[t-5]
62
3.94M
   T0 ^= T2;  // all four XOR terms combined
63
   /* unrotated W[t]..W[t+2] in T0 ... still need W[t+3] */
64
65
3.94M
   T2 = T0.shift_elems_left<3>();  // move unrotated W[t] into the W[t+3] lane (other lanes presumably zero -- confirm)
66
3.94M
   T0 = T0.rotl<1>();  // the rol(..., 1) of the formula, applied to every lane
67
3.94M
   T2 = T2.rotl<2>();  // rol(unrotated W[t], 2) == rol(W[t+0], 1)
68
69
3.94M
   T0 ^= T2; /* T0 now has W[t+3] */
70
71
3.94M
   XW0 = T0;  // hand the new schedule words back to the caller
72
3.94M
   return T0 + K;  // round constant pre-added so the scalar rounds only add one value
73
3.94M
}
74
75
/*
76
* SHA-1 F1 Function (the first 20 rounds in sse2_compress_n): E += choose(B, C, D) + msg + rotl<5>(A), then B = rotl<30>(B)
77
*/
78
4.93M
inline void F1(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {  // B and E are updated in place; the caller passes msg with the round constant already added
79
4.93M
   E += choose(B, C, D) + msg + rotl<5>(A);  // choose() is declared outside this file (presumably the SHA-1 "Ch" function -- confirm)
80
4.93M
   B = rotl<30>(B);
81
4.93M
}
82
83
/*
84
* SHA-1 F2 Function (rounds 20-39 in sse2_compress_n): E += (B ^ C ^ D) + msg + rotl<5>(A), then B = rotl<30>(B)
85
*/
86
4.93M
inline void F2(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {  // B and E are updated in place; the caller passes msg with the round constant already added
87
4.93M
   E += (B ^ C ^ D) + msg + rotl<5>(A);  // round function here is the bitwise parity of B, C, D
88
4.93M
   B = rotl<30>(B);
89
4.93M
}
90
91
/*
92
* SHA-1 F3 Function (rounds 40-59 in sse2_compress_n): E += majority(B, C, D) + msg + rotl<5>(A), then B = rotl<30>(B)
93
*/
94
4.93M
inline void F3(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {  // B and E are updated in place; the caller passes msg with the round constant already added
95
4.93M
   E += majority(B, C, D) + msg + rotl<5>(A);  // majority() is declared outside this file (presumably the SHA-1 "Maj" function -- confirm)
96
4.93M
   B = rotl<30>(B);
97
4.93M
}
98
99
/*
100
* SHA-1 F4 Function (rounds 60-79 in sse2_compress_n): E += (B ^ C ^ D) + msg + rotl<5>(A), then B = rotl<30>(B)
101
*/
102
4.93M
inline void F4(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {  // B and E are updated in place; the caller passes msg with the round constant already added
103
4.93M
   E += (B ^ C ^ D) + msg + rotl<5>(A);  // same expression as F2; only the constant folded into msg by the caller differs
104
4.93M
   B = rotl<30>(B);
105
4.93M
}
106
107
}  // namespace
108
109
}  // namespace SHA1_SSE2_F
110
111
/*
112
* SHA-1 Compression Function using SSE for message expansion: prep() expands the schedule four words at a time, the 80 rounds themselves run as scalar F1..F4 calls
113
*/
114
//static
115
58.1k
BOTAN_FUNC_ISA("sse2") void SHA_1::sse2_compress_n(digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
116
58.1k
   using namespace SHA1_SSE2_F;
117
118
58.1k
   const SIMD_4x32 K00_19 = SIMD_4x32::splat(0x5A827999);  // constant added to the schedule words of rounds 0-19
119
58.1k
   const SIMD_4x32 K20_39 = SIMD_4x32::splat(0x6ED9EBA1);  // ... rounds 20-39
120
58.1k
   const SIMD_4x32 K40_59 = SIMD_4x32::splat(0x8F1BBCDC);  // ... rounds 40-59
121
58.1k
   const SIMD_4x32 K60_79 = SIMD_4x32::splat(0xCA62C1D6);  // ... rounds 60-79
122
123
58.1k
   uint32_t A = digest[0], B = digest[1], C = digest[2], D = digest[3], E = digest[4];  // working state starts from the current chaining value
124
125
58.1k
   BufferSlicer in(input);  // input is consumed through take() below, block_bytes at a time
126
127
304k
   for(size_t i = 0; i != blocks; ++i) {  // one iteration per message block
128
246k
      uint32_t PT[4];  // scratch: four (W + K) words spilled from a SIMD vector for the scalar rounds
129
130
246k
      const auto block = in.take(block_bytes);  // block_bytes is declared outside this file; offsets 0..48 are read below
131
132
246k
      SIMD_4x32 W0 = SIMD_4x32::load_be(&block[0]);  // W[0..3]
133
246k
      SIMD_4x32 W1 = SIMD_4x32::load_be(&block[16]);  // W[4..7]
134
246k
      SIMD_4x32 W2 = SIMD_4x32::load_be(&block[32]);  // W[8..11]
135
246k
      SIMD_4x32 W3 = SIMD_4x32::load_be(&block[48]);  // W[12..15]
136
137
246k
      SIMD_4x32 P0 = W0 + K00_19;  // rounds 0-15 use the message words directly, so only K is added
138
246k
      SIMD_4x32 P1 = W1 + K00_19;
139
246k
      SIMD_4x32 P2 = W2 + K00_19;
140
246k
      SIMD_4x32 P3 = W3 + K00_19;
141
142
246k
      SIMD_4x32(P0).store_le(PT);  // rounds 0-3
143
246k
      F1(A, B, C, D, E, PT[0]);  // the roles of A..E rotate through the argument order instead of moving values between variables
144
246k
      F1(E, A, B, C, D, PT[1]);
145
246k
      F1(D, E, A, B, C, PT[2]);
146
246k
      F1(C, D, E, A, B, PT[3]);
147
246k
      P0 = prep(W0, W1, W2, W3, K00_19);  // W0 <- W[16..19]; still below round 20, hence K00_19
148
149
246k
      SIMD_4x32(P1).store_le(PT);  // rounds 4-7
150
246k
      F1(B, C, D, E, A, PT[0]);
151
246k
      F1(A, B, C, D, E, PT[1]);
152
246k
      F1(E, A, B, C, D, PT[2]);
153
246k
      F1(D, E, A, B, C, PT[3]);
154
246k
      P1 = prep(W1, W2, W3, W0, K20_39);  // W1 <- W[20..23]
155
156
246k
      SIMD_4x32(P2).store_le(PT);  // rounds 8-11
157
246k
      F1(C, D, E, A, B, PT[0]);
158
246k
      F1(B, C, D, E, A, PT[1]);
159
246k
      F1(A, B, C, D, E, PT[2]);
160
246k
      F1(E, A, B, C, D, PT[3]);
161
246k
      P2 = prep(W2, W3, W0, W1, K20_39);  // W2 <- W[24..27]
162
163
246k
      SIMD_4x32(P3).store_le(PT);  // rounds 12-15
164
246k
      F1(D, E, A, B, C, PT[0]);
165
246k
      F1(C, D, E, A, B, PT[1]);
166
246k
      F1(B, C, D, E, A, PT[2]);
167
246k
      F1(A, B, C, D, E, PT[3]);
168
246k
      P3 = prep(W3, W0, W1, W2, K20_39);  // W3 <- W[28..31]
169
170
246k
      SIMD_4x32(P0).store_le(PT);  // rounds 16-19
171
246k
      F1(E, A, B, C, D, PT[0]);
172
246k
      F1(D, E, A, B, C, PT[1]);
173
246k
      F1(C, D, E, A, B, PT[2]);
174
246k
      F1(B, C, D, E, A, PT[3]);
175
246k
      P0 = prep(W0, W1, W2, W3, K20_39);  // W0 <- W[32..35]
176
177
246k
      SIMD_4x32(P1).store_le(PT);  // rounds 20-23
178
246k
      F2(A, B, C, D, E, PT[0]);
179
246k
      F2(E, A, B, C, D, PT[1]);
180
246k
      F2(D, E, A, B, C, PT[2]);
181
246k
      F2(C, D, E, A, B, PT[3]);
182
246k
      P1 = prep(W1, W2, W3, W0, K20_39);  // W1 <- W[36..39]
183
184
246k
      SIMD_4x32(P2).store_le(PT);  // rounds 24-27
185
246k
      F2(B, C, D, E, A, PT[0]);
186
246k
      F2(A, B, C, D, E, PT[1]);
187
246k
      F2(E, A, B, C, D, PT[2]);
188
246k
      F2(D, E, A, B, C, PT[3]);
189
246k
      P2 = prep(W2, W3, W0, W1, K40_59);  // W2 <- W[40..43]
190
191
246k
      SIMD_4x32(P3).store_le(PT);  // rounds 28-31
192
246k
      F2(C, D, E, A, B, PT[0]);
193
246k
      F2(B, C, D, E, A, PT[1]);
194
246k
      F2(A, B, C, D, E, PT[2]);
195
246k
      F2(E, A, B, C, D, PT[3]);
196
246k
      P3 = prep(W3, W0, W1, W2, K40_59);  // W3 <- W[44..47]
197
198
246k
      SIMD_4x32(P0).store_le(PT);  // rounds 32-35
199
246k
      F2(D, E, A, B, C, PT[0]);
200
246k
      F2(C, D, E, A, B, PT[1]);
201
246k
      F2(B, C, D, E, A, PT[2]);
202
246k
      F2(A, B, C, D, E, PT[3]);
203
246k
      P0 = prep(W0, W1, W2, W3, K40_59);  // W0 <- W[48..51]
204
205
246k
      SIMD_4x32(P1).store_le(PT);  // rounds 36-39
206
246k
      F2(E, A, B, C, D, PT[0]);
207
246k
      F2(D, E, A, B, C, PT[1]);
208
246k
      F2(C, D, E, A, B, PT[2]);
209
246k
      F2(B, C, D, E, A, PT[3]);
210
246k
      P1 = prep(W1, W2, W3, W0, K40_59);  // W1 <- W[52..55]
211
212
246k
      SIMD_4x32(P2).store_le(PT);  // rounds 40-43
213
246k
      F3(A, B, C, D, E, PT[0]);
214
246k
      F3(E, A, B, C, D, PT[1]);
215
246k
      F3(D, E, A, B, C, PT[2]);
216
246k
      F3(C, D, E, A, B, PT[3]);
217
246k
      P2 = prep(W2, W3, W0, W1, K40_59);  // W2 <- W[56..59]
218
219
246k
      SIMD_4x32(P3).store_le(PT);  // rounds 44-47
220
246k
      F3(B, C, D, E, A, PT[0]);
221
246k
      F3(A, B, C, D, E, PT[1]);
222
246k
      F3(E, A, B, C, D, PT[2]);
223
246k
      F3(D, E, A, B, C, PT[3]);
224
246k
      P3 = prep(W3, W0, W1, W2, K60_79);  // W3 <- W[60..63]
225
226
246k
      SIMD_4x32(P0).store_le(PT);  // rounds 48-51
227
246k
      F3(C, D, E, A, B, PT[0]);
228
246k
      F3(B, C, D, E, A, PT[1]);
229
246k
      F3(A, B, C, D, E, PT[2]);
230
246k
      F3(E, A, B, C, D, PT[3]);
231
246k
      P0 = prep(W0, W1, W2, W3, K60_79);  // W0 <- W[64..67]
232
233
246k
      SIMD_4x32(P1).store_le(PT);  // rounds 52-55
234
246k
      F3(D, E, A, B, C, PT[0]);
235
246k
      F3(C, D, E, A, B, PT[1]);
236
246k
      F3(B, C, D, E, A, PT[2]);
237
246k
      F3(A, B, C, D, E, PT[3]);
238
246k
      P1 = prep(W1, W2, W3, W0, K60_79);  // W1 <- W[68..71]
239
240
246k
      SIMD_4x32(P2).store_le(PT);  // rounds 56-59
241
246k
      F3(E, A, B, C, D, PT[0]);
242
246k
      F3(D, E, A, B, C, PT[1]);
243
246k
      F3(C, D, E, A, B, PT[2]);
244
246k
      F3(B, C, D, E, A, PT[3]);
245
246k
      P2 = prep(W2, W3, W0, W1, K60_79);  // W2 <- W[72..75]
246
247
246k
      SIMD_4x32(P3).store_le(PT);  // rounds 60-63
248
246k
      F4(A, B, C, D, E, PT[0]);
249
246k
      F4(E, A, B, C, D, PT[1]);
250
246k
      F4(D, E, A, B, C, PT[2]);
251
246k
      F4(C, D, E, A, B, PT[3]);
252
246k
      P3 = prep(W3, W0, W1, W2, K60_79);  // W3 <- W[76..79]; last expansion, the remaining groups only consume
253
254
246k
      SIMD_4x32(P0).store_le(PT);  // rounds 64-67
255
246k
      F4(B, C, D, E, A, PT[0]);
256
246k
      F4(A, B, C, D, E, PT[1]);
257
246k
      F4(E, A, B, C, D, PT[2]);
258
246k
      F4(D, E, A, B, C, PT[3]);
259
260
246k
      SIMD_4x32(P1).store_le(PT);  // rounds 68-71
261
246k
      F4(C, D, E, A, B, PT[0]);
262
246k
      F4(B, C, D, E, A, PT[1]);
263
246k
      F4(A, B, C, D, E, PT[2]);
264
246k
      F4(E, A, B, C, D, PT[3]);
265
266
246k
      SIMD_4x32(P2).store_le(PT);  // rounds 72-75
267
246k
      F4(D, E, A, B, C, PT[0]);
268
246k
      F4(C, D, E, A, B, PT[1]);
269
246k
      F4(B, C, D, E, A, PT[2]);
270
246k
      F4(A, B, C, D, E, PT[3]);
271
272
246k
      SIMD_4x32(P3).store_le(PT);  // rounds 76-79
273
246k
      F4(E, A, B, C, D, PT[0]);
274
246k
      F4(D, E, A, B, C, PT[1]);
275
246k
      F4(C, D, E, A, B, PT[2]);
276
246k
      F4(B, C, D, E, A, PT[3]);
277
278
246k
      A = (digest[0] += A);  // fold the block result into the digest and reload the working state for the next block
279
246k
      B = (digest[1] += B);
280
246k
      C = (digest[2] += C);
281
246k
      D = (digest[3] += D);
282
246k
      E = (digest[4] += E);
283
246k
   }
284
58.1k
}
285
286
}  // namespace Botan