/src/cryptopp/lsh256_avx.cpp
Line | Count | Source |
1 | | // lsh.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // Based on the specification and source code provided by |
3 | | // Korea Internet & Security Agency (KISA) website. Also |
4 | | // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do |
5 | | // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. |
6 | | |
7 | | // We are hitting some sort of GCC bug in the LSH AVX2 code path; |
8 | | // Clang compiles the same code correctly. We believe it is GCC Issue |
9 | | // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It |
10 | | // makes using zeroupper a little tricky. |
11 | | |
12 | | #include "pch.h" |
13 | | #include "config.h" |
14 | | |
15 | | #include "lsh.h" |
16 | | #include "misc.h" |
17 | | |
18 | | // Squash MS LNK4221 and libtool warnings |
19 | | extern const char LSH256_AVX_FNAME[] = __FILE__; |
20 | | |
21 | | #if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) |
22 | | |
23 | | #if defined(CRYPTOPP_AVX2_AVAILABLE) |
24 | | # include <emmintrin.h> |
25 | | # include <immintrin.h> |
26 | | #endif |
27 | | |
28 | | #if defined(CRYPTOPP_GCC_COMPATIBLE) |
29 | | # include <x86intrin.h> |
30 | | #endif |
31 | | |
32 | | ANONYMOUS_NAMESPACE_BEGIN |
33 | | |
34 | | /* LSH Constants */ |
35 | | |
36 | | const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128; |
37 | | // const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024; |
38 | | // const unsigned int LSH256_CV_BYTE_LEN = 64; |
39 | | const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32; |
40 | | |
41 | | // const unsigned int MSG_BLK_WORD_LEN = 32; |
42 | | const unsigned int CV_WORD_LEN = 16; |
43 | | const unsigned int CONST_WORD_LEN = 8; |
44 | | // const unsigned int HASH_VAL_MAX_WORD_LEN = 8; |
45 | | // const unsigned int WORD_BIT_LEN = 32; |
46 | | const unsigned int NUM_STEPS = 26; |
47 | | |
48 | | const unsigned int ROT_EVEN_ALPHA = 29; |
49 | | const unsigned int ROT_EVEN_BETA = 1; |
50 | | const unsigned int ROT_ODD_ALPHA = 5; |
51 | | const unsigned int ROT_ODD_BETA = 17; |
52 | | |
53 | | const unsigned int LSH_TYPE_256_256 = 0x0000020; |
54 | | const unsigned int LSH_TYPE_256_224 = 0x000001C; |
55 | | |
56 | | // const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224; |
57 | | // const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256; |
58 | | |
59 | | /* Error Code */ |
60 | | |
61 | | const unsigned int LSH_SUCCESS = 0x0; |
62 | | // const unsigned int LSH_ERR_NULL_PTR = 0x2401; |
63 | | // const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402; |
64 | | const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; |
65 | | const unsigned int LSH_ERR_INVALID_STATE = 0x2404; |
66 | | |
67 | | /* Index into our state array */ |
68 | | |
69 | | const unsigned int AlgorithmType = 80; |
70 | | const unsigned int RemainingBits = 81; |
71 | | |
72 | | ANONYMOUS_NAMESPACE_END |
73 | | |
74 | | NAMESPACE_BEGIN(CryptoPP) |
75 | | NAMESPACE_BEGIN(LSH) |
76 | | |
77 | | // lsh256.cpp |
78 | | extern const word32 LSH256_IV224[CV_WORD_LEN]; |
79 | | extern const word32 LSH256_IV256[CV_WORD_LEN]; |
80 | | extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS]; |
81 | | |
82 | | NAMESPACE_END // LSH |
83 | | NAMESPACE_END // Crypto++ |
84 | | |
85 | | ANONYMOUS_NAMESPACE_BEGIN |
86 | | |
87 | | using CryptoPP::byte; |
88 | | using CryptoPP::word32; |
89 | | using CryptoPP::rotlFixed; |
90 | | using CryptoPP::rotlConstant; |
91 | | |
92 | | using CryptoPP::GetBlock; |
93 | | using CryptoPP::LittleEndian; |
94 | | using CryptoPP::ConditionalByteReverse; |
95 | | using CryptoPP::LITTLE_ENDIAN_ORDER; |
96 | | |
97 | | typedef byte lsh_u8; |
98 | | typedef word32 lsh_u32; |
99 | | typedef word32 lsh_uint; |
100 | | typedef word32 lsh_err; |
101 | | typedef word32 lsh_type; |
102 | | |
103 | | using CryptoPP::LSH::LSH256_IV224; |
104 | | using CryptoPP::LSH::LSH256_IV256; |
105 | | using CryptoPP::LSH::LSH256_StepConstants; |
106 | | |
107 | | struct LSH256_AVX2_Context |
108 | | { |
109 | | LSH256_AVX2_Context(word32* state, word32 algType, word32& remainingBitLength) : |
110 | | cv_l(state+0), cv_r(state+8), sub_msgs(state+16), |
111 | | last_block(reinterpret_cast<byte*>(state+48)), |
112 | | remain_databitlen(remainingBitLength), |
113 | 96.8k | alg_type(static_cast<lsh_type>(algType)) {} |
114 | | |
115 | | lsh_u32* cv_l; // start of our state block |
116 | | lsh_u32* cv_r; |
117 | | lsh_u32* sub_msgs; |
118 | | lsh_u8* last_block; |
119 | | lsh_u32& remain_databitlen; |
120 | | lsh_type alg_type; |
121 | | }; |
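 | |
 | | // State layout within the word32 array (derived from the constructor
 | | // and the indices above): words 0..7 cv_l, 8..15 cv_r, 16..47 sub_msgs,
 | | // 48..79 last_block (128 bytes), 80 AlgorithmType, 81 RemainingBits.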
122 | | |
123 | | struct LSH256_AVX2_Internal |
124 | | { |
125 | | LSH256_AVX2_Internal(word32* state) : |
126 | | submsg_e_l(state+16), submsg_e_r(state+24), |
127 | 383k | submsg_o_l(state+32), submsg_o_r(state+40) { } |
128 | | |
129 | | lsh_u32* submsg_e_l; /* even left sub-message */ |
130 | | lsh_u32* submsg_e_r; /* even right sub-message */ |
131 | | lsh_u32* submsg_o_l; /* odd left sub-message */ |
132 | | lsh_u32* submsg_o_r; /* odd right sub-message */ |
133 | | }; |
134 | | |
135 | | // Zero the upper 128 bits of all YMM registers on exit. |
136 | | // It avoids AVX state transition penalties when saving state. |
137 | | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735 |
138 | | // makes using zeroupper a little tricky. |
139 | | |
140 | | struct AVX_Cleanup |
141 | | { |
142 | 96.8k | ~AVX_Cleanup() { |
143 | 96.8k | _mm256_zeroupper(); |
144 | 96.8k | } |
145 | | }; |
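 | |
 | | // An AVX_Cleanup instance is declared at the top of each externally
 | | // reachable routine below, so _mm256_zeroupper runs on every return
 | | // path, including the early returns.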
146 | | |
147 | | // const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; |
148 | | |
149 | | /* LSH AlgType Macro */ |
150 | | |
151 | 0 | inline bool LSH_IS_LSH512(lsh_uint val) { |
152 | 0 | return (val & 0xf0000) == 0x10000; // LSH-512 alg types set 0x10000
153 | 0 | } |
154 | | |
155 | 13.8k | inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { |
156 | 13.8k | return val >> 24; |
157 | 13.8k | } |
158 | | |
159 | 13.8k | inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { |
160 | 13.8k | return val & 0xffff; |
161 | 13.8k | } |
162 | | |
163 | 0 | inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { |
164 | 0 | return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); |
165 | 0 | } |
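 | |
 | | // Worked decode of the two algorithm types above (illustrative only;
 | | // the values follow directly from the helpers):
 | | //   LSH_TYPE_256_224 = 0x000001C:
 | | //     LSH_GET_HASHBYTE       -> 0x1C = 28 bytes
 | | //     LSH_GET_SMALL_HASHBIT  -> 0 (no partial byte)
 | | //     LSH_GET_HASHBIT        -> 28*8 - 0 = 224 bits
 | | //   LSH_TYPE_256_256 = 0x0000020 decodes to 32 bytes / 256 bits.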
166 | | |
167 | 0 | inline lsh_u32 loadLE32(lsh_u32 v) { |
168 | 0 | return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); |
169 | 0 | } |
170 | | |
171 | 0 | lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) { |
172 | 0 | return rotlFixed(x, r); |
173 | 0 | } |
174 | | |
175 | | // The original code relied upon an unaligned lsh_u32 buffer |
176 | | inline void load_msg_blk(LSH256_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN]) |
177 | 383k | { |
178 | 383k | CRYPTOPP_ASSERT(i_state != NULLPTR); |
179 | | |
180 | 383k | lsh_u32* submsg_e_l = i_state->submsg_e_l; |
181 | 383k | lsh_u32* submsg_e_r = i_state->submsg_e_r; |
182 | 383k | lsh_u32* submsg_o_l = i_state->submsg_o_l; |
183 | 383k | lsh_u32* submsg_o_r = i_state->submsg_o_r; |
184 | | |
185 | 383k | _mm256_storeu_si256(M256_CAST(submsg_e_l+0), |
186 | 383k | _mm256_loadu_si256(CONST_M256_CAST(msgblk+0))); |
187 | 383k | _mm256_storeu_si256(M256_CAST(submsg_e_r+0), |
188 | 383k | _mm256_loadu_si256(CONST_M256_CAST(msgblk+32))); |
189 | 383k | _mm256_storeu_si256(M256_CAST(submsg_o_l+0), |
190 | 383k | _mm256_loadu_si256(CONST_M256_CAST(msgblk+64))); |
191 | 383k | _mm256_storeu_si256(M256_CAST(submsg_o_r+0), |
192 | 383k | _mm256_loadu_si256(CONST_M256_CAST(msgblk+96))); |
193 | 383k | } |
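 | |
 | | // The 128-byte message block is consumed as four 32-byte sub-messages,
 | | // in order: even-left, even-right, odd-left, odd-right.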
194 | | |
195 | | inline void msg_exp_even(LSH256_AVX2_Internal* i_state) |
196 | 4.98M | { |
197 | 4.98M | CRYPTOPP_ASSERT(i_state != NULLPTR); |
198 | | |
199 | 4.98M | lsh_u32* submsg_e_l = i_state->submsg_e_l; |
200 | 4.98M | lsh_u32* submsg_e_r = i_state->submsg_e_r; |
201 | 4.98M | lsh_u32* submsg_o_l = i_state->submsg_o_l; |
202 | 4.98M | lsh_u32* submsg_o_r = i_state->submsg_o_r; |
203 | | |
204 | 4.98M | const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514, |
205 | 4.98M | 0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c); |
206 | | |
207 | 4.98M | _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi32( |
208 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), |
209 | 4.98M | _mm256_shuffle_epi8( |
210 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), mask))); |
211 | 4.98M | _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi32( |
212 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), |
213 | 4.98M | _mm256_shuffle_epi8( |
214 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), mask))); |
215 | 4.98M | } |
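 | |
 | | // Note (illustrative): the byte shuffle above applies the LSH word
 | | // permutation within each 128-bit lane, so destination words {0,1,2,3}
 | | // take source words {3,2,0,1} and destination words {4,5,6,7} take
 | | // source words {7,4,5,6}; the odd sub-message is then added.
 | | // msg_exp_odd below performs the same expansion with the even and odd
 | | // sub-messages swapped.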
216 | | |
217 | | inline void msg_exp_odd(LSH256_AVX2_Internal* i_state) |
218 | 4.59M | { |
219 | 4.59M | CRYPTOPP_ASSERT(i_state != NULLPTR); |
220 | | |
221 | 4.59M | lsh_u32* submsg_e_l = i_state->submsg_e_l; |
222 | 4.59M | lsh_u32* submsg_e_r = i_state->submsg_e_r; |
223 | 4.59M | lsh_u32* submsg_o_l = i_state->submsg_o_l; |
224 | 4.59M | lsh_u32* submsg_o_r = i_state->submsg_o_r; |
225 | | |
226 | 4.59M | const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514, |
227 | 4.59M | 0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c); |
228 | | |
229 | 4.59M | _mm256_storeu_si256(M256_CAST(submsg_o_l+0), _mm256_add_epi32( |
230 | 4.59M | _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), |
231 | 4.59M | _mm256_shuffle_epi8( |
232 | 4.59M | _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), mask))); |
233 | 4.59M | _mm256_storeu_si256(M256_CAST(submsg_o_r+0), _mm256_add_epi32( |
234 | 4.59M | _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), |
235 | 4.59M | _mm256_shuffle_epi8( |
236 | 4.59M | _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), mask))); |
237 | 4.59M | } |
238 | | |
239 | | inline void load_sc(const lsh_u32** p_const_v, size_t i) |
240 | 9.96M | { |
241 | 9.96M | CRYPTOPP_ASSERT(p_const_v != NULLPTR); |
242 | | |
243 | 9.96M | *p_const_v = &LSH256_StepConstants[i]; |
244 | 9.96M | } |
245 | | |
246 | | inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state) |
247 | 5.36M | { |
248 | 5.36M | CRYPTOPP_ASSERT(i_state != NULLPTR); |
249 | | |
250 | 5.36M | lsh_u32* submsg_e_l = i_state->submsg_e_l; |
251 | 5.36M | lsh_u32* submsg_e_r = i_state->submsg_e_r; |
252 | | |
253 | 5.36M | _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_xor_si256( |
254 | 5.36M | _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), |
255 | 5.36M | _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)))); |
256 | 5.36M | _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_xor_si256( |
257 | 5.36M | _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), |
258 | 5.36M | _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)))); |
259 | 5.36M | } |
260 | | |
261 | | inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state) |
262 | 4.98M | { |
263 | 4.98M | CRYPTOPP_ASSERT(i_state != NULLPTR); |
264 | | |
265 | 4.98M | lsh_u32* submsg_o_l = i_state->submsg_o_l; |
266 | 4.98M | lsh_u32* submsg_o_r = i_state->submsg_o_r; |
267 | | |
268 | 4.98M | _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( |
269 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(cv_l)), |
270 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l)))); |
271 | 4.98M | _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256( |
272 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(cv_r)), |
273 | 4.98M | _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r)))); |
274 | 4.98M | } |
275 | | |
276 | | inline void add_blk(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) |
277 | 29.8M | { |
278 | 29.8M | _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi32( |
279 | 29.8M | _mm256_loadu_si256(CONST_M256_CAST(cv_l)), |
280 | 29.8M | _mm256_loadu_si256(CONST_M256_CAST(cv_r)))); |
281 | 29.8M | } |
282 | | |
283 | | template <unsigned int R> |
284 | | inline void rotate_blk(lsh_u32 cv[8]) |
285 | 19.9M | { |
286 | 19.9M | _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256( |
287 | 19.9M | _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R), |
288 | 19.9M | _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R))); |
289 | 19.9M | }
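 | |
 | | // AVX2 has no 32-bit rotate instruction, so rotate_blk emulates a
 | | // rotate-left by R with a left shift, a right shift by 32-R, and an OR.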
290 | | |
291 | | inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8]) |
292 | 9.96M | { |
293 | 9.96M | _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256( |
294 | 9.96M | _mm256_loadu_si256(CONST_M256_CAST(cv_l)), |
295 | 9.96M | _mm256_loadu_si256(CONST_M256_CAST(const_v)))); |
296 | 9.96M | } |
297 | | |
298 | | inline void rotate_msg_gamma(lsh_u32 cv_r[8]) |
299 | 9.96M | { |
300 | | // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; |
301 | 9.96M | _mm256_storeu_si256(M256_CAST(cv_r+0), |
302 | 9.96M | _mm256_shuffle_epi8(_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), |
303 | 9.96M | _mm256_set_epi8( |
304 | 9.96M | /* hi lane */ 15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1, |
305 | 9.96M | /* lo lane */ 12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0))); |
306 | 9.96M | } |
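 | |
 | | // Why one byte shuffle suffices (illustrative): every gamma rotation
 | | // amount { 0, 8, 16, 24, 24, 16, 8, 0 } is a multiple of 8 bits, so
 | | // each 32-bit rotation is a pure byte permutation. A scalar sketch,
 | | // assuming rotlVariable from misc.h:
 | | //
 | | //   for (size_t i = 0; i < 8; ++i)
 | | //       cv_r[i] = rotlVariable(cv_r[i], g_gamma256[i]);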
307 | | |
308 | | inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) |
309 | 9.96M | { |
310 | 9.96M | __m256i temp = _mm256_shuffle_epi32( |
311 | 9.96M | _mm256_loadu_si256(CONST_M256_CAST(cv_l)), _MM_SHUFFLE(3,1,0,2)); |
312 | 9.96M | _mm256_storeu_si256(M256_CAST(cv_r), |
313 | 9.96M | _mm256_shuffle_epi32( |
314 | 9.96M | _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(1,2,3,0))); |
315 | 9.96M | _mm256_storeu_si256(M256_CAST(cv_l), |
316 | 9.96M | _mm256_permute2x128_si256(temp, |
317 | 9.96M | _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,3,0,1))); |
318 | 9.96M | _mm256_storeu_si256(M256_CAST(cv_r), |
319 | 9.96M | _mm256_permute2x128_si256(temp, |
320 | 9.96M | _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,2,0,0))); |
321 | 9.96M | } |
322 | | |
323 | | /* -------------------------------------------------------- * |
324 | | * step function |
325 | | * -------------------------------------------------------- */ |
326 | | |
327 | | template <unsigned int Alpha, unsigned int Beta> |
328 | | inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8]) |
329 | 9.96M | { |
330 | 9.96M | add_blk(cv_l, cv_r); |
331 | 9.96M | rotate_blk<Alpha>(cv_l); |
332 | 9.96M | xor_with_const(cv_l, const_v); |
333 | 9.96M | add_blk(cv_r, cv_l); |
334 | 9.96M | rotate_blk<Beta>(cv_r); |
335 | 9.96M | add_blk(cv_l, cv_r); |
336 | 9.96M | rotate_msg_gamma(cv_r); |
337 | 9.96M | }
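 | |
 | | // One mix step, as executed above: cv_l += cv_r; cv_l = ROTL(cv_l, Alpha);
 | | // cv_l ^= step constant; cv_r += cv_l; cv_r = ROTL(cv_r, Beta);
 | | // cv_l += cv_r; and finally the per-word gamma rotation of cv_r.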
338 | | |
339 | | /* -------------------------------------------------------- * |
340 | | * compression function |
341 | | * -------------------------------------------------------- */ |
342 | | |
343 | | inline void compress(LSH256_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN]) |
344 | 383k | { |
345 | 383k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
346 | | |
347 | 383k | LSH256_AVX2_Internal s_state(ctx->cv_l); |
348 | 383k | LSH256_AVX2_Internal* i_state = &s_state; |
349 | | |
350 | 383k | const lsh_u32* const_v = NULL; |
351 | 383k | lsh_u32* cv_l = ctx->cv_l; |
352 | 383k | lsh_u32* cv_r = ctx->cv_r; |
353 | | |
354 | 383k | load_msg_blk(i_state, pdMsgBlk); |
355 | | |
356 | 383k | msg_add_even(cv_l, cv_r, i_state); |
357 | 383k | load_sc(&const_v, 0); |
358 | 383k | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
359 | 383k | word_perm(cv_l, cv_r); |
360 | | |
361 | 383k | msg_add_odd(cv_l, cv_r, i_state); |
362 | 383k | load_sc(&const_v, 8); |
363 | 383k | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
364 | 383k | word_perm(cv_l, cv_r); |
365 | | |
366 | 4.98M | for (size_t i = 1; i < NUM_STEPS / 2; i++) |
367 | 4.59M | { |
368 | 4.59M | msg_exp_even(i_state); |
369 | 4.59M | msg_add_even(cv_l, cv_r, i_state); |
370 | 4.59M | load_sc(&const_v, 16 * i); |
371 | 4.59M | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
372 | 4.59M | word_perm(cv_l, cv_r); |
373 | | |
374 | 4.59M | msg_exp_odd(i_state); |
375 | 4.59M | msg_add_odd(cv_l, cv_r, i_state); |
376 | 4.59M | load_sc(&const_v, 16 * i + 8); |
377 | 4.59M | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
378 | 4.59M | word_perm(cv_l, cv_r); |
379 | 4.59M | } |
380 | | |
381 | 383k | msg_exp_even(i_state); |
382 | 383k | msg_add_even(cv_l, cv_r, i_state); |
383 | 383k | } |
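 | |
 | | // compress() runs all NUM_STEPS = 26 step functions: steps 0 and 1 are
 | | // unrolled above the loop (they use the message block directly), the
 | | // loop performs steps 2..25 with message expansion, and the trailing
 | | // msg_exp_even/msg_add_even pair is the final message addition.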
384 | | |
385 | | /* -------------------------------------------------------- */ |
386 | | |
387 | | inline void load_iv(word32 cv_l[8], word32 cv_r[8], const word32 iv[16]) |
388 | 14.1k | { |
389 | | // The IVs are 32-byte aligned so we can use aligned loads. |
390 | 14.1k | _mm256_storeu_si256(M256_CAST(cv_l+0), |
391 | 14.1k | _mm256_load_si256(CONST_M256_CAST(iv+0))); |
392 | 14.1k | _mm256_storeu_si256(M256_CAST(cv_r+0), |
393 | 14.1k | _mm256_load_si256(CONST_M256_CAST(iv+8))); |
394 | 14.1k | } |
395 | | |
396 | | inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) |
397 | 0 | { |
398 | 0 | _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256()); |
399 | 0 | _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256()); |
400 | 0 | } |
401 | | |
402 | | inline void zero_submsgs(LSH256_AVX2_Context* ctx) |
403 | 14.1k | { |
404 | 14.1k | lsh_u32* sub_msgs = ctx->sub_msgs; |
405 | | |
406 | 14.1k | _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), _mm256_setzero_si256()); |
407 | 14.1k | _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), _mm256_setzero_si256()); |
408 | 14.1k | _mm256_storeu_si256(M256_CAST(sub_msgs+16), _mm256_setzero_si256()); |
409 | 14.1k | _mm256_storeu_si256(M256_CAST(sub_msgs+24), _mm256_setzero_si256()); |
410 | 14.1k | } |
411 | | |
412 | | inline void init224(LSH256_AVX2_Context* ctx) |
413 | 7.28k | { |
414 | 7.28k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
415 | | |
416 | 7.28k | zero_submsgs(ctx); |
417 | 7.28k | load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224); |
418 | 7.28k | } |
419 | | |
420 | | inline void init256(LSH256_AVX2_Context* ctx) |
421 | 6.90k | { |
422 | 6.90k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
423 | | |
424 | 6.90k | zero_submsgs(ctx); |
425 | 6.90k | load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256); |
426 | 6.90k | } |
427 | | |
428 | | /* -------------------------------------------------------- */ |
429 | | |
430 | | inline void fin(LSH256_AVX2_Context* ctx) |
431 | 13.8k | { |
432 | 13.8k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
433 | | |
434 | 13.8k | _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256( |
435 | 13.8k | _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)), |
436 | 13.8k | _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0)))); |
437 | 13.8k | } |
438 | | |
439 | | /* -------------------------------------------------------- */ |
440 | | |
441 | | inline void get_hash(LSH256_AVX2_Context* ctx, lsh_u8* pbHashVal) |
442 | 13.8k | { |
443 | 13.8k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
444 | 13.8k | CRYPTOPP_ASSERT(ctx->alg_type != 0); |
445 | 13.8k | CRYPTOPP_ASSERT(pbHashVal != NULLPTR); |
446 | | |
447 | 13.8k | lsh_uint alg_type = ctx->alg_type; |
448 | 13.8k | lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); |
449 | 13.8k | lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); |
450 | | |
451 | | // Multiplying by sizeof(lsh_u8) looks odd... |
452 | 13.8k | std::memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); |
453 | 13.8k | if (hash_val_bit_len){ |
454 | 0 | pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); |
455 | 0 | } |
456 | 13.8k | } |
457 | | |
458 | | /* -------------------------------------------------------- */ |
459 | | |
460 | | lsh_err lsh256_init_avx2(LSH256_AVX2_Context* ctx) |
461 | 14.1k | { |
462 | 14.1k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
463 | 14.1k | CRYPTOPP_ASSERT(ctx->alg_type != 0); |
464 | | |
465 | 14.1k | lsh_u32 alg_type = ctx->alg_type; |
466 | 14.1k | const lsh_u32* const_v = NULL; |
467 | 14.1k | ctx->remain_databitlen = 0; |
468 | | |
469 | | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. |
470 | 14.1k | AVX_Cleanup cleanup; |
471 | | |
472 | 14.1k | switch (alg_type) |
473 | 14.1k | { |
474 | 6.90k | case LSH_TYPE_256_256: |
475 | 6.90k | init256(ctx); |
476 | 6.90k | return LSH_SUCCESS; |
477 | 7.28k | case LSH_TYPE_256_224: |
478 | 7.28k | init224(ctx); |
479 | 7.28k | return LSH_SUCCESS; |
480 | 0 | default: |
481 | 0 | break; |
482 | 14.1k | } |
483 | | |
484 | 0 | lsh_u32* cv_l = ctx->cv_l; |
485 | 0 | lsh_u32* cv_r = ctx->cv_r; |
486 | |
|
487 | 0 | zero_iv(cv_l, cv_r); |
488 | 0 | cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN; |
489 | 0 | cv_l[1] = LSH_GET_HASHBIT(alg_type); |
490 | |
|
491 | 0 | for (size_t i = 0; i < NUM_STEPS / 2; i++) |
492 | 0 | { |
493 | | //Mix |
494 | 0 | load_sc(&const_v, i * 16); |
495 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
496 | 0 | word_perm(cv_l, cv_r); |
497 | |
|
498 | 0 | load_sc(&const_v, i * 16 + 8); |
499 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
500 | 0 | word_perm(cv_l, cv_r); |
501 | 0 | } |
502 | |
|
503 | 0 | return LSH_SUCCESS; |
504 | 14.1k | } |
505 | | |
506 | | lsh_err lsh256_update_avx2(LSH256_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen) |
507 | 68.7k | { |
508 | 68.7k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
509 | 68.7k | CRYPTOPP_ASSERT(data != NULLPTR); |
510 | 68.7k | CRYPTOPP_ASSERT(databitlen % 8 == 0); |
511 | 68.7k | CRYPTOPP_ASSERT(ctx->alg_type != 0); |
512 | | |
513 | | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. |
514 | 68.7k | AVX_Cleanup cleanup; |
515 | | |
516 | 68.7k | if (databitlen == 0){ |
517 | 34.8k | return LSH_SUCCESS; |
518 | 34.8k | } |
519 | | |
520 | | // We are byte oriented; tail bits will always be 0. |
521 | 33.9k | size_t databytelen = databitlen >> 3; |
522 | | // lsh_uint pos2 = databitlen & 0x7; |
523 | 33.9k | const size_t pos2 = 0; |
524 | | |
525 | 33.9k | size_t remain_msg_byte = ctx->remain_databitlen >> 3; |
526 | | // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; |
527 | 33.9k | const size_t remain_msg_bit = 0; |
528 | | |
529 | 33.9k | if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ |
530 | 0 | return LSH_ERR_INVALID_STATE; |
531 | 0 | } |
532 | 33.9k | if (remain_msg_bit > 0){ |
533 | 0 | return LSH_ERR_INVALID_DATABITLEN; |
534 | 0 | } |
535 | | |
536 | 33.9k | if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN) |
537 | 18.4k | { |
538 | 18.4k | std::memcpy(ctx->last_block + remain_msg_byte, data, databytelen); |
539 | 18.4k | ctx->remain_databitlen += (lsh_uint)databitlen; |
540 | 18.4k | remain_msg_byte += (lsh_uint)databytelen; |
541 | 18.4k | if (pos2){ |
542 | 0 | ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff); |
543 | 0 | } |
544 | 18.4k | return LSH_SUCCESS; |
545 | 18.4k | } |
546 | | |
547 | 15.4k | if (remain_msg_byte > 0){ |
548 | 2.59k | size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte; |
549 | 2.59k | std::memcpy(ctx->last_block + remain_msg_byte, data, more_byte); |
550 | 2.59k | compress(ctx, ctx->last_block); |
551 | 2.59k | data += more_byte; |
552 | 2.59k | databytelen -= more_byte; |
553 | 2.59k | remain_msg_byte = 0; |
554 | 2.59k | ctx->remain_databitlen = 0; |
555 | 2.59k | } |
556 | | |
557 | 382k | while (databytelen >= LSH256_MSG_BLK_BYTE_LEN) |
558 | 366k | { |
559 | | // This call to compress() caused some trouble: |
560 | | // the partial-block handling above can leave |
561 | | // the data pointer unaligned. |
562 | 366k | compress(ctx, data); |
563 | 366k | data += LSH256_MSG_BLK_BYTE_LEN; |
564 | 366k | databytelen -= LSH256_MSG_BLK_BYTE_LEN; |
565 | 366k | } |
566 | | |
567 | 15.4k | if (databytelen > 0){ |
568 | 3.64k | std::memcpy(ctx->last_block, data, databytelen); |
569 | 3.64k | ctx->remain_databitlen = (lsh_uint)(databytelen << 3); |
570 | 3.64k | } |
571 | | |
572 | 15.4k | if (pos2){ |
573 | 0 | ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff); |
574 | 0 | ctx->remain_databitlen += pos2; |
575 | 0 | } |
576 | | |
577 | 15.4k | return LSH_SUCCESS; |
578 | 33.9k | } |
579 | | |
580 | | lsh_err lsh256_final_avx2(LSH256_AVX2_Context* ctx, lsh_u8* hashval) |
581 | 13.8k | { |
582 | 13.8k | CRYPTOPP_ASSERT(ctx != NULLPTR); |
583 | 13.8k | CRYPTOPP_ASSERT(hashval != NULLPTR); |
584 | | |
585 | | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. |
586 | 13.8k | AVX_Cleanup cleanup; |
587 | | |
588 | | // We are byte oriented; tail bits will always be 0. |
589 | 13.8k | size_t remain_msg_byte = ctx->remain_databitlen >> 3; |
590 | | // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; |
591 | 13.8k | const size_t remain_msg_bit = 0; |
592 | | |
593 | 13.8k | if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){ |
594 | 0 | return LSH_ERR_INVALID_STATE; |
595 | 0 | } |
596 | | |
597 | 13.8k | if (remain_msg_bit){ |
598 | 0 | ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit)); |
599 | 0 | } |
600 | 13.8k | else{ |
601 | 13.8k | ctx->last_block[remain_msg_byte] = 0x80; |
602 | 13.8k | } |
603 | 13.8k | std::memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1); |
604 | | |
605 | 13.8k | compress(ctx, ctx->last_block); |
606 | | |
607 | 13.8k | fin(ctx); |
608 | 13.8k | get_hash(ctx, hashval); |
609 | | |
610 | 13.8k | return LSH_SUCCESS; |
611 | 13.8k | } |
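 | |
 | | // Finalization pads the last block with 0x80 followed by zeros,
 | | // compresses it, folds cv_r into cv_l (fin), and truncates the
 | | // result to the requested digest length (get_hash).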
612 | | |
613 | | ANONYMOUS_NAMESPACE_END |
614 | | |
615 | | NAMESPACE_BEGIN(CryptoPP) |
616 | | |
617 | | extern |
618 | | void LSH256_Base_Restart_AVX2(word32* state) |
619 | 14.1k | { |
620 | 14.1k | state[RemainingBits] = 0; |
621 | 14.1k | LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]); |
622 | 14.1k | lsh_err err = lsh256_init_avx2(&ctx); |
623 | | |
624 | 14.1k | if (err != LSH_SUCCESS) |
625 | 0 | throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_init_avx2 failed"); |
626 | 14.1k | } |
627 | | |
628 | | extern |
629 | | void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size) |
630 | 68.7k | { |
631 | 68.7k | LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]); |
632 | 68.7k | lsh_err err = lsh256_update_avx2(&ctx, input, 8*size); |
633 | | |
634 | 68.7k | if (err != LSH_SUCCESS) |
635 | 0 | throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_update_avx2 failed"); |
636 | 68.7k | } |
637 | | |
638 | | extern |
639 | | void LSH256_Base_TruncatedFinal_AVX2(word32* state, byte *hash, size_t) |
640 | 13.8k | { |
641 | 13.8k | LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]); |
642 | 13.8k | lsh_err err = lsh256_final_avx2(&ctx, hash); |
643 | | |
644 | 13.8k | if (err != LSH_SUCCESS) |
645 | 0 | throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final_avx2 failed"); |
646 | 13.8k | } |
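 | |
 | | // How these entry points are typically reached (a sketch, assuming the
 | | // standard HashTransformation interface of the LSH256/LSH224 classes
 | | // declared in lsh.h; illustrative only):
 | | //
 | | //   CryptoPP::LSH256 hash;
 | | //   hash.Update(input, inputLen);  // -> LSH256_Base_Update_AVX2
 | | //   CryptoPP::byte digest[32];
 | | //   hash.Final(digest);            // -> LSH256_Base_TruncatedFinal_AVX2
 | | //   hash.Restart();                // -> LSH256_Base_Restart_AVX2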
647 | | |
648 | | NAMESPACE_END |
649 | | |
650 | | #endif // CRYPTOPP_AVX2_AVAILABLE && CRYPTOPP_ENABLE_64BIT_SSE |