/src/cryptopp/lsh256_sse.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // lsh.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // Based on the specification and source code provided by |
3 | | // Korea Internet & Security Agency (KISA) website. Also |
4 | | // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do |
5 | | // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. |
6 | | |
7 | | // We are hitting some sort of GCC bug in the LSH AVX2 code path. |
8 | | // Clang is OK on the AVX2 code path. We believe it is GCC Issue |
9 | | // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It |
10 | | // makes using zeroupper a little tricky. |
11 | | |
12 | | #include "pch.h" |
13 | | #include "config.h" |
14 | | |
15 | | #include "lsh.h" |
16 | | #include "cpu.h" |
17 | | #include "misc.h" |
18 | | |
19 | | // Squash MS LNK4221 and libtool warnings |
20 | | extern const char LSH256_SSE_FNAME[] = __FILE__; |
21 | | |
22 | | #if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) |
23 | | |
24 | | #if defined(CRYPTOPP_SSSE3_AVAILABLE) |
25 | | # include <emmintrin.h> |
26 | | # include <tmmintrin.h> |
27 | | #endif |
28 | | |
29 | | #if defined(CRYPTOPP_XOP_AVAILABLE) |
30 | | # include <ammintrin.h> |
31 | | #endif |
32 | | |
33 | | #if defined(CRYPTOPP_GCC_COMPATIBLE) |
34 | | # include <x86intrin.h> |
35 | | #endif |
36 | | |
37 | | ANONYMOUS_NAMESPACE_BEGIN |
38 | | |
/* LSH Constants */

const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;   // one message block is 128 bytes (1024 bits)
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
// const unsigned int LSH256_CV_BYTE_LEN = 64;
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;   // largest digest, LSH-256-256

// const unsigned int MSG_BLK_WORD_LEN = 32;
const unsigned int CV_WORD_LEN = 16;      // chaining variable length, in 32-bit words
const unsigned int CONST_WORD_LEN = 8;    // step constants per step, in 32-bit words
// const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
// const unsigned int WORD_BIT_LEN = 32;
const unsigned int NUM_STEPS = 26;        // step functions per compression

// Rotation amounts used by the even and odd step functions
const unsigned int ROT_EVEN_ALPHA = 29;
const unsigned int ROT_EVEN_BETA = 1;
const unsigned int ROT_ODD_ALPHA = 5;
const unsigned int ROT_ODD_BETA = 17;

// Algorithm type codes. The low 16 bits encode the digest byte length
// (0x20 = 32 bytes, 0x1C = 28 bytes); see LSH_GET_HASHBYTE below.
const unsigned int LSH_TYPE_256_256 = 0x0000020;
const unsigned int LSH_TYPE_256_224 = 0x000001C;

// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;

/* Error Code */

const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;

/* Index into our state array */

const unsigned int AlgorithmType = 80;   // word32 slot holding the LSH type code
const unsigned int RemainingBits = 81;   // word32 slot holding the buffered bit count
76 | | |
77 | | NAMESPACE_END |
78 | | |
79 | | NAMESPACE_BEGIN(CryptoPP) |
80 | | NAMESPACE_BEGIN(LSH) |
81 | | |
// Shared tables defined in lsh256.cpp (the straight C++ code path):
// initialization vectors for LSH-256-224/256 and the per-step constants.
extern const word32 LSH256_IV224[CV_WORD_LEN];
extern const word32 LSH256_IV256[CV_WORD_LEN];
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];
86 | | |
87 | | NAMESPACE_END // LSH |
88 | | NAMESPACE_END // Crypto++ |
89 | | |
90 | | ANONYMOUS_NAMESPACE_BEGIN |
91 | | |
// Pull the Crypto++ names used below into the anonymous namespace.
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;

using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;

// Word types mirror the KISA reference implementation's naming.
typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;

using CryptoPP::LSH::LSH256_IV224;
using CryptoPP::LSH::LSH256_IV256;
using CryptoPP::LSH::LSH256_StepConstants;
111 | | |
// Overlay onto the flat word32 state array owned by LSH256_Base.
// Layout (in word32 slots): cv_l at [0,8), cv_r at [8,16), sub_msgs at
// [16,48), buffered partial block (as bytes) at [48,80), then the
// AlgorithmType and RemainingBits slots at 80 and 81.
struct LSH256_SSSE3_Context
{
	LSH256_SSSE3_Context(word32* state, word32 algType, word32& remainingBitLength) :
		cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
		last_block(reinterpret_cast<byte*>(state+48)),
		remain_databitlen(remainingBitLength),
		alg_type(static_cast<lsh_type>(algType)) {}

	lsh_u32* cv_l;  // start of our state block
	lsh_u32* cv_r;
	lsh_u32* sub_msgs;
	lsh_u8* last_block;         // 128-byte buffer for a partial message block
	lsh_u32& remain_databitlen; // bits currently buffered in last_block
	lsh_type alg_type;
};
127 | | |
// View of the four 8-word sub-message arrays within the state's
// sub_msgs region (words [16,48) of the flat state array).
struct LSH256_SSSE3_Internal
{
	LSH256_SSSE3_Internal(word32* state) :
		submsg_e_l(state+16), submsg_e_r(state+24),
		submsg_o_l(state+32), submsg_o_r(state+40) { }

	lsh_u32* submsg_e_l; /* even left sub-message */
	lsh_u32* submsg_e_r; /* even right sub-message */
	lsh_u32* submsg_o_l; /* odd left sub-message */
	lsh_u32* submsg_o_r; /* odd right sub-message */
};
139 | | |
140 | | // const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }; |
141 | | |
142 | | /* LSH AlgType Macro */ |
143 | | |
144 | 0 | inline bool LSH_IS_LSH512(lsh_uint val) { |
145 | 0 | return (val & 0xf0000) == 0; |
146 | 0 | } |
147 | | |
148 | 0 | inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { |
149 | 0 | return val >> 24; |
150 | 0 | } |
151 | | |
152 | 0 | inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { |
153 | 0 | return val & 0xffff; |
154 | 0 | } |
155 | | |
156 | 0 | inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { |
157 | 0 | return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); |
158 | 0 | } |
159 | | |
// Canonicalize a little-endian word: byte-swaps on big-endian machines,
// no-op on little-endian ones.
inline lsh_u32 loadLE32(lsh_u32 v) {
	return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
}
163 | | |
// Scalar 32-bit rotate-left wrapper (r need not be a compile-time constant).
lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
	return rotlFixed(x, r);
}
167 | | |
// Original code relied upon unaligned lsh_u32 buffer
// Copy one 128-byte message block into the four 8-word sub-message
// arrays: bytes [0,32) -> even/left, [32,64) -> even/right,
// [64,96) -> odd/left, [96,128) -> odd/right.
inline void load_msg_blk(LSH256_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);
	lsh_u32* submsg_e_l = i_state->submsg_e_l;
	lsh_u32* submsg_e_r = i_state->submsg_e_r;
	lsh_u32* submsg_o_l = i_state->submsg_o_l;
	lsh_u32* submsg_o_r = i_state->submsg_o_r;

	// Unaligned loads/stores: neither the caller's data pointer nor the
	// state array is guaranteed 16-byte aligned.
	_mm_storeu_si128(M128_CAST(submsg_e_l+0),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+0)));
	_mm_storeu_si128(M128_CAST(submsg_e_l+4),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+16)));
	_mm_storeu_si128(M128_CAST(submsg_e_r+0),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+32)));
	_mm_storeu_si128(M128_CAST(submsg_e_r+4),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+48)));
	_mm_storeu_si128(M128_CAST(submsg_o_l+0),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+64)));
	_mm_storeu_si128(M128_CAST(submsg_o_l+4),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+80)));
	_mm_storeu_si128(M128_CAST(submsg_o_r+0),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+96)));
	_mm_storeu_si128(M128_CAST(submsg_o_r+4),
		_mm_loadu_si128(CONST_M128_CAST(msgblk+112)));
}
194 | | |
// Even-step message expansion: submsg_e = perm(submsg_e) + submsg_o,
// where the word permutation within each 4-word lane is done with
// pshufd. The _MM_SHUFFLE(3,2,1,0) shuffle on the odd operand is the
// identity, retained to mirror the reference code.
inline void msg_exp_even(LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* submsg_e_l = i_state->submsg_e_l;
	lsh_u32* submsg_e_r = i_state->submsg_e_r;
	lsh_u32* submsg_o_l = i_state->submsg_o_l;
	lsh_u32* submsg_o_r = i_state->submsg_o_r;

	_mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(1,0,2,3))));

	_mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(2,1,0,3))));

	_mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(1,0,2,3))));

	_mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(2,1,0,3))));
}
228 | | |
// Odd-step message expansion: submsg_o = perm(submsg_o) + submsg_e,
// the mirror image of msg_exp_even with the roles of the even and odd
// sub-messages swapped.
inline void msg_exp_odd(LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* submsg_e_l = i_state->submsg_e_l;
	lsh_u32* submsg_e_r = i_state->submsg_e_r;
	lsh_u32* submsg_o_l = i_state->submsg_o_l;
	lsh_u32* submsg_o_r = i_state->submsg_o_r;

	_mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), _MM_SHUFFLE(1,0,2,3))));

	_mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), _MM_SHUFFLE(2,1,0,3))));

	_mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), _MM_SHUFFLE(1,0,2,3))));

	_mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi32(
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), _MM_SHUFFLE(3,2,1,0)),
		_mm_shuffle_epi32(
			_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), _MM_SHUFFLE(2,1,0,3))));
}
262 | | |
263 | | inline void load_sc(const lsh_u32** p_const_v, size_t i) |
264 | 0 | { |
265 | 0 | CRYPTOPP_ASSERT(p_const_v != NULLPTR); |
266 | |
|
267 | 0 | *p_const_v = &LSH256_StepConstants[i]; |
268 | 0 | } |
269 | | |
// XOR the even sub-messages into the chaining variables:
// cv_l ^= submsg_e_l, cv_r ^= submsg_e_r (lane-wise over 8 words each).
inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* submsg_e_l = i_state->submsg_e_l;
	lsh_u32* submsg_e_r = i_state->submsg_e_r;

	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+0)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0))));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4))));
	_mm_storeu_si128(M128_CAST(cv_r+0), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0))));
	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4))));
}
290 | | |
// XOR the odd sub-messages into the chaining variables:
// cv_l ^= submsg_o_l, cv_r ^= submsg_o_r (lane-wise over 8 words each).
inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* submsg_o_l = i_state->submsg_o_l;
	lsh_u32* submsg_o_r = i_state->submsg_o_r;

	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l))));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4))));
	_mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_r)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_o_r))));
	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
		_mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4))));
}
311 | | |
// Lane-wise 32-bit modular addition: cv_l += cv_r over 8 words.
inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8])
{
	_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
		_mm_loadu_si128(CONST_M128_CAST(cv_r))));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
		_mm_loadu_si128(CONST_M128_CAST(cv_r+4))));
}
321 | | |
// Rotate each 32-bit word of the 8-word block left by R bits.
template <unsigned int R>
inline void rotate_blk(lsh_u32 cv[8])
{
#if defined(CRYPTOPP_XOP_AVAILABLE)
	// AMD XOP has a native per-lane rotate instruction.
	_mm_storeu_si128(M128_CAST(cv),
		_mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R));
	_mm_storeu_si128(M128_CAST(cv+4),
		_mm_roti_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R));
#else
	// Emulate the rotate with shift-left OR shift-right.
	_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
		_mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), R),
		_mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv)), 32-R)));
	_mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
		_mm_slli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R),
		_mm_srli_epi32(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 32-R)));
#endif
}
339 | | |
// XOR the current 8 step constants into the left chaining variable.
inline void xor_with_const(lsh_u32* cv_l, const lsh_u32* const_v)
{
	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_l)),
		_mm_loadu_si128(CONST_M128_CAST(const_v))));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)),
		_mm_loadu_si128(CONST_M128_CAST(const_v+4))));
}
349 | | |
// Gamma rotation: rotate word i of cv_r left by g_gamma256[i] bits.
// All rotation amounts are byte multiples, so both halves are done with
// a single pshufb byte shuffle each (lane i rotated by 0/8/16/24 bits).
inline void rotate_msg_gamma(lsh_u32 cv_r[8])
{
	// g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
	_mm_storeu_si128(M128_CAST(cv_r+0),
		_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)),
			_mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0)));
	_mm_storeu_si128(M128_CAST(cv_r+4),
		_mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)),
			_mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1)));
}
360 | | |
// Word permutation: first permute words within each 4-word quarter
// (pshufd), then rotate the four 128-bit quarters among themselves:
// cv_l.lo <- cv_l.hi, cv_l.hi <- cv_r.hi, cv_r.hi <- cv_r.lo,
// cv_r.lo <- old cv_l.lo.
inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
	_mm_storeu_si128(M128_CAST(cv_r+0), _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));

	__m128i temp = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
	_mm_storeu_si128(M128_CAST(cv_l+0),
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)));
	_mm_storeu_si128(M128_CAST(cv_l+4),
		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)));
	_mm_storeu_si128(M128_CAST(cv_r+4),
		_mm_loadu_si128(CONST_M128_CAST(cv_r+0)));
	_mm_storeu_si128(M128_CAST(cv_r+0), temp);
}
381 | | |
382 | | /* -------------------------------------------------------- * |
383 | | * step function |
384 | | * -------------------------------------------------------- */ |
385 | | |
// One LSH mix step: cv_l += cv_r; cv_l <<<= Alpha; cv_l ^= constants;
// cv_r += cv_l; cv_r <<<= Beta; cv_l += cv_r; then the per-word gamma
// rotation on cv_r. Alpha/Beta differ for even and odd steps.
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
{
	add_blk(cv_l, cv_r);
	rotate_blk<Alpha>(cv_l);
	xor_with_const(cv_l, const_v);
	add_blk(cv_r, cv_l);
	rotate_blk<Beta>(cv_r);
	add_blk(cv_l, cv_r);
	rotate_msg_gamma(cv_r);
}
397 | | |
398 | | /* -------------------------------------------------------- * |
399 | | * compression function |
400 | | * -------------------------------------------------------- */ |
401 | | |
402 | | inline void compress(LSH256_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN]) |
403 | 0 | { |
404 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
405 | |
|
406 | 0 | LSH256_SSSE3_Internal s_state(ctx->cv_l); |
407 | 0 | LSH256_SSSE3_Internal* i_state = &s_state; |
408 | |
|
409 | 0 | const lsh_u32* const_v = NULL; |
410 | 0 | lsh_u32* cv_l = ctx->cv_l; |
411 | 0 | lsh_u32* cv_r = ctx->cv_r; |
412 | |
|
413 | 0 | load_msg_blk(i_state, pdMsgBlk); |
414 | |
|
415 | 0 | msg_add_even(cv_l, cv_r, i_state); |
416 | 0 | load_sc(&const_v, 0); |
417 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
418 | 0 | word_perm(cv_l, cv_r); |
419 | |
|
420 | 0 | msg_add_odd(cv_l, cv_r, i_state); |
421 | 0 | load_sc(&const_v, 8); |
422 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
423 | 0 | word_perm(cv_l, cv_r); |
424 | |
|
425 | 0 | for (size_t i = 1; i < NUM_STEPS / 2; i++) |
426 | 0 | { |
427 | 0 | msg_exp_even(i_state); |
428 | 0 | msg_add_even(cv_l, cv_r, i_state); |
429 | 0 | load_sc(&const_v, 16 * i); |
430 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
431 | 0 | word_perm(cv_l, cv_r); |
432 | |
|
433 | 0 | msg_exp_odd(i_state); |
434 | 0 | msg_add_odd(cv_l, cv_r, i_state); |
435 | 0 | load_sc(&const_v, 16 * i + 8); |
436 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
437 | 0 | word_perm(cv_l, cv_r); |
438 | 0 | } |
439 | |
|
440 | 0 | msg_exp_even(i_state); |
441 | 0 | msg_add_even(cv_l, cv_r, i_state); |
442 | 0 | } |
443 | | |
444 | | /* -------------------------------------------------------- */ |
445 | | |
// Load a 16-word IV into the two chaining-variable halves.
// NOTE(review): the sources use aligned loads (_mm_load_si128), which
// presumes the IV tables in lsh256.cpp are 16-byte aligned — confirm
// against their definitions before changing either side.
inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16])
{
	_mm_storeu_si128(M128_CAST(cv_l+ 0),
		_mm_load_si128(CONST_M128_CAST(iv+ 0)));
	_mm_storeu_si128(M128_CAST(cv_l+ 4),
		_mm_load_si128(CONST_M128_CAST(iv+ 4)));
	_mm_storeu_si128(M128_CAST(cv_r+ 0),
		_mm_load_si128(CONST_M128_CAST(iv+ 8)));
	_mm_storeu_si128(M128_CAST(cv_r+ 4),
		_mm_load_si128(CONST_M128_CAST(iv+12)));
}
457 | | |
458 | | inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8]) |
459 | 0 | { |
460 | 0 | _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128()); |
461 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128()); |
462 | 0 | _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128()); |
463 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128()); |
464 | 0 | } |
465 | | |
466 | | inline void zero_submsgs(LSH256_SSSE3_Context* ctx) |
467 | 0 | { |
468 | 0 | lsh_u32* sub_msgs = ctx->sub_msgs; |
469 | |
|
470 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 0), _mm_setzero_si128()); |
471 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 4), _mm_setzero_si128()); |
472 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 8), _mm_setzero_si128()); |
473 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+12), _mm_setzero_si128()); |
474 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+16), _mm_setzero_si128()); |
475 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+20), _mm_setzero_si128()); |
476 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+24), _mm_setzero_si128()); |
477 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+28), _mm_setzero_si128()); |
478 | 0 | } |
479 | | |
// Initialize the state for LSH-256-224: clear the sub-messages and load
// the 224-bit IV.
inline void init224(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);
	load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224);
}
487 | | |
// Initialize the state for LSH-256-256: clear the sub-messages and load
// the 256-bit IV.
inline void init256(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);
	load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256);
}
495 | | |
496 | | /* -------------------------------------------------------- */ |
497 | | |
// Finalization step: cv_l ^= cv_r, leaving the digest words in cv_l.
inline void fin(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	_mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)),
		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0))));
	_mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128(
		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)),
		_mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4))));
}
509 | | |
510 | | /* -------------------------------------------------------- */ |
511 | | |
// Copy the digest out of cv_l (valid only after fin()). The length and
// any sub-byte truncation are encoded in the algorithm type.
inline void get_hash(LSH256_SSSE3_Context* ctx, lsh_u8* pbHashVal)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(ctx->alg_type != 0);
	CRYPTOPP_ASSERT(pbHashVal != NULLPTR);

	lsh_uint alg_type = ctx->alg_type;
	lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
	lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);

	// Digest is the leading hash_val_byte_len bytes of cv_l.
	std::memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
	if (hash_val_bit_len){
		// Mask off unused trailing bits in the last byte when the digest
		// length is not a whole number of bytes.
		pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
	}
}
528 | | |
529 | | /* -------------------------------------------------------- */ |
530 | | |
531 | | lsh_err lsh256_ssse3_init(LSH256_SSSE3_Context* ctx) |
532 | 0 | { |
533 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
534 | 0 | CRYPTOPP_ASSERT(ctx->alg_type != 0); |
535 | |
|
536 | 0 | lsh_u32 alg_type = ctx->alg_type; |
537 | 0 | const lsh_u32* const_v = NULL; |
538 | 0 | ctx->remain_databitlen = 0; |
539 | |
|
540 | 0 | switch (alg_type) |
541 | 0 | { |
542 | 0 | case LSH_TYPE_256_256: |
543 | 0 | init256(ctx); |
544 | 0 | return LSH_SUCCESS; |
545 | 0 | case LSH_TYPE_256_224: |
546 | 0 | init224(ctx); |
547 | 0 | return LSH_SUCCESS; |
548 | 0 | default: |
549 | 0 | break; |
550 | 0 | } |
551 | | |
552 | 0 | lsh_u32* cv_l = ctx->cv_l; |
553 | 0 | lsh_u32* cv_r = ctx->cv_r; |
554 | |
|
555 | 0 | zero_iv(cv_l, cv_r); |
556 | 0 | cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN; |
557 | 0 | cv_l[1] = LSH_GET_HASHBIT(alg_type); |
558 | |
|
559 | 0 | for (size_t i = 0; i < NUM_STEPS / 2; i++) |
560 | 0 | { |
561 | | //Mix |
562 | 0 | load_sc(&const_v, i * 16); |
563 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
564 | 0 | word_perm(cv_l, cv_r); |
565 | |
|
566 | 0 | load_sc(&const_v, i * 16 + 8); |
567 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
568 | 0 | word_perm(cv_l, cv_r); |
569 | 0 | } |
570 | |
|
571 | 0 | return LSH_SUCCESS; |
572 | 0 | } |
573 | | |
// Absorb message data. Partial blocks are buffered in ctx->last_block;
// each full 128-byte block is compressed immediately. databitlen must
// be a multiple of 8 — this port is byte oriented, so the bit-level
// paths from the reference code (pos2, remain_msg_bit) are pinned to 0.
lsh_err lsh256_ssse3_update(LSH256_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(data != NULLPTR);
	CRYPTOPP_ASSERT(databitlen % 8 == 0);
	CRYPTOPP_ASSERT(ctx->alg_type != 0);

	if (databitlen == 0){
		return LSH_SUCCESS;
	}

	// We are byte oriented. tail bits will always be 0.
	size_t databytelen = databitlen >> 3;
	// lsh_uint pos2 = databitlen & 0x7;
	const size_t pos2 = 0;

	size_t remain_msg_byte = ctx->remain_databitlen >> 3;
	// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
	const size_t remain_msg_bit = 0;

	if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
		return LSH_ERR_INVALID_STATE;
	}
	if (remain_msg_bit > 0){
		return LSH_ERR_INVALID_DATABITLEN;
	}

	// Not enough for a full block: buffer the data and return.
	if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN)
	{
		std::memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
		ctx->remain_databitlen += (lsh_uint)databitlen;
		remain_msg_byte += (lsh_uint)databytelen;
		if (pos2){
			ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
		}
		return LSH_SUCCESS;
	}

	// Top up a previously buffered partial block and compress it.
	if (remain_msg_byte > 0){
		size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte;
		std::memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
		compress(ctx, ctx->last_block);
		data += more_byte;
		databytelen -= more_byte;
		remain_msg_byte = 0;
		ctx->remain_databitlen = 0;
	}

	// Compress full blocks directly from the caller's buffer.
	while (databytelen >= LSH256_MSG_BLK_BYTE_LEN)
	{
		// This call to compress caused some trouble.
		// The data pointer can become unaligned in the
		// previous block.
		compress(ctx, data);
		data += LSH256_MSG_BLK_BYTE_LEN;
		databytelen -= LSH256_MSG_BLK_BYTE_LEN;
	}

	// Buffer the remaining tail for the next update/final call.
	if (databytelen > 0){
		std::memcpy(ctx->last_block, data, databytelen);
		ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
	}

	// Dead on this byte-oriented path (pos2 is always 0); kept to mirror
	// the reference code.
	if (pos2){
		ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
		ctx->remain_databitlen += pos2;
	}

	return LSH_SUCCESS;
}
644 | | |
// Finalize: pad the buffered tail (0x80 then zeros), compress the last
// block, fold the chaining variables, and emit the digest.
lsh_err lsh256_ssse3_final(LSH256_SSSE3_Context* ctx, lsh_u8* hashval)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(hashval != NULLPTR);

	// We are byte oriented. tail bits will always be 0.
	size_t remain_msg_byte = ctx->remain_databitlen >> 3;
	// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
	const size_t remain_msg_bit = 0;

	if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
		return LSH_ERR_INVALID_STATE;
	}

	// Append the padding bit; the bit-granular branch is dead on this
	// byte-oriented path (remain_msg_bit is always 0).
	if (remain_msg_bit){
		ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
	}
	else{
		ctx->last_block[remain_msg_byte] = 0x80;
	}
	// Zero-fill the rest of the final block.
	std::memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);

	compress(ctx, ctx->last_block);

	fin(ctx);
	get_hash(ctx, hashval);

	return LSH_SUCCESS;
}
674 | | |
675 | | ANONYMOUS_NAMESPACE_END // Anonymous |
676 | | |
677 | | NAMESPACE_BEGIN(CryptoPP) |
678 | | |
// Reinitialize the flat word32 hash state (SSSE3 code path entry point).
extern
void LSH256_Base_Restart_SSSE3(word32* state)
{
	// Clear the buffered bit count before handing the slot to the context.
	state[RemainingBits] = 0;
	LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
	lsh_err err = lsh256_ssse3_init(&ctx);

	if (err != LSH_SUCCESS)
		throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_init failed");
}
689 | | |
// Absorb 'size' bytes of input into the flat hash state (SSSE3 path).
extern
void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size)
{
	LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
	// The underlying routine is bit-length based; bytes -> bits here.
	lsh_err err = lsh256_ssse3_update(&ctx, input, 8*size);

	if (err != LSH_SUCCESS)
		throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_update failed");
}
699 | | |
// Finalize and write the digest (SSSE3 path). The size parameter is
// ignored here: get_hash writes the full digest length encoded in the
// AlgorithmType slot — presumably the caller handles truncation and
// supplies a large-enough buffer (TODO confirm at the call site).
extern
void LSH256_Base_TruncatedFinal_SSSE3(word32* state, byte *hash, size_t)
{
	LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
	lsh_err err = lsh256_ssse3_final(&ctx, hash);

	if (err != LSH_SUCCESS)
		throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_final failed");
}
709 | | |
710 | | NAMESPACE_END |
711 | | |
712 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |