/src/cryptopp/lsh512_sse.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // lsh.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // Based on the specification and source code provided by |
3 | | // Korea Internet & Security Agency (KISA) website. Also |
4 | | // see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do |
5 | | // and https://seed.kisa.or.kr/kisa/Board/22/detailView.do. |
6 | | |
7 | | // We are hitting some sort of GCC bug in the LSH AVX2 code path. |
8 | | // Clang is OK on the AVX2 code path. We believe it is GCC Issue |
9 | | // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It |
10 | | // makes using zeroupper a little tricky. |
11 | | |
12 | | #include "pch.h" |
13 | | #include "config.h" |
14 | | |
15 | | #include "lsh.h" |
16 | | #include "misc.h" |
17 | | |
18 | | // Squash MS LNK4221 and libtool warnings |
19 | | extern const char LSH512_SSE_FNAME[] = __FILE__; |
20 | | |
21 | | #if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE) |
22 | | |
23 | | #if defined(CRYPTOPP_SSSE3_AVAILABLE) |
24 | | # include <emmintrin.h> |
25 | | # include <tmmintrin.h> |
26 | | #endif |
27 | | |
28 | | #if defined(CRYPTOPP_XOP_AVAILABLE) |
29 | | # include <ammintrin.h> |
30 | | #endif |
31 | | |
32 | | #if defined(CRYPTOPP_GCC_COMPATIBLE) |
33 | | # include <x86intrin.h> |
34 | | #endif |
35 | | |
36 | | ANONYMOUS_NAMESPACE_BEGIN |
37 | | |
38 | | /* LSH Constants */ |
39 | | |
40 | | const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256; |
41 | | // const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048; |
42 | | // const unsigned int LSH512_CV_BYTE_LEN = 128; |
43 | | const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64; |
44 | | |
45 | | // const unsigned int MSG_BLK_WORD_LEN = 32; |
46 | | const unsigned int CV_WORD_LEN = 16; |
47 | | const unsigned int CONST_WORD_LEN = 8; |
48 | | // const unsigned int HASH_VAL_MAX_WORD_LEN = 8; |
49 | | const unsigned int NUM_STEPS = 28; |
50 | | |
51 | | const unsigned int ROT_EVEN_ALPHA = 23; |
52 | | const unsigned int ROT_EVEN_BETA = 59; |
53 | | const unsigned int ROT_ODD_ALPHA = 7; |
54 | | const unsigned int ROT_ODD_BETA = 3; |
55 | | |
56 | | const unsigned int LSH_TYPE_512_512 = 0x0010040; |
57 | | const unsigned int LSH_TYPE_512_384 = 0x0010030; |
58 | | const unsigned int LSH_TYPE_512_256 = 0x0010020; |
59 | | const unsigned int LSH_TYPE_512_224 = 0x001001C; |
60 | | |
61 | | // const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384; |
62 | | // const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512; |
63 | | |
64 | | /* Error Code */ |
65 | | |
66 | | const unsigned int LSH_SUCCESS = 0x0; |
67 | | // const unsigned int LSH_ERR_NULL_PTR = 0x2401; |
68 | | // const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402; |
69 | | const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403; |
70 | | const unsigned int LSH_ERR_INVALID_STATE = 0x2404; |
71 | | |
72 | | /* Index into our state array */ |
73 | | |
74 | | const unsigned int AlgorithmType = 80; |
75 | | const unsigned int RemainingBits = 81; |
76 | | |
77 | | NAMESPACE_END |
78 | | |
79 | | NAMESPACE_BEGIN(CryptoPP) |
80 | | NAMESPACE_BEGIN(LSH) |
81 | | |
82 | | // lsh512.cpp |
83 | | extern const word64 LSH512_IV224[CV_WORD_LEN]; |
84 | | extern const word64 LSH512_IV256[CV_WORD_LEN]; |
85 | | extern const word64 LSH512_IV384[CV_WORD_LEN]; |
86 | | extern const word64 LSH512_IV512[CV_WORD_LEN]; |
87 | | extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS]; |
88 | | |
89 | | NAMESPACE_END // LSH |
90 | | NAMESPACE_END // Crypto++ |
91 | | |
92 | | ANONYMOUS_NAMESPACE_BEGIN |
93 | | |
94 | | using CryptoPP::byte; |
95 | | using CryptoPP::word32; |
96 | | using CryptoPP::word64; |
97 | | using CryptoPP::rotlFixed; |
98 | | using CryptoPP::rotlConstant; |
99 | | |
100 | | using CryptoPP::GetBlock; |
101 | | using CryptoPP::LittleEndian; |
102 | | using CryptoPP::ConditionalByteReverse; |
103 | | using CryptoPP::LITTLE_ENDIAN_ORDER; |
104 | | |
105 | | using CryptoPP::LSH::LSH512_IV224; |
106 | | using CryptoPP::LSH::LSH512_IV256; |
107 | | using CryptoPP::LSH::LSH512_IV384; |
108 | | using CryptoPP::LSH::LSH512_IV512; |
109 | | using CryptoPP::LSH::LSH512_StepConstants; |
110 | | |
111 | | typedef byte lsh_u8; |
112 | | typedef word32 lsh_u32; |
113 | | typedef word64 lsh_u64; |
114 | | typedef word32 lsh_uint; |
115 | | typedef word32 lsh_err; |
116 | | typedef word32 lsh_type; |
117 | | |
118 | | struct LSH512_SSSE3_Context |
119 | | { |
120 | | LSH512_SSSE3_Context(word64* state, word64 algType, word64& remainingBitLength) : |
121 | | cv_l(state+0), cv_r(state+8), sub_msgs(state+16), |
122 | | last_block(reinterpret_cast<byte*>(state+48)), |
123 | | remain_databitlen(remainingBitLength), |
124 | 0 | alg_type(static_cast<lsh_type>(algType)) {} |
125 | | |
126 | | lsh_u64* cv_l; // start of our state block |
127 | | lsh_u64* cv_r; |
128 | | lsh_u64* sub_msgs; |
129 | | lsh_u8* last_block; |
130 | | lsh_u64& remain_databitlen; |
131 | | lsh_type alg_type; |
132 | | }; |
133 | | |
134 | | struct LSH512_SSSE3_Internal |
135 | | { |
136 | | LSH512_SSSE3_Internal(word64* state) : |
137 | | submsg_e_l(state+16), submsg_e_r(state+24), |
138 | 0 | submsg_o_l(state+32), submsg_o_r(state+40) { } |
139 | | |
140 | | lsh_u64* submsg_e_l; /* even left sub-message */ |
141 | | lsh_u64* submsg_e_r; /* even right sub-message */ |
142 | | lsh_u64* submsg_o_l; /* odd left sub-message */ |
143 | | lsh_u64* submsg_o_r; /* odd right sub-message */ |
144 | | }; |
145 | | |
146 | | // const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; |
147 | | |
148 | | /* LSH AlgType Macro */ |
149 | | |
150 | 0 | inline bool LSH_IS_LSH512(lsh_uint val) { |
151 | 0 | return (val & 0xf0000) == 0x10000; |
152 | 0 | } |
153 | | |
154 | 0 | inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) { |
155 | 0 | return val >> 24; |
156 | 0 | } |
157 | | |
158 | 0 | inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) { |
159 | 0 | return val & 0xffff; |
160 | 0 | } |
161 | | |
162 | 0 | inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) { |
163 | 0 | return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val); |
164 | 0 | } |
165 | | |
166 | 0 | inline lsh_u64 loadLE64(lsh_u64 v) { |
167 | 0 | return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v); |
168 | 0 | } |
169 | | |
170 | 0 | lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) { |
171 | 0 | return rotlFixed(x, r); |
172 | 0 | } |
173 | | |
174 | | // Original code relied upon unaligned lsh_u64 buffer |
175 | | inline void load_msg_blk(LSH512_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN]) |
176 | 0 | { |
177 | 0 | lsh_u64* submsg_e_l = i_state->submsg_e_l; |
178 | 0 | lsh_u64* submsg_e_r = i_state->submsg_e_r; |
179 | 0 | lsh_u64* submsg_o_l = i_state->submsg_o_l; |
180 | 0 | lsh_u64* submsg_o_r = i_state->submsg_o_r; |
181 | |
|
182 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+0), |
183 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+0))); |
184 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+2), |
185 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+16))); |
186 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+4), |
187 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+32))); |
188 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+6), |
189 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+48))); |
190 | |
|
191 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+0), |
192 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+64))); |
193 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+2), |
194 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+80))); |
195 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+4), |
196 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+96))); |
197 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+6), |
198 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+112))); |
199 | |
|
200 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+0), |
201 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+128))); |
202 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+2), |
203 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+144))); |
204 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+4), |
205 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+160))); |
206 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+6), |
207 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+176))); |
208 | |
|
209 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+0), |
210 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+192))); |
211 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+2), |
212 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+208))); |
213 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+4), |
214 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+224))); |
215 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+6), |
216 | 0 | _mm_loadu_si128(CONST_M128_CAST(msgblk+240))); |
217 | 0 | } |
218 | | |
219 | | inline void msg_exp_even(LSH512_SSSE3_Internal* i_state) |
220 | 0 | { |
221 | 0 | CRYPTOPP_ASSERT(i_state != NULLPTR); |
222 | |
|
223 | 0 | lsh_u64* submsg_e_l = i_state->submsg_e_l; |
224 | 0 | lsh_u64* submsg_e_r = i_state->submsg_e_r; |
225 | 0 | lsh_u64* submsg_o_l = i_state->submsg_o_l; |
226 | 0 | lsh_u64* submsg_o_r = i_state->submsg_o_r; |
227 | |
|
228 | 0 | __m128i temp; |
229 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_shuffle_epi32( |
230 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), _MM_SHUFFLE(1,0,3,2))); |
231 | |
|
232 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)); |
233 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+0), |
234 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2))); |
235 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+2), temp); |
236 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_shuffle_epi32( |
237 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), _MM_SHUFFLE(1,0,3,2))); |
238 | |
|
239 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)); |
240 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_unpacklo_epi64( |
241 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), |
242 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); |
243 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_unpackhi_epi64( |
244 | 0 | temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); |
245 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_shuffle_epi32( |
246 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), _MM_SHUFFLE(1,0,3,2))); |
247 | |
|
248 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)); |
249 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+0), |
250 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2))); |
251 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+2), temp); |
252 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_shuffle_epi32( |
253 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), _MM_SHUFFLE(1,0,3,2))); |
254 | |
|
255 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)); |
256 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_unpacklo_epi64( |
257 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), |
258 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); |
259 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_unpackhi_epi64( |
260 | 0 | temp, _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); |
261 | |
|
262 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+0), _mm_add_epi64( |
263 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)), |
264 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)))); |
265 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+2), _mm_add_epi64( |
266 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), |
267 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)))); |
268 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+4), _mm_add_epi64( |
269 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)), |
270 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); |
271 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_l+6), _mm_add_epi64( |
272 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), |
273 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); |
274 | |
|
275 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+0), _mm_add_epi64( |
276 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)), |
277 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)))); |
278 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+2), _mm_add_epi64( |
279 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), |
280 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)))); |
281 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+4), _mm_add_epi64( |
282 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)), |
283 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); |
284 | 0 | _mm_storeu_si128(M128_CAST(submsg_e_r+6), _mm_add_epi64( |
285 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), |
286 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); |
287 | 0 | } |
288 | | |
289 | | inline void msg_exp_odd(LSH512_SSSE3_Internal* i_state) |
290 | 0 | { |
291 | 0 | CRYPTOPP_ASSERT(i_state != NULLPTR); |
292 | |
|
293 | 0 | lsh_u64* submsg_e_l = i_state->submsg_e_l; |
294 | 0 | lsh_u64* submsg_e_r = i_state->submsg_e_r; |
295 | 0 | lsh_u64* submsg_o_l = i_state->submsg_o_l; |
296 | 0 | lsh_u64* submsg_o_r = i_state->submsg_o_r; |
297 | |
|
298 | 0 | __m128i temp; |
299 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_shuffle_epi32( |
300 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)), _MM_SHUFFLE(1,0,3,2))); |
301 | |
|
302 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)); |
303 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+0), |
304 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2))); |
305 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+2), temp); |
306 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_shuffle_epi32( |
307 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), _MM_SHUFFLE(1,0,3,2))); |
308 | |
|
309 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)); |
310 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_unpacklo_epi64( |
311 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)), |
312 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); |
313 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_unpackhi_epi64( |
314 | 0 | temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); |
315 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_shuffle_epi32( |
316 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)), _MM_SHUFFLE(1,0,3,2))); |
317 | |
|
318 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)); |
319 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+0), |
320 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2))); |
321 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+2), temp); |
322 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_shuffle_epi32( |
323 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), _MM_SHUFFLE(1,0,3,2))); |
324 | |
|
325 | 0 | temp = _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)); |
326 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_unpacklo_epi64( |
327 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)), |
328 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); |
329 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_unpackhi_epi64( |
330 | 0 | temp, _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); |
331 | |
|
332 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+0), _mm_add_epi64( |
333 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+0)), |
334 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+0)))); |
335 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+2), _mm_add_epi64( |
336 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)), |
337 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)))); |
338 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+4), _mm_add_epi64( |
339 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)), |
340 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); |
341 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_l+6), _mm_add_epi64( |
342 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)), |
343 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); |
344 | |
|
345 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+0), _mm_add_epi64( |
346 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+0)), |
347 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+0)))); |
348 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+2), _mm_add_epi64( |
349 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)), |
350 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)))); |
351 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+4), _mm_add_epi64( |
352 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)), |
353 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); |
354 | 0 | _mm_storeu_si128(M128_CAST(submsg_o_r+6), _mm_add_epi64( |
355 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)), |
356 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); |
357 | 0 | } |
358 | | |
359 | | inline void load_sc(const lsh_u64** p_const_v, size_t i) |
360 | 0 | { |
361 | 0 | *p_const_v = &LSH512_StepConstants[i]; |
362 | 0 | } |
363 | | |
364 | | inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state) |
365 | 0 | { |
366 | 0 | CRYPTOPP_ASSERT(i_state != NULLPTR); |
367 | |
|
368 | 0 | lsh_u64* submsg_e_l = i_state->submsg_e_l; |
369 | 0 | lsh_u64* submsg_e_r = i_state->submsg_e_r; |
370 | |
|
371 | 0 | _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( |
372 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l)), |
373 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l)))); |
374 | 0 | _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( |
375 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r)), |
376 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r)))); |
377 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( |
378 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), |
379 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+2)))); |
380 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128( |
381 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), |
382 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+2)))); |
383 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( |
384 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), |
385 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+4)))); |
386 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( |
387 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), |
388 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+4)))); |
389 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( |
390 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), |
391 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_l+6)))); |
392 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128( |
393 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), |
394 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_e_r+6)))); |
395 | 0 | } |
396 | | |
397 | | inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_SSSE3_Internal* i_state) |
398 | 0 | { |
399 | 0 | CRYPTOPP_ASSERT(i_state != NULLPTR); |
400 | |
|
401 | 0 | lsh_u64* submsg_o_l = i_state->submsg_o_l; |
402 | 0 | lsh_u64* submsg_o_r = i_state->submsg_o_r; |
403 | |
|
404 | 0 | _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( |
405 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l)), |
406 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l)))); |
407 | 0 | _mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128( |
408 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r)), |
409 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r)))); |
410 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( |
411 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), |
412 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+2)))); |
413 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), _mm_xor_si128( |
414 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), |
415 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+2)))); |
416 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( |
417 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), |
418 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+4)))); |
419 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128( |
420 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), |
421 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+4)))); |
422 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( |
423 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), |
424 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_l+6)))); |
425 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), _mm_xor_si128( |
426 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), |
427 | 0 | _mm_loadu_si128(CONST_M128_CAST(submsg_o_r+6)))); |
428 | 0 | } |
429 | | |
430 | | inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) |
431 | 0 | { |
432 | 0 | _mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi64( |
433 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l)), |
434 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r)))); |
435 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), _mm_add_epi64( |
436 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), |
437 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2)))); |
438 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi64( |
439 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), |
440 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+4)))); |
441 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), _mm_add_epi64( |
442 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), |
443 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6)))); |
444 | 0 | } |
445 | | |
446 | | template <unsigned int R> |
447 | | inline void rotate_blk(lsh_u64 cv[8]) |
448 | 0 | { |
449 | | #if defined(CRYPTOPP_XOP_AVAILABLE) |
450 | | _mm_storeu_si128(M128_CAST(cv), |
451 | | _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R)); |
452 | | _mm_storeu_si128(M128_CAST(cv+2), |
453 | | _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R)); |
454 | | _mm_storeu_si128(M128_CAST(cv+4), |
455 | | _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R)); |
456 | | _mm_storeu_si128(M128_CAST(cv+6), |
457 | | _mm_roti_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R)); |
458 | | |
459 | | #else |
460 | 0 | _mm_storeu_si128(M128_CAST(cv), _mm_or_si128( |
461 | 0 | _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), R), |
462 | 0 | _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv)), 64-R))); |
463 | 0 | _mm_storeu_si128(M128_CAST(cv+2), _mm_or_si128( |
464 | 0 | _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), R), |
465 | 0 | _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+2)), 64-R))); |
466 | 0 | _mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128( |
467 | 0 | _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), R), |
468 | 0 | _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+4)), 64-R))); |
469 | 0 | _mm_storeu_si128(M128_CAST(cv+6), _mm_or_si128( |
470 | 0 | _mm_slli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), R), |
471 | 0 | _mm_srli_epi64(_mm_loadu_si128(CONST_M128_CAST(cv+6)), 64-R))); |
472 | 0 | #endif |
473 | 0 | } Unexecuted instantiation: lsh512_sse.cpp:void (anonymous namespace)::rotate_blk<23u>(unsigned long*) Unexecuted instantiation: lsh512_sse.cpp:void (anonymous namespace)::rotate_blk<59u>(unsigned long*) Unexecuted instantiation: lsh512_sse.cpp:void (anonymous namespace)::rotate_blk<7u>(unsigned long*) Unexecuted instantiation: lsh512_sse.cpp:void (anonymous namespace)::rotate_blk<3u>(unsigned long*) |
474 | | |
475 | | inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8]) |
476 | 0 | { |
477 | 0 | _mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128( |
478 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l)), |
479 | 0 | _mm_loadu_si128(CONST_M128_CAST(const_v)))); |
480 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), _mm_xor_si128( |
481 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), |
482 | 0 | _mm_loadu_si128(CONST_M128_CAST(const_v+2)))); |
483 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128( |
484 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+4)), |
485 | 0 | _mm_loadu_si128(CONST_M128_CAST(const_v+4)))); |
486 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), _mm_xor_si128( |
487 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), |
488 | 0 | _mm_loadu_si128(CONST_M128_CAST(const_v+6)))); |
489 | 0 | } |
490 | | |
491 | | inline void rotate_msg_gamma(lsh_u64 cv_r[8]) |
492 | 0 | { |
493 | | // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 }; |
494 | 0 | _mm_storeu_si128(M128_CAST(cv_r+0), |
495 | 0 | _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), |
496 | 0 | _mm_set_epi8(13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0))); |
497 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), |
498 | 0 | _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+2)), |
499 | 0 | _mm_set_epi8(9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4))); |
500 | |
|
501 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), |
502 | 0 | _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), |
503 | 0 | _mm_set_epi8(12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7))); |
504 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), |
505 | 0 | _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(cv_r+6)), |
506 | 0 | _mm_set_epi8(8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3))); |
507 | 0 | } |
508 | | |
509 | | inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) |
510 | 0 | { |
511 | 0 | __m128i temp[2]; |
512 | 0 | temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0)); |
513 | 0 | _mm_storeu_si128(M128_CAST(cv_l+0), _mm_unpacklo_epi64( |
514 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+2)), |
515 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+0)))); |
516 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), _mm_unpackhi_epi64( |
517 | 0 | temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+2)))); |
518 | |
|
519 | 0 | temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+4)); |
520 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_unpacklo_epi64( |
521 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+6)), |
522 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+4)))); |
523 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), _mm_unpackhi_epi64( |
524 | 0 | temp[0], _mm_loadu_si128(CONST_M128_CAST(cv_l+6)))); |
525 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), _mm_shuffle_epi32( |
526 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), _MM_SHUFFLE(1,0,3,2))); |
527 | |
|
528 | 0 | temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+0)); |
529 | 0 | _mm_storeu_si128(M128_CAST(cv_r+0), _mm_unpacklo_epi64( |
530 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+0)), |
531 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2)))); |
532 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), _mm_unpackhi_epi64( |
533 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2)), temp[0])); |
534 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), _mm_shuffle_epi32( |
535 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), _MM_SHUFFLE(1,0,3,2))); |
536 | |
|
537 | 0 | temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_r+4)); |
538 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), _mm_unpacklo_epi64( |
539 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+4)), |
540 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6)))); |
541 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), _mm_unpackhi_epi64( |
542 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6)), temp[0])); |
543 | |
|
544 | 0 | temp[0] = _mm_loadu_si128(CONST_M128_CAST(cv_l+0)); |
545 | 0 | temp[1] = _mm_loadu_si128(CONST_M128_CAST(cv_l+2)); |
546 | |
|
547 | 0 | _mm_storeu_si128(M128_CAST(cv_l+0), |
548 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+4))); |
549 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), |
550 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_l+6))); |
551 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), |
552 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+4))); |
553 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), |
554 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+6))); |
555 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), |
556 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+0))); |
557 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), |
558 | 0 | _mm_loadu_si128(CONST_M128_CAST(cv_r+2))); |
559 | |
|
560 | 0 | _mm_storeu_si128(M128_CAST(cv_r+0), temp[0]); |
561 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), temp[1]); |
562 | 0 | } |
563 | | |
564 | | /* -------------------------------------------------------- * |
565 | | * step function |
566 | | * -------------------------------------------------------- */ |
567 | | |
568 | | template <unsigned int Alpha, unsigned int Beta> |
569 | | inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8]) |
570 | 0 | { |
571 | 0 | add_blk(cv_l, cv_r); |
572 | 0 | rotate_blk<Alpha>(cv_l); |
573 | 0 | xor_with_const(cv_l, const_v); |
574 | 0 | add_blk(cv_r, cv_l); |
575 | 0 | rotate_blk<Beta>(cv_r); |
576 | 0 | add_blk(cv_l, cv_r); |
577 | 0 | rotate_msg_gamma(cv_r); |
578 | 0 | } Unexecuted instantiation: lsh512_sse.cpp:void (anonymous namespace)::mix<23u, 59u>(unsigned long*, unsigned long*, unsigned long const*) Unexecuted instantiation: lsh512_sse.cpp:void (anonymous namespace)::mix<7u, 3u>(unsigned long*, unsigned long*, unsigned long const*) |
579 | | |
580 | | /* -------------------------------------------------------- * |
581 | | * compression function |
582 | | * -------------------------------------------------------- */ |
583 | | |
584 | | inline void compress(LSH512_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN]) |
585 | 0 | { |
586 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
587 | |
|
588 | 0 | LSH512_SSSE3_Internal s_state(ctx->cv_l); |
589 | 0 | LSH512_SSSE3_Internal* i_state = &s_state; |
590 | |
|
591 | 0 | const lsh_u64* const_v = NULL; |
592 | 0 | lsh_u64 *cv_l = ctx->cv_l; |
593 | 0 | lsh_u64 *cv_r = ctx->cv_r; |
594 | |
|
595 | 0 | load_msg_blk(i_state, pdMsgBlk); |
596 | |
|
597 | 0 | msg_add_even(cv_l, cv_r, i_state); |
598 | 0 | load_sc(&const_v, 0); |
599 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
600 | 0 | word_perm(cv_l, cv_r); |
601 | |
|
602 | 0 | msg_add_odd(cv_l, cv_r, i_state); |
603 | 0 | load_sc(&const_v, 8); |
604 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
605 | 0 | word_perm(cv_l, cv_r); |
606 | |
|
607 | 0 | for (size_t i = 1; i < NUM_STEPS / 2; i++) |
608 | 0 | { |
609 | 0 | msg_exp_even(i_state); |
610 | 0 | msg_add_even(cv_l, cv_r, i_state); |
611 | 0 | load_sc(&const_v, 16 * i); |
612 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
613 | 0 | word_perm(cv_l, cv_r); |
614 | |
|
615 | 0 | msg_exp_odd(i_state); |
616 | 0 | msg_add_odd(cv_l, cv_r, i_state); |
617 | 0 | load_sc(&const_v, 16 * i + 8); |
618 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
619 | 0 | word_perm(cv_l, cv_r); |
620 | 0 | } |
621 | |
|
622 | 0 | msg_exp_even(i_state); |
623 | 0 | msg_add_even(cv_l, cv_r, i_state); |
624 | 0 | } |
625 | | |
626 | | /* -------------------------------------------------------- */ |
627 | | |
628 | | inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16]) |
629 | 0 | { |
630 | | // The IV's are 32-byte aligned so we can use aligned loads. |
631 | 0 | _mm_storeu_si128(M128_CAST(cv_l+0), |
632 | 0 | _mm_load_si128(CONST_M128_CAST(iv+0))); |
633 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), |
634 | 0 | _mm_load_si128(CONST_M128_CAST(iv+2))); |
635 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), |
636 | 0 | _mm_load_si128(CONST_M128_CAST(iv+4))); |
637 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), |
638 | 0 | _mm_load_si128(CONST_M128_CAST(iv+6))); |
639 | 0 | _mm_storeu_si128(M128_CAST(cv_r+0), |
640 | 0 | _mm_load_si128(CONST_M128_CAST(iv+8))); |
641 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), |
642 | 0 | _mm_load_si128(CONST_M128_CAST(iv+10))); |
643 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), |
644 | 0 | _mm_load_si128(CONST_M128_CAST(iv+12))); |
645 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), |
646 | 0 | _mm_load_si128(CONST_M128_CAST(iv+14))); |
647 | 0 | } |
648 | | |
649 | | inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8]) |
650 | 0 | { |
651 | 0 | _mm_storeu_si128(M128_CAST(cv_l+0), _mm_setzero_si128()); |
652 | 0 | _mm_storeu_si128(M128_CAST(cv_l+2), _mm_setzero_si128()); |
653 | 0 | _mm_storeu_si128(M128_CAST(cv_l+4), _mm_setzero_si128()); |
654 | 0 | _mm_storeu_si128(M128_CAST(cv_l+6), _mm_setzero_si128()); |
655 | 0 | _mm_storeu_si128(M128_CAST(cv_r+0), _mm_setzero_si128()); |
656 | 0 | _mm_storeu_si128(M128_CAST(cv_r+2), _mm_setzero_si128()); |
657 | 0 | _mm_storeu_si128(M128_CAST(cv_r+4), _mm_setzero_si128()); |
658 | 0 | _mm_storeu_si128(M128_CAST(cv_r+6), _mm_setzero_si128()); |
659 | 0 | } |
660 | | |
661 | | inline void zero_submsgs(LSH512_SSSE3_Context* ctx) |
662 | 0 | { |
663 | 0 | lsh_u64* sub_msgs = ctx->sub_msgs; |
664 | |
|
665 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 0), |
666 | 0 | _mm_setzero_si128()); |
667 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 2), |
668 | 0 | _mm_setzero_si128()); |
669 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 4), |
670 | 0 | _mm_setzero_si128()); |
671 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 6), |
672 | 0 | _mm_setzero_si128()); |
673 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+ 8), |
674 | 0 | _mm_setzero_si128()); |
675 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+10), |
676 | 0 | _mm_setzero_si128()); |
677 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+12), |
678 | 0 | _mm_setzero_si128()); |
679 | 0 | _mm_storeu_si128(M128_CAST(sub_msgs+14), |
680 | 0 | _mm_setzero_si128()); |
681 | 0 | } |
682 | | |
// Initialize the context for LSH-512-224: clear the sub-message
// schedule and load the 224-bit IV into the chaining variables.
inline void init224(LSH512_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);
	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
}
690 | | |
// Initialize the context for LSH-512-256: clear the sub-message
// schedule and load the 256-bit IV into the chaining variables.
inline void init256(LSH512_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);
	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
}
698 | | |
// Initialize the context for LSH-512-384: clear the sub-message
// schedule and load the 384-bit IV into the chaining variables.
inline void init384(LSH512_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);
	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
}
706 | | |
// Initialize the context for LSH-512-512: clear the sub-message
// schedule and load the 512-bit IV into the chaining variables.
inline void init512(LSH512_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);
	load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
}
714 | | |
715 | | /* -------------------------------------------------------- */ |
716 | | |
717 | | inline void fin(LSH512_SSSE3_Context* ctx) |
718 | 0 | { |
719 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
720 | |
|
721 | 0 | _mm_storeu_si128(M128_CAST(ctx->cv_l+0), _mm_xor_si128( |
722 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+0)), |
723 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+0)))); |
724 | 0 | _mm_storeu_si128(M128_CAST(ctx->cv_l+2), _mm_xor_si128( |
725 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+2)), |
726 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+2)))); |
727 | 0 | _mm_storeu_si128(M128_CAST(ctx->cv_l+4), _mm_xor_si128( |
728 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+4)), |
729 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+4)))); |
730 | 0 | _mm_storeu_si128(M128_CAST(ctx->cv_l+6), _mm_xor_si128( |
731 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_l+6)), |
732 | 0 | _mm_loadu_si128(CONST_M128_CAST(ctx->cv_r+6)))); |
733 | 0 | } |
734 | | |
735 | | /* -------------------------------------------------------- */ |
736 | | |
737 | | inline void get_hash(LSH512_SSSE3_Context* ctx, lsh_u8* pbHashVal) |
738 | 0 | { |
739 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
740 | 0 | CRYPTOPP_ASSERT(ctx->alg_type != 0); |
741 | 0 | CRYPTOPP_ASSERT(pbHashVal != NULLPTR); |
742 | |
|
743 | 0 | lsh_uint alg_type = ctx->alg_type; |
744 | 0 | lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type); |
745 | 0 | lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type); |
746 | | |
747 | | // Multiplying by sizeof(lsh_u8) looks odd... |
748 | 0 | std::memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len); |
749 | 0 | if (hash_val_bit_len){ |
750 | 0 | pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len); |
751 | 0 | } |
752 | 0 | } |
753 | | |
754 | | /* -------------------------------------------------------- */ |
755 | | |
756 | | lsh_err lsh512_init_ssse3(LSH512_SSSE3_Context* ctx) |
757 | 0 | { |
758 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
759 | 0 | CRYPTOPP_ASSERT(ctx->alg_type != 0); |
760 | |
|
761 | 0 | lsh_u32 alg_type = ctx->alg_type; |
762 | 0 | const lsh_u64* const_v = NULL; |
763 | 0 | ctx->remain_databitlen = 0; |
764 | |
|
765 | 0 | switch (alg_type){ |
766 | 0 | case LSH_TYPE_512_512: |
767 | 0 | init512(ctx); |
768 | 0 | return LSH_SUCCESS; |
769 | 0 | case LSH_TYPE_512_384: |
770 | 0 | init384(ctx); |
771 | 0 | return LSH_SUCCESS; |
772 | 0 | case LSH_TYPE_512_256: |
773 | 0 | init256(ctx); |
774 | 0 | return LSH_SUCCESS; |
775 | 0 | case LSH_TYPE_512_224: |
776 | 0 | init224(ctx); |
777 | 0 | return LSH_SUCCESS; |
778 | 0 | default: |
779 | 0 | break; |
780 | 0 | } |
781 | | |
782 | 0 | lsh_u64* cv_l = ctx->cv_l; |
783 | 0 | lsh_u64* cv_r = ctx->cv_r; |
784 | |
|
785 | 0 | zero_iv(cv_l, cv_r); |
786 | 0 | cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN; |
787 | 0 | cv_l[1] = LSH_GET_HASHBIT(alg_type); |
788 | |
|
789 | 0 | for (size_t i = 0; i < NUM_STEPS / 2; i++) |
790 | 0 | { |
791 | | //Mix |
792 | 0 | load_sc(&const_v, i * 16); |
793 | 0 | mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v); |
794 | 0 | word_perm(cv_l, cv_r); |
795 | |
|
796 | 0 | load_sc(&const_v, i * 16 + 8); |
797 | 0 | mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v); |
798 | 0 | word_perm(cv_l, cv_r); |
799 | 0 | } |
800 | |
|
801 | 0 | return LSH_SUCCESS; |
802 | 0 | } |
803 | | |
// Absorb message data into the hash state. Data is buffered in
// ctx->last_block until a full 256-byte block is available, then
// compressed. databitlen must be a multiple of 8 (byte oriented).
// Returns LSH_SUCCESS, or an error if the context state is invalid.
lsh_err lsh512_update_ssse3(LSH512_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(data != NULLPTR);
	CRYPTOPP_ASSERT(databitlen % 8 == 0);
	CRYPTOPP_ASSERT(ctx->alg_type != 0);

	if (databitlen == 0){
		return LSH_SUCCESS;
	}

	// We are byte oriented. tail bits will always be 0.
	size_t databytelen = databitlen >> 3;
	// lsh_uint pos2 = databitlen & 0x7;
	// pos2 is compile-time 0; the bit-granular branches below are
	// retained from the reference implementation but are dead code.
	const size_t pos2 = 0;

	size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
	// lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
	const size_t remain_msg_bit = 0;

	// A full or overfull buffer indicates a corrupted context.
	if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
		return LSH_ERR_INVALID_STATE;
	}
	if (remain_msg_bit > 0){
		return LSH_ERR_INVALID_DATABITLEN;
	}

	// Not enough data to complete a block: just buffer it.
	if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
		std::memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
		ctx->remain_databitlen += (lsh_uint)databitlen;
		remain_msg_byte += (lsh_uint)databytelen;
		if (pos2){
			ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
		}
		return LSH_SUCCESS;
	}

	// Complete the partially-filled buffer and compress it.
	if (remain_msg_byte > 0){
		size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
		std::memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
		compress(ctx, ctx->last_block);
		data += more_byte;
		databytelen -= more_byte;
		remain_msg_byte = 0;
		ctx->remain_databitlen = 0;
	}

	// Compress full blocks directly from the caller's buffer.
	while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
	{
		// This call to compress caused some trouble.
		// The data pointer can become unaligned in the
		// previous block.
		compress(ctx, data);
		data += LSH512_MSG_BLK_BYTE_LEN;
		databytelen -= LSH512_MSG_BLK_BYTE_LEN;
	}

	// Buffer any remaining tail bytes for the next update/final.
	if (databytelen > 0){
		std::memcpy(ctx->last_block, data, databytelen);
		ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
	}

	// Dead code (pos2 == 0): bit-granular tail handling.
	if (pos2){
		ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
		ctx->remain_databitlen += pos2;
	}
	return LSH_SUCCESS;
}
872 | | |
873 | | lsh_err lsh512_final_ssse3(LSH512_SSSE3_Context* ctx, lsh_u8* hashval) |
874 | 0 | { |
875 | 0 | CRYPTOPP_ASSERT(ctx != NULLPTR); |
876 | 0 | CRYPTOPP_ASSERT(hashval != NULLPTR); |
877 | | |
878 | | // We are byte oriented. tail bits will always be 0. |
879 | 0 | size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3); |
880 | | // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7; |
881 | 0 | const size_t remain_msg_bit = 0; |
882 | |
|
883 | 0 | if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){ |
884 | 0 | return LSH_ERR_INVALID_STATE; |
885 | 0 | } |
886 | | |
887 | 0 | if (remain_msg_bit){ |
888 | 0 | ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit)); |
889 | 0 | } |
890 | 0 | else{ |
891 | 0 | ctx->last_block[remain_msg_byte] = 0x80; |
892 | 0 | } |
893 | 0 | std::memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1); |
894 | |
|
895 | 0 | compress(ctx, ctx->last_block); |
896 | |
|
897 | 0 | fin(ctx); |
898 | 0 | get_hash(ctx, hashval); |
899 | |
|
900 | 0 | return LSH_SUCCESS; |
901 | 0 | } |
902 | | |
903 | | ANONYMOUS_NAMESPACE_END |
904 | | |
905 | | NAMESPACE_BEGIN(CryptoPP) |
906 | | |
907 | | extern |
908 | | void LSH512_Base_Restart_SSSE3(word64* state) |
909 | 0 | { |
910 | 0 | state[RemainingBits] = 0; |
911 | 0 | LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]); |
912 | 0 | lsh_err err = lsh512_init_ssse3(&ctx); |
913 | |
|
914 | 0 | if (err != LSH_SUCCESS) |
915 | 0 | throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_ssse3 failed"); |
916 | 0 | } |
917 | | |
918 | | extern |
919 | | void LSH512_Base_Update_SSSE3(word64* state, const byte *input, size_t size) |
920 | 0 | { |
921 | 0 | LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]); |
922 | 0 | lsh_err err = lsh512_update_ssse3(&ctx, input, 8*size); |
923 | |
|
924 | 0 | if (err != LSH_SUCCESS) |
925 | 0 | throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_ssse3 failed"); |
926 | 0 | } |
927 | | |
928 | | extern |
929 | | void LSH512_Base_TruncatedFinal_SSSE3(word64* state, byte *hash, size_t) |
930 | 0 | { |
931 | 0 | LSH512_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]); |
932 | 0 | lsh_err err = lsh512_final_ssse3(&ctx, hash); |
933 | |
|
934 | 0 | if (err != LSH_SUCCESS) |
935 | 0 | throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_ssse3 failed"); |
936 | 0 | } |
937 | | |
938 | | NAMESPACE_END |
939 | | |
940 | | #endif // CRYPTOPP_SSSE3_AVAILABLE |