Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/lsh512_avx.cpp
Line
Count
Source (jump to first uncovered line)
1
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
2
//           Based on the specification and source code provided by
3
//           Korea Internet & Security Agency (KISA) website. Also
4
//           see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
5
//           and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
6
7
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
8
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
9
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
10
// makes using zeroupper a little tricky.
11
12
#include "pch.h"
13
#include "config.h"
14
15
#include "lsh.h"
16
#include "misc.h"
17
18
// Squash MS LNK4221 and libtool warnings
19
extern const char LSH512_AVX_FNAME[] = __FILE__;
20
21
#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
22
23
#if defined(CRYPTOPP_AVX2_AVAILABLE)
24
# include <emmintrin.h>
25
# include <immintrin.h>
26
#endif
27
28
#if defined(CRYPTOPP_GCC_COMPATIBLE)
29
# include <x86intrin.h>
30
#endif
31
32
ANONYMOUS_NAMESPACE_BEGIN
33
34
/* LSH Constants */
35
36
const unsigned int LSH512_MSG_BLK_BYTE_LEN = 256;
37
// const unsigned int LSH512_MSG_BLK_BIT_LEN = 2048;
38
// const unsigned int LSH512_CV_BYTE_LEN = 128;
39
const unsigned int LSH512_HASH_VAL_MAX_BYTE_LEN = 64;
40
41
// const unsigned int MSG_BLK_WORD_LEN = 32;
42
const unsigned int CV_WORD_LEN = 16;
43
const unsigned int CONST_WORD_LEN = 8;
44
// const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
45
const unsigned int NUM_STEPS = 28;
46
47
const unsigned int ROT_EVEN_ALPHA = 23;
48
const unsigned int ROT_EVEN_BETA = 59;
49
const unsigned int ROT_ODD_ALPHA = 7;
50
const unsigned int ROT_ODD_BETA = 3;
51
52
const unsigned int LSH_TYPE_512_512 = 0x0010040;
53
const unsigned int LSH_TYPE_512_384 = 0x0010030;
54
const unsigned int LSH_TYPE_512_256 = 0x0010020;
55
const unsigned int LSH_TYPE_512_224 = 0x001001C;
56
57
// const unsigned int LSH_TYPE_384 = LSH_TYPE_512_384;
58
// const unsigned int LSH_TYPE_512 = LSH_TYPE_512_512;
59
60
/* Error Code */
61
62
const unsigned int LSH_SUCCESS = 0x0;
63
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
64
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
65
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
66
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
67
68
/* Index into our state array */
69
70
const unsigned int AlgorithmType = 80;
71
const unsigned int RemainingBits = 81;
72
73
NAMESPACE_END
74
75
NAMESPACE_BEGIN(CryptoPP)
76
NAMESPACE_BEGIN(LSH)
77
78
// lsh512.cpp
79
extern const word64 LSH512_IV224[CV_WORD_LEN];
80
extern const word64 LSH512_IV256[CV_WORD_LEN];
81
extern const word64 LSH512_IV384[CV_WORD_LEN];
82
extern const word64 LSH512_IV512[CV_WORD_LEN];
83
extern const word64 LSH512_StepConstants[CONST_WORD_LEN * NUM_STEPS];
84
85
NAMESPACE_END  // LSH
86
NAMESPACE_END  // Crypto++
87
88
ANONYMOUS_NAMESPACE_BEGIN
89
90
using CryptoPP::byte;
91
using CryptoPP::word32;
92
using CryptoPP::word64;
93
using CryptoPP::rotlFixed;
94
using CryptoPP::rotlConstant;
95
96
using CryptoPP::GetBlock;
97
using CryptoPP::LittleEndian;
98
using CryptoPP::ConditionalByteReverse;
99
using CryptoPP::LITTLE_ENDIAN_ORDER;
100
101
using CryptoPP::LSH::LSH512_IV224;
102
using CryptoPP::LSH::LSH512_IV256;
103
using CryptoPP::LSH::LSH512_IV384;
104
using CryptoPP::LSH::LSH512_IV512;
105
using CryptoPP::LSH::LSH512_StepConstants;
106
107
typedef byte lsh_u8;
108
typedef word32 lsh_u32;
109
typedef word64 lsh_u64;
110
typedef word32 lsh_uint;
111
typedef word32 lsh_err;
112
typedef word32 lsh_type;
113
114
struct LSH512_AVX2_Context
115
{
116
  LSH512_AVX2_Context(word64* state, word64 algType, word64& remainingBitLength) :
117
    cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
118
    last_block(reinterpret_cast<byte*>(state+48)),
119
    remain_databitlen(remainingBitLength),
120
129k
    alg_type(static_cast<lsh_type>(algType)) {}
121
122
  lsh_u64* cv_l;  // start of our state block
123
  lsh_u64* cv_r;
124
  lsh_u64* sub_msgs;
125
  lsh_u8*  last_block;
126
  lsh_u64& remain_databitlen;
127
  lsh_type alg_type;
128
};
129
130
struct LSH512_AVX2_Internal
131
{
132
  LSH512_AVX2_Internal(word64* state) :
133
    submsg_e_l(state+16), submsg_e_r(state+24),
134
209k
    submsg_o_l(state+32), submsg_o_r(state+40) { }
135
136
  lsh_u64* submsg_e_l; /* even left sub-message  */
137
  lsh_u64* submsg_e_r; /* even right sub-message */
138
  lsh_u64* submsg_o_l; /* odd left sub-message   */
139
  lsh_u64* submsg_o_r; /* odd right sub-message  */
140
};
141
142
// Zero the upper 128 bits of all YMM registers on exit.
143
// It avoids AVX state transition penalties when saving state.
144
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735
145
// makes using zeroupper a little tricky.
146
147
struct AVX_Cleanup
148
{
149
129k
  ~AVX_Cleanup() {
150
129k
    _mm256_zeroupper();
151
129k
  }
152
};
153
154
// const lsh_u32 g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
155
156
/* LSH AlgType Macro */
157
158
0
inline bool LSH_IS_LSH512(lsh_uint val) {
159
0
  return (val & 0xf0000) == 0x10000;
160
0
}
161
162
17.8k
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
163
17.8k
  return val >> 24;
164
17.8k
}
165
166
17.8k
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
167
17.8k
  return val & 0xffff;
168
17.8k
}
169
170
0
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
171
0
  return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
172
0
}
173
174
0
inline lsh_u64 loadLE64(lsh_u64 v) {
175
0
  return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
176
0
}
177
178
0
lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) {
179
0
  return rotlFixed(x, r);
180
0
}
181
182
// Original code relied upon unaligned lsh_u64 buffer
183
inline void load_msg_blk(LSH512_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH512_MSG_BLK_BYTE_LEN])
184
209k
{
185
209k
  lsh_u64* submsg_e_l = i_state->submsg_e_l;
186
209k
  lsh_u64* submsg_e_r = i_state->submsg_e_r;
187
209k
  lsh_u64* submsg_o_l = i_state->submsg_o_l;
188
209k
  lsh_u64* submsg_o_r = i_state->submsg_o_r;
189
190
209k
  _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
191
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
192
209k
  _mm256_storeu_si256(M256_CAST(submsg_e_l+4),
193
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
194
195
209k
  _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
196
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
197
209k
  _mm256_storeu_si256(M256_CAST(submsg_e_r+4),
198
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
199
200
209k
  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
201
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+128)));
202
209k
  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
203
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+160)));
204
205
209k
  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
206
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+192)));
207
209k
  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
208
209k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+224)));
209
209k
}
210
211
inline void msg_exp_even(LSH512_AVX2_Internal* i_state)
212
2.93M
{
213
2.93M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
214
215
2.93M
  lsh_u64* submsg_e_l = i_state->submsg_e_l;
216
2.93M
  lsh_u64* submsg_e_r = i_state->submsg_e_r;
217
2.93M
  lsh_u64* submsg_o_l = i_state->submsg_o_l;
218
2.93M
  lsh_u64* submsg_o_r = i_state->submsg_o_r;
219
220
2.93M
  _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi64(
221
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
222
2.93M
    _mm256_permute4x64_epi64(
223
2.93M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
224
2.93M
      _MM_SHUFFLE(1,0,2,3))));
225
2.93M
  _mm256_storeu_si256(M256_CAST(submsg_e_l+4), _mm256_add_epi64(
226
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
227
2.93M
    _mm256_permute4x64_epi64(
228
2.93M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
229
2.93M
      _MM_SHUFFLE(2,1,0,3))));
230
231
2.93M
  _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi64(
232
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
233
2.93M
    _mm256_permute4x64_epi64(
234
2.93M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
235
2.93M
      _MM_SHUFFLE(1,0,2,3))));
236
2.93M
  _mm256_storeu_si256(M256_CAST(submsg_e_r+4), _mm256_add_epi64(
237
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
238
2.93M
    _mm256_permute4x64_epi64(
239
2.93M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
240
2.93M
      _MM_SHUFFLE(2,1,0,3))));
241
2.93M
}
242
243
inline void msg_exp_odd(LSH512_AVX2_Internal* i_state)
244
2.72M
{
245
2.72M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
246
247
2.72M
  lsh_u64* submsg_e_l = i_state->submsg_e_l;
248
2.72M
  lsh_u64* submsg_e_r = i_state->submsg_e_r;
249
2.72M
  lsh_u64* submsg_o_l = i_state->submsg_o_l;
250
2.72M
  lsh_u64* submsg_o_r = i_state->submsg_o_r;
251
252
2.72M
  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
253
2.72M
    _mm256_add_epi64(
254
2.72M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
255
2.72M
      _mm256_permute4x64_epi64(
256
2.72M
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
257
2.72M
        _MM_SHUFFLE(1,0,2,3))));
258
2.72M
  _mm256_storeu_si256(M256_CAST(submsg_o_l+4),
259
2.72M
    _mm256_add_epi64(
260
2.72M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4)),
261
2.72M
      _mm256_permute4x64_epi64(
262
2.72M
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4)),
263
2.72M
        _MM_SHUFFLE(2,1,0,3))));
264
265
2.72M
  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
266
2.72M
    _mm256_add_epi64(
267
2.72M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
268
2.72M
      _mm256_permute4x64_epi64(
269
2.72M
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
270
2.72M
        _MM_SHUFFLE(1,0,2,3))));
271
2.72M
  _mm256_storeu_si256(M256_CAST(submsg_o_r+4),
272
2.72M
    _mm256_add_epi64(
273
2.72M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4)),
274
2.72M
      _mm256_permute4x64_epi64(
275
2.72M
        _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4)),
276
2.72M
        _MM_SHUFFLE(2,1,0,3))));
277
2.72M
}
278
279
inline void load_sc(const lsh_u64** p_const_v, size_t i)
280
5.86M
{
281
5.86M
  *p_const_v = &LSH512_StepConstants[i];
282
5.86M
}
283
284
inline void msg_add_even(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
285
3.14M
{
286
3.14M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
287
288
3.14M
  lsh_u64* submsg_e_l = i_state->submsg_e_l;
289
3.14M
  lsh_u64* submsg_e_r = i_state->submsg_e_r;
290
291
3.14M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
292
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
293
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l))));
294
3.14M
  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
295
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
296
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r))));
297
298
3.14M
  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
299
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
300
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+4))));
301
3.14M
  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
302
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
303
3.14M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+4))));
304
3.14M
}
305
306
inline void msg_add_odd(lsh_u64 cv_l[8], lsh_u64 cv_r[8], LSH512_AVX2_Internal* i_state)
307
2.93M
{
308
2.93M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
309
310
2.93M
  lsh_u64* submsg_o_l = i_state->submsg_o_l;
311
2.93M
  lsh_u64* submsg_o_r = i_state->submsg_o_r;
312
313
2.93M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
314
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
315
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
316
2.93M
  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
317
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
318
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
319
320
2.93M
  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
321
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
322
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+4))));
323
2.93M
  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_xor_si256(
324
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
325
2.93M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+4))));
326
2.93M
}
327
328
inline void add_blk(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
329
17.6M
{
330
17.6M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi64(
331
17.6M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
332
17.6M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
333
17.6M
  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_add_epi64(
334
17.6M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
335
17.6M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+4))));
336
17.6M
}
337
338
template <unsigned int R>
339
inline void rotate_blk(lsh_u64 cv[8])
340
11.7M
{
341
11.7M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
342
11.7M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
343
11.7M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
344
11.7M
  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
345
11.7M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
346
11.7M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
347
11.7M
}
lsh512_avx.cpp:void (anonymous namespace)::rotate_blk<23u>(unsigned long*)
Line
Count
Source
340
2.93M
{
341
2.93M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
342
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
343
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
344
2.93M
  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
345
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
346
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
347
2.93M
}
lsh512_avx.cpp:void (anonymous namespace)::rotate_blk<59u>(unsigned long*)
Line
Count
Source
340
2.93M
{
341
2.93M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
342
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
343
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
344
2.93M
  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
345
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
346
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
347
2.93M
}
lsh512_avx.cpp:void (anonymous namespace)::rotate_blk<7u>(unsigned long*)
Line
Count
Source
340
2.93M
{
341
2.93M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
342
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
343
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
344
2.93M
  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
345
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
346
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
347
2.93M
}
lsh512_avx.cpp:void (anonymous namespace)::rotate_blk<3u>(unsigned long*)
Line
Count
Source
340
2.93M
{
341
2.93M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
342
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
343
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv)), 64-R)));
344
2.93M
  _mm256_storeu_si256(M256_CAST(cv+4), _mm256_or_si256(
345
2.93M
    _mm256_slli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), R),
346
2.93M
    _mm256_srli_epi64(_mm256_loadu_si256(CONST_M256_CAST(cv+4)), 64-R)));
347
2.93M
}
348
349
inline void xor_with_const(lsh_u64 cv_l[8], const lsh_u64 const_v[8])
350
5.86M
{
351
5.86M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
352
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
353
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(const_v))));
354
5.86M
  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_xor_si256(
355
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)),
356
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(const_v+4))));
357
5.86M
}
358
359
inline void rotate_msg_gamma(lsh_u64 cv_r[8])
360
5.86M
{
361
  // g_gamma512[8] = { 0, 16, 32, 48, 8, 24, 40, 56 };
362
5.86M
  _mm256_storeu_si256(M256_CAST(cv_r+0),
363
5.86M
    _mm256_shuffle_epi8(
364
5.86M
      _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
365
5.86M
      _mm256_set_epi8(
366
5.86M
        /* hi lane */ 9,8,15,14, 13,12,11,10, 3,2,1,0, 7,6,5,4,
367
5.86M
        /* lo lane */ 13,12,11,10, 9,8,15,14, 7,6,5,4, 3,2,1,0)));
368
5.86M
  _mm256_storeu_si256(M256_CAST(cv_r+4),
369
5.86M
    _mm256_shuffle_epi8(
370
5.86M
      _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)),
371
5.86M
      _mm256_set_epi8(
372
5.86M
        /* hi lane */ 8,15,14,13, 12,11,10,9, 2,1,0,7, 6,5,4,3,
373
5.86M
        /* lo lane */ 12,11,10,9, 8,15,14,13, 6,5,4,3, 2,1,0,7)));
374
5.86M
}
375
376
inline void word_perm(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
377
5.86M
{
378
5.86M
  __m256i temp[2];
379
5.86M
  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_permute4x64_epi64(
380
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2)));
381
5.86M
  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_permute4x64_epi64(
382
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2)));
383
5.86M
  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_permute4x64_epi64(
384
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0)));
385
5.86M
  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_permute4x64_epi64(
386
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0)));
387
388
5.86M
  temp[0] = _mm256_loadu_si256(CONST_M256_CAST(cv_l+0));
389
5.86M
  temp[1] = _mm256_loadu_si256(CONST_M256_CAST(cv_r+0));
390
391
5.86M
  _mm256_storeu_si256(M256_CAST(cv_l+0),
392
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+4)));
393
5.86M
  _mm256_storeu_si256(M256_CAST(cv_l+4),
394
5.86M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+4)));
395
396
5.86M
  _mm256_storeu_si256(M256_CAST(cv_r+0), temp[0]);
397
5.86M
  _mm256_storeu_si256(M256_CAST(cv_r+4), temp[1]);
398
5.86M
}
399
400
/* -------------------------------------------------------- *
401
* step function
402
* -------------------------------------------------------- */
403
404
template <unsigned int Alpha, unsigned int Beta>
405
inline void mix(lsh_u64 cv_l[8], lsh_u64 cv_r[8], const lsh_u64 const_v[8])
406
5.86M
{
407
5.86M
  add_blk(cv_l, cv_r);
408
5.86M
  rotate_blk<Alpha>(cv_l);
409
5.86M
  xor_with_const(cv_l, const_v);
410
5.86M
  add_blk(cv_r, cv_l);
411
5.86M
  rotate_blk<Beta>(cv_r);
412
5.86M
  add_blk(cv_l, cv_r);
413
5.86M
  rotate_msg_gamma(cv_r);
414
5.86M
}
lsh512_avx.cpp:void (anonymous namespace)::mix<23u, 59u>(unsigned long*, unsigned long*, unsigned long const*)
Line
Count
Source
406
2.93M
{
407
2.93M
  add_blk(cv_l, cv_r);
408
2.93M
  rotate_blk<Alpha>(cv_l);
409
2.93M
  xor_with_const(cv_l, const_v);
410
2.93M
  add_blk(cv_r, cv_l);
411
2.93M
  rotate_blk<Beta>(cv_r);
412
2.93M
  add_blk(cv_l, cv_r);
413
2.93M
  rotate_msg_gamma(cv_r);
414
2.93M
}
lsh512_avx.cpp:void (anonymous namespace)::mix<7u, 3u>(unsigned long*, unsigned long*, unsigned long const*)
Line
Count
Source
406
2.93M
{
407
2.93M
  add_blk(cv_l, cv_r);
408
2.93M
  rotate_blk<Alpha>(cv_l);
409
2.93M
  xor_with_const(cv_l, const_v);
410
2.93M
  add_blk(cv_r, cv_l);
411
2.93M
  rotate_blk<Beta>(cv_r);
412
2.93M
  add_blk(cv_l, cv_r);
413
2.93M
  rotate_msg_gamma(cv_r);
414
2.93M
}
415
416
/* -------------------------------------------------------- *
417
* compression function
418
* -------------------------------------------------------- */
419
420
inline void compress(LSH512_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_BYTE_LEN])
421
209k
{
422
209k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
423
424
209k
  LSH512_AVX2_Internal  s_state(ctx->cv_l);
425
209k
  LSH512_AVX2_Internal* i_state = &s_state;
426
427
209k
  const lsh_u64* const_v = NULL;
428
209k
  lsh_u64 *cv_l = ctx->cv_l;
429
209k
  lsh_u64 *cv_r = ctx->cv_r;
430
431
209k
  load_msg_blk(i_state, pdMsgBlk);
432
433
209k
  msg_add_even(cv_l, cv_r, i_state);
434
209k
  load_sc(&const_v, 0);
435
209k
  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
436
209k
  word_perm(cv_l, cv_r);
437
438
209k
  msg_add_odd(cv_l, cv_r, i_state);
439
209k
  load_sc(&const_v, 8);
440
209k
  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
441
209k
  word_perm(cv_l, cv_r);
442
443
2.93M
  for (size_t i = 1; i < NUM_STEPS / 2; i++)
444
2.72M
  {
445
2.72M
    msg_exp_even(i_state);
446
2.72M
    msg_add_even(cv_l, cv_r, i_state);
447
2.72M
    load_sc(&const_v, 16 * i);
448
2.72M
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
449
2.72M
    word_perm(cv_l, cv_r);
450
451
2.72M
    msg_exp_odd(i_state);
452
2.72M
    msg_add_odd(cv_l, cv_r, i_state);
453
2.72M
    load_sc(&const_v, 16 * i + 8);
454
2.72M
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
455
2.72M
    word_perm(cv_l, cv_r);
456
2.72M
  }
457
458
209k
  msg_exp_even(i_state);
459
209k
  msg_add_even(cv_l, cv_r, i_state);
460
209k
}
461
462
/* -------------------------------------------------------- */
463
464
inline void load_iv(word64 cv_l[8], word64 cv_r[8], const word64 iv[16])
465
18.2k
{
466
  // The IV's are 32-byte aligned so we can use aligned loads.
467
18.2k
  _mm256_storeu_si256(M256_CAST(cv_l+0),
468
18.2k
    _mm256_load_si256(CONST_M256_CAST(iv+0)));
469
18.2k
  _mm256_storeu_si256(M256_CAST(cv_l+4),
470
18.2k
    _mm256_load_si256(CONST_M256_CAST(iv+4)));
471
472
18.2k
  _mm256_storeu_si256(M256_CAST(cv_r+0),
473
18.2k
    _mm256_load_si256(CONST_M256_CAST(iv+8)));
474
18.2k
  _mm256_storeu_si256(M256_CAST(cv_r+4),
475
18.2k
    _mm256_load_si256(CONST_M256_CAST(iv+12)));
476
18.2k
}
477
478
inline void zero_iv(lsh_u64 cv_l[8], lsh_u64 cv_r[8])
479
0
{
480
0
  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
481
0
  _mm256_storeu_si256(M256_CAST(cv_l+4), _mm256_setzero_si256());
482
0
  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
483
0
  _mm256_storeu_si256(M256_CAST(cv_r+4), _mm256_setzero_si256());
484
0
}
485
486
inline void zero_submsgs(LSH512_AVX2_Context* ctx)
487
18.2k
{
488
18.2k
  lsh_u64* sub_msgs = ctx->sub_msgs;
489
490
18.2k
  _mm256_storeu_si256(M256_CAST(sub_msgs+ 0),
491
18.2k
    _mm256_setzero_si256());
492
18.2k
  _mm256_storeu_si256(M256_CAST(sub_msgs+ 4),
493
18.2k
    _mm256_setzero_si256());
494
495
18.2k
  _mm256_storeu_si256(M256_CAST(sub_msgs+ 8),
496
18.2k
    _mm256_setzero_si256());
497
18.2k
  _mm256_storeu_si256(M256_CAST(sub_msgs+12),
498
18.2k
    _mm256_setzero_si256());
499
18.2k
}
500
501
inline void init224(LSH512_AVX2_Context* ctx)
502
0
{
503
0
  CRYPTOPP_ASSERT(ctx != NULLPTR);
504
505
0
  zero_submsgs(ctx);
506
0
  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV224);
507
0
}
508
509
inline void init256(LSH512_AVX2_Context* ctx)
510
5.48k
{
511
5.48k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
512
513
5.48k
  zero_submsgs(ctx);
514
5.48k
  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV256);
515
5.48k
}
516
517
inline void init384(LSH512_AVX2_Context* ctx)
518
6.51k
{
519
6.51k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
520
521
6.51k
  zero_submsgs(ctx);
522
6.51k
  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV384);
523
6.51k
}
524
525
inline void init512(LSH512_AVX2_Context* ctx)
526
6.27k
{
527
6.27k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
528
529
6.27k
  zero_submsgs(ctx);
530
6.27k
  load_iv(ctx->cv_l, ctx->cv_r, LSH512_IV512);
531
6.27k
}
532
533
/* -------------------------------------------------------- */
534
535
inline void fin(LSH512_AVX2_Context* ctx)
536
17.8k
{
537
17.8k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
538
539
17.8k
  _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
540
17.8k
    _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
541
17.8k
    _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
542
543
17.8k
  _mm256_storeu_si256(M256_CAST(ctx->cv_l+4), _mm256_xor_si256(
544
17.8k
    _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+4)),
545
17.8k
    _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+4))));
546
17.8k
}
547
548
/* -------------------------------------------------------- */
549
550
inline void get_hash(LSH512_AVX2_Context* ctx, lsh_u8* pbHashVal)
551
17.8k
{
552
17.8k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
553
17.8k
  CRYPTOPP_ASSERT(ctx->alg_type != 0);
554
17.8k
  CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
555
556
17.8k
  lsh_uint alg_type = ctx->alg_type;
557
17.8k
  lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
558
17.8k
  lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
559
560
  // Multiplying by sizeof(lsh_u8) looks odd...
561
17.8k
  std::memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
562
17.8k
  if (hash_val_bit_len){
563
0
    pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
564
0
  }
565
17.8k
}
566
567
/* -------------------------------------------------------- */
568
569
lsh_err lsh512_init_avx2(LSH512_AVX2_Context* ctx)
570
18.2k
{
571
18.2k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
572
18.2k
  CRYPTOPP_ASSERT(ctx->alg_type != 0);
573
574
18.2k
  lsh_u32 alg_type = ctx->alg_type;
575
18.2k
  const lsh_u64* const_v = NULL;
576
18.2k
  ctx->remain_databitlen = 0;
577
578
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
579
18.2k
  AVX_Cleanup cleanup;
580
581
18.2k
  switch (alg_type){
582
6.27k
  case LSH_TYPE_512_512:
583
6.27k
    init512(ctx);
584
6.27k
    return LSH_SUCCESS;
585
6.51k
  case LSH_TYPE_512_384:
586
6.51k
    init384(ctx);
587
6.51k
    return LSH_SUCCESS;
588
5.48k
  case LSH_TYPE_512_256:
589
5.48k
    init256(ctx);
590
5.48k
    return LSH_SUCCESS;
591
0
  case LSH_TYPE_512_224:
592
0
    init224(ctx);
593
0
    return LSH_SUCCESS;
594
0
  default:
595
0
    break;
596
18.2k
  }
597
598
0
  lsh_u64* cv_l = ctx->cv_l;
599
0
  lsh_u64* cv_r = ctx->cv_r;
600
601
0
  zero_iv(cv_l, cv_r);
602
0
  cv_l[0] = LSH512_HASH_VAL_MAX_BYTE_LEN;
603
0
  cv_l[1] = LSH_GET_HASHBIT(alg_type);
604
605
0
  for (size_t i = 0; i < NUM_STEPS / 2; i++)
606
0
  {
607
    //Mix
608
0
    load_sc(&const_v, i * 16);
609
0
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
610
0
    word_perm(cv_l, cv_r);
611
612
0
    load_sc(&const_v, i * 16 + 8);
613
0
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
614
0
    word_perm(cv_l, cv_r);
615
0
  }
616
617
0
  return LSH_SUCCESS;
618
18.2k
}
619
620
lsh_err lsh512_update_avx2(LSH512_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
621
93.3k
{
622
93.3k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
623
93.3k
  CRYPTOPP_ASSERT(data != NULLPTR);
624
93.3k
  CRYPTOPP_ASSERT(databitlen % 8 == 0);
625
93.3k
  CRYPTOPP_ASSERT(ctx->alg_type != 0);
626
627
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
628
93.3k
  AVX_Cleanup cleanup;
629
630
93.3k
  if (databitlen == 0){
631
47.1k
    return LSH_SUCCESS;
632
47.1k
  }
633
634
  // We are byte oriented. tail bits will always be 0.
635
46.2k
  size_t databytelen = databitlen >> 3;
636
  // lsh_uint pos2 = databitlen & 0x7;
637
46.2k
  const size_t pos2 = 0;
638
639
46.2k
  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
640
  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
641
46.2k
  const size_t remain_msg_bit = 0;
642
643
46.2k
  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
644
0
    return LSH_ERR_INVALID_STATE;
645
0
  }
646
46.2k
  if (remain_msg_bit > 0){
647
0
    return LSH_ERR_INVALID_DATABITLEN;
648
0
  }
649
650
46.2k
  if (databytelen + remain_msg_byte < LSH512_MSG_BLK_BYTE_LEN){
651
25.7k
    std::memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
652
25.7k
    ctx->remain_databitlen += (lsh_uint)databitlen;
653
25.7k
    remain_msg_byte += (lsh_uint)databytelen;
654
25.7k
    if (pos2){
655
0
      ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
656
0
    }
657
25.7k
    return LSH_SUCCESS;
658
25.7k
  }
659
660
20.4k
  if (remain_msg_byte > 0){
661
3.30k
    size_t more_byte = LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte;
662
3.30k
    std::memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
663
3.30k
    compress(ctx, ctx->last_block);
664
3.30k
    data += more_byte;
665
3.30k
    databytelen -= more_byte;
666
3.30k
    remain_msg_byte = 0;
667
3.30k
    ctx->remain_databitlen = 0;
668
3.30k
  }
669
670
208k
  while (databytelen >= LSH512_MSG_BLK_BYTE_LEN)
671
188k
  {
672
    // This call to compress caused some trouble.
673
    // The data pointer can become unaligned in the
674
    // previous block.
675
188k
    compress(ctx, data);
676
188k
    data += LSH512_MSG_BLK_BYTE_LEN;
677
188k
    databytelen -= LSH512_MSG_BLK_BYTE_LEN;
678
188k
  }
679
680
20.4k
  if (databytelen > 0){
681
4.41k
    std::memcpy(ctx->last_block, data, databytelen);
682
4.41k
    ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
683
4.41k
  }
684
685
20.4k
  if (pos2){
686
0
    ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
687
0
    ctx->remain_databitlen += pos2;
688
0
  }
689
20.4k
  return LSH_SUCCESS;
690
46.2k
}
691
692
lsh_err lsh512_final_avx2(LSH512_AVX2_Context* ctx, lsh_u8* hashval)
693
17.8k
{
694
17.8k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
695
17.8k
  CRYPTOPP_ASSERT(hashval != NULLPTR);
696
697
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
698
17.8k
  AVX_Cleanup cleanup;
699
700
  // We are byte oriented. tail bits will always be 0.
701
17.8k
  size_t remain_msg_byte = static_cast<size_t>(ctx->remain_databitlen >> 3);
702
  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
703
17.8k
  const size_t remain_msg_bit = 0;
704
705
17.8k
  if (remain_msg_byte >= LSH512_MSG_BLK_BYTE_LEN){
706
0
    return LSH_ERR_INVALID_STATE;
707
0
  }
708
709
17.8k
  if (remain_msg_bit){
710
0
    ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
711
0
  }
712
17.8k
  else{
713
17.8k
    ctx->last_block[remain_msg_byte] = 0x80;
714
17.8k
  }
715
17.8k
  std::memset(ctx->last_block + remain_msg_byte + 1, 0, LSH512_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
716
717
17.8k
  compress(ctx, ctx->last_block);
718
719
17.8k
  fin(ctx);
720
17.8k
  get_hash(ctx, hashval);
721
722
17.8k
  return LSH_SUCCESS;
723
17.8k
}
724
725
ANONYMOUS_NAMESPACE_END
726
727
NAMESPACE_BEGIN(CryptoPP)
728
729
extern
730
void LSH512_Base_Restart_AVX2(word64* state)
731
18.2k
{
732
18.2k
  state[RemainingBits] = 0;
733
18.2k
  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
734
18.2k
  lsh_err err = lsh512_init_avx2(&ctx);
735
736
18.2k
  if (err != LSH_SUCCESS)
737
0
    throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_init_avx2 failed");
738
18.2k
}
739
740
extern
741
void LSH512_Base_Update_AVX2(word64* state, const byte *input, size_t size)
742
93.3k
{
743
93.3k
  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
744
93.3k
  lsh_err err = lsh512_update_avx2(&ctx, input, 8*size);
745
746
93.3k
  if (err != LSH_SUCCESS)
747
0
    throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_update_avx2 failed");
748
93.3k
}
749
750
extern
751
void LSH512_Base_TruncatedFinal_AVX2(word64* state, byte *hash, size_t)
752
17.8k
{
753
17.8k
  LSH512_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
754
17.8k
  lsh_err err = lsh512_final_avx2(&ctx, hash);
755
756
17.8k
  if (err != LSH_SUCCESS)
757
0
    throw Exception(Exception::OTHER_ERROR, "LSH512_Base: lsh512_final_avx2 failed");
758
17.8k
}
759
760
NAMESPACE_END
761
762
#endif  // CRYPTOPP_AVX2_AVAILABLE