Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/lsh256_avx.cpp
Line
Count
Source (jump to first uncovered line)
1
// lsh.cpp - written and placed in the public domain by Jeffrey Walton
2
//           Based on the specification and source code provided by
3
//           Korea Internet & Security Agency (KISA) website. Also
4
//           see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
5
//           and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
6
7
// We are hitting some sort of GCC bug in the LSH AVX2 code path.
8
// Clang is OK on the AVX2 code path. We believe it is GCC Issue
9
// 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
10
// makes using zeroupper a little tricky.
11
12
#include "pch.h"
13
#include "config.h"
14
15
#include "lsh.h"
16
#include "misc.h"
17
18
// Squash MS LNK4221 and libtool warnings
19
extern const char LSH256_AVX_FNAME[] = __FILE__;
20
21
#if defined(CRYPTOPP_AVX2_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
22
23
#if defined(CRYPTOPP_AVX2_AVAILABLE)
24
# include <emmintrin.h>
25
# include <immintrin.h>
26
#endif
27
28
#if defined(CRYPTOPP_GCC_COMPATIBLE)
29
# include <x86intrin.h>
30
#endif
31
32
ANONYMOUS_NAMESPACE_BEGIN
33
34
/* LSH Constants */
35
36
const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;
37
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
38
// const unsigned int LSH256_CV_BYTE_LEN = 64;
39
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;
40
41
// const unsigned int MSG_BLK_WORD_LEN = 32;
42
const unsigned int CV_WORD_LEN = 16;
43
const unsigned int CONST_WORD_LEN = 8;
44
// const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
45
// const unsigned int WORD_BIT_LEN = 32;
46
const unsigned int NUM_STEPS = 26;
47
48
const unsigned int ROT_EVEN_ALPHA = 29;
49
const unsigned int ROT_EVEN_BETA = 1;
50
const unsigned int ROT_ODD_ALPHA = 5;
51
const unsigned int ROT_ODD_BETA = 17;
52
53
const unsigned int LSH_TYPE_256_256 = 0x0000020;
54
const unsigned int LSH_TYPE_256_224 = 0x000001C;
55
56
// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
57
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;
58
59
/* Error Code */
60
61
const unsigned int LSH_SUCCESS = 0x0;
62
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
63
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
64
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
65
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;
66
67
/* Index into our state array */
68
69
const unsigned int AlgorithmType = 80;
70
const unsigned int RemainingBits = 81;
71
72
NAMESPACE_END
73
74
NAMESPACE_BEGIN(CryptoPP)
75
NAMESPACE_BEGIN(LSH)
76
77
// lsh256.cpp
78
extern const word32 LSH256_IV224[CV_WORD_LEN];
79
extern const word32 LSH256_IV256[CV_WORD_LEN];
80
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];
81
82
NAMESPACE_END  // LSH
83
NAMESPACE_END  // Crypto++
84
85
ANONYMOUS_NAMESPACE_BEGIN
86
87
using CryptoPP::byte;
88
using CryptoPP::word32;
89
using CryptoPP::rotlFixed;
90
using CryptoPP::rotlConstant;
91
92
using CryptoPP::GetBlock;
93
using CryptoPP::LittleEndian;
94
using CryptoPP::ConditionalByteReverse;
95
using CryptoPP::LITTLE_ENDIAN_ORDER;
96
97
typedef byte lsh_u8;
98
typedef word32 lsh_u32;
99
typedef word32 lsh_uint;
100
typedef word32 lsh_err;
101
typedef word32 lsh_type;
102
103
using CryptoPP::LSH::LSH256_IV224;
104
using CryptoPP::LSH::LSH256_IV256;
105
using CryptoPP::LSH::LSH256_StepConstants;
106
107
struct LSH256_AVX2_Context
108
{
109
  LSH256_AVX2_Context(word32* state, word32 algType, word32& remainingBitLength) :
110
    cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
111
    last_block(reinterpret_cast<byte*>(state+48)),
112
    remain_databitlen(remainingBitLength),
113
96.8k
    alg_type(static_cast<lsh_type>(algType)) {}
114
115
  lsh_u32* cv_l;  // start of our state block
116
  lsh_u32* cv_r;
117
  lsh_u32* sub_msgs;
118
  lsh_u8*  last_block;
119
  lsh_u32& remain_databitlen;
120
  lsh_type alg_type;
121
};
122
123
struct LSH256_AVX2_Internal
124
{
125
  LSH256_AVX2_Internal(word32* state) :
126
    submsg_e_l(state+16), submsg_e_r(state+24),
127
383k
    submsg_o_l(state+32), submsg_o_r(state+40) { }
128
129
  lsh_u32* submsg_e_l; /* even left sub-message  */
130
  lsh_u32* submsg_e_r; /* even right sub-message */
131
  lsh_u32* submsg_o_l; /* odd left sub-message   */
132
  lsh_u32* submsg_o_r; /* odd right sub-message  */
133
};
134
135
// Zero the upper 128 bits of all YMM registers on exit.
136
// It avoids AVX state transition penalties when saving state.
137
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735
138
// makes using zeroupper a little tricky.
139
140
struct AVX_Cleanup
141
{
142
96.8k
  ~AVX_Cleanup() {
143
96.8k
    _mm256_zeroupper();
144
96.8k
  }
145
};
146
147
// const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
148
149
/* LSH AlgType Macro */
150
151
0
inline bool LSH_IS_LSH512(lsh_uint val) {
152
0
  return (val & 0xf0000) == 0;
153
0
}
154
155
13.8k
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
156
13.8k
  return val >> 24;
157
13.8k
}
158
159
13.8k
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
160
13.8k
  return val & 0xffff;
161
13.8k
}
162
163
0
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
164
0
  return (LSH_GET_HASHBYTE(val) << 3) - LSH_GET_SMALL_HASHBIT(val);
165
0
}
166
167
0
inline lsh_u32 loadLE32(lsh_u32 v) {
168
0
  return ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
169
0
}
170
171
0
lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
172
0
  return rotlFixed(x, r);
173
0
}
174
175
// Original code relied upon unaligned lsh_u32 buffer
176
inline void load_msg_blk(LSH256_AVX2_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
177
383k
{
178
383k
  CRYPTOPP_ASSERT(i_state != NULLPTR);
179
180
383k
  lsh_u32* submsg_e_l = i_state->submsg_e_l;
181
383k
  lsh_u32* submsg_e_r = i_state->submsg_e_r;
182
383k
  lsh_u32* submsg_o_l = i_state->submsg_o_l;
183
383k
  lsh_u32* submsg_o_r = i_state->submsg_o_r;
184
185
383k
  _mm256_storeu_si256(M256_CAST(submsg_e_l+0),
186
383k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+0)));
187
383k
  _mm256_storeu_si256(M256_CAST(submsg_e_r+0),
188
383k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+32)));
189
383k
  _mm256_storeu_si256(M256_CAST(submsg_o_l+0),
190
383k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+64)));
191
383k
  _mm256_storeu_si256(M256_CAST(submsg_o_r+0),
192
383k
    _mm256_loadu_si256(CONST_M256_CAST(msgblk+96)));
193
383k
}
194
195
inline void msg_exp_even(LSH256_AVX2_Internal* i_state)
196
4.98M
{
197
4.98M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
198
199
4.98M
  lsh_u32* submsg_e_l = i_state->submsg_e_l;
200
4.98M
  lsh_u32* submsg_e_r = i_state->submsg_e_r;
201
4.98M
  lsh_u32* submsg_o_l = i_state->submsg_o_l;
202
4.98M
  lsh_u32* submsg_o_r = i_state->submsg_o_r;
203
204
4.98M
  const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514,
205
4.98M
    0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c);
206
207
4.98M
  _mm256_storeu_si256(M256_CAST(submsg_e_l+0), _mm256_add_epi32(
208
4.98M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)),
209
4.98M
    _mm256_shuffle_epi8(
210
4.98M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)), mask)));
211
4.98M
  _mm256_storeu_si256(M256_CAST(submsg_e_r+0), _mm256_add_epi32(
212
4.98M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)),
213
4.98M
    _mm256_shuffle_epi8(
214
4.98M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)), mask)));
215
4.98M
}
216
217
inline void msg_exp_odd(LSH256_AVX2_Internal* i_state)
218
4.59M
{
219
4.59M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
220
221
4.59M
  lsh_u32* submsg_e_l = i_state->submsg_e_l;
222
4.59M
  lsh_u32* submsg_e_r = i_state->submsg_e_r;
223
4.59M
  lsh_u32* submsg_o_l = i_state->submsg_o_l;
224
4.59M
  lsh_u32* submsg_o_r = i_state->submsg_o_r;
225
226
4.59M
  const __m256i mask = _mm256_set_epi32(0x1b1a1918, 0x17161514,
227
4.59M
    0x13121110, 0x1f1e1d1c, 0x07060504, 0x03020100, 0x0b0a0908, 0x0f0e0d0c);
228
229
4.59M
  _mm256_storeu_si256(M256_CAST(submsg_o_l+0), _mm256_add_epi32(
230
4.59M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0)),
231
4.59M
    _mm256_shuffle_epi8(
232
4.59M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l+0)), mask)));
233
4.59M
  _mm256_storeu_si256(M256_CAST(submsg_o_r+0), _mm256_add_epi32(
234
4.59M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0)),
235
4.59M
    _mm256_shuffle_epi8(
236
4.59M
      _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r+0)), mask)));
237
4.59M
}
238
239
inline void load_sc(const lsh_u32** p_const_v, size_t i)
240
9.96M
{
241
9.96M
  CRYPTOPP_ASSERT(p_const_v != NULLPTR);
242
243
9.96M
  *p_const_v = &LSH256_StepConstants[i];
244
9.96M
}
245
246
inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state)
247
5.36M
{
248
5.36M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
249
250
5.36M
  lsh_u32* submsg_e_l = i_state->submsg_e_l;
251
5.36M
  lsh_u32* submsg_e_r = i_state->submsg_e_r;
252
253
5.36M
  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_xor_si256(
254
5.36M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l+0)),
255
5.36M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_l+0))));
256
5.36M
  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_xor_si256(
257
5.36M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
258
5.36M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_e_r+0))));
259
5.36M
}
260
261
inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_AVX2_Internal* i_state)
262
4.98M
{
263
4.98M
  CRYPTOPP_ASSERT(i_state != NULLPTR);
264
265
4.98M
  lsh_u32* submsg_o_l = i_state->submsg_o_l;
266
4.98M
  lsh_u32* submsg_o_r = i_state->submsg_o_r;
267
268
4.98M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
269
4.98M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
270
4.98M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_l))));
271
4.98M
  _mm256_storeu_si256(M256_CAST(cv_r), _mm256_xor_si256(
272
4.98M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r)),
273
4.98M
    _mm256_loadu_si256(CONST_M256_CAST(submsg_o_r))));
274
4.98M
}
275
276
inline void add_blk(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
277
29.8M
{
278
29.8M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_add_epi32(
279
29.8M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
280
29.8M
    _mm256_loadu_si256(CONST_M256_CAST(cv_r))));
281
29.8M
}
282
283
template <unsigned int R>
284
inline void rotate_blk(lsh_u32 cv[8])
285
19.9M
{
286
19.9M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
287
19.9M
    _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
288
19.9M
    _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
289
19.9M
}
lsh256_avx.cpp:void (anonymous namespace)::rotate_blk<29u>(unsigned int*)
Line
Count
Source
285
4.98M
{
286
4.98M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
287
4.98M
    _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
288
4.98M
    _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
289
4.98M
}
lsh256_avx.cpp:void (anonymous namespace)::rotate_blk<1u>(unsigned int*)
Line
Count
Source
285
4.98M
{
286
4.98M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
287
4.98M
    _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
288
4.98M
    _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
289
4.98M
}
lsh256_avx.cpp:void (anonymous namespace)::rotate_blk<5u>(unsigned int*)
Line
Count
Source
285
4.98M
{
286
4.98M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
287
4.98M
    _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
288
4.98M
    _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
289
4.98M
}
lsh256_avx.cpp:void (anonymous namespace)::rotate_blk<17u>(unsigned int*)
Line
Count
Source
285
4.98M
{
286
4.98M
  _mm256_storeu_si256(M256_CAST(cv), _mm256_or_si256(
287
4.98M
    _mm256_slli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), R),
288
4.98M
    _mm256_srli_epi32(_mm256_loadu_si256(CONST_M256_CAST(cv)), 32-R)));
289
4.98M
}
290
291
inline void xor_with_const(lsh_u32 cv_l[8], const lsh_u32 const_v[8])
292
9.96M
{
293
9.96M
  _mm256_storeu_si256(M256_CAST(cv_l), _mm256_xor_si256(
294
9.96M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)),
295
9.96M
    _mm256_loadu_si256(CONST_M256_CAST(const_v))));
296
9.96M
}
297
298
inline void rotate_msg_gamma(lsh_u32 cv_r[8])
299
9.96M
{
300
  // g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
301
9.96M
  _mm256_storeu_si256(M256_CAST(cv_r+0),
302
9.96M
    _mm256_shuffle_epi8(_mm256_loadu_si256(CONST_M256_CAST(cv_r+0)),
303
9.96M
      _mm256_set_epi8(
304
9.96M
        /* hi lane */ 15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1,
305
9.96M
        /* lo lane */ 12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0)));
306
9.96M
}
307
308
inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
309
9.96M
{
310
9.96M
  __m256i temp = _mm256_shuffle_epi32(
311
9.96M
    _mm256_loadu_si256(CONST_M256_CAST(cv_l)), _MM_SHUFFLE(3,1,0,2));
312
9.96M
  _mm256_storeu_si256(M256_CAST(cv_r),
313
9.96M
    _mm256_shuffle_epi32(
314
9.96M
      _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(1,2,3,0)));
315
9.96M
  _mm256_storeu_si256(M256_CAST(cv_l),
316
9.96M
    _mm256_permute2x128_si256(temp,
317
9.96M
      _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,3,0,1)));
318
9.96M
  _mm256_storeu_si256(M256_CAST(cv_r),
319
9.96M
    _mm256_permute2x128_si256(temp,
320
9.96M
      _mm256_loadu_si256(CONST_M256_CAST(cv_r)), _MM_SHUFFLE(0,2,0,0)));
321
9.96M
}
322
323
/* -------------------------------------------------------- *
324
* step function
325
* -------------------------------------------------------- */
326
327
template <unsigned int Alpha, unsigned int Beta>
328
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
329
9.96M
{
330
9.96M
  add_blk(cv_l, cv_r);
331
9.96M
  rotate_blk<Alpha>(cv_l);
332
9.96M
  xor_with_const(cv_l, const_v);
333
9.96M
  add_blk(cv_r, cv_l);
334
9.96M
  rotate_blk<Beta>(cv_r);
335
9.96M
  add_blk(cv_l, cv_r);
336
9.96M
  rotate_msg_gamma(cv_r);
337
9.96M
}
lsh256_avx.cpp:void (anonymous namespace)::mix<29u, 1u>(unsigned int*, unsigned int*, unsigned int const*)
Line
Count
Source
329
4.98M
{
330
4.98M
  add_blk(cv_l, cv_r);
331
4.98M
  rotate_blk<Alpha>(cv_l);
332
4.98M
  xor_with_const(cv_l, const_v);
333
4.98M
  add_blk(cv_r, cv_l);
334
4.98M
  rotate_blk<Beta>(cv_r);
335
4.98M
  add_blk(cv_l, cv_r);
336
4.98M
  rotate_msg_gamma(cv_r);
337
4.98M
}
lsh256_avx.cpp:void (anonymous namespace)::mix<5u, 17u>(unsigned int*, unsigned int*, unsigned int const*)
Line
Count
Source
329
4.98M
{
330
4.98M
  add_blk(cv_l, cv_r);
331
4.98M
  rotate_blk<Alpha>(cv_l);
332
4.98M
  xor_with_const(cv_l, const_v);
333
4.98M
  add_blk(cv_r, cv_l);
334
4.98M
  rotate_blk<Beta>(cv_r);
335
4.98M
  add_blk(cv_l, cv_r);
336
4.98M
  rotate_msg_gamma(cv_r);
337
4.98M
}
338
339
/* -------------------------------------------------------- *
340
* compression function
341
* -------------------------------------------------------- */
342
343
inline void compress(LSH256_AVX2_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN])
344
383k
{
345
383k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
346
347
383k
  LSH256_AVX2_Internal  s_state(ctx->cv_l);
348
383k
  LSH256_AVX2_Internal* i_state = &s_state;
349
350
383k
  const lsh_u32* const_v = NULL;
351
383k
  lsh_u32* cv_l = ctx->cv_l;
352
383k
  lsh_u32* cv_r = ctx->cv_r;
353
354
383k
  load_msg_blk(i_state, pdMsgBlk);
355
356
383k
  msg_add_even(cv_l, cv_r, i_state);
357
383k
  load_sc(&const_v, 0);
358
383k
  mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
359
383k
  word_perm(cv_l, cv_r);
360
361
383k
  msg_add_odd(cv_l, cv_r, i_state);
362
383k
  load_sc(&const_v, 8);
363
383k
  mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
364
383k
  word_perm(cv_l, cv_r);
365
366
4.98M
  for (size_t i = 1; i < NUM_STEPS / 2; i++)
367
4.59M
  {
368
4.59M
    msg_exp_even(i_state);
369
4.59M
    msg_add_even(cv_l, cv_r, i_state);
370
4.59M
    load_sc(&const_v, 16 * i);
371
4.59M
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
372
4.59M
    word_perm(cv_l, cv_r);
373
374
4.59M
    msg_exp_odd(i_state);
375
4.59M
    msg_add_odd(cv_l, cv_r, i_state);
376
4.59M
    load_sc(&const_v, 16 * i + 8);
377
4.59M
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
378
4.59M
    word_perm(cv_l, cv_r);
379
4.59M
  }
380
381
383k
  msg_exp_even(i_state);
382
383k
  msg_add_even(cv_l, cv_r, i_state);
383
383k
}
384
385
/* -------------------------------------------------------- */
386
387
inline void load_iv(word32 cv_l[8], word32 cv_r[8], const word32 iv[16])
388
14.1k
{
389
  // The IV's are 32-byte aligned so we can use aligned loads.
390
14.1k
  _mm256_storeu_si256(M256_CAST(cv_l+0),
391
14.1k
    _mm256_load_si256(CONST_M256_CAST(iv+0)));
392
14.1k
  _mm256_storeu_si256(M256_CAST(cv_r+0),
393
14.1k
    _mm256_load_si256(CONST_M256_CAST(iv+8)));
394
14.1k
}
395
396
inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
397
0
{
398
0
  _mm256_storeu_si256(M256_CAST(cv_l+0), _mm256_setzero_si256());
399
0
  _mm256_storeu_si256(M256_CAST(cv_r+0), _mm256_setzero_si256());
400
0
}
401
402
inline void zero_submsgs(LSH256_AVX2_Context* ctx)
403
14.1k
{
404
14.1k
  lsh_u32* sub_msgs = ctx->sub_msgs;
405
406
14.1k
  _mm256_storeu_si256(M256_CAST(sub_msgs+ 0), _mm256_setzero_si256());
407
14.1k
  _mm256_storeu_si256(M256_CAST(sub_msgs+ 8), _mm256_setzero_si256());
408
14.1k
  _mm256_storeu_si256(M256_CAST(sub_msgs+16), _mm256_setzero_si256());
409
14.1k
  _mm256_storeu_si256(M256_CAST(sub_msgs+24), _mm256_setzero_si256());
410
14.1k
}
411
412
inline void init224(LSH256_AVX2_Context* ctx)
413
7.28k
{
414
7.28k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
415
416
7.28k
  zero_submsgs(ctx);
417
7.28k
  load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV224);
418
7.28k
}
419
420
inline void init256(LSH256_AVX2_Context* ctx)
421
6.90k
{
422
6.90k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
423
424
6.90k
  zero_submsgs(ctx);
425
6.90k
  load_iv(ctx->cv_l, ctx->cv_r, LSH256_IV256);
426
6.90k
}
427
428
/* -------------------------------------------------------- */
429
430
inline void fin(LSH256_AVX2_Context* ctx)
431
13.8k
{
432
13.8k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
433
434
13.8k
  _mm256_storeu_si256(M256_CAST(ctx->cv_l+0), _mm256_xor_si256(
435
13.8k
    _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_l+0)),
436
13.8k
    _mm256_loadu_si256(CONST_M256_CAST(ctx->cv_r+0))));
437
13.8k
}
438
439
/* -------------------------------------------------------- */
440
441
inline void get_hash(LSH256_AVX2_Context* ctx, lsh_u8* pbHashVal)
442
13.8k
{
443
13.8k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
444
13.8k
  CRYPTOPP_ASSERT(ctx->alg_type != 0);
445
13.8k
  CRYPTOPP_ASSERT(pbHashVal != NULLPTR);
446
447
13.8k
  lsh_uint alg_type = ctx->alg_type;
448
13.8k
  lsh_uint hash_val_byte_len = LSH_GET_HASHBYTE(alg_type);
449
13.8k
  lsh_uint hash_val_bit_len = LSH_GET_SMALL_HASHBIT(alg_type);
450
451
  // Multiplying by looks odd...
452
13.8k
  std::memcpy(pbHashVal, ctx->cv_l, hash_val_byte_len);
453
13.8k
  if (hash_val_bit_len){
454
0
    pbHashVal[hash_val_byte_len-1] &= (((lsh_u8)0xff) << hash_val_bit_len);
455
0
  }
456
13.8k
}
457
458
/* -------------------------------------------------------- */
459
460
lsh_err lsh256_init_avx2(LSH256_AVX2_Context* ctx)
461
14.1k
{
462
14.1k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
463
14.1k
  CRYPTOPP_ASSERT(ctx->alg_type != 0);
464
465
14.1k
  lsh_u32 alg_type = ctx->alg_type;
466
14.1k
  const lsh_u32* const_v = NULL;
467
14.1k
  ctx->remain_databitlen = 0;
468
469
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
470
14.1k
  AVX_Cleanup cleanup;
471
472
14.1k
  switch (alg_type)
473
14.1k
  {
474
6.90k
  case LSH_TYPE_256_256:
475
6.90k
    init256(ctx);
476
6.90k
    return LSH_SUCCESS;
477
7.28k
  case LSH_TYPE_256_224:
478
7.28k
    init224(ctx);
479
7.28k
    return LSH_SUCCESS;
480
0
  default:
481
0
    break;
482
14.1k
  }
483
484
0
  lsh_u32* cv_l = ctx->cv_l;
485
0
  lsh_u32* cv_r = ctx->cv_r;
486
487
0
  zero_iv(cv_l, cv_r);
488
0
  cv_l[0] = LSH256_HASH_VAL_MAX_BYTE_LEN;
489
0
  cv_l[1] = LSH_GET_HASHBIT(alg_type);
490
491
0
  for (size_t i = 0; i < NUM_STEPS / 2; i++)
492
0
  {
493
    //Mix
494
0
    load_sc(&const_v, i * 16);
495
0
    mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(cv_l, cv_r, const_v);
496
0
    word_perm(cv_l, cv_r);
497
498
0
    load_sc(&const_v, i * 16 + 8);
499
0
    mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(cv_l, cv_r, const_v);
500
0
    word_perm(cv_l, cv_r);
501
0
  }
502
503
0
  return LSH_SUCCESS;
504
14.1k
}
505
506
lsh_err lsh256_update_avx2(LSH256_AVX2_Context* ctx, const lsh_u8* data, size_t databitlen)
507
68.7k
{
508
68.7k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
509
68.7k
  CRYPTOPP_ASSERT(data != NULLPTR);
510
68.7k
  CRYPTOPP_ASSERT(databitlen % 8 == 0);
511
68.7k
  CRYPTOPP_ASSERT(ctx->alg_type != 0);
512
513
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
514
68.7k
  AVX_Cleanup cleanup;
515
516
68.7k
  if (databitlen == 0){
517
34.8k
    return LSH_SUCCESS;
518
34.8k
  }
519
520
  // We are byte oriented. tail bits will always be 0.
521
33.9k
  size_t databytelen = databitlen >> 3;
522
  // lsh_uint pos2 = databitlen & 0x7;
523
33.9k
  const size_t pos2 = 0;
524
525
33.9k
  size_t remain_msg_byte = ctx->remain_databitlen >> 3;
526
  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
527
33.9k
  const size_t remain_msg_bit = 0;
528
529
33.9k
  if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
530
0
    return LSH_ERR_INVALID_STATE;
531
0
  }
532
33.9k
  if (remain_msg_bit > 0){
533
0
    return LSH_ERR_INVALID_DATABITLEN;
534
0
  }
535
536
33.9k
  if (databytelen + remain_msg_byte < LSH256_MSG_BLK_BYTE_LEN)
537
18.4k
  {
538
18.4k
    std::memcpy(ctx->last_block + remain_msg_byte, data, databytelen);
539
18.4k
    ctx->remain_databitlen += (lsh_uint)databitlen;
540
18.4k
    remain_msg_byte += (lsh_uint)databytelen;
541
18.4k
    if (pos2){
542
0
      ctx->last_block[remain_msg_byte] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
543
0
    }
544
18.4k
    return LSH_SUCCESS;
545
18.4k
  }
546
547
15.4k
  if (remain_msg_byte > 0){
548
2.59k
    size_t more_byte = LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte;
549
2.59k
    std::memcpy(ctx->last_block + remain_msg_byte, data, more_byte);
550
2.59k
    compress(ctx, ctx->last_block);
551
2.59k
    data += more_byte;
552
2.59k
    databytelen -= more_byte;
553
2.59k
    remain_msg_byte = 0;
554
2.59k
    ctx->remain_databitlen = 0;
555
2.59k
  }
556
557
382k
  while (databytelen >= LSH256_MSG_BLK_BYTE_LEN)
558
366k
  {
559
    // This call to compress caused some trouble.
560
    // The data pointer can become unaligned in the
561
    // previous block.
562
366k
    compress(ctx, data);
563
366k
    data += LSH256_MSG_BLK_BYTE_LEN;
564
366k
    databytelen -= LSH256_MSG_BLK_BYTE_LEN;
565
366k
  }
566
567
15.4k
  if (databytelen > 0){
568
3.64k
    std::memcpy(ctx->last_block, data, databytelen);
569
3.64k
    ctx->remain_databitlen = (lsh_uint)(databytelen << 3);
570
3.64k
  }
571
572
15.4k
  if (pos2){
573
0
    ctx->last_block[databytelen] = data[databytelen] & ((0xff >> pos2) ^ 0xff);
574
0
    ctx->remain_databitlen += pos2;
575
0
  }
576
577
15.4k
  return LSH_SUCCESS;
578
33.9k
}
579
580
lsh_err lsh256_final_avx2(LSH256_AVX2_Context* ctx, lsh_u8* hashval)
581
13.8k
{
582
13.8k
  CRYPTOPP_ASSERT(ctx != NULLPTR);
583
13.8k
  CRYPTOPP_ASSERT(hashval != NULLPTR);
584
585
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735.
586
13.8k
  AVX_Cleanup cleanup;
587
588
  // We are byte oriented. tail bits will always be 0.
589
13.8k
  size_t remain_msg_byte = ctx->remain_databitlen >> 3;
590
  // lsh_uint remain_msg_bit = ctx->remain_databitlen & 7;
591
13.8k
  const size_t remain_msg_bit = 0;
592
593
13.8k
  if (remain_msg_byte >= LSH256_MSG_BLK_BYTE_LEN){
594
0
    return LSH_ERR_INVALID_STATE;
595
0
  }
596
597
13.8k
  if (remain_msg_bit){
598
0
    ctx->last_block[remain_msg_byte] |= (0x1 << (7 - remain_msg_bit));
599
0
  }
600
13.8k
  else{
601
13.8k
    ctx->last_block[remain_msg_byte] = 0x80;
602
13.8k
  }
603
13.8k
  std::memset(ctx->last_block + remain_msg_byte + 1, 0, LSH256_MSG_BLK_BYTE_LEN - remain_msg_byte - 1);
604
605
13.8k
  compress(ctx, ctx->last_block);
606
607
13.8k
  fin(ctx);
608
13.8k
  get_hash(ctx, hashval);
609
610
13.8k
  return LSH_SUCCESS;
611
13.8k
}
612
613
ANONYMOUS_NAMESPACE_END
614
615
NAMESPACE_BEGIN(CryptoPP)
616
617
extern
618
void LSH256_Base_Restart_AVX2(word32* state)
619
14.1k
{
620
14.1k
  state[RemainingBits] = 0;
621
14.1k
  LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
622
14.1k
  lsh_err err = lsh256_init_avx2(&ctx);
623
624
14.1k
  if (err != LSH_SUCCESS)
625
0
    throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_init_avx2 failed");
626
14.1k
}
627
628
extern
629
void LSH256_Base_Update_AVX2(word32* state, const byte *input, size_t size)
630
68.7k
{
631
68.7k
  LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
632
68.7k
  lsh_err err = lsh256_update_avx2(&ctx, input, 8*size);
633
634
68.7k
  if (err != LSH_SUCCESS)
635
0
    throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_update_avx2 failed");
636
68.7k
}
637
638
extern
639
void LSH256_Base_TruncatedFinal_AVX2(word32* state, byte *hash, size_t)
640
13.8k
{
641
13.8k
  LSH256_AVX2_Context ctx(state, state[AlgorithmType], state[RemainingBits]);
642
13.8k
  lsh_err err = lsh256_final_avx2(&ctx, hash);
643
644
13.8k
  if (err != LSH_SUCCESS)
645
0
    throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_final_avx2 failed");
646
13.8k
}
647
648
NAMESPACE_END
649
650
#endif  // CRYPTOPP_AVX2_AVAILABLE