Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/cham_simd.cpp
Every executable line in this file reports an execution count of 0; none of the SSSE3 code paths in the listing below were exercised by the run that produced this report.
// cham_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
//    This source file uses intrinsics and built-ins to gain access to
//    SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
//    source file is needed because additional CXXFLAGS are required to enable
//    the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "cham.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both cham.cpp and cham_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
#include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# if defined(CRYPTOPP_GCC_COMPATIBLE)
#  include <x86intrin.h>
# endif
# include <ammintrin.h>
#endif  // XOP

// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#define DOUBLE_CAST(x) ((double*)(void*)(x))
#define CONST_DOUBLE_CAST(x) ((const double*)(const void*)(x))
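
Editorial note, not part of cham_simd.cpp: DOUBLE_CAST and CONST_DOUBLE_CAST exist only to quiet Clang when _mm_load_sd is pointed at integer data (the LLVM bug referenced above). The pattern that uses the macro appears further down, where two adjacent 32-bit subkeys are pulled in with one 64-bit load and each half is broadcast to all four lanes. A minimal standalone sketch of that idiom, assuming word32 is a plain uint32_t:

    #include <cstdint>
    #include <cstdio>
    #include <emmintrin.h>   // SSE2: _mm_load_sd, _mm_castpd_si128
    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

    #define CONST_DOUBLE_CAST(x) ((const double*)(const void*)(x))

    int main()
    {
        const uint32_t subkeys[2] = {0x11223344, 0xAABBCCDD};

        // Load two adjacent 32-bit subkeys as one 64-bit lane.
        const __m128i k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[0])));

        // Broadcast the low subkey (bytes 0..3) and the high subkey (bytes 4..7)
        // into all four 32-bit lanes, the same way CHAM128_Enc_Block does below.
        const __m128i k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        const __m128i k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        uint32_t out1[4], out2[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out1), k1);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out2), k2);
        std::printf("%08x %08x\n", out1[0], out2[0]);  // expect 11223344 aabbccdd
        return 0;
    }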

// Squash MS LNK4221 and libtool warnings
extern const char CHAM_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word16;
using CryptoPP::word32;

#if (CRYPTOPP_SSSE3_AVAILABLE)

//////////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(W32)  // CHAM128, 32-bit word size

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
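
Editorial aside, not part of the file: the two specializations above work because rotating a 32-bit word by 8 moves whole bytes, so the rotate can be expressed as a byte permutation within each lane using a single pshufb. A standalone check of the RotateLeft32<8> mask against the plain shift-and-or form:

    #include <cstdint>
    #include <cstdio>
    #include <tmmintrin.h>   // SSSE3

    static inline uint32_t rotl32(uint32_t v, unsigned r)
    { return (v << r) | (v >> (32 - r)); }

    int main()
    {
        const __m128i x = _mm_set_epi32(0x01234567, 0x07654321, 0x00112233, 0x44556677);

        // Byte-permutation form of RotateLeft32<8> from the listing above.
        const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
        const __m128i y = _mm_shuffle_epi8(x, mask);

        uint32_t in[4], out[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(in), x);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), y);

        // Each lane should equal the scalar left-rotate by 8 of its input.
        for (int i = 0; i < 4; ++i)
            std::printf("%08x -> %08x (expected %08x)\n", in[i], out[i], rotl32(in[i], 8));
        return 0;
    }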

template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
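
Editorial aside, not part of the file: taken together, UnpackXMM<0..3> perform a 4x4 transpose of 32-bit words across four blocks, followed by a byte swap of every word (the little-endian conversion the comments mention), so register IDX ends up holding word IDX of each block. A compact standalone check of the IDX = 0 case:

    #include <cstdint>
    #include <cstdio>
    #include <tmmintrin.h>   // SSSE3

    int main()
    {
        // Four "blocks"; lane n of block a holds word A(n+1), and so on.
        // The test words are byte-symmetric so the final byte swap is invisible.
        const __m128i a = _mm_set_epi32(0x4A4A4A4A, 0x3A3A3A3A, 0x2A2A2A2A, 0x1A1A1A1A);
        const __m128i b = _mm_set_epi32(0x4B4B4B4B, 0x3B3B3B3B, 0x2B2B2B2B, 0x1B1B1B1B);
        const __m128i c = _mm_set_epi32(0x4C4C4C4C, 0x3C3C3C3C, 0x2C2C2C2C, 0x1C1C1C1C);
        const __m128i d = _mm_set_epi32(0x4D4D4D4D, 0x3D3D3D3D, 0x2D2D2D2D, 0x1D1D1D1D);

        // UnpackXMM<0> from the listing: gather word 1 of each block, then
        // reverse the bytes of every 32-bit word.
        const __m128i r1 = _mm_unpacklo_epi32(a, b);           // A1 B1 A2 B2
        const __m128i r2 = _mm_unpacklo_epi32(c, d);           // C1 D1 C2 D2
        const __m128i t  = _mm_unpacklo_epi64(r1, r2);         // A1 B1 C1 D1
        const __m128i r  = _mm_shuffle_epi8(t,
            _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));

        uint32_t out[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), r);
        // Expect 1a1a1a1a 1b1b1b1b 1c1c1c1c 1d1d1d1d, i.e. [A1 B1 C1 D1].
        std::printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
        return 0;
    }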

template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
}
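
Editorial aside, not part of the file: the single-argument overloads above splat word IDX of one block into all four lanes, again with its bytes reversed, which is what lets the one-block functions below reuse the same four-lane round arithmetic. A standalone sketch of the equivalence for IDX = 1:

    #include <cstdint>
    #include <cstdio>
    #include <tmmintrin.h>   // SSSE3

    static inline uint32_t byteswap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0xFF00u) | ((v << 8) & 0xFF0000u) | (v << 24);
    }

    int main()
    {
        const __m128i block = _mm_set_epi32(0x44444444, 0x33333333, 0x22221111, 0x00000000);

        // UnpackXMM<1> from the listing: splat word 1 (bytes 4..7) into every
        // lane, with the bytes reversed inside the word.
        const __m128i r = _mm_shuffle_epi8(block,
            _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));

        uint32_t out[4], words[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), r);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(words), block);

        // Same effect as _mm_set1_epi32(byteswap32(word1)); expect 11112222 == 11112222.
        std::printf("%08x == %08x\n", out[0], byteswap32(words[1]));
        return 0;
    }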

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}
Unexecuted instantiations: cham_simd.cpp: (anonymous namespace)::W32::RepackXMM<0u>, RepackXMM<1u>, RepackXMM<2u> and RepackXMM<3u> (__m128i const&, __m128i const&, __m128i const&, __m128i const&).

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}

inline void CHAM128_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
}
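
Editorial aside, not part of the file: stripped of the SIMD plumbing, the loop above is the CHAM-128 round update applied four rounds at a time. The sketch below writes the same update rules for a single block with ordinary 32-bit words; the vector code runs exactly these steps on four lanes in parallel and folds the byte-order conversion into UnpackXMM/RepackXMM. The 8-entry key table matches the rounds == 80 branch of MASK above; the key values are dummies, for shape only, and cham128_encrypt_words is a hypothetical helper name:

    #include <cstdint>
    #include <cstdio>

    static inline uint32_t rotl32(uint32_t v, unsigned r)
    { return (v << r) | (v >> (32 - r)); }

    // One-block equivalent of CHAM128_Enc_Block: x[0..3] hold the block's four
    // 32-bit words in native order (UnpackXMM's byte swap already applied).
    static void cham128_encrypt_words(uint32_t x[4], const uint32_t* subkeys, unsigned rounds)
    {
        const unsigned MASK = (rounds == 80 ? 7 : 15);   // 8 or 16 round keys

        for (unsigned i = 0; i < rounds; i += 4)
        {
            // Rounds i .. i+3, in the same order the unrolled SIMD loop updates a,b,c,d.
            x[0] = rotl32((x[0] ^ (i + 0)) + (rotl32(x[1], 1) ^ subkeys[(i + 0) & MASK]), 8);
            x[1] = rotl32((x[1] ^ (i + 1)) + (rotl32(x[2], 8) ^ subkeys[(i + 1) & MASK]), 1);
            x[2] = rotl32((x[2] ^ (i + 2)) + (rotl32(x[3], 1) ^ subkeys[(i + 2) & MASK]), 8);
            x[3] = rotl32((x[3] ^ (i + 3)) + (rotl32(x[0], 8) ^ subkeys[(i + 3) & MASK]), 1);
        }
    }

    int main()
    {
        // Dummy 8-entry key schedule, just to exercise the loop shape.
        uint32_t rk[8];
        for (unsigned i = 0; i < 8; ++i) rk[i] = 0x01010101u * (i + 1);

        uint32_t x[4] = {0x00112233, 0x44556677, 0x8899AABB, 0xCCDDEEFF};
        cham128_encrypt_words(x, rk, 80);

        std::printf("%08x %08x %08x %08x\n", x[0], x[1], x[2], x[3]);
        return 0;
    }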

inline void CHAM128_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
}

inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}

inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}

//////////////////////////////////////////////////////////////////////////

NAMESPACE_END  // W32

#endif  // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END
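
Editorial note, not part of the file: applications do not call CHAM128_Enc_AdvancedProcessBlocks_SSSE3 or its decryption counterpart directly; they sit behind CHAM128's AdvancedProcessBlocks and are selected at run time when the CPU reports SSSE3. A small usage sketch through the ordinary Crypto++ API, the kind of code that would exercise this file and lift its counts above zero (CTR mode is chosen because it hands multi-block batches to the cipher):

    #include "cryptlib.h"
    #include "cham.h"
    #include "modes.h"
    #include "filters.h"
    #include "hex.h"
    #include "osrng.h"

    #include <iostream>
    #include <string>

    int main()
    {
        using namespace CryptoPP;

        AutoSeededRandomPool prng;
        SecByteBlock key(CHAM128::DEFAULT_KEYLENGTH);
        SecByteBlock iv(CHAM128::BLOCKSIZE);
        prng.GenerateBlock(key, key.size());
        prng.GenerateBlock(iv, iv.size());

        const std::string plain = "CHAM128 test drive - several blocks of data to batch up.";
        std::string cipher, encoded;

        // CTR mode pushes many blocks per call through AdvancedProcessBlocks,
        // which can then dispatch 4-block batches to the SSSE3 routines above.
        CTR_Mode<CHAM128>::Encryption enc;
        enc.SetKeyWithIV(key, key.size(), iv);

        StringSource ss1(plain, true,
            new StreamTransformationFilter(enc,
                new StringSink(cipher)));

        StringSource ss2(cipher, true,
            new HexEncoder(new StringSink(encoded)));
        std::cout << encoded << std::endl;
        return 0;
    }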