/src/cryptopp/cham_simd.cpp

// cham_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "cham.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both cham.cpp and cham_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
#include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# if defined(CRYPTOPP_GCC_COMPATIBLE)
#  include <x86intrin.h>
# endif
# include <ammintrin.h>
#endif // XOP

// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#define DOUBLE_CAST(x) ((double*)(void*)(x))
#define CONST_DOUBLE_CAST(x) ((const double*)(const void*)(x))

// Squash MS LNK4221 and libtool warnings
extern const char CHAM_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word16;
using CryptoPP::word32;

#if (CRYPTOPP_SSSE3_AVAILABLE)

//////////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
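
// The rotate-by-8 specializations above replace the shift/shift/OR sequence
// with a single byte permute. Rotating a 32-bit word left by 8 bits moves
// every byte up one position and wraps the top byte around, and
// _mm_shuffle_epi8 performs that byte movement for all four 32-bit lanes at
// once: with bytes numbered [b3 b2 b1 b0] within a word, the
// (14,13,12,15, ...) mask produces [b2 b1 b0 b3], i.e. ROL8, while the
// (12,15,14,13, ...) mask is the matching ROR8.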

template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // The shuffle converts to and from little-endian for SSE. A specialized
    // CHAM implementation can avoid the shuffle by framing the data for
    // encryption, decryption and benchmarks. The library cannot take the
    // speed-up because of the byte oriented API.
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}
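
// Taken together the four UnpackXMM specializations above transpose a 4x4
// matrix of 32-bit words: UnpackXMM<0>(a,b,c,d) gathers word 0 of each of
// the four blocks into one register, UnpackXMM<1> gathers word 1, and so on.
// The trailing _mm_shuffle_epi8 with the (12,13,14,15, ...) mask reverses
// the bytes of every 32-bit lane, which is the little-endian conversion
// mentioned in the comments above.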

template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 12,13,14,15, 12,13,14,15, 12,13,14,15));
}
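
// The single-argument overloads above handle the one-block case: each one
// broadcasts the byte-reversed word IDX of the block to all four lanes.
// That lets the single-block routines below reuse the same four-lane round
// arithmetic as the four-block routines; three of the lanes simply carry
// redundant copies of the data.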

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}
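
// Repacking reuses UnpackXMM because the transformation is an involution:
// transposing the 4x4 word matrix twice restores the original layout, and
// byte-reversing each 32-bit lane twice restores the original byte order.
// Running the same shuffles on the way out therefore returns the working
// registers to the caller's block format.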

inline void CHAM128_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
}
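
// Each pass through the loop above performs four CHAM rounds. In scalar
// terms, an even-numbered round i computes
//     a = ROL8((a ^ i) + (ROL1(b) ^ rk[i & MASK]))
// and an odd-numbered round computes
//     b = ROL1((b ^ i) + (ROL8(c) ^ rk[i & MASK]))
// with the four state words rotating through the updated position. The
// vector code keeps the round number broadcast in `counter` and bumps it
// with `increment` after every round. Each _mm_load_sd fetches two adjacent
// 32-bit subkeys with one 64-bit load and the two byte shuffles broadcast
// the halves across the register. The AND with MASK wraps the subkey index
// because the key schedule holds 8 round keys when rounds == 80 and 16
// otherwise.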

inline void CHAM128_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0);
    __m128i b = UnpackXMM<1>(block0);
    __m128i c = UnpackXMM<2>(block0);
    __m128i d = UnpackXMM<3>(block0);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
}
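
// Decryption unwinds the same rounds in reverse order: the round counter
// starts at rounds-1 and counts down, the rotate applied last during
// encryption is undone first (RotateRight32 instead of RotateLeft32), the
// modular addition becomes _mm_sub_epi32, and the XOR with the round number
// is applied last. The subkeys are consumed from the high index downward,
// which is why k1 now comes from the upper half of the 64-bit load.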

inline void CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(0,0,0,0);
    __m128i increment = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i=0; i<static_cast<int>(rounds); i+=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+0) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(a, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(b, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i+2) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));

        t1 = _mm_xor_si128(c, counter);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);

        t1 = _mm_xor_si128(d, counter);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));

        counter = _mm_add_epi32(counter, increment);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}
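
// The four-block routine runs exactly the same round code as
// CHAM128_Enc_Block. The only difference is in the data movement:
// UnpackXMM/RepackXMM perform a real 4x4 transpose here, so each SSE lane
// carries an independent block and one pass through the loop encrypts four
// blocks in parallel.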

inline void CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
{
    // Rearrange the data for vectorization. UnpackXMM includes a
    // little-endian swap for SSE. Thanks to Peter Cordes for help
    // with packing and unpacking.
    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);

    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
    __m128i decrement = _mm_set_epi32(1,1,1,1);

    const unsigned int MASK = (rounds == 80 ? 7 : 15);
    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
    {
        __m128i k, k1, k2, t1, t2;
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-1) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(d);
        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(c);
        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
        k = _mm_castpd_si128(_mm_load_sd(CONST_DOUBLE_CAST(&subkeys[(i-3) & MASK])));

        // Shuffle out two subkeys
        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

        // Odd round
        t1 = RotateRight32<1>(b);
        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);

        // Even round
        t1 = RotateRight32<8>(a);
        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);

        counter = _mm_sub_epi32(counter, decrement);
    }

    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
    block0 = RepackXMM<0>(a,b,c,d);
    block1 = RepackXMM<1>(a,b,c,d);
    block2 = RepackXMM<2>(a,b,c,d);
    block3 = RepackXMM<3>(a,b,c,d);
}

//////////////////////////////////////////////////////////////////////////

NAMESPACE_END // W32

#endif // CRYPTOPP_SSSE3_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
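
// AdvancedProcessBlocks128_4x1_SSE (from adv_simd.h) pairs the one-block and
// four-block routines: as the name suggests, it is expected to process four
// 16-byte blocks at a time while enough input remains and to fall back to
// the single-block routine for the tail, handling the xorBlocks and flags
// details of the byte-oriented AdvancedProcessBlocks API.
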
#endif // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END