/src/cryptopp/sm4_simd.cpp
// sm4_simd.cpp - written and placed in the public domain by
//                Markku-Juhani O. Saarinen and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to AES-NI,
// ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate source
// file is needed because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
//
// The AES-NI implementation is based on Markku-Juhani O. Saarinen's work
// at https://github.com/mjosaarinen/sm4ni.
//
// ARMv8 support is upcoming.

#include "pch.h"
#include "config.h"

#include "sm4.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE.
// Do so in both sm4.cpp and sm4_simd.cpp.
// #undef CRYPTOPP_AESNI_AVAILABLE

#if (CRYPTOPP_AESNI_AVAILABLE)
# include "adv_simd.h"
# include <emmintrin.h>
# include <tmmintrin.h>
# include <wmmintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SM4_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;

#if (CRYPTOPP_AESNI_AVAILABLE)

template <unsigned int R>
inline __m128i ShiftLeft(const __m128i& val)
{
    return _mm_slli_epi32(val, R);
}

template <unsigned int R>
inline __m128i ShiftRight(const __m128i& val)
{
    return _mm_srli_epi32(val, R);
}

template <unsigned int R>
inline __m128i ShiftLeft64(const __m128i& val)
{
    return _mm_slli_epi64(val, R);
}

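// ShiftRight64<4> is used by the S-box code below to move the high nibble
// of each byte into the low nibble position. A 64-bit lane shift is fine
// here because the result is immediately masked with the nibble mask c0f,
// which discards any bits shifted in across byte boundaries.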
template <unsigned int R>
inline __m128i ShiftRight64(const __m128i& val)
{
    return _mm_srli_epi64(val, R);
}

template <unsigned int R>
inline __m128i RotateLeft(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
}

template <unsigned int R>
inline __m128i RotateRight(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
}

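// Rotations by a multiple of 8 bits are pure byte permutations, so the
// specializations below perform them with a single _mm_shuffle_epi8 (PSHUFB)
// rather than the two shifts and an OR of the generic version.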
76 | | |
77 | | template <> |
78 | | inline __m128i RotateLeft<8>(const __m128i& val) |
79 | 1.50k | { |
80 | 1.50k | const __m128i r08 = _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003); |
81 | 1.50k | return _mm_shuffle_epi8(val, r08); |
82 | 1.50k | } |
83 | | |
84 | | template <> |
85 | | inline __m128i RotateLeft<16>(const __m128i& val) |
86 | 1.50k | { |
87 | 1.50k | const __m128i mask = _mm_set_epi32(0x0D0C0F0E, 0x09080B0A, 0x05040706, 0x01000302); |
88 | 1.50k | return _mm_shuffle_epi8(val, mask); |
89 | 1.50k | } |
90 | | |
91 | | template <> |
92 | | inline __m128i RotateLeft<24>(const __m128i& val) |
93 | 1.50k | { |
94 | 1.50k | const __m128i mask = _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201); |
95 | 1.50k | return _mm_shuffle_epi8(val, mask); |
96 | 1.50k | } |
97 | | |
98 | | /// \brief Unpack XMM words |
99 | | /// \tparam IDX the element from each XMM word |
100 | | /// \param a the first XMM word |
101 | | /// \param b the second XMM word |
102 | | /// \param c the third XMM word |
103 | | /// \param d the fourth XMM word |
104 | | /// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation |
105 | | /// equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>. |
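/// \details Invoking UnpackXMM with IDX = 0, 1, 2 and 3 over the same four words
///  performs a 4x4 transpose of 32-bit elements. SM4_Encrypt uses this below to
///  convert four blocks from a block-per-register layout to a slice-per-register
///  layout, so each round operates on all four blocks at once.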
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_unpacklo_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_unpackhi_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_unpacklo_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_unpackhi_epi64(r1, r2);
}

/// \brief Unpack an XMM word
/// \tparam IDX the element from each XMM word
/// \param v the XMM word
/// \details UnpackXMM selects the IDX element from v and returns a concatenation
/// equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}

inline void SM4_Encrypt(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys)
{
    // nibble mask
    const __m128i c0f = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

    // flip all bytes in all 32-bit words
    const __m128i flp = _mm_set_epi32(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);

    // inverse shift rows
    const __m128i shr = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00);

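    // The S-box evaluation below follows Saarinen's sm4ni: the SM4 and AES
    // S-boxes are both affine-equivalent to inversion in GF(2^8), so the SM4
    // S-box can be computed as an affine transform, the AES S-box (via
    // _mm_aesenclast_si128), then a second affine transform. The m1l/m1h and
    // m2l/m2h constants implement those transforms as nibble-indexed
    // _mm_shuffle_epi8 table lookups on the low and high nibbles.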
    // Affine transform 1 (low and high nibbles)
    const __m128i m1l = _mm_set_epi32(0xC7C1B4B2, 0x22245157, 0x9197E2E4, 0x74720701);
    const __m128i m1h = _mm_set_epi32(0xF052B91B, 0xF95BB012, 0xE240AB09, 0xEB49A200);

    // Affine transform 2 (low and high nibbles)
    const __m128i m2l = _mm_set_epi32(0xEDD14478, 0x172BBE82, 0x5B67F2CE, 0xA19D0834);
    const __m128i m2h = _mm_set_epi32(0x11CDBE62, 0xCC1063BF, 0xAE7201DD, 0x73AFDC00);

    __m128i t0 = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i t1 = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i t2 = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i t3 = UnpackXMM<3>(block0, block1, block2, block3);

    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    const unsigned int ROUNDS = 32;
    for (unsigned int i = 0; i < ROUNDS; i++)
    {
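        // Broadcast the 32-bit round key subkeys[i] to all four lanes:
        // _mm_load_ss reads one word and _mm_shuffle_epi32 splats lane 0.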
        const __m128i k = _mm_shuffle_epi32(_mm_castps_si128(
            _mm_load_ss((const float*)(subkeys+i))), _MM_SHUFFLE(0,0,0,0));

        __m128i x, y;
        x = _mm_xor_si128(t1, _mm_xor_si128(t2, _mm_xor_si128(t3, k)));

        y = _mm_and_si128(x, c0f);          // inner affine
        y = _mm_shuffle_epi8(m1l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m1h, x), y);

        x = _mm_shuffle_epi8(x, shr);       // inverse ShiftRows; cancels the ShiftRows in aesenclast
        x = _mm_aesenclast_si128(x, c0f);   // AES-NI: ShiftRows, SubBytes, XOR with c0f

        y = _mm_andnot_si128(x, c0f);       // outer affine
        y = _mm_shuffle_epi8(m2l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m2h, x), y);

        // 4 parallel L1 linear transforms
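        // SM4 defines L(x) = x ^ (x<<<2) ^ (x<<<10) ^ (x<<<18) ^ (x<<<24). It is
        // computed here as x ^ rol2(x ^ rol8(x) ^ rol16(x)) ^ rol24(x), where
        // rol2 is built from ShiftLeft<2> and ShiftRight<30>; the XOR acts as
        // an OR because the two shifted bit ranges do not overlap.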
        y = _mm_xor_si128(x, RotateLeft<8>(x));
        y = _mm_xor_si128(y, RotateLeft<16>(x));
        y = _mm_xor_si128(ShiftLeft<2>(y), ShiftRight<30>(y));
        x = _mm_xor_si128(x, _mm_xor_si128(y, RotateLeft<24>(x)));

        // rotate registers
        x = _mm_xor_si128(x, t0);
        t0 = t1; t1 = t2;
        t2 = t3; t3 = x;
    }

    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    block0 = RepackXMM<0>(t3, t2, t1, t0);
    block1 = RepackXMM<1>(t3, t2, t1, t0);
    block2 = RepackXMM<2>(t3, t2, t1, t0);
    block3 = RepackXMM<3>(t3, t2, t1, t0);
}

inline void SM4_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    SM4_Encrypt(block0, block1, block2, block3, subkeys);
}

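// SM4 decryption is the encryption transform run with the round keys in
// reverse order. The caller is expected to pass an already-reversed subkey
// array, which is why the decryption paths below reuse SM4_Encrypt as-is.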
inline void SM4_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    SM4_Encrypt(block0, block1, block2, block3, subkeys);
}

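// The single-block paths pad the register set with three zero-filled blocks
// and reuse the 4-block transform; the dummy results are simply discarded.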
inline void SM4_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int /*rounds*/)
{
    __m128i t1 = _mm_setzero_si128();
    __m128i t2 = _mm_setzero_si128();
    __m128i t3 = _mm_setzero_si128();

    SM4_Encrypt(block0, t1, t2, t3, subkeys);
}

inline void SM4_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int /*rounds*/)
{
    __m128i t1 = _mm_setzero_si128();
    __m128i t2 = _mm_setzero_si128();
    __m128i t3 = _mm_setzero_si128();

    SM4_Encrypt(block0, t1, t2, t3, subkeys);
}

#endif // CRYPTOPP_AESNI_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_AESNI_AVAILABLE)
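// AdvancedProcessBlocks128_4x1_SSE, from adv_simd.h, supplies the block
// chaining and tail handling: it runs blocks four at a time through
// SM4_Enc_4_Blocks and any remaining blocks one at a time through
// SM4_Enc_Block.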
size_t SM4_Enc_AdvancedProcessBlocks_AESNI(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(SM4_Enc_Block, SM4_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_AESNI_AVAILABLE

NAMESPACE_END