/src/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * AES using vector permutes (SSSE3, NEON) |
3 | | * (C) 2010,2016,2019 Jack Lloyd |
4 | | * |
5 | | * Based on public domain x86-64 assembly written by Mike Hamburg, |
6 | | * described in "Accelerating AES with Vector Permute Instructions" |
7 | | * (CHES 2009). His original code is available at |
8 | | * https://crypto.stanford.edu/vpaes/ |
9 | | * |
10 | | * Botan is released under the Simplified BSD License (see license.txt) |
11 | | */ |
12 | | |
13 | | #include <botan/internal/aes.h> |
14 | | #include <botan/internal/ct_utils.h> |
15 | | #include <botan/internal/simd_32.h> |
16 | | |
17 | | #if defined(BOTAN_SIMD_USE_SSE2) |
18 | | #include <tmmintrin.h> |
19 | | #endif |
20 | | |
21 | | namespace Botan { |
22 | | |
23 | | namespace { |
24 | | |
/*
* Byte shuffle: returns a vector whose byte i is a[b[i] & 0x0F].
* Matches the semantics of x86 pshufb, including the rule that an
* index byte with its high bit set selects zero (needed by the
* vperm S-box lookups). NEON and AltiVec variants emulate this.
*/
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32 b)
   {
#if defined(BOTAN_SIMD_USE_SSE2)
   // SSSE3: pshufb already has the desired zero-on-high-bit behavior
   return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw()));
#elif defined(BOTAN_SIMD_USE_NEON)
   const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw());
   const uint8x16_t idx = vreinterpretq_u8_u32(b.raw());

#if defined(BOTAN_TARGET_ARCH_IS_ARM32)
   // ARMv7 NEON has only 8-byte table lookups, so split the 16-byte
   // table into two halves; out-of-range indices yield zero, matching pshufb
   const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) };

   return SIMD_4x32(vreinterpretq_u32_u8(
                       vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)),
                                   vtbl2_u8(tbl2, vget_high_u8(idx)))));

#else
   // AArch64: single-instruction 16-byte table lookup (zero for idx >= 16)
   return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx)));
#endif

#elif defined(BOTAN_SIMD_USE_ALTIVEC)

   // vec_perm uses idx mod 32 and never zeroes, so explicitly select
   // zero wherever the index byte is negative (high bit set)
   const auto zero = vec_splat_s8(0x00);
   const auto mask = vec_cmplt(reinterpret_cast<__vector signed char>(b.raw()), zero);
   const auto r = vec_perm(reinterpret_cast<__vector signed char>(a.raw()), reinterpret_cast<__vector signed char>(a.raw()), reinterpret_cast<__vector unsigned char>(b.raw()));
   return SIMD_4x32(reinterpret_cast<__vector unsigned int>(vec_sel(r, zero, mask)));

#else
#error "No shuffle implementation available"
#endif
   }
55 | | |
/*
* Returns bytes 8..23 of the 32-byte concatenation b:a, i.e. the high
* 8 bytes of b followed by the low 8 bytes of a (x86 palignr with an
* 8-byte shift). Used by the AES-192 key schedule to stitch together
* halves of consecutive subkeys.
*/
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b)
   {
#if defined(BOTAN_SIMD_USE_SSE2)
   return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
#elif defined(BOTAN_SIMD_USE_NEON)
   // vext on u32 lanes: shift by 2 lanes == 8 bytes
   return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   // Permute indices 8..23 select high half of b then low half of a
   const __vector unsigned char mask = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
   return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
#else
#error "No alignr8 implementation available"
#endif
   }
69 | | |
/*
* Lookup tables for the vector-permute AES technique, taken from
* Mike Hamburg's vpaes ("Accelerating AES with Vector Permute
* Instructions", CHES 2009). Each constant is used as a 16-entry
* byte table indexed via shuffle(); the derivations are in the paper.
*/

// Input transform: maps the AES state into the vperm field basis (low/high nibble tables)
const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);

// Tables for the GF(2^4)-based inversion step shared by all S-box evaluations
const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);

// S-box output transform tables (encryption middle rounds)
const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
// Output transform for the final encryption round
const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);

// Output transform for the final decryption round
const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);

// Byte rotations implementing the MixColumns step; indexed by round % 4
const SIMD_4x32 mc_forward[4] = {
   SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
   SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
   SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
   SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)
};

// Per-round ShiftRows permutations; ShiftRows is deferred and folded into
// the last round / key mangling, so the table to use depends on round % 4.
// vperm_sr[0] is the identity permutation.
const SIMD_4x32 vperm_sr[4] = {
   SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
   SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
   SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
   SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
};

// AES round constants; the values differ from the familiar 0x01,0x02,...
// because they are expressed in this implementation's transformed basis
// (presumably per the vpaes derivation — see the paper)
const SIMD_4x32 rcon[10] = {
   SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
};

// Second S-box output pair used in encryption middle rounds
const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);

// Decryption input transform tables
const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);

// Inverse S-box output tables combined with InvMixColumns multipliers
// (9, E, D, B respectively), used by the decryption rounds
const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);

const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);

const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);

const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);

// Byte rotations used by the decryption rounds, indexed by (r-1) % 4
const SIMD_4x32 mcx[4] = {
   SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
   SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
   SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
   SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
};

// Reverse byte rotations paired with mc_forward in the encryption rounds
const SIMD_4x32 mc_backward[4] = {
   SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
   SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
   SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
   SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
};

// Mask selecting the low nibble of every byte
const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
144 | | |
145 | | inline SIMD_4x32 low_nibs(SIMD_4x32 x) |
146 | 0 | { |
147 | 0 | return lo_nibs_mask & x; |
148 | 0 | } |
149 | | |
150 | | inline SIMD_4x32 high_nibs(SIMD_4x32 x) |
151 | 0 | { |
152 | 0 | return (x.shr<4>() & lo_nibs_mask); |
153 | 0 | } |
154 | | |
155 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) |
156 | 0 | { |
157 | 0 | return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K; |
158 | 0 | } |
159 | | |
/*
* One middle encryption round (SubBytes + ShiftRows-deferred
* MixColumns + AddRoundKey), computed entirely with constant-time
* table shuffles. r selects the mc_forward/mc_backward rotation
* (rounds are computed in a rotated basis; ShiftRows is deferred
* until the final round).
*/
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
   {
   // Split state into nibbles and run the shared GF(2^4) inversion network
   const SIMD_4x32 Bh = high_nibs(B);
   SIMD_4x32 Bl = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
   Bl ^= Bh;

   const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   // Apply the two S-box output tables plus the round key, then combine
   // rotated copies to effect MixColumns
   const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
   const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);

   return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
   }
175 | | |
/*
* Final encryption round: S-box (via the sbou/sbot output tables),
* AddRoundKey, then the deferred ShiftRows permutation chosen by
* the total round count (vperm_sr[r % 4]). No MixColumns.
*/
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
   {
   // Same nibble-split inversion network as the middle rounds
   const SIMD_4x32 Bh = high_nibs(B);
   SIMD_4x32 Bl = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
   Bl ^= Bh;

   const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
   }
188 | | |
189 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) |
190 | 0 | { |
191 | 0 | return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K; |
192 | 0 | } |
193 | | |
/*
* One middle decryption round. The inverse S-box output is split
* across four table pairs (sb9, sbd, sbb, sbe — the InvMixColumns
* multipliers), accumulated with repeated byte rotations (mcx) so
* that InvMixColumns is folded into the lookups.
*/
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
   {
   // Shared nibble-split GF(2^4) inversion network
   const SIMD_4x32 Bh = high_nibs(B);
   B = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, B);

   B ^= Bh;

   const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));

   // Rotation for this round; decryption rounds index by (r-1) % 4
   const SIMD_4x32 mc = mcx[(r-1)%4];

   // Accumulate the four output-table contributions, rotating between each
   const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
   const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
   const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
   return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
   }
212 | | |
/*
* Final decryption round: inverse S-box output (sboud/sbotd),
* AddRoundKey, then the deferred inverse ShiftRows permutation.
*/
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
   {
   // Select the ShiftRows table as a function of the round count
   // (bit trick: maps r = 10/12/14 to the appropriate vperm_sr index)
   const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;

   // Shared nibble-split inversion network
   const SIMD_4x32 Bh = high_nibs(B);
   B = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, B);

   B ^= Bh;

   const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));

   const SIMD_4x32 x = shuffle(sboud, t5) ^ shuffle(sbotd, t6) ^ K;
   return shuffle(x, vperm_sr[which_sr]);
   }
229 | | |
/*
* Encrypt `blocks` 16-byte blocks from in[] to out[] using the
* expanded round keys K[0..rounds]. Blocks are processed two at a
* time to expose instruction-level parallelism in the shuffles.
*/
void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
                     const SIMD_4x32 K[], size_t rounds)
   {
   // Mark input as secret for constant-time analysis tooling
   CT::poison(in, blocks * 16);

   // Largest even block count; pairs are handled in the first loop
   const size_t blocks2 = blocks - (blocks % 2);

   for(size_t i = 0; i != blocks2; i += 2)
      {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);

      B0 = aes_enc_first_round(B0, K[0]);
      B1 = aes_enc_first_round(B1, K[0]);

      for(size_t r = 1; r != rounds; ++r)
         {
         B0 = aes_enc_round(B0, K[r], r);
         B1 = aes_enc_round(B1, K[r], r);
         }

      B0 = aes_enc_last_round(B0, K[rounds], rounds);
      B1 = aes_enc_last_round(B1, K[rounds], rounds);

      B0.store_le(out + i*16);
      B1.store_le(out + (i+1)*16);
      }

   // Handle the final block when the count was odd
   for(size_t i = blocks2; i < blocks; ++i)
      {
      SIMD_4x32 B = SIMD_4x32::load_le(in + i*16);

      B = aes_enc_first_round(B, K[0]);

      for(size_t r = 1; r != rounds; ++r)
         {
         B = aes_enc_round(B, K[r], r);
         }

      B = aes_enc_last_round(B, K[rounds], rounds);
      B.store_le(out + i*16);
      }

   CT::unpoison(in, blocks * 16);
   CT::unpoison(out, blocks * 16);
   }
277 | | |
/*
* Decrypt `blocks` 16-byte blocks from in[] to out[] using the
* expanded decryption round keys K[0..rounds]. Mirrors
* vperm_encrypt_blocks: 2x unrolled main loop plus an odd tail.
*/
void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
                     const SIMD_4x32 K[], size_t rounds)
   {
   // Mark input as secret for constant-time analysis tooling
   CT::poison(in, blocks * 16);

   // Largest even block count; pairs are handled in the first loop
   const size_t blocks2 = blocks - (blocks % 2);

   for(size_t i = 0; i != blocks2; i += 2)
      {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);

      B0 = aes_dec_first_round(B0, K[0]);
      B1 = aes_dec_first_round(B1, K[0]);

      for(size_t r = 1; r != rounds; ++r)
         {
         B0 = aes_dec_round(B0, K[r], r);
         B1 = aes_dec_round(B1, K[r], r);
         }

      B0 = aes_dec_last_round(B0, K[rounds], rounds);
      B1 = aes_dec_last_round(B1, K[rounds], rounds);

      B0.store_le(out + i*16);
      B1.store_le(out + (i+1)*16);
      }

   // Handle the final block when the count was odd
   for(size_t i = blocks2; i < blocks; ++i)
      {
      SIMD_4x32 B = SIMD_4x32::load_le(in + i*16);

      B = aes_dec_first_round(B, K[0]);

      for(size_t r = 1; r != rounds; ++r)
         {
         B = aes_dec_round(B, K[r], r);
         }

      B = aes_dec_last_round(B, K[rounds], rounds);
      B.store_le(out + i*16);
      }

   CT::unpoison(in, blocks * 16);
   CT::unpoison(out, blocks * 16);
   }
325 | | |
326 | | } |
327 | | |
328 | | void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
329 | 0 | { |
330 | 0 | const SIMD_4x32 K[11] = { |
331 | 0 | SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), |
332 | 0 | SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), |
333 | 0 | SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), |
334 | 0 | SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), |
335 | 0 | }; |
336 | |
|
337 | 0 | return vperm_encrypt_blocks(in, out, blocks, K, 10); |
338 | 0 | } |
339 | | |
340 | | void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
341 | 0 | { |
342 | 0 | const SIMD_4x32 K[11] = { |
343 | 0 | SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), |
344 | 0 | SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), |
345 | 0 | SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), |
346 | 0 | SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), |
347 | 0 | }; |
348 | |
|
349 | 0 | return vperm_decrypt_blocks(in, out, blocks, K, 10); |
350 | 0 | } |
351 | | |
352 | | void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
353 | 0 | { |
354 | 0 | const SIMD_4x32 K[13] = { |
355 | 0 | SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), |
356 | 0 | SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), |
357 | 0 | SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), |
358 | 0 | SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]), |
359 | 0 | SIMD_4x32(&m_EK[4*12]), |
360 | 0 | }; |
361 | |
|
362 | 0 | return vperm_encrypt_blocks(in, out, blocks, K, 12); |
363 | 0 | } |
364 | | |
365 | | void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
366 | 0 | { |
367 | 0 | const SIMD_4x32 K[13] = { |
368 | 0 | SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), |
369 | 0 | SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), |
370 | 0 | SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), |
371 | 0 | SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]), |
372 | 0 | SIMD_4x32(&m_DK[4*12]), |
373 | 0 | }; |
374 | |
|
375 | 0 | return vperm_decrypt_blocks(in, out, blocks, K, 12); |
376 | 0 | } |
377 | | |
378 | | void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
379 | 0 | { |
380 | 0 | const SIMD_4x32 K[15] = { |
381 | 0 | SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]), |
382 | 0 | SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]), |
383 | 0 | SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]), |
384 | 0 | SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]), |
385 | 0 | SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]), |
386 | 0 | }; |
387 | |
|
388 | 0 | return vperm_encrypt_blocks(in, out, blocks, K, 14); |
389 | 0 | } |
390 | | |
391 | | void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const |
392 | 0 | { |
393 | 0 | const SIMD_4x32 K[15] = { |
394 | 0 | SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]), |
395 | 0 | SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]), |
396 | 0 | SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]), |
397 | 0 | SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]), |
398 | 0 | SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]), |
399 | 0 | }; |
400 | |
|
401 | 0 | return vperm_decrypt_blocks(in, out, blocks, K, 14); |
402 | 0 | } |
403 | | |
404 | | namespace { |
405 | | |
406 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) |
407 | | aes_schedule_transform(SIMD_4x32 input, |
408 | | SIMD_4x32 table_1, |
409 | | SIMD_4x32 table_2) |
410 | 0 | { |
411 | 0 | return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input)); |
412 | 0 | } |
413 | | |
/*
* Mangle an encryption subkey into the stored form expected by the
* round functions: XOR with 0x5B, smear via repeated mc_forward
* rotations, then apply the round's ShiftRows permutation.
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
   {
   // Same byte rotation as mc_forward[0]
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);

   SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
   SIMD_4x32 t2 = t;
   t = shuffle(t, mc_forward0);
   t2 = t ^ t2 ^ shuffle(t, mc_forward0);
   return shuffle(t2, vperm_sr[round_no % 4]);
   }
424 | | |
/*
* Mangle a subkey into the stored form used by the decryption rounds:
* four chained table transforms (the dsk pairs) interleaved with
* mc_forward rotations, finished with the round's ShiftRows permutation.
* This bakes the inverse key schedule transform into the stored keys.
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no)
   {
   // Same byte rotation as mc_forward[0]
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);

   // Decryption key-schedule transform tables (lo/hi nibble pairs), from vpaes
   const SIMD_4x32 dsk[8] = {
      SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
      SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
      SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
      SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
      SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
      SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
      SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
      SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
   };

   SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
   SIMD_4x32 output = shuffle(t, mc_forward0);

   t = aes_schedule_transform(t, dsk[2], dsk[3]);
   output = shuffle(t ^ output, mc_forward0);

   t = aes_schedule_transform(t, dsk[4], dsk[5]);
   output = shuffle(t ^ output, mc_forward0);

   t = aes_schedule_transform(t, dsk[6], dsk[7]);
   output = shuffle(t ^ output, mc_forward0);

   return shuffle(output, vperm_sr[round_no % 4]);
   }
454 | | |
/*
* Mangle the final encryption subkey: apply the round's ShiftRows
* permutation, undo the 0x5B bias, and map out of the vperm basis
* via the output transform tables.
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
   {
   // Output transform tables (lo/hi nibble pair)
   const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
   const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);

   k = shuffle(k, vperm_sr[round_no % 4]);
   k ^= SIMD_4x32::splat_u8(0x5B);
   return aes_schedule_transform(k, out_tr1, out_tr2);
   }
464 | | |
/*
* Mangle the final decryption subkey: undo the 0x5B bias and apply
* the deskew transform (no ShiftRows permutation here).
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last_dec(SIMD_4x32 k)
   {
   // Deskew transform tables (lo/hi nibble pair)
   const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
   const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);

   k ^= SIMD_4x32::splat_u8(0x5B);
   return aes_schedule_transform(k, deskew1, deskew2);
   }
473 | | |
/*
* Core key-schedule round: smear input2 across the vector (each word
* XORs in all previous words), add the 0x5B bias, then XOR in the
* S-box of input1 computed with the same nibble-split inversion
* network used by the encryption rounds.
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
   {
   // Smear: word i accumulates words 0..i of input2
   SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
   smeared ^= smeared.shift_elems_left<2>();
   smeared ^= SIMD_4x32::splat_u8(0x5B);

   // S-box of input1 via the shared GF(2^4) inversion network
   const SIMD_4x32 Bh = high_nibs(input1);
   SIMD_4x32 Bl = low_nibs(input1);

   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);

   Bl ^= Bh;

   SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
   }
492 | | |
/*
* Key-schedule round with round constant: broadcast byte 15 of input1
* rotated by one (the RotWord step, done as a single byte shuffle),
* then run the core round with the rcon XORed into input2.
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2)
   {
   // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
   const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
   return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
   }
499 | | |
/*
* AES-192 key schedule smear step: combines the low half of y with
* broadcasts of the top word of x and word 2 of y, producing the
* intermediate vector the 192-bit schedule iterates on.
*/
SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y)
   {
   // Byte shuffle selecting words (2,3,3,3) of x
   const SIMD_4x32 shuffle3332 =
      SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
   // Byte shuffle selecting words (0,0,0,2) of y
   const SIMD_4x32 shuffle2000 =
      SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);

   // Keep only the high two words of y before combining
   const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
   y &= zero_top_half;
   return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
   }
511 | | |
512 | | } |
513 | | |
/*
* AES-128 key schedule (vperm). Derives the 11 encryption round keys
* into m_EK and the 11 decryption round keys into m_DK; decryption
* keys are written in reverse order, each mangled into the form the
* decryption rounds expect.
*/
void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/)
   {
   m_EK.resize(11*4);
   m_DK.resize(11*4);

   SIMD_4x32 key = SIMD_4x32::load_le(keyb);

   // Last decryption key is the raw key, permuted by ShiftRows table 2
   shuffle(key, vperm_sr[2]).store_le(&m_DK[4*10]);

   // Map the key into the vperm basis; round key 0 is stored unmangled
   key = aes_schedule_transform(key, k_ipt1, k_ipt2);
   key.store_le(&m_EK[0]);

   for(size_t i = 1; i != 10; ++i)
      {
      key = aes_schedule_round(rcon[i-1], key, key);

      aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]);

      aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]);
      }

   // Final round key gets the output (un-)transform applied
   key = aes_schedule_round(rcon[9], key, key);
   aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]);
   aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
   }
539 | | |
/*
* AES-192 key schedule (vperm). The 192-bit key expands by 1.5
* subkeys per iteration, so each loop pass produces three round keys
* (stitched together with alignr8 and aes_schedule_192_smear); four
* passes yield all 13 keys. Decryption keys are written in reverse.
*/
void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/)
   {
   m_EK.resize(13*4);
   m_DK.resize(13*4);

   SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
   // Overlapping load: key2 covers bytes 8..23 of the 24-byte key
   SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);

   // Last decryption key is the raw first half (identity permutation)
   shuffle(key1, vperm_sr[0]).store_le(&m_DK[12*4]);

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   key1.store_le(&m_EK[0]);

   for(size_t i = 0; i != 4; ++i)
      {
      // key2 with 8 high bytes masked off
      SIMD_4x32 t = key2;
      key2 = aes_schedule_round(rcon[2*i], key2, key1);
      // Round key 3i+1: high half of previous key2, low half of new key2
      const SIMD_4x32 key2t = alignr8(key2, t);
      aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
      aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);

      // Round key 3i+2 from the smear step
      t = aes_schedule_192_smear(key2, t);

      aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]);
      aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]);

      key2 = aes_schedule_round(rcon[2*i+1], t, key2);

      // Round key 3i+3; the very last key (i == 3) gets the output transform
      if(i == 3)
         {
         aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
         aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]);
         }
      else
         {
         aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
         aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]);
         }

      key1 = key2;
      key2 = aes_schedule_192_smear(key2, t);
      }
   }
586 | | |
/*
* AES-256 key schedule (vperm). Each loop pass derives two round
* keys: one via the rcon round (with RotWord) and one via a plain
* SubWord round on the broadcast top word. 15 keys total; decryption
* keys are written in reverse order.
*/
void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/)
   {
   m_EK.resize(15*4);
   m_DK.resize(15*4);

   SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
   SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);

   // Last decryption key is the raw first half, permuted by ShiftRows table 2
   shuffle(key1, vperm_sr[2]).store_le(&m_DK[4*14]);

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   key1.store_le(&m_EK[0]);
   aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);

   aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]);

   // Byte shuffle broadcasting word 3 to all four words
   const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);

   for(size_t i = 2; i != 14; i += 2)
      {
      const SIMD_4x32 k_t = key2;
      key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1);

      aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]);
      aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]);

      // Second key of the pair: SubWord of the broadcast top word, no rcon
      key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);

      aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]);
      aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]);
      }

   // Final round key gets the output (un-)transform applied
   key2 = aes_schedule_round(rcon[6], key2, key1);

   aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]);
   aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
   }
626 | | |
627 | | } |