/src/botan/src/lib/block/aes/aes_vperm/aes_vperm.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * AES using vector permutes (SSSE3, NEON) |
3 | | * (C) 2010,2016,2019 Jack Lloyd |
4 | | * |
5 | | * Based on public domain x86-64 assembly written by Mike Hamburg, |
6 | | * described in "Accelerating AES with Vector Permute Instructions" |
7 | | * (CHES 2009). His original code is available at |
8 | | * https://crypto.stanford.edu/vpaes/ |
9 | | * |
10 | | * Botan is released under the Simplified BSD License (see license.txt) |
11 | | */ |
12 | | |
13 | | #include <botan/internal/aes.h> |
14 | | |
15 | | #include <botan/internal/ct_utils.h> |
16 | | #include <botan/internal/isa_extn.h> |
17 | | #include <botan/internal/simd_4x32.h> |
18 | | #include <botan/internal/target_info.h> |
19 | | #include <bit> |
20 | | |
21 | | namespace Botan { |
22 | | |
23 | | namespace { |
24 | | |
25 | 0 | inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 shuffle(SIMD_4x32 tbl, SIMD_4x32 idx) { |
26 | 0 | if constexpr(std::endian::native == std::endian::little) { |
27 | 0 | return SIMD_4x32::byte_shuffle(tbl, idx); |
28 | | } else { |
29 | | return SIMD_4x32::byte_shuffle(tbl.bswap(), idx.bswap()).bswap(); |
30 | | } |
31 | 0 | } |
32 | | |
33 | 0 | inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 masked_shuffle(SIMD_4x32 tbl, SIMD_4x32 idx) { |
34 | 0 | if constexpr(std::endian::native == std::endian::little) { |
35 | 0 | return SIMD_4x32::masked_byte_shuffle(tbl, idx); |
36 | | } else { |
37 | | return SIMD_4x32::masked_byte_shuffle(tbl.bswap(), idx.bswap()).bswap(); |
38 | | } |
39 | 0 | } |
40 | | |
/*
* Constant tables from Hamburg's vpaes implementation (see file header).
* Each SIMD_4x32 value is a 16 byte lookup table consumed by
* shuffle()/masked_shuffle() above; the arrays are indexed by the round
* number modulo 4.
*/

// Input transform: maps the AES state into the basis used by the vpaes S-box
const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);

// Tables for the GF(2^4) inversion step shared by all rounds
const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);

// S-box output tables for inner encryption rounds
const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
// S-box output tables for the final encryption round (no MixColumns)
const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);

// Output tables for the final decryption round
const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);

// Forward byte rotations used by the encryption round MixColumns step,
// indexed by round number mod 4
const SIMD_4x32 mc_forward[4] = {SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
                                 SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
                                 SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
                                 SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)};

// Round-dependent output permutations (ShiftRows combined with the
// schedule mangling); entry 0 is the identity permutation
const SIMD_4x32 vperm_sr[4] = {
   SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
   SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
   SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
   SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
};

// AES round constants, pre-transformed for the vpaes key schedule
// (not the plain FIPS-197 values)
const SIMD_4x32 rcon[10] = {
   SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
};

// Second pair of S-box output tables used in inner encryption rounds
const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);

// Input transform for decryption
const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);

// The four table pairs below (sb9, sbe, sbd, sbb) are the decryption
// round outputs, accumulated with byte rotations in aes_dec_round
const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);

const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);

const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);

const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);

// Byte rotations used by the decryption rounds, indexed by (r-1) mod 4
const SIMD_4x32 mcx[4] = {
   SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
   SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
   SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
   SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
};

// Backward byte rotations (inverse of mc_forward), indexed by round mod 4
const SIMD_4x32 mc_backward[4] = {
   SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
   SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
   SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
   SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
};

// Mask selecting the low four bits of each byte
const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
113 | | |
114 | 0 | inline SIMD_4x32 low_nibs(SIMD_4x32 x) { |
115 | 0 | return lo_nibs_mask & x; |
116 | 0 | } |
117 | | |
118 | 0 | inline SIMD_4x32 high_nibs(SIMD_4x32 x) { |
119 | 0 | return (x.shr<4>() & lo_nibs_mask); |
120 | 0 | } |
121 | | |
122 | 0 | inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) { |
123 | 0 | return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K; |
124 | 0 | } |
125 | | |
// One inner AES encryption round in the vpaes representation. The S-box is
// evaluated via the shared GF(2^4) inversion tables, the sb1/sb2 output
// tables, and the mc_forward/mc_backward rotations (selected by the round
// number r) combine the rotated copies for the MixColumns step.
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
   const SIMD_4x32 Bh = high_nibs(B);
   SIMD_4x32 Bl = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
   Bl ^= Bh;

   // Inversion step: t5/t6 are the two halves of the S-box input after inversion
   const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   // t7: first S-box output plus the round key; t8: second output folded
   // with a rotated copy of t7
   const SIMD_4x32 t7 = masked_shuffle(sb1t, t6) ^ masked_shuffle(sb1u, t5) ^ K;
   const SIMD_4x32 t8 = masked_shuffle(sb2t, t6) ^ masked_shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);

   // Combine rotated copies to complete the MixColumns mixing
   return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
}
140 | | |
// Final AES encryption round: the same inversion step as the inner rounds,
// then the last-round output tables (sbou/sbot, no MixColumns), the final
// round key, and the closing permutation selected by r.
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
   const SIMD_4x32 Bh = high_nibs(B);
   SIMD_4x32 Bl = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
   Bl ^= Bh;

   const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   return shuffle(masked_shuffle(sbou, t5) ^ masked_shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
}
152 | | |
153 | 0 | inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) { |
154 | 0 | return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K; |
155 | 0 | } |
156 | | |
// One inner AES decryption round. After the shared inversion step, the
// outputs of the four table pairs (sb9, sbd, sbb, sbe) are accumulated
// with repeated applications of the round-dependent rotation mc; per the
// vpaes construction this realizes the inverse S-box plus InvMixColumns.
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
   const SIMD_4x32 Bh = high_nibs(B);
   B = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, B);

   B ^= Bh;

   const SIMD_4x32 t5 = B ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));

   // Byte rotation for this round (decryption rounds index by r-1)
   const SIMD_4x32 mc = mcx[(r - 1) % 4];

   // Accumulate the four table-pair outputs, rotating between each stage
   const SIMD_4x32 t8 = masked_shuffle(sb9t, t6) ^ masked_shuffle(sb9u, t5) ^ K;
   const SIMD_4x32 t9 = shuffle(t8, mc) ^ masked_shuffle(sbdu, t5) ^ masked_shuffle(sbdt, t6);
   const SIMD_4x32 t12 = shuffle(t9, mc) ^ masked_shuffle(sbbu, t5) ^ masked_shuffle(sbbt, t6);
   return shuffle(t12, mc) ^ masked_shuffle(sbeu, t5) ^ masked_shuffle(sbet, t6);
}
174 | | |
// Final AES decryption round: inversion step, last-round decryption output
// tables (sboud/sbotd), final round key, and the closing permutation.
inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
   // Bit trick: for (r-1) % 4 in [0,3] this equals 3 - ((r - 1) % 4),
   // selecting the vperm_sr entry appropriate for this round count
   const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;

   const SIMD_4x32 Bh = high_nibs(B);
   B = low_nibs(B);
   const SIMD_4x32 t2 = shuffle(k_inv2, B);

   B ^= Bh;

   const SIMD_4x32 t5 = B ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));

   const SIMD_4x32 x = masked_shuffle(sboud, t5) ^ masked_shuffle(sbotd, t6) ^ K;
   return shuffle(x, vperm_sr[which_sr]);
}
190 | | |
// Encrypt `blocks` 16-byte blocks using the expanded round keys
// K[0..rounds] (inclusive). Two blocks are processed per iteration to
// expose instruction-level parallelism; an odd trailing block is handled
// separately. The buffers are poisoned/unpoisoned for const-time analysis.
void BOTAN_FN_ISA_SIMD_4X32
vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
   CT::poison(in, blocks * 16);

   // Largest even block count; the 2x unrolled loop handles these
   const size_t blocks2 = blocks - (blocks % 2);

   for(size_t i = 0; i != blocks2; i += 2) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);

      B0 = aes_enc_first_round(B0, K[0]);
      B1 = aes_enc_first_round(B1, K[0]);

      for(size_t r = 1; r != rounds; ++r) {
         B0 = aes_enc_round(B0, K[r], r);
         B1 = aes_enc_round(B1, K[r], r);
      }

      B0 = aes_enc_last_round(B0, K[rounds], rounds);
      B1 = aes_enc_last_round(B1, K[rounds], rounds);

      B0.store_le(out + i * 16);
      B1.store_le(out + (i + 1) * 16);
   }

   // Tail: at most one remaining block when the count was odd
   for(size_t i = blocks2; i < blocks; ++i) {
      SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16);  // same load as the paired loop above

      B = aes_enc_first_round(B, K[0]);

      for(size_t r = 1; r != rounds; ++r) {
         B = aes_enc_round(B, K[r], r);
      }

      B = aes_enc_last_round(B, K[rounds], rounds);
      B.store_le(out + i * 16);
   }

   CT::unpoison(in, blocks * 16);
   CT::unpoison(out, blocks * 16);
}
232 | | |
// Decrypt `blocks` 16-byte blocks using the expanded round keys
// K[0..rounds] (inclusive). Mirrors vperm_encrypt_blocks: a 2x unrolled
// main loop plus an odd-block tail, with const-time poisoning around it.
void BOTAN_FN_ISA_SIMD_4X32
vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
   CT::poison(in, blocks * 16);

   // Largest even block count; the 2x unrolled loop handles these
   const size_t blocks2 = blocks - (blocks % 2);

   for(size_t i = 0; i != blocks2; i += 2) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);

      B0 = aes_dec_first_round(B0, K[0]);
      B1 = aes_dec_first_round(B1, K[0]);

      for(size_t r = 1; r != rounds; ++r) {
         B0 = aes_dec_round(B0, K[r], r);
         B1 = aes_dec_round(B1, K[r], r);
      }

      B0 = aes_dec_last_round(B0, K[rounds], rounds);
      B1 = aes_dec_last_round(B1, K[rounds], rounds);

      B0.store_le(out + i * 16);
      B1.store_le(out + (i + 1) * 16);
   }

   // Tail: at most one remaining block when the count was odd
   for(size_t i = blocks2; i < blocks; ++i) {
      SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16);  // same load as the paired loop above

      B = aes_dec_first_round(B, K[0]);

      for(size_t r = 1; r != rounds; ++r) {
         B = aes_dec_round(B, K[r], r);
      }

      B = aes_dec_last_round(B, K[rounds], rounds);
      B.store_le(out + i * 16);
   }

   CT::unpoison(in, blocks * 16);
   CT::unpoison(out, blocks * 16);
}
274 | | |
275 | | } // namespace |
276 | | |
277 | 0 | void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
278 | 0 | const SIMD_4x32 K[11] = { |
279 | 0 | SIMD_4x32::load_le(&m_EK[4 * 0]), |
280 | 0 | SIMD_4x32::load_le(&m_EK[4 * 1]), |
281 | 0 | SIMD_4x32::load_le(&m_EK[4 * 2]), |
282 | 0 | SIMD_4x32::load_le(&m_EK[4 * 3]), |
283 | 0 | SIMD_4x32::load_le(&m_EK[4 * 4]), |
284 | 0 | SIMD_4x32::load_le(&m_EK[4 * 5]), |
285 | 0 | SIMD_4x32::load_le(&m_EK[4 * 6]), |
286 | 0 | SIMD_4x32::load_le(&m_EK[4 * 7]), |
287 | 0 | SIMD_4x32::load_le(&m_EK[4 * 8]), |
288 | 0 | SIMD_4x32::load_le(&m_EK[4 * 9]), |
289 | 0 | SIMD_4x32::load_le(&m_EK[4 * 10]), |
290 | 0 | }; |
291 | |
|
292 | 0 | return vperm_encrypt_blocks(in, out, blocks, K, 10); |
293 | 0 | } |
294 | | |
295 | 0 | void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
296 | 0 | const SIMD_4x32 K[11] = { |
297 | 0 | SIMD_4x32::load_le(&m_DK[4 * 0]), |
298 | 0 | SIMD_4x32::load_le(&m_DK[4 * 1]), |
299 | 0 | SIMD_4x32::load_le(&m_DK[4 * 2]), |
300 | 0 | SIMD_4x32::load_le(&m_DK[4 * 3]), |
301 | 0 | SIMD_4x32::load_le(&m_DK[4 * 4]), |
302 | 0 | SIMD_4x32::load_le(&m_DK[4 * 5]), |
303 | 0 | SIMD_4x32::load_le(&m_DK[4 * 6]), |
304 | 0 | SIMD_4x32::load_le(&m_DK[4 * 7]), |
305 | 0 | SIMD_4x32::load_le(&m_DK[4 * 8]), |
306 | 0 | SIMD_4x32::load_le(&m_DK[4 * 9]), |
307 | 0 | SIMD_4x32::load_le(&m_DK[4 * 10]), |
308 | 0 | }; |
309 | |
|
310 | 0 | return vperm_decrypt_blocks(in, out, blocks, K, 10); |
311 | 0 | } |
312 | | |
313 | 0 | void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
314 | 0 | const SIMD_4x32 K[13] = { |
315 | 0 | SIMD_4x32::load_le(&m_EK[4 * 0]), |
316 | 0 | SIMD_4x32::load_le(&m_EK[4 * 1]), |
317 | 0 | SIMD_4x32::load_le(&m_EK[4 * 2]), |
318 | 0 | SIMD_4x32::load_le(&m_EK[4 * 3]), |
319 | 0 | SIMD_4x32::load_le(&m_EK[4 * 4]), |
320 | 0 | SIMD_4x32::load_le(&m_EK[4 * 5]), |
321 | 0 | SIMD_4x32::load_le(&m_EK[4 * 6]), |
322 | 0 | SIMD_4x32::load_le(&m_EK[4 * 7]), |
323 | 0 | SIMD_4x32::load_le(&m_EK[4 * 8]), |
324 | 0 | SIMD_4x32::load_le(&m_EK[4 * 9]), |
325 | 0 | SIMD_4x32::load_le(&m_EK[4 * 10]), |
326 | 0 | SIMD_4x32::load_le(&m_EK[4 * 11]), |
327 | 0 | SIMD_4x32::load_le(&m_EK[4 * 12]), |
328 | 0 | }; |
329 | |
|
330 | 0 | return vperm_encrypt_blocks(in, out, blocks, K, 12); |
331 | 0 | } |
332 | | |
333 | 0 | void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
334 | 0 | const SIMD_4x32 K[13] = { |
335 | 0 | SIMD_4x32::load_le(&m_DK[4 * 0]), |
336 | 0 | SIMD_4x32::load_le(&m_DK[4 * 1]), |
337 | 0 | SIMD_4x32::load_le(&m_DK[4 * 2]), |
338 | 0 | SIMD_4x32::load_le(&m_DK[4 * 3]), |
339 | 0 | SIMD_4x32::load_le(&m_DK[4 * 4]), |
340 | 0 | SIMD_4x32::load_le(&m_DK[4 * 5]), |
341 | 0 | SIMD_4x32::load_le(&m_DK[4 * 6]), |
342 | 0 | SIMD_4x32::load_le(&m_DK[4 * 7]), |
343 | 0 | SIMD_4x32::load_le(&m_DK[4 * 8]), |
344 | 0 | SIMD_4x32::load_le(&m_DK[4 * 9]), |
345 | 0 | SIMD_4x32::load_le(&m_DK[4 * 10]), |
346 | 0 | SIMD_4x32::load_le(&m_DK[4 * 11]), |
347 | 0 | SIMD_4x32::load_le(&m_DK[4 * 12]), |
348 | 0 | }; |
349 | |
|
350 | 0 | return vperm_decrypt_blocks(in, out, blocks, K, 12); |
351 | 0 | } |
352 | | |
353 | 0 | void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
354 | 0 | const SIMD_4x32 K[15] = { |
355 | 0 | SIMD_4x32::load_le(&m_EK[4 * 0]), |
356 | 0 | SIMD_4x32::load_le(&m_EK[4 * 1]), |
357 | 0 | SIMD_4x32::load_le(&m_EK[4 * 2]), |
358 | 0 | SIMD_4x32::load_le(&m_EK[4 * 3]), |
359 | 0 | SIMD_4x32::load_le(&m_EK[4 * 4]), |
360 | 0 | SIMD_4x32::load_le(&m_EK[4 * 5]), |
361 | 0 | SIMD_4x32::load_le(&m_EK[4 * 6]), |
362 | 0 | SIMD_4x32::load_le(&m_EK[4 * 7]), |
363 | 0 | SIMD_4x32::load_le(&m_EK[4 * 8]), |
364 | 0 | SIMD_4x32::load_le(&m_EK[4 * 9]), |
365 | 0 | SIMD_4x32::load_le(&m_EK[4 * 10]), |
366 | 0 | SIMD_4x32::load_le(&m_EK[4 * 11]), |
367 | 0 | SIMD_4x32::load_le(&m_EK[4 * 12]), |
368 | 0 | SIMD_4x32::load_le(&m_EK[4 * 13]), |
369 | 0 | SIMD_4x32::load_le(&m_EK[4 * 14]), |
370 | 0 | }; |
371 | |
|
372 | 0 | return vperm_encrypt_blocks(in, out, blocks, K, 14); |
373 | 0 | } |
374 | | |
375 | 0 | void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
376 | 0 | const SIMD_4x32 K[15] = { |
377 | 0 | SIMD_4x32::load_le(&m_DK[4 * 0]), |
378 | 0 | SIMD_4x32::load_le(&m_DK[4 * 1]), |
379 | 0 | SIMD_4x32::load_le(&m_DK[4 * 2]), |
380 | 0 | SIMD_4x32::load_le(&m_DK[4 * 3]), |
381 | 0 | SIMD_4x32::load_le(&m_DK[4 * 4]), |
382 | 0 | SIMD_4x32::load_le(&m_DK[4 * 5]), |
383 | 0 | SIMD_4x32::load_le(&m_DK[4 * 6]), |
384 | 0 | SIMD_4x32::load_le(&m_DK[4 * 7]), |
385 | 0 | SIMD_4x32::load_le(&m_DK[4 * 8]), |
386 | 0 | SIMD_4x32::load_le(&m_DK[4 * 9]), |
387 | 0 | SIMD_4x32::load_le(&m_DK[4 * 10]), |
388 | 0 | SIMD_4x32::load_le(&m_DK[4 * 11]), |
389 | 0 | SIMD_4x32::load_le(&m_DK[4 * 12]), |
390 | 0 | SIMD_4x32::load_le(&m_DK[4 * 13]), |
391 | 0 | SIMD_4x32::load_le(&m_DK[4 * 14]), |
392 | 0 | }; |
393 | |
|
394 | 0 | return vperm_decrypt_blocks(in, out, blocks, K, 14); |
395 | 0 | } |
396 | | |
397 | | namespace { |
398 | | |
399 | 0 | inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_transform(SIMD_4x32 input, SIMD_4x32 table_1, SIMD_4x32 table_2) { |
400 | 0 | return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input)); |
401 | 0 | } |
402 | | |
// Mangle a round key for storage in the encryption key schedule: xor
// with the 0x5B constant, fold together three successive mc_forward0
// rotations, then apply the round-dependent output permutation.
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) {
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);

   SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
   SIMD_4x32 t2 = t;
   t = shuffle(t, mc_forward0);
   // t2 accumulates the first, second and third rotated copies
   t2 = t ^ t2 ^ shuffle(t, mc_forward0);
   return shuffle(t2, vperm_sr[round_no % 4]);
}
412 | | |
// Mangle a round key for storage in the decryption key schedule: chain
// four aes_schedule_transform stages (table pairs dsk[0..7]), rotating
// the accumulated output with mc_forward0 between stages, then apply
// the round-dependent output permutation.
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) {
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);

   // Four pairs of transform tables for the decryption key schedule
   const SIMD_4x32 dsk[8] = {
      SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
      SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
      SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
      SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
      SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
      SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
      SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
      SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
   };

   SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
   SIMD_4x32 output = shuffle(t, mc_forward0);

   t = aes_schedule_transform(t, dsk[2], dsk[3]);
   output = shuffle(t ^ output, mc_forward0);

   t = aes_schedule_transform(t, dsk[4], dsk[5]);
   output = shuffle(t ^ output, mc_forward0);

   t = aes_schedule_transform(t, dsk[6], dsk[7]);
   output = shuffle(t ^ output, mc_forward0);

   return shuffle(output, vperm_sr[round_no % 4]);
}
441 | | |
442 | 0 | SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) { |
443 | 0 | const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121); |
444 | 0 | const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1); |
445 | |
|
446 | 0 | k = shuffle(k, vperm_sr[round_no % 4]); |
447 | 0 | k ^= SIMD_4x32::splat_u8(0x5B); |
448 | 0 | return aes_schedule_transform(k, out_tr1, out_tr2); |
449 | 0 | } |
450 | | |
451 | 0 | SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last_dec(SIMD_4x32 k) { |
452 | 0 | const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A); |
453 | 0 | const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB); |
454 | |
|
455 | 0 | k ^= SIMD_4x32::splat_u8(0x5B); |
456 | 0 | return aes_schedule_transform(k, deskew1, deskew2); |
457 | 0 | } |
458 | | |
// Core key schedule round: "smear" input2 by xoring in copies of itself
// shifted left by one and then two 32-bit words, xor the 0x5B constant,
// and combine with the S-box output of input1 (computed with the same
// inversion and sb1 tables as encryption).
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) {
   SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
   smeared ^= smeared.shift_elems_left<2>();
   smeared ^= SIMD_4x32::splat_u8(0x5B);

   const SIMD_4x32 Bh = high_nibs(input1);
   SIMD_4x32 Bl = low_nibs(input1);

   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);

   Bl ^= Bh;

   // Inversion step, as in aes_enc_round
   SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   return smeared ^ masked_shuffle(sb1u, t5) ^ masked_shuffle(sb1t, t6);
}
476 | | |
// Key schedule round that also mixes in a round constant rc: broadcasts
// and rotates the last word of input1 before the S-box step, with rc
// xored into input2.
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2) {
   // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
   const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
   return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
}
482 | | |
// "Smear" step used only by the AES-192 key schedule: y is masked so only
// half of its words survive (see zero_top_half), then combined with byte
// shuffled copies of x and y to propagate key material across words.
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) {
   const SIMD_4x32 shuffle3332 = SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
   const SIMD_4x32 shuffle2000 = SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);

   const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
   y &= zero_top_half;
   return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
}
491 | | |
492 | | } // namespace |
493 | | |
// vpaes key schedule for AES-128: derives the 11 encryption and 11
// decryption round keys in one pass. Encryption keys are stored in the
// mangled vpaes basis; decryption keys are written in reverse order.
void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
   m_EK.resize(11 * 4);
   m_DK.resize(11 * 4);

   SIMD_4x32 key = SIMD_4x32::load_le(keyb);

   // Last decryption round key: the raw input key, permuted
   shuffle(key, vperm_sr[2]).store_le(&m_DK[4 * 10]);

   // Move the key into the vpaes basis; this is the first encryption key
   key = aes_schedule_transform(key, k_ipt1, k_ipt2);
   key.store_le(&m_EK[0]);

   for(size_t i = 1; i != 10; ++i) {
      key = aes_schedule_round(rcon[i - 1], key, key);

      aes_schedule_mangle(key, (12 - i) % 4).store_le(&m_EK[4 * i]);

      // Decryption keys are stored in reverse round order
      aes_schedule_mangle_dec(key, (10 - i) % 4).store_le(&m_DK[4 * (10 - i)]);
   }

   // Final round: uses the last round constant and the output transforms
   key = aes_schedule_round(rcon[9], key, key);
   aes_schedule_mangle_last(key, 2).store_le(&m_EK[4 * 10]);
   aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
}
517 | | |
// vpaes key schedule for AES-192: derives the 13 encryption and 13
// decryption round keys. Each loop iteration produces three round keys
// from two key registers, using the 192-bit "smear" step.
void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
   m_EK.resize(13 * 4);
   m_DK.resize(13 * 4);

   SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
   // Overlapping load: bytes 8..23 of the 24 byte key, so the high half
   // of key2 holds the final 8 key bytes
   SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);

   // Last decryption round key: the raw low key half, permuted
   shuffle(key1, vperm_sr[0]).store_le(&m_DK[12 * 4]);

   // Move both halves into the vpaes basis
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   key1.store_le(&m_EK[0]);

   // Four iterations, each emitting round keys 3i+1, 3i+2 and 3i+3
   for(size_t i = 0; i != 4; ++i) {
      // key2 with 8 high bytes masked off
      SIMD_4x32 t = key2;
      key2 = aes_schedule_round(rcon[2 * i], key2, key1);
      const auto key2t = SIMD_4x32::alignr8(key2, t);

      aes_schedule_mangle(key2t, (i + 3) % 4).store_le(&m_EK[4 * (3 * i + 1)]);
      aes_schedule_mangle_dec(key2t, (i + 3) % 4).store_le(&m_DK[4 * (11 - 3 * i)]);

      t = aes_schedule_192_smear(key2, t);

      aes_schedule_mangle(t, (i + 2) % 4).store_le(&m_EK[4 * (3 * i + 2)]);
      aes_schedule_mangle_dec(t, (i + 2) % 4).store_le(&m_DK[4 * (10 - 3 * i)]);

      key2 = aes_schedule_round(rcon[2 * i + 1], t, key2);

      // The last iteration emits the final round key via the output transforms
      if(i == 3) {
         aes_schedule_mangle_last(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
         aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4 * (9 - 3 * i)]);
      } else {
         aes_schedule_mangle(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
         aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (9 - 3 * i)]);
      }

      key1 = key2;
      key2 = aes_schedule_192_smear(key2, t);
   }
}
560 | | |
// vpaes key schedule for AES-256: derives the 15 encryption and 15
// decryption round keys. Each loop iteration produces two round keys,
// alternating between the round-constant step and a step keyed by the
// broadcast last word of the previous key.
void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
   m_EK.resize(15 * 4);
   m_DK.resize(15 * 4);

   // The 32 byte key splits into two independent 16 byte halves
   SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
   SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);

   // Last decryption round key: the raw first key half, permuted
   shuffle(key1, vperm_sr[2]).store_le(&m_DK[4 * 14]);

   // Move both halves into the vpaes basis
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   key1.store_le(&m_EK[0]);
   aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);

   aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4 * 13]);

   // Broadcasts the last 32-bit word to every word position
   const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);

   for(size_t i = 2; i != 14; i += 2) {
      const SIMD_4x32 k_t = key2;
      key1 = key2 = aes_schedule_round(rcon[(i / 2) - 1], key2, key1);

      aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4 * i]);
      aes_schedule_mangle_dec(key2, (i + 2) % 4).store_le(&m_DK[4 * (14 - i)]);

      // Second key of the pair: no round constant, keyed by the broadcast word
      key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);

      aes_schedule_mangle(key2, (i - 1) % 4).store_le(&m_EK[4 * (i + 1)]);
      aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (13 - i)]);
   }

   // Final round key via the output transforms
   key2 = aes_schedule_round(rcon[6], key2, key1);

   aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4 * 14]);
   aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
}
598 | | |
599 | | } // namespace Botan |