/src/botan/src/lib/block/aes/aes.cpp
Line | Count | Source |
1 | | /* |
2 | | * (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #include <botan/internal/aes.h> |
8 | | |
9 | | #include <botan/internal/bit_ops.h> |
10 | | #include <botan/internal/bswap.h> |
11 | | #include <botan/internal/ct_utils.h> |
12 | | #include <botan/internal/loadstor.h> |
13 | | #include <botan/internal/rotate.h> |
14 | | |
15 | | #if defined(BOTAN_HAS_CPUID) |
16 | | #include <botan/internal/cpuid.h> |
17 | | #endif |
18 | | |
19 | | #if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI) |
20 | | #define BOTAN_HAS_HW_AES_SUPPORT |
21 | | #endif |
22 | | |
23 | | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
24 | | #include <bit> |
25 | | #endif |
26 | | |
27 | | namespace Botan { |
28 | | |
29 | | /* |
30 | | * One of three AES implementation strategies is used to get a constant time |
31 | | * implementation which is immune to common cache/timing based side channels: |
32 | | * |
33 | | * - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that |
34 | | * |
35 | | * - If 128-bit SIMD with byte shuffles are available (SSSE3, NEON, or Altivec), |
36 | | * use the vperm technique published by Mike Hamburg at CHES 2009. |
37 | | * |
38 | | * - If no hardware or SIMD support, fall back to a constant time bitsliced |
39 | | * implementation. This uses 32-bit words resulting in 2 blocks being processed |
40 | | * in parallel. Moving to 4 blocks (with 64-bit words) would approximately |
41 | | * double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would |
42 | | * again approximately double performance vs 64-bit. However the assumption is |
43 | | * that most 64-bit CPUs either have hardware AES or SIMD shuffle support and |
44 | | * that the majority of users falling back to this code will be 32-bit cores. |
45 | | * If this assumption proves to be unsound, the bitsliced code can easily be |
46 | | * extended to operate on either 32 or 64 bit words depending on the native |
47 | | * wordsize of the target processor. |
48 | | * |
49 | | * Useful references |
50 | | * |
51 | | * - "Accelerating AES with Vector Permute Instructions" Mike Hamburg |
52 | | * https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf |
53 | | * |
54 | | * - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe |
55 | | * https://eprint.iacr.org/2009/129.pdf |
56 | | * |
57 | | * - "A new combinational logic minimization technique with applications to cryptology." |
58 | | * Boyar and Peralta https://eprint.iacr.org/2009/191.pdf |
59 | | * |
60 | | * - "A depth-16 circuit for the AES S-box" Boyar and Peralta |
61 | | * https://eprint.iacr.org/2011/332.pdf |
62 | | * |
63 | | * - "A Very Compact S-box for AES" Canright |
64 | | * https://www.iacr.org/archive/ches2005/032.pdf |
65 | | * https://core.ac.uk/download/pdf/36694529.pdf (extended) |
66 | | */ |
67 | | |
68 | | namespace { |
69 | | |
70 | | /* |
71 | | This is an AES sbox circuit which can execute in bitsliced mode up to 32x in |
72 | | parallel. |
73 | | |
74 | | The circuit is from the "Circuit Minimization Team" group |
75 | | http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html |
76 | | http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt |
77 | | |
78 | | This circuit has size 113 and depth 27. In software it is much faster than |
79 | | circuits which are considered faster for hardware purposes (where circuit depth |
80 | | is the critical constraint), because unlike in hardware, on common CPUs we can |
81 | | only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit |
82 | | is superior. On an x86-64 machine this circuit is about 15% faster than the |
83 | | circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box". |
84 | | |
85 | | Another circuit for AES Sbox of size 102 and depth 24 is described in "New |
86 | | Circuit Minimization Techniques for Smaller and Faster AES SBoxes" |
87 | | [https://eprint.iacr.org/2019/802] however it relies on "non-standard" gates |
88 | | like MUX, NOR, NAND, etc and so in practice in bitsliced software, its size is |
89 | | actually a bit larger than this circuit, as few CPUs have such instructions and |
90 | | otherwise they must be emulated using a sequence of available bit operations. |
91 | | */ |
// Applies the S-box circuit in place: V[0..7] are read as U0..U7 and then
// overwritten with S0..S7. The body is straight-line code using only XOR, AND
// and NOT on whole 32-bit words; there are no branches or table lookups.
// SE_word() in this file fills V[i] with bit (7 - i) of each byte, i.e. V[0]
// carries the most significant bit.
92 | 0 | void AES_SBOX(uint32_t V[8]) { |
93 | 0 | const uint32_t U0 = V[0]; |
94 | 0 | const uint32_t U1 = V[1]; |
95 | 0 | const uint32_t U2 = V[2]; |
96 | 0 | const uint32_t U3 = V[3]; |
97 | 0 | const uint32_t U4 = V[4]; |
98 | 0 | const uint32_t U5 = V[5]; |
99 | 0 | const uint32_t U6 = V[6]; |
100 | 0 | const uint32_t U7 = V[7]; |
101 | |

// Input stage: XOR-only combinations of the eight input words.
102 | 0 | const uint32_t y14 = U3 ^ U5; |
103 | 0 | const uint32_t y13 = U0 ^ U6; |
104 | 0 | const uint32_t y9 = U0 ^ U3; |
105 | 0 | const uint32_t y8 = U0 ^ U5; |
106 | 0 | const uint32_t t0 = U1 ^ U2; |
107 | 0 | const uint32_t y1 = t0 ^ U7; |
108 | 0 | const uint32_t y4 = y1 ^ U3; |
109 | 0 | const uint32_t y12 = y13 ^ y14; |
110 | 0 | const uint32_t y2 = y1 ^ U0; |
111 | 0 | const uint32_t y5 = y1 ^ U6; |
112 | 0 | const uint32_t y3 = y5 ^ y8; |
113 | 0 | const uint32_t t1 = U4 ^ y12; |
114 | 0 | const uint32_t y15 = t1 ^ U5; |
115 | 0 | const uint32_t y20 = t1 ^ U1; |
116 | 0 | const uint32_t y6 = y15 ^ U7; |
117 | 0 | const uint32_t y10 = y15 ^ t0; |
118 | 0 | const uint32_t y11 = y20 ^ y9; |
119 | 0 | const uint32_t y7 = U7 ^ y11; |
120 | 0 | const uint32_t y17 = y10 ^ y11; |
121 | 0 | const uint32_t y19 = y10 ^ y8; |
122 | 0 | const uint32_t y16 = t0 ^ y11; |
123 | 0 | const uint32_t y21 = y13 ^ y16; |
124 | 0 | const uint32_t y18 = U0 ^ y16; |
// Middle stage: the first AND gates, mixed with XORs (t2 .. t45).
125 | 0 | const uint32_t t2 = y12 & y15; |
126 | 0 | const uint32_t t3 = y3 & y6; |
127 | 0 | const uint32_t t4 = t3 ^ t2; |
128 | 0 | const uint32_t t5 = y4 & U7; |
129 | 0 | const uint32_t t6 = t5 ^ t2; |
130 | 0 | const uint32_t t7 = y13 & y16; |
131 | 0 | const uint32_t t8 = y5 & y1; |
132 | 0 | const uint32_t t9 = t8 ^ t7; |
133 | 0 | const uint32_t t10 = y2 & y7; |
134 | 0 | const uint32_t t11 = t10 ^ t7; |
135 | 0 | const uint32_t t12 = y9 & y11; |
136 | 0 | const uint32_t t13 = y14 & y17; |
137 | 0 | const uint32_t t14 = t13 ^ t12; |
138 | 0 | const uint32_t t15 = y8 & y10; |
139 | 0 | const uint32_t t16 = t15 ^ t12; |
140 | 0 | const uint32_t t17 = t4 ^ y20; |
141 | 0 | const uint32_t t18 = t6 ^ t16; |
142 | 0 | const uint32_t t19 = t9 ^ t14; |
143 | 0 | const uint32_t t20 = t11 ^ t16; |
144 | 0 | const uint32_t t21 = t17 ^ t14; |
145 | 0 | const uint32_t t22 = t18 ^ y19; |
146 | 0 | const uint32_t t23 = t19 ^ y21; |
147 | 0 | const uint32_t t24 = t20 ^ y18; |
148 | 0 | const uint32_t t25 = t21 ^ t22; |
149 | 0 | const uint32_t t26 = t21 & t23; |
150 | 0 | const uint32_t t27 = t24 ^ t26; |
151 | 0 | const uint32_t t28 = t25 & t27; |
152 | 0 | const uint32_t t29 = t28 ^ t22; |
153 | 0 | const uint32_t t30 = t23 ^ t24; |
154 | 0 | const uint32_t t31 = t22 ^ t26; |
155 | 0 | const uint32_t t32 = t31 & t30; |
156 | 0 | const uint32_t t33 = t32 ^ t24; |
157 | 0 | const uint32_t t34 = t23 ^ t33; |
158 | 0 | const uint32_t t35 = t27 ^ t33; |
159 | 0 | const uint32_t t36 = t24 & t35; |
160 | 0 | const uint32_t t37 = t36 ^ t34; |
161 | 0 | const uint32_t t38 = t27 ^ t36; |
162 | 0 | const uint32_t t39 = t29 & t38; |
163 | 0 | const uint32_t t40 = t25 ^ t39; |
164 | 0 | const uint32_t t41 = t40 ^ t37; |
165 | 0 | const uint32_t t42 = t29 ^ t33; |
166 | 0 | const uint32_t t43 = t29 ^ t40; |
167 | 0 | const uint32_t t44 = t33 ^ t37; |
168 | 0 | const uint32_t t45 = t42 ^ t41; |
// z0 .. z17: AND of a middle-stage value with an input-stage value (or U7).
169 | 0 | const uint32_t z0 = t44 & y15; |
170 | 0 | const uint32_t z1 = t37 & y6; |
171 | 0 | const uint32_t z2 = t33 & U7; |
172 | 0 | const uint32_t z3 = t43 & y16; |
173 | 0 | const uint32_t z4 = t40 & y1; |
174 | 0 | const uint32_t z5 = t29 & y7; |
175 | 0 | const uint32_t z6 = t42 & y11; |
176 | 0 | const uint32_t z7 = t45 & y17; |
177 | 0 | const uint32_t z8 = t41 & y10; |
178 | 0 | const uint32_t z9 = t44 & y12; |
179 | 0 | const uint32_t z10 = t37 & y3; |
180 | 0 | const uint32_t z11 = t33 & y4; |
181 | 0 | const uint32_t z12 = t43 & y13; |
182 | 0 | const uint32_t z13 = t40 & y5; |
183 | 0 | const uint32_t z14 = t29 & y2; |
184 | 0 | const uint32_t z15 = t42 & y9; |
185 | 0 | const uint32_t z16 = t45 & y14; |
186 | 0 | const uint32_t z17 = t41 & y8; |
// Output stage: XOR/NOT-only combination of the z values into S0 .. S7.
187 | 0 | const uint32_t tc1 = z15 ^ z16; |
188 | 0 | const uint32_t tc2 = z10 ^ tc1; |
189 | 0 | const uint32_t tc3 = z9 ^ tc2; |
190 | 0 | const uint32_t tc4 = z0 ^ z2; |
191 | 0 | const uint32_t tc5 = z1 ^ z0; |
192 | 0 | const uint32_t tc6 = z3 ^ z4; |
193 | 0 | const uint32_t tc7 = z12 ^ tc4; |
194 | 0 | const uint32_t tc8 = z7 ^ tc6; |
195 | 0 | const uint32_t tc9 = z8 ^ tc7; |
196 | 0 | const uint32_t tc10 = tc8 ^ tc9; |
197 | 0 | const uint32_t tc11 = tc6 ^ tc5; |
198 | 0 | const uint32_t tc12 = z3 ^ z5; |
199 | 0 | const uint32_t tc13 = z13 ^ tc1; |
200 | 0 | const uint32_t tc14 = tc4 ^ tc12; |
201 | 0 | const uint32_t S3 = tc3 ^ tc11; |
202 | 0 | const uint32_t tc16 = z6 ^ tc8; |
203 | 0 | const uint32_t tc17 = z14 ^ tc10; |
204 | 0 | const uint32_t tc18 = ~tc13 ^ tc14; |
205 | 0 | const uint32_t S7 = z12 ^ tc18; |
206 | 0 | const uint32_t tc20 = z15 ^ tc16; |
207 | 0 | const uint32_t tc21 = tc2 ^ z11; |
208 | 0 | const uint32_t S0 = tc3 ^ tc16; |
209 | 0 | const uint32_t S6 = tc10 ^ tc18; |
210 | 0 | const uint32_t S4 = tc14 ^ S3; |
211 | 0 | const uint32_t S1 = ~(S3 ^ tc16); |
212 | 0 | const uint32_t tc26 = tc17 ^ tc20; |
213 | 0 | const uint32_t S2 = ~(tc26 ^ z17); |
214 | 0 | const uint32_t S5 = tc21 ^ tc17; |
215 | |

// Write the eight result words back over the input.
216 | 0 | V[0] = S0; |
217 | 0 | V[1] = S1; |
218 | 0 | V[2] = S2; |
219 | 0 | V[3] = S3; |
220 | 0 | V[4] = S4; |
221 | 0 | V[5] = S5; |
222 | 0 | V[6] = S6; |
223 | 0 | V[7] = S7; |
224 | 0 | } |
225 | | |
226 | | /* |
227 | | A circuit for inverse AES Sbox of size 121 and depth 21 from |
228 | | http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html |
229 | | http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt |
230 | | */ |
// Applies the inverse S-box circuit in place: V[0..7] are read as U0..U7 and
// then overwritten with S0..S7. Like AES_SBOX this is straight-line code built
// only from XOR, AND and NOT on whole 32-bit words.
231 | 0 | void AES_INV_SBOX(uint32_t V[8]) { |
232 | 0 | const uint32_t U0 = V[0]; |
233 | 0 | const uint32_t U1 = V[1]; |
234 | 0 | const uint32_t U2 = V[2]; |
235 | 0 | const uint32_t U3 = V[3]; |
236 | 0 | const uint32_t U4 = V[4]; |
237 | 0 | const uint32_t U5 = V[5]; |
238 | 0 | const uint32_t U6 = V[6]; |
239 | 0 | const uint32_t U7 = V[7]; |
240 | |

// Input stage: XOR/NOT-only combinations of the inputs (Y0 .. Y7).
241 | 0 | const uint32_t Y0 = U0 ^ U3; |
242 | 0 | const uint32_t Y2 = ~(U1 ^ U3); |
243 | 0 | const uint32_t Y4 = U0 ^ Y2; |
244 | 0 | const uint32_t RTL0 = U6 ^ U7; |
245 | 0 | const uint32_t Y1 = Y2 ^ RTL0; |
246 | 0 | const uint32_t Y7 = ~(U2 ^ Y1); |
247 | 0 | const uint32_t RTL1 = U3 ^ U4; |
248 | 0 | const uint32_t Y6 = ~(U7 ^ RTL1); |
249 | 0 | const uint32_t Y3 = Y1 ^ RTL1; |
250 | 0 | const uint32_t RTL2 = ~(U0 ^ U2); |
251 | 0 | const uint32_t Y5 = U5 ^ RTL2; |
// XOR-only sums of the Y values that are reused by the AND gates below.
252 | 0 | const uint32_t sa1 = Y0 ^ Y2; |
253 | 0 | const uint32_t sa0 = Y1 ^ Y3; |
254 | 0 | const uint32_t sb1 = Y4 ^ Y6; |
255 | 0 | const uint32_t sb0 = Y5 ^ Y7; |
256 | 0 | const uint32_t ah = Y0 ^ Y1; |
257 | 0 | const uint32_t al = Y2 ^ Y3; |
258 | 0 | const uint32_t aa = sa0 ^ sa1; |
259 | 0 | const uint32_t bh = Y4 ^ Y5; |
260 | 0 | const uint32_t bl = Y6 ^ Y7; |
261 | 0 | const uint32_t bb = sb0 ^ sb1; |
262 | 0 | const uint32_t ab20 = sa0 ^ sb0; |
263 | 0 | const uint32_t ab22 = al ^ bl; |
264 | 0 | const uint32_t ab23 = Y3 ^ Y7; |
265 | 0 | const uint32_t ab21 = sa1 ^ sb1; |
// First group of AND gates, folded with XORs into cp1 .. cp4.
266 | 0 | const uint32_t abcd1 = ah & bh; |
267 | 0 | const uint32_t rr1 = Y0 & Y4; |
268 | 0 | const uint32_t ph11 = ab20 ^ abcd1; |
269 | 0 | const uint32_t t01 = Y1 & Y5; |
270 | 0 | const uint32_t ph01 = t01 ^ abcd1; |
271 | 0 | const uint32_t abcd2 = al & bl; |
272 | 0 | const uint32_t r1 = Y2 & Y6; |
273 | 0 | const uint32_t pl11 = ab22 ^ abcd2; |
274 | 0 | const uint32_t r2 = Y3 & Y7; |
275 | 0 | const uint32_t pl01 = r2 ^ abcd2; |
276 | 0 | const uint32_t r3 = sa0 & sb0; |
277 | 0 | const uint32_t vr1 = aa & bb; |
278 | 0 | const uint32_t pr1 = vr1 ^ r3; |
279 | 0 | const uint32_t wr1 = sa1 & sb1; |
280 | 0 | const uint32_t qr1 = wr1 ^ r3; |
281 | 0 | const uint32_t ab0 = ph11 ^ rr1; |
282 | 0 | const uint32_t ab1 = ph01 ^ ab21; |
283 | 0 | const uint32_t ab2 = pl11 ^ r1; |
284 | 0 | const uint32_t ab3 = pl01 ^ qr1; |
285 | 0 | const uint32_t cp1 = ab0 ^ pr1; |
286 | 0 | const uint32_t cp2 = ab1 ^ qr1; |
287 | 0 | const uint32_t cp3 = ab2 ^ pr1; |
288 | 0 | const uint32_t cp4 = ab3 ^ ab23; |
// tinv1 .. tinv13 derive d0 .. d3 from cp1 .. cp4.
289 | 0 | const uint32_t tinv1 = cp3 ^ cp4; |
290 | 0 | const uint32_t tinv2 = cp3 & cp1; |
291 | 0 | const uint32_t tinv3 = cp2 ^ tinv2; |
292 | 0 | const uint32_t tinv4 = cp1 ^ cp2; |
293 | 0 | const uint32_t tinv5 = cp4 ^ tinv2; |
294 | 0 | const uint32_t tinv6 = tinv5 & tinv4; |
295 | 0 | const uint32_t tinv7 = tinv3 & tinv1; |
296 | 0 | const uint32_t d2 = cp4 ^ tinv7; |
297 | 0 | const uint32_t d0 = cp2 ^ tinv6; |
298 | 0 | const uint32_t tinv8 = cp1 & cp4; |
299 | 0 | const uint32_t tinv9 = tinv4 & tinv8; |
300 | 0 | const uint32_t tinv10 = tinv4 ^ tinv2; |
301 | 0 | const uint32_t d1 = tinv9 ^ tinv10; |
302 | 0 | const uint32_t tinv11 = cp2 & cp3; |
303 | 0 | const uint32_t tinv12 = tinv1 & tinv11; |
304 | 0 | const uint32_t tinv13 = tinv1 ^ tinv2; |
305 | 0 | const uint32_t d3 = tinv12 ^ tinv13; |
306 | 0 | const uint32_t sd1 = d1 ^ d3; |
307 | 0 | const uint32_t sd0 = d0 ^ d2; |
308 | 0 | const uint32_t dl = d0 ^ d1; // NOLINT(misc-confusable-identifiers) |
309 | 0 | const uint32_t dh = d2 ^ d3; |
310 | 0 | const uint32_t dd = sd0 ^ sd1; |
// Second group of AND gates: the d values against the Y-derived values.
311 | 0 | const uint32_t abcd3 = dh & bh; |
312 | 0 | const uint32_t rr2 = d3 & Y4; |
313 | 0 | const uint32_t t02 = d2 & Y5; |
314 | 0 | const uint32_t abcd4 = dl & bl; |
315 | 0 | const uint32_t r4 = d1 & Y6; |
316 | 0 | const uint32_t r5 = d0 & Y7; |
317 | 0 | const uint32_t r6 = sd0 & sb0; |
318 | 0 | const uint32_t vr2 = dd & bb; |
319 | 0 | const uint32_t wr2 = sd1 & sb1; |
320 | 0 | const uint32_t abcd5 = dh & ah; |
321 | 0 | const uint32_t r7 = d3 & Y0; |
322 | 0 | const uint32_t r8 = d2 & Y1; |
323 | 0 | const uint32_t abcd6 = dl & al; |
324 | 0 | const uint32_t r9 = d1 & Y2; |
325 | 0 | const uint32_t r10 = d0 & Y3; |
326 | 0 | const uint32_t r11 = sd0 & sa0; |
327 | 0 | const uint32_t vr3 = dd & aa; |
328 | 0 | const uint32_t wr3 = sd1 & sa1; |
// Output stage: everything from here to S4 is XOR-only.
329 | 0 | const uint32_t ph12 = rr2 ^ abcd3; |
330 | 0 | const uint32_t ph02 = t02 ^ abcd3; |
331 | 0 | const uint32_t pl12 = r4 ^ abcd4; |
332 | 0 | const uint32_t pl02 = r5 ^ abcd4; |
333 | 0 | const uint32_t pr2 = vr2 ^ r6; |
334 | 0 | const uint32_t qr2 = wr2 ^ r6; |
335 | 0 | const uint32_t p0 = ph12 ^ pr2; |
336 | 0 | const uint32_t p1 = ph02 ^ qr2; |
337 | 0 | const uint32_t p2 = pl12 ^ pr2; |
338 | 0 | const uint32_t p3 = pl02 ^ qr2; |
339 | 0 | const uint32_t ph13 = r7 ^ abcd5; |
340 | 0 | const uint32_t ph03 = r8 ^ abcd5; |
341 | 0 | const uint32_t pl13 = r9 ^ abcd6; |
342 | 0 | const uint32_t pl03 = r10 ^ abcd6; |
343 | 0 | const uint32_t pr3 = vr3 ^ r11; |
344 | 0 | const uint32_t qr3 = wr3 ^ r11; |
345 | 0 | const uint32_t p4 = ph13 ^ pr3; |
346 | 0 | const uint32_t S7 = ph03 ^ qr3; |
347 | 0 | const uint32_t p6 = pl13 ^ pr3; |
348 | 0 | const uint32_t p7 = pl03 ^ qr3; |
349 | 0 | const uint32_t S3 = p1 ^ p6; |
350 | 0 | const uint32_t S6 = p2 ^ p6; |
351 | 0 | const uint32_t S0 = p3 ^ p6; |
352 | 0 | const uint32_t X11 = p0 ^ p2; |
353 | 0 | const uint32_t S5 = S0 ^ X11; |
354 | 0 | const uint32_t X13 = p4 ^ p7; |
355 | 0 | const uint32_t X14 = X11 ^ X13; |
356 | 0 | const uint32_t S1 = S3 ^ X14; |
357 | 0 | const uint32_t X16 = p1 ^ S7; |
358 | 0 | const uint32_t S2 = X14 ^ X16; |
359 | 0 | const uint32_t X18 = p0 ^ p4; |
360 | 0 | const uint32_t X19 = S5 ^ X16; |
361 | 0 | const uint32_t S4 = X18 ^ X19; |
362 | |

// Write the eight result words back over the input.
363 | 0 | V[0] = S0; |
364 | 0 | V[1] = S1; |
365 | 0 | V[2] = S2; |
366 | 0 | V[3] = S3; |
367 | 0 | V[4] = S4; |
368 | 0 | V[5] = S5; |
369 | 0 | V[6] = S6; |
370 | 0 | V[7] = S7; |
371 | 0 | } |
372 | | |
373 | 0 | inline void bit_transpose(uint32_t B[8]) { |
374 | 0 | swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1); |
375 | 0 | swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1); |
376 | 0 | swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1); |
377 | 0 | swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1); |
378 | |
|
379 | 0 | swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2); |
380 | 0 | swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2); |
381 | 0 | swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2); |
382 | 0 | swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2); |
383 | |
|
384 | 0 | swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4); |
385 | 0 | swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4); |
386 | 0 | swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4); |
387 | 0 | swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4); |
388 | 0 | } |
389 | | |
390 | 0 | inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) { |
391 | | /* |
392 | | This is bit_transpose of K[r..r+4] || K[r..r+4], we can save some computation |
393 | | due to knowing the first and second halves are the same data. |
394 | | */ |
395 | 0 | for(size_t i = 0; i != 4; ++i) { |
396 | 0 | B[i] = K[r + i]; |
397 | 0 | } |
398 | |
|
399 | 0 | swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1); |
400 | 0 | swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1); |
401 | |
|
402 | 0 | swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2); |
403 | 0 | swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2); |
404 | |
|
405 | 0 | B[4] = B[0]; |
406 | 0 | B[5] = B[1]; |
407 | 0 | B[6] = B[2]; |
408 | 0 | B[7] = B[3]; |
409 | |
|
410 | 0 | swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4); |
411 | 0 | swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4); |
412 | 0 | swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4); |
413 | 0 | swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4); |
414 | 0 | } |
415 | | |
416 | 0 | inline void shift_rows(uint32_t B[8]) { |
417 | | // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31 |
418 | 0 | if constexpr(HasNative64BitRegisters) { |
419 | 0 | for(size_t i = 0; i != 8; i += 2) { |
420 | 0 | uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1]; |
421 | 0 | x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2); |
422 | 0 | x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1); |
423 | 0 | B[i] = static_cast<uint32_t>(x >> 32); |
424 | 0 | B[i + 1] = static_cast<uint32_t>(x); |
425 | 0 | } |
426 | | } else { |
427 | | for(size_t i = 0; i != 8; ++i) { |
428 | | uint32_t x = B[i]; |
429 | | x = bit_permute_step<uint32_t>(x, 0x00223311, 2); |
430 | | x = bit_permute_step<uint32_t>(x, 0x00550055, 1); |
431 | | B[i] = x; |
432 | | } |
433 | | } |
434 | 0 | } |
435 | | |
436 | 0 | inline void inv_shift_rows(uint32_t B[8]) { |
437 | | // Inverse of shift_rows, just inverting the steps |
438 | |
|
439 | 0 | if constexpr(HasNative64BitRegisters) { |
440 | 0 | for(size_t i = 0; i != 8; i += 2) { |
441 | 0 | uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1]; |
442 | 0 | x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1); |
443 | 0 | x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2); |
444 | 0 | B[i] = static_cast<uint32_t>(x >> 32); |
445 | 0 | B[i + 1] = static_cast<uint32_t>(x); |
446 | 0 | } |
447 | | } else { |
448 | | for(size_t i = 0; i != 8; ++i) { |
449 | | uint32_t x = B[i]; |
450 | | x = bit_permute_step<uint32_t>(x, 0x00550055, 1); |
451 | | x = bit_permute_step<uint32_t>(x, 0x00223311, 2); |
452 | | B[i] = x; |
453 | | } |
454 | | } |
455 | 0 | } |
456 | | |
457 | 0 | inline void mix_columns(uint32_t B[8]) { |
458 | | // carry high bits in B[0] to positions in 0x1b == 0b11011 |
459 | 0 | const uint32_t X2[8] = { |
460 | 0 | B[1], |
461 | 0 | B[2], |
462 | 0 | B[3], |
463 | 0 | B[4] ^ B[0], |
464 | 0 | B[5] ^ B[0], |
465 | 0 | B[6], |
466 | 0 | B[7] ^ B[0], |
467 | 0 | B[0], |
468 | 0 | }; |
469 | |
|
470 | 0 | for(size_t i = 0; i != 8; i++) { |
471 | 0 | const uint32_t X3 = B[i] ^ X2[i]; |
472 | 0 | B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3); |
473 | 0 | } |
474 | 0 | } |
475 | | |
476 | 0 | void inv_mix_columns(uint32_t B[8]) { |
477 | | /* |
478 | | OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely |
479 | | matrix decomposition |
480 | | |
481 | | | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | |
482 | | | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | |
483 | | | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | |
484 | | | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | |
485 | | |
486 | | Notice the first component is simply the MixColumns matrix. So we can |
487 | | multiply first by (05,00,04,00) then perform MixColumns to get the equivalent |
488 | | of InvMixColumn. |
489 | | */ |
490 | 0 | const uint32_t X4[8] = { |
491 | 0 | B[2], |
492 | 0 | B[3], |
493 | 0 | B[4] ^ B[0], |
494 | 0 | B[5] ^ B[0] ^ B[1], |
495 | 0 | B[6] ^ B[1], |
496 | 0 | B[7] ^ B[0], |
497 | 0 | B[0] ^ B[1], |
498 | 0 | B[1], |
499 | 0 | }; |
500 | |
|
501 | 0 | for(size_t i = 0; i != 8; i++) { |
502 | 0 | const uint32_t X5 = X4[i] ^ B[i]; |
503 | 0 | B[i] = X5 ^ rotr<16>(X4[i]); |
504 | 0 | } |
505 | |
|
506 | 0 | mix_columns(B); |
507 | 0 | } |
508 | | |
509 | | /* |
510 | | * AES Encryption |
511 | | */ |
512 | 0 | void aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& EK) { |
513 | 0 | BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set"); |
514 | |
|
515 | 0 | const size_t rounds = (EK.size() - 4) / 4; |
516 | |
|
517 | 0 | uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8 |
518 | 0 | for(size_t i = 0; i < rounds - 1; i += 1) { |
519 | 0 | ks_expand(&KS[8 * i], EK.data(), 4 * i + 4); |
520 | 0 | } |
521 | |
|
522 | 0 | const size_t BLOCK_SIZE = 16; |
523 | 0 | const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE; |
524 | |
|
525 | 0 | while(blocks > 0) { |
526 | 0 | const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS); |
527 | |
|
528 | 0 | uint32_t B[8] = {0}; |
529 | |
|
530 | 0 | load_be(B, in, this_loop * 4); |
531 | |
|
532 | 0 | CT::poison(B, 8); |
533 | |
|
534 | 0 | for(size_t i = 0; i != 8; ++i) { |
535 | 0 | B[i] ^= EK[i % 4]; |
536 | 0 | } |
537 | |
|
538 | 0 | bit_transpose(B); |
539 | |
|
540 | 0 | for(size_t r = 0; r != rounds - 1; ++r) { |
541 | 0 | AES_SBOX(B); |
542 | 0 | shift_rows(B); |
543 | 0 | mix_columns(B); |
544 | |
|
545 | 0 | for(size_t i = 0; i != 8; ++i) { |
546 | 0 | B[i] ^= KS[8 * r + i]; |
547 | 0 | } |
548 | 0 | } |
549 | | |
550 | | // Final round: |
551 | 0 | AES_SBOX(B); |
552 | 0 | shift_rows(B); |
553 | 0 | bit_transpose(B); |
554 | |
|
555 | 0 | for(size_t i = 0; i != 8; ++i) { |
556 | 0 | B[i] ^= EK[4 * rounds + i % 4]; |
557 | 0 | } |
558 | |
|
559 | 0 | CT::unpoison(B, 8); |
560 | |
|
561 | 0 | copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B); |
562 | |
|
563 | 0 | in += this_loop * BLOCK_SIZE; |
564 | 0 | out += this_loop * BLOCK_SIZE; |
565 | 0 | blocks -= this_loop; |
566 | 0 | } |
567 | 0 | } |
568 | | |
569 | | /* |
570 | | * AES Decryption |
571 | | */ |
572 | 0 | void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& DK) { |
573 | 0 | BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set"); |
574 | |
|
575 | 0 | const size_t rounds = (DK.size() - 4) / 4; |
576 | |
|
577 | 0 | uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8 |
578 | 0 | for(size_t i = 0; i < rounds - 1; i += 1) { |
579 | 0 | ks_expand(&KS[8 * i], DK.data(), 4 * i + 4); |
580 | 0 | } |
581 | |
|
582 | 0 | const size_t BLOCK_SIZE = 16; |
583 | 0 | const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE; |
584 | |
|
585 | 0 | while(blocks > 0) { |
586 | 0 | const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS); |
587 | |
|
588 | 0 | uint32_t B[8] = {0}; |
589 | |
|
590 | 0 | CT::poison(B, 8); |
591 | |
|
592 | 0 | load_be(B, in, this_loop * 4); |
593 | |
|
594 | 0 | for(size_t i = 0; i != 8; ++i) { |
595 | 0 | B[i] ^= DK[i % 4]; |
596 | 0 | } |
597 | |
|
598 | 0 | bit_transpose(B); |
599 | |
|
600 | 0 | for(size_t r = 0; r != rounds - 1; ++r) { |
601 | 0 | AES_INV_SBOX(B); |
602 | 0 | inv_shift_rows(B); |
603 | 0 | inv_mix_columns(B); |
604 | |
|
605 | 0 | for(size_t i = 0; i != 8; ++i) { |
606 | 0 | B[i] ^= KS[8 * r + i]; |
607 | 0 | } |
608 | 0 | } |
609 | | |
610 | | // Final round: |
611 | 0 | AES_INV_SBOX(B); |
612 | 0 | inv_shift_rows(B); |
613 | 0 | bit_transpose(B); |
614 | |
|
615 | 0 | for(size_t i = 0; i != 8; ++i) { |
616 | 0 | B[i] ^= DK[4 * rounds + i % 4]; |
617 | 0 | } |
618 | |
|
619 | 0 | CT::unpoison(B, 8); |
620 | |
|
621 | 0 | copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B); |
622 | |
|
623 | 0 | in += this_loop * BLOCK_SIZE; |
624 | 0 | out += this_loop * BLOCK_SIZE; |
625 | 0 | blocks -= this_loop; |
626 | 0 | } |
627 | 0 | } |
628 | | |
/*
* Doubles each of the four bytes packed in s independently in GF(2^8):
* the low seven bits of every byte move up one place, and every byte whose
* top bit was set has 0x1B XORed in. Bytes do not interfere with each other
* because the top bits are masked off before the shift.
*/
inline uint32_t xtime32(uint32_t s) {
   const uint32_t reduction = 0x1B;

   const uint32_t shifted = (s & 0x7F7F7F7F) << 1;
   // One flag bit (value 0 or 1) per byte, taken from that byte's top bit
   const uint32_t overflow = (s >> 7) & 0x01010101;

   return shifted ^ (overflow * reduction);
}
636 | | |
637 | 0 | inline uint32_t InvMixColumn(uint32_t s1) { |
638 | 0 | const uint32_t s2 = xtime32(s1); |
639 | 0 | const uint32_t s4 = xtime32(s2); |
640 | 0 | const uint32_t s8 = xtime32(s4); |
641 | 0 | const uint32_t s9 = s8 ^ s1; |
642 | 0 | const uint32_t s11 = s9 ^ s2; |
643 | 0 | const uint32_t s13 = s9 ^ s4; |
644 | 0 | const uint32_t s14 = s8 ^ s4 ^ s2; |
645 | |
|
646 | 0 | return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11); |
647 | 0 | } |
648 | | |
649 | 0 | void InvMixColumn_x4(uint32_t x[4]) { |
650 | 0 | x[0] = InvMixColumn(x[0]); |
651 | 0 | x[1] = InvMixColumn(x[1]); |
652 | 0 | x[2] = InvMixColumn(x[2]); |
653 | 0 | x[3] = InvMixColumn(x[3]); |
654 | 0 | } |
655 | | |
656 | 0 | uint32_t SE_word(uint32_t x) { |
657 | 0 | uint32_t I[8] = {0}; |
658 | |
|
659 | 0 | for(size_t i = 0; i != 8; ++i) { |
660 | 0 | I[i] = (x >> (7 - i)) & 0x01010101; |
661 | 0 | } |
662 | |
|
663 | 0 | AES_SBOX(I); |
664 | |
|
665 | 0 | x = 0; |
666 | |
|
667 | 0 | for(size_t i = 0; i != 8; ++i) { |
668 | 0 | x |= ((I[i] & 0x01010101) << (7 - i)); |
669 | 0 | } |
670 | |
|
671 | 0 | return x; |
672 | 0 | } |
673 | | |
674 | | void aes_key_schedule(const uint8_t key[], |
675 | | size_t length, |
676 | | secure_vector<uint32_t>& EK, |
677 | | secure_vector<uint32_t>& DK, |
678 | 0 | bool bswap_keys = false) { |
679 | 0 | static const uint32_t RC[10] = {0x01000000, |
680 | 0 | 0x02000000, |
681 | 0 | 0x04000000, |
682 | 0 | 0x08000000, |
683 | 0 | 0x10000000, |
684 | 0 | 0x20000000, |
685 | 0 | 0x40000000, |
686 | 0 | 0x80000000, |
687 | 0 | 0x1B000000, |
688 | 0 | 0x36000000}; |
689 | |
|
690 | 0 | const size_t X = length / 4; |
691 | | |
692 | | // Can't happen, but make static analyzers happy |
693 | 0 | BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8); |
694 | |
|
695 | 0 | const size_t rounds = (length / 4) + 6; |
696 | | |
697 | | // Help the optimizer |
698 | 0 | BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14); |
699 | |
|
700 | 0 | CT::poison(key, length); |
701 | |
|
702 | 0 | const size_t KS_len = length + 28; |
703 | 0 | EK.resize(KS_len); |
704 | 0 | DK.resize(KS_len); |
705 | |
|
706 | 0 | for(size_t i = 0; i != X; ++i) { |
707 | 0 | EK[i] = load_be<uint32_t>(key, i); |
708 | 0 | } |
709 | |
|
710 | 0 | for(size_t i = X; i < 4 * (rounds + 1); i += X) { |
711 | 0 | EK[i] = EK[i - X] ^ RC[(i - X) / X] ^ rotl<8>(SE_word(EK[i - 1])); |
712 | |
|
713 | 0 | for(size_t j = 1; j != X && (i + j) < EK.size(); ++j) { |
714 | 0 | EK[i + j] = EK[i + j - X]; |
715 | |
|
716 | 0 | if(X == 8 && j == 4) { |
717 | 0 | EK[i + j] ^= SE_word(EK[i + j - 1]); |
718 | 0 | } else { |
719 | 0 | EK[i + j] ^= EK[i + j - 1]; |
720 | 0 | } |
721 | 0 | } |
722 | 0 | } |
723 | |
|
724 | 0 | for(size_t i = 0; i != 4 * (rounds + 1); i += 4) { |
725 | 0 | DK[i] = EK[4 * rounds - i]; |
726 | 0 | DK[i + 1] = EK[4 * rounds - i + 1]; |
727 | 0 | DK[i + 2] = EK[4 * rounds - i + 2]; |
728 | 0 | DK[i + 3] = EK[4 * rounds - i + 3]; |
729 | 0 | } |
730 | |
|
731 | 0 | for(size_t i = 4; i != 4 * rounds; i += 4) { |
732 | 0 | InvMixColumn_x4(&DK[i]); |
733 | 0 | } |
734 | |
|
735 | 0 | if(bswap_keys) { |
736 | | // HW AES on little endian needs the subkeys to be byte reversed |
737 | 0 | for(size_t i = 0; i != KS_len; ++i) { |
738 | 0 | EK[i] = reverse_bytes(EK[i]); |
739 | 0 | DK[i] = reverse_bytes(DK[i]); |
740 | 0 | } |
741 | 0 | } |
742 | |
|
743 | 0 | CT::unpoison(EK.data(), EK.size()); |
744 | 0 | CT::unpoison(DK.data(), DK.size()); |
745 | 0 | CT::unpoison(key, length); |
746 | 0 | } |
747 | | |
// Returns the block-parallelism figure for the implementation that the
// dispatch order below selects: 8 for VAES, 4 for hardware AES, and 2 for
// both the vperm and the bitsliced fallback. Each check is only compiled in
// when the corresponding BOTAN_HAS_* macro is defined.
748 | 0 | size_t aes_parallelism() { |
749 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
750 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
751 | 0 | return 8; // pipelined |
752 | 0 | } |
753 | 0 | #endif |
754 | | |
755 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
756 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
757 | 0 | return 4; // pipelined |
758 | 0 | } |
759 | 0 | #endif |
760 | | |
761 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
762 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
763 | 0 | return 2; // pipelined |
764 | 0 | } |
765 | 0 | #endif |
766 | | |
767 | | // bitsliced: |
768 | 0 | return 2; |
769 | 0 | } |
770 | | |
// Returns a name for the AES implementation in use. The checks run in the
// order VAES, hardware AES, vperm; the first CPUID::check that yields a value
// supplies the returned string, and "base" is returned when none does.
771 | 0 | std::string aes_provider() { |
772 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
773 | 0 | if(auto feat = CPUID::check(CPUID::Feature::AVX2_AES)) { |
774 | 0 | return *feat; |
775 | 0 | } |
776 | 0 | #endif |
777 | | |
778 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
779 | 0 | if(auto feat = CPUID::check(CPUID::Feature::HW_AES)) { |
780 | 0 | return *feat; |
781 | 0 | } |
782 | 0 | #endif |
783 | | |
784 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
785 | 0 | if(auto feat = CPUID::check(CPUID::Feature::SIMD_4X32)) { |
786 | 0 | return *feat; |
787 | 0 | } |
788 | 0 | #endif |
789 | | |
790 | 0 | return "base"; |
791 | 0 | } |
792 | | |
793 | | } // namespace |
794 | | |
// Forwards to aes_provider(); the result does not depend on the key size.
795 | 0 | std::string AES_128::provider() const { |
796 | 0 | return aes_provider(); |
797 | 0 | } |
798 | | |
// Forwards to aes_provider(); the result does not depend on the key size.
799 | 0 | std::string AES_192::provider() const { |
800 | 0 | return aes_provider(); |
801 | 0 | } |
802 | | |
// Forwards to aes_provider(); the result does not depend on the key size.
803 | 0 | std::string AES_256::provider() const { |
804 | 0 | return aes_provider(); |
805 | 0 | } |
806 | | |
// Forwards to aes_parallelism(); the result does not depend on the key size.
807 | 0 | size_t AES_128::parallelism() const { |
808 | 0 | return aes_parallelism(); |
809 | 0 | } |
810 | | |
// Forwards to aes_parallelism(); the result does not depend on the key size.
811 | 0 | size_t AES_192::parallelism() const { |
812 | 0 | return aes_parallelism(); |
813 | 0 | } |
814 | | |
// Forwards to aes_parallelism(); the result does not depend on the key size.
815 | 0 | size_t AES_256::parallelism() const { |
816 | 0 | return aes_parallelism(); |
817 | 0 | } |
818 | | |
// A key counts as set when the encryption key schedule m_EK is non-empty.
819 | 0 | bool AES_128::has_keying_material() const { |
820 | 0 | return !m_EK.empty(); |
821 | 0 | } |
822 | | |
// A key counts as set when the encryption key schedule m_EK is non-empty.
823 | 0 | bool AES_192::has_keying_material() const { |
824 | 0 | return !m_EK.empty(); |
825 | 0 | } |
826 | | |
// A key counts as set when the encryption key schedule m_EK is non-empty.
827 | 0 | bool AES_256::has_keying_material() const { |
828 | 0 | return !m_EK.empty(); |
829 | 0 | } |
830 | | |
// Encrypts `blocks` blocks from `in` to `out`. After checking that a key has
// been set, the first available implementation is used, tried in the order
// VAES, hardware AES, vperm; otherwise the bitsliced aes_encrypt_n runs with
// m_EK. Each accelerated path is only compiled in when its BOTAN_HAS_* macro
// is defined.
831 | 0 | void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
832 | 0 | assert_key_material_set(); |
833 | |

834 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
835 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
836 | 0 | return x86_vaes_encrypt_n(in, out, blocks); |
837 | 0 | } |
838 | 0 | #endif |
839 | | |
840 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
841 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
842 | 0 | return hw_aes_encrypt_n(in, out, blocks); |
843 | 0 | } |
844 | 0 | #endif |
845 | | |
846 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
847 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
848 | 0 | return vperm_encrypt_n(in, out, blocks); |
849 | 0 | } |
850 | 0 | #endif |
851 | | |
852 | 0 | aes_encrypt_n(in, out, blocks, m_EK); |
853 | 0 | } |
854 | | |
// Decrypts `blocks` blocks from `in` to `out`. After checking that a key has
// been set, the first available implementation is used, tried in the order
// VAES, hardware AES, vperm; otherwise the bitsliced aes_decrypt_n runs with
// m_DK. Each accelerated path is only compiled in when its BOTAN_HAS_* macro
// is defined.
855 | 0 | void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
856 | 0 | assert_key_material_set(); |
857 | |

858 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
859 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
860 | 0 | return x86_vaes_decrypt_n(in, out, blocks); |
861 | 0 | } |
862 | 0 | #endif |
863 | | |
864 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
865 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
866 | 0 | return hw_aes_decrypt_n(in, out, blocks); |
867 | 0 | } |
868 | 0 | #endif |
869 | | |
870 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
871 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
872 | 0 | return vperm_decrypt_n(in, out, blocks); |
873 | 0 | } |
874 | 0 | #endif |
875 | | |
876 | 0 | aes_decrypt_n(in, out, blocks, m_DK); |
877 | 0 | } |
878 | | |
879 | 0 | void AES_128::key_schedule(std::span<const uint8_t> key) { |
880 | 0 | #if defined(BOTAN_HAS_AES_NI) |
881 | 0 | if(CPUID::has(CPUID::Feature::AESNI)) { |
882 | 0 | return aesni_key_schedule(key.data(), key.size()); |
883 | 0 | } |
884 | 0 | #endif |
885 | | |
886 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
887 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
888 | 0 | return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, true); |
889 | 0 | } |
890 | 0 | #endif |
891 | | |
892 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
893 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
894 | 0 | constexpr bool is_little_endian = std::endian::native == std::endian::little; |
895 | 0 | return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, is_little_endian); |
896 | 0 | } |
897 | 0 | #endif |
898 | | |
899 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
900 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
901 | 0 | return vperm_key_schedule(key.data(), key.size()); |
902 | 0 | } |
903 | 0 | #endif |
904 | | |
905 | 0 | aes_key_schedule(key.data(), key.size(), m_EK, m_DK); |
906 | 0 | } |
907 | | |
908 | 0 | void AES_128::clear() { |
909 | 0 | zap(m_EK); |
910 | 0 | zap(m_DK); |
911 | 0 | } |
912 | | |
913 | 0 | void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
914 | 0 | assert_key_material_set(); |
915 | |
|
916 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
917 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
918 | 0 | return x86_vaes_encrypt_n(in, out, blocks); |
919 | 0 | } |
920 | 0 | #endif |
921 | | |
922 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
923 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
924 | 0 | return hw_aes_encrypt_n(in, out, blocks); |
925 | 0 | } |
926 | 0 | #endif |
927 | | |
928 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
929 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
930 | 0 | return vperm_encrypt_n(in, out, blocks); |
931 | 0 | } |
932 | 0 | #endif |
933 | | |
934 | 0 | aes_encrypt_n(in, out, blocks, m_EK); |
935 | 0 | } |
936 | | |
937 | 0 | void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
938 | 0 | assert_key_material_set(); |
939 | |
|
940 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
941 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
942 | 0 | return x86_vaes_decrypt_n(in, out, blocks); |
943 | 0 | } |
944 | 0 | #endif |
945 | | |
946 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
947 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
948 | 0 | return hw_aes_decrypt_n(in, out, blocks); |
949 | 0 | } |
950 | 0 | #endif |
951 | | |
952 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
953 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
954 | 0 | return vperm_decrypt_n(in, out, blocks); |
955 | 0 | } |
956 | 0 | #endif |
957 | | |
958 | 0 | aes_decrypt_n(in, out, blocks, m_DK); |
959 | 0 | } |
960 | | |
961 | 0 | void AES_192::key_schedule(std::span<const uint8_t> key) { |
962 | 0 | #if defined(BOTAN_HAS_AES_NI) |
963 | 0 | if(CPUID::has(CPUID::Feature::AESNI)) { |
964 | 0 | return aesni_key_schedule(key.data(), key.size()); |
965 | 0 | } |
966 | 0 | #endif |
967 | | |
968 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
969 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
970 | 0 | return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, true); |
971 | 0 | } |
972 | 0 | #endif |
973 | | |
974 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
975 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
976 | 0 | constexpr bool is_little_endian = std::endian::native == std::endian::little; |
977 | 0 | return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, is_little_endian); |
978 | 0 | } |
979 | 0 | #endif |
980 | | |
981 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
982 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
983 | 0 | return vperm_key_schedule(key.data(), key.size()); |
984 | 0 | } |
985 | 0 | #endif |
986 | | |
987 | 0 | aes_key_schedule(key.data(), key.size(), m_EK, m_DK); |
988 | 0 | } |
989 | | |
990 | 0 | void AES_192::clear() { |
991 | 0 | zap(m_EK); |
992 | 0 | zap(m_DK); |
993 | 0 | } |
994 | | |
995 | 0 | void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
996 | 0 | assert_key_material_set(); |
997 | |
|
998 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
999 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
1000 | 0 | return x86_vaes_encrypt_n(in, out, blocks); |
1001 | 0 | } |
1002 | 0 | #endif |
1003 | | |
1004 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
1005 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
1006 | 0 | return hw_aes_encrypt_n(in, out, blocks); |
1007 | 0 | } |
1008 | 0 | #endif |
1009 | | |
1010 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
1011 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
1012 | 0 | return vperm_encrypt_n(in, out, blocks); |
1013 | 0 | } |
1014 | 0 | #endif |
1015 | | |
1016 | 0 | aes_encrypt_n(in, out, blocks, m_EK); |
1017 | 0 | } |
1018 | | |
1019 | 0 | void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { |
1020 | 0 | assert_key_material_set(); |
1021 | |
|
1022 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
1023 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
1024 | 0 | return x86_vaes_decrypt_n(in, out, blocks); |
1025 | 0 | } |
1026 | 0 | #endif |
1027 | | |
1028 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
1029 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
1030 | 0 | return hw_aes_decrypt_n(in, out, blocks); |
1031 | 0 | } |
1032 | 0 | #endif |
1033 | | |
1034 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
1035 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
1036 | 0 | return vperm_decrypt_n(in, out, blocks); |
1037 | 0 | } |
1038 | 0 | #endif |
1039 | | |
1040 | 0 | aes_decrypt_n(in, out, blocks, m_DK); |
1041 | 0 | } |
1042 | | |
1043 | 0 | void AES_256::key_schedule(std::span<const uint8_t> key) { |
1044 | 0 | #if defined(BOTAN_HAS_AES_NI) |
1045 | 0 | if(CPUID::has(CPUID::Feature::AESNI)) { |
1046 | 0 | return aesni_key_schedule(key.data(), key.size()); |
1047 | 0 | } |
1048 | 0 | #endif |
1049 | | |
1050 | 0 | #if defined(BOTAN_HAS_AES_VAES) |
1051 | 0 | if(CPUID::has(CPUID::Feature::AVX2_AES)) { |
1052 | 0 | return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, true); |
1053 | 0 | } |
1054 | 0 | #endif |
1055 | | |
1056 | 0 | #if defined(BOTAN_HAS_HW_AES_SUPPORT) |
1057 | 0 | if(CPUID::has(CPUID::Feature::HW_AES)) { |
1058 | 0 | constexpr bool is_little_endian = std::endian::native == std::endian::little; |
1059 | 0 | return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, is_little_endian); |
1060 | 0 | } |
1061 | 0 | #endif |
1062 | | |
1063 | 0 | #if defined(BOTAN_HAS_AES_VPERM) |
1064 | 0 | if(CPUID::has(CPUID::Feature::SIMD_4X32)) { |
1065 | 0 | return vperm_key_schedule(key.data(), key.size()); |
1066 | 0 | } |
1067 | 0 | #endif |
1068 | | |
1069 | 0 | aes_key_schedule(key.data(), key.size(), m_EK, m_DK); |
1070 | 0 | } |
1071 | | |
1072 | 0 | void AES_256::clear() { |
1073 | 0 | zap(m_EK); |
1074 | 0 | zap(m_DK); |
1075 | 0 | } |
1076 | | |
1077 | | } // namespace Botan |