/src/boringssl/crypto/fipsmodule/aes/aes_nohw.c.inc
Line | Count | Source |
1 | | /* Copyright (c) 2019, Google Inc. |
2 | | * |
3 | | * Permission to use, copy, modify, and/or distribute this software for any |
4 | | * purpose with or without fee is hereby granted, provided that the above |
5 | | * copyright notice and this permission notice appear in all copies. |
6 | | * |
7 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
8 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
9 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
10 | | * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
11 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
12 | | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
13 | | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ |
14 | | |
15 | | #include <openssl/aes.h> |
16 | | |
17 | | #include <assert.h> |
18 | | #include <string.h> |
19 | | |
20 | | #include "../../internal.h" |
21 | | #include "internal.h" |
22 | | |
23 | | #if defined(OPENSSL_SSE2) |
24 | | #include <emmintrin.h> |
25 | | #endif |
26 | | |
27 | | |
28 | | // This file contains a constant-time implementation of AES, bitsliced with |
29 | | // 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block |
30 | | // batches, respectively. The 128-bit implementation requires SSE2 intrinsics. |
31 | | // |
32 | | // This implementation is based on the algorithms described in the following |
33 | | // references: |
34 | | // - https://bearssl.org/constanttime.html#aes |
35 | | // - https://eprint.iacr.org/2009/129.pdf |
36 | | // - https://eprint.iacr.org/2009/191.pdf |
37 | | |
38 | | |
39 | | // Word operations. |
40 | | // |
41 | | // An aes_word_t is the word used for this AES implementation. Throughout this |
42 | | // file, bits and bytes are ordered little-endian, though "left" and "right" |
43 | | // shifts match the operations themselves, which makes them reversed in a |
44 | | // little-endian, left-to-right reading. |
45 | | // |
46 | | // Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an |
47 | | // |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| |
48 | | // bits each, each corresponding to a byte in an AES block in column-major |
49 | | // order (AES's byte order). We refer to these as "logical bytes". Note, in the |
50 | | // 32-bit and 64-bit implementations, they are smaller than a byte. (The |
51 | | // contents of a logical byte will be described later.) |
52 | | // |
53 | | // MSVC does not support C bit operators on |__m128i|, so the wrapper functions |
54 | | // |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and |
55 | | // |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift |
56 | | // value ranges from 0 to 15 independent of |aes_word_t| and |
57 | | // |AES_NOHW_BATCH_SIZE|. |
58 | | // |
59 | | // This ordering is different from https://eprint.iacr.org/2009/129.pdf, which |
60 | | // uses row-major order. Matching the AES order was easier to reason about, and |
61 | | // we do not have PSHUFB available to arbitrarily permute bytes. |
62 | | |
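| | // For example (illustrative only): with 32-bit words, |AES_NOHW_BATCH_SIZE| |
| | // is 2, so a logical byte is two bits and |aes_nohw_shift_left(a, 3)| |
| | // evaluates to |a << 6|. With SSE2, a logical byte is a full byte, and the |
| | // same call expands to |_mm_slli_si128(a, 3)|. |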
63 | | #if defined(OPENSSL_SSE2) |
64 | | typedef __m128i aes_word_t; |
65 | | // AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in |
66 | | // MSVC, so we define a constant. |
67 | | #define AES_NOHW_WORD_SIZE 16 |
68 | 4.18M | #define AES_NOHW_BATCH_SIZE 8 |
69 | | #define AES_NOHW_ROW0_MASK \ |
70 | 3.34M | _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff) |
71 | | #define AES_NOHW_ROW1_MASK \ |
72 | 3.34M | _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00) |
73 | | #define AES_NOHW_ROW2_MASK \ |
74 | 3.34M | _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) |
75 | | #define AES_NOHW_ROW3_MASK \ |
76 | 3.34M | _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000) |
77 | | #define AES_NOHW_COL01_MASK \ |
78 | 8 | _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff) |
79 | | #define AES_NOHW_COL2_MASK \ |
80 | 8 | _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000) |
81 | | #define AES_NOHW_COL3_MASK \ |
82 | 8 | _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000) |
83 | | |
84 | 29.9M | static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { |
85 | 29.9M | return _mm_and_si128(a, b); |
86 | 29.9M | } |
87 | | |
88 | 20.0M | static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { |
89 | 20.0M | return _mm_or_si128(a, b); |
90 | 20.0M | } |
91 | | |
92 | 57.7M | static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { |
93 | 57.7M | return _mm_xor_si128(a, b); |
94 | 57.7M | } |
95 | | |
96 | 2.08M | static inline aes_word_t aes_nohw_not(aes_word_t a) { |
97 | 2.08M | return _mm_xor_si128( |
98 | 2.08M | a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)); |
99 | 2.08M | } |
100 | | |
101 | | // These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128| |
102 | | // must be constants. |
103 | | #define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \ |
104 | 10.3M | _mm_slli_si128((a), (i)) |
105 | | #define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \ |
106 | 10.1M | _mm_srli_si128((a), (i)) |
107 | | #else // !OPENSSL_SSE2 |
108 | | #if defined(OPENSSL_64_BIT) |
109 | | typedef uint64_t aes_word_t; |
110 | | #define AES_NOHW_WORD_SIZE 8 |
111 | | #define AES_NOHW_BATCH_SIZE 4 |
112 | | #define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) |
113 | | #define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) |
114 | | #define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) |
115 | | #define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) |
116 | | #define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff) |
117 | | #define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000) |
118 | | #define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000) |
119 | | #else // !OPENSSL_64_BIT |
120 | | typedef uint32_t aes_word_t; |
121 | | #define AES_NOHW_WORD_SIZE 4 |
122 | | #define AES_NOHW_BATCH_SIZE 2 |
123 | | #define AES_NOHW_ROW0_MASK 0x03030303 |
124 | | #define AES_NOHW_ROW1_MASK 0x0c0c0c0c |
125 | | #define AES_NOHW_ROW2_MASK 0x30303030 |
126 | | #define AES_NOHW_ROW3_MASK 0xc0c0c0c0 |
127 | | #define AES_NOHW_COL01_MASK 0x0000ffff |
128 | | #define AES_NOHW_COL2_MASK 0x00ff0000 |
129 | | #define AES_NOHW_COL3_MASK 0xff000000 |
130 | | #endif // OPENSSL_64_BIT |
131 | | |
132 | | static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { |
133 | | return a & b; |
134 | | } |
135 | | |
136 | | static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { |
137 | | return a | b; |
138 | | } |
139 | | |
140 | | static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { |
141 | | return a ^ b; |
142 | | } |
143 | | |
144 | | static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } |
145 | | |
146 | | static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { |
147 | | return a << (i * AES_NOHW_BATCH_SIZE); |
148 | | } |
149 | | |
150 | | static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { |
151 | | return a >> (i * AES_NOHW_BATCH_SIZE); |
152 | | } |
153 | | #endif // OPENSSL_SSE2 |
154 | | |
155 | | static_assert(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), |
156 | | "batch size does not match word size"); |
157 | | static_assert(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), |
158 | | "AES_NOHW_WORD_SIZE is incorrect"); |
159 | | |
160 | | |
161 | | // Block representations. |
162 | | // |
163 | | // This implementation uses three representations for AES blocks. First, the |
164 | | // public API represents blocks as uint8_t[16] in the usual way. Second, most |
165 | | // AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. |
166 | | // This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words |
167 | | // containing bitsliced blocks a, b, c, d, this would be as follows (vertical |
168 | | // bars divide logical bytes): |
169 | | // |
170 | | // batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... |
171 | | // batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... |
172 | | // batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... |
173 | | // batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... |
174 | | // ... |
175 | | // |
176 | | // Finally, an individual block may be stored as an intermediate form in an |
177 | | // aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each |
178 | | // block, so that block[0]'s ith logical byte contains least-significant |
179 | | // |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of |
180 | | // |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as |
181 | | // "compacting" the block. Note this is no-op with 128-bit words because then |
182 | | // |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit |
183 | | // words, one block would be stored in two words: |
184 | | // |
185 | | // block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... |
186 | | // block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... |
187 | | // |
188 | | // Observe that the distances between corresponding bits in bitsliced and |
189 | | // compact bit orders match. If we line up corresponding words of each block, |
190 | | // the bitsliced and compact representations may be converted by transposing bits |
191 | | // in corresponding logical bytes. Continuing the 64-bit example: |
192 | | // |
193 | | // block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... |
194 | | // block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... |
195 | | // block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... |
196 | | // block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... |
197 | | // |
198 | | // batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... |
199 | | // batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... |
200 | | // batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... |
201 | | // batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... |
202 | | // |
203 | | // Note also that bitwise operations and (logical) byte permutations on an |
204 | | // |aes_word_t| work equally for the bitsliced and compact words. |
205 | | // |
206 | | // We use the compact form in the |AES_KEY| representation to save work |
207 | | // inflating round keys into |AES_NOHW_BATCH|. The compact form also exists |
208 | | // temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately |
209 | | // before or after |aes_nohw_transpose|. |
210 | | |
211 | 200k | #define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) |
212 | | |
213 | | // An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise |
214 | | // specified, it is in bitsliced form. |
215 | | typedef struct { |
216 | | aes_word_t w[8]; |
217 | | } AES_NOHW_BATCH; |
218 | | |
219 | | // An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is |
220 | | // suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH_SIZE| |
221 | | // |AES_KEY|s so it should not be used as a long-term key representation. |
222 | | typedef struct { |
223 | | // keys is an array of batches, one for each round key. Each batch stores |
224 | | // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. |
225 | | AES_NOHW_BATCH keys[AES_MAXNR + 1]; |
226 | | } AES_NOHW_SCHEDULE; |
227 | | |
228 | | // aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in |
229 | | // compact form. |
230 | | static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, |
231 | | const aes_word_t in[AES_NOHW_BLOCK_WORDS], |
232 | 3.69M | size_t i) { |
233 | | // Note the words are interleaved. The order comes from |aes_nohw_transpose|. |
234 | | // If |i| is zero and this is the 64-bit implementation, in[0] contains bits |
235 | | // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at |
236 | | // w[4] so that bits 0 and 4 are in the correct position. (In general, bits |
237 | | // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares |
238 | | // will be correctly placed.) |
239 | 3.69M | assert(i < AES_NOHW_BATCH_SIZE); |
240 | 3.69M | #if defined(OPENSSL_SSE2) |
241 | 3.69M | batch->w[i] = in[0]; |
242 | | #elif defined(OPENSSL_64_BIT) |
243 | | batch->w[i] = in[0]; |
244 | | batch->w[i + 4] = in[1]; |
245 | | #else |
246 | | batch->w[i] = in[0]; |
247 | | batch->w[i + 2] = in[1]; |
248 | | batch->w[i + 4] = in[2]; |
249 | | batch->w[i + 6] = in[3]; |
250 | | #endif |
251 | 3.69M | } |
252 | | |
253 | | // aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in |
254 | | // compact form. |
255 | | static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, |
256 | | aes_word_t out[AES_NOHW_BLOCK_WORDS], |
257 | 140k | size_t i) { |
258 | 140k | assert(i < AES_NOHW_BATCH_SIZE); |
259 | 140k | #if defined(OPENSSL_SSE2) |
260 | 140k | out[0] = batch->w[i]; |
261 | | #elif defined(OPENSSL_64_BIT) |
262 | | out[0] = batch->w[i]; |
263 | | out[1] = batch->w[i + 4]; |
264 | | #else |
265 | | out[0] = batch->w[i]; |
266 | | out[1] = batch->w[i + 2]; |
267 | | out[2] = batch->w[i + 4]; |
268 | | out[3] = batch->w[i + 6]; |
269 | | #endif |
270 | 140k | } |
271 | | |
272 | | #if !defined(OPENSSL_SSE2) |
273 | | // aes_nohw_delta_swap returns |a| with bits |a & mask| and |
274 | | // |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. |
275 | | static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, |
276 | | aes_word_t shift) { |
277 | | // See |
278 | | // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ |
279 | | aes_word_t b = (a ^ (a >> shift)) & mask; |
280 | | return a ^ b ^ (b << shift); |
281 | | } |
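| | // A worked example (illustrative only): with |mask| = 0x0f and |shift| = 4, |
| | // the two nibbles of the low byte swap. For a = 0xab: |
| | // b = (0xab ^ (0xab >> 4)) & 0x0f = 0x01, and |
| | // a ^ b ^ (b << 4) = 0xab ^ 0x01 ^ 0x10 = 0xba. |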
282 | | |
283 | | // In the 32-bit and 64-bit implementations, a block spans multiple words. |
284 | | // |aes_nohw_compact_block| must permute bits across different words. First we |
285 | | // implement |aes_nohw_compact_word| which performs a smaller version of the |
286 | | // transformation which stays within a single word. |
287 | | // |
288 | | // These transformations are generalizations of the output of |
289 | | // http://programming.sirrida.de/calcperm.php on smaller inputs. |
290 | | #if defined(OPENSSL_64_BIT) |
291 | | static inline uint64_t aes_nohw_compact_word(uint64_t a) { |
292 | | // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap |
293 | | // quartets of those chunks: |
294 | | // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => |
295 | | // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 |
296 | | a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); |
297 | | // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): |
298 | | // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => |
299 | | // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 |
300 | | a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); |
301 | | // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): |
302 | | // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => |
303 | | // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 |
304 | | a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); |
305 | | return a; |
306 | | } |
307 | | |
308 | | static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { |
309 | | // Reverse the steps of |aes_nohw_compact_word|. |
310 | | a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); |
311 | | a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); |
312 | | a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); |
313 | | return a; |
314 | | } |
315 | | #else // !OPENSSL_64_BIT |
316 | | static inline uint32_t aes_nohw_compact_word(uint32_t a) { |
317 | | // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: |
318 | | // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => |
319 | | // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 |
320 | | // Note: 0x00cc = 0b0000_0000_1100_1100 |
321 | | // 0x00cc << 6 = 0b0011_0011_0000_0000 |
322 | | a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); |
323 | | // Now we swap groups of four bits (still numbering by pairs): |
324 | | // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => |
325 | | // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 |
326 | | // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 |
327 | | a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); |
328 | | return a; |
329 | | } |
330 | | |
331 | | static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { |
332 | | // Reverse the steps of |aes_nohw_compact_word|. |
333 | | a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); |
334 | | a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); |
335 | | return a; |
336 | | } |
337 | | |
338 | | static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, |
339 | | uint8_t a2, uint8_t a3) { |
340 | | return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | |
341 | | ((uint32_t)a3 << 24); |
342 | | } |
343 | | #endif // OPENSSL_64_BIT |
344 | | #endif // !OPENSSL_SSE2 |
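| | // In both widths, |aes_nohw_uncompact_word| exactly undoes |
| | // |aes_nohw_compact_word|: each delta swap is an involution, so replaying the |
| | // same swaps in reverse order restores the input, i.e. |
| | // aes_nohw_uncompact_word(aes_nohw_compact_word(a)) == a for all |a|. |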
345 | | |
346 | | static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], |
347 | 55.6k | const uint8_t in[16]) { |
348 | 55.6k | memcpy(out, in, 16); |
349 | 55.6k | #if defined(OPENSSL_SSE2) |
350 | | // No conversions needed. |
351 | | #elif defined(OPENSSL_64_BIT) |
352 | | uint64_t a0 = aes_nohw_compact_word(out[0]); |
353 | | uint64_t a1 = aes_nohw_compact_word(out[1]); |
354 | | out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); |
355 | | out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); |
356 | | #else |
357 | | uint32_t a0 = aes_nohw_compact_word(out[0]); |
358 | | uint32_t a1 = aes_nohw_compact_word(out[1]); |
359 | | uint32_t a2 = aes_nohw_compact_word(out[2]); |
360 | | uint32_t a3 = aes_nohw_compact_word(out[3]); |
361 | | // Note clang, when building for ARM Thumb2, will sometimes miscompile |
362 | | // expressions such as (a0 & 0x0000ff00) << 8, particularly when building |
363 | | // without optimizations. This bug was introduced in |
364 | | // https://reviews.llvm.org/rL340261 and fixed in |
365 | | // https://reviews.llvm.org/rL351310. The following is written to avoid this. |
366 | | out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3); |
367 | | out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8); |
368 | | out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16); |
369 | | out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24); |
370 | | #endif |
371 | 55.6k | } |
372 | | |
373 | | static inline void aes_nohw_uncompact_block( |
374 | 40.7k | uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { |
375 | 40.7k | #if defined(OPENSSL_SSE2) |
376 | 40.7k | memcpy(out, in, 16); // No conversions needed. |
377 | | #elif defined(OPENSSL_64_BIT) |
378 | | uint64_t a0 = in[0]; |
379 | | uint64_t a1 = in[1]; |
380 | | uint64_t b0 = |
381 | | aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); |
382 | | uint64_t b1 = |
383 | | aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); |
384 | | memcpy(out, &b0, 8); |
385 | | memcpy(out + 8, &b1, 8); |
386 | | #else |
387 | | uint32_t a0 = in[0]; |
388 | | uint32_t a1 = in[1]; |
389 | | uint32_t a2 = in[2]; |
390 | | uint32_t a3 = in[3]; |
391 | | // Note clang, when building for ARM Thumb2, will sometimes miscompile |
392 | | // expressions such as (a0 & 0x0000ff00) << 8, particularly when building |
393 | | // without optimizations. This bug was introduced in |
394 | | // https://reviews.llvm.org/rL340261 and fixed in |
395 | | // https://reviews.llvm.org/rL351310. The following is written to avoid this. |
396 | | uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3); |
397 | | uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8); |
398 | | uint32_t b2 = |
399 | | aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16); |
400 | | uint32_t b3 = |
401 | | aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24); |
402 | | b0 = aes_nohw_uncompact_word(b0); |
403 | | b1 = aes_nohw_uncompact_word(b1); |
404 | | b2 = aes_nohw_uncompact_word(b2); |
405 | | b3 = aes_nohw_uncompact_word(b3); |
406 | | memcpy(out, &b0, 4); |
407 | | memcpy(out + 4, &b1, 4); |
408 | | memcpy(out + 8, &b2, 4); |
409 | | memcpy(out + 12, &b3, 4); |
410 | | #endif |
411 | 40.7k | } |
412 | | |
413 | | // aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in |
414 | | // |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and |
415 | | // |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it |
416 | | // is repeated to the full width of |aes_word_t|. |
417 | | #if defined(OPENSSL_SSE2) |
418 | | // This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require |
419 | | // constant shift values. |
420 | | #define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \ |
421 | | /* uint32_t */ mask, /* const */ shift) \ |
422 | 8.47M | do { \ |
423 | 8.47M | __m128i swap = \ |
424 | 8.47M | _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \ |
425 | 8.47M | _mm_set_epi32((mask), (mask), (mask), (mask))); \ |
426 | 8.47M | *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \ |
427 | 8.47M | *(b) = _mm_xor_si128(*(b), swap); \ |
429 | 8.47M | } while (0) |
430 | | #else |
431 | | static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, |
432 | | uint32_t mask, aes_word_t shift) { |
433 | | #if defined(OPENSSL_64_BIT) |
434 | | aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; |
435 | | #else |
436 | | aes_word_t mask_w = mask; |
437 | | #endif |
438 | | // This is a variation on a delta swap. |
439 | | aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; |
440 | | *a ^= swap << shift; |
441 | | *b ^= swap; |
442 | | } |
443 | | #endif // OPENSSL_SSE2 |
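| | // A tiny worked example (illustrative only): with |mask| = 1 and |shift| = 1, |
| | // bit 1 of |*a| swaps with bit 0 of |*b|. For *a = 0b10 and *b = 0b00: |
| | // swap = ((0b10 >> 1) ^ 0b00) & 1 = 1, so *a becomes 0b00 and *b becomes |
| | // 0b01. |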
444 | | |
445 | | // aes_nohw_transpose converts |batch| to and from bitsliced form. It divides |
446 | | // the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares |
447 | | // and transposes each square. |
448 | 706k | static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { |
449 | | // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). |
450 | 706k | aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); |
451 | 706k | aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); |
452 | 706k | aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); |
453 | 706k | aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); |
454 | | |
455 | 706k | #if AES_NOHW_BATCH_SIZE >= 4 |
456 | | // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). |
457 | 706k | aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); |
458 | 706k | aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); |
459 | 706k | aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); |
460 | 706k | aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); |
461 | 706k | #endif |
462 | | |
463 | 706k | #if AES_NOHW_BATCH_SIZE >= 8 |
464 | | // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). |
465 | 706k | aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); |
466 | 706k | aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); |
467 | 706k | aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); |
468 | 706k | aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); |
469 | 706k | #endif |
470 | 706k | } |
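| | // Since transposing a square is self-inverse, |aes_nohw_transpose| is an |
| | // involution and converts in both directions. A hypothetical sanity check |
| | // (not part of this file) could assert the round trip: |
| | // |
| | //   AES_NOHW_BATCH copy = batch; |
| | //   aes_nohw_transpose(&batch); |
| | //   aes_nohw_transpose(&batch); |
| | //   assert(memcmp(&copy, &batch, sizeof(copy)) == 0); |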
471 | | |
472 | | // aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. |
473 | | // |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|. |
474 | | static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, |
475 | 31.0k | size_t num_blocks) { |
476 | | // Don't leave unused blocks uninitialized. |
477 | 31.0k | memset(out, 0, sizeof(AES_NOHW_BATCH)); |
478 | 31.0k | assert(num_blocks <= AES_NOHW_BATCH_SIZE); |
479 | 71.7k | for (size_t i = 0; i < num_blocks; i++) { |
480 | 40.7k | aes_word_t block[AES_NOHW_BLOCK_WORDS]; |
481 | 40.7k | aes_nohw_compact_block(block, in + 16 * i); |
482 | 40.7k | aes_nohw_batch_set(out, block, i); |
483 | 40.7k | } |
484 | | |
485 | 31.0k | aes_nohw_transpose(out); |
486 | 31.0k | } |
487 | | |
488 | | // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |
489 | | // |out|. |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|. |
490 | | static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, |
491 | 31.0k | const AES_NOHW_BATCH *batch) { |
492 | 31.0k | AES_NOHW_BATCH copy = *batch; |
493 | 31.0k | aes_nohw_transpose(©); |
494 | | |
495 | 31.0k | assert(num_blocks <= AES_NOHW_BATCH_SIZE); |
496 | 71.7k | for (size_t i = 0; i < num_blocks; i++) { |
497 | 40.7k | aes_word_t block[AES_NOHW_BLOCK_WORDS]; |
498 | 40.7k | aes_nohw_batch_get(©, block, i); |
499 | 40.7k | aes_nohw_uncompact_block(out + 16 * i, block); |
500 | 40.7k | } |
501 | 31.0k | } |
502 | | |
503 | | |
504 | | // AES round steps. |
505 | | |
506 | | static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, |
507 | 448k | const AES_NOHW_BATCH *key) { |
508 | 4.04M | for (size_t i = 0; i < 8; i++) { |
509 | 3.59M | batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); |
510 | 3.59M | } |
511 | 448k | } |
512 | | |
513 | 517k | static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { |
514 | | // See https://eprint.iacr.org/2009/191.pdf, Appendix C. |
515 | 517k | aes_word_t x0 = batch->w[7]; |
516 | 517k | aes_word_t x1 = batch->w[6]; |
517 | 517k | aes_word_t x2 = batch->w[5]; |
518 | 517k | aes_word_t x3 = batch->w[4]; |
519 | 517k | aes_word_t x4 = batch->w[3]; |
520 | 517k | aes_word_t x5 = batch->w[2]; |
521 | 517k | aes_word_t x6 = batch->w[1]; |
522 | 517k | aes_word_t x7 = batch->w[0]; |
523 | | |
524 | | // Figure 2, the top linear transformation. |
525 | 517k | aes_word_t y14 = aes_nohw_xor(x3, x5); |
526 | 517k | aes_word_t y13 = aes_nohw_xor(x0, x6); |
527 | 517k | aes_word_t y9 = aes_nohw_xor(x0, x3); |
528 | 517k | aes_word_t y8 = aes_nohw_xor(x0, x5); |
529 | 517k | aes_word_t t0 = aes_nohw_xor(x1, x2); |
530 | 517k | aes_word_t y1 = aes_nohw_xor(t0, x7); |
531 | 517k | aes_word_t y4 = aes_nohw_xor(y1, x3); |
532 | 517k | aes_word_t y12 = aes_nohw_xor(y13, y14); |
533 | 517k | aes_word_t y2 = aes_nohw_xor(y1, x0); |
534 | 517k | aes_word_t y5 = aes_nohw_xor(y1, x6); |
535 | 517k | aes_word_t y3 = aes_nohw_xor(y5, y8); |
536 | 517k | aes_word_t t1 = aes_nohw_xor(x4, y12); |
537 | 517k | aes_word_t y15 = aes_nohw_xor(t1, x5); |
538 | 517k | aes_word_t y20 = aes_nohw_xor(t1, x1); |
539 | 517k | aes_word_t y6 = aes_nohw_xor(y15, x7); |
540 | 517k | aes_word_t y10 = aes_nohw_xor(y15, t0); |
541 | 517k | aes_word_t y11 = aes_nohw_xor(y20, y9); |
542 | 517k | aes_word_t y7 = aes_nohw_xor(x7, y11); |
543 | 517k | aes_word_t y17 = aes_nohw_xor(y10, y11); |
544 | 517k | aes_word_t y19 = aes_nohw_xor(y10, y8); |
545 | 517k | aes_word_t y16 = aes_nohw_xor(t0, y11); |
546 | 517k | aes_word_t y21 = aes_nohw_xor(y13, y16); |
547 | 517k | aes_word_t y18 = aes_nohw_xor(x0, y16); |
548 | | |
549 | | // Figure 3, the middle non-linear section. |
550 | 517k | aes_word_t t2 = aes_nohw_and(y12, y15); |
551 | 517k | aes_word_t t3 = aes_nohw_and(y3, y6); |
552 | 517k | aes_word_t t4 = aes_nohw_xor(t3, t2); |
553 | 517k | aes_word_t t5 = aes_nohw_and(y4, x7); |
554 | 517k | aes_word_t t6 = aes_nohw_xor(t5, t2); |
555 | 517k | aes_word_t t7 = aes_nohw_and(y13, y16); |
556 | 517k | aes_word_t t8 = aes_nohw_and(y5, y1); |
557 | 517k | aes_word_t t9 = aes_nohw_xor(t8, t7); |
558 | 517k | aes_word_t t10 = aes_nohw_and(y2, y7); |
559 | 517k | aes_word_t t11 = aes_nohw_xor(t10, t7); |
560 | 517k | aes_word_t t12 = aes_nohw_and(y9, y11); |
561 | 517k | aes_word_t t13 = aes_nohw_and(y14, y17); |
562 | 517k | aes_word_t t14 = aes_nohw_xor(t13, t12); |
563 | 517k | aes_word_t t15 = aes_nohw_and(y8, y10); |
564 | 517k | aes_word_t t16 = aes_nohw_xor(t15, t12); |
565 | 517k | aes_word_t t17 = aes_nohw_xor(t4, t14); |
566 | 517k | aes_word_t t18 = aes_nohw_xor(t6, t16); |
567 | 517k | aes_word_t t19 = aes_nohw_xor(t9, t14); |
568 | 517k | aes_word_t t20 = aes_nohw_xor(t11, t16); |
569 | 517k | aes_word_t t21 = aes_nohw_xor(t17, y20); |
570 | 517k | aes_word_t t22 = aes_nohw_xor(t18, y19); |
571 | 517k | aes_word_t t23 = aes_nohw_xor(t19, y21); |
572 | 517k | aes_word_t t24 = aes_nohw_xor(t20, y18); |
573 | 517k | aes_word_t t25 = aes_nohw_xor(t21, t22); |
574 | 517k | aes_word_t t26 = aes_nohw_and(t21, t23); |
575 | 517k | aes_word_t t27 = aes_nohw_xor(t24, t26); |
576 | 517k | aes_word_t t28 = aes_nohw_and(t25, t27); |
577 | 517k | aes_word_t t29 = aes_nohw_xor(t28, t22); |
578 | 517k | aes_word_t t30 = aes_nohw_xor(t23, t24); |
579 | 517k | aes_word_t t31 = aes_nohw_xor(t22, t26); |
580 | 517k | aes_word_t t32 = aes_nohw_and(t31, t30); |
581 | 517k | aes_word_t t33 = aes_nohw_xor(t32, t24); |
582 | 517k | aes_word_t t34 = aes_nohw_xor(t23, t33); |
583 | 517k | aes_word_t t35 = aes_nohw_xor(t27, t33); |
584 | 517k | aes_word_t t36 = aes_nohw_and(t24, t35); |
585 | 517k | aes_word_t t37 = aes_nohw_xor(t36, t34); |
586 | 517k | aes_word_t t38 = aes_nohw_xor(t27, t36); |
587 | 517k | aes_word_t t39 = aes_nohw_and(t29, t38); |
588 | 517k | aes_word_t t40 = aes_nohw_xor(t25, t39); |
589 | 517k | aes_word_t t41 = aes_nohw_xor(t40, t37); |
590 | 517k | aes_word_t t42 = aes_nohw_xor(t29, t33); |
591 | 517k | aes_word_t t43 = aes_nohw_xor(t29, t40); |
592 | 517k | aes_word_t t44 = aes_nohw_xor(t33, t37); |
593 | 517k | aes_word_t t45 = aes_nohw_xor(t42, t41); |
594 | 517k | aes_word_t z0 = aes_nohw_and(t44, y15); |
595 | 517k | aes_word_t z1 = aes_nohw_and(t37, y6); |
596 | 517k | aes_word_t z2 = aes_nohw_and(t33, x7); |
597 | 517k | aes_word_t z3 = aes_nohw_and(t43, y16); |
598 | 517k | aes_word_t z4 = aes_nohw_and(t40, y1); |
599 | 517k | aes_word_t z5 = aes_nohw_and(t29, y7); |
600 | 517k | aes_word_t z6 = aes_nohw_and(t42, y11); |
601 | 517k | aes_word_t z7 = aes_nohw_and(t45, y17); |
602 | 517k | aes_word_t z8 = aes_nohw_and(t41, y10); |
603 | 517k | aes_word_t z9 = aes_nohw_and(t44, y12); |
604 | 517k | aes_word_t z10 = aes_nohw_and(t37, y3); |
605 | 517k | aes_word_t z11 = aes_nohw_and(t33, y4); |
606 | 517k | aes_word_t z12 = aes_nohw_and(t43, y13); |
607 | 517k | aes_word_t z13 = aes_nohw_and(t40, y5); |
608 | 517k | aes_word_t z14 = aes_nohw_and(t29, y2); |
609 | 517k | aes_word_t z15 = aes_nohw_and(t42, y9); |
610 | 517k | aes_word_t z16 = aes_nohw_and(t45, y14); |
611 | 517k | aes_word_t z17 = aes_nohw_and(t41, y8); |
612 | | |
613 | | // Figure 4, bottom linear transformation. |
614 | 517k | aes_word_t t46 = aes_nohw_xor(z15, z16); |
615 | 517k | aes_word_t t47 = aes_nohw_xor(z10, z11); |
616 | 517k | aes_word_t t48 = aes_nohw_xor(z5, z13); |
617 | 517k | aes_word_t t49 = aes_nohw_xor(z9, z10); |
618 | 517k | aes_word_t t50 = aes_nohw_xor(z2, z12); |
619 | 517k | aes_word_t t51 = aes_nohw_xor(z2, z5); |
620 | 517k | aes_word_t t52 = aes_nohw_xor(z7, z8); |
621 | 517k | aes_word_t t53 = aes_nohw_xor(z0, z3); |
622 | 517k | aes_word_t t54 = aes_nohw_xor(z6, z7); |
623 | 517k | aes_word_t t55 = aes_nohw_xor(z16, z17); |
624 | 517k | aes_word_t t56 = aes_nohw_xor(z12, t48); |
625 | 517k | aes_word_t t57 = aes_nohw_xor(t50, t53); |
626 | 517k | aes_word_t t58 = aes_nohw_xor(z4, t46); |
627 | 517k | aes_word_t t59 = aes_nohw_xor(z3, t54); |
628 | 517k | aes_word_t t60 = aes_nohw_xor(t46, t57); |
629 | 517k | aes_word_t t61 = aes_nohw_xor(z14, t57); |
630 | 517k | aes_word_t t62 = aes_nohw_xor(t52, t58); |
631 | 517k | aes_word_t t63 = aes_nohw_xor(t49, t58); |
632 | 517k | aes_word_t t64 = aes_nohw_xor(z4, t59); |
633 | 517k | aes_word_t t65 = aes_nohw_xor(t61, t62); |
634 | 517k | aes_word_t t66 = aes_nohw_xor(z1, t63); |
635 | 517k | aes_word_t s0 = aes_nohw_xor(t59, t63); |
636 | 517k | aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); |
637 | 517k | aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); |
638 | 517k | aes_word_t t67 = aes_nohw_xor(t64, t65); |
639 | 517k | aes_word_t s3 = aes_nohw_xor(t53, t66); |
640 | 517k | aes_word_t s4 = aes_nohw_xor(t51, t66); |
641 | 517k | aes_word_t s5 = aes_nohw_xor(t47, t65); |
642 | 517k | aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); |
643 | 517k | aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); |
644 | | |
645 | 517k | batch->w[0] = s7; |
646 | 517k | batch->w[1] = s6; |
647 | 517k | batch->w[2] = s5; |
648 | 517k | batch->w[3] = s4; |
649 | 517k | batch->w[4] = s3; |
650 | 517k | batch->w[5] = s2; |
651 | 517k | batch->w[6] = s1; |
652 | 517k | batch->w[7] = s0; |
653 | 517k | } |
654 | | |
655 | | // aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES |
656 | | // S-box, defined in FIPS PUB 197, section 5.1.1, step 2. |
657 | 8.40k | static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) { |
658 | 8.40k | aes_word_t a0 = batch->w[0]; |
659 | 8.40k | aes_word_t a1 = batch->w[1]; |
660 | 8.40k | aes_word_t a2 = batch->w[2]; |
661 | 8.40k | aes_word_t a3 = batch->w[3]; |
662 | 8.40k | aes_word_t a4 = batch->w[4]; |
663 | 8.40k | aes_word_t a5 = batch->w[5]; |
664 | 8.40k | aes_word_t a6 = batch->w[6]; |
665 | 8.40k | aes_word_t a7 = batch->w[7]; |
666 | | |
667 | | // Apply the circulant [0 0 1 0 0 1 0 1]. This is the inverse of the circulant |
668 | | // [1 0 0 0 1 1 1 1]. |
669 | 8.40k | aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7)); |
670 | 8.40k | aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0)); |
671 | 8.40k | aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1)); |
672 | 8.40k | aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2)); |
673 | 8.40k | aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3)); |
674 | 8.40k | aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4)); |
675 | 8.40k | aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5)); |
676 | 8.40k | aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6)); |
677 | | |
678 | | // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant, |
679 | | // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.) |
680 | 8.40k | batch->w[0] = aes_nohw_not(b0); |
681 | 8.40k | batch->w[1] = b1; |
682 | 8.40k | batch->w[2] = aes_nohw_not(b2); |
683 | 8.40k | batch->w[3] = b3; |
684 | 8.40k | batch->w[4] = b4; |
685 | 8.40k | batch->w[5] = b5; |
686 | 8.40k | batch->w[6] = b6; |
687 | 8.40k | batch->w[7] = b7; |
688 | 8.40k | } |
689 | | |
690 | 4.20k | static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) { |
691 | | // We implement the inverse S-box using the forwards implementation with the |
692 | | // technique described in https://www.bearssl.org/constanttime.html#aes. |
693 | | // |
694 | | // The forwards S-box inverts its input and applies an affine transformation: |
695 | | // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then: |
696 | | // |
697 | | //   InvS(x) = Inv(InvA(x)) |
698 | | //           = InvA(S(InvA(x))) |
699 | 4.20k | aes_nohw_sub_bytes_inv_affine(batch); |
700 | 4.20k | aes_nohw_sub_bytes(batch); |
701 | 4.20k | aes_nohw_sub_bytes_inv_affine(batch); |
702 | 4.20k | } |
703 | | |
704 | | // aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated |
705 | | // to the right by |n|. This is a macro because |aes_nohw_shift_*| require |
706 | | // constant shift counts in the SSE2 implementation. |
707 | | #define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ |
708 | 10.0M | (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ |
709 | 10.0M | aes_nohw_shift_left((v), 16 - (n)*4))) |
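| | // For example (illustrative only): the 16 logical bytes form four columns |
| | // (c3 c2 c1 c0), most significant first. Rotating right by one yields |
| | // (c0 c3 c2 c1): each column's data moves one column index lower, wrapping. |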
710 | | |
711 | 413k | static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { |
712 | 3.72M | for (size_t i = 0; i < 8; i++) { |
713 | 3.30M | aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); |
714 | 3.30M | aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); |
715 | 3.30M | aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); |
716 | 3.30M | aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); |
717 | 3.30M | row1 = aes_nohw_rotate_cols_right(row1, 1); |
718 | 3.30M | row2 = aes_nohw_rotate_cols_right(row2, 2); |
719 | 3.30M | row3 = aes_nohw_rotate_cols_right(row3, 3); |
720 | 3.30M | batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); |
721 | 3.30M | } |
722 | 413k | } |
723 | | |
724 | 4.20k | static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) { |
725 | 37.8k | for (size_t i = 0; i < 8; i++) { |
726 | 33.6k | aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); |
727 | 33.6k | aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); |
728 | 33.6k | aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); |
729 | 33.6k | aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); |
730 | 33.6k | row1 = aes_nohw_rotate_cols_right(row1, 3); |
731 | 33.6k | row2 = aes_nohw_rotate_cols_right(row2, 2); |
732 | 33.6k | row3 = aes_nohw_rotate_cols_right(row3, 1); |
733 | 33.6k | batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); |
734 | 33.6k | } |
735 | 4.20k | } |
736 | | |
737 | | // aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated |
738 | | // down by one. |
739 | 3.15M | static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { |
740 | 3.15M | #if defined(OPENSSL_SSE2) |
741 | 3.15M | return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24)); |
742 | | #elif defined(OPENSSL_64_BIT) |
743 | | return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | |
744 | | ((v << 12) & UINT64_C(0xf000f000f000f000)); |
745 | | #else |
746 | | return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); |
747 | | #endif |
748 | 3.15M | } |
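| | // For example (illustrative only): in the SSE2 form, each 32-bit lane holds |
| | // one column with row 0 in the least significant byte, so rotating the lane |
| | // right by 8 sends rows (r3 r2 r1 r0), most significant first, to |
| | // (r0 r3 r2 r1). |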
749 | | |
750 | | // aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated |
751 | | // by two. |
752 | 3.12M | static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { |
753 | 3.12M | #if defined(OPENSSL_SSE2) |
754 | 3.12M | return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16)); |
755 | | #elif defined(OPENSSL_64_BIT) |
756 | | return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | |
757 | | ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); |
758 | | #else |
759 | | return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); |
760 | | #endif |
761 | 3.12M | } |
762 | | |
763 | 386k | static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { |
764 | | // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. |
765 | 386k | aes_word_t a0 = batch->w[0]; |
766 | 386k | aes_word_t a1 = batch->w[1]; |
767 | 386k | aes_word_t a2 = batch->w[2]; |
768 | 386k | aes_word_t a3 = batch->w[3]; |
769 | 386k | aes_word_t a4 = batch->w[4]; |
770 | 386k | aes_word_t a5 = batch->w[5]; |
771 | 386k | aes_word_t a6 = batch->w[6]; |
772 | 386k | aes_word_t a7 = batch->w[7]; |
773 | | |
774 | 386k | aes_word_t r0 = aes_nohw_rotate_rows_down(a0); |
775 | 386k | aes_word_t a0_r0 = aes_nohw_xor(a0, r0); |
776 | 386k | aes_word_t r1 = aes_nohw_rotate_rows_down(a1); |
777 | 386k | aes_word_t a1_r1 = aes_nohw_xor(a1, r1); |
778 | 386k | aes_word_t r2 = aes_nohw_rotate_rows_down(a2); |
779 | 386k | aes_word_t a2_r2 = aes_nohw_xor(a2, r2); |
780 | 386k | aes_word_t r3 = aes_nohw_rotate_rows_down(a3); |
781 | 386k | aes_word_t a3_r3 = aes_nohw_xor(a3, r3); |
782 | 386k | aes_word_t r4 = aes_nohw_rotate_rows_down(a4); |
783 | 386k | aes_word_t a4_r4 = aes_nohw_xor(a4, r4); |
784 | 386k | aes_word_t r5 = aes_nohw_rotate_rows_down(a5); |
785 | 386k | aes_word_t a5_r5 = aes_nohw_xor(a5, r5); |
786 | 386k | aes_word_t r6 = aes_nohw_rotate_rows_down(a6); |
787 | 386k | aes_word_t a6_r6 = aes_nohw_xor(a6, r6); |
788 | 386k | aes_word_t r7 = aes_nohw_rotate_rows_down(a7); |
789 | 386k | aes_word_t a7_r7 = aes_nohw_xor(a7, r7); |
790 | | |
791 | 386k | batch->w[0] = |
792 | 386k | aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); |
793 | 386k | batch->w[1] = |
794 | 386k | aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), |
795 | 386k | aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); |
796 | 386k | batch->w[2] = |
797 | 386k | aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); |
798 | 386k | batch->w[3] = |
799 | 386k | aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), |
800 | 386k | aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); |
801 | 386k | batch->w[4] = |
802 | 386k | aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), |
803 | 386k | aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); |
804 | 386k | batch->w[5] = |
805 | 386k | aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); |
806 | 386k | batch->w[6] = |
807 | 386k | aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); |
808 | 386k | batch->w[7] = |
809 | 386k | aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); |
810 | 386k | } |
811 | | |
812 | 3.78k | static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) { |
813 | 3.78k | aes_word_t a0 = batch->w[0]; |
814 | 3.78k | aes_word_t a1 = batch->w[1]; |
815 | 3.78k | aes_word_t a2 = batch->w[2]; |
816 | 3.78k | aes_word_t a3 = batch->w[3]; |
817 | 3.78k | aes_word_t a4 = batch->w[4]; |
818 | 3.78k | aes_word_t a5 = batch->w[5]; |
819 | 3.78k | aes_word_t a6 = batch->w[6]; |
820 | 3.78k | aes_word_t a7 = batch->w[7]; |
821 | | |
822 | | // bsaes-x86_64.pl describes the following decomposition of the inverse |
823 | | // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler |
824 | | // multiplication. |
825 | | // |
826 | | // | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | |
827 | | // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | |
828 | | // | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | |
829 | | // | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | |
830 | | // |
831 | | // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described |
832 | | // by the following bit equations: |
833 | | // |
834 | | // b0 = a6 |
835 | | // b1 = a6 ^ a7 |
836 | | // b2 = a0 ^ a7 |
837 | | // b3 = a1 ^ a6 |
838 | | // b4 = a2 ^ a6 ^ a7 |
839 | | // b5 = a3 ^ a7 |
840 | | // b6 = a4 |
841 | | // b7 = a5 |
842 | | // |
843 | | // Each coefficient is given by: |
844 | | // |
845 | | // b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij |
846 | | // |
847 | | // We combine the two equations below. Note a_i(j+2) is a row rotation. |
848 | 3.78k | aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0)); |
849 | 3.78k | aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1)); |
850 | 3.78k | aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2)); |
851 | 3.78k | aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3)); |
852 | 3.78k | aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4)); |
853 | 3.78k | aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5)); |
854 | 3.78k | aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6)); |
855 | 3.78k | aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7)); |
856 | | |
857 | 3.78k | batch->w[0] = aes_nohw_xor(a0, a6_r6); |
858 | 3.78k | batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7)); |
859 | 3.78k | batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7)); |
860 | 3.78k | batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6)); |
861 | 3.78k | batch->w[4] = |
862 | 3.78k | aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7)); |
863 | 3.78k | batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7)); |
864 | 3.78k | batch->w[6] = aes_nohw_xor(a6, a4_r4); |
865 | 3.78k | batch->w[7] = aes_nohw_xor(a7, a5_r5); |
866 | | |
867 | | // Apply the [02 03 01 01] matrix, which is just MixColumns. |
868 | 3.78k | aes_nohw_mix_columns(batch); |
869 | 3.78k | } |
870 | | |
871 | | static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, |
872 | 30.6k | size_t num_rounds, AES_NOHW_BATCH *batch) { |
873 | 30.6k | aes_nohw_add_round_key(batch, &key->keys[0]); |
874 | 413k | for (size_t i = 1; i < num_rounds; i++) { |
875 | 383k | aes_nohw_sub_bytes(batch); |
876 | 383k | aes_nohw_shift_rows(batch); |
877 | 383k | aes_nohw_mix_columns(batch); |
878 | 383k | aes_nohw_add_round_key(batch, &key->keys[i]); |
879 | 383k | } |
880 | 30.6k | aes_nohw_sub_bytes(batch); |
881 | 30.6k | aes_nohw_shift_rows(batch); |
882 | 30.6k | aes_nohw_add_round_key(batch, &key->keys[num_rounds]); |
883 | 30.6k | } |
884 | | |
885 | | static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key, |
886 | 412 | size_t num_rounds, AES_NOHW_BATCH *batch) { |
887 | 412 | aes_nohw_add_round_key(batch, &key->keys[num_rounds]); |
888 | 412 | aes_nohw_inv_shift_rows(batch); |
889 | 412 | aes_nohw_inv_sub_bytes(batch); |
890 | 4.20k | for (size_t i = num_rounds - 1; i > 0; i--) { |
891 | 3.78k | aes_nohw_add_round_key(batch, &key->keys[i]); |
892 | 3.78k | aes_nohw_inv_mix_columns(batch); |
893 | 3.78k | aes_nohw_inv_shift_rows(batch); |
894 | 3.78k | aes_nohw_inv_sub_bytes(batch); |
895 | 3.78k | } |
896 | 412 | aes_nohw_add_round_key(batch, &key->keys[0]); |
897 | 412 | } |
898 | | |
899 | | |
900 | | // Key schedule. |
901 | | |
902 | | static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, |
903 | 30.6k | const AES_KEY *key) { |
904 | 474k | for (size_t i = 0; i <= key->rounds; i++) { |
905 | | // Copy the round key into each block in the batch. |
906 | 3.99M | for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { |
907 | 3.55M | aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; |
908 | 3.55M | memcpy(tmp, key->rd_key + 4 * i, 16); |
909 | 3.55M | aes_nohw_batch_set(&out->keys[i], tmp, j); |
910 | 3.55M | } |
911 | 444k | aes_nohw_transpose(&out->keys[i]); |
912 | 444k | } |
913 | 30.6k | } |
914 | | |
915 | | static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, |
916 | | 0x20, 0x40, 0x80, 0x1b, 0x36}; |
917 | | |
918 | | // aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in |
919 | | // |rcon|, stored in an |aes_word_t|. |
920 | 57.7k | static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { |
921 | 57.7k | rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); |
922 | 57.7k | #if defined(OPENSSL_SSE2) |
923 | 57.7k | return _mm_set_epi32(0, 0, 0, rcon); |
924 | | #else |
925 | | return ((aes_word_t)rcon); |
926 | | #endif |
927 | 57.7k | } |
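| | // For example (illustrative only): with 64-bit words, |AES_NOHW_BATCH_SIZE| |
| | // is 4, so rcon = 0x36 = 0b00110110 yields slice 0 = 0x6 and slice 1 = 0x3. |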
928 | | |
929 | | static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], |
930 | 100k | const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { |
931 | 100k | AES_NOHW_BATCH batch; |
932 | 100k | memset(&batch, 0, sizeof(batch)); |
933 | 100k | aes_nohw_batch_set(&batch, in, 0); |
934 | 100k | aes_nohw_transpose(&batch); |
935 | 100k | aes_nohw_sub_bytes(&batch); |
936 | 100k | aes_nohw_transpose(&batch); |
937 | 100k | aes_nohw_batch_get(&batch, out, 0); |
938 | 100k | } |
939 | | |
940 | 846 | static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { |
941 | 846 | key->rounds = 10; |
942 | | |
943 | 846 | aes_word_t block[AES_NOHW_BLOCK_WORDS]; |
944 | 846 | aes_nohw_compact_block(block, in); |
945 | 846 | memcpy(key->rd_key, block, 16); |
946 | | |
947 | 9.30k | for (size_t i = 1; i <= 10; i++) { |
948 | 8.46k | aes_word_t sub[AES_NOHW_BLOCK_WORDS]; |
949 | 8.46k | aes_nohw_sub_block(sub, block); |
950 | 8.46k | uint8_t rcon = aes_nohw_rcon[i - 1]; |
951 | 16.9k | for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { |
952 | | // Incorporate |rcon| and the transformed word into the first word. |
953 | 8.46k | block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); |
954 | 8.46k | block[j] = aes_nohw_xor( |
955 | 8.46k | block[j], |
956 | 8.46k | aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); |
957 | | // Propagate to the remaining words. Note this is reordered from the usual |
958 | | // formulation to avoid needing masks. |
959 | 8.46k | aes_word_t v = block[j]; |
960 | 8.46k | block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); |
961 | 8.46k | block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); |
962 | 8.46k | block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); |
963 | 8.46k | } |
964 | 8.46k | memcpy(key->rd_key + 4 * i, block, 16); |
965 | 8.46k | } |
966 | 846 | } |
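| | // For reference (a sketch of the equivalence, not a second implementation): |
| | // in the word form of FIPS 197, each loop iteration above computes |
| | // |
| | //   tmp     = SubWord(RotWord(w[4i-1])) ^ Rcon[i] |
| | //   w[4i]   = w[4i-4] ^ tmp |
| | //   w[4i+1] = w[4i-3] ^ w[4i] |
| | //   w[4i+2] = w[4i-2] ^ w[4i+1] |
| | //   w[4i+3] = w[4i-1] ^ w[4i+2] |
| | // |
| | // Unrolled, w[4i+c] is the XOR of |tmp| with w[4i-4], ..., w[4i-4+c], which |
| | // is what the three shift-and-XOR steps compute, one column at a time. |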
967 | | |
968 | 2 | static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) { |
969 | 2 | key->rounds = 12; |
970 | | |
971 | 2 | aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS]; |
972 | 2 | aes_word_t *block1 = storage1, *block2 = storage2; |
973 | | |
974 | | // AES-192's key schedule is complex because each key schedule iteration |
975 | | // produces six words, but we compute on blocks and each block is four words. |
976 | | // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time. |
977 | | // We loop below every three blocks or two key schedule iterations. |
978 | | // |
979 | | // On entry to the loop, |block1| and the first half of |block2| contain the |
980 | | // previous key schedule iteration. |block1| has been written to |key|, but |
981 | | // |block2| has not as it is incomplete. |
982 | 2 | aes_nohw_compact_block(block1, in); |
983 | 2 | memcpy(key->rd_key, block1, 16); |
984 | | |
985 | 2 | uint8_t half_block[16] = {0}; |
986 | 2 | memcpy(half_block, in + 16, 8); |
987 | 2 | aes_nohw_compact_block(block2, half_block); |
988 | | |
989 | 10 | for (size_t i = 0; i < 4; i++) { |
990 | 8 | aes_word_t sub[AES_NOHW_BLOCK_WORDS]; |
991 | 8 | aes_nohw_sub_block(sub, block2); |
992 | 8 | uint8_t rcon = aes_nohw_rcon[2 * i]; |
993 | 16 | for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { |
994 | | // Compute the first two words of the next key schedule iteration, which |
995 | | // go in the second half of |block2|. The first two words of the previous |
996 | | // iteration are in the first half of |block1|. Apply |rcon| here too |
997 | | // because the shifts match. |
998 | 8 | block2[j] = aes_nohw_or( |
999 | 8 | block2[j], |
1000 | 8 | aes_nohw_shift_left( |
1001 | 8 | aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8)); |
1002 | | // Incorporate the transformed word and propagate. Note the last word of |
1003 | | // the previous iteration corresponds to the second word of |sub|. This |
1004 | | // is incorporated into the first word of the next iteration, or the third |
1005 | | // word of |block2|. |
1006 | 8 | block2[j] = aes_nohw_xor( |
1007 | 8 | block2[j], aes_nohw_and(aes_nohw_shift_left( |
1008 | 8 | aes_nohw_rotate_rows_down(sub[j]), 4), |
1009 | 8 | AES_NOHW_COL2_MASK)); |
1010 | 8 | block2[j] = aes_nohw_xor( |
1011 | 8 | block2[j], |
1012 | 8 | aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK)); |
1013 | | |
1014 | | // Compute the remaining four words, which fill |block1|. Begin by moving |
1015 | | // the corresponding words of the previous iteration: the second half of |
1016 | | // |block1| and the first half of |block2|. |
1017 | 8 | block1[j] = aes_nohw_shift_right(block1[j], 8); |
1018 | 8 | block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8)); |
1019 | | // Incorporate the second word, computed previously in |block2|, and |
1020 | | // propagate. |
1021 | 8 | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); |
1022 | 8 | aes_word_t v = block1[j]; |
1023 | 8 | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); |
1024 | 8 | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); |
1025 | 8 | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); |
1026 | 8 | } |
1027 | | |
1028 | | // This completes two round keys. Note half of |block2| was computed in the |
1029 | | // previous loop iteration but was not yet output. |
1030 | 8 | memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16); |
1031 | 8 | memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16); |
1032 | | |
1033 | 8 | aes_nohw_sub_block(sub, block1); |
1034 | 8 | rcon = aes_nohw_rcon[2 * i + 1]; |
1035 | 16 | for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { |
1036 | | // Compute the first four words of the next key schedule iteration in |
1037 | | // |block2|. Begin by moving the corresponding words of the previous |
1038 | | // iteration: the second half of |block2| and the first half of |block1|. |
1039 | 8 | block2[j] = aes_nohw_shift_right(block2[j], 8); |
1040 | 8 | block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8)); |
1041 | | // Incorporate |rcon| and the transformed word. Note the last word of the |
1042 | | // previous iteration corresponds to the last word of |sub|. |
1043 | 8 | block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j)); |
1044 | 8 | block2[j] = aes_nohw_xor( |
1045 | 8 | block2[j], |
1046 | 8 | aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); |
1047 | | // Propagate to the remaining words. |
1048 | 8 | aes_word_t v = block2[j]; |
1049 | 8 | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); |
1050 | 8 | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); |
1051 | 8 | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); |
1052 | | |
1053 | | // Compute the last two words, which go in the first half of |block1|. The |
1054 | | // last two words of the previous iteration are in the second half of |
1055 | | // |block1|. |
1056 | 8 | block1[j] = aes_nohw_shift_right(block1[j], 8); |
1057 | | // Propagate blocks and mask off the excess. |
1058 | 8 | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); |
1059 | 8 | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4)); |
1060 | 8 | block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK); |
1061 | 8 | } |
1062 | | |
1063 | | // |block2| has a complete round key. |block1| will be completed in the next |
1064 | | // iteration. |
1065 | 8 | memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16); |
1066 | | |
1067 | | // Swap blocks to restore the invariant. |
1068 | 8 | aes_word_t *tmp = block1; |
1069 | 8 | block1 = block2; |
1070 | 8 | block2 = tmp; |
1071 | 8 | } |
1072 | 2 | } |
1073 | | |
1074 | 7.04k | static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { |
1075 | 7.04k | key->rounds = 14; |
1076 | | |
1077 | | // Each key schedule iteration produces two round keys. |
1078 | 7.04k | aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; |
1079 | 7.04k | aes_nohw_compact_block(block1, in); |
1080 | 7.04k | memcpy(key->rd_key, block1, 16); |
1081 | | |
1082 | 7.04k | aes_nohw_compact_block(block2, in + 16); |
1083 | 7.04k | memcpy(key->rd_key + 4, block2, 16); |
1084 | | |
1085 | 49.3k | for (size_t i = 2; i <= 14; i += 2) { |
1086 | 49.3k | aes_word_t sub[AES_NOHW_BLOCK_WORDS]; |
1087 | 49.3k | aes_nohw_sub_block(sub, block2); |
1088 | 49.3k | uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; |
1089 | 98.6k | for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { |
1090 | | // Incorporate |rcon| and the transformed word into the first word. |
1091 | 49.3k | block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); |
1092 | 49.3k | block1[j] = aes_nohw_xor( |
1093 | 49.3k | block1[j], |
1094 | 49.3k | aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); |
1095 | | // Propagate to the remaining words. |
1096 | 49.3k | aes_word_t v = block1[j]; |
1097 | 49.3k | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); |
1098 | 49.3k | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); |
1099 | 49.3k | block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); |
1100 | 49.3k | } |
1101 | 49.3k | memcpy(key->rd_key + 4 * i, block1, 16); |
1102 | | |
1103 | 49.3k | if (i == 14) { |
1104 | 7.04k | break; |
1105 | 7.04k | } |
1106 | | |
1107 | 42.2k | aes_nohw_sub_block(sub, block1); |
1108 | 84.5k | for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { |
1109 | | // Incorporate the transformed word into the first word. |
1110 | 42.2k | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); |
1111 | | // Propagate to the remaining words. |
1112 | 42.2k | aes_word_t v = block2[j]; |
1113 | 42.2k | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); |
1114 | 42.2k | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); |
1115 | 42.2k | block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); |
1116 | 42.2k | } |
1117 | 42.2k | memcpy(key->rd_key + 4 * (i + 1), block2, 16); |
1118 | 42.2k | } |
1119 | 7.04k | } |
1120 | | |
1121 | | |
1122 | | // External API. |
1123 | | |
1124 | | int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, |
1125 | 7.89k | AES_KEY *aeskey) { |
1126 | 7.89k | switch (bits) { |
1127 | 846 | case 128: |
1128 | 846 | aes_nohw_setup_key_128(aeskey, key); |
1129 | 846 | return 0; |
1130 | 2 | case 192: |
1131 | 2 | aes_nohw_setup_key_192(aeskey, key); |
1132 | 2 | return 0; |
1133 | 7.04k | case 256: |
1134 | 7.04k | aes_nohw_setup_key_256(aeskey, key); |
1135 | 7.04k | return 0; |
1136 | 7.89k | } |
1137 | 0 | return 1; |
1138 | 7.89k | } |
1139 | | |
1140 | | int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits, |
1141 | 120 | AES_KEY *aeskey) { |
1142 | 120 | return aes_nohw_set_encrypt_key(key, bits, aeskey); |
1143 | 120 | } |
1144 | | |
1145 | 26.5k | void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { |
1146 | 26.5k | AES_NOHW_SCHEDULE sched; |
1147 | 26.5k | aes_nohw_expand_round_keys(&sched, key); |
1148 | 26.5k | AES_NOHW_BATCH batch; |
1149 | 26.5k | aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); |
1150 | 26.5k | aes_nohw_encrypt_batch(&sched, key->rounds, &batch); |
1151 | 26.5k | aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); |
1152 | 26.5k | } |
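| | // A hypothetical usage sketch (the all-zero key and plaintext are arbitrary |
| | // placeholders): |
| | // |
| | //   AES_KEY key; |
| | //   uint8_t k[16] = {0}, pt[16] = {0}, ct[16]; |
| | //   if (aes_nohw_set_encrypt_key(k, 128, &key) == 0) { |
| | //     aes_nohw_encrypt(pt, ct, &key); |
| | //   } |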
1153 | | |
1154 | 392 | void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { |
1155 | 392 | AES_NOHW_SCHEDULE sched; |
1156 | 392 | aes_nohw_expand_round_keys(&sched, key); |
1157 | 392 | AES_NOHW_BATCH batch; |
1158 | 392 | aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); |
1159 | 392 | aes_nohw_decrypt_batch(&sched, key->rounds, &batch); |
1160 | 392 | aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); |
1161 | 392 | } |
1162 | | |
1163 | | static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], |
1164 | 13.7k | const uint8_t b[16]) { |
1165 | 27.4k | for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { |
1166 | 13.7k | aes_word_t x, y; |
1167 | 13.7k | memcpy(&x, a + i, sizeof(aes_word_t)); |
1168 | 13.7k | memcpy(&y, b + i, sizeof(aes_word_t)); |
1169 | 13.7k | x = aes_nohw_xor(x, y); |
1170 | 13.7k | memcpy(out + i, &x, sizeof(aes_word_t)); |
1171 | 13.7k | } |
1172 | 13.7k | } |
1173 | | |
1174 | | void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, |
1175 | | size_t blocks, const AES_KEY *key, |
1176 | 3.63k | const uint8_t ivec[16]) { |
1177 | 3.63k | if (blocks == 0) { |
1178 | 0 | return; |
1179 | 0 | } |
1180 | | |
1181 | 3.63k | AES_NOHW_SCHEDULE sched; |
1182 | 3.63k | aes_nohw_expand_round_keys(&sched, key); |
1183 | | |
1184 | | // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. |
1185 | 3.63k | alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; |
1186 | 3.63k | alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; |
1187 | 32.6k | for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { |
1188 | 29.0k | memcpy(ivs + 16 * i, ivec, 16); |
1189 | 29.0k | } |
1190 | | |
1191 | 3.63k | uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); |
1192 | 3.98k | for (;;) { |
1193 | | // Update counters. |
1194 | 35.8k | for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { |
1195 | 31.9k | CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); |
1196 | 31.9k | } |
1197 | | |
1198 | 3.98k | size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; |
1199 | 3.98k | AES_NOHW_BATCH batch; |
1200 | 3.98k | aes_nohw_to_batch(&batch, ivs, todo); |
1201 | 3.98k | aes_nohw_encrypt_batch(&sched, key->rounds, &batch); |
1202 | 3.98k | aes_nohw_from_batch(enc_ivs, todo, &batch); |
1203 | | |
1204 | 17.6k | for (size_t i = 0; i < todo; i++) { |
1205 | 13.6k | aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); |
1206 | 13.6k | } |
1207 | | |
1208 | 3.98k | blocks -= todo; |
1209 | 3.98k | if (blocks == 0) { |
1210 | 3.63k | break; |
1211 | 3.63k | } |
1212 | | |
1213 | 358 | in += 16 * AES_NOHW_BATCH_SIZE; |
1214 | 358 | out += 16 * AES_NOHW_BATCH_SIZE; |
1215 | 358 | ctr += AES_NOHW_BATCH_SIZE; |
1216 | 358 | } |
1217 | 3.63k | } |
1218 | | |
1219 | | void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, |
1220 | 79 | const AES_KEY *key, uint8_t *ivec, const int enc) { |
1221 | 79 | assert(len % 16 == 0); |
1222 | 79 | size_t blocks = len / 16; |
1223 | 79 | if (blocks == 0) { |
1224 | 0 | return; |
1225 | 0 | } |
1226 | | |
1227 | 79 | AES_NOHW_SCHEDULE sched; |
1228 | 79 | aes_nohw_expand_round_keys(&sched, key); |
1229 | 79 | alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16]; |
1230 | 79 | memcpy(iv, ivec, 16); |
1231 | | |
1232 | 79 | if (enc) { |
1233 | | // CBC encryption is not parallelizable. |
1234 | 119 | while (blocks > 0) { |
1235 | 60 | aes_nohw_xor_block(iv, iv, in); |
1236 | | |
1237 | 60 | AES_NOHW_BATCH batch; |
1238 | 60 | aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1); |
1239 | 60 | aes_nohw_encrypt_batch(&sched, key->rounds, &batch); |
1240 | 60 | aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); |
1241 | | |
1242 | 60 | memcpy(iv, out, 16); |
1243 | | |
1244 | 60 | in += 16; |
1245 | 60 | out += 16; |
1246 | 60 | blocks--; |
1247 | 60 | } |
1248 | 59 | memcpy(ivec, iv, 16); |
1249 | 59 | return; |
1250 | 59 | } |
1251 | | |
1252 | 20 | for (;;) { |
1253 | 20 | size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; |
1254 | | // Make a copy of the input so we can decrypt in-place. |
1255 | 20 | alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16]; |
1256 | 20 | memcpy(copy, in, todo * 16); |
1257 | | |
1258 | 20 | AES_NOHW_BATCH batch; |
1259 | 20 | aes_nohw_to_batch(&batch, in, todo); |
1260 | 20 | aes_nohw_decrypt_batch(&sched, key->rounds, &batch); |
1261 | 20 | aes_nohw_from_batch(out, todo, &batch); |
1262 | | |
1263 | 20 | aes_nohw_xor_block(out, out, iv); |
1264 | 60 | for (size_t i = 1; i < todo; i++) { |
1265 | 40 | aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1)); |
1266 | 40 | } |
1267 | | |
1268 | | // Save the last block as the IV. |
1269 | 20 | memcpy(iv, copy + 16 * (todo - 1), 16); |
1270 | | |
1271 | 20 | blocks -= todo; |
1272 | 20 | if (blocks == 0) { |
1273 | 20 | break; |
1274 | 20 | } |
1275 | | |
1276 | 0 | in += 16 * AES_NOHW_BATCH_SIZE; |
1277 | 0 | out += 16 * AES_NOHW_BATCH_SIZE; |
1278 | 0 | } |
1279 | | |
1280 | 20 | memcpy(ivec, iv, 16); |
1281 | 20 | } |